diff --git a/.clang-tidy b/.clang-tidy index 6677e5e..14e9910 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -211,7 +211,7 @@ CheckOptions: value: '' - key: readability-identifier-naming.NamespaceSuffix value: '' - + # Performance and modernization options - key: modernize-use-auto.MinTypeNameLength value: '5' @@ -223,13 +223,13 @@ CheckOptions: value: 'true' - key: performance-unnecessary-value-param.IncludeStyle value: 'llvm' - + # Certificate and security options - key: cert-dcl16-c.NewSuffixes value: 'L;LL;LU;LLU' - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField value: 'false' - + # Core guidelines options - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor value: 'true' diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 7ed0b29..b6b0143 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -4,3 +4,6 @@ # style: bulk reformat all source files with clang-format (2026-04-23) 7221753 + +# style: apply clang-format 19.1.7 to all source files (2026-05-03) +662c172 diff --git a/.gitattributes b/.gitattributes index a6df10c..50fd8dc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -30,10 +30,10 @@ CMakeLists.txt text eol=lf # Scripts — always LF so they run correctly in bash/sh *.sh text eol=lf -# Windows-only scripts stay CRLF +# Windows batch/cmd scripts stay CRLF; PowerShell handles LF on all platforms *.bat text eol=crlf *.cmd text eol=crlf -*.ps1 text eol=crlf +*.ps1 text eol=lf # XML (HMM model files) *.xml text eol=lf diff --git a/CHANGELOG.md b/CHANGELOG.md index 7404884..46b4c4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,54 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.3.0] - 2026-05-03 + +SIMD performance phase: explicit vector kernels for transcendental +operations and two additional Tier-2 distributions. 37/37 tests pass. 
+ +### Added + +- **SIMD transcendental kernels** (`src/performance/transcendental_kernels.cpp`): + five inner-loop kernels used by `ForwardBackwardCalculator` (FB max-reduce + recurrence) and `BaumWelchTrainer` (dense-xi accumulation) now have + AVX-512 / AVX / SSE2 / NEON backends. The vector `exp` helper uses a + 13-term Horner polynomial with Cephes `ln2` range reduction and branch-free + underflow masking at `MIN_LOG_PROBABILITY`. AVX path stays AVX-1 compatible + for Ivy Bridge / Catalina. Benchmarks on Zen 4 / AVX-512 (T=1000): + FB max-reduce 5.7× faster at N=32; BW xi accumulation 1.03–1.15×. +- **LogNormal and Pareto promoted to Tier 2** (`src/distributions/`): explicit + SIMD `getBatchLogProbabilities` via a vector `log` helper (IEEE-754 exponent + extraction, 7-term Horner, split-LN2 reconstruction, ≤5 ULP). +- **`simd_kernels_internal.h`**: single source of truth for vector exp/log + primitives shared by all Tier-2 distribution TUs and the transcendental + kernels TU. +- **FB recurrence crossover retuned** (`fb_recurrence_policy.h`): threshold + moved from N≥5 to N≥4 on x86 after profiling post-SIMD (MaxReduce is 1.7× + faster at N=4). +- **New tests** (37 total, up from 33): + - `test_simd_platform`: compile-time ISA hierarchy invariants (`#error`) and + runtime contracts on `simd_platform.h` utility functions. + - `test_transcendental_kernels`: SIMD vs `std::exp` parity for all five + kernels across 11 sizes; 1e-12 rel / 1e-15 abs tolerance. + - `test_fb_mode_parity`: Pairwise vs MaxReduce FB log-likelihood agreement. + - `test_bw_parity`: BW determinism (bit-exact) and EM monotonicity. +- **New tools**: `bw_hotspot` (BW E-step phase breakdown), `hotspot_breakdown` + (FB phase-level timings), `fb_crossover_sweep` (Pairwise vs MaxReduce + timing across N), `fb_contour_sweep` (2-D N×T timing heatmap data). 
+ +### Changed + +- `fb_recurrence_policy.h` moved from `include/libhmm/calculators/` to + `include/libhmm/performance/` (cross-cutting primitive, not calculator-specific). +- Test group labels in `tests/CMakeLists.txt` changed from numeric Level N + notation to semantic names; Performance Primitives group reordered before + Distributions to reflect dependency order. +- `performance/PERFORMANCE_ARCHITECTURE.md` updated: Tier-2 coverage, + delivered recurrence-kernel SIMD, corrected `LIBHMM_SIMD_SOURCES` list. +- `*.ps1` line-ending rule in `.gitattributes` changed from `eol=crlf` to + `eol=lf` (PowerShell handles LF on all platforms; avoids CI pre-commit + mixed-line-ending failures). + ## [3.2.1] - 2026-05-02 CI hygiene fix; no functional changes. diff --git a/CMakeLists.txt b/CMakeLists.txt index 92e7bed..ded76de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,7 @@ if(APPLE AND NOT CMAKE_CXX_COMPILER) endif() project(libhmm - VERSION 3.2.1 + VERSION 3.3.0 DESCRIPTION "Modern C++20 Hidden Markov Model Library" LANGUAGES CXX ) @@ -479,6 +479,15 @@ set(LIBHMM_SIMD_SOURCES src/distributions/weibull_distribution.cpp ) +# Additional TUs that include simd_kernels_internal.h or transcendental_kernels.h +# and therefore need LIBHMM_BEST_SIMD_FLAGS to activate the #if LIBHMM_HAS_* cascade. +# (log_normal and pareto are already in LIBHMM_SIMD_SOURCES above.) 
+list(APPEND LIBHMM_SIMD_SOURCES + src/performance/transcendental_kernels.cpp + src/calculators/forward_backward_calculator.cpp + src/training/baum_welch_trainer.cpp +) + if(LIBHMM_BEST_SIMD_FLAGS) foreach(simd_src ${LIBHMM_SIMD_SOURCES}) set_source_files_properties( @@ -499,6 +508,7 @@ set(LIBHMM_SOURCES src/common/common.cpp src/common/string_tokenizer.cpp src/common/numerical_stability.cpp + src/performance/transcendental_kernels.cpp src/distributions/distribution_base.cpp src/distributions/discrete_distribution.cpp src/distributions/gaussian_distribution.cpp diff --git a/WARP.md b/WARP.md index e78bc16..ac46e41 100644 --- a/WARP.md +++ b/WARP.md @@ -6,8 +6,8 @@ This file provides guidance to Warp (warp.dev) when working in this repository. ## Current Status -**Version**: v3.2.1 — latest tag and published release on `main`. -**Tests**: 33/33 passing on all four CI platforms (Linux/GCC, Linux/Clang, macOS/AppleClang, Windows/MSVC). +**Version**: v3.3.0 — latest tag and published release on `main`. +**Tests**: 37/37 passing on all four CI platforms (Linux/GCC, Linux/Clang, macOS/AppleClang, Windows/MSVC). **Active phase**: Complete. All phases through Post-Phase 5 (CI/tooling, benchmarks) are done. --- @@ -36,7 +36,7 @@ include/libhmm/ │ └── segmental_kmeans_trainer.h # Discrete-state initialisation └── io/ # XML I/O src/ # Implementation (mirrors include/) -tests/ # GTest suite — levels 0–7 (see tests/CMakeLists.txt) +tests/ # GTest suite — semantic groups (see tests/CMakeLists.txt) examples/ # 13 usage demonstrations (all canonical API) tools/ # Standalone diagnostic/benchmarking executables benchmarks/ # Comparative benchmarks @@ -70,7 +70,7 @@ Both are always produced regardless of `BUILD_SHARED_LIBS`. Tests link against 2. **Two canonical calculators** — `ForwardBackwardCalculator` (log-space, precomputed log-trans) and `ViterbiCalculator`. Both call `getBatchLogProbabilities()` per state per time step. -3. 
**Compile-time SIMD dispatch** — source-distributed; each machine builds for its own CPU. GCC/Clang: `-march=native`. MSVC: `check_cxx_source_runs`-verified `/arch:AVX512`/`AVX2`/`AVX`. All 15 distribution TUs in `LIBHMM_SIMD_SOURCES`. Tier 2 explicit intrinsics: Gaussian + Exponential via `detail::` free functions (extractable to separate TU for future runtime dispatch). +3. **Compile-time SIMD dispatch** — source-distributed; each machine builds for its own CPU. GCC/Clang: `-march=native`. MSVC: `check_cxx_source_runs`-verified `/arch:AVX512`/`AVX2`/`AVX`. All 15 distribution TUs plus transcendental kernels, FB calculator, and BW trainer in `LIBHMM_SIMD_SOURCES`. Tier 2 explicit intrinsics: Gaussian, Exponential, LogNormal, Pareto via `detail::` free functions; recurrence kernels (FB max-reduce, BW xi) via `TranscendentalKernels` in `src/performance/`. Shared vector exp/log helpers in `include/libhmm/performance/simd_kernels_internal.h`. 4. **Thread-safe cache** — `std::atomic cacheValid_` in `DistributionBase`. Avoids mutex; safe for concurrent const reads if the library is invoked from multiple threads (calculators and trainers themselves run single-threaded — see `performance/PERFORMANCE_ARCHITECTURE.md`). @@ -210,6 +210,7 @@ CRLF: `.gitattributes` enforces LF. CRLF warnings on `git add` are normal. - Always run `./scripts/configure_catalina.sh build` for the first configure. - The script sanitizes toolchain-related environment variables, pins AppleClang via `xcrun`, and sets `CMAKE_OSX_DEPLOYMENT_TARGET=10.15`. +- **Build type:** the script defaults to `Release` (`-O3`). This is required for correctness: at `-O0`, AppleClang inserts `VZEROUPPER` in the prologue of large-frame AVX functions before saving the `__m256d` argument, silently zeroing `x[2]` and `x[3]`. For debuggable builds use `RelWithDebInfo` (`-O2 -g`) — SIMD helpers inline at `-O2` so the issue cannot occur: `./scripts/configure_catalina.sh build -DCMAKE_BUILD_TYPE=RelWithDebInfo`. 
Pure `Debug` (`-O0`) is unsafe for any code path that passes `__m256d` through a real call boundary. - Do not point Catalina builds at Homebrew LLVM/libc++ (`/usr/local/opt/llvm`, `Cellar/llvm*`, libc++ include paths). The root `CMakeLists.txt` guard fails configure when those hints are detected. - Use `-DLIBHMM_ALLOW_UNSUPPORTED_CATALINA_HOMEBREW_LIBCXX=ON` only for explicit troubleshooting; runtime stability is not guaranteed. @@ -217,17 +218,18 @@ CRLF: `.gitattributes` enforces LF. CRLF warnings on `git add` are normal. ## Test Suite Structure -Tests in `tests/CMakeLists.txt` use `add_hmm_test()` helper organized into 8 levels: +Tests in `tests/CMakeLists.txt` use `add_hmm_test()` helper organized into semantic groups: -| Level | Content | +| Group | Content | |---|---| -| 1 | Math & Numerics | -| 2 | Linear Algebra | -| 3 | Distributions (all 15 + traits/header/type_safety) | -| 4 | Core HMM | -| 5 | Calculators (canonical + continuous + edge cases) | -| 6 | Trainers (canonical + training + edge cases + BW convergence) | -| 7 | IO + Integration (stream IO + end-to-end casino) | +| Platform Capabilities | simd_platform (compile-time ISA hierarchy invariants + runtime contracts) | +| Math & Numerics | constants, numerical stability, common types | +| Performance Primitives | transcendental kernels (SIMD parity vs `std::exp`) | +| Distributions | all 15 + traits/header/type_safety | +| Core HMM | HMM construction and state management | +| Calculators | canonical + continuous + edge cases + FB mode parity | +| Trainers | canonical + training + edge cases + BW convergence + BW parity | +| IO & Integration | stream IO + end-to-end casino | Custom targets: `check` (correctness, parallel), `check_timing` (serial). Note: named `check` not `run_tests` to avoid cmake's built-in `RUN_TESTS` on Windows. 
diff --git a/benchmark-analysis/fb_contour_sweep_adaptive_static_v1.csv b/benchmark-analysis/fb_contour_sweep_adaptive_static_v1.csv new file mode 100644 index 0000000..52de679 --- /dev/null +++ b/benchmark-analysis/fb_contour_sweep_adaptive_static_v1.csv @@ -0,0 +1,23 @@ +mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms,emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms +adaptive_static_v1,2,1000,5,2,3996,2000,0.0002,0.0006,0.0006,0.0005,0.0555,0.053,0.0001,0.1109 +adaptive_static_v1,2,10000,5,2,39996,20000,0.0007,0.0071,0.0045,0.043,0.3578,0.3551,0,0.7707 +adaptive_static_v1,2,100000,5,2,399996,200000,0.0026,0.1488,0.2834,0.508,3.8598,3.6578,0.0003,9.0083 +adaptive_static_v1,2,1000000,5,2,3999996,2000000,0.0031,2.0429,3.4685,3.7612,36.9812,36.2041,0.0002,82.1594 +adaptive_static_v1,4,1000,5,2,15984,4000,0.001,0.0007,0.0106,0.0154,0.2256,0.2209,0.0001,0.4701 +adaptive_static_v1,4,10000,5,2,159984,40000,0.0018,0.0104,0.014,0.0139,1.4938,1.5459,0.0005,3.0504 +adaptive_static_v1,4,100000,5,2,1599984,400000,0.0036,0.1141,0.58,0.9126,14.5554,14.3194,0.0007,30.568 +adaptive_static_v1,8,1000,5,2,63936,8000,0.0012,0.0024,0.0157,0.0294,0.3975,0.3908,0.0002,0.8399 +adaptive_static_v1,8,5000,5,2,319936,40000,0.0006,0.0022,0.007,0.0059,1.9524,1.9707,0.0002,3.9503 +adaptive_static_v1,8,10000,5,2,639936,80000,0.002,0.0087,0.019,0.2104,3.9859,4.0981,0.0006,8.434 +adaptive_static_v1,16,1000,5,2,255744,16000,0.0024,0.0036,0.0276,0.0427,1.4421,1.4556,0.0005,2.9893 +adaptive_static_v1,16,2000,5,2,511744,32000,0.0015,0.0017,0.0057,0.0056,2.8761,2.9113,0.0005,5.7923 +adaptive_static_v1,16,5000,5,2,1279744,80000,0.0029,0.005,0.0262,0.1948,7.2773,7.3363,0.0007,14.8745 +adaptive_static_v1,32,500,5,2,510976,16000,0.0102,0.0007,0.0276,0.0519,4.0494,4.2193,0.0008,8.3801 +adaptive_static_v1,32,1000,5,2,1022976,32000,0.0134,0.0031,0.044,0.0831,8.221,8.6986,0.001,17.1867 
+adaptive_static_v1,32,2000,5,2,2046976,64000,0.0158,0.0056,0.0887,0.1513,16.2641,16.9673,0.001,33.4698 +adaptive_static_v1,64,200,5,2,815104,12800,0.0268,0.0006,0.0238,0.0412,8.7132,8.7867,0.0017,17.5748 +adaptive_static_v1,64,500,5,2,2043904,32000,0.0417,0.0027,0.0657,0.1169,36.6388,36.9101,0.0019,74.5554 +adaptive_static_v1,64,1000,5,2,4091904,64000,0.0355,0.0045,0.1179,0.1798,45.2402,47.7388,0.0015,93.3553 +adaptive_static_v1,128,100,5,2,1622016,12800,0.0678,0.0005,0.0268,0.0428,21.5884,25.9046,0.0023,50.4003 +adaptive_static_v1,128,250,5,2,4079616,32000,0.0685,0.001,0.0247,0.0602,54.7442,59.1274,0.0025,111.21 +adaptive_static_v1,128,500,5,2,8175616,64000,0.0821,0.0013,0.0333,0.032,115.191,122.896,0.0026,231.18 diff --git a/benchmark-analysis/fb_contour_sweep_max_reduce.csv b/benchmark-analysis/fb_contour_sweep_max_reduce.csv new file mode 100644 index 0000000..716e04d --- /dev/null +++ b/benchmark-analysis/fb_contour_sweep_max_reduce.csv @@ -0,0 +1,23 @@ +mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms,emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms +max_reduce,2,1000,5,2,3996,2000,0.0001,0.0003,0.0004,0.0003,0.0541,0.0557,0,0.1112 +max_reduce,2,10000,5,2,39996,20000,0.0003,0.0033,0.0036,0.0029,0.5451,0.5607,0.0001,1.1176 +max_reduce,2,100000,5,2,399996,200000,0.0024,0.1024,0.292,0.5074,5.9164,5.8783,0.0006,12.7317 +max_reduce,2,1000000,5,2,3999996,2000000,0.0019,1.5644,3.6518,4.0798,61.6187,65.8737,0.0008,138.632 +max_reduce,4,1000,5,2,15984,4000,0.0002,0.0003,0.0072,0.0148,0.1365,0.1401,0.0001,0.3002 +max_reduce,4,10000,5,2,159984,40000,0.0005,0.0036,0.0072,0.0061,1.3655,1.4421,0.0002,2.8389 +max_reduce,4,100000,5,2,1599984,400000,0.0039,0.1803,0.544,0.8251,14.3255,14.7261,0.0007,30.5996 +max_reduce,8,1000,5,2,63936,8000,0.0005,0.0024,0.015,0.0308,0.3906,0.4051,0.0002,0.8435 +max_reduce,8,5000,5,2,319936,40000,0.0015,0.0127,0.0492,0.094,1.9496,2.0359,0.0003,4.1927 
+max_reduce,8,10000,5,2,639936,80000,0.0024,0.0097,0.0191,0.1943,3.9162,4.15,0.0005,8.2942 +max_reduce,16,1000,5,2,255744,16000,0.0012,0.0027,0.0325,0.045,1.4214,1.4575,0.0004,2.963 +max_reduce,16,2000,5,2,511744,32000,0.0018,0.0063,0.0454,0.0944,2.8557,2.9186,0.0006,6.0147 +max_reduce,16,5000,5,2,1279744,80000,0.0036,0.0147,0.1311,0.186,7.0892,7.4272,0.0006,15.147 +max_reduce,32,500,5,2,510976,16000,0.0045,0.0023,0.0257,0.0451,4.0341,4.1987,0.0008,8.3059 +max_reduce,32,1000,5,2,1022976,32000,0.0064,0.0067,0.0439,0.0748,8.1545,8.4885,0.0008,16.8164 +max_reduce,32,2000,5,2,2046976,64000,0.0069,0.0067,0.0793,0.151,16.8425,17.4785,0.0013,35.1039 +max_reduce,64,200,5,2,815104,12800,0.0297,0.0025,0.0322,0.0434,9.1157,9.1911,0.0018,18.3756 +max_reduce,64,500,5,2,2043904,32000,0.0483,0.0029,0.0804,0.1053,27.1055,28.3244,0.0024,55.0267 +max_reduce,64,1000,5,2,4091904,64000,0.0318,0.0042,0.1039,0.1689,62.8022,63.4727,0.0016,120.995 +max_reduce,128,100,5,2,1622016,12800,0.071,0.0007,0.0337,0.0426,21.6621,21.5886,0.0024,43.8249 +max_reduce,128,250,5,2,4079616,32000,0.0696,0.0008,0.0513,0.0852,77.0032,61.7649,0.0023,137.852 +max_reduce,128,500,5,2,8175616,64000,0.0756,0.0031,0.085,0.1356,128.719,119.591,0.0025,243.712 diff --git a/benchmark-analysis/fb_contour_sweep_pairwise.csv b/benchmark-analysis/fb_contour_sweep_pairwise.csv new file mode 100644 index 0000000..bbbac66 --- /dev/null +++ b/benchmark-analysis/fb_contour_sweep_pairwise.csv @@ -0,0 +1,23 @@ +mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms,emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms +pairwise,2,1000,5,1,3996,2000,0.0001,0.0003,0.0004,0.0003,0.0343,0.0336,0.0001,0.0693 +pairwise,2,10000,5,1,39996,20000,0.0001,0.0024,0.0047,0.0023,0.3434,0.3354,0,0.6895 +pairwise,2,100000,5,1,399996,200000,0.001,0.1048,0.2501,0.4206,3.461,3.3926,0.0001,7.6391 +pairwise,2,1000000,5,1,3999996,2000000,0.0049,1.5373,2.8471,3.7466,34.7657,34.3781,0.0004,78.5542 
+pairwise,4,1000,5,1,15984,4000,0.0003,0.0004,0.0101,0.0187,0.2189,0.2153,0.0001,0.4634 +pairwise,4,10000,5,1,159984,40000,0.0019,0.0122,0.0167,0.0218,3.4942,3.2695,0.0002,6.8535 +pairwise,4,100000,5,1,1599984,400000,0.0033,0.1415,0.6652,1.1502,29.2175,26.0248,0.0002,58.7034 +pairwise,8,1000,5,1,63936,8000,0.0005,0.0034,0.0159,0.0316,1.166,1.1765,0.0002,2.3957 +pairwise,8,5000,5,1,319936,40000,0.0016,0.0156,0.052,0.1019,5.8452,5.8658,0.0002,11.8913 +pairwise,8,10000,5,1,639936,80000,0.0022,0.0079,0.0197,0.204,11.6961,11.7406,0.0002,23.715 +pairwise,16,1000,5,1,255744,16000,0.0019,0.0042,0.0326,0.0477,5.3054,5.3313,0.0004,10.7288 +pairwise,16,2000,5,1,511744,32000,0.0033,0.0073,0.0434,0.0883,10.6612,10.8194,0.0005,21.7072 +pairwise,16,5000,5,1,1279744,80000,0.0051,0.0149,0.0966,0.2077,26.5814,26.6937,0.0005,53.6173 +pairwise,32,500,5,1,510976,16000,0.0047,0.0028,0.029,0.044,9.7704,9.8929,0.0006,19.7958 +pairwise,32,1000,5,1,1022976,32000,0.0058,0.0047,0.0453,0.0761,19.5781,19.7934,0.0007,39.505 +pairwise,32,2000,5,1,2046976,64000,0.0064,0.0065,0.0791,0.1424,39.3132,40.2802,0.0008,80.4737 +pairwise,64,200,5,1,815104,12800,0.0311,0.0022,0.0302,0.0409,14.4688,14.2692,0.0014,28.7968 +pairwise,64,500,5,1,2043904,32000,0.0293,0.002,0.0509,0.0823,37.0369,38.7809,0.0014,76.2688 +pairwise,64,1000,5,1,4091904,64000,0.0298,0.0036,0.0765,0.1626,70.9994,71.0655,0.0013,142.836 +pairwise,128,100,5,1,1622016,12800,0.0658,0.0008,0.0361,0.044,27.5451,27.7767,0.002,55.5736 +pairwise,128,250,5,1,4079616,32000,0.0637,0.0008,0.0164,0.0593,66.9222,67.2184,0.002,134.272 +pairwise,128,500,5,1,8175616,64000,0.0677,0.001,0.0482,0.0731,133.704,135.611,0.0023,269.665 diff --git a/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..3432d02 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ 
+mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.3,0.114,0.13 +max_reduce,2,1000,5,2,0.637,0.233,0.252 +max_reduce,2,2000,5,2,1.217,0.467,0.527 +max_reduce,2,5000,5,2,3.092,1.191,1.347 +max_reduce,2,10000,5,2,6.021,2.482,2.443 +max_reduce,2,100000,5,2,63.802,26.135,26.283 +max_reduce,3,500,5,2,0.589,0.234,0.258 +max_reduce,3,1000,5,2,1.107,0.455,0.501 +max_reduce,3,2000,5,2,2.289,0.94,1.034 +max_reduce,3,5000,5,2,5.686,2.326,2.592 +max_reduce,3,10000,5,2,12.027,4.796,5.664 +max_reduce,3,100000,5,2,120.989,49.523,55.446 +max_reduce,4,500,5,2,0.884,0.372,0.416 +max_reduce,4,1000,5,2,1.879,0.792,0.877 +max_reduce,4,2000,5,2,3.776,1.606,1.767 +max_reduce,4,5000,5,2,9.505,4.148,4.381 +max_reduce,4,10000,5,2,19.404,8.402,8.949 +max_reduce,4,100000,5,2,201.829,84.693,96.849 +max_reduce,5,500,5,2,1.317,0.568,0.632 +max_reduce,5,1000,5,2,2.775,1.196,1.337 +max_reduce,5,2000,5,2,5.672,2.391,2.801 +max_reduce,5,5000,5,2,13.83,5.923,6.682 +max_reduce,5,10000,5,2,29.043,12.056,14.445 +max_reduce,5,100000,5,2,291.988,124.124,142.458 +max_reduce,6,500,5,2,1.933,0.836,0.951 +max_reduce,6,1000,5,2,4.947,2.178,2.407 +max_reduce,6,2000,5,2,8.027,3.517,3.891 +max_reduce,6,5000,5,2,19.475,8.439,9.547 +max_reduce,6,10000,5,2,39.116,17.027,19.181 +max_reduce,6,100000,5,2,410.151,176.87,203.052 +max_reduce,7,500,5,2,2.623,1.146,1.304 +max_reduce,7,1000,5,2,5.839,2.317,3.179 +max_reduce,7,2000,5,2,10.765,4.824,5.204 +max_reduce,7,5000,5,2,25.732,11.46,12.566 +max_reduce,7,10000,5,2,53.622,23.214,27.048 +max_reduce,7,100000,5,2,548.109,240.248,271.739 +max_reduce,8,500,5,2,3.935,1.592,2.096 +max_reduce,8,1000,5,2,7.416,3.137,3.887 +max_reduce,8,2000,5,2,13.338,5.863,6.718 +max_reduce,8,5000,5,2,35.927,14.932,19.053 +max_reduce,8,10000,5,2,67.716,29.651,34.379 +max_reduce,8,100000,5,2,707.026,309.823,357.473 diff --git a/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_n2_8.csv 
b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..8096d21 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.217,0.077,0.084 +pairwise,2,1000,5,2,0.412,0.15,0.155 +pairwise,2,2000,5,2,1.195,0.399,0.506 +pairwise,2,5000,5,2,2.078,0.759,0.773 +pairwise,2,10000,5,2,4.231,1.538,1.596 +pairwise,2,100000,5,2,44.74,16.476,17.079 +pairwise,3,500,5,2,0.469,0.185,0.205 +pairwise,3,1000,5,2,0.951,0.389,0.405 +pairwise,3,2000,5,2,1.851,0.775,0.773 +pairwise,3,5000,5,2,4.812,1.993,2.038 +pairwise,3,10000,5,2,9.393,3.795,4.022 +pairwise,3,100000,5,2,97.533,39.481,42.397 +pairwise,4,500,5,2,0.746,0.318,0.332 +pairwise,4,1000,5,2,1.577,0.672,0.702 +pairwise,4,2000,5,2,3.171,1.349,1.417 +pairwise,4,5000,5,2,8.058,3.536,3.523 +pairwise,4,10000,5,2,16.258,6.922,7.335 +pairwise,4,100000,5,2,165.673,71.192,74.499 +pairwise,5,500,5,2,1.113,0.485,0.509 +pairwise,5,1000,5,2,2.436,1.062,1.103 +pairwise,5,2000,5,2,5.02,2.064,2.462 +pairwise,5,5000,5,2,11.962,5.197,5.515 +pairwise,5,10000,5,2,24.438,10.759,11.021 +pairwise,5,100000,5,2,250.178,112.919,111.994 +pairwise,6,500,5,2,1.632,0.726,0.764 +pairwise,6,1000,5,2,3.284,1.456,1.531 +pairwise,6,2000,5,2,6.833,3.051,3.183 +pairwise,6,5000,5,2,16.789,7.384,7.872 +pairwise,6,10000,5,2,34.298,15.829,15.664 +pairwise,6,100000,5,2,348.493,155.326,161.492 +pairwise,7,500,5,2,2.257,1.014,1.038 +pairwise,7,1000,5,2,4.423,1.965,2.116 +pairwise,7,2000,5,2,9.453,3.95,4.715 +pairwise,7,5000,5,2,23.992,10.022,12.256 +pairwise,7,10000,5,2,44.92,20.22,21.249 +pairwise,7,100000,5,2,461.136,210.373,214.594 +pairwise,8,500,5,2,2.928,1.274,1.454 +pairwise,8,1000,5,2,5.612,2.515,2.718 +pairwise,8,2000,5,2,11.229,5.211,5.265 +pairwise,8,5000,5,2,28.531,12.717,13.92 +pairwise,8,10000,5,2,58.541,27.524,27.201 +pairwise,8,100000,5,2,591.284,270.222,280.583 diff 
--git a/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..f65003c --- /dev/null +++ b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.217,0.3,0.7233333333333334,pairwise +2,1000,0.412,0.637,0.6467817896389324,pairwise +2,2000,1.195,1.217,0.9819227608874281,pairwise +2,5000,2.078,3.092,0.6720569210866753,pairwise +2,10000,4.231,6.021,0.7027071914964291,pairwise +2,100000,44.74,63.802,0.7012319363029372,pairwise +3,500,0.469,0.589,0.7962648556876061,pairwise +3,1000,0.951,1.107,0.8590785907859079,pairwise +3,2000,1.851,2.289,0.8086500655307994,pairwise +3,5000,4.812,5.686,0.8462891311994373,pairwise +3,10000,9.393,12.027,0.7809927662758793,pairwise +3,100000,97.533,120.989,0.8061311358883865,pairwise +4,500,0.746,0.884,0.8438914027149321,pairwise +4,1000,1.577,1.879,0.8392762107503992,pairwise +4,2000,3.171,3.776,0.8397775423728814,pairwise +4,5000,8.058,9.505,0.8477643345607574,pairwise +4,10000,16.258,19.404,0.8378684807256236,pairwise +4,100000,165.673,201.829,0.8208582512919352,pairwise +5,500,1.113,1.317,0.8451025056947609,pairwise +5,1000,2.436,2.775,0.8778378378378379,pairwise +5,2000,5.02,5.672,0.885049365303244,pairwise +5,5000,11.962,13.83,0.8649313087490962,pairwise +5,10000,24.438,29.043,0.8414419997934097,pairwise +5,100000,250.178,291.988,0.856809183939066,pairwise +6,500,1.632,1.933,0.8442834971546818,pairwise +6,1000,3.284,4.947,0.6638366686880938,pairwise +6,2000,6.833,8.027,0.8512520244175907,pairwise +6,5000,16.789,19.475,0.8620795892169448,pairwise +6,10000,34.298,39.116,0.8768278965129359,pairwise +6,100000,348.493,410.151,0.8496699995855185,pairwise +7,500,2.257,2.623,0.8604651162790697,pairwise 
+7,1000,4.423,5.839,0.7574927213563966,pairwise +7,2000,9.453,10.765,0.8781235485369251,pairwise +7,5000,23.992,25.732,0.9323799160578269,pairwise +7,10000,44.92,53.622,0.8377158628920965,pairwise +7,100000,461.136,548.109,0.8413217079084635,pairwise +8,500,2.928,3.935,0.7440914866581957,pairwise +8,1000,5.612,7.416,0.7567421790722761,pairwise +8,2000,11.229,13.338,0.8418803418803419,pairwise +8,5000,28.531,35.927,0.7941381133966098,pairwise +8,10000,58.541,67.716,0.864507649595369,pairwise +8,100000,591.284,707.026,0.836297392175111,pairwise diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..1fa2fea --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.125,0.05,0.069 +max_reduce,2,1000,5,2,0.316,0.143,0.156 +max_reduce,2,2000,5,2,0.468,0.199,0.217 +max_reduce,2,5000,5,2,1.115,0.481,0.524 +max_reduce,2,10000,5,2,2.146,0.962,1.051 +max_reduce,2,100000,5,2,22.075,9.8,10.74 +max_reduce,3,500,5,2,0.208,0.093,0.107 +max_reduce,3,1000,5,2,0.435,0.187,0.214 +max_reduce,3,2000,5,2,0.866,0.374,0.428 +max_reduce,3,5000,5,2,2.118,0.96,1.077 +max_reduce,3,10000,5,2,4.226,1.909,2.165 +max_reduce,3,100000,5,2,43.079,18.992,21.896 +max_reduce,4,500,5,2,0.34,0.153,0.178 +max_reduce,4,1000,5,2,0.706,0.311,0.356 +max_reduce,4,2000,5,2,1.408,0.617,0.711 +max_reduce,4,5000,5,2,3.501,1.552,1.793 +max_reduce,4,10000,5,2,6.805,3.084,3.568 +max_reduce,4,100000,5,2,71.122,31.764,36.614 +max_reduce,5,500,5,2,0.522,0.229,0.267 +max_reduce,5,1000,5,2,1.042,0.459,0.535 +max_reduce,5,2000,5,2,2.097,0.922,1.075 +max_reduce,5,5000,5,2,5.247,2.3,2.717 +max_reduce,5,10000,5,2,10.308,4.654,5.474 +max_reduce,5,100000,5,2,105.437,47.128,54.645 +max_reduce,6,500,5,2,0.724,0.318,0.376 
+max_reduce,6,1000,5,2,1.455,0.639,0.756 +max_reduce,6,2000,5,2,2.849,1.276,1.507 +max_reduce,6,5000,5,2,7.09,3.207,3.778 +max_reduce,6,10000,5,2,14.272,6.488,7.566 +max_reduce,6,100000,5,2,146.633,65.236,77.093 +max_reduce,7,500,5,2,0.966,0.427,0.503 +max_reduce,7,1000,5,2,1.923,0.847,1.009 +max_reduce,7,2000,5,2,3.833,1.699,2.016 +max_reduce,7,5000,5,2,9.465,4.275,5.07 +max_reduce,7,10000,5,2,19.148,8.62,10.112 +max_reduce,7,100000,5,2,191.651,86.109,101.104 +max_reduce,8,500,5,2,1.23,0.542,0.649 +max_reduce,8,1000,5,2,2.548,1.09,1.366 +max_reduce,8,2000,5,2,4.963,2.237,2.637 +max_reduce,8,5000,5,2,12.596,5.686,6.769 +max_reduce,8,10000,5,2,25.42,11.105,13.834 +max_reduce,8,100000,5,2,249.409,111.539,132.687 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..05bb3b6 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.078,0.035,0.038 +pairwise,2,1000,5,2,0.156,0.07,0.075 +pairwise,2,2000,5,2,0.339,0.14,0.151 +pairwise,2,5000,5,2,0.827,0.351,0.377 +pairwise,2,10000,5,2,1.551,0.699,0.756 +pairwise,2,100000,5,2,16.659,7.05,8.087 +pairwise,3,500,5,2,0.204,0.095,0.101 +pairwise,3,1000,5,2,0.432,0.194,0.203 +pairwise,3,2000,5,2,0.847,0.378,0.406 +pairwise,3,5000,5,2,2.112,1.006,1.022 +pairwise,3,10000,5,2,4.093,1.909,2.03 +pairwise,3,100000,5,2,57.89,25.89,29.252 +pairwise,4,500,5,2,0.392,0.186,0.197 +pairwise,4,1000,5,2,0.95,0.439,0.455 +pairwise,4,2000,5,2,1.644,0.751,0.79 +pairwise,4,5000,5,2,4.004,1.938,1.957 +pairwise,4,10000,5,2,7.862,3.753,3.95 +pairwise,4,100000,5,2,80.511,37.969,39.852 +pairwise,5,500,5,2,0.643,0.302,0.312 +pairwise,5,1000,5,2,1.289,0.609,0.631 +pairwise,5,2000,5,2,2.574,1.216,1.26 +pairwise,5,5000,5,2,6.444,3.054,3.193 
+pairwise,5,10000,5,2,12.605,6.112,6.312 +pairwise,5,100000,5,2,130.656,62.657,64.69 +pairwise,6,500,5,2,0.945,0.452,0.464 +pairwise,6,1000,5,2,1.89,0.9,0.936 +pairwise,6,2000,5,2,3.749,1.811,1.864 +pairwise,6,5000,5,2,9.492,4.564,4.7 +pairwise,6,10000,5,2,19.026,9.206,9.39 +pairwise,6,100000,5,2,191.567,92.293,95.123 +pairwise,7,500,5,2,1.302,0.627,0.641 +pairwise,7,1000,5,2,2.604,1.258,1.293 +pairwise,7,2000,5,2,5.18,2.529,2.576 +pairwise,7,5000,5,2,13.197,6.366,6.569 +pairwise,7,10000,5,2,25.912,12.639,12.89 +pairwise,7,100000,5,2,266.082,128.95,132.57 +pairwise,8,500,5,2,1.914,0.897,0.957 +pairwise,8,1000,5,2,3.814,1.807,1.886 +pairwise,8,2000,5,2,7.895,3.715,4.004 +pairwise,8,5000,5,2,23.27,9.318,13.555 +pairwise,8,10000,5,2,34.83,16.856,17.516 +pairwise,8,100000,5,2,346.151,169.146,171.958 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..debff59 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.078,0.125,0.624,pairwise +2,1000,0.156,0.316,0.4936708860759494,pairwise +2,2000,0.339,0.468,0.7243589743589743,pairwise +2,5000,0.827,1.115,0.7417040358744394,pairwise +2,10000,1.551,2.146,0.722739981360671,pairwise +2,100000,16.659,22.075,0.7546545866364666,pairwise +3,500,0.204,0.208,0.9807692307692307,pairwise +3,1000,0.432,0.435,0.993103448275862,pairwise +3,2000,0.847,0.866,0.9780600461893765,pairwise +3,5000,2.112,2.118,0.9971671388101984,pairwise +3,10000,4.093,4.226,0.9685281590156176,pairwise +3,100000,57.89,43.079,1.3438102091506303,max_reduce +4,500,0.392,0.34,1.1529411764705881,max_reduce +4,1000,0.95,0.706,1.3456090651558075,max_reduce +4,2000,1.644,1.408,1.1676136363636365,max_reduce 
+4,5000,4.004,3.501,1.1436732362182234,max_reduce +4,10000,7.862,6.805,1.1553269654665688,max_reduce +4,100000,80.511,71.122,1.1320125980709204,max_reduce +5,500,0.643,0.522,1.2318007662835249,max_reduce +5,1000,1.289,1.042,1.2370441458733203,max_reduce +5,2000,2.574,2.097,1.2274678111587982,max_reduce +5,5000,6.444,5.247,1.228130360205832,max_reduce +5,10000,12.605,10.308,1.222836631742336,max_reduce +5,100000,130.656,105.437,1.2391854851712398,max_reduce +6,500,0.945,0.724,1.3052486187845305,max_reduce +6,1000,1.89,1.455,1.2989690721649483,max_reduce +6,2000,3.749,2.849,1.3159003159003158,max_reduce +6,5000,9.492,7.09,1.338787023977433,max_reduce +6,10000,19.026,14.272,1.3330997757847534,max_reduce +6,100000,191.567,146.633,1.3064385233883231,max_reduce +7,500,1.302,0.966,1.3478260869565217,max_reduce +7,1000,2.604,1.923,1.3541341653666146,max_reduce +7,2000,5.18,3.833,1.3514218627706756,max_reduce +7,5000,13.197,9.465,1.3942947702060222,max_reduce +7,10000,25.912,19.148,1.3532483810319615,max_reduce +7,100000,266.082,191.651,1.3883673969872319,max_reduce +8,500,1.914,1.23,1.5560975609756098,max_reduce +8,1000,3.814,2.548,1.4968602825745683,max_reduce +8,2000,7.895,4.963,1.5907717106588755,max_reduce +8,5000,23.27,12.596,1.8474118767862813,max_reduce +8,10000,34.83,25.42,1.3701809598741148,max_reduce +8,100000,346.151,249.409,1.3878849600455478,max_reduce diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..dc747ba --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.153,0.066,0.079 +max_reduce,2,1000,5,2,0.327,0.136,0.161 +max_reduce,2,2000,5,2,0.787,0.293,0.416 +max_reduce,2,5000,5,2,1.602,0.666,0.797 +max_reduce,2,10000,5,2,3.165,1.331,1.587 
+max_reduce,2,100000,5,2,29.774,13.093,14.691 +max_reduce,3,500,5,2,0.317,0.142,0.164 +max_reduce,3,1000,5,2,0.748,0.307,0.392 +max_reduce,3,2000,5,2,1.3,0.57,0.654 +max_reduce,3,5000,5,2,3.266,1.429,1.668 +max_reduce,3,10000,5,2,6.408,2.873,3.304 +max_reduce,3,100000,5,2,62.015,27.423,31.925 +max_reduce,4,500,5,2,0.556,0.264,0.28 +max_reduce,4,1000,5,2,1.142,0.528,0.565 +max_reduce,4,2000,5,2,2.288,1.07,1.123 +max_reduce,4,5000,5,2,5.714,2.681,2.822 +max_reduce,4,10000,5,2,11.323,5.32,5.628 +max_reduce,4,100000,5,2,108.579,51.289,53.479 +max_reduce,5,500,5,2,0.856,0.399,0.429 +max_reduce,5,1000,5,2,1.703,0.787,0.859 +max_reduce,5,2000,5,2,3.421,1.592,1.715 +max_reduce,5,5000,5,2,8.482,3.967,4.274 +max_reduce,5,10000,5,2,16.837,7.948,8.608 +max_reduce,5,100000,5,2,159.391,72.178,83.104 +max_reduce,6,500,5,2,1.18,0.547,0.599 +max_reduce,6,1000,5,2,2.401,1.104,1.21 +max_reduce,6,2000,5,2,4.729,2.196,2.416 +max_reduce,6,5000,5,2,11.973,5.492,6.173 +max_reduce,6,10000,5,2,23.521,11.061,12.136 +max_reduce,6,100000,5,2,218.719,97.497,116.585 +max_reduce,7,500,5,2,1.581,0.734,0.807 +max_reduce,7,1000,5,2,3.159,1.461,1.621 +max_reduce,7,2000,5,2,6.307,2.928,3.267 +max_reduce,7,5000,5,2,15.845,7.306,8.31 +max_reduce,7,10000,5,2,30.936,14.59,15.772 +max_reduce,7,100000,5,2,290.579,129.087,155.833 +max_reduce,8,500,5,2,2.022,0.931,1.044 +max_reduce,8,1000,5,2,4.077,1.876,2.11 +max_reduce,8,2000,5,2,8.136,3.744,4.21 +max_reduce,8,5000,5,2,19.676,9.409,10.057 +max_reduce,8,10000,5,2,39.402,17.655,21.184 +max_reduce,8,100000,5,2,376.802,168.902,201.718 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..14e7274 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms 
+pairwise,2,500,5,2,0.179,0.079,0.087 +pairwise,2,1000,5,2,0.257,0.11,0.116 +pairwise,2,2000,5,2,0.501,0.21,0.232 +pairwise,2,5000,5,2,1.267,0.551,0.589 +pairwise,2,10000,5,2,2.432,1.076,1.161 +pairwise,2,100000,5,2,24.278,10.558,11.428 +pairwise,3,500,5,2,0.34,0.156,0.169 +pairwise,3,1000,5,2,0.693,0.315,0.329 +pairwise,3,2000,5,2,1.432,0.641,0.675 +pairwise,3,5000,5,2,3.512,1.62,1.696 +pairwise,3,10000,5,2,7.273,3.138,3.68 +pairwise,3,100000,5,2,101.009,45.806,50.0 +pairwise,4,500,5,2,0.784,0.366,0.398 +pairwise,4,1000,5,2,1.666,0.786,0.807 +pairwise,4,2000,5,2,2.307,1.094,1.094 +pairwise,4,5000,5,2,5.513,2.485,2.853 +pairwise,4,10000,5,2,10.479,4.846,5.254 +pairwise,4,100000,5,2,103.305,48.905,50.64 +pairwise,5,500,5,2,0.835,0.392,0.414 +pairwise,5,1000,5,2,1.721,0.823,0.841 +pairwise,5,2000,5,2,3.409,1.567,1.723 +pairwise,5,5000,5,2,8.462,3.965,4.233 +pairwise,5,10000,5,2,16.672,7.849,8.367 +pairwise,5,100000,5,2,162.323,76.356,81.557 +pairwise,6,500,5,2,1.215,0.57,0.611 +pairwise,6,1000,5,2,2.418,1.129,1.221 +pairwise,6,2000,5,2,4.971,2.337,2.494 +pairwise,6,5000,5,2,11.924,5.688,6.041 +pairwise,6,10000,5,2,24.001,11.309,12.178 +pairwise,6,100000,5,2,233.534,109.951,118.681 +pairwise,7,500,5,2,1.673,0.783,0.849 +pairwise,7,1000,5,2,3.399,1.618,1.703 +pairwise,7,2000,5,2,6.617,3.116,3.356 +pairwise,7,5000,5,2,16.757,7.873,8.514 +pairwise,7,10000,5,2,33.121,15.73,16.863 +pairwise,7,100000,5,2,330.164,157.671,167.113 +pairwise,8,500,5,2,2.195,1.031,1.119 +pairwise,8,1000,5,2,4.401,2.063,2.23 +pairwise,8,2000,5,2,8.776,4.137,4.437 +pairwise,8,5000,5,2,21.755,10.29,11.116 +pairwise,8,10000,5,2,43.354,20.653,22.13 +pairwise,8,100000,5,2,427.122,203.754,216.973 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..de1ff34 --- /dev/null +++ 
b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.179,0.153,1.1699346405228759,max_reduce +2,1000,0.257,0.327,0.7859327217125383,pairwise +2,2000,0.501,0.787,0.6365946632782719,pairwise +2,5000,1.267,1.602,0.7908863920099874,pairwise +2,10000,2.432,3.165,0.7684044233807267,pairwise +2,100000,24.278,29.774,0.8154094176126822,pairwise +3,500,0.34,0.317,1.0725552050473186,max_reduce +3,1000,0.693,0.748,0.926470588235294,pairwise +3,2000,1.432,1.3,1.1015384615384614,max_reduce +3,5000,3.512,3.266,1.0753214941824862,max_reduce +3,10000,7.273,6.408,1.134987515605493,max_reduce +3,100000,101.009,62.015,1.6287833588647909,max_reduce +4,500,0.784,0.556,1.410071942446043,max_reduce +4,1000,1.666,1.142,1.458844133099825,max_reduce +4,2000,2.307,2.288,1.0083041958041958,max_reduce +4,5000,5.513,5.714,0.964823241162058,pairwise +4,10000,10.479,11.323,0.925461450145721,pairwise +4,100000,103.305,108.579,0.9514270715331695,pairwise +5,500,0.835,0.856,0.9754672897196262,pairwise +5,1000,1.721,1.703,1.010569583088667,max_reduce +5,2000,3.409,3.421,0.9964922537269804,pairwise +5,5000,8.462,8.482,0.9976420655505778,pairwise +5,10000,16.672,16.837,0.9902001544218092,pairwise +5,100000,162.323,159.391,1.0183950160297635,max_reduce +6,500,1.215,1.18,1.0296610169491527,max_reduce +6,1000,2.418,2.401,1.0070803831736779,max_reduce +6,2000,4.971,4.729,1.0511736096426305,max_reduce +6,5000,11.924,11.973,0.995907458448175,pairwise +6,10000,24.001,23.521,1.02040729560818,max_reduce +6,100000,233.534,218.719,1.067735313347263,max_reduce +7,500,1.673,1.581,1.0581910183428211,max_reduce +7,1000,3.399,3.159,1.0759734093067428,max_reduce +7,2000,6.617,6.307,1.0491517361661644,max_reduce +7,5000,16.757,15.845,1.0575575891448408,max_reduce +7,10000,33.121,30.936,1.07062968709594,max_reduce +7,100000,330.164,290.579,1.1362280137243228,max_reduce 
+8,500,2.195,2.022,1.0855588526211672,max_reduce +8,1000,4.401,4.077,1.0794701986754967,max_reduce +8,2000,8.776,8.136,1.0786627335299903,max_reduce +8,5000,21.755,19.676,1.1056617198617607,max_reduce +8,10000,43.354,39.402,1.1002994771838992,max_reduce +8,100000,427.122,376.802,1.133544938721132,max_reduce diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..8c5494e --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.118,0.054,0.058 +max_reduce,2,1000,5,2,0.234,0.108,0.116 +max_reduce,2,2000,5,2,0.495,0.216,0.232 +max_reduce,2,5000,5,2,1.292,0.583,0.597 +max_reduce,2,10000,5,2,2.353,1.093,1.173 +max_reduce,2,100000,5,2,25.272,11.953,11.912 +max_reduce,3,500,5,2,0.358,0.162,0.186 +max_reduce,3,1000,5,2,0.51,0.228,0.253 +max_reduce,3,2000,5,2,1.014,0.457,0.495 +max_reduce,3,5000,5,2,2.415,1.087,1.254 +max_reduce,3,10000,5,2,4.969,2.216,2.596 +max_reduce,3,100000,5,2,51.875,24.161,25.746 +max_reduce,4,500,5,2,0.394,0.179,0.207 +max_reduce,4,1000,5,2,1.051,0.542,0.454 +max_reduce,4,2000,5,2,1.627,0.717,0.832 +max_reduce,4,5000,5,2,4.164,1.878,2.135 +max_reduce,4,10000,5,2,8.044,3.688,4.228 +max_reduce,4,100000,5,2,83.852,39.282,42.178 +max_reduce,5,500,5,2,0.608,0.266,0.311 +max_reduce,5,1000,5,2,1.206,0.538,0.623 +max_reduce,5,2000,5,2,2.385,1.078,1.255 +max_reduce,5,5000,5,2,5.902,2.677,3.119 +max_reduce,5,10000,5,2,11.849,5.404,6.29 +max_reduce,5,100000,5,2,123.388,56.537,63.592 +max_reduce,6,500,5,2,0.847,0.371,0.448 +max_reduce,6,1000,5,2,1.643,0.749,0.876 +max_reduce,6,2000,5,2,3.311,1.484,1.768 +max_reduce,6,5000,5,2,8.231,3.724,4.422 +max_reduce,6,10000,5,2,16.484,7.51,8.799 +max_reduce,6,100000,5,2,177.269,82.83,89.799 
+max_reduce,7,500,5,2,1.106,0.492,0.581 +max_reduce,7,1000,5,2,2.283,1.041,1.176 +max_reduce,7,2000,5,2,4.423,2.035,2.327 +max_reduce,7,5000,5,2,11.124,4.986,5.9 +max_reduce,7,10000,5,2,27.072,12.387,14.291 +max_reduce,7,100000,5,2,232.871,106.143,122.576 +max_reduce,8,500,5,2,1.431,0.641,0.747 +max_reduce,8,1000,5,2,2.831,1.269,1.492 +max_reduce,8,2000,5,2,5.852,2.614,3.102 +max_reduce,8,5000,5,2,14.216,6.443,7.514 +max_reduce,8,10000,5,2,35.189,15.989,18.798 +max_reduce,8,100000,5,2,290.193,134.363,151.274 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..92bf7fd --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.114,0.051,0.053 +pairwise,2,1000,5,2,0.161,0.075,0.077 +pairwise,2,2000,5,2,0.356,0.155,0.154 +pairwise,2,5000,5,2,0.861,0.373,0.383 +pairwise,2,10000,5,2,1.603,0.749,0.771 +pairwise,2,100000,5,2,18.276,8.452,8.228 +pairwise,3,500,5,2,0.2,0.096,0.098 +pairwise,3,1000,5,2,0.419,0.192,0.195 +pairwise,3,2000,5,2,0.844,0.385,0.395 +pairwise,3,5000,5,2,2.013,0.964,0.977 +pairwise,3,10000,5,2,4.233,2.043,2.051 +pairwise,3,100000,5,2,44.618,21.469,21.058 +pairwise,4,500,5,2,0.373,0.182,0.184 +pairwise,4,1000,5,2,0.768,0.363,0.368 +pairwise,4,2000,5,2,1.545,0.727,0.737 +pairwise,4,5000,5,2,3.823,1.825,1.846 +pairwise,4,10000,5,2,7.495,3.659,3.701 +pairwise,4,100000,5,2,79.862,38.657,38.499 +pairwise,5,500,5,2,0.622,0.297,0.3 +pairwise,5,1000,5,2,1.24,0.594,0.6 +pairwise,5,2000,5,2,2.425,1.18,1.192 +pairwise,5,5000,5,2,6.159,2.98,2.994 +pairwise,5,10000,5,2,12.094,5.935,6.006 +pairwise,5,100000,5,2,127.933,62.751,61.402 +pairwise,6,500,5,2,0.91,0.437,0.438 +pairwise,6,1000,5,2,1.985,0.909,0.999 +pairwise,6,2000,5,2,3.654,1.76,1.795 
+pairwise,6,5000,5,2,9.473,4.441,4.815 +pairwise,6,10000,5,2,18.321,9.059,9.083 +pairwise,6,100000,5,2,185.866,91.562,90.741 +pairwise,7,500,5,2,1.226,0.6,0.601 +pairwise,7,1000,5,2,2.486,1.208,1.208 +pairwise,7,2000,5,2,4.909,2.413,2.431 +pairwise,7,5000,5,2,12.285,6.08,6.103 +pairwise,7,10000,5,2,27.202,13.082,13.75 +pairwise,7,100000,5,2,254.356,125.283,125.012 +pairwise,8,500,5,2,1.7,0.852,0.811 +pairwise,8,1000,5,2,3.293,1.6,1.602 +pairwise,8,2000,5,2,6.597,3.26,3.199 +pairwise,8,5000,5,2,16.998,8.052,8.665 +pairwise,8,10000,5,2,34.379,17.394,16.564 +pairwise,8,100000,5,2,335.824,164.916,164.701 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..6268db1 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.114,0.118,0.9661016949152543,pairwise +2,1000,0.161,0.234,0.688034188034188,pairwise +2,2000,0.356,0.495,0.7191919191919192,pairwise +2,5000,0.861,1.292,0.6664086687306501,pairwise +2,10000,1.603,2.353,0.6812579685507861,pairwise +2,100000,18.276,25.272,0.7231718898385565,pairwise +3,500,0.2,0.358,0.558659217877095,pairwise +3,1000,0.419,0.51,0.8215686274509804,pairwise +3,2000,0.844,1.014,0.8323471400394477,pairwise +3,5000,2.013,2.415,0.8335403726708074,pairwise +3,10000,4.233,4.969,0.8518816663312536,pairwise +3,100000,44.618,51.875,0.8601060240963856,pairwise +4,500,0.373,0.394,0.9467005076142132,pairwise +4,1000,0.768,1.051,0.7307326355851571,pairwise +4,2000,1.545,1.627,0.9496004917025199,pairwise +4,5000,3.823,4.164,0.9181075888568685,pairwise +4,10000,7.495,8.044,0.9317503729487817,pairwise +4,100000,79.862,83.852,0.9524161618089013,pairwise +5,500,0.622,0.608,1.0230263157894737,max_reduce 
+5,1000,1.24,1.206,1.0281923714759535,max_reduce +5,2000,2.425,2.385,1.0167714884696017,max_reduce +5,5000,6.159,5.902,1.0435445611657064,max_reduce +5,10000,12.094,11.849,1.0206768503671195,max_reduce +5,100000,127.933,123.388,1.036835024475638,max_reduce +6,500,0.91,0.847,1.0743801652892562,max_reduce +6,1000,1.985,1.643,1.2081558125380403,max_reduce +6,2000,3.654,3.311,1.1035940803382664,max_reduce +6,5000,9.473,8.231,1.1508929656177864,max_reduce +6,10000,18.321,16.484,1.1114413977190003,max_reduce +6,100000,185.866,177.269,1.0484969171146676,max_reduce +7,500,1.226,1.106,1.1084990958408678,max_reduce +7,1000,2.486,2.283,1.088918090232151,max_reduce +7,2000,4.909,4.423,1.1098801718290752,max_reduce +7,5000,12.285,11.124,1.104368932038835,max_reduce +7,10000,27.202,27.072,1.0048020094562649,max_reduce +7,100000,254.356,232.871,1.0922613807644574,max_reduce +8,500,1.7,1.431,1.187980433263452,max_reduce +8,1000,3.293,2.831,1.1631932179441895,max_reduce +8,2000,6.597,5.852,1.127306903622693,max_reduce +8,5000,16.998,14.216,1.195694991558807,max_reduce +8,10000,34.379,35.189,0.976981443064594,pairwise +8,100000,335.824,290.193,1.1572436275168596,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..adf6a25 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.157,0.06,0.065 +max_reduce,2,1000,5,2,0.306,0.118,0.127 +max_reduce,2,2000,5,2,0.607,0.232,0.253 +max_reduce,2,5000,5,2,1.51,0.581,0.629 +max_reduce,2,10000,5,2,3.032,1.167,1.264 +max_reduce,2,100000,5,2,30.532,11.674,12.775 +max_reduce,3,500,5,2,0.272,0.108,0.119 +max_reduce,3,1000,5,2,0.527,0.21,0.231 +max_reduce,3,2000,5,2,1.059,0.421,0.463 +max_reduce,3,5000,5,2,2.653,1.061,1.16 
+max_reduce,3,10000,5,2,5.277,2.101,2.318 +max_reduce,3,100000,5,2,52.996,21.124,23.223 +max_reduce,4,500,5,2,0.416,0.17,0.188 +max_reduce,4,1000,5,2,0.829,0.34,0.377 +max_reduce,4,2000,5,2,1.662,0.682,0.756 +max_reduce,4,5000,5,2,4.139,1.702,1.88 +max_reduce,4,10000,5,2,8.288,3.414,3.759 +max_reduce,4,100000,5,2,83.281,34.132,37.948 +max_reduce,5,500,5,2,0.629,0.262,0.294 +max_reduce,5,1000,5,2,1.26,0.527,0.59 +max_reduce,5,2000,5,2,2.531,1.055,1.188 +max_reduce,5,5000,5,2,6.374,2.674,2.984 +max_reduce,5,10000,5,2,12.622,5.274,5.921 +max_reduce,5,100000,5,2,121.932,51.038,57.119 +max_reduce,6,500,5,2,0.823,0.337,0.403 +max_reduce,6,1000,5,2,1.645,0.676,0.805 +max_reduce,6,2000,5,2,3.302,1.352,1.623 +max_reduce,6,5000,5,2,8.271,3.393,4.062 +max_reduce,6,10000,5,2,16.73,6.933,8.159 +max_reduce,6,100000,5,2,165.653,68.143,81.055 +max_reduce,7,500,5,2,1.085,0.463,0.526 +max_reduce,7,1000,5,2,2.201,0.957,1.054 +max_reduce,7,2000,5,2,4.341,1.853,2.11 +max_reduce,7,5000,5,2,10.845,4.63,5.271 +max_reduce,7,10000,5,2,21.804,9.311,10.598 +max_reduce,7,100000,5,2,218.178,93.182,105.963 +max_reduce,8,500,5,2,1.352,0.579,0.663 +max_reduce,8,1000,5,2,2.667,1.134,1.317 +max_reduce,8,2000,5,2,5.495,2.345,2.705 +max_reduce,8,5000,5,2,13.925,5.966,6.841 +max_reduce,8,10000,5,2,27.723,11.773,13.719 +max_reduce,8,100000,5,2,269.065,114.653,132.759 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..e9afee5 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.112,0.04,0.041 +pairwise,2,1000,5,2,0.223,0.079,0.082 +pairwise,2,2000,5,2,0.442,0.157,0.162 +pairwise,2,5000,5,2,1.098,0.389,0.402 +pairwise,2,10000,5,2,2.217,0.793,0.808 +pairwise,2,100000,5,2,22.163,7.899,8.108 
+pairwise,3,500,5,2,0.263,0.108,0.11 +pairwise,3,1000,5,2,0.587,0.274,0.224 +pairwise,3,2000,5,2,1.05,0.431,0.443 +pairwise,3,5000,5,2,2.624,1.073,1.107 +pairwise,3,10000,5,2,5.266,2.161,2.221 +pairwise,3,100000,5,2,52.696,21.633,22.205 +pairwise,4,500,5,2,0.493,0.215,0.219 +pairwise,4,1000,5,2,0.99,0.433,0.441 +pairwise,4,2000,5,2,1.962,0.855,0.876 +pairwise,4,5000,5,2,4.897,2.138,2.183 +pairwise,4,10000,5,2,10.048,4.513,4.388 +pairwise,4,100000,5,2,104.145,48.626,44.01 +pairwise,5,500,5,2,0.756,0.34,0.347 +pairwise,5,1000,5,2,1.524,0.687,0.7 +pairwise,5,2000,5,2,3.052,1.37,1.406 +pairwise,5,5000,5,2,7.549,3.399,3.466 +pairwise,5,10000,5,2,15.133,6.826,6.936 +pairwise,5,100000,5,2,152.57,68.763,69.956 +pairwise,6,500,5,2,1.084,0.494,0.508 +pairwise,6,1000,5,2,2.17,0.988,1.018 +pairwise,6,2000,5,2,4.353,1.98,2.045 +pairwise,6,5000,5,2,11.523,5.606,5.103 +pairwise,6,10000,5,2,21.77,9.907,10.229 +pairwise,6,100000,5,2,231.107,112.567,102.156 +pairwise,7,500,5,2,1.5,0.695,0.709 +pairwise,7,1000,5,2,2.99,1.389,1.411 +pairwise,7,2000,5,2,5.991,2.776,2.837 +pairwise,7,5000,5,2,15.865,7.84,7.084 +pairwise,7,10000,5,2,30.024,13.946,14.181 +pairwise,7,100000,5,2,300.698,139.566,142.122 +pairwise,8,500,5,2,1.954,0.915,0.93 +pairwise,8,1000,5,2,4.055,1.83,2.009 +pairwise,8,2000,5,2,7.838,3.671,3.737 +pairwise,8,5000,5,2,19.53,9.164,9.296 +pairwise,8,10000,5,2,39.227,18.417,18.664 +pairwise,8,100000,5,2,405.026,184.878,198.534 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..4c3f565 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.112,0.157,0.713375796178344,pairwise +2,1000,0.223,0.306,0.7287581699346406,pairwise 
+2,2000,0.442,0.607,0.728171334431631,pairwise +2,5000,1.098,1.51,0.7271523178807947,pairwise +2,10000,2.217,3.032,0.7312005277044855,pairwise +2,100000,22.163,30.532,0.7258941438490764,pairwise +3,500,0.263,0.272,0.9669117647058824,pairwise +3,1000,0.587,0.527,1.113851992409867,max_reduce +3,2000,1.05,1.059,0.991501416430595,pairwise +3,5000,2.624,2.653,0.9890689785148888,pairwise +3,10000,5.266,5.277,0.9979154822815993,pairwise +3,100000,52.696,52.996,0.9943391954109744,pairwise +4,500,0.493,0.416,1.185096153846154,max_reduce +4,1000,0.99,0.829,1.1942098914354644,max_reduce +4,2000,1.962,1.662,1.180505415162455,max_reduce +4,5000,4.897,4.139,1.1831360231940082,max_reduce +4,10000,10.048,8.288,1.2123552123552124,max_reduce +4,100000,104.145,83.281,1.2505253299071817,max_reduce +5,500,0.756,0.629,1.2019077901430844,max_reduce +5,1000,1.524,1.26,1.2095238095238094,max_reduce +5,2000,3.052,2.531,1.205847491110233,max_reduce +5,5000,7.549,6.374,1.1843426419830563,max_reduce +5,10000,15.133,12.622,1.198938361590873,max_reduce +5,100000,152.57,121.932,1.2512712003411737,max_reduce +6,500,1.084,0.823,1.3171324422843258,max_reduce +6,1000,2.17,1.645,1.3191489361702127,max_reduce +6,2000,4.353,3.302,1.3182919442761962,max_reduce +6,5000,11.523,8.271,1.3931809938338773,max_reduce +6,10000,21.77,16.73,1.301255230125523,max_reduce +6,100000,231.107,165.653,1.3951271634078466,max_reduce +7,500,1.5,1.085,1.3824884792626728,max_reduce +7,1000,2.99,2.201,1.3584734211721945,max_reduce +7,2000,5.991,4.341,1.3800967519004836,max_reduce +7,5000,15.865,10.845,1.46288612263716,max_reduce +7,10000,30.024,21.804,1.3769950467804073,max_reduce +7,100000,300.698,218.178,1.378223285574164,max_reduce +8,500,1.954,1.352,1.445266272189349,max_reduce +8,1000,4.055,2.667,1.520434945631796,max_reduce +8,2000,7.838,5.495,1.4263876251137397,max_reduce +8,5000,19.53,13.925,1.4025134649910234,max_reduce +8,10000,39.227,27.723,1.4149623056667748,max_reduce 
+8,100000,405.026,269.065,1.5053091260476095,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..a595565 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.172,0.072,0.082 +max_reduce,2,1000,5,2,0.355,0.148,0.172 +max_reduce,2,2000,5,2,0.691,0.291,0.33 +max_reduce,2,5000,5,2,1.723,0.72,0.829 +max_reduce,2,10000,5,2,3.506,1.469,1.687 +max_reduce,2,100000,5,2,35.103,14.611,16.945 +max_reduce,3,500,5,2,0.322,0.137,0.161 +max_reduce,3,1000,5,2,0.645,0.274,0.323 +max_reduce,3,2000,5,2,1.302,0.551,0.655 +max_reduce,3,5000,5,2,3.28,1.399,1.64 +max_reduce,3,10000,5,2,6.51,2.767,3.265 +max_reduce,3,100000,5,2,65.988,28.265,32.896 +max_reduce,4,500,5,2,0.53,0.229,0.27 +max_reduce,4,1000,5,2,1.08,0.469,0.548 +max_reduce,4,2000,5,2,2.391,0.921,1.348 +max_reduce,4,5000,5,2,5.339,2.303,2.732 +max_reduce,4,10000,5,2,10.754,4.608,5.531 +max_reduce,4,100000,5,2,120.083,46.97,66.981 +max_reduce,5,500,5,2,0.809,0.346,0.426 +max_reduce,5,1000,5,2,1.61,0.687,0.849 +max_reduce,5,2000,5,2,3.581,1.388,2.041 +max_reduce,5,5000,5,2,8.11,3.443,4.287 +max_reduce,5,10000,5,2,16.216,6.904,8.566 +max_reduce,5,100000,5,2,161.294,69.191,84.669 +max_reduce,6,500,5,2,1.294,0.48,0.769 +max_reduce,6,1000,5,2,2.559,0.975,1.494 +max_reduce,6,2000,5,2,4.475,1.909,2.392 +max_reduce,6,5000,5,2,11.181,4.771,5.976 +max_reduce,6,10000,5,2,22.777,9.697,12.209 +max_reduce,6,100000,5,2,256.489,97.482,150.26 +max_reduce,7,500,5,2,1.434,0.621,0.764 +max_reduce,7,1000,5,2,2.927,1.26,1.568 +max_reduce,7,2000,5,2,5.839,2.524,3.119 +max_reduce,7,5000,5,2,16.801,6.317,9.998 +max_reduce,7,10000,5,2,29.012,12.558,15.477 +max_reduce,7,100000,5,2,291.194,126.161,155.247 +max_reduce,8,500,5,2,2.178,0.824,1.297 
+max_reduce,8,1000,5,2,3.883,1.705,2.061 +max_reduce,8,2000,5,2,7.273,3.19,3.862 +max_reduce,8,5000,5,2,18.49,8.13,9.804 +max_reduce,8,10000,5,2,43.258,16.238,25.909 +max_reduce,8,100000,5,2,369.76,162.365,196.334 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..dbada60 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.134,0.054,0.062 +pairwise,2,1000,5,2,0.263,0.108,0.121 +pairwise,2,2000,5,2,0.542,0.225,0.248 +pairwise,2,5000,5,2,1.337,0.561,0.603 +pairwise,2,10000,5,2,2.705,1.127,1.23 +pairwise,2,100000,5,2,25.302,10.294,11.537 +pairwise,3,500,5,2,0.324,0.145,0.154 +pairwise,3,1000,5,2,0.627,0.281,0.298 +pairwise,3,2000,5,2,1.374,0.614,0.662 +pairwise,3,5000,5,2,3.387,1.525,1.625 +pairwise,3,10000,5,2,6.65,3.023,3.145 +pairwise,3,100000,5,2,62.775,28.061,29.923 +pairwise,4,500,5,2,0.585,0.272,0.282 +pairwise,4,1000,5,2,1.14,0.528,0.551 +pairwise,4,2000,5,2,2.332,1.083,1.128 +pairwise,4,5000,5,2,5.76,2.676,2.781 +pairwise,4,10000,5,2,11.558,5.353,5.595 +pairwise,4,100000,5,2,118.66,55.307,57.106 +pairwise,5,500,5,2,0.878,0.413,0.425 +pairwise,5,1000,5,2,1.755,0.828,0.849 +pairwise,5,2000,5,2,3.683,1.738,1.792 +pairwise,5,5000,5,2,8.791,4.141,4.268 +pairwise,5,10000,5,2,17.636,8.321,8.541 +pairwise,5,100000,5,2,177.332,83.627,86.041 +pairwise,6,500,5,2,1.23,0.584,0.602 +pairwise,6,1000,5,2,2.553,1.215,1.249 +pairwise,6,2000,5,2,4.95,2.352,2.417 +pairwise,6,5000,5,2,12.307,5.823,6.045 +pairwise,6,10000,5,2,25.468,12.236,12.354 +pairwise,6,100000,5,2,251.998,121.44,121.839 +pairwise,7,500,5,2,1.663,0.794,0.817 +pairwise,7,1000,5,2,3.256,1.56,1.595 +pairwise,7,2000,5,2,6.807,3.267,3.338 +pairwise,7,5000,5,2,16.316,7.817,7.992 +pairwise,7,10000,5,2,34.127,16.348,16.78 
+pairwise,7,100000,5,2,329.173,156.951,162.094 +pairwise,8,500,5,2,2.104,1.013,1.033 +pairwise,8,1000,5,2,4.157,2.008,2.033 +pairwise,8,2000,5,2,8.777,4.249,4.295 +pairwise,8,5000,5,2,23.153,11.325,11.202 +pairwise,8,10000,5,2,45.271,21.771,22.223 +pairwise,8,100000,5,2,440.486,209.662,219.102 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..21d6b18 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.134,0.172,0.7790697674418606,pairwise +2,1000,0.263,0.355,0.7408450704225353,pairwise +2,2000,0.542,0.691,0.7843704775687411,pairwise +2,5000,1.337,1.723,0.7759721416134648,pairwise +2,10000,2.705,3.506,0.7715345122646892,pairwise +2,100000,25.302,35.103,0.7207930946072985,pairwise +3,500,0.324,0.322,1.0062111801242235,max_reduce +3,1000,0.627,0.645,0.9720930232558139,pairwise +3,2000,1.374,1.302,1.055299539170507,max_reduce +3,5000,3.387,3.28,1.0326219512195123,max_reduce +3,10000,6.65,6.51,1.021505376344086,max_reduce +3,100000,62.775,65.988,0.9513093289689034,pairwise +4,500,0.585,0.53,1.1037735849056602,max_reduce +4,1000,1.14,1.08,1.0555555555555554,max_reduce +4,2000,2.332,2.391,0.9753241321622751,pairwise +4,5000,5.76,5.339,1.0788537179247049,max_reduce +4,10000,11.558,10.754,1.0747628789287706,max_reduce +4,100000,118.66,120.083,0.988149863011417,pairwise +5,500,0.878,0.809,1.0852904820766378,max_reduce +5,1000,1.755,1.61,1.0900621118012421,max_reduce +5,2000,3.683,3.581,1.0284836637810668,max_reduce +5,5000,8.791,8.11,1.0839704069050555,max_reduce +5,10000,17.636,16.216,1.0875678342377897,max_reduce +5,100000,177.332,161.294,1.0994333329200092,max_reduce +6,500,1.23,1.294,0.9505409582689335,pairwise 
+6,1000,2.553,2.559,0.9976553341148885,pairwise +6,2000,4.95,4.475,1.106145251396648,max_reduce +6,5000,12.307,11.181,1.100706555764243,max_reduce +6,10000,25.468,22.777,1.1181454976511394,max_reduce +6,100000,251.998,256.489,0.9824904771744598,pairwise +7,500,1.663,1.434,1.1596931659693166,max_reduce +7,1000,3.256,2.927,1.1124017765630336,max_reduce +7,2000,6.807,5.839,1.1657818119541017,max_reduce +7,5000,16.316,16.801,0.9711326706743647,pairwise +7,10000,34.127,29.012,1.1763063559906246,max_reduce +7,100000,329.173,291.194,1.1304250774397824,max_reduce +8,500,2.104,2.178,0.9660238751147843,pairwise +8,1000,4.157,3.883,1.070563996909606,max_reduce +8,2000,8.777,7.273,1.2067922452908015,max_reduce +8,5000,23.153,18.49,1.252190373174689,max_reduce +8,10000,45.271,43.258,1.0465347450182625,max_reduce +8,100000,440.486,369.76,1.1912754218952835,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..2a545b4 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.366,0.14,0.152 +max_reduce,2,1000,5,2,0.539,0.207,0.224 +max_reduce,2,2000,5,2,0.859,0.328,0.362 +max_reduce,2,5000,5,2,1.801,0.694,0.751 +max_reduce,2,10000,5,2,3.127,1.205,1.3 +max_reduce,2,100000,5,2,29.167,11.226,12.169 +max_reduce,3,500,5,2,0.293,0.116,0.128 +max_reduce,3,1000,5,2,0.585,0.232,0.257 +max_reduce,3,2000,5,2,1.197,0.482,0.524 +max_reduce,3,5000,5,2,2.818,1.082,1.294 +max_reduce,3,10000,5,2,5.209,2.088,2.266 +max_reduce,3,100000,5,2,51.026,20.294,22.466 +max_reduce,4,500,5,2,0.485,0.189,0.233 +max_reduce,4,1000,5,2,0.924,0.378,0.421 +max_reduce,4,2000,5,2,1.733,0.709,0.792 +max_reduce,4,5000,5,2,4.366,1.708,2.092 +max_reduce,4,10000,5,2,8.138,3.366,3.679 
+max_reduce,4,100000,5,2,84.46,32.998,40.688 +max_reduce,5,500,5,2,0.676,0.282,0.316 +max_reduce,5,1000,5,2,1.316,0.566,0.596 +max_reduce,5,2000,5,2,2.515,1.051,1.18 +max_reduce,5,5000,5,2,6.402,2.499,3.217 +max_reduce,5,10000,5,2,12.69,4.945,6.409 +max_reduce,5,100000,5,2,118.285,49.424,55.571 +max_reduce,6,500,5,2,0.918,0.376,0.45 +max_reduce,6,1000,5,2,1.837,0.754,0.9 +max_reduce,6,2000,5,2,3.412,1.401,1.674 +max_reduce,6,5000,5,2,8.67,3.337,4.541 +max_reduce,6,10000,5,2,15.991,6.585,7.831 +max_reduce,6,100000,5,2,171.928,65.649,90.501 +max_reduce,7,500,5,2,1.206,0.514,0.585 +max_reduce,7,1000,5,2,2.248,0.96,1.088 +max_reduce,7,2000,5,2,4.387,1.895,2.101 +max_reduce,7,5000,5,2,11.492,4.532,6.047 +max_reduce,7,10000,5,2,20.987,8.956,10.201 +max_reduce,7,100000,5,2,211.033,90.182,102.544 +max_reduce,8,500,5,2,1.442,0.551,0.783 +max_reduce,8,1000,5,2,2.769,1.175,1.369 +max_reduce,8,2000,5,2,5.497,2.333,2.711 +max_reduce,8,5000,5,2,13.267,5.666,6.523 +max_reduce,8,10000,5,2,26.494,11.293,13.048 +max_reduce,8,100000,5,2,258.81,110.393,127.621 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..fd6cde2 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.107,0.038,0.039 +pairwise,2,1000,5,2,0.24,0.086,0.088 +pairwise,2,2000,5,2,0.524,0.172,0.219 +pairwise,2,5000,5,2,1.2,0.429,0.439 +pairwise,2,10000,5,2,2.236,0.799,0.816 +pairwise,2,100000,5,2,20.877,7.438,7.663 +pairwise,3,500,5,2,0.287,0.119,0.12 +pairwise,3,1000,5,2,0.574,0.238,0.24 +pairwise,3,2000,5,2,1.143,0.47,0.483 +pairwise,3,5000,5,2,2.65,1.094,1.115 +pairwise,3,10000,5,2,5.094,2.102,2.13 +pairwise,3,100000,5,2,49.585,20.414,20.873 +pairwise,4,500,5,2,0.531,0.233,0.235 
+pairwise,4,1000,5,2,1.063,0.468,0.47 +pairwise,4,2000,5,2,2.078,0.931,0.899 +pairwise,4,5000,5,2,4.841,2.152,2.115 +pairwise,4,10000,5,2,9.547,4.196,4.23 +pairwise,4,100000,5,2,94.874,42.855,41.191 +pairwise,5,500,5,2,0.845,0.381,0.387 +pairwise,5,1000,5,2,1.6,0.717,0.729 +pairwise,5,2000,5,2,3.18,1.461,1.433 +pairwise,5,5000,5,2,7.481,3.401,3.395 +pairwise,5,10000,5,2,14.703,6.683,6.693 +pairwise,5,100000,5,2,149.014,67.846,67.666 +pairwise,6,500,5,2,1.212,0.552,0.568 +pairwise,6,1000,5,2,2.31,1.072,1.067 +pairwise,6,2000,5,2,4.45,2.052,2.061 +pairwise,6,5000,5,2,10.542,4.815,4.934 +pairwise,6,10000,5,2,21.06,9.612,9.871 +pairwise,6,100000,5,2,211.288,96.319,99.171 +pairwise,7,500,5,2,1.551,0.72,0.732 +pairwise,7,1000,5,2,3.13,1.452,1.479 +pairwise,7,2000,5,2,5.981,2.771,2.827 +pairwise,7,5000,5,2,14.508,6.71,6.882 +pairwise,7,10000,5,2,29.166,13.517,13.814 +pairwise,7,100000,5,2,292.368,134.58,139.464 +pairwise,8,500,5,2,2.039,0.959,0.966 +pairwise,8,1000,5,2,3.949,1.86,1.866 +pairwise,8,2000,5,2,7.685,3.592,3.667 +pairwise,8,5000,5,2,18.993,8.878,9.08 +pairwise,8,10000,5,2,37.866,17.756,18.033 +pairwise,8,100000,5,2,379.795,177.795,181.225 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..a67f5fb --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.107,0.366,0.2923497267759563,pairwise +2,1000,0.24,0.539,0.44526901669758806,pairwise +2,2000,0.524,0.859,0.610011641443539,pairwise +2,5000,1.2,1.801,0.6662965019433648,pairwise +2,10000,2.236,3.127,0.7150623600895428,pairwise +2,100000,20.877,29.167,0.7157746768608358,pairwise +3,500,0.287,0.293,0.9795221843003413,pairwise 
+3,1000,0.574,0.585,0.9811965811965812,pairwise +3,2000,1.143,1.197,0.9548872180451128,pairwise +3,5000,2.65,2.818,0.9403832505322923,pairwise +3,10000,5.094,5.209,0.9779228258782877,pairwise +3,100000,49.585,51.026,0.9717594951593305,pairwise +4,500,0.531,0.485,1.0948453608247424,max_reduce +4,1000,1.063,0.924,1.1504329004329004,max_reduce +4,2000,2.078,1.733,1.199076745527986,max_reduce +4,5000,4.841,4.366,1.1087952359138802,max_reduce +4,10000,9.547,8.138,1.17313836323421,max_reduce +4,100000,94.874,84.46,1.1233009708737864,max_reduce +5,500,0.845,0.676,1.2499999999999998,max_reduce +5,1000,1.6,1.316,1.21580547112462,max_reduce +5,2000,3.18,2.515,1.2644135188866799,max_reduce +5,5000,7.481,6.402,1.168541080912215,max_reduce +5,10000,14.703,12.69,1.158628841607565,max_reduce +5,100000,149.014,118.285,1.2597878006509702,max_reduce +6,500,1.212,0.918,1.3202614379084967,max_reduce +6,1000,2.31,1.837,1.25748502994012,max_reduce +6,2000,4.45,3.412,1.3042203985932006,max_reduce +6,5000,10.542,8.67,1.215916955017301,max_reduce +6,10000,21.06,15.991,1.3169908073291225,max_reduce +6,100000,211.288,171.928,1.2289330417384021,max_reduce +7,500,1.551,1.206,1.2860696517412935,max_reduce +7,1000,3.13,2.248,1.3923487544483983,max_reduce +7,2000,5.981,4.387,1.363346250284933,max_reduce +7,5000,14.508,11.492,1.2624434389140269,max_reduce +7,10000,29.166,20.987,1.3897174441320819,max_reduce +7,100000,292.368,211.033,1.385413655684182,max_reduce +8,500,2.039,1.442,1.4140083217753123,max_reduce +8,1000,3.949,2.769,1.4261466233297218,max_reduce +8,2000,7.685,5.497,1.3980352919774421,max_reduce +8,5000,18.993,13.267,1.4315971960503504,max_reduce +8,10000,37.866,26.494,1.429229259454971,max_reduce +8,100000,379.795,258.81,1.4674664812024265,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..2d6616b --- /dev/null +++ 
b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.153,0.058,0.064 +max_reduce,2,1000,5,2,0.313,0.12,0.131 +max_reduce,2,2000,5,2,0.603,0.232,0.252 +max_reduce,2,5000,5,2,1.506,0.579,0.63 +max_reduce,2,10000,5,2,3.016,1.159,1.262 +max_reduce,2,100000,5,2,30.249,11.622,12.676 +max_reduce,3,500,5,2,0.262,0.104,0.115 +max_reduce,3,1000,5,2,0.524,0.208,0.23 +max_reduce,3,2000,5,2,1.087,0.416,0.5 +max_reduce,3,5000,5,2,2.718,1.08,1.196 +max_reduce,3,10000,5,2,5.243,2.081,2.309 +max_reduce,3,100000,5,2,52.645,20.883,23.206 +max_reduce,4,500,5,2,0.414,0.169,0.188 +max_reduce,4,1000,5,2,0.843,0.35,0.381 +max_reduce,4,2000,5,2,1.691,0.695,0.769 +max_reduce,4,5000,5,2,4.138,1.698,1.885 +max_reduce,4,10000,5,2,8.285,3.4,3.772 +max_reduce,4,100000,5,2,83.105,34.108,37.867 +max_reduce,5,500,5,2,0.606,0.253,0.283 +max_reduce,5,1000,5,2,1.212,0.507,0.567 +max_reduce,5,2000,5,2,2.597,1.012,1.311 +max_reduce,5,5000,5,2,6.063,2.539,2.84 +max_reduce,5,10000,5,2,12.146,5.087,5.688 +max_reduce,5,100000,5,2,121.787,50.859,57.182 +max_reduce,6,500,5,2,0.822,0.336,0.403 +max_reduce,6,1000,5,2,1.643,0.673,0.805 +max_reduce,6,2000,5,2,3.294,1.352,1.616 +max_reduce,6,5000,5,2,8.313,3.427,4.067 +max_reduce,6,10000,5,2,16.464,6.742,8.081 +max_reduce,6,100000,5,2,177.789,67.649,93.821 +max_reduce,7,500,5,2,1.178,0.461,0.622 +max_reduce,7,1000,5,2,2.159,0.921,1.048 +max_reduce,7,2000,5,2,4.327,1.849,2.101 +max_reduce,7,5000,5,2,10.82,4.613,5.263 +max_reduce,7,10000,5,2,21.646,9.234,10.525 +max_reduce,7,100000,5,2,236.383,92.557,124.809 +max_reduce,8,500,5,2,1.325,0.563,0.653 +max_reduce,8,1000,5,2,2.954,1.127,1.611 +max_reduce,8,2000,5,2,5.323,2.258,2.635 +max_reduce,8,5000,5,2,13.262,5.66,6.532 +max_reduce,8,10000,5,2,26.559,11.297,13.108 +max_reduce,8,100000,5,2,295.398,113.367,160.473 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_n2_8.csv 
b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..a7eae2c --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.113,0.04,0.041 +pairwise,2,1000,5,2,0.216,0.077,0.079 +pairwise,2,2000,5,2,0.429,0.154,0.157 +pairwise,2,5000,5,2,1.075,0.384,0.393 +pairwise,2,10000,5,2,2.156,0.77,0.788 +pairwise,2,100000,5,2,21.628,7.757,7.854 +pairwise,3,500,5,2,0.266,0.11,0.111 +pairwise,3,1000,5,2,0.565,0.21,0.268 +pairwise,3,2000,5,2,1.019,0.42,0.429 +pairwise,3,5000,5,2,2.578,1.076,1.076 +pairwise,3,10000,5,2,5.162,2.156,2.152 +pairwise,3,100000,5,2,51.556,21.373,21.637 +pairwise,4,500,5,2,0.475,0.208,0.21 +pairwise,4,1000,5,2,0.952,0.42,0.42 +pairwise,4,2000,5,2,1.905,0.839,0.843 +pairwise,4,5000,5,2,4.778,2.098,2.124 +pairwise,4,10000,5,2,9.604,4.197,4.296 +pairwise,4,100000,5,2,99.357,44.892,43.313 +pairwise,5,500,5,2,0.755,0.341,0.345 +pairwise,5,1000,5,2,1.563,0.703,0.717 +pairwise,5,2000,5,2,3.026,1.365,1.387 +pairwise,5,5000,5,2,7.573,3.425,3.465 +pairwise,5,10000,5,2,15.112,6.838,6.904 +pairwise,5,100000,5,2,151.847,68.742,69.363 +pairwise,6,500,5,2,1.116,0.511,0.519 +pairwise,6,1000,5,2,2.189,1.0,1.025 +pairwise,6,2000,5,2,4.356,1.988,2.043 +pairwise,6,5000,5,2,11.326,5.157,5.322 +pairwise,6,10000,5,2,21.75,9.936,10.187 +pairwise,6,100000,5,2,218.774,99.698,102.684 +pairwise,7,500,5,2,1.616,0.717,0.8 +pairwise,7,1000,5,2,3.108,1.444,1.467 +pairwise,7,2000,5,2,6.0,2.784,2.837 +pairwise,7,5000,5,2,15.009,6.961,7.105 +pairwise,7,10000,5,2,30.005,13.959,14.157 +pairwise,7,100000,5,2,300.895,139.938,141.98 +pairwise,8,500,5,2,1.973,0.926,0.938 +pairwise,8,1000,5,2,3.9,1.833,1.852 +pairwise,8,2000,5,2,7.82,3.677,3.714 +pairwise,8,5000,5,2,20.138,9.188,9.879 +pairwise,8,10000,5,2,39.09,18.394,18.547 +pairwise,8,100000,5,2,391.364,183.707,186.116 diff --git 
a/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..bec5986 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.113,0.153,0.738562091503268,pairwise +2,1000,0.216,0.313,0.6900958466453674,pairwise +2,2000,0.429,0.603,0.7114427860696517,pairwise +2,5000,1.075,1.506,0.7138114209827356,pairwise +2,10000,2.156,3.016,0.7148541114058355,pairwise +2,100000,21.628,30.249,0.7149988429369566,pairwise +3,500,0.266,0.262,1.015267175572519,max_reduce +3,1000,0.565,0.524,1.07824427480916,max_reduce +3,2000,1.019,1.087,0.937442502299908,pairwise +3,5000,2.578,2.718,0.9484915378955113,pairwise +3,10000,5.162,5.243,0.9845508296776654,pairwise +3,100000,51.556,52.645,0.9793142748599106,pairwise +4,500,0.475,0.414,1.1473429951690821,max_reduce +4,1000,0.952,0.843,1.129300118623962,max_reduce +4,2000,1.905,1.691,1.1265523358959195,max_reduce +4,5000,4.778,4.138,1.1546640889318511,max_reduce +4,10000,9.604,8.285,1.1592033796016896,max_reduce +4,100000,99.357,83.105,1.1955598339450093,max_reduce +5,500,0.755,0.606,1.245874587458746,max_reduce +5,1000,1.563,1.212,1.2896039603960396,max_reduce +5,2000,3.026,2.597,1.1651906045437042,max_reduce +5,5000,7.573,6.063,1.24905162460828,max_reduce +5,10000,15.112,12.146,1.2441956199571875,max_reduce +5,100000,151.847,121.787,1.2468243737016267,max_reduce +6,500,1.116,0.822,1.3576642335766425,max_reduce +6,1000,2.189,1.643,1.332318928788801,max_reduce +6,2000,4.356,3.294,1.3224043715846994,max_reduce +6,5000,11.326,8.313,1.362444364248767,max_reduce +6,10000,21.75,16.464,1.3210641399416911,max_reduce +6,100000,218.774,177.789,1.2305260730416394,max_reduce +7,500,1.616,1.178,1.371816638370119,max_reduce +7,1000,3.108,2.159,1.4395553496989348,max_reduce 
+7,2000,6.0,4.327,1.386642015253062,max_reduce +7,5000,15.009,10.82,1.3871534195933457,max_reduce +7,10000,30.005,21.646,1.3861683451907973,max_reduce +7,100000,300.895,236.383,1.2729130267405016,max_reduce +8,500,1.973,1.325,1.489056603773585,max_reduce +8,1000,3.9,2.954,1.3202437373053486,max_reduce +8,2000,7.82,5.323,1.469096374225061,max_reduce +8,5000,20.138,13.262,1.5184738350173428,max_reduce +8,10000,39.09,26.559,1.4718174630068903,max_reduce +8,100000,391.364,295.398,1.3248701751535215,max_reduce diff --git a/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..d795131 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.277,0.112,0.118 +max_reduce,2,1000,5,2,0.54,0.216,0.235 +max_reduce,2,2000,5,2,1.087,0.432,0.471 +max_reduce,2,5000,5,2,2.706,1.084,1.179 +max_reduce,2,10000,5,2,5.423,2.175,2.373 +max_reduce,2,100000,5,2,56.385,22.584,24.996 +max_reduce,3,500,5,2,0.54,0.227,0.249 +max_reduce,3,1000,5,2,1.079,0.455,0.5 +max_reduce,3,2000,5,2,2.161,0.911,1.005 +max_reduce,3,5000,5,2,5.399,2.277,2.511 +max_reduce,3,10000,5,2,11.384,5.132,5.036 +max_reduce,3,100000,5,2,114.188,49.074,52.8 +max_reduce,4,500,5,2,0.904,0.394,0.427 +max_reduce,4,1000,5,2,1.823,0.791,0.872 +max_reduce,4,2000,5,2,3.616,1.585,1.715 +max_reduce,4,5000,5,2,9.206,3.862,4.558 +max_reduce,4,10000,5,2,19.364,8.678,9.123 +max_reduce,4,100000,5,2,188.534,82.793,90.003 +max_reduce,5,500,5,2,1.314,0.571,0.644 +max_reduce,5,1000,5,2,2.692,1.182,1.297 +max_reduce,5,2000,5,2,5.385,2.363,2.636 +max_reduce,5,5000,5,2,14.302,5.998,7.27 +max_reduce,5,10000,5,2,26.999,11.654,13.431 +max_reduce,5,100000,5,2,281.104,123.921,137.218 +max_reduce,6,500,5,2,1.837,0.802,0.914 +max_reduce,6,1000,5,2,3.734,1.613,1.864 
+max_reduce,6,2000,5,2,7.391,3.227,3.708 +max_reduce,6,5000,5,2,19.422,8.992,9.299 +max_reduce,6,10000,5,2,38.488,16.513,19.652 +max_reduce,6,100000,5,2,388.776,169.478,195.388 +max_reduce,7,500,5,2,2.674,1.095,1.388 +max_reduce,7,1000,5,2,5.001,2.25,2.482 +max_reduce,7,2000,5,2,9.926,4.381,5.02 +max_reduce,7,5000,5,2,25.743,11.491,12.949 +max_reduce,7,10000,5,2,51.916,23.076,26.213 +max_reduce,7,100000,5,2,520.842,230.772,262.623 +max_reduce,8,500,5,2,3.146,1.413,1.579 +max_reduce,8,1000,5,2,6.216,2.756,3.158 +max_reduce,8,2000,5,2,12.468,5.52,6.352 +max_reduce,8,5000,5,2,32.278,14.519,16.279 +max_reduce,8,10000,5,2,65.428,29.275,33.206 +max_reduce,8,100000,5,2,654.927,289.674,334.606 diff --git a/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..84e41f7 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.221,0.087,0.089 +pairwise,2,1000,5,2,0.44,0.173,0.178 +pairwise,2,2000,5,2,0.882,0.347,0.357 +pairwise,2,5000,5,2,2.207,0.867,0.897 +pairwise,2,10000,5,2,4.447,1.774,1.798 +pairwise,2,100000,5,2,45.713,17.999,18.851 +pairwise,3,500,5,2,0.507,0.219,0.224 +pairwise,3,1000,5,2,1.012,0.438,0.449 +pairwise,3,2000,5,2,2.023,0.877,0.899 +pairwise,3,5000,5,2,5.322,2.225,2.415 +pairwise,3,10000,5,2,10.175,4.392,4.568 +pairwise,3,100000,5,2,105.112,45.665,46.981 +pairwise,4,500,5,2,0.872,0.391,0.398 +pairwise,4,1000,5,2,1.747,0.785,0.802 +pairwise,4,2000,5,2,3.558,1.629,1.611 +pairwise,4,5000,5,2,9.266,4.386,4.086 +pairwise,4,10000,5,2,18.341,8.003,8.771 +pairwise,4,100000,5,2,181.699,82.067,83.36 +pairwise,5,500,5,2,1.325,0.61,0.615 +pairwise,5,1000,5,2,2.65,1.226,1.23 +pairwise,5,2000,5,2,5.339,2.45,2.502 +pairwise,5,5000,5,2,13.325,6.161,6.206 +pairwise,5,10000,5,2,27.926,13.005,13.014 
+pairwise,5,100000,5,2,279.625,128.97,129.897 +pairwise,6,500,5,2,1.86,0.862,0.879 +pairwise,6,1000,5,2,3.774,1.726,1.79 +pairwise,6,2000,5,2,7.444,3.471,3.518 +pairwise,6,5000,5,2,19.162,8.772,9.256 +pairwise,6,10000,5,2,38.646,18.23,18.16 +pairwise,6,100000,5,2,392.471,182.592,186.207 +pairwise,7,500,5,2,2.489,1.167,1.186 +pairwise,7,1000,5,2,5.027,2.376,2.383 +pairwise,7,2000,5,2,10.012,4.727,4.758 +pairwise,7,5000,5,2,25.914,12.402,12.192 +pairwise,7,10000,5,2,52.01,24.442,24.963 +pairwise,7,100000,5,2,521.582,244.542,249.374 +pairwise,8,500,5,2,3.23,1.504,1.532 +pairwise,8,1000,5,2,6.377,3.018,3.058 +pairwise,8,2000,5,2,13.28,6.027,6.64 +pairwise,8,5000,5,2,33.573,15.734,16.363 +pairwise,8,10000,5,2,66.948,31.635,32.28 +pairwise,8,100000,5,2,670.26,316.36,322.499 diff --git a/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..1b37cf8 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.221,0.277,0.7978339350180504,pairwise +2,1000,0.44,0.54,0.8148148148148148,pairwise +2,2000,0.882,1.087,0.8114075436982521,pairwise +2,5000,2.207,2.706,0.8155949741315595,pairwise +2,10000,4.447,5.423,0.8200258159690208,pairwise +2,100000,45.713,56.385,0.8107298040258935,pairwise +3,500,0.507,0.54,0.9388888888888889,pairwise +3,1000,1.012,1.079,0.93790546802595,pairwise +3,2000,2.023,2.161,0.9361406756131421,pairwise +3,5000,5.322,5.399,0.985738099648083,pairwise +3,10000,10.175,11.384,0.8937983134223472,pairwise +3,100000,105.112,114.188,0.9205170420709706,pairwise +4,500,0.872,0.904,0.9646017699115044,pairwise +4,1000,1.747,1.823,0.9583104772353265,pairwise +4,2000,3.558,3.616,0.9839601769911503,pairwise +4,5000,9.266,9.206,1.006517488594395,max_reduce 
+4,10000,18.341,19.364,0.9471700061970667,pairwise +4,100000,181.699,188.534,0.9637465921266192,pairwise +5,500,1.325,1.314,1.0083713850837137,max_reduce +5,1000,2.65,2.692,0.9843982169390787,pairwise +5,2000,5.339,5.385,0.9914577530176417,pairwise +5,5000,13.325,14.302,0.9316878758215634,pairwise +5,10000,27.926,26.999,1.0343346049853699,max_reduce +5,100000,279.625,281.104,0.9947386020832149,pairwise +6,500,1.86,1.837,1.0125204137180186,max_reduce +6,1000,3.774,3.734,1.0107123727905731,max_reduce +6,2000,7.444,7.391,1.007170883506968,max_reduce +6,5000,19.162,19.422,0.9866131191432396,pairwise +6,10000,38.646,38.488,1.0041051756391604,max_reduce +6,100000,392.471,388.776,1.009504187501286,max_reduce +7,500,2.489,2.674,0.9308152580403889,pairwise +7,1000,5.027,5.001,1.0051989602079583,max_reduce +7,2000,10.012,9.926,1.008664114446907,max_reduce +7,5000,25.914,25.743,1.0066425824495981,max_reduce +7,10000,52.01,51.916,1.0018106171507821,max_reduce +7,100000,521.582,520.842,1.0014207763582814,max_reduce +8,500,3.23,3.146,1.0267005721551177,max_reduce +8,1000,6.377,6.216,1.0259009009009008,max_reduce +8,2000,13.28,12.468,1.065126724414501,max_reduce +8,5000,33.573,32.278,1.0401202057128696,max_reduce +8,10000,66.948,65.428,1.0232316439444886,max_reduce +8,100000,670.26,654.927,1.0234117695559963,max_reduce diff --git a/benchmark-analysis/focus-n2-8/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..ee91af5 --- /dev/null +++ b/benchmark-analysis/focus-n2-8/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +"mode","n","t","runs","warmup","fb_total_ms","forward_ms","backward_ms" +"max_reduce","2","500","5","2","0.035","0.017","0.017" +"max_reduce","2","1000","5","2","0.105","0.051","0.052" +"max_reduce","2","2000","5","2","0.166","0.069","0.07" +"max_reduce","2","5000","5","2","0.407","0.171","0.174" +"max_reduce","2","10000","5","2","0.808","0.343","0.427" 
+"max_reduce","2","100000","5","2","7.75","3.437","3.498" +"max_reduce","3","500","5","2","0.064","0.032","0.031" +"max_reduce","3","1000","5","2","0.144","0.063","0.063" +"max_reduce","3","2000","5","2","0.405","0.211","0.136" +"max_reduce","3","5000","5","2","0.716","0.318","0.316" +"max_reduce","3","10000","5","2","1.278","0.634","0.631" +"max_reduce","3","100000","5","2","14.657","6.589","6.891" +"max_reduce","4","500","5","2","0.098","0.048","0.049" +"max_reduce","4","1000","5","2","0.221","0.095","0.099" +"max_reduce","4","2000","5","2","0.652","0.286","0.294" +"max_reduce","4","5000","5","2","1.644","0.745","0.765" +"max_reduce","4","10000","5","2","2.025","1.004","0.996" +"max_reduce","4","100000","5","2","21.311","9.707","10.107" +"max_reduce","5","500","5","2","0.169","0.074","0.076" +"max_reduce","5","1000","5","2","0.332","0.149","0.153" +"max_reduce","5","2000","5","2","0.694","0.302","0.304" +"max_reduce","5","5000","5","2","1.631","0.744","0.828" +"max_reduce","5","10000","5","2","3.227","1.632","1.558" +"max_reduce","5","100000","5","2","31.91","14.943","15.307" +"max_reduce","6","500","5","2","0.219","0.099","0.101" +"max_reduce","6","1000","5","2","0.438","0.198","0.201" +"max_reduce","6","2000","5","2","0.877","0.398","0.41" +"max_reduce","6","5000","5","2","2.195","0.995","1.059" +"max_reduce","6","10000","5","2","4.084","1.99","2.053" +"max_reduce","6","100000","5","2","44.258","20.539","21.621" +"max_reduce","7","500","5","2","0.281","0.128","0.131" +"max_reduce","7","1000","5","2","0.567","0.257","0.264" +"max_reduce","7","2000","5","2","1.127","0.518","0.532" +"max_reduce","7","5000","5","2","2.763","1.286","1.32" +"max_reduce","7","10000","5","2","5.572","2.629","2.765" +"max_reduce","7","100000","5","2","56.326","26.534","27.534" +"max_reduce","8","500","5","2","0.341","0.158","0.16" +"max_reduce","8","1000","5","2","0.687","0.316","0.32" +"max_reduce","8","2000","5","2","1.35","0.633","0.642" 
+"max_reduce","8","5000","5","2","3.369","1.588","1.619" +"max_reduce","8","10000","5","2","6.713","3.211","3.308" +"max_reduce","8","100000","5","2","67.659","32.351","32.64" diff --git a/benchmark-analysis/focus-n2-8/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..03aeaf9 --- /dev/null +++ b/benchmark-analysis/focus-n2-8/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +"mode","n","t","runs","warmup","fb_total_ms","forward_ms","backward_ms" +"pairwise","2","500","5","2","0.059","0.03","0.028" +"pairwise","2","1000","5","2","0.071","0.036","0.034" +"pairwise","2","2000","5","2","0.17","0.073","0.067" +"pairwise","2","5000","5","2","0.392","0.181","0.169" +"pairwise","2","10000","5","2","0.705","0.36","0.337" +"pairwise","2","100000","5","2","7.878","3.656","3.413" +"pairwise","3","500","5","2","0.096","0.049","0.046" +"pairwise","3","1000","5","2","0.31","0.146","0.139" +"pairwise","3","2000","5","2","0.421","0.194","0.186" +"pairwise","3","5000","5","2","0.984","0.493","0.464" +"pairwise","3","10000","5","2","1.982","1.017","0.949" +"pairwise","3","100000","5","2","20.358","9.852","9.412" +"pairwise","4","500","5","2","0.216","0.108","0.107" +"pairwise","4","1000","5","2","0.456","0.217","0.214" +"pairwise","4","2000","5","2","0.992","0.468","0.47" +"pairwise","4","5000","5","2","2.281","1.092","1.074" +"pairwise","4","10000","5","2","4.358","2.184","2.153" +"pairwise","4","100000","5","2","45.618","22.369","21.551" +"pairwise","5","500","5","2","0.401","0.194","0.191" +"pairwise","5","1000","5","2","0.799","0.387","0.382" +"pairwise","5","2000","5","2","1.613","0.78","0.765" +"pairwise","5","5000","5","2","3.969","1.946","1.921" +"pairwise","5","10000","5","2","7.789","3.918","3.839" +"pairwise","5","100000","5","2","79.753","39.361","38.679" +"pairwise","6","500","5","2","0.939","0.452","0.447" +"pairwise","6","1000","5","2","1.411","0.698","0.705" 
+"pairwise","6","2000","5","2","2.9","1.411","1.45" +"pairwise","6","5000","5","2","7.532","4.059","3.241" +"pairwise","6","10000","5","2","14.834","8.057","6.694" +"pairwise","6","100000","5","2","124.796","61.695","60.95" +"pairwise","7","500","5","2","0.89","0.434","0.436" +"pairwise","7","1000","5","2","1.76","0.87","0.862" +"pairwise","7","2000","5","2","3.5","1.739","1.735" +"pairwise","7","5000","5","2","8.758","4.39","4.341" +"pairwise","7","10000","5","2","17.708","8.771","8.752" +"pairwise","7","100000","5","2","178.154","88.417","87.337" +"pairwise","8","500","5","2","1.199","0.588","0.587" +"pairwise","8","1000","5","2","2.464","1.2","1.214" +"pairwise","8","2000","5","2","4.8","2.362","2.346" +"pairwise","8","5000","5","2","11.938","5.899","5.871" +"pairwise","8","10000","5","2","23.908","11.86","11.807" +"pairwise","8","100000","5","2","241.353","119.882","118.472" diff --git a/benchmark-analysis/focus-n2-8/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..f5681e4 --- /dev/null +++ b/benchmark-analysis/focus-n2-8/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +"n","t","pairwise_fb_total_ms","max_reduce_fb_total_ms","speedup_max_over_pair","winner" +"2","500","0.059","0.035","1.6857142857142855","max_reduce" +"2","1000","0.071","0.105","0.6761904761904761","pairwise" +"2","2000","0.17","0.166","1.0240963855421688","max_reduce" +"2","5000","0.392","0.407","0.9631449631449632","pairwise" +"2","10000","0.705","0.808","0.8725247524752474","pairwise" +"2","100000","7.878","7.75","1.0165161290322582","max_reduce" +"3","500","0.096","0.064","1.5","max_reduce" +"3","1000","0.31","0.144","2.152777777777778","max_reduce" +"3","2000","0.421","0.405","1.039506172839506","max_reduce" +"3","5000","0.984","0.716","1.3743016759776536","max_reduce" +"3","10000","1.982","1.278","1.5508607198748043","max_reduce" 
+"3","100000","20.358","14.657","1.3889609060517158","max_reduce" +"4","500","0.216","0.098","2.204081632653061","max_reduce" +"4","1000","0.456","0.221","2.063348416289593","max_reduce" +"4","2000","0.992","0.652","1.5214723926380367","max_reduce" +"4","5000","2.281","1.644","1.387469586374696","max_reduce" +"4","10000","4.358","2.025","2.1520987654320987","max_reduce" +"4","100000","45.618","21.311","2.1405846745812025","max_reduce" +"5","500","0.401","0.169","2.3727810650887573","max_reduce" +"5","1000","0.799","0.332","2.4066265060240966","max_reduce" +"5","2000","1.613","0.694","2.3242074927953893","max_reduce" +"5","5000","3.969","1.631","2.4334763948497855","max_reduce" +"5","10000","7.789","3.227","2.41369693213511","max_reduce" +"5","100000","79.753","31.91","2.4993105609526793","max_reduce" +"6","500","0.939","0.219","4.287671232876712","max_reduce" +"6","1000","1.411","0.438","3.221461187214612","max_reduce" +"6","2000","2.9","0.877","3.30672748004561","max_reduce" +"6","5000","7.532","2.195","3.431435079726652","max_reduce" +"6","10000","14.834","4.084","3.632223310479922","max_reduce" +"6","100000","124.796","44.258","2.819738804283971","max_reduce" +"7","500","0.89","0.281","3.167259786476868","max_reduce" +"7","1000","1.76","0.567","3.104056437389771","max_reduce" +"7","2000","3.5","1.127","3.1055900621118013","max_reduce" +"7","5000","8.758","2.763","3.169743032935215","max_reduce" +"7","10000","17.708","5.572","3.1780330222541275","max_reduce" +"7","100000","178.154","56.326","3.1629087810247487","max_reduce" +"8","500","1.199","0.341","3.5161290322580645","max_reduce" +"8","1000","2.464","0.687","3.5866084425036386","max_reduce" +"8","2000","4.8","1.35","3.5555555555555554","max_reduce" +"8","5000","11.938","3.369","3.543484713564856","max_reduce" +"8","10000","23.908","6.713","3.5614479368389693","max_reduce" +"8","100000","241.353","67.659","3.5671972686560545","max_reduce" diff --git 
a/benchmark-analysis/high-n-ventura-kabylake/high_n_max_reduce.csv b/benchmark-analysis/high-n-ventura-kabylake/high_n_max_reduce.csv new file mode 100644 index 0000000..acd0131 --- /dev/null +++ b/benchmark-analysis/high-n-ventura-kabylake/high_n_max_reduce.csv @@ -0,0 +1,13 @@ +mode,n,t,runs,warmup,fb_total_ms +max_reduce,16,1000,5,2,25.934 +max_reduce,16,2000,5,2,50.027 +max_reduce,16,5000,5,2,123.155 +max_reduce,32,500,5,2,44.255 +max_reduce,32,1000,5,2,88.512 +max_reduce,32,2000,5,2,177.733 +max_reduce,64,200,5,2,64.2 +max_reduce,64,500,5,2,161.063 +max_reduce,64,1000,5,2,325.204 +max_reduce,128,100,5,2,121.246 +max_reduce,128,250,5,2,302.313 +max_reduce,128,500,5,2,612.757 diff --git a/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise.csv b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise.csv new file mode 100644 index 0000000..e3e9dff --- /dev/null +++ b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise.csv @@ -0,0 +1,13 @@ +mode,n,t,runs,warmup,fb_total_ms +pairwise,16,1000,5,2,25.675 +pairwise,16,2000,5,2,51.435 +pairwise,16,5000,5,2,126.996 +pairwise,32,500,5,2,43.501 +pairwise,32,1000,5,2,86.664 +pairwise,32,2000,5,2,173.84 +pairwise,64,200,5,2,56.837 +pairwise,64,500,5,2,145.821 +pairwise,64,1000,5,2,283.18 +pairwise,128,100,5,2,102.735 +pairwise,128,250,5,2,250.751 +pairwise,128,500,5,2,505.429 diff --git a/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise_vs_max_reduce.csv b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise_vs_max_reduce.csv new file mode 100644 index 0000000..b374ec2 --- /dev/null +++ b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise_vs_max_reduce.csv @@ -0,0 +1,13 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +16,1000,25.675,25.934,0.9900131102028226,pairwise +16,2000,51.435,50.027,1.0281448018070243,max_reduce +16,5000,126.996,123.155,1.031188339896878,max_reduce +32,500,43.501,44.255,0.9829623771325273,pairwise 
+32,1000,86.664,88.512,0.97912147505423,pairwise +32,2000,173.84,177.733,0.9780963580201764,pairwise +64,200,56.837,64.2,0.8853115264797508,pairwise +64,500,145.821,161.063,0.9053662231549146,pairwise +64,1000,283.18,325.204,0.8707764972140564,pairwise +128,100,102.735,121.246,0.8473269221252661,pairwise +128,250,250.751,302.313,0.8294416713803244,pairwise +128,500,505.429,612.757,0.8248441062280807,pairwise diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/adaptive_passes.csv new file mode 100644 index 0000000..8fe2aab --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_adaptive_o2,1,9979.5,30481.2,3.05 +clangcl_adaptive_o2,2,9192.6,27960.2,3.04 +clangcl_adaptive_o2,3,10620.8,30674.7,2.89 +clangcl_adaptive_o2,4,10261.2,30457.3,2.97 +clangcl_adaptive_o2,5,10377.6,30265.0,2.92 +clangcl_adaptive_o2,6,10339.4,30766.2,2.98 +clangcl_adaptive_o2,7,10430.0,30559.7,2.93 +clangcl_adaptive_o2,8,7184.8,25793.7,3.59 +clangcl_adaptive_o2,9,9890.9,30525.4,3.09 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/control_passes.csv new file mode 100644 index 0000000..a21d418 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_control_o2,1,8844.3,28803.3,3.26 +clangcl_control_o2,2,10440.4,30681.8,2.94 +clangcl_control_o2,3,10607.2,30760.2,2.9 +clangcl_control_o2,4,10244.6,30830.2,3.01 +clangcl_control_o2,5,10492.5,30586.3,2.92 +clangcl_control_o2,6,10371.1,30365.2,2.93 +clangcl_control_o2,7,10235.7,30156.6,2.95 +clangcl_control_o2,8,10331.6,30036.8,2.91 
+clangcl_control_o2,9,10265.7,30875.1,3.01 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/adaptive_passes.csv new file mode 100644 index 0000000..afdb795 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_adaptive,1,4413.6,5817.9,1.32 +clangcl_adaptive,2,4311.0,5602.0,1.3 +clangcl_adaptive,3,4557.1,5949.2,1.31 +clangcl_adaptive,4,4674.8,5959.4,1.27 +clangcl_adaptive,5,4749.7,5995.5,1.26 +clangcl_adaptive,6,4652.7,6016.9,1.29 +clangcl_adaptive,7,4632.0,5938.7,1.28 +clangcl_adaptive,8,4641.3,6016.9,1.3 +clangcl_adaptive,9,4661.4,6073.8,1.3 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/control_passes.csv new file mode 100644 index 0000000..5f30d9a --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_control,1,4641.1,5795.5,1.25 +clangcl_control,2,4659.8,5948.7,1.28 +clangcl_control,3,4593.9,5817.9,1.27 +clangcl_control,4,4690.3,6095.3,1.3 +clangcl_control,5,4628.5,5979.6,1.29 +clangcl_control,6,4634.2,5999.1,1.29 +clangcl_control,7,4627.3,5894.7,1.27 +clangcl_control,8,4050.7,5181.0,1.28 +clangcl_control,9,4826.3,5919.6,1.23 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/adaptive_passes.csv new file mode 100644 index 0000000..2f32b45 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +mingw_adaptive,1,10078.5,34151.8,3.39 
+mingw_adaptive,2,8781.1,29842.7,3.4 +mingw_adaptive,3,9702.9,33915.3,3.5 +mingw_adaptive,4,10226.5,34044.0,3.33 +mingw_adaptive,5,9529.7,32876.4,3.45 +mingw_adaptive,6,10208.4,34532.0,3.38 +mingw_adaptive,7,10291.1,34420.4,3.34 +mingw_adaptive,8,10247.6,34227.6,3.34 +mingw_adaptive,9,10227.4,34389.8,3.36 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/control_passes.csv new file mode 100644 index 0000000..f4a88a1 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +mingw_control,1,9954.9,33594.9,3.37 +mingw_control,2,8793.8,31930.7,3.63 +mingw_control,3,9913.5,33971.1,3.43 +mingw_control,4,10019.6,33623.8,3.36 +mingw_control,5,9744.4,32670.8,3.35 +mingw_control,6,10212.6,34327.2,3.36 +mingw_control,7,10327.8,34152.9,3.31 +mingw_control,8,10298.2,34393.7,3.34 +mingw_control,9,9755.7,33453.4,3.43 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/adaptive_passes.csv new file mode 100644 index 0000000..7ea41a5 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +msvc_adaptive,1,7929.9,27251.2,3.44 +msvc_adaptive,2,8946.5,29649.1,3.31 +msvc_adaptive,3,9145.1,28956.1,3.17 +msvc_adaptive,4,9448.0,29762.3,3.15 +msvc_adaptive,5,9403.0,30316.3,3.22 +msvc_adaptive,6,9418.2,30474.7,3.24 +msvc_adaptive,7,9168.2,28367.2,3.09 +msvc_adaptive,8,9466.6,30332.9,3.2 +msvc_adaptive,9,9358.2,30473.8,3.26 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/control_passes.csv new file mode 100644 index 0000000..2db8e05 --- 
/dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +msvc_control,1,8899.9,29202.0,3.28 +msvc_control,2,8708.8,29335.0,3.37 +msvc_control,3,8586.8,29263.8,3.41 +msvc_control,4,8847.9,28780.8,3.25 +msvc_control,5,9660.4,30483.4,3.16 +msvc_control,6,9397.2,29902.9,3.18 +msvc_control,7,9433.7,29669.7,3.15 +msvc_control,8,9497.0,30340.8,3.19 +msvc_control,9,9033.0,27398.8,3.03 diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-appleclang-rerun/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-appleclang-rerun/hmmlib_9pass_summary.json new file mode 100644 index 0000000..4362a1e --- /dev/null +++ b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-appleclang-rerun/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 7.5171448054162004, + "adaptive_median_ratio_hmmlib_over_libhmm": 7.613662932294204, + "delta_percent_adaptive_vs_control": 1.283973228884206, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 4025.5, + "hmmlib_avg_throughput_obs_per_ms": 30999.6, + "ratio_hmmlib_over_libhmm": 7.700807353123835 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4114.5, + "hmmlib_avg_throughput_obs_per_ms": 30763.8, + "ratio_hmmlib_over_libhmm": 7.476923076923077 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4099.8, + "hmmlib_avg_throughput_obs_per_ms": 30764.1, + "ratio_hmmlib_over_libhmm": 7.503805063661641 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4141.5, + "hmmlib_avg_throughput_obs_per_ms": 31065.0, + "ratio_hmmlib_over_libhmm": 7.5009054690329595 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4045.7, + "hmmlib_avg_throughput_obs_per_ms": 30134.7, + "ratio_hmmlib_over_libhmm": 7.448575030279062 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4102.6, + 
"hmmlib_avg_throughput_obs_per_ms": 31059.9, + "ratio_hmmlib_over_libhmm": 7.5707843806366695 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4056.2, + "hmmlib_avg_throughput_obs_per_ms": 30943.8, + "ratio_hmmlib_over_libhmm": 7.628765839948721 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4106.2, + "hmmlib_avg_throughput_obs_per_ms": 30866.9, + "ratio_hmmlib_over_libhmm": 7.5171448054162004 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4112.7, + "hmmlib_avg_throughput_obs_per_ms": 30960.5, + "ratio_hmmlib_over_libhmm": 7.528022953291026 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 4068.7, + "hmmlib_avg_throughput_obs_per_ms": 31003.7, + "ratio_hmmlib_over_libhmm": 7.620050630422494 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4106.0, + "hmmlib_avg_throughput_obs_per_ms": 31261.7, + "ratio_hmmlib_over_libhmm": 7.613662932294204 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4103.9, + "hmmlib_avg_throughput_obs_per_ms": 30937.6, + "ratio_hmmlib_over_libhmm": 7.5385852481785625 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 3983.1, + "hmmlib_avg_throughput_obs_per_ms": 30418.5, + "ratio_hmmlib_over_libhmm": 7.6368908638999775 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4001.6, + "hmmlib_avg_throughput_obs_per_ms": 30412.1, + "ratio_hmmlib_over_libhmm": 7.599985005997601 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3996.8, + "hmmlib_avg_throughput_obs_per_ms": 30508.4, + "ratio_hmmlib_over_libhmm": 7.633206565252202 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 3995.2, + "hmmlib_avg_throughput_obs_per_ms": 30228.8, + "ratio_hmmlib_over_libhmm": 7.566279535442531 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 3982.1, + "hmmlib_avg_throughput_obs_per_ms": 30486.9, + "ratio_hmmlib_over_libhmm": 7.655985535270335 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4001.8, + 
"hmmlib_avg_throughput_obs_per_ms": 30388.0, + "ratio_hmmlib_over_libhmm": 7.593582887700534 + } + ] +} diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-gcc15/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-gcc15/hmmlib_9pass_summary.json new file mode 100644 index 0000000..964098d --- /dev/null +++ b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-gcc15/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 9.156518900955433, + "adaptive_median_ratio_hmmlib_over_libhmm": 9.1735840061973, + "delta_percent_adaptive_vs_control": 0.18637110266966933, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3253.3, + "hmmlib_avg_throughput_obs_per_ms": 31084.7, + "ratio_hmmlib_over_libhmm": 9.554821258414533 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 3420.7, + "hmmlib_avg_throughput_obs_per_ms": 31343.0, + "ratio_hmmlib_over_libhmm": 9.162744467506652 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 3411.1, + "hmmlib_avg_throughput_obs_per_ms": 30845.9, + "ratio_hmmlib_over_libhmm": 9.042801442349976 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 3367.0, + "hmmlib_avg_throughput_obs_per_ms": 30953.7, + "ratio_hmmlib_over_libhmm": 9.193258093258093 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 3356.2, + "hmmlib_avg_throughput_obs_per_ms": 30719.0, + "ratio_hmmlib_over_libhmm": 9.152911030331923 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3404.8, + "hmmlib_avg_throughput_obs_per_ms": 30811.7, + "ratio_hmmlib_over_libhmm": 9.049488956766917 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 3370.2, + "hmmlib_avg_throughput_obs_per_ms": 30859.3, + "ratio_hmmlib_over_libhmm": 9.156518900955433 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 3341.8, + "hmmlib_avg_throughput_obs_per_ms": 30105.2, + "ratio_hmmlib_over_libhmm": 9.008677957986713 + }, + { + "pass": 9, + 
"libhmm_avg_throughput_obs_per_ms": 3352.0, + "hmmlib_avg_throughput_obs_per_ms": 31165.2, + "ratio_hmmlib_over_libhmm": 9.297494033412889 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3122.7, + "hmmlib_avg_throughput_obs_per_ms": 28684.6, + "ratio_hmmlib_over_libhmm": 9.18583277292087 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 3319.1, + "hmmlib_avg_throughput_obs_per_ms": 30293.0, + "ratio_hmmlib_over_libhmm": 9.126871742339791 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 3036.4, + "hmmlib_avg_throughput_obs_per_ms": 28819.5, + "ratio_hmmlib_over_libhmm": 9.491338427084706 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 3342.7, + "hmmlib_avg_throughput_obs_per_ms": 30987.7, + "ratio_hmmlib_over_libhmm": 9.27026056780447 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 3368.9, + "hmmlib_avg_throughput_obs_per_ms": 30740.6, + "ratio_hmmlib_over_libhmm": 9.12481818991362 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3371.9, + "hmmlib_avg_throughput_obs_per_ms": 30832.6, + "ratio_hmmlib_over_libhmm": 9.143984103917672 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 3359.7, + "hmmlib_avg_throughput_obs_per_ms": 31037.3, + "ratio_hmmlib_over_libhmm": 9.23811649849689 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 3400.0, + "hmmlib_avg_throughput_obs_per_ms": 31027.1, + "ratio_hmmlib_over_libhmm": 9.125617647058823 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 3356.3, + "hmmlib_avg_throughput_obs_per_ms": 30789.3, + "ratio_hmmlib_over_libhmm": 9.1735840061973 + } + ] +} diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-homebrew-llvm-rerun/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-homebrew-llvm-rerun/hmmlib_9pass_summary.json new file mode 100644 index 0000000..b0afbf5 --- /dev/null +++ 
b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-homebrew-llvm-rerun/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 7.595913843781621, + "adaptive_median_ratio_hmmlib_over_libhmm": 7.60328317373461, + "delta_percent_adaptive_vs_control": 0.09701702921527999, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3340.7, + "hmmlib_avg_throughput_obs_per_ms": 27567.1, + "ratio_hmmlib_over_libhmm": 8.251893315772143 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4187.9, + "hmmlib_avg_throughput_obs_per_ms": 31588.5, + "ratio_hmmlib_over_libhmm": 7.542801881611309 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4171.2, + "hmmlib_avg_throughput_obs_per_ms": 31666.0, + "ratio_hmmlib_over_libhmm": 7.591580360567702 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4182.0, + "hmmlib_avg_throughput_obs_per_ms": 31595.3, + "ratio_hmmlib_over_libhmm": 7.555069344811095 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4150.6, + "hmmlib_avg_throughput_obs_per_ms": 31527.6, + "ratio_hmmlib_over_libhmm": 7.595913843781621 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3931.3, + "hmmlib_avg_throughput_obs_per_ms": 31573.2, + "ratio_hmmlib_over_libhmm": 8.031236486658358 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4176.6, + "hmmlib_avg_throughput_obs_per_ms": 31611.1, + "ratio_hmmlib_over_libhmm": 7.568620408945074 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4161.4, + "hmmlib_avg_throughput_obs_per_ms": 31685.3, + "ratio_hmmlib_over_libhmm": 7.614096217619071 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4164.3, + "hmmlib_avg_throughput_obs_per_ms": 31757.8, + "ratio_hmmlib_over_libhmm": 7.626203683692337 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3561.1, + "hmmlib_avg_throughput_obs_per_ms": 27534.9, + "ratio_hmmlib_over_libhmm": 7.732133329589172 + }, + { + 
"pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4057.8, + "hmmlib_avg_throughput_obs_per_ms": 31262.9, + "ratio_hmmlib_over_libhmm": 7.704396470994134 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4046.3, + "hmmlib_avg_throughput_obs_per_ms": 31150.0, + "ratio_hmmlib_over_libhmm": 7.698391122754121 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4090.4, + "hmmlib_avg_throughput_obs_per_ms": 30688.5, + "ratio_hmmlib_over_libhmm": 7.502566986113828 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4102.6, + "hmmlib_avg_throughput_obs_per_ms": 31126.7, + "ratio_hmmlib_over_libhmm": 7.58706673816604 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4093.6, + "hmmlib_avg_throughput_obs_per_ms": 31124.8, + "ratio_hmmlib_over_libhmm": 7.60328317373461 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4134.9, + "hmmlib_avg_throughput_obs_per_ms": 31160.5, + "ratio_hmmlib_over_libhmm": 7.535974267817844 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4111.6, + "hmmlib_avg_throughput_obs_per_ms": 30903.3, + "ratio_hmmlib_over_libhmm": 7.516125109446444 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 3833.6, + "hmmlib_avg_throughput_obs_per_ms": 31131.7, + "ratio_hmmlib_over_libhmm": 8.120748121869783 + } + ] +} diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1/hmmlib_9pass_summary.json new file mode 100644 index 0000000..7fa7176 --- /dev/null +++ b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 7.612772915264018, + "adaptive_median_ratio_hmmlib_over_libhmm": 7.609598545384946, + "delta_percent_adaptive_vs_control": -0.04169794520872997, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 4120.8, + "hmmlib_avg_throughput_obs_per_ms": 31708.3, + "ratio_hmmlib_over_libhmm": 
7.6946952048145985 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4147.9, + "hmmlib_avg_throughput_obs_per_ms": 31540.3, + "ratio_hmmlib_over_libhmm": 7.603920055931918 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4110.0, + "hmmlib_avg_throughput_obs_per_ms": 31868.2, + "ratio_hmmlib_over_libhmm": 7.753819951338199 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4158.8, + "hmmlib_avg_throughput_obs_per_ms": 31660.0, + "ratio_hmmlib_over_libhmm": 7.612772915264018 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4159.8, + "hmmlib_avg_throughput_obs_per_ms": 31731.5, + "ratio_hmmlib_over_libhmm": 7.62813116015193 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4172.9, + "hmmlib_avg_throughput_obs_per_ms": 31570.4, + "ratio_hmmlib_over_libhmm": 7.5655778954683806 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4186.4, + "hmmlib_avg_throughput_obs_per_ms": 31907.0, + "ratio_hmmlib_over_libhmm": 7.621584177336136 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4227.0, + "hmmlib_avg_throughput_obs_per_ms": 31928.1, + "ratio_hmmlib_over_libhmm": 7.553371185237757 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4201.7, + "hmmlib_avg_throughput_obs_per_ms": 31626.7, + "ratio_hmmlib_over_libhmm": 7.527119975248114 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3552.5, + "hmmlib_avg_throughput_obs_per_ms": 27750.1, + "ratio_hmmlib_over_libhmm": 7.811428571428571 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4179.8, + "hmmlib_avg_throughput_obs_per_ms": 31806.6, + "ratio_hmmlib_over_libhmm": 7.609598545384946 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4169.8, + "hmmlib_avg_throughput_obs_per_ms": 31679.3, + "ratio_hmmlib_over_libhmm": 7.597318816250179 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4172.3, + "hmmlib_avg_throughput_obs_per_ms": 31363.2, + "ratio_hmmlib_over_libhmm": 
7.517005009227524 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4117.6, + "hmmlib_avg_throughput_obs_per_ms": 31708.6, + "ratio_hmmlib_over_libhmm": 7.700748008548668 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4174.7, + "hmmlib_avg_throughput_obs_per_ms": 31600.1, + "ratio_hmmlib_over_libhmm": 7.569430138692601 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4144.7, + "hmmlib_avg_throughput_obs_per_ms": 31582.4, + "ratio_hmmlib_over_libhmm": 7.619948367795016 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4183.4, + "hmmlib_avg_throughput_obs_per_ms": 31468.3, + "ratio_hmmlib_over_libhmm": 7.522182913419707 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4081.0, + "hmmlib_avg_throughput_obs_per_ms": 31163.2, + "ratio_hmmlib_over_libhmm": 7.636167605978927 + } + ] +} diff --git a/benchmark-analysis/multirun-20260426-194758/perf_vs_main_delta.csv b/benchmark-analysis/multirun-20260426-194758/perf_vs_main_delta.csv new file mode 100644 index 0000000..925c850 --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/perf_vs_main_delta.csv @@ -0,0 +1,4 @@ +"benchmark","main_median","perf_median","perf_vs_main_median_delta_pct","main_mean","perf_mean","main_stddev","perf_stddev" +"hmmlib","9367.9","9317","-0.5433448264819184","9309.140000000001","9289.279999999999","625.8642927983668","104.81577648426794" +"stochhmm_discrete","9008.5","9217.3","2.317810956319024","8924.98","9199.380000000001","305.1292054196056","112.49971999965126" +"stochhmm_continuous","7001.3","6946.3","-0.7855683944410323","6581.640000000001","6554.539999999999","747.6081212774511","560.5696995022117" diff --git a/benchmark-analysis/multirun-20260426-194758/raw_results.csv b/benchmark-analysis/multirun-20260426-194758/raw_results.csv new file mode 100644 index 0000000..5757897 --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/raw_results.csv @@ -0,0 +1,31 @@ 
+"branch","benchmark","run","exit_code","libhmm_obs_per_ms","comparator_obs_per_ms","reported_ratio_x","log_file" +"main","hmmlib","1","0","9810.5","30645.6","3.12","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run1.log" +"main","hmmlib","2","0","8254","26242.7","3.18","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run2.log" +"main","hmmlib","3","0","9361.5","29549.7","3.16","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run3.log" +"main","hmmlib","4","0","9751.8","30446.6","3.12","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run4.log" +"main","hmmlib","5","0","9367.9","30395","3.24","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run5.log" +"main","stochhmm_discrete","1","0","8783.1","4124.6","0.47","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run1.log" +"main","stochhmm_discrete","2","0","9235.9","4302","0.47","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run2.log" +"main","stochhmm_discrete","3","0","9127.9","4219","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run3.log" +"main","stochhmm_discrete","4","0","8469.5","4109.1","0.49","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run4.log" +"main","stochhmm_discrete","5","0","9008.5","4153.6","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run5.log" +"main","stochhmm_continuous","1","0","7177.6","6141.8","0.86","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run1.log" 
+"main","stochhmm_continuous","2","0","7112.8","5945.1","0.84","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run2.log" +"main","stochhmm_continuous","3","0","6144.4","5364.2","0.87","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run3.log" +"main","stochhmm_continuous","4","0","7001.3","6195.2","0.88","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run4.log" +"main","stochhmm_continuous","5","0","5472.1","5308.7","0.97","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run5.log" +"perf","hmmlib","1","0","9369.8","28381","3.03","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run1.log" +"perf","hmmlib","2","0","9218.2","29956.7","3.25","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run2.log" +"perf","hmmlib","3","0","9395.1","29843","3.18","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run3.log" +"perf","hmmlib","4","0","9146.3","29254.1","3.2","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run4.log" +"perf","hmmlib","5","0","9317","30369.3","3.26","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run5.log" +"perf","stochhmm_discrete","1","0","9008.2","3980","0.44","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run1.log" +"perf","stochhmm_discrete","2","0","9207.3","4118.9","0.45","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run2.log" +"perf","stochhmm_discrete","3","0","9217.3","4171.5","0.45","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run3.log" 
+"perf","stochhmm_discrete","4","0","9278.7","4277.5","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run4.log" +"perf","stochhmm_discrete","5","0","9285.4","4252.6","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run5.log" +"perf","stochhmm_continuous","1","0","5820.3","5176.7","0.89","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run1.log" +"perf","stochhmm_continuous","2","0","6982.3","6158.5","0.88","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run2.log" +"perf","stochhmm_continuous","3","0","6946.3","6305.5","0.91","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run3.log" +"perf","stochhmm_continuous","4","0","6077.3","5402.7","0.89","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run4.log" +"perf","stochhmm_continuous","5","0","6946.5","6148.9","0.89","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run5.log" diff --git a/benchmark-analysis/multirun-20260426-194758/run_manifest.json b/benchmark-analysis/multirun-20260426-194758/run_manifest.json new file mode 100644 index 0000000..9f1d4df --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/run_manifest.json @@ -0,0 +1,7 @@ +{ + "output_root": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758", + "raw_results_csv": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758\\raw_results.csv", + "summary_stats_csv": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758\\summary_stats.csv", + "delta_csv": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758\\perf_vs_main_delta.csv", 
+ "runs_per_benchmark_per_branch": 5 +} diff --git a/benchmark-analysis/multirun-20260426-194758/summary_stats.csv b/benchmark-analysis/multirun-20260426-194758/summary_stats.csv new file mode 100644 index 0000000..2915ad8 --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/summary_stats.csv @@ -0,0 +1,7 @@ +"branch","benchmark","n","mean_libhmm_obs_per_ms","median_libhmm_obs_per_ms","stddev_libhmm_obs_per_ms","min_libhmm_obs_per_ms","max_libhmm_obs_per_ms" +"main","hmmlib","5","9309.140000000001","9367.9","625.8642927983668","8254","9810.5" +"perf","hmmlib","5","9289.279999999999","9317","104.81577648426794","9146.3","9395.1" +"main","stochhmm_continuous","5","6581.640000000001","7001.3","747.6081212774511","5472.1","7177.6" +"perf","stochhmm_continuous","5","6554.539999999999","6946.3","560.5696995022117","5820.3","6982.3" +"main","stochhmm_discrete","5","8924.98","9008.5","305.1292054196056","8469.5","9235.9" +"perf","stochhmm_discrete","5","9199.380000000001","9217.3","112.49971999965126","9008.2","9285.4" diff --git a/benchmark-analysis/rollback-dump-20260426-201852.patch b/benchmark-analysis/rollback-dump-20260426-201852.patch new file mode 100644 index 0000000..326f733 --- /dev/null +++ b/benchmark-analysis/rollback-dump-20260426-201852.patch @@ -0,0 +1,500 @@ +diff --git a/include/libhmm/calculators/forward_backward_calculator.h b/include/libhmm/calculators/forward_backward_calculator.h +index 3efd38d..c661736 100755 +--- a/include/libhmm/calculators/forward_backward_calculator.h ++++ b/include/libhmm/calculators/forward_backward_calculator.h +@@ -89,15 +89,20 @@ private: + + // Precomputed log-transition matrix [N x N]: logTrans_(i,j) = log a_{ij} + Matrix logTrans_; ++ // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} ++ // Used to improve locality in forward recursion (fixed destination state j). 
++ Matrix logTransT_; + + // Results + Matrix logAlpha_; // T x N + Matrix logBeta_; // T x N + double logProbability_{-std::numeric_limits::infinity()}; + +- // Per-state log-emission buffer reused each timestep [T x N, row-major]. +- // Allocated once; filled by getBatchLogProbabilities per state. +- mutable std::vector logEmitBuf_; ++ // Per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) ++ std::vector logEmitBuf_; ++ // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t) ++ // Built once per compute() to improve locality in DP kernels. ++ std::vector logEmitByTime_; + + void precomputeLogTransitions(); + void computeLogForward(); +diff --git a/include/libhmm/calculators/viterbi_calculator.h b/include/libhmm/calculators/viterbi_calculator.h +index 7b9ae64..a341ecb 100755 +--- a/include/libhmm/calculators/viterbi_calculator.h ++++ b/include/libhmm/calculators/viterbi_calculator.h +@@ -65,19 +65,24 @@ private: + + // Precomputed log-transition matrix [N x N] + Matrix logTrans_; ++ // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} ++ Matrix logTransT_; + + // Viterbi trellis: logDelta(t,i) = max log-prob path ending at state i at time t + Matrix logDelta_; + +- // Backtrack pointers: psi(t,i) = arg max_j [logDelta(t-1,j) + logTrans(j,i)] +- std::vector> psi_; ++ // Backtrack pointers in time-major contiguous storage: ++ // psi_[t * N + j] = arg max_i [logDelta(t-1,i) + logTrans(i,j)] ++ std::vector psi_; + + // Result + StateSequence sequence_; + double logProbability_{-std::numeric_limits::infinity()}; + +- // Per-state emission buffer +- mutable std::vector logEmitBuf_; ++ // Per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) ++ std::vector logEmitBuf_; ++ // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t) ++ std::vector logEmitByTime_; + + void precomputeLogTransitions(); + void runViterbi(); +diff --git a/src/calculators/forward_backward_calculator.cpp 
b/src/calculators/forward_backward_calculator.cpp +index 1097acc..789e632 100755 +--- a/src/calculators/forward_backward_calculator.cpp ++++ b/src/calculators/forward_backward_calculator.cpp +@@ -50,27 +50,33 @@ void ForwardBackwardCalculator::compute() { + logAlpha_.resize(T, numStates_); + logBeta_.resize(T, numStates_); + +- // Pre-fill the log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) +- // Build observation span once; reuse across all N states. ++ // Fill per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) ++ // Build observation span directly from ObservationSet storage; no copy. + logEmitBuf_.resize(T * numStates_); +- std::vector obsVec(T); +- for (std::size_t t = 0; t < T; ++t) +- obsVec[t] = observations_(t); +- const std::span obsSpan(obsVec.data(), T); ++ const std::span obsSpan(observations_.data(), T); + + const Hmm &hmm = getHmmRef(); + for (std::size_t i = 0; i < numStates_; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obsSpan, std::span(logEmitBuf_.data() + i * T, T)); + } ++ // Build time-major emission buffer once to improve locality in DP recurrences. 
++ logEmitByTime_.resize(T * numStates_); ++ for (std::size_t i = 0; i < numStates_; ++i) { ++ const double *stateRow = logEmitBuf_.data() + i * T; ++ for (std::size_t t = 0; t < T; ++t) { ++ logEmitByTime_[t * numStates_ + i] = stateRow[t]; ++ } ++ } + + computeLogForward(); + computeLogBackward(); + + // log P(O|λ) = log-sum-exp over states at final timestep ++ const double *finalAlphaRow = logAlpha_.data() + (T - 1) * numStates_; + double lp = LOG_ZERO; + for (std::size_t i = 0; i < numStates_; ++i) { +- lp = logSumExp(lp, logAlpha_(T - 1, i)); ++ lp = logSumExp(lp, finalAlphaRow[i]); + } + logProbability_ = lp; + } +@@ -83,10 +89,13 @@ void ForwardBackwardCalculator::precomputeLogTransitions() { + const Hmm &hmm = getHmmRef(); + const Matrix &trans = hmm.getTrans(); + logTrans_.resize(numStates_, numStates_); ++ logTransT_.resize(numStates_, numStates_); + for (std::size_t i = 0; i < numStates_; ++i) { + for (std::size_t j = 0; j < numStates_; ++j) { + const double a = trans(i, j); +- logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; ++ const double logA = (a > 0.0) ? std::log(a) : LOG_ZERO; ++ logTrans_(i, j) = logA; ++ logTransT_(j, i) = logA; + } + } + } +@@ -96,42 +105,57 @@ void ForwardBackwardCalculator::computeLogForward() { + const Vector &pi = hmm.getPi(); + const std::size_t T = observations_.size(); + ++ const double *logEmitByTimeData = logEmitByTime_.data(); ++ const double *logTransTData = logTransT_.data(); ++ double *logAlphaData = logAlpha_.data(); ++ const std::size_t N = numStates_; ++ + // t = 0: log alpha(0, i) = log pi_i + log b_i(O_0) ++ const double *emitRow0 = logEmitByTimeData; + for (std::size_t i = 0; i < numStates_; ++i) { + const double logPi = (pi(i) > 0.0) ? 
std::log(pi(i)) : LOG_ZERO; +- logAlpha_(0, i) = logPi + logEmitBuf_[i * T + 0]; ++ logAlphaData[i] = logPi + emitRow0[i]; + } + + // t > 0 + for (std::size_t t = 1; t < T; ++t) { ++ const double *prevAlphaRow = logAlphaData + (t - 1) * N; ++ double *alphaRow = logAlphaData + t * N; ++ const double *emitRow = logEmitByTimeData + t * N; + for (std::size_t j = 0; j < numStates_; ++j) { + double logSum = LOG_ZERO; ++ const double *transCol = logTransTData + j * N; + for (std::size_t i = 0; i < numStates_; ++i) { +- logSum = logSumExp(logSum, logAlpha_(t - 1, i) + logTrans_(i, j)); ++ logSum = logSumExp(logSum, prevAlphaRow[i] + transCol[i]); + } +- logAlpha_(t, j) = logEmitBuf_[j * T + t] + logSum; ++ alphaRow[j] = emitRow[j] + logSum; + } + } + } + + void ForwardBackwardCalculator::computeLogBackward() { + const std::size_t T = observations_.size(); ++ const double *logTransData = logTrans_.data(); ++ const double *logEmitByTimeData = logEmitByTime_.data(); ++ double *logBetaData = logBeta_.data(); ++ const std::size_t N = numStates_; + + // t = T-1: log beta(T-1, i) = log(1) = 0 +- for (std::size_t i = 0; i < numStates_; ++i) { +- logBeta_(T - 1, i) = 0.0; +- } ++ std::fill(logBetaData + (T - 1) * N, logBetaData + T * N, 0.0); + + // t < T-1, working backwards + if (T > 1) { + for (std::size_t t = T - 2;; --t) { ++ const double *nextBetaRow = logBetaData + (t + 1) * N; ++ double *betaRow = logBetaData + t * N; ++ const double *nextEmitRow = logEmitByTimeData + (t + 1) * N; + for (std::size_t i = 0; i < numStates_; ++i) { + double logSum = LOG_ZERO; ++ const double *transRow = logTransData + i * N; + for (std::size_t j = 0; j < numStates_; ++j) { +- logSum = logSumExp(logSum, logTrans_(i, j) + logEmitBuf_[j * T + (t + 1)] + +- logBeta_(t + 1, j)); ++ logSum = logSumExp(logSum, transRow[j] + nextEmitRow[j] + nextBetaRow[j]); + } +- logBeta_(t, i) = logSum; ++ betaRow[i] = logSum; + } + if (t == 0) + break; +diff --git a/src/calculators/viterbi_calculator.cpp 
b/src/calculators/viterbi_calculator.cpp +index 3ade510..1df7a3f 100755 +--- a/src/calculators/viterbi_calculator.cpp ++++ b/src/calculators/viterbi_calculator.cpp +@@ -44,16 +44,21 @@ StateSequence ViterbiCalculator::decode() { + // Fill log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) + logEmitBuf_.resize(T * numStates_); + const Hmm &hmm = getHmmRef(); +- +- std::vector obsVec(T); +- for (std::size_t t = 0; t < T; ++t) +- obsVec[t] = observations_(t); ++ const std::span obsSpan(observations_.data(), T); + + for (std::size_t i = 0; i < numStates_; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( +- std::span(obsVec.data(), T), ++ obsSpan, + std::span(logEmitBuf_.data() + i * T, T)); + } ++ // Build time-major emission buffer once for locality in dynamic programming. ++ logEmitByTime_.resize(T * numStates_); ++ for (std::size_t i = 0; i < numStates_; ++i) { ++ const double *stateRow = logEmitBuf_.data() + i * T; ++ for (std::size_t t = 0; t < T; ++t) { ++ logEmitByTime_[t * numStates_ + i] = stateRow[t]; ++ } ++ } + + runViterbi(); + backtrack(); +@@ -68,10 +73,13 @@ void ViterbiCalculator::precomputeLogTransitions() { + const Hmm &hmm = getHmmRef(); + const Matrix &trans = hmm.getTrans(); + logTrans_.resize(numStates_, numStates_); ++ logTransT_.resize(numStates_, numStates_); + for (std::size_t i = 0; i < numStates_; ++i) { + for (std::size_t j = 0; j < numStates_; ++j) { + const double a = trans(i, j); +- logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; ++ const double logA = (a > 0.0) ? 
std::log(a) : LOG_ZERO; ++ logTrans_(i, j) = logA; ++ logTransT_(j, i) = logA; + } + } + } +@@ -82,37 +90,48 @@ void ViterbiCalculator::runViterbi() { + const std::size_t T = observations_.size(); + + logDelta_.resize(T, numStates_); +- psi_.assign(T, std::vector(numStates_, 0)); ++ psi_.assign(T * numStates_, 0); ++ ++ const double *logTransTData = logTransT_.data(); ++ const double *logEmitByTimeData = logEmitByTime_.data(); ++ double *logDeltaData = logDelta_.data(); ++ const std::size_t N = numStates_; + + // t = 0: initialise ++ const double *emitRow0 = logEmitByTimeData; + for (std::size_t i = 0; i < numStates_; ++i) { + const double logPi = (pi(i) > 0.0) ? std::log(pi(i)) : LOG_ZERO; +- logDelta_(0, i) = logPi + logEmitBuf_[i * T + 0]; ++ logDeltaData[i] = logPi + emitRow0[i]; + } + + // t > 0: recursion + for (std::size_t t = 1; t < T; ++t) { ++ const double *prevDeltaRow = logDeltaData + (t - 1) * N; ++ double *deltaRow = logDeltaData + t * N; ++ const double *emitRow = logEmitByTimeData + t * N; + for (std::size_t j = 0; j < numStates_; ++j) { + double maxVal = LOG_ZERO; + int maxFrom = 0; ++ const double *transCol = logTransTData + j * N; + for (std::size_t i = 0; i < numStates_; ++i) { +- const double val = logDelta_(t - 1, i) + logTrans_(i, j); ++ const double val = prevDeltaRow[i] + transCol[i]; + if (val > maxVal) { + maxVal = val; + maxFrom = static_cast(i); + } + } +- logDelta_(t, j) = maxVal + logEmitBuf_[j * T + t]; +- psi_[t][j] = maxFrom; ++ deltaRow[j] = maxVal + emitRow[j]; ++ psi_[t * N + j] = maxFrom; + } + } + + // Termination: best last state + double bestVal = LOG_ZERO; + int bestLast = 0; ++ const double *finalDeltaRow = logDeltaData + (T - 1) * N; + for (std::size_t i = 0; i < numStates_; ++i) { +- if (logDelta_(T - 1, i) > bestVal) { +- bestVal = logDelta_(T - 1, i); ++ if (finalDeltaRow[i] > bestVal) { ++ bestVal = finalDeltaRow[i]; + bestLast = static_cast(i); + } + } +@@ -126,9 +145,10 @@ void ViterbiCalculator::backtrack() { + 
const std::size_t T = observations_.size(); + if (T <= 1) + return; ++ const std::size_t N = numStates_; + + for (std::size_t t = T - 2;; --t) { +- sequence_(t) = psi_[t + 1][static_cast(sequence_(t + 1))]; ++ sequence_(t) = psi_[(t + 1) * N + static_cast(sequence_(t + 1))]; + if (t == 0) + break; + } +diff --git a/src/training/baum_welch_trainer.cpp b/src/training/baum_welch_trainer.cpp +index 7ae236f..37d1b9c 100755 +--- a/src/training/baum_welch_trainer.cpp ++++ b/src/training/baum_welch_trainer.cpp +@@ -29,22 +29,40 @@ void BaumWelchTrainer::train() { + + // Accumulators (linear space, summed across all sequences) + std::vector piNum(N, 0.0); +- std::vector> transNum(N, std::vector(N, 0.0)); ++ Matrix transNum(N, N); ++ clear_matrix(transNum); + std::vector transDen(N, 0.0); + + // Per-state emission data/weights accumulated across sequences + std::vector> emisData(N); + std::vector> emisWts(N); ++ std::size_t totalObservations = 0; ++ for (const auto &obs : obsLists_) { ++ totalObservations += obs.size(); ++ } ++ const std::size_t reservePerState = (N > 0) ? (totalObservations / N + 1) : 0; ++ for (std::size_t i = 0; i < N; ++i) { ++ emisData[i].reserve(reservePerState); ++ emisWts[i].reserve(reservePerState); ++ } + + // Precompute log-transition matrix from the current model + const Matrix &curTrans = hmm.getTrans(); +- std::vector> logTrans(N, std::vector(N)); ++ Matrix logTrans(N, N); ++ std::vector> activeNextStates(N); + for (std::size_t i = 0; i < N; ++i) { ++ activeNextStates[i].reserve(N); + for (std::size_t j = 0; j < N; ++j) { + const double a = curTrans(i, j); +- logTrans[i][j] = (a > 0.0) ? 
std::log(a) : LOG_ZERO; ++ if (a > 0.0) { ++ logTrans(i, j) = std::log(a); ++ activeNextStates[i].push_back(j); ++ } else { ++ logTrans(i, j) = LOG_ZERO; ++ } + } + } ++ const double *logTransData = logTrans.data(); + + std::size_t validSeqs = 0; + +@@ -60,24 +78,29 @@ void BaumWelchTrainer::train() { + + const Matrix &logAlpha = fbc.getLogForwardVariables(); + const Matrix &logBeta = fbc.getLogBackwardVariables(); ++ const double *logAlphaData = logAlpha.data(); ++ const double *logBetaData = logBeta.data(); ++ const double *obsData = obs.data(); + + // Precompute log-emissions for this sequence: logEmit[i * T + t] +- std::vector obsVec(T); +- for (std::size_t t = 0; t < T; ++t) +- obsVec[t] = obs(t); + + std::vector logEmit(N * T); ++ const std::span obsSpan(obsData, T); + for (std::size_t i = 0; i < N; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( +- std::span(obsVec.data(), T), ++ obsSpan, + std::span(logEmit.data() + i * T, T)); + } + + // Accumulate gamma (per timestep per state) and pi/trans denominators + for (std::size_t t = 0; t < T; ++t) { ++ const double *alphaRow = logAlphaData + t * N; ++ const double *betaRow = logBetaData + t * N; ++ const double obsValue = obsData[t]; + for (std::size_t i = 0; i < N; ++i) { +- const double g = std::exp(logAlpha(t, i) + logBeta(t, i) - logP); +- emisData[i].push_back(obs(t)); ++ const double logGamma = alphaRow[i] + betaRow[i] - logP; ++ const double g = std::isfinite(logGamma) ? 
std::exp(logGamma) : 0.0; ++ emisData[i].push_back(obsValue); + emisWts[i].push_back(g); + if (t == 0) + piNum[i] += g; +@@ -88,11 +111,25 @@ void BaumWelchTrainer::train() { + + // Accumulate xi (transition counts) + for (std::size_t t = 0; t + 1 < T; ++t) { ++ const double *alphaRow = logAlphaData + t * N; ++ const double *betaNextRow = logBetaData + (t + 1) * N; + for (std::size_t i = 0; i < N; ++i) { +- for (std::size_t j = 0; j < N; ++j) { +- const double logXi = logAlpha(t, i) + logTrans[i][j] + +- logEmit[j * T + (t + 1)] + logBeta(t + 1, j) - logP; +- transNum[i][j] += std::exp(logXi); ++ const double alphaVal = alphaRow[i]; ++ if (!std::isfinite(alphaVal)) { ++ continue; ++ } ++ const double *logTransRow = logTransData + i * N; ++ for (const std::size_t j : activeNextStates[i]) { ++ const double betaNext = betaNextRow[j]; ++ const double emitNext = logEmit[j * T + (t + 1)]; ++ if (!std::isfinite(betaNext) || !std::isfinite(emitNext)) { ++ continue; ++ } ++ const double logXi = ++ alphaVal + logTransRow[j] + emitNext + betaNext - logP; ++ if (std::isfinite(logXi)) { ++ transNum(i, j) += std::exp(logXi); ++ } + } + } + } +@@ -122,7 +159,7 @@ void BaumWelchTrainer::train() { + Matrix newTrans(N, N); + for (std::size_t i = 0; i < N; ++i) { + for (std::size_t j = 0; j < N; ++j) { +- newTrans(i, j) = (transDen[i] > 0.0) ? transNum[i][j] / transDen[i] ++ newTrans(i, j) = (transDen[i] > 0.0) ? transNum(i, j) / transDen[i] + : 1.0 / static_cast(N); + } + } +diff --git a/src/training/viterbi_trainer.cpp b/src/training/viterbi_trainer.cpp +index d159bb0..8943940 100755 +--- a/src/training/viterbi_trainer.cpp ++++ b/src/training/viterbi_trainer.cpp +@@ -91,6 +91,15 @@ double ViterbiTrainer::runIteration() { + Matrix trans(N, N); + clear_matrix(trans); + std::vector> emisData(N); ++ std::size_t totalObservations = 0; ++ for (const auto &obs : obsLists_) { ++ totalObservations += obs.size(); ++ } ++ const std::size_t reservePerState = (N > 0) ? 
(totalObservations / N + 1) : 0; ++ for (std::size_t i = 0; i < N; ++i) { ++ emisData[i].reserve(reservePerState); ++ } ++ std::vector transRowSums(N, 0.0); + + double totalLogProb = 0.0; + std::size_t validSeqs = 0; +@@ -107,15 +116,18 @@ double ViterbiTrainer::runIteration() { + totalLogProb += lp; + const StateSequence &seq = vc.getStateSequence(); + const std::size_t T = obs.size(); ++ const int *seqData = seq.data(); ++ const double *obsData = obs.data(); + +- pi(static_cast(seq(0))) += 1.0; ++ pi(static_cast(seqData[0])) += 1.0; + + for (std::size_t t = 0; t < T; ++t) { +- const std::size_t s = static_cast(seq(t)); +- emisData[s].push_back(obs(t)); ++ const std::size_t s = static_cast(seqData[t]); ++ emisData[s].push_back(obsData[t]); + if (t + 1 < T) { +- const std::size_t sNext = static_cast(seq(t + 1)); ++ const std::size_t sNext = static_cast(seqData[t + 1]); + trans(s, sNext) += 1.0; ++ transRowSums[s] += 1.0; + } + } + ++validSeqs; +@@ -129,12 +141,10 @@ double ViterbiTrainer::runIteration() { + + // Normalise pi + { +- double piSum = 0.0; +- for (std::size_t i = 0; i < N; ++i) +- piSum += pi(i); +- if (piSum > 0.0) { ++ if (validSeqs > 0) { ++ const double invValidSeqs = 1.0 / static_cast(validSeqs); + for (std::size_t i = 0; i < N; ++i) +- pi(i) /= piSum; ++ pi(i) *= invValidSeqs; + } else { + for (std::size_t i = 0; i < N; ++i) + pi(i) = 1.0 / static_cast(N); +@@ -144,12 +154,11 @@ double ViterbiTrainer::runIteration() { + + // Normalise transition rows + for (std::size_t i = 0; i < N; ++i) { +- double rowSum = 0.0; +- for (std::size_t j = 0; j < N; ++j) +- rowSum += trans(i, j); ++ const double rowSum = transRowSums[i]; + if (rowSum > 0.0) { ++ const double invRowSum = 1.0 / rowSum; + for (std::size_t j = 0; j < N; ++j) +- trans(i, j) /= rowSum; ++ trans(i, j) *= invRowSum; + } else { + for (std::size_t j = 0; j < N; ++j) + trans(i, j) = 1.0 / static_cast(N); diff --git a/benchmark-analysis/run_focus_compiler_sweep.py 
b/benchmark-analysis/run_focus_compiler_sweep.py new file mode 100644 index 0000000..a356c30 --- /dev/null +++ b/benchmark-analysis/run_focus_compiler_sweep.py @@ -0,0 +1,134 @@ +import csv +import pathlib +import re +import subprocess +import statistics + +compilers = { + 'msvc': { + 'pair_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-msvc\tools\hotspot_breakdown.exe'), + 'max_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-msvc\tools\hotspot_breakdown.exe'), + 'out_dir': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-msvc-rerun'), + }, + 'clangcl': { + 'pair_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-clangcl\tools\hotspot_breakdown.exe'), + 'max_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-clangcl\tools\hotspot_breakdown.exe'), + 'out_dir': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-clangcl-rerun'), + }, + 'mingw': { + 'pair_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-mingw\tools\hotspot_breakdown.exe'), + 'max_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-mingw\tools\hotspot_breakdown.exe'), + 'out_dir': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-mingw-rerun'), + }, +} + +n_vals = list(range(2, 9)) +t_vals = [500, 1000, 2000, 5000, 10000, 100000] +runs = 5 +warmup = 2 + +fb_block_re = re.compile(r'Forward-Backward phase breakdown:(.*?)Viterbi phase breakdown:', re.S) +num_re = re.compile(r'([0-9]+(?:\.[0-9]+)?)') + +def parse_hotspot_output(text: str): + m = fb_block_re.search(text) + if not m: + raise RuntimeError('Could not find FB breakdown block') + block = m.group(1) + + def find_metric(label: str): + for candidate in block.splitlines(): + if label in candidate: + nums = num_re.findall(candidate) + if nums: + 
return float(nums[0]) + raise RuntimeError(f'Missing metric line for {label}') + + total_line = None + for candidate in block.splitlines(): + if candidate.strip().startswith('TOTAL'): + total_line = candidate + break + if total_line is None: + raise RuntimeError('Missing TOTAL line in FB block') + + total_nums = num_re.findall(total_line) + if not total_nums: + raise RuntimeError('No TOTAL numeric value in FB block') + + return { + 'fb_total_ms': float(total_nums[0]), + 'forward_ms': find_metric('Forward recursion'), + 'backward_ms': find_metric('Backward recursion'), + } + +def run_grid(exe: pathlib.Path, mode: str): + rows = [] + for n in n_vals: + for t in t_vals: + proc = subprocess.run( + [str(exe), str(n), str(t), str(runs), str(warmup)], + capture_output=True, + text=True, + check=True, + ) + metrics = parse_hotspot_output(proc.stdout) + rows.append({ + 'mode': mode, + 'n': n, + 't': t, + 'runs': runs, + 'warmup': warmup, + 'fb_total_ms': metrics['fb_total_ms'], + 'forward_ms': metrics['forward_ms'], + 'backward_ms': metrics['backward_ms'], + }) + return rows + +for compiler, cfg in compilers.items(): + out_dir = cfg['out_dir'] + out_dir.mkdir(parents=True, exist_ok=True) + + pair_rows = run_grid(cfg['pair_exe'], 'pairwise') + max_rows = run_grid(cfg['max_exe'], 'max_reduce') + + pair_csv = out_dir / 'focused_pairwise_n2_8.csv' + max_csv = out_dir / 'focused_max_reduce_n2_8.csv' + cmp_csv = out_dir / 'focused_pairwise_vs_max_reduce_n2_8.csv' + + with pair_csv.open('w', newline='') as f: + w = csv.DictWriter(f, fieldnames=list(pair_rows[0].keys())) + w.writeheader() + w.writerows(pair_rows) + + with max_csv.open('w', newline='') as f: + w = csv.DictWriter(f, fieldnames=list(max_rows[0].keys())) + w.writeheader() + w.writerows(max_rows) + + pair_map = {(r['n'], r['t']): r for r in pair_rows} + cmp_rows = [] + for mr in max_rows: + key = (mr['n'], mr['t']) + pr = pair_map[key] + speedup = pr['fb_total_ms'] / mr['fb_total_ms'] + cmp_rows.append({ + 'n': mr['n'], 
+ 't': mr['t'], + 'pairwise_fb_total_ms': pr['fb_total_ms'], + 'max_reduce_fb_total_ms': mr['fb_total_ms'], + 'speedup_max_over_pair': speedup, + 'winner': 'max_reduce' if speedup > 1.0 else 'pairwise', + }) + + with cmp_csv.open('w', newline='') as f: + w = csv.DictWriter(f, fieldnames=list(cmp_rows[0].keys())) + w.writeheader() + w.writerows(sorted(cmp_rows, key=lambda r: (r['n'], r['t']))) + + vals = [r['speedup_max_over_pair'] for r in cmp_rows] + max_wins = sum(1 for r in cmp_rows if r['winner'] == 'max_reduce') + pair_wins = len(cmp_rows) - max_wins + print(f"{compiler}: points={len(cmp_rows)} max_wins={max_wins} pair_wins={pair_wins} median={statistics.median(vals):.6f}") + +print('DONE') diff --git a/benchmark-analysis/run_focus_single_compiler.py b/benchmark-analysis/run_focus_single_compiler.py new file mode 100644 index 0000000..ccd402b --- /dev/null +++ b/benchmark-analysis/run_focus_single_compiler.py @@ -0,0 +1,157 @@ +import argparse +import csv +import pathlib +import re +import statistics +import subprocess + + +COMPILERS = { + "msvc": { + "pair_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-msvc"), + "max_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-msvc"), + "out_dir": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-msvc-rerun"), + }, + "clangcl": { + "pair_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-clangcl"), + "max_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-clangcl"), + "out_dir": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-clangcl-rerun"), + }, + "mingw": { + "pair_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-mingw"), + "max_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-mingw"), + "out_dir": 
pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-mingw-rerun"), + }, +} + +N_VALUES = list(range(2, 9)) +T_VALUES = [500, 1000, 2000, 5000, 10000, 100000] + +FB_BLOCK_RE = re.compile(r"Forward-Backward phase breakdown:(.*?)Viterbi phase breakdown:", re.S) +NUM_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)") + + +def parse_output(text: str) -> dict: + block_match = FB_BLOCK_RE.search(text) + if not block_match: + raise RuntimeError("Could not find Forward-Backward breakdown block") + block = block_match.group(1) + + def metric(label: str) -> float: + for line in block.splitlines(): + if label in line: + nums = NUM_RE.findall(line) + if nums: + return float(nums[0]) + raise RuntimeError(f"Missing metric line for {label}") + + total_line = None + for line in block.splitlines(): + if line.strip().startswith("TOTAL"): + total_line = line + break + if total_line is None: + raise RuntimeError("Missing TOTAL line in Forward-Backward block") + + total_nums = NUM_RE.findall(total_line) + if not total_nums: + raise RuntimeError("Missing TOTAL numeric value in Forward-Backward block") + + return { + "fb_total_ms": float(total_nums[0]), + "forward_ms": metric("Forward recursion"), + "backward_ms": metric("Backward recursion"), + } + + +def run_grid(build_dir: pathlib.Path, mode: str, runs: int, warmup: int) -> list: + exe = build_dir / "tools" / "hotspot_breakdown.exe" + if not exe.exists(): + raise FileNotFoundError(f"Missing executable: {exe}") + rows = [] + for n in N_VALUES: + for t in T_VALUES: + proc = subprocess.run( + [str(exe), str(n), str(t), str(runs), str(warmup)], + cwd=str(build_dir), + capture_output=True, + text=True, + check=True, + ) + parsed = parse_output(proc.stdout) + rows.append( + { + "mode": mode, + "n": n, + "t": t, + "runs": runs, + "warmup": warmup, + "fb_total_ms": parsed["fb_total_ms"], + "forward_ms": parsed["forward_ms"], + "backward_ms": parsed["backward_ms"], + } + ) + return rows + + +def write_csv(path: 
pathlib.Path, rows: list) -> None: + with path.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--compiler", choices=sorted(COMPILERS.keys()), required=True) + parser.add_argument("--runs", type=int, default=5) + parser.add_argument("--warmup", type=int, default=2) + args = parser.parse_args() + + cfg = COMPILERS[args.compiler] + out_dir = cfg["out_dir"] + out_dir.mkdir(parents=True, exist_ok=True) + + pair_rows = run_grid(cfg["pair_build"], "pairwise", args.runs, args.warmup) + max_rows = run_grid(cfg["max_build"], "max_reduce", args.runs, args.warmup) + + pair_csv = out_dir / "focused_pairwise_n2_8.csv" + max_csv = out_dir / "focused_max_reduce_n2_8.csv" + cmp_csv = out_dir / "focused_pairwise_vs_max_reduce_n2_8.csv" + + write_csv(pair_csv, pair_rows) + write_csv(max_csv, max_rows) + + pair_map = {(r["n"], r["t"]): r for r in pair_rows} + cmp_rows = [] + for mr in max_rows: + pr = pair_map[(mr["n"], mr["t"])] + speedup = pr["fb_total_ms"] / mr["fb_total_ms"] + cmp_rows.append( + { + "n": mr["n"], + "t": mr["t"], + "pairwise_fb_total_ms": pr["fb_total_ms"], + "max_reduce_fb_total_ms": mr["fb_total_ms"], + "speedup_max_over_pair": speedup, + "winner": "max_reduce" if speedup > 1.0 else "pairwise", + } + ) + + cmp_rows.sort(key=lambda row: (row["n"], row["t"])) + write_csv(cmp_csv, cmp_rows) + + speedups = [row["speedup_max_over_pair"] for row in cmp_rows] + max_wins = sum(1 for row in cmp_rows if row["winner"] == "max_reduce") + pair_wins = len(cmp_rows) - max_wins + print( + f"{args.compiler}: points={len(cmp_rows)} max_wins={max_wins} " + f"pair_wins={pair_wins} median={statistics.median(speedups):.6f}" + ) + print(f"wrote: {pair_csv}") + print(f"wrote: {max_csv}") + print(f"wrote: {cmp_csv}") + + +if __name__ == "__main__": + main() diff --git a/benchmark-analysis/run_hmmlib_passes.py 
b/benchmark-analysis/run_hmmlib_passes.py new file mode 100644 index 0000000..34380f8 --- /dev/null +++ b/benchmark-analysis/run_hmmlib_passes.py @@ -0,0 +1,94 @@ +import argparse +import csv +import os +import pathlib +import re +import statistics +import subprocess + + +LIBHMM_RE = re.compile(r"libhmm average throughput:\s*([0-9]+(?:\.[0-9]+)?)\s+observations/ms") +HMMLIB_RE = re.compile(r"HMMLib average throughput:\s*([0-9]+(?:\.[0-9]+)?)\s+observations/ms") +RATIO_RE = re.compile(r"Overall performance ratio:\s*([0-9]+(?:\.[0-9]+)?)x\s+\(HMMLib/libhmm\)") + + +def parse_summary(output: str) -> dict: + m_libhmm = LIBHMM_RE.search(output) + m_hmmlib = HMMLIB_RE.search(output) + m_ratio = RATIO_RE.search(output) + if not (m_libhmm and m_hmmlib and m_ratio): + raise RuntimeError("Could not parse benchmark summary lines from comparator output") + return { + "libhmm_avg_obs_ms": float(m_libhmm.group(1)), + "hmmlib_avg_obs_ms": float(m_hmmlib.group(1)), + "ratio_hmmlib_over_libhmm": float(m_ratio.group(1)), + } + + +def median(values: list[float]) -> float: + return statistics.median(values) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--exe", required=True) + parser.add_argument("--dll-dir", required=True) + parser.add_argument("--passes", type=int, default=9) + parser.add_argument("--label", required=True) + parser.add_argument("--out-csv", required=True) + args = parser.parse_args() + + exe = pathlib.Path(args.exe) + if not exe.exists(): + raise FileNotFoundError(f"Missing executable: {exe}") + dll_dir = pathlib.Path(args.dll_dir) + if not dll_dir.exists(): + raise FileNotFoundError(f"Missing DLL directory: {dll_dir}") + + env = os.environ.copy() + env["PATH"] = f"{dll_dir};{env.get('PATH', '')}" + + rows = [] + for run_idx in range(1, args.passes + 1): + proc = subprocess.run( + [str(exe)], + cwd=str(exe.parent), + env=env, + capture_output=True, + text=True, + check=True, + ) + parsed = parse_summary(proc.stdout) + row = 
{"label": args.label, "pass": run_idx} + row.update(parsed) + rows.append(row) + print( + f"{args.label} pass {run_idx}/{args.passes}: " + f"libhmm={parsed['libhmm_avg_obs_ms']:.1f} " + f"hmmlib={parsed['hmmlib_avg_obs_ms']:.1f} " + f"ratio={parsed['ratio_hmmlib_over_libhmm']:.3f}" + ) + + out_csv = pathlib.Path(args.out_csv) + out_csv.parent.mkdir(parents=True, exist_ok=True) + with out_csv.open("w", newline="") as f: + writer = csv.DictWriter( + f, + fieldnames=["label", "pass", "libhmm_avg_obs_ms", "hmmlib_avg_obs_ms", "ratio_hmmlib_over_libhmm"], + ) + writer.writeheader() + writer.writerows(rows) + + lib_vals = [row["libhmm_avg_obs_ms"] for row in rows] + hm_vals = [row["hmmlib_avg_obs_ms"] for row in rows] + ratio_vals = [row["ratio_hmmlib_over_libhmm"] for row in rows] + print( + f"{args.label} medians: " + f"libhmm={median(lib_vals):.1f} hmmlib={median(hm_vals):.1f} " + f"ratio={median(ratio_vals):.3f}" + ) + print(f"wrote: {out_csv}") + + +if __name__ == "__main__": + main() diff --git a/benchmark-analysis/summarize_windows_compiler_rerun.py b/benchmark-analysis/summarize_windows_compiler_rerun.py new file mode 100644 index 0000000..918045f --- /dev/null +++ b/benchmark-analysis/summarize_windows_compiler_rerun.py @@ -0,0 +1,94 @@ +import csv +import math +import pathlib +import statistics + + +ROOT = pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis") + +FOCUS = { + "msvc": ROOT / "focus-n2-8-ryzen-windows-msvc-rerun" / "focused_pairwise_vs_max_reduce_n2_8.csv", + "clangcl": ROOT / "focus-n2-8-ryzen-windows-clangcl-rerun" / "focused_pairwise_vs_max_reduce_n2_8.csv", + "mingw": ROOT / "focus-n2-8-ryzen-windows-mingw-rerun" / "focused_pairwise_vs_max_reduce_n2_8.csv", +} + +HMMLIB = { + "msvc_control": ROOT / "hmmlib-9pass-ryzen-windows-msvc-rerun" / "control_passes.csv", + "msvc_adaptive": ROOT / "hmmlib-9pass-ryzen-windows-msvc-rerun" / "adaptive_passes.csv", + "mingw_control": ROOT / "hmmlib-9pass-ryzen-windows-mingw-rerun" / 
"control_passes.csv", + "mingw_adaptive": ROOT / "hmmlib-9pass-ryzen-windows-mingw-rerun" / "adaptive_passes.csv", + "clangcl_control": ROOT / "hmmlib-9pass-ryzen-windows-clangcl-rerun-o2" / "control_passes.csv", + "clangcl_adaptive": ROOT / "hmmlib-9pass-ryzen-windows-clangcl-rerun-o2" / "adaptive_passes.csv", +} + + +def geomean(vals: list[float]) -> float: + return math.exp(sum(math.log(v) for v in vals) / len(vals)) + + +def read_csv(path: pathlib.Path) -> list[dict]: + with path.open(newline="") as f: + return list(csv.DictReader(f)) + + +def summarize_focus() -> None: + print("FOCUSED_SWEEP_SUMMARY") + for compiler, path in FOCUS.items(): + rows = read_csv(path) + speedups = [float(r["speedup_max_over_pair"]) for r in rows] + max_wins = sum(1 for r in rows if r["winner"] == "max_reduce") + pair_wins = len(rows) - max_wins + pair_vals = [float(r["pairwise_fb_total_ms"]) for r in rows] + max_vals = [float(r["max_reduce_fb_total_ms"]) for r in rows] + print( + f"{compiler}: points={len(rows)} max_wins={max_wins} pair_wins={pair_wins} " + f"median_speedup={statistics.median(speedups):.6f} " + f"geomean_pair_ms={geomean(pair_vals):.6f} geomean_max_ms={geomean(max_vals):.6f}" + ) + for n in range(2, 9): + nrows = [r for r in rows if int(r["n"]) == n] + n_max = sum(1 for r in nrows if r["winner"] == "max_reduce") + print(f" n={n}: max_wins={n_max}/{len(nrows)}") + + +def summarize_hmmlib() -> None: + print("HMMLIB_9PASS_SUMMARY") + med = {} + for label, path in HMMLIB.items(): + rows = read_csv(path) + lib_vals = [float(r["libhmm_avg_obs_ms"]) for r in rows] + hm_vals = [float(r["hmmlib_avg_obs_ms"]) for r in rows] + ratio_vals = [float(r["ratio_hmmlib_over_libhmm"]) for r in rows] + med[label] = { + "lib": statistics.median(lib_vals), + "hm": statistics.median(hm_vals), + "ratio": statistics.median(ratio_vals), + } + print( + f"{label}: passes={len(rows)} med_libhmm={med[label]['lib']:.4f} " + f"med_hmmlib={med[label]['hm']:.4f} med_ratio={med[label]['ratio']:.6f}" 
+ ) + + msvc_delta = (med["msvc_adaptive"]["lib"] / med["msvc_control"]["lib"] - 1.0) * 100.0 + mingw_delta = (med["mingw_adaptive"]["lib"] / med["mingw_control"]["lib"] - 1.0) * 100.0 + clangcl_delta = (med["clangcl_adaptive"]["lib"] / med["clangcl_control"]["lib"] - 1.0) * 100.0 + print(f"msvc adaptive_vs_control delta_libhmm_pct={msvc_delta:.6f}") + print(f"mingw adaptive_vs_control delta_libhmm_pct={mingw_delta:.6f}") + print(f"clangcl adaptive_vs_control delta_libhmm_pct={clangcl_delta:.6f}") + ctrl_mingw_vs_msvc = (med["mingw_control"]["lib"] / med["msvc_control"]["lib"] - 1.0) * 100.0 + adapt_mingw_vs_msvc = (med["mingw_adaptive"]["lib"] / med["msvc_adaptive"]["lib"] - 1.0) * 100.0 + ctrl_clangcl_vs_msvc = (med["clangcl_control"]["lib"] / med["msvc_control"]["lib"] - 1.0) * 100.0 + adapt_clangcl_vs_msvc = (med["clangcl_adaptive"]["lib"] / med["msvc_adaptive"]["lib"] - 1.0) * 100.0 + ctrl_clangcl_vs_mingw = (med["clangcl_control"]["lib"] / med["mingw_control"]["lib"] - 1.0) * 100.0 + adapt_clangcl_vs_mingw = (med["clangcl_adaptive"]["lib"] / med["mingw_adaptive"]["lib"] - 1.0) * 100.0 + print(f"mingw_vs_msvc control_libhmm_pct={ctrl_mingw_vs_msvc:.6f}") + print(f"mingw_vs_msvc adaptive_libhmm_pct={adapt_mingw_vs_msvc:.6f}") + print(f"clangcl_vs_msvc control_libhmm_pct={ctrl_clangcl_vs_msvc:.6f}") + print(f"clangcl_vs_msvc adaptive_libhmm_pct={adapt_clangcl_vs_msvc:.6f}") + print(f"clangcl_vs_mingw control_libhmm_pct={ctrl_clangcl_vs_mingw:.6f}") + print(f"clangcl_vs_mingw adaptive_libhmm_pct={adapt_clangcl_vs_mingw:.6f}") + + +if __name__ == "__main__": + summarize_focus() + summarize_hmmlib() diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 098e083..a50a9a6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -65,14 +65,14 @@ if(EXISTS "${LAMP_DIR}/hmmFind") set(LAMP_READY ON) endif() +# HMMLib is a header-only library (SSE2/NEON intrinsics, no Boost runtime +# dependency). 
The Boost check previously here was spurious — HMMLib headers +# contain no Boost includes. Detect by checking for the canonical header. set(HMMLIB_READY OFF) -if(EXISTS "${HMMLIB_DIR}") - find_package(Boost QUIET) - if(Boost_FOUND) - set(HMMLIB_READY ON) - else() - message(WARNING "HMMLib directory found at ${HMMLIB_DIR}, but Boost was not found. HMMLib-dependent benchmarks will be skipped.") - endif() +if(EXISTS "${HMMLIB_DIR}/HMMlib/hmm.hpp") + set(HMMLIB_READY ON) +elseif(EXISTS "${HMMLIB_DIR}") + message(WARNING "HMMLib directory found at ${HMMLIB_DIR} but hmm.hpp not found. HMMLib-dependent benchmarks will be skipped.") else() message(WARNING "HMMLib directory not found at ${HMMLIB_DIR}. HMMLib-dependent benchmarks will be skipped.") endif() @@ -159,11 +159,7 @@ function(enable_hmmlib target_name) target_include_directories(${target_name} SYSTEM PRIVATE ${HMMLIB_DIR} - ${Boost_INCLUDE_DIRS} ) - if(Boost_LIBRARIES) - target_link_libraries(${target_name} PRIVATE ${Boost_LIBRARIES}) - endif() endfunction() function(enable_stochhmm target_name) diff --git a/benchmarks/docs/BENCHMARKING_RESULTS.md b/benchmarks/docs/BENCHMARKING_RESULTS.md index 9b9b900..765c7d7 100644 --- a/benchmarks/docs/BENCHMARKING_RESULTS.md +++ b/benchmarks/docs/BENCHMARKING_RESULTS.md @@ -9,7 +9,7 @@ This document summarizes benchmark results comparing libhmm against major HMM li ### Libraries Tested 1. **libhmm** - Modern C++20 implementation with zero external dependencies -2. **HMMLib** - High-performance C++ library with Boost dependencies +2. **HMMLib** - High-performance C++ library with Boost dependencies 3. **StochHMM** - Bioinformatics-focused C++ library 4. **GHMM** - General Hidden Markov Model Library (C) 5. 
**HTK** - Hidden Markov Model Toolkit (command-line based) @@ -39,7 +39,7 @@ Two classic HMM benchmark problems were used across all libraries: - **Transitions**: Fair→Fair (0.95), Fair→Loaded (0.05), Loaded→Fair (0.10), Loaded→Loaded (0.90) - **Emissions**: Fair die (uniform 1/6), Loaded die (symbol 5 favored at 0.50) -#### 2. Weather Model Problem +#### 2. Weather Model Problem - **States**: 2 (Sunny, Rainy) - **Observations**: 2 symbols (Hot, Cold) - **Transitions**: Sunny→Sunny (0.7), Sunny→Rainy (0.3), Rainy→Sunny (0.4), Rainy→Rainy (0.6) @@ -61,7 +61,7 @@ To ensure fair comparison and detect numerical issues: ### Performance Metrics - **Forward-Backward algorithm timing**: Primary performance metric -- **Viterbi algorithm timing**: Secondary performance metric +- **Viterbi algorithm timing**: Secondary performance metric - **Throughput**: Observations processed per millisecond - **Scaling behavior**: Performance across different sequence lengths @@ -101,7 +101,7 @@ libhmm shows machine-precision agreement with key reference libraries: **Example numerical comparison** (Casino Problem, 1000 observations): - libhmm: -1.815e+03 -- HMMLib: -1.815e+03 +- HMMLib: -1.815e+03 - StochHMM: -1.815e+03 - GHMM: -1.815e+03 - HTK: -2.000e+03 ← **Deliberately rounded for computational efficiency** @@ -162,7 +162,7 @@ Historical snapshot from earlier benchmark runs; use the April 2026 consolidated **Medium Sequences (1,000-10,000 observations):** - GHMM: 20-25x faster than libhmm -- HMMLib: 15-20x faster than libhmm +- HMMLib: 15-20x faster than libhmm - StochHMM: 2x faster than libhmm - HTK: Approaching libhmm performance @@ -245,7 +245,7 @@ All libraries successfully processed sequences up to 1,000,000 observations with - Numerical precision is important - You can handle complex C API -#### Choose **HMMLib** when: +#### Choose **HMMLib** when: - High performance is needed - C++ integration is required - Boost dependencies are acceptable @@ -253,7 +253,7 @@ All libraries 
successfully processed sequences up to 1,000,000 observations with #### Choose **libhmm** when: - Modern C++20 features are desired -- Zero external dependencies are required +- Zero external dependencies are required - Code maintainability is important - Moderate performance is sufficient - Cross-platform compatibility is needed @@ -278,7 +278,7 @@ libhmm's performance should be evaluated in context: - For most practical applications, this performance is more than adequate - The ~20x speed difference with top performers matters primarily for: - High-frequency real-time applications - - Massive batch processing workflows + - Massive batch processing workflows - Training on extremely large datasets ### Future Development @@ -303,7 +303,7 @@ All benchmark code and configurations are available in the `benchmarks/` directo **Note on External Libraries**: The original source code for HMMLib, StochHMM, GHMM, and HTK is not included in this repository. To reproduce these benchmarks, these libraries must be obtained from their respective developers/maintainers and built according to their official documentation: - **HMMLib**: Available from original authors/research institutions -- **StochHMM**: https://github.com/KorfLab/StochHMM +- **StochHMM**: https://github.com/KorfLab/StochHMM - **GHMM**: http://ghmm.org - **HTK**: http://htk.eng.cam.ac.uk (requires registration) @@ -312,7 +312,7 @@ The benchmark implementations in this repository provide the integration code ne ### Validation Methodology The numerical accuracy validation included: - Direct log-likelihood comparison to machine precision -- Step-by-step forward algorithm verification +- Step-by-step forward algorithm verification - Cross-validation between multiple reference implementations - Deep numerical analysis of scaling factors and intermediate values @@ -432,7 +432,7 @@ This section adds an updated snapshot without removing any prior content. 
Earlie | `libhmm_vs_jahmm_benchmark`** | JAHMM | 7161.5 | 3803.6 | 0.53x | `build-benchmarks-release/benchmark-logs/libhmm_vs_jahmm_benchmark_after_pathfix.log` | | `libhmm_vs_lamp_benchmark` | LAMP | 6016.7 | 48.2 | 0.01x | Windows x86_64 run, April 2026 (post-warmup) | -\* Uses post-PI-correction StochHMM continuous results (`after_pi_fix`). +\* Uses post-PI-correction StochHMM continuous results (`after_pi_fix`). \** JAHMM benchmark log does not emit an average throughput summary line; values above are computed from per-run forward timings in the same log. ### Updated Code Quality and Maintainability Snapshot (All Evaluated Libraries) @@ -463,17 +463,17 @@ To capture correctness signal separately from throughput, three updated diagnost Key outcomes: -- **Canonical numerical parity with HMMLib** (`deep_numerical_analysis_modernized.log`): +- **Canonical numerical parity with HMMLib** (`deep_numerical_analysis_modernized.log`): Across sequence lengths 10, 50, 100, 200, 500, 1000, and 2000, libhmm and HMMLib log-likelihoods match to near machine precision. Maximum absolute difference observed: `5.093170e-11` (length 2000), with no length-dependent drift pattern. -- **Step-level forward-pass agreement** (`deep_numerical_analysis_modernized.log`): +- **Step-level forward-pass agreement** (`deep_numerical_analysis_modernized.log`): Normalized per-step forward-variable differences are in floating-point noise range (`~1e-16`, max shown `4.163e-16`), and final log-probability difference is `0.000000e+00`. -- **Distribution-layer Gaussian agreement across libraries** (`gaussian_distribution_comparison_modernized.log`): +- **Distribution-layer Gaussian agreement across libraries** (`gaussian_distribution_comparison_modernized.log`): libhmm, GHMM, and StochHMM report `MATCH` across all tested Gaussian cases (standard, shifted mean, negative mean, high variance), indicating aligned PDF/log-PDF behavior at the distribution layer. 
-- **Constructor semantics validated for reproducibility** (`diagnostic_accuracy_test_modernized.log`): +- **Constructor semantics validated for reproducibility** (`diagnostic_accuracy_test_modernized.log`): `GaussianDistribution(mean, second_parameter)` uses **standard deviation** semantics (not variance). This check avoids silent benchmark misconfiguration when mapping model parameters. -- **Canonical calculator self-consistency checks pass** (`diagnostic_accuracy_test_modernized.log`): +- **Canonical calculator self-consistency checks pass** (`diagnostic_accuracy_test_modernized.log`): ForwardBackward pointer/reference constructors and `getLogProbability()` vs `log(probability())` are numerically identical on the test model; a manual forward calculation also matches libhmm (`probability diff 6.939e-18`, `log diff 0.000e+00`). diff --git a/benchmarks/src/diagnostic_accuracy_test.cpp b/benchmarks/src/diagnostic_accuracy_test.cpp index f70092b..b237918 100644 --- a/benchmarks/src/diagnostic_accuracy_test.cpp +++ b/benchmarks/src/diagnostic_accuracy_test.cpp @@ -16,7 +16,7 @@ using namespace std; /** * DIAGNOSTIC TEST FOR NUMERICAL ACCURACY DISCREPANCIES - * + * * This test isolates potential issues in: * 1. Distribution implementations (PDF/log-PDF calculations) * 2. HMM setup (parameter setting) diff --git a/docs/GOLD_STANDARD_CHECKLIST.md b/docs/GOLD_STANDARD_CHECKLIST.md index 9abaaa3..ce69951 100644 --- a/docs/GOLD_STANDARD_CHECKLIST.md +++ b/docs/GOLD_STANDARD_CHECKLIST.md @@ -44,28 +44,28 @@ All distributions must implement the `EmissionDistribution` abstract interface. 
- `getBatchLogProbabilities(span, span)` — concrete non-virtual batch loop (tier 1 minimum) - `reset()` — reset to default parameters - `toString()` — human-readable string representation - + - ✅ **Rule of Five:** - Copy Constructor - - Move Constructor + - Move Constructor - Copy Assignment Operator - Move Assignment Operator - Destructor (virtual, defaulted) - + - ✅ **Caching System:** - Comprehensive caching of expensive calculations - Cache validation flags - Automatic cache invalidation on parameter changes - + - ✅ **Input Validation:** - Robust parameter validation with appropriate exceptions - NaN/infinity handling - Data validation in fitting methods - + - ✅ **Constants Usage:** - All numeric literals replaced with constants from `libhmm::constants` - No hardcoded magic numbers - + - ✅ **I/O  Operators:** - `operator==` - Equality comparison with tolerance - `operator<<` - Stream output @@ -74,13 +74,13 @@ All distributions must implement the `EmissionDistribution` abstract interface. ### Test Requirements - ✅ **Core Tests:** - Basic Functionality - - Probability Calculations + - Probability Calculations - Parameter Fitting - Parameter Validation - Copy/Move Semantics - Invalid Input Handling - Reset Functionality - + - ✅ **Advanced Tests:** - Log Probability calculations - String Representation @@ -88,7 +88,7 @@ All distributions must implement the `EmissionDistribution` abstract interface. - Performance characteristics (recommended) - Mathematical Correctness (recommended) - Numerical Stability (recommended) - + - ✅ **Gold Standard Tests:** - CDF calculations (where applicable) - Equality/I-O operators @@ -154,7 +154,7 @@ All distributions must implement the `EmissionDistribution` abstract interface. 
## Legend - ✅ **Complete**: Fully implemented and tested -- ❌ **Missing**: Needs to be implemented/added +- ❌ **Missing**: Needs to be implemented/added - ❓ **Unknown**: Needs assessment - 🔄 **In Progress**: Currently being worked on @@ -170,7 +170,7 @@ complete for all 15. No outstanding action items. ## Planned Update Order 1. ✅ **Gaussian** - Reference implementation (constants applied, comprehensive tests verified) -2. ✅ **Exponential** - Reference implementation (constants applied, comprehensive tests verified) +2. ✅ **Exponential** - Reference implementation (constants applied, comprehensive tests verified) 3. ✅ **Gamma** - Updated (constants applied, comprehensive tests verified) 4. ✅ **Uniform** - Updated (constants applied, comprehensive tests verified, performance tests added) 5. ✅ **Chi-squared** - Updated to Gold standard (constants applied, comprehensive tests verified) diff --git a/docs/STYLE_GUIDE.md b/docs/STYLE_GUIDE.md index 962133c..5d34438 100644 --- a/docs/STYLE_GUIDE.md +++ b/docs/STYLE_GUIDE.md @@ -113,13 +113,13 @@ private: double mean_{0.0}; // Private member double standardDeviation_{1.0}; // Private member mutable std::atomic cacheValid_{false}; // Private member - + static constexpr double DEFAULT_MEAN = 0.0; // Constant - + public: void setMean(double mean); // Public method double getMean() const noexcept; // Public method - + private: void validateParameters(double mean, double stdDev) const; // Private method void updateCache() const noexcept; // Private method @@ -169,14 +169,14 @@ for (const auto& item : container) { double getMean() const noexcept; // Long signatures (multi-line with parameters aligned) -void setParameters(double mean, - double standardDeviation, +void setParameters(double mean, + double standardDeviation, bool validateInputs = true); // Constructor initialization lists ExponentialDistribution(double lambda = 1.0) - : lambda_{lambda}, - logLambda_{0.0}, + : lambda_{lambda}, + logLambda_{0.0}, cacheValid_{false} { 
validateParameters(lambda); updateCache(); @@ -237,7 +237,7 @@ class GaussianDistribution { public: // Explicit single-argument constructor explicit GaussianDistribution(double mean = 0.0, double stdDev = 1.0); - + // Default special members when possible ~GaussianDistribution() = default; GaussianDistribution(const GaussianDistribution&) = default; @@ -286,7 +286,7 @@ private: /** * Validates parameters for the distribution * @param param1 First parameter with constraints - * @param param2 Second parameter with constraints + * @param param2 Second parameter with constraints * @throws std::invalid_argument if parameters are invalid */ void validateParameters(double param1, double param2) const { @@ -294,16 +294,16 @@ private: throw std::invalid_argument("param1 must be positive and finite"); } if (std::isnan(param2) || std::isinf(param2) || param2 <= 0.0) { - throw std::invalid_argument("param2 must be positive and finite"); + throw std::invalid_argument("param2 must be positive and finite"); } } public: - ExampleDistribution(double param1, double param2) + ExampleDistribution(double param1, double param2) : param1_{param1}, param2_{param2} { validateParameters(param1, param2); // Validate in constructor } - + void setParam1(double param1) { validateParameters(param1, param2_); // Validate in setter param1_ = param1; @@ -332,17 +332,17 @@ Use **Doxygen-style comments** for all public interfaces: ```cpp /** * Computes the probability density function for the Gaussian distribution. 
- * + * * The PDF is computed using the formula: * f(x) = (1/σ√(2π)) * exp(-0.5*((x-μ)/σ)²) - * + * * @param value The value at which to evaluate the PDF * @return Probability density at the given value * @throws std::invalid_argument if value is NaN or infinite - * + * * @note This method is thread-safe and uses cached normalization constants * @complexity O(1) - constant time computation - * + * * @example * @code * GaussianDistribution dist(0.0, 1.0); // Standard normal @@ -356,23 +356,23 @@ double getProbability(double value) override; ```cpp /** * Modern C++20 Gaussian distribution for modeling continuous symmetric data. - * + * * The Gaussian (Normal) distribution is a continuous probability distribution * characterized by its bell-shaped curve. It's fundamental in statistics and * is used extensively in machine learning and data analysis. - * + * * PDF: f(x) = (1/σ√(2π)) * exp(-0.5*((x-μ)/σ)²) * where μ is the mean and σ is the standard deviation (σ > 0) - * + * * Properties: - * - Mean: μ + * - Mean: μ * - Variance: σ² * - Support: x ∈ (-∞, ∞) * - Symmetry: Symmetric around μ - * + * * @note Thread-safe for read operations, not thread-safe for modifications * @note Uses efficient caching for repeated probability calculations - * + * * @example Basic usage: * @code * GaussianDistribution normal(0.0, 1.0); // Standard normal distribution @@ -415,7 +415,7 @@ if (!cacheValid_) { */ void testParameterValidation() { std::cout << "Testing parameter validation..." 
<< std::endl; - + // Test invalid constructor parameters try { GaussianDistribution dist(0.0, 0.0); // Invalid stddev @@ -423,16 +423,16 @@ void testParameterValidation() { } catch (const std::invalid_argument&) { // Expected behavior } - + // Test NaN and infinity double nan_val = std::numeric_limits<double>::quiet_NaN(); try { GaussianDistribution dist(nan_val, 1.0); - assert(false); // Should not reach here + assert(false); // Should not reach here } catch (const std::invalid_argument&) { // Expected behavior } - + std::cout << "✓ Parameter validation tests passed" << std::endl; } ``` @@ -449,7 +449,7 @@ void testParameterValidation() { ### 1. Required Tools - **clang-tidy**: Static analysis and code quality - **cppcheck**: Additional static analysis -- **Address Sanitizer**: Memory error detection +- **Address Sanitizer**: Memory error detection - **Undefined Behavior Sanitizer**: UB detection ### 2. Enabled Checks @@ -484,12 +484,12 @@ class GaussianDistribution { private: mutable double normalizationConstant_{0.0}; mutable std::atomic<bool> cacheValid_{false}; - + void updateCache() const noexcept { normalizationConstant_ = 1.0 / (standardDeviation_ * std::sqrt(2.0 * M_PI)); cacheValid_ = true; } - + public: double getProbability(double value) override { if (!cacheValid_) { @@ -521,15 +521,15 @@ private: } // Additional validations...
} - + public: // Constructor MUST call validateParameters - DistributionName(ParamType1 param1, ParamType2 param2) + DistributionName(ParamType1 param1, ParamType2 param2) : param1_{param1}, param2_{param2} { validateParameters(param1, param2); } - - // Setters MUST call validateParameters + + // Setters MUST call validateParameters void setParam1(ParamType1 param1) { validateParameters(param1, param2_); param1_ = param1; diff --git a/examples/economics_hmm_example.cpp b/examples/economics_hmm_example.cpp index 58511fa..9ac6f31 100644 --- a/examples/economics_hmm_example.cpp +++ b/examples/economics_hmm_example.cpp @@ -17,15 +17,15 @@ using libhmm::ViterbiTrainer; /** * Example: Economic and Social Science Modeling with Negative Binomial and Pareto HMM - * + * * This example demonstrates modeling economic phenomena using: * - Negative Binomial distribution for overdispersed count data (customer purchases, accidents) * - Pareto distribution for power-law phenomena (income, wealth, city sizes) - * + * * Hidden States for Customer Behavior: * - State 0: "Low Activity" (few purchases, occasional high-value items) * - State 1: "High Activity" (many purchases, frequent transactions) - * + * * Hidden States for Economic Regimes: * - State 0: "Normal Economy" (typical income distribution) * - State 1: "Crisis Economy" (more extreme inequality) diff --git a/examples/financial_hmm_example.cpp b/examples/financial_hmm_example.cpp index 4fbbe27..e983d51 100644 --- a/examples/financial_hmm_example.cpp +++ b/examples/financial_hmm_example.cpp @@ -17,11 +17,11 @@ using libhmm::ViterbiTrainer; /** * Example: Financial Market Volatility Modeling with Beta and Log-Normal HMM - * + * * This example demonstrates modeling financial market states using: * - Beta distribution for volatility measures (bounded between 0 and 1) * - Log-Normal distribution for asset returns (always positive) - * + * * Hidden States: * - State 0: "Low Volatility" (stable market conditions) * - State 1: "High 
Volatility" (turbulent market conditions) diff --git a/examples/poisson_hmm_example.cpp b/examples/poisson_hmm_example.cpp index 2568db4..dc56df9 100644 --- a/examples/poisson_hmm_example.cpp +++ b/examples/poisson_hmm_example.cpp @@ -14,7 +14,7 @@ using libhmm::ViterbiTrainer; /** * Example: Modeling Website Traffic with Poisson HMM - * + * * This example demonstrates using Poisson distributions in an HMM to model * website traffic patterns. We'll model two hidden states: * - State 0: "Normal Traffic" (λ = 10 requests/minute) diff --git a/examples/quality_control_hmm_example.cpp b/examples/quality_control_hmm_example.cpp index dc27ac6..9528087 100644 --- a/examples/quality_control_hmm_example.cpp +++ b/examples/quality_control_hmm_example.cpp @@ -16,11 +16,11 @@ using libhmm::ViterbiTrainer; /** * Example: Quality Control Process Monitoring with Binomial and Uniform HMM - * + * * This example demonstrates modeling quality control processes using: * - Binomial distribution for defect counts in batches * - Uniform distribution for measurement tolerances - * + * * Hidden States: * - State 0: "In Control" (low defect rate, tight tolerances) * - State 1: "Out of Control" (high defect rate, loose tolerances) diff --git a/examples/queuing_theory_hmm_example.cpp b/examples/queuing_theory_hmm_example.cpp index e90882f..3284f2f 100644 --- a/examples/queuing_theory_hmm_example.cpp +++ b/examples/queuing_theory_hmm_example.cpp @@ -19,17 +19,17 @@ using libhmm::ViterbiTrainer; /** * Example: Queuing Theory and Service Systems with HMM - * + * * This example demonstrates modeling service systems using HMMs to represent: * - Customer arrival patterns (Poisson arrivals) * - Service time distributions (Exponential, Gamma) * - System state transitions (load levels, server availability) - * + * * Service System States: * - State 0: "Low Load" (few customers, fast service) * - State 1: "Medium Load" (moderate queue, normal service) * - State 2: "High Load" (long queue, slow service) 
- * + * * Models Demonstrated: * 1. M/M/1 Queue (Poisson arrivals, Exponential service) * 2. M/G/1 Queue (Poisson arrivals, Gamma service times) diff --git a/examples/reliability_hmm_example.cpp b/examples/reliability_hmm_example.cpp index 9ca1140..832298f 100644 --- a/examples/reliability_hmm_example.cpp +++ b/examples/reliability_hmm_example.cpp @@ -16,11 +16,11 @@ using libhmm::WeibullDistribution; /** * Example: Reliability Engineering with Weibull and Exponential HMM - * + * * This example demonstrates modeling system reliability using: * - Weibull distribution for component lifetimes (flexible hazard rates) * - Exponential distribution for memoryless failure times - * + * * Hidden States: * - State 0: "Normal Operation" (low failure rate) * - State 1: "Degraded State" (higher failure rate) diff --git a/examples/statistical_process_control_hmm_example.cpp b/examples/statistical_process_control_hmm_example.cpp index cb35666..4a8dcee 100644 --- a/examples/statistical_process_control_hmm_example.cpp +++ b/examples/statistical_process_control_hmm_example.cpp @@ -20,17 +20,17 @@ using libhmm::ViterbiTrainer; /** * Example: Statistical Process Control with Chi-squared Distribution HMM - * + * * This example demonstrates quality control monitoring using: * - Chi-squared distribution for test statistics and variance measures * - Gaussian distribution for measurement errors * - Exponential distribution for time-between-failures - * + * * Hidden States: * - State 0: "In Control" (process operating normally) * - State 1: "Warning" (process showing signs of deviation) * - State 2: "Out of Control" (process requires intervention) - * + * * Key applications of Chi-squared in quality control: * - Goodness-of-fit testing for process capability * - Variance monitoring and control charts diff --git a/examples/swarm_coordination_example.cpp b/examples/swarm_coordination_example.cpp index ae0487a..e6a9cb3 100644 --- a/examples/swarm_coordination_example.cpp +++ 
b/examples/swarm_coordination_example.cpp @@ -1,23 +1,23 @@ /** * @file swarm_coordination_example.cpp * @brief Discrete State Swarm Coordination Example using libhmm - * + * * This example demonstrates how to use Hidden Markov Models for coordinating * a drone swarm through different formation states and mission phases. - * + * * Key Features: * - Discrete state space modeling (formation types, mission phases) * - Multi-dimensional discrete observations (altitude, speed, threats) * - Automatic calculator selection with SIMD optimization * - Real-time state prediction and formation coordination * - Fault detection and recovery mechanisms - * + * * Applications: * - Autonomous drone swarm coordination * - Multi-robot formation control * - Mission state management * - System health monitoring - * + * * @author libhmm development team * @version 2.5.0 */ diff --git a/include/libhmm/calculators/forward_backward_calculator.h b/include/libhmm/calculators/forward_backward_calculator.h index 3efd38d..eb2bb2a 100755 --- a/include/libhmm/calculators/forward_backward_calculator.h +++ b/include/libhmm/calculators/forward_backward_calculator.h @@ -1,7 +1,9 @@ #pragma once #include "libhmm/calculators/calculator.h" +#include "libhmm/performance/fb_recurrence_policy.h" #include +#include #include namespace libhmm { @@ -84,24 +86,60 @@ class ForwardBackwardCalculator : public Calculator { /** Number of HMM states used by this calculator. */ [[nodiscard]] std::size_t getNumStates() const noexcept { return numStates_; } + /** + * @brief Force a specific recurrence kernel for subsequent compute() calls. + * + * Pass `std::nullopt` to clear the override and return to adaptive policy. + * The override takes precedence over the static policy bins, but is itself + * superseded by the compile-time `LIBHMM_EXPERIMENT_FB_MAX_REDUCE` and + * `LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR` forcers when those are defined. 
+ */ + void setRecurrenceModeOverride(std::optional<FbRecurrenceMode> mode) noexcept { + modeOverride_ = mode; + } + + /** Currently active recurrence-mode override, if any. */ + [[nodiscard]] std::optional<FbRecurrenceMode> getRecurrenceModeOverride() const noexcept { + return modeOverride_; + } + + /** Recurrence mode resolved on the most recent compute() call. */ + [[nodiscard]] FbRecurrenceMode getRecurrenceMode() const noexcept { return currentMode_; } + private: std::size_t numStates_{0}; // Precomputed log-transition matrix [N x N]: logTrans_(i,j) = log a_{ij} Matrix logTrans_; + // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} + Matrix logTransT_; // Results Matrix logAlpha_; // T x N Matrix logBeta_; // T x N double logProbability_{-std::numeric_limits<double>::infinity()}; - // Per-state log-emission buffer reused each timestep [T x N, row-major]. - // Allocated once; filled by getBatchLogProbabilities per state. - mutable std::vector<double> logEmitBuf_; - + // State-major log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t). + // Filled directly by getBatchLogProbabilities per state. + std::vector<double> logEmitBuf_; + // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t). + // Derived from logEmitBuf_ for contiguous per-time access in recurrences. + std::vector<double> logEmitByTime_; + // Recurrence kernel resolved by the policy + override pipeline on the most + // recent compute() call. Defaults to Pairwise (the comparator-safe choice). + FbRecurrenceMode currentMode_{FbRecurrenceMode::Pairwise}; + // Optional per-instance override (Phase A4). Set via setRecurrenceModeOverride().
+ std::optional<FbRecurrenceMode> modeOverride_; + + [[nodiscard]] FbRecurrenceMode resolveRecurrenceMode(std::size_t numStates, + std::size_t sequenceLength) const noexcept; void precomputeLogTransitions(); void computeLogForward(); void computeLogBackward(); + void computeLogForwardPairwise(); + void computeLogForwardMaxReduce(); + void computeLogBackwardPairwise(); + void computeLogBackwardMaxReduce(); /** log-sum-exp of two log-space values: log(exp(a) + exp(b)). */ static double logSumExp(double a, double b) noexcept; diff --git a/include/libhmm/calculators/viterbi_calculator.h b/include/libhmm/calculators/viterbi_calculator.h index 7b9ae64..a341ecb 100755 --- a/include/libhmm/calculators/viterbi_calculator.h +++ b/include/libhmm/calculators/viterbi_calculator.h @@ -65,19 +65,24 @@ class ViterbiCalculator : public Calculator { // Precomputed log-transition matrix [N x N] Matrix logTrans_; + // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} + Matrix logTransT_; // Viterbi trellis: logDelta(t,i) = max log-prob path ending at state i at time t Matrix logDelta_; - // Backtrack pointers: psi(t,i) = arg max_j [logDelta(t-1,j) + logTrans(j,i)] - std::vector> psi_; + // Backtrack pointers in time-major contiguous storage: + // psi_[t * N + j] = arg max_i [logDelta(t-1,i) + logTrans(i,j)] + std::vector psi_; // Result StateSequence sequence_; double logProbability_{-std::numeric_limits<double>::infinity()}; - // Per-state emission buffer - mutable std::vector<double> logEmitBuf_; + // Per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) + std::vector<double> logEmitBuf_; + // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t) + std::vector<double> logEmitByTime_; void precomputeLogTransitions(); void runViterbi(); diff --git a/include/libhmm/distributions/beta_distribution.h b/include/libhmm/distributions/beta_distribution.h index 192ca40..b9be62d 100644 --- a/include/libhmm/distributions/beta_distribution.h +++ b/include/libhmm/distributions/beta_distribution.h @@ 
-8,14 +8,14 @@ namespace libhmm { /** * Beta distribution for modeling probabilities and proportions. - * - * The Beta distribution is a continuous probability distribution defined + * + * The Beta distribution is a continuous probability distribution defined * on the interval [0,1] and parameterized by two positive shape parameters * α (alpha) and β (beta). - * + * * PDF: f(x; α, β) = (x^(α-1) * (1-x)^(β-1)) / B(α, β) * where B(α, β) is the Beta function: B(α, β) = Γ(α)Γ(β)/Γ(α+β) - * + * * Special cases: * - α = β = 1: Uniform distribution on [0,1] * - α = β: Symmetric around 0.5 @@ -30,7 +30,7 @@ class BetaDistribution : public DistributionBase { double alpha_{1.0}; /** - * Shape parameter β (beta) - must be positive + * Shape parameter β (beta) - must be positive */ double beta_{1.0}; @@ -89,7 +89,7 @@ class BetaDistribution : public DistributionBase { public: /** * Constructs a Beta distribution with given shape parameters. - * + * * @param alpha Shape parameter α (must be positive) * @param beta Shape parameter β (must be positive) * @throws std::invalid_argument if parameters are not positive finite numbers @@ -139,7 +139,7 @@ class BetaDistribution : public DistributionBase { /** * Computes the probability density function for the Beta distribution. - * + * * @param value The value at which to evaluate the PDF (should be in [0,1]) * @return Probability density, or 0.0 if value is outside [0,1] */ @@ -154,9 +154,9 @@ class BetaDistribution : public DistributionBase { /** * Computes the cumulative distribution function for the Beta distribution. - * + * * Uses the regularized incomplete beta function I_x(α,β) - * + * * @param value The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ value) */ @@ -170,7 +170,7 @@ class BetaDistribution : public DistributionBase { /** * Vectorized batch computation of PDF for multiple values. * Optimized for processing many values efficiently with cache reuse. 
- * + * * @param values Vector of input values * @param results Output vector for results (will be resized if needed) */ @@ -179,7 +179,7 @@ class BetaDistribution : public DistributionBase { /** * Vectorized batch computation of log PDF for multiple values. * Optimized for processing many values efficiently with cache reuse. - * + * * @param values Vector of input values * @param results Output vector for results (will be resized if needed) */ @@ -194,21 +194,21 @@ class BetaDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the alpha (α) shape parameter. - * + * * @return Current alpha value */ double getAlpha() const noexcept { return alpha_; } /** * Sets the alpha (α) shape parameter. - * + * * @param alpha New alpha parameter (must be positive) * @throws std::invalid_argument if alpha <= 0 or is not finite */ @@ -220,14 +220,14 @@ class BetaDistribution : public DistributionBase { /** * Gets the beta (β) shape parameter. - * + * * @return Current beta value */ double getBeta() const noexcept { return beta_; } /** * Sets the beta (β) shape parameter. - * + * * @param beta New beta parameter (must be positive) * @throws std::invalid_argument if beta <= 0 or is not finite */ @@ -240,7 +240,7 @@ class BetaDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Beta(α, β), mean = α/(α+β) - * + * * @return Mean value */ double getMean() const noexcept { return alpha_ / (alpha_ + beta_); } @@ -248,7 +248,7 @@ class BetaDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For Beta(α, β), variance = αβ/((α+β)²(α+β+1)) - * + * * @return Variance value */ double getVariance() const noexcept { @@ -258,7 +258,7 @@ class BetaDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. 
- * + * * @return Standard deviation */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } diff --git a/include/libhmm/distributions/chi_squared_distribution.h b/include/libhmm/distributions/chi_squared_distribution.h index 458b07f..1db1414 100644 --- a/include/libhmm/distributions/chi_squared_distribution.h +++ b/include/libhmm/distributions/chi_squared_distribution.h @@ -8,11 +8,11 @@ namespace libhmm { /** * Chi-squared distribution for modeling sums of squared standard normal variables. - * + * * The Chi-squared distribution is a continuous probability distribution with support * on non-negative real numbers. It is a special case of the Gamma distribution and * arises frequently in statistical inference, particularly in hypothesis testing. - * + * * Mathematical properties: * - PDF: f(x; k) = (1/(2^(k/2) * Γ(k/2))) * x^(k/2-1) * e^(-x/2) * - Support: x ∈ [0, ∞) @@ -20,7 +20,7 @@ namespace libhmm { * - Mean: k * - Variance: 2k * - Relation to Gamma: χ²(k) = Gamma(k/2, 2) - * + * * Applications: * - Goodness-of-fit tests * - Tests of independence in contingency tables @@ -68,7 +68,7 @@ class ChiSquaredDistribution : public DistributionBase { public: /** * Constructs a Chi-squared distribution with given degrees of freedom. - * + * * @param degrees_of_freedom Degrees of freedom k (must be positive) * @throws std::invalid_argument if degrees_of_freedom <= 0 */ @@ -116,7 +116,7 @@ class ChiSquaredDistribution : public DistributionBase { /** * Computes the probability density function for the Chi-squared distribution. - * + * * @param value The value at which to evaluate the PDF (should be non-negative) * @return Probability density f(value|k), or 0.0 if value < 0 */ @@ -141,21 +141,21 @@ class ChiSquaredDistribution : public DistributionBase { /** * Returns a string representation of the distribution. 
- * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the degrees of freedom parameter. - * + * * @return Current degrees of freedom value */ double getDegreesOfFreedom() const noexcept { return degrees_of_freedom_; } /** * Sets the degrees of freedom parameter. - * + * * @param degrees_of_freedom New degrees of freedom parameter (must be positive) * @throws std::invalid_argument if degrees_of_freedom <= 0 or is not finite */ @@ -167,28 +167,28 @@ class ChiSquaredDistribution : public DistributionBase { /** * Gets the mean of the distribution. - * + * * @return Mean (k) */ double getMean() const noexcept { return degrees_of_freedom_; } /** * Gets the variance of the distribution. - * + * * @return Variance (2k) */ double getVariance() const noexcept { return 2.0 * degrees_of_freedom_; } /** * Gets the standard deviation of the distribution. - * + * * @return Standard deviation (√(2k)) */ double getStandardDeviation() const noexcept { return std::sqrt(2.0 * degrees_of_freedom_); } /** * Gets the mode of the distribution. - * + * * @return Mode (max(0, k-2)) */ double getMode() const noexcept { return std::max(0.0, degrees_of_freedom_ - 2.0); } diff --git a/include/libhmm/distributions/discrete_distribution.h b/include/libhmm/distributions/discrete_distribution.h index bfde0e2..54f69d1 100755 --- a/include/libhmm/distributions/discrete_distribution.h +++ b/include/libhmm/distributions/discrete_distribution.h @@ -9,21 +9,21 @@ namespace libhmm { /** * Modern C++20 Discrete distribution for modeling categorical data. - * + * * The Discrete distribution (also known as Categorical distribution) is a * discrete probability distribution that generalizes the Bernoulli distribution. * It describes the possible results of a random variable that can take on * one of K possible categories, with the probability of each category separately specified. 
- * + * * PMF: P(X = k) = p_k for k ∈ {0, 1, 2, ..., K-1} * where p_k is the probability of category k and ∑p_k = 1 - * + * * Properties: * - Support: {0, 1, 2, ..., numSymbols-1} * - Probability mass function defined for each discrete symbol * - All probabilities must sum to 1.0 * - Each probability must be in [0, 1] - * + * * Applications: * - Hidden Markov Models with discrete observations * - Classification problems @@ -177,7 +177,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the probability mass function value for a discrete observation. - * + * * @param value The discrete value (will be cast to integer index) * @return Probability mass for the given value, 0.0 if out of range */ @@ -203,7 +203,7 @@ class DiscreteDistribution : public DistributionBase { /** * Sets the probability for a specific discrete observation. - * + * * @param o The discrete observation (symbol index) * @param value The probability value (must be in [0,1]) * @throws std::invalid_argument if value is not a valid probability @@ -227,21 +227,21 @@ class DiscreteDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String showing all symbol probabilities */ std::string toString() const override; /** * Gets the number of discrete symbols in the distribution. - * + * * @return Number of symbols/categories */ std::size_t getNumSymbols() const noexcept { return numSymbols_; } /** * Gets the probability for a specific symbol. - * + * * @param index Symbol index (must be < numSymbols) * @return Probability for the symbol * @throws std::out_of_range if index is out of range @@ -255,7 +255,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the sum of all probabilities (should be approximately 1.0). - * + * * @return Sum of all probabilities */ double getProbabilitySum() const { @@ -277,7 +277,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the mean of the distribution. 
* For discrete distribution, mean = ∑(i * p_i) for i = 0 to numSymbols-1 - * + * * @return Mean value */ double getMean() const noexcept { @@ -291,7 +291,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For discrete distribution, variance = ∑(i² * p_i) - mean² - * + * * @return Variance value */ double getVariance() const noexcept { @@ -306,7 +306,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. - * + * * @return Standard deviation value */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -328,7 +328,7 @@ class DiscreteDistribution : public DistributionBase { /** * Evaluates the logarithm of the probability mass function * More numerically stable for small probabilities - * + * * @param value The discrete value (will be cast to integer index) * @return Log probability mass, -infinity if out of range or probability is 0 */ @@ -342,7 +342,7 @@ class DiscreteDistribution : public DistributionBase { /** * Evaluates the CDF at k using cumulative sum approach * Formula: CDF(k) = ∑(i=0 to k) P(X = i) - * + * * @param value The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ value) */ diff --git a/include/libhmm/distributions/distributions.h b/include/libhmm/distributions/distributions.h index 73fde1a..3c26853 100644 --- a/include/libhmm/distributions/distributions.h +++ b/include/libhmm/distributions/distributions.h @@ -3,21 +3,21 @@ /** * @file distributions.h * @brief Convenience header that includes all libhmm probability distributions - * + * * This header provides a single include point for all probability distributions * available in libhmm. It follows the standard library convention of providing * umbrella headers for related functionality. 
- * + * * Usage: * @code * #include "libhmm/distributions/distributions.h" - * + * * // All distributions are now available: * GaussianDistribution gauss(0.0, 1.0); * PoissonDistribution poisson(2.5); * DiscreteDistribution discrete(6); * @endcode - * + * * @note For better compilation times, consider including only the specific * distribution headers you need in performance-critical applications. */ @@ -51,15 +51,15 @@ /** * @namespace libhmm * @brief All distributions are available in the libhmm namespace - * + * * After including this header, all distribution classes are available: - * + * * **Discrete Distributions:** * - DiscreteDistribution: General discrete distribution * - BinomialDistribution: Binomial distribution B(n,p) * - NegativeBinomialDistribution: Negative binomial distribution * - PoissonDistribution: Poisson distribution P(λ) - * + * * **Continuous Distributions:** * - GaussianDistribution: Normal distribution N(μ,σ²) * - ExponentialDistribution: Exponential distribution Exp(λ) diff --git a/include/libhmm/distributions/exponential_distribution.h b/include/libhmm/distributions/exponential_distribution.h index 27ff656..2abbe25 100755 --- a/include/libhmm/distributions/exponential_distribution.h +++ b/include/libhmm/distributions/exponential_distribution.h @@ -8,15 +8,15 @@ namespace libhmm { /** * Modern C++20 Exponential distribution for modeling waiting times and decay processes. - * + * * The Exponential distribution is a continuous probability distribution that describes * the time between events in a Poisson point process. It's commonly used to model * lifetimes, waiting times, and decay processes. 
- * + * * PDF: f(x) = λ * exp(-λx) for x ≥ 0, 0 otherwise * CDF: F(x) = 1 - exp(-λx) for x ≥ 0, 0 otherwise * where λ is the rate parameter (λ > 0) - * + * * Properties: * - Mean: 1/λ * - Variance: 1/λ² @@ -79,7 +79,7 @@ class ExponentialDistribution : public DistributionBase { public: /** * Constructs an Exponential distribution with given rate parameter. - * + * * @param lambda Rate parameter λ (must be positive) * @throws std::invalid_argument if lambda is invalid */ @@ -129,7 +129,7 @@ class ExponentialDistribution : public DistributionBase { /** * Computes the probability density function for the Exponential distribution. - * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -163,21 +163,21 @@ class ExponentialDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the rate parameter λ. - * + * * @return Current rate parameter value */ double getLambda() const noexcept { return lambda_; } /** * Sets the rate parameter λ. - * + * * @param lambda New rate parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -191,7 +191,7 @@ class ExponentialDistribution : public DistributionBase { * Gets the mean of the distribution. * For Exponential distribution, mean = 1/λ * Uses cached value to eliminate division. 
- * + * * @return Mean value */ double getMean() const noexcept { @@ -218,7 +218,7 @@ class ExponentialDistribution : public DistributionBase { /** * Evaluates the CDF at x using the standard exponential CDF formula * For exponential distribution: F(x) = 1 - exp(-λx) for x ≥ 0, 0 otherwise - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ diff --git a/include/libhmm/distributions/gamma_distribution.h b/include/libhmm/distributions/gamma_distribution.h index 31381e9..3474aa1 100755 --- a/include/libhmm/distributions/gamma_distribution.h +++ b/include/libhmm/distributions/gamma_distribution.h @@ -8,19 +8,19 @@ namespace libhmm { /** * Modern C++20 Gamma distribution for modeling continuous non-negative data. - * + * * The Gamma distribution is a versatile continuous probability distribution * commonly used to model waiting times, failure rates, and size distributions. * It generalizes the exponential distribution and is the conjugate prior for * the precision of a normal distribution. - * + * * PDF: f(x) = (1/(Γ(k)θ^k)) * x^(k-1) * exp(-x/θ) for x ≥ 0 * where k is the shape parameter (k > 0) and θ is the scale parameter (θ > 0) * Γ(k) is the gamma function - * + * * Alternative parameterization uses rate parameter β = 1/θ: * PDF: f(x) = (β^k/Γ(k)) * x^(k-1) * exp(-βx) - * + * * Properties: * - Mean: k*θ (or k/β) * - Variance: k*θ² (or k/β²) @@ -94,7 +94,7 @@ class GammaDistribution : public DistributionBase { public: /** * Constructs a Gamma distribution with given parameters. - * + * * @param k Shape parameter k (must be positive) * @param theta Scale parameter θ (must be positive) * @throws std::invalid_argument if parameters are invalid @@ -140,7 +140,7 @@ class GammaDistribution : public DistributionBase { /** * Computes the probability density function for the Gamma distribution. 
- * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -150,7 +150,7 @@ class GammaDistribution : public DistributionBase { * Evaluates the logarithm of the probability density function * Formula: log PDF(x) = (k-1)*ln(x) - x/θ - k*ln(θ) - ln(Γ(k)) * More numerically stable for small probabilities - * + * * @param x The value at which to evaluate the log PDF * @return Log probability density */ @@ -165,7 +165,7 @@ class GammaDistribution : public DistributionBase { * Evaluates the CDF at x using the incomplete gamma function * Formula: CDF(x) = P(k, x/θ) = γ(k, x/θ) / Γ(k) * where P is the regularized incomplete gamma function - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -185,28 +185,28 @@ class GammaDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ [[nodiscard]] std::string toString() const override; /** * Gets the shape parameter k. - * + * * @return Current shape parameter value */ [[nodiscard]] double getK() const noexcept { return k_; } /** * Gets the scale parameter θ. - * + * * @return Current scale parameter value */ [[nodiscard]] double getTheta() const noexcept { return theta_; } /** * Sets the shape parameter k. - * + * * @param k New shape parameter (must be positive) * @throws std::invalid_argument if k <= 0 or is not finite */ @@ -230,7 +230,7 @@ class GammaDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Gamma distribution, mean = k*θ - * + * * @return Mean value */ [[nodiscard]] double getMean() const noexcept { return k_ * theta_; } @@ -238,7 +238,7 @@ class GammaDistribution : public DistributionBase { /** * Gets the variance of the distribution. 
* For Gamma distribution, variance = k*θ² - * + * * @return Variance value */ [[nodiscard]] double getVariance() const noexcept { return k_ * theta_ * theta_; } @@ -246,7 +246,7 @@ class GammaDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. * For Gamma distribution, std_dev = θ*√k - * + * * @return Standard deviation value */ [[nodiscard]] double getStandardDeviation() const noexcept { return theta_ * std::sqrt(k_); } @@ -255,14 +255,14 @@ class GammaDistribution : public DistributionBase { * Gets the mode of the distribution. * For Gamma distribution with k > 1, mode = (k-1)*θ * For k ≤ 1, the mode is at x = 0 (but PDF may be infinite there) - * + * * @return Mode value */ [[nodiscard]] double getMode() const noexcept { return (k_ > 1.0) ? (k_ - 1.0) * theta_ : 0.0; } /** * Gets the rate parameter β = 1/θ (alternative parameterization). - * + * * @return Rate parameter (1/θ) */ [[nodiscard]] double getRate() const noexcept { return 1.0 / theta_; } diff --git a/include/libhmm/distributions/gaussian_distribution.h b/include/libhmm/distributions/gaussian_distribution.h index 40a321c..9aafafb 100755 --- a/include/libhmm/distributions/gaussian_distribution.h +++ b/include/libhmm/distributions/gaussian_distribution.h @@ -143,7 +143,7 @@ class GaussianDistribution : public DistributionBase { /** * Computes the probability density function for the Gaussian distribution. 
* Formula: PDF(x) = (1/(σ√(2π))) * exp(-½((x-μ)/σ)²) - * + * * @param x The value at which to evaluate the PDF * @return Probability density */ @@ -153,7 +153,7 @@ class GaussianDistribution : public DistributionBase { * Evaluates the logarithm of the probability density function * Formula: log PDF(x) = -½log(2π) - log(σ) - ½((x-μ)/σ)² * More numerically stable for small probabilities - * + * * @param x The value at which to evaluate the log PDF * @return Log probability density */ @@ -169,7 +169,7 @@ class GaussianDistribution : public DistributionBase { /** * Evaluates the CDF at x using the error function * Formula: CDF(x) = (1/2) * (1 + erf((x-μ)/(σ√2))) - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -194,21 +194,21 @@ class GaussianDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the mean parameter μ. - * + * * @return Current mean value */ double getMean() const noexcept { return mean_; } /** * Sets the mean parameter μ. - * + * * @param mean New mean parameter (any finite value) * @throws std::invalid_argument if mean is not finite */ @@ -220,14 +220,14 @@ class GaussianDistribution : public DistributionBase { /** * Gets the standard deviation parameter σ. - * + * * @return Current standard deviation value */ double getStandardDeviation() const noexcept { return standardDeviation_; } /** * Sets the standard deviation parameter σ. - * + * * @param stdDev New standard deviation parameter (must be positive) * @throws std::invalid_argument if stdDev <= 0 or is not finite */ @@ -240,14 +240,14 @@ class GaussianDistribution : public DistributionBase { /** * Gets the variance of the distribution. 
* For Gaussian distribution, variance = σ² - * + * * @return Variance value */ double getVariance() const noexcept { return standardDeviation_ * standardDeviation_; } /** * Sets both parameters simultaneously. - * + * * @param mean New mean parameter * @param stdDev New standard deviation parameter * @throws std::invalid_argument if parameters are invalid diff --git a/include/libhmm/distributions/log_normal_distribution.h b/include/libhmm/distributions/log_normal_distribution.h index 17cf1d6..c4a125e 100755 --- a/include/libhmm/distributions/log_normal_distribution.h +++ b/include/libhmm/distributions/log_normal_distribution.h @@ -8,20 +8,20 @@ namespace libhmm { /** * Modern C++20 Log-Normal distribution for modeling positive continuous data. - * + * * The Log-Normal distribution is a continuous probability distribution of a * random variable whose logarithm is normally distributed. It's commonly used * to model sizes, lengths, and other positive quantities that arise from * multiplicative processes. - * + * * Important note about parameterization: * This implementation uses the "log-scale" parameterization where: * - μ (mean_) is the mean of the underlying normal distribution ln(X) * - σ (standardDeviation_) is the standard deviation of ln(X) - * + * * PDF: f(x) = (1/(x·σ·√(2π))) * exp(-½((ln(x)-μ)/σ)²) for x > 0 * where μ is the mean of ln(X) and σ is the std dev of ln(X) - * + * * Properties: * - Mean: exp(μ + σ²/2) * - Variance: (exp(σ²) - 1) * exp(2μ + σ²) @@ -79,7 +79,7 @@ class LogNormalDistribution : public DistributionBase { public: /** * Constructs a Log-Normal distribution with given parameters. 
- * + * * @param mean Mean of the underlying normal distribution (μ, any finite value) * @param standardDeviation Standard deviation of the underlying normal distribution (σ, must be positive) * @throws std::invalid_argument if parameters are invalid @@ -139,7 +139,7 @@ class LogNormalDistribution : public DistributionBase { /** * Computes the probability density function for the Log-Normal distribution. - * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -168,21 +168,21 @@ class LogNormalDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the mean parameter μ of the underlying normal distribution. - * + * * @return Current mean parameter value */ double getMean() const noexcept { return mean_; } /** * Sets the mean parameter μ of the underlying normal distribution. - * + * * @param mean New mean parameter (any finite value) * @throws std::invalid_argument if mean is not finite */ @@ -194,14 +194,14 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the standard deviation parameter σ of the underlying normal distribution. - * + * * @return Current standard deviation parameter value */ double getStandardDeviation() const noexcept { return standardDeviation_; } /** * Sets the standard deviation parameter σ of the underlying normal distribution. - * + * * @param stdDev New standard deviation parameter (must be positive) * @throws std::invalid_argument if stdDev <= 0 or is not finite */ @@ -213,7 +213,7 @@ class LogNormalDistribution : public DistributionBase { /** * Sets both parameters simultaneously. 
- * + * * @param mean New mean parameter * @param stdDev New standard deviation parameter * @throws std::invalid_argument if parameters are invalid @@ -228,7 +228,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the mean of the Log-Normal distribution (not the underlying normal). * For Log-Normal distribution, mean = exp(μ + σ²/2) - * + * * @return Mean of the Log-Normal distribution */ double getDistributionMean() const noexcept { @@ -239,7 +239,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the variance of the Log-Normal distribution. * For Log-Normal distribution, variance = (exp(σ²) - 1) * exp(2μ + σ²) - * + * * @return Variance of the Log-Normal distribution */ double getVariance() const noexcept { @@ -249,7 +249,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the standard deviation of the Log-Normal distribution. - * + * * @return Standard deviation of the Log-Normal distribution */ double getDistributionStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -257,7 +257,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the mode of the Log-Normal distribution. * For Log-Normal distribution, mode = exp(μ - σ²) - * + * * @return Mode of the Log-Normal distribution */ double getMode() const noexcept { @@ -268,7 +268,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the median of the Log-Normal distribution. 
* For Log-Normal distribution, median = exp(μ) - * + * * @return Median of the Log-Normal distribution */ double getMedian() const noexcept { return std::exp(mean_); } diff --git a/include/libhmm/distributions/negative_binomial_distribution.h b/include/libhmm/distributions/negative_binomial_distribution.h index beb2176..e793e4f 100644 --- a/include/libhmm/distributions/negative_binomial_distribution.h +++ b/include/libhmm/distributions/negative_binomial_distribution.h @@ -8,18 +8,18 @@ namespace libhmm { /** * Modern C++20 Negative Binomial distribution for modeling discrete count data. - * - * The Negative Binomial distribution models the number of failures before - * the r-th success in a sequence of independent Bernoulli trials, each with + * + * The Negative Binomial distribution models the number of failures before + * the r-th success in a sequence of independent Bernoulli trials, each with * success probability p. - * + * * PMF: P(X = k) = C(k+r-1, k) * p^r * (1-p)^k * where C(k+r-1, k) is the binomial coefficient - * + * * Alternative parameterization (often used in practice): * - r: number of successes (positive real number) * - p: success probability (in [0,1]) - * + * * Properties: * - Mean: r * (1-p) / p * - Variance: r * (1-p) / p² @@ -99,7 +99,7 @@ class NegativeBinomialDistribution : public DistributionBase { public: /** * Constructs a Negative Binomial distribution with given parameters. - * + * * @param r Number of successes (must be positive) * @param p Success probability (must be in (0,1]) * @throws std::invalid_argument if parameters are invalid @@ -169,7 +169,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Computes the probability mass function for the Negative Binomial distribution. 
- * + * * @param value The value at which to evaluate the PMF (will be rounded to nearest integer) * @return Probability mass */ @@ -189,21 +189,21 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the number of successes parameter r. - * + * * @return Current number of successes */ double getR() const noexcept { return r_; } /** * Sets the number of successes parameter r. - * + * * @param r New number of successes (must be positive) * @throws std::invalid_argument if r <= 0 */ @@ -215,14 +215,14 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the success probability parameter p. - * + * * @return Current success probability */ double getP() const noexcept { return p_; } /** * Sets the success probability parameter p. - * + * * @param p New success probability (must be in (0,1]) * @throws std::invalid_argument if p not in (0,1] */ @@ -235,7 +235,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Negative Binomial distribution, mean = r * (1-p) / p - * + * * @return Mean value */ double getMean() const noexcept { return r_ * (1.0 - p_) / p_; } @@ -243,21 +243,21 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For Negative Binomial distribution, variance = r * (1-p) / p² - * + * * @return Variance value */ double getVariance() const noexcept { return r_ * (1.0 - p_) / (p_ * p_); } /** * Gets the standard deviation of the distribution. - * + * * @return Standard deviation value */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } /** * Sets both parameters simultaneously. 
- * + * * @param r New number of successes * @param p New success probability * @throws std::invalid_argument if parameters are invalid @@ -272,7 +272,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Evaluates the logarithm of the probability mass function * More numerically stable for small probabilities - * + * * @param value The value at which to evaluate the log PMF * @return Log probability mass */ @@ -286,7 +286,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Evaluates the CDF at k using cumulative sum approach * Formula: CDF(k) = ∑(i=0 to k) P(X = i) - * + * * @param value The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ value) */ @@ -295,7 +295,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the mode of the distribution. * For Negative Binomial distribution, mode = floor((r-1)*(1-p)/p) if r > 1, else 0 - * + * * @return Mode value */ int getMode() const noexcept { @@ -308,7 +308,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the skewness of the distribution. * For Negative Binomial distribution, skewness = (2-p)/sqrt(r*(1-p)) - * + * * @return Skewness value */ double getSkewness() const noexcept { return (2.0 - p_) / std::sqrt(r_ * (1.0 - p_)); } @@ -316,7 +316,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the kurtosis of the distribution. 
* For Negative Binomial distribution, kurtosis = 3 + (6/r) + (p²/(r*(1-p))) - * + * * @return Kurtosis value */ double getKurtosis() const noexcept { return 3.0 + (6.0 / r_) + (p_ * p_) / (r_ * (1.0 - p_)); } diff --git a/include/libhmm/distributions/pareto_distribution.h b/include/libhmm/distributions/pareto_distribution.h index 32a7ed2..2baba0c 100755 --- a/include/libhmm/distributions/pareto_distribution.h +++ b/include/libhmm/distributions/pareto_distribution.h @@ -8,16 +8,16 @@ namespace libhmm { /** * Modern C++20 Pareto distribution for modeling power-law phenomena. - * + * * The Pareto distribution is a continuous probability distribution commonly * used to model income distribution, city population sizes, stock price * fluctuations, and other phenomena that follow the "80-20 rule" or * Pareto principle. - * + * * PDF: f(x) = (k * x_m^k) / x^(k+1) for x ≥ x_m, 0 otherwise * CDF: F(x) = 1 - (x_m/x)^k for x ≥ x_m, 0 otherwise * where k is the shape parameter (k > 0) and x_m is the scale parameter (x_m > 0) - * + * * Properties: * - Mean: k*x_m/(k-1) for k > 1, undefined for k ≤ 1 * - Variance: (k*x_m²)/((k-1)²*(k-2)) for k > 2, undefined for k ≤ 2 @@ -107,7 +107,7 @@ class ParetoDistribution : public DistributionBase { public: /** * Constructs a Pareto distribution with given parameters. - * + * * @param k Shape parameter k (must be positive) * @param xm Scale parameter x_m (must be positive) * @throws std::invalid_argument if parameters are invalid @@ -173,7 +173,7 @@ class ParetoDistribution : public DistributionBase { /** * Computes the probability density function for the Pareto distribution. - * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -202,21 +202,21 @@ class ParetoDistribution : public DistributionBase { /** * Returns a string representation of the distribution. 
- * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the shape parameter k. - * + * * @return Current shape parameter value */ double getK() const noexcept { return k_; } /** * Sets the shape parameter k. - * + * * @param k New shape parameter (must be positive) * @throws std::invalid_argument if k <= 0 or is not finite */ @@ -228,14 +228,14 @@ class ParetoDistribution : public DistributionBase { /** * Gets the scale parameter x_m. - * + * * @return Current scale parameter value */ double getXm() const noexcept { return xm_; } /** * Sets the scale parameter x_m. - * + * * @param xm New scale parameter (must be positive) * @throws std::invalid_argument if xm <= 0 or is not finite */ @@ -247,7 +247,7 @@ class ParetoDistribution : public DistributionBase { /** * Sets both parameters simultaneously. - * + * * @param k New shape parameter * @param xm New scale parameter * @throws std::invalid_argument if parameters are invalid @@ -262,7 +262,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the mean of the Pareto distribution. * For Pareto distribution, mean = k*x_m/(k-1) if k > 1, undefined otherwise - * + * * @return Mean value if k > 1, otherwise returns infinity */ double getMean() const noexcept { @@ -272,7 +272,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the variance of the Pareto distribution. * For Pareto distribution, variance = (k*x_m²)/((k-1)²*(k-2)) if k > 2, undefined otherwise - * + * * @return Variance value if k > 2, otherwise returns infinity */ double getVariance() const noexcept { @@ -285,7 +285,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the standard deviation of the Pareto distribution. 
- * + * * @return Standard deviation if k > 2, otherwise returns infinity */ double getStandardDeviation() const noexcept { @@ -296,7 +296,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the mode of the Pareto distribution. * For Pareto distribution, mode = x_m (always at the scale parameter) - * + * * @return Mode value (equals x_m) */ double getMode() const noexcept { return xm_; } @@ -304,7 +304,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the median of the Pareto distribution. * For Pareto distribution, median = x_m * 2^(1/k) - * + * * @return Median value */ double getMedian() const noexcept { diff --git a/include/libhmm/distributions/poisson_distribution.h b/include/libhmm/distributions/poisson_distribution.h index a7a3c21..01436aa 100644 --- a/include/libhmm/distributions/poisson_distribution.h +++ b/include/libhmm/distributions/poisson_distribution.h @@ -9,11 +9,11 @@ namespace libhmm { /** * Modern C++20 Poisson distribution for modeling count data and rare events. - * - * The Poisson distribution models the number of events occurring in a fixed - * interval of time or space, given that these events occur with a known + * + * The Poisson distribution models the number of events occurring in a fixed + * interval of time or space, given that these events occur with a known * constant mean rate and independently of the time since the last event. - * + * * PMF: P(X = k) = (λ^k * e^(-λ)) / k! for k = 0, 1, 2, ... * where λ (lambda) is the rate parameter (mean number of events per interval) */ @@ -56,7 +56,7 @@ class PoissonDistribution : public DistributionBase { /** * Computes log(k!) using Stirling's approximation for large k, * exact computation for small k. - * + * * @param k Non-negative integer * @return log(k!) 
*/ @@ -69,7 +69,7 @@ class PoissonDistribution : public DistributionBase { /** * Validates that k is a valid count (non-negative integer) - * + * * @param k Value to validate * @return true if k is a valid count, false otherwise */ @@ -82,7 +82,7 @@ class PoissonDistribution : public DistributionBase { public: /** * Constructs a Poisson distribution with given rate parameter. - * + * * @param lambda Rate parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -138,7 +138,7 @@ class PoissonDistribution : public DistributionBase { /** * Computes the probability mass function P(X = k) for the Poisson distribution. - * + * * @param value The count value k (must be non-negative integer) * @return Probability P(X = k), or 0.0 if value is invalid */ @@ -160,21 +160,21 @@ class PoissonDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the rate parameter λ. - * + * * @return Current lambda value */ double getLambda() const noexcept { return lambda_; } /** * Sets the rate parameter λ. - * + * * @param lambda New rate parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -186,14 +186,14 @@ class PoissonDistribution : public DistributionBase { /** * Gets the mean of the distribution (equal to λ). - * + * * @return Mean value */ double getMean() const noexcept { return lambda_; } /** * Gets the variance of the distribution (equal to λ). - * + * * @return Variance value */ double getVariance() const noexcept { return lambda_; } @@ -201,7 +201,7 @@ class PoissonDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution (sqrt(λ)). * Uses cached value for efficiency. 
- * + * * @return Standard deviation */ double getStandardDeviation() const noexcept { @@ -213,7 +213,7 @@ class PoissonDistribution : public DistributionBase { /** * Evaluates the logarithm of the probability mass function * More numerically stable for small probabilities - * + * * @param value The count value k at which to evaluate the log PMF * @return Log probability mass */ @@ -227,7 +227,7 @@ class PoissonDistribution : public DistributionBase { /** * Evaluates the CDF at k using cumulative sum approach * Formula: CDF(k) = ∑(i=0 to k) P(X = i) - * + * * @param k The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ k) */ diff --git a/include/libhmm/distributions/rayleigh_distribution.h b/include/libhmm/distributions/rayleigh_distribution.h index 54e8cd4..8d09308 100644 --- a/include/libhmm/distributions/rayleigh_distribution.h +++ b/include/libhmm/distributions/rayleigh_distribution.h @@ -8,24 +8,24 @@ namespace libhmm { /** * Modern C++20 Rayleigh distribution for modeling magnitudes and speeds. - * + * * The Rayleigh distribution is a continuous probability distribution that arises * when modeling the magnitude of a 2D random vector whose components are independent, * identically distributed, zero-mean Gaussian random variables. - * + * * This is a special case of the Weibull distribution with shape parameter k = 2, * but implemented as a standalone class for maximum efficiency. 
- * + * * PDF: f(x) = (x/σ²) * exp(-x²/(2σ²)) for x ≥ 0, 0 otherwise * CDF: F(x) = 1 - exp(-x²/(2σ²)) for x ≥ 0, 0 otherwise * where σ is the scale parameter (σ > 0) - * + * * Properties: * - Mean: σ * √(π/2) ≈ 1.253 * σ * - Variance: σ² * (4-π)/2 ≈ 0.429 * σ² * - Mode: σ * - Support: x ∈ [0, ∞) - * + * * Applications: * - Wind speed modeling * - Wave height analysis @@ -74,7 +74,7 @@ class RayleighDistribution : public DistributionBase { mutable double mean_{constants::math::SQRT_PI_OVER_TWO}; /** - * Cached value of σ² * (4-π)/2 for variance calculation + * Cached value of σ² * (4-π)/2 for variance calculation * Variance = σ² * (4-π)/2 ≈ 0.4292036732 * σ² */ mutable double variance_{constants::math::FOUR_MINUS_PI_OVER_TWO}; @@ -106,7 +106,7 @@ class RayleighDistribution : public DistributionBase { public: /** * Constructs a Rayleigh distribution with given scale parameter. - * + * * @param sigma Scale parameter σ (must be positive) * @throws std::invalid_argument if sigma is invalid */ @@ -196,21 +196,21 @@ class RayleighDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the scale parameter σ. - * + * * @return Current scale parameter value */ double getSigma() const noexcept { return sigma_; } /** * Sets the scale parameter σ. - * + * * @param sigma New scale parameter (must be positive) * @throws std::invalid_argument if sigma is invalid */ @@ -223,7 +223,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the mean of the distribution. * Mean = σ * √(π/2) - * + * * @return Mean value */ double getMean() const noexcept { @@ -239,7 +239,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. 
- * + * * @return Standard deviation (square root of variance) */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -247,7 +247,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the mode of the distribution. * Mode = σ - * + * * @return Mode value */ double getMode() const noexcept { return sigma_; } @@ -255,7 +255,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the median of the distribution. * Median = σ * √(2 * ln(2)) ≈ 1.177 * σ - * + * * @return Median value */ double getMedian() const noexcept { return sigma_ * constants::math::SQRT_TWO_LN_TWO; } diff --git a/include/libhmm/distributions/student_t_distribution.h b/include/libhmm/distributions/student_t_distribution.h index 6c37714..4fed5cf 100644 --- a/include/libhmm/distributions/student_t_distribution.h +++ b/include/libhmm/distributions/student_t_distribution.h @@ -8,18 +8,18 @@ namespace libhmm { /** * @brief Student's t-distribution implementation - * + * * The Student's t-distribution is a probability distribution used in statistics, * particularly for small sample sizes or when the population variance is unknown. * It approaches the normal distribution as degrees of freedom increase. - * + * * Mathematical properties: * - PDF: f(x|ν) = Γ((ν+1)/2) / (√(νπ) * Γ(ν/2)) * (1 + x²/ν)^(-(ν+1)/2) * - Support: x ∈ (-∞, +∞) * - Parameters: ν > 0 (degrees of freedom) * - Mean: 0 (for ν > 1), undefined otherwise * - Variance: ν/(ν-2) (for ν > 2), infinite for 1 < ν ≤ 2, undefined for ν ≤ 1 - * + * * Applications: * - Statistical hypothesis testing (t-tests) * - Confidence intervals for unknown variance @@ -118,7 +118,7 @@ class StudentTDistribution : public DistributionBase { /** * Computes the probability density function for the Student's t-distribution. 
- * + * * @param value The value at which to evaluate the PDF * @return Probability density f(value|ν) */ @@ -212,7 +212,7 @@ class StudentTDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; diff --git a/include/libhmm/distributions/uniform_distribution.h b/include/libhmm/distributions/uniform_distribution.h index bc22f98..84f8bec 100644 --- a/include/libhmm/distributions/uniform_distribution.h +++ b/include/libhmm/distributions/uniform_distribution.h @@ -8,17 +8,17 @@ namespace libhmm { /** * @brief Uniform Distribution - * + * * The uniform distribution is a continuous probability distribution where all values * within a specified interval [a, b] have equal probability density. - * + * * Probability Density Function: * f(x) = 1/(b-a) for a ≤ x ≤ b, 0 otherwise - * + * * Parameters: * - a: Lower bound (minimum value) * - b: Upper bound (maximum value) - * + * * Properties: * - Mean: μ = (a + b) / 2 * - Variance: σ² = (b - a)² / 12 diff --git a/include/libhmm/distributions/weibull_distribution.h b/include/libhmm/distributions/weibull_distribution.h index 6600495..3de493b 100644 --- a/include/libhmm/distributions/weibull_distribution.h +++ b/include/libhmm/distributions/weibull_distribution.h @@ -8,21 +8,21 @@ namespace libhmm { /** * Weibull distribution for reliability analysis and survival modeling. - * - * The Weibull distribution is a continuous probability distribution defined + * + * The Weibull distribution is a continuous probability distribution defined * on the interval [0,∞) and parameterized by two positive parameters: * k (shape parameter) and λ (scale parameter). 
- * + * * PDF: f(x; k, λ) = (k/λ) * (x/λ)^(k-1) * exp(-(x/λ)^k) for x ≥ 0 * CDF: F(x; k, λ) = 1 - exp(-(x/λ)^k) for x ≥ 0 - * + * * Special cases: * - k = 1: Exponential distribution with rate λ - * - k = 2: Rayleigh distribution + * - k = 2: Rayleigh distribution * - k < 1: Decreasing failure rate (infant mortality) * - k = 1: Constant failure rate (random failures) * - k > 1: Increasing failure rate (wear-out failures) - * + * * Applications: * - Reliability engineering and failure analysis * - Survival analysis and lifetime modeling @@ -38,7 +38,7 @@ class WeibullDistribution : public DistributionBase { double k_{1.0}; /** - * Scale parameter λ (lambda) - must be positive + * Scale parameter λ (lambda) - must be positive * Controls the scale/spread of the distribution */ double lambda_{1.0}; @@ -97,7 +97,7 @@ class WeibullDistribution : public DistributionBase { public: /** * Constructs a Weibull distribution with given parameters. - * + * * @param k Shape parameter (must be positive) * @param lambda Scale parameter (must be positive) * @throws std::invalid_argument if parameters are not positive finite numbers @@ -161,7 +161,7 @@ class WeibullDistribution : public DistributionBase { /** * Computes the probability density function for the Weibull distribution. - * + * * @param value The value at which to evaluate the PDF (should be ≥ 0) * @return Probability density, or 0.0 if value is negative */ @@ -189,14 +189,14 @@ class WeibullDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Computes the cumulative distribution function (CDF) for the Weibull distribution. 
- * + * * @param x The value at which to evaluate the CDF (should be ≥ 0) * @return Cumulative probability P(X ≤ x), or 0.0 if x is negative */ @@ -204,7 +204,7 @@ class WeibullDistribution : public DistributionBase { /** * Equality comparison operator with tolerance for floating-point comparison. - * + * * @param other Distribution to compare with * @return true if distributions have the same parameters within tolerance */ @@ -212,14 +212,14 @@ class WeibullDistribution : public DistributionBase { /** * Gets the shape parameter k. - * + * * @return Current k value */ double getK() const noexcept { return k_; } /** * Sets the shape parameter k. - * + * * @param k New shape parameter (must be positive) * @throws std::invalid_argument if k <= 0 or is not finite */ @@ -231,14 +231,14 @@ class WeibullDistribution : public DistributionBase { /** * Gets the scale parameter λ (lambda). - * + * * @return Current lambda value */ double getLambda() const noexcept { return lambda_; } /** * Sets the scale parameter λ (lambda). - * + * * @param lambda New scale parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -251,7 +251,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Weibull(k, λ), mean = λ * Γ(1 + 1/k) - * + * * @return Mean value */ double getMean() const noexcept { return lambda_ * std::exp(std::lgamma(1.0 + 1.0 / k_)); } @@ -259,7 +259,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For Weibull(k, λ), variance = λ² * [Γ(1 + 2/k) - (Γ(1 + 1/k))²] - * + * * @return Variance value */ double getVariance() const noexcept { @@ -270,7 +270,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. 
- * + * * @return Standard deviation */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -278,7 +278,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the scale parameter (alternative name for lambda). * This is sometimes called the "characteristic life" in reliability contexts. - * + * * @return Scale parameter value */ double getScale() const noexcept { return lambda_; } @@ -286,7 +286,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the shape parameter (alternative name for k). * This is sometimes called the "Weibull modulus" in reliability contexts. - * + * * @return Shape parameter value */ double getShape() const noexcept { return k_; } diff --git a/include/libhmm/io/file_io_manager.h b/include/libhmm/io/file_io_manager.h index 98f3567..fe8da0b 100644 --- a/include/libhmm/io/file_io_manager.h +++ b/include/libhmm/io/file_io_manager.h @@ -27,7 +27,7 @@ class FileIOManager { /** * Reads entire file content as a string. - * + * * @param filepath Path to the file * @return File content as string * @throws std::runtime_error if file cannot be read @@ -36,7 +36,7 @@ class FileIOManager { /** * Writes string content to a file. - * + * * @param filepath Path to the file * @param content Content to write * @param append If true, append to file; if false, overwrite @@ -47,7 +47,7 @@ class FileIOManager { /** * Reads file content as lines. - * + * * @param filepath Path to the file * @return Vector of lines * @throws std::runtime_error if file cannot be read @@ -56,7 +56,7 @@ class FileIOManager { /** * Writes lines to a file. - * + * * @param filepath Path to the file * @param lines Lines to write * @param append If true, append to file; if false, overwrite @@ -67,7 +67,7 @@ class FileIOManager { /** * Safely copies a file with error handling. 
- * + * * @param source Source file path * @param destination Destination file path * @param overwrite If true, overwrite existing file @@ -78,7 +78,7 @@ class FileIOManager { /** * Creates a backup of a file with timestamp. - * + * * @param filepath Path to the file to backup * @return Path to the backup file * @throws std::runtime_error if backup fails @@ -87,7 +87,7 @@ class FileIOManager { /** * Validates file path and permissions. - * + * * @param filepath Path to validate * @param checkRead Check read permissions * @param checkWrite Check write permissions @@ -98,7 +98,7 @@ class FileIOManager { /** * Gets file size safely. - * + * * @param filepath Path to the file * @return File size in bytes, or nullopt if file doesn't exist */ @@ -107,7 +107,7 @@ class FileIOManager { /** * Checks if file has expected extension. - * + * * @param filepath Path to check * @param expectedExtension Expected file extension (with or without dot) * @return true if file has the expected extension @@ -117,7 +117,7 @@ class FileIOManager { /** * Creates directory structure if it doesn't exist. - * + * * @param dirpath Directory path to create * @throws std::runtime_error if directory creation fails */ @@ -125,7 +125,7 @@ class FileIOManager { /** * Gets file modification time. - * + * * @param filepath Path to the file * @return File modification time, or nullopt if file doesn't exist */ diff --git a/include/libhmm/io/xml_file_reader.h b/include/libhmm/io/xml_file_reader.h index b508fd4..10a490f 100644 --- a/include/libhmm/io/xml_file_reader.h +++ b/include/libhmm/io/xml_file_reader.h @@ -30,7 +30,7 @@ class XMLFileReader { /** * Reads an HMM from an XML file with comprehensive error handling. - * + * * @param filename Path to the input XML file * @return Loaded HMM object * @throws std::invalid_argument if filename is empty @@ -40,7 +40,7 @@ class XMLFileReader { /** * Reads an HMM from an XML file with filesystem path. 
- * + * * @param filepath Path to the input XML file * @return Loaded HMM object * @throws std::invalid_argument if filepath is empty @@ -50,7 +50,7 @@ class XMLFileReader { /** * Validates that a file can be read from the given path. - * + * * @param filepath Path to validate * @return true if the file can be read, false otherwise */ @@ -58,7 +58,7 @@ class XMLFileReader { /** * Checks if a file exists and appears to be a valid XML file. - * + * * @param filepath Path to check * @return true if file exists and has XML content, false otherwise */ @@ -67,7 +67,7 @@ class XMLFileReader { private: /** * Internal implementation for reading HMM from stream. - * + * * @param stream Input stream * @return Loaded HMM object * @throws std::runtime_error if deserialization fails diff --git a/include/libhmm/io/xml_file_writer.h b/include/libhmm/io/xml_file_writer.h index 6b7f610..3efb058 100755 --- a/include/libhmm/io/xml_file_writer.h +++ b/include/libhmm/io/xml_file_writer.h @@ -30,7 +30,7 @@ class XMLFileWriter { /** * Writes an HMM to an XML file with comprehensive error handling. - * + * * @param hmm The HMM to serialize * @param filename Path to the output XML file * @throws std::invalid_argument if filename is empty @@ -40,7 +40,7 @@ class XMLFileWriter { /** * Writes an HMM to an XML file with filesystem path. - * + * * @param hmm The HMM to serialize * @param filepath Path to the output XML file * @throws std::invalid_argument if filepath is empty @@ -50,7 +50,7 @@ class XMLFileWriter { /** * Validates that a file can be written to the given path. - * + * * @param filepath Path to validate * @return true if the file can be written, false otherwise */ @@ -59,7 +59,7 @@ class XMLFileWriter { private: /** * Internal implementation for writing HMM to stream. 
- * + * * @param hmm The HMM to serialize * @param stream Output stream * @throws std::runtime_error if serialization fails diff --git a/include/libhmm/linalg/basic_matrix.h b/include/libhmm/linalg/basic_matrix.h index 1ccacec..b675c2a 100644 --- a/include/libhmm/linalg/basic_matrix.h +++ b/include/libhmm/linalg/basic_matrix.h @@ -14,10 +14,10 @@ namespace libhmm { /** * Lightweight Matrix class designed to replace boost::numeric::ublas::matrix * with better performance and SIMD-friendly memory layout. - * + * * Features: * - Contiguous memory storage for optimal cache performance - * - Row-major ordering for better CPU cache utilization + * - Row-major ordering for better CPU cache utilization * - SIMD-aligned memory allocation * - Compatible API with existing uBLAS usage patterns * - Zero external dependencies (pure C++17) diff --git a/include/libhmm/linalg/basic_vector.h b/include/libhmm/linalg/basic_vector.h index 5cf6e6e..7bf96d4 100644 --- a/include/libhmm/linalg/basic_vector.h +++ b/include/libhmm/linalg/basic_vector.h @@ -15,7 +15,7 @@ namespace libhmm { /** * Lightweight Vector class designed to replace boost::numeric::ublas::vector * with better performance and SIMD-friendly operations. - * + * * Features: * - Based on std::vector for optimal standard library integration * - SIMD-friendly contiguous memory layout diff --git a/include/libhmm/performance/fb_recurrence_policy.h b/include/libhmm/performance/fb_recurrence_policy.h new file mode 100644 index 0000000..54dae96 --- /dev/null +++ b/include/libhmm/performance/fb_recurrence_policy.h @@ -0,0 +1,63 @@ +#pragma once + +/** + * @file fb_recurrence_policy.h + * @brief Minimal ISA-aware policy for Forward-Backward recurrence selection. 
+ *
+ * The two recurrence kernels are semantically equivalent in log-space:
+ * - Pairwise: repeated two-argument log-sum-exp
+ * - MaxReduce: max-then-reduce
+ *
+ * The only policy decision retained here is an ISA-family cutoff:
+ * - arm64: switch at N>=4
+ * - x86/x64: switch at N>=4
+ *
+ * Threshold calibrated by fb_crossover_sweep on Zen 4 / MSVC / AVX-512
+ * (Ryzen 7 7745HX, T=1000, median 8 runs):
+ * N=2: MaxReduce 2.1x slower (Pairwise wins)
+ * N=3: MaxReduce 1.1x slower (Pairwise wins)
+ * N=4: MaxReduce 1.7x faster -- crossover
+ * N=8: MaxReduce 5.0x faster
+ * N=32: MaxReduce 15x faster
+ * Previous x86 threshold was N>=5; N=4 was incorrectly left on the slower
+ * Pairwise path before the TranscendentalKernels SIMD backends landed.
+ */
+
+#include <cstddef>
+
+namespace libhmm {
+
+/// Selectable recurrence kernel for Forward-Backward.
+enum class FbRecurrenceMode {
+    Pairwise,
+    MaxReduce,
+};
+
+/**
+ * @brief Static recurrence-mode selection from ISA-family evidence.
+ *
+ * @param numStates Number of HMM states (`N`).
+ * @param sequenceLength Observation length (`T`). Currently unused except for
+ *        signature stability; reserved for future T-aware bins.
+ */
+constexpr FbRecurrenceMode selectFbRecurrenceMode(std::size_t numStates,
+                                                  std::size_t sequenceLength) noexcept {
+    (void)sequenceLength;
+    if (numStates < 2) {
+        return FbRecurrenceMode::Pairwise;
+    }
+    return (numStates >= 4) ? FbRecurrenceMode::MaxReduce : FbRecurrenceMode::Pairwise;
+}
+
+/// Human-readable name for a recurrence mode.
+constexpr const char *toString(FbRecurrenceMode mode) noexcept {
+    switch (mode) {
+    case FbRecurrenceMode::Pairwise:
+        return "pairwise";
+    case FbRecurrenceMode::MaxReduce:
+        return "max-reduce";
+    }
+    return "unknown";
+}
+
+} // namespace libhmm
diff --git a/include/libhmm/performance/simd_kernels_internal.h b/include/libhmm/performance/simd_kernels_internal.h
new file mode 100644
index 0000000..da840c8
--- /dev/null
+++ b/include/libhmm/performance/simd_kernels_internal.h
@@ -0,0 +1,407 @@
+#pragma once
+// include/libhmm/performance/simd_kernels_internal.h
+//
+// Internal header — NOT part of the public API.
+//
+// Single source of truth for vector exp/log helpers shared between
+// transcendental_kernels.cpp and Tier-2 distribution TUs
+// (log_normal_distribution.cpp, pareto_distribution.cpp).
+//
+// Include only from .cpp files compiled with LIBHMM_BEST_SIMD_FLAGS.
+
+#include "libhmm/platform/simd_platform.h"
+#include "libhmm/math/constants.h"
+
+#include <cstdint>
+#include <limits>
+
+namespace libhmm {
+namespace performance {
+namespace detail {
+namespace kernels {
+
+// ---------------------------------------------------------------------------
+// Shared constants
+// ---------------------------------------------------------------------------
+static constexpr double K_LN2_HI = 6.93147180369123816490e-1;
+static constexpr double K_LN2_LO = 1.90821492927058770002e-10;
+static constexpr double K_LOG2E = 1.44269504088896338700;
+static constexpr double K_SQRT2 = 1.41421356237309504880168872420969807;
+static constexpr double K_EXP_UNDERFLOW = constants::probability::MIN_LOG_PROBABILITY; // -700.0
+static constexpr double K_EXPONENT_BIAS = 1023.0;
+
+// log polynomial: 2y*(c0 + c1*y^2 + ... + c6*y^12), c_k = 1/(2k+1)
+static constexpr double K_LOG_C0 = 1.0;
+static constexpr double K_LOG_C1 = 3.3333333333333333e-1;
+static constexpr double K_LOG_C2 = 2.0000000000000000e-1;
+static constexpr double K_LOG_C3 = 1.4285714285714285e-1;
+static constexpr double K_LOG_C4 = 1.1111111111111111e-1;
+static constexpr double K_LOG_C5 = 9.0909090909090909e-2;
+static constexpr double K_LOG_C6 = 7.6923076923076923e-2;
+
+// exp polynomial: sum(r^k/k!), k=0..12
+static constexpr double K_EXP_C0 = 1.0;
+static constexpr double K_EXP_C1 = 1.0;
+static constexpr double K_EXP_C2 = 0.5;
+static constexpr double K_EXP_C3 = 1.6666666666666666e-1;
+static constexpr double K_EXP_C4 = 4.1666666666666664e-2;
+static constexpr double K_EXP_C5 = 8.3333333333333332e-3;
+static constexpr double K_EXP_C6 = 1.3888888888888889e-3;
+static constexpr double K_EXP_C7 = 1.9841269841269841e-4;
+static constexpr double K_EXP_C8 = 2.4801587301587302e-5;
+static constexpr double K_EXP_C9 = 2.7557319223985888e-6;
+static constexpr double K_EXP_C10 = 2.7557319223985888e-7;
+static constexpr double K_EXP_C11 = 2.5052108385441720e-8;
+static constexpr double K_EXP_C12 = 2.0876756987868099e-9;
+
+// ---------------------------------------------------------------------------
+// AVX-512 helpers
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_AVX512)
+
+[[nodiscard]] static inline __m512d k_log_pd_avx512(__m512d x) noexcept {
+    const __m512d neg_inf_v = _mm512_set1_pd(-std::numeric_limits<double>::infinity());
+    const __m512d sqrt2_v = _mm512_set1_pd(K_SQRT2);
+    const __m512d one_v = _mm512_set1_pd(1.0);
+    const __m512d half_v = _mm512_set1_pd(0.5);
+    const __m512d two_v = _mm512_set1_pd(2.0);
+    const __m512d ln2hi_v = _mm512_set1_pd(K_LN2_HI);
+    const __m512d ln2lo_v = _mm512_set1_pd(K_LN2_LO);
+
+    const __mmask8 invalid = _mm512_cmp_pd_mask(x, _mm512_setzero_pd(), _CMP_LE_OS);
+
+    __m512i bits = _mm512_castpd_si512(x);
+    __m512i e_biased = _mm512_srli_epi64(bits, 52);
+    const __m512i mant_mask = _mm512_set1_epi64(0x000FFFFFFFFFFFFFLL);
+    const __m512i exp_one = _mm512_set1_epi64(0x3FF0000000000000LL);
+    __m512i mbits = _mm512_or_si512(_mm512_and_si512(bits, mant_mask), exp_one);
+    __m512d m = _mm512_castsi512_pd(mbits);
+
+    // Convert int64 exponent to double via scalar (no AVX-512 DQ needed).
+    __m512i e_ub = _mm512_sub_epi64(e_biased, _mm512_set1_epi64(1023LL));
+    alignas(64) long long e_arr[8];
+    _mm512_storeu_si512(reinterpret_cast<__m512i *>(e_arr), e_ub);
+    __m512d e = _mm512_set_pd(static_cast<double>(e_arr[7]), static_cast<double>(e_arr[6]),
+                              static_cast<double>(e_arr[5]), static_cast<double>(e_arr[4]),
+                              static_cast<double>(e_arr[3]), static_cast<double>(e_arr[2]),
+                              static_cast<double>(e_arr[1]), static_cast<double>(e_arr[0]));
+
+    __mmask8 adj = _mm512_cmp_pd_mask(m, sqrt2_v, _CMP_GT_OS);
+    e = _mm512_mask_add_pd(e, adj, e, one_v);
+    m = _mm512_mask_mul_pd(m, adj, m, half_v);
+
+    __m512d y = _mm512_div_pd(_mm512_sub_pd(m, one_v), _mm512_add_pd(m, one_v));
+    __m512d y2 = _mm512_mul_pd(y, y);
+
+    __m512d p = _mm512_set1_pd(K_LOG_C6);
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C5));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C4));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C3));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C2));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C1));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C0));
+    __m512d log_m = _mm512_mul_pd(_mm512_mul_pd(two_v, y), p);
+
+    __m512d result = _mm512_fmadd_pd(e, ln2hi_v, _mm512_fmadd_pd(e, ln2lo_v, log_m));
+    result = _mm512_mask_blend_pd(invalid, result, neg_inf_v);
+    return result;
+}
+
+[[nodiscard]] static inline __m512d k_exp_pd_avx512(__m512d x) noexcept {
+    const __m512d uflow_v = _mm512_set1_pd(K_EXP_UNDERFLOW);
+    const __m512d log2e_v = _mm512_set1_pd(K_LOG2E);
+    const __m512d half_v = _mm512_set1_pd(0.5);
+    const __m512d ln2hi_v = _mm512_set1_pd(K_LN2_HI);
+    const __m512d ln2lo_v = _mm512_set1_pd(K_LN2_LO);
+    const __m512d zero_v = _mm512_setzero_pd();
+    const __mmask8 uflow = _mm512_cmp_pd_mask(x, uflow_v, _CMP_LE_OS);
+    x = _mm512_max_pd(x, uflow_v);
+    __m512d n = _mm512_floor_pd(_mm512_fmadd_pd(x, log2e_v, half_v));
+    __m512d r = _mm512_fnmadd_pd(n, ln2hi_v, x);
+    r = _mm512_fnmadd_pd(n, ln2lo_v, r);
+    __m512d p = _mm512_set1_pd(K_EXP_C12);
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C11));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C10));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C9));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C8));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C7));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C6));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C5));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C4));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C3));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C2));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C1));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C0));
+    __m256i ni = _mm512_cvtpd_epi32(n);
+    __m512i ni64 = _mm512_cvtepi32_epi64(ni);
+    ni64 = _mm512_add_epi64(ni64, _mm512_set1_epi64(static_cast<long long>(K_EXPONENT_BIAS)));
+    ni64 = _mm512_slli_epi64(ni64, 52);
+    __m512d result = _mm512_mul_pd(p, _mm512_castsi512_pd(ni64));
+    result = _mm512_mask_blend_pd(uflow, result, zero_v);
+    return result;
+}
+
+#endif // LIBHMM_HAS_AVX512
+
+// ---------------------------------------------------------------------------
+// AVX helpers (AVX-1 compatible)
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2)
+
+[[nodiscard]] static inline __m256d k_log_pd_avx(__m256d x) noexcept {
+    const double neg_inf = -std::numeric_limits<double>::infinity();
+    const __m256d neg_inf_v = _mm256_set1_pd(neg_inf);
+    const __m256d sqrt2_v = _mm256_set1_pd(K_SQRT2);
+    const __m256d one_v = _mm256_set1_pd(1.0);
+    const __m256d half_v = _mm256_set1_pd(0.5);
+    const __m256d two_v = _mm256_set1_pd(2.0);
+    const __m256d ln2hi_v = _mm256_set1_pd(K_LN2_HI);
+    const __m256d ln2lo_v = _mm256_set1_pd(K_LN2_LO);
+    const __m256d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LE_OS);
+
+    auto extract_em = [](__m128d xh, __m128d &mh, __m128d &eh) {
+        __m128i bits = _mm_castpd_si128(xh);
+        __m128i eb = _mm_srli_epi64(bits, 52);
+        __m128i mm = _mm_set1_epi64x(0x000FFFFFFFFFFFFFLL);
+        __m128i eo = _mm_set1_epi64x(0x3FF0000000000000LL);
+        mh = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(bits, mm), eo));
+        __m128i eu = _mm_sub_epi64(eb, _mm_set1_epi64x(1023LL));
+        long long e0, e1;
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&e0), eu);
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&e1), _mm_unpackhi_epi64(eu, eu));
+        eh = _mm_set_pd(static_cast<double>(e1), static_cast<double>(e0));
+    };
+
+    __m128d m_lo, e_lo, m_hi, e_hi;
+    extract_em(_mm256_castpd256_pd128(x), m_lo, e_lo);
+    extract_em(_mm256_extractf128_pd(x, 1), m_hi, e_hi);
+    __m256d m = _mm256_set_m128d(m_hi, m_lo);
+    __m256d e = _mm256_set_m128d(e_hi, e_lo);
+
+    __m256d adj = _mm256_cmp_pd(m, sqrt2_v, _CMP_GT_OS);
+    e = _mm256_add_pd(e, _mm256_and_pd(adj, one_v));
+    m = _mm256_blendv_pd(m, _mm256_mul_pd(m, half_v), adj);
+
+    __m256d y = _mm256_div_pd(_mm256_sub_pd(m, one_v), _mm256_add_pd(m, one_v));
+    __m256d y2 = _mm256_mul_pd(y, y);
+
+#define K_FMA256(a_, b_, c_) _mm256_add_pd(_mm256_mul_pd((a_), (b_)), (c_))
+    __m256d p = _mm256_set1_pd(K_LOG_C6);
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C5));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C4));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C3));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C2));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C1));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C0));
+    __m256d log_m = _mm256_mul_pd(_mm256_mul_pd(two_v, y), p);
+    __m256d result =
+        _mm256_add_pd(_mm256_mul_pd(e, ln2hi_v), _mm256_add_pd(_mm256_mul_pd(e, ln2lo_v), log_m));
+#undef K_FMA256
+    result = _mm256_blendv_pd(result, neg_inf_v, invalid_mask);
+    return result;
+}
+
+[[nodiscard]] static inline __m256d k_exp_pd_avx(__m256d x) noexcept {
+    const __m256d uflow_v = _mm256_set1_pd(K_EXP_UNDERFLOW);
+    const __m256d log2e_v = _mm256_set1_pd(K_LOG2E);
+    const __m256d half_v = _mm256_set1_pd(0.5);
+    const __m256d ln2hi_v = _mm256_set1_pd(K_LN2_HI);
+    const __m256d ln2lo_v = _mm256_set1_pd(K_LN2_LO);
+    const __m256d zero_v = _mm256_setzero_pd();
+    const __m256d ufl_mask = _mm256_cmp_pd(x, uflow_v, _CMP_LE_OS);
+    x = _mm256_max_pd(x, uflow_v);
+    __m256d n = _mm256_floor_pd(_mm256_add_pd(_mm256_mul_pd(x, log2e_v), half_v));
+    __m256d r = _mm256_sub_pd(x, _mm256_mul_pd(n, ln2hi_v));
+    r = _mm256_sub_pd(r, _mm256_mul_pd(n, ln2lo_v));
+
+#define K_MA256(a_, b_, c_) _mm256_add_pd(_mm256_mul_pd((a_), (b_)), (c_))
+    __m256d p = _mm256_set1_pd(K_EXP_C12);
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C11));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C10));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C9));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C8));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C7));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C6));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C5));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C4));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C3));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C2));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C1));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C0));
+#undef K_MA256
+
+    __m128d n_lo = _mm256_castpd256_pd128(n), n_hi = _mm256_extractf128_pd(n, 1);
+    auto bp2 = [](__m128d nd) {
+        __m128i ni32 =
+            _mm_add_epi32(_mm_cvttpd_epi32(nd), _mm_set1_epi32(static_cast<int>(K_EXPONENT_BIAS)));
+        __m128i i64 = _mm_slli_epi64(_mm_unpacklo_epi32(ni32, _mm_setzero_si128()), 52);
+        return _mm_castsi128_pd(i64);
+    };
+    __m256d result = _mm256_mul_pd(p, _mm256_set_m128d(bp2(n_hi), bp2(n_lo)));
+    result = _mm256_blendv_pd(result, zero_v, ufl_mask);
+    return result;
+}
+
+#endif // LIBHMM_HAS_AVX || LIBHMM_HAS_AVX2
+
+// ---------------------------------------------------------------------------
+// SSE2 helpers
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_SSE2)
+
+[[nodiscard]] static inline __m128d k_log_pd_sse2(__m128d x) noexcept {
+    const double neg_inf = -std::numeric_limits<double>::infinity();
+    const __m128d neg_inf_v = _mm_set1_pd(neg_inf);
+    const __m128d sqrt2_v = _mm_set1_pd(K_SQRT2);
+    const __m128d one_v = _mm_set1_pd(1.0);
+    const __m128d half_v = _mm_set1_pd(0.5);
+    const __m128d two_v = _mm_set1_pd(2.0);
+    const __m128d ln2hi_v = _mm_set1_pd(K_LN2_HI);
+    const __m128d ln2lo_v = _mm_set1_pd(K_LN2_LO);
+    const __m128d invalid = _mm_cmple_pd(x, _mm_setzero_pd());
+    __m128i bits = _mm_castpd_si128(x);
+    __m128i eb = _mm_srli_epi64(bits, 52);
+    __m128i mbits = _mm_or_si128(_mm_and_si128(bits, _mm_set1_epi64x(0x000FFFFFFFFFFFFFLL)),
+                                 _mm_set1_epi64x(0x3FF0000000000000LL));
+    __m128d m = _mm_castsi128_pd(mbits);
+    __m128i eu = _mm_sub_epi64(eb, _mm_set1_epi64x(1023LL));
+    long long e0, e1;
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&e0), eu);
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&e1), _mm_unpackhi_epi64(eu, eu));
+    __m128d e = _mm_set_pd(static_cast<double>(e1), static_cast<double>(e0));
+    __m128d adj = _mm_cmpgt_pd(m, sqrt2_v);
+    e = _mm_add_pd(e, _mm_and_pd(adj, one_v));
+    m = _mm_or_pd(_mm_andnot_pd(adj, m), _mm_and_pd(adj, _mm_mul_pd(m, half_v)));
+    __m128d y = _mm_div_pd(_mm_sub_pd(m, one_v), _mm_add_pd(m, one_v));
+    __m128d y2 = _mm_mul_pd(y, y);
+#define K_FMA128(a_, b_, c_) _mm_add_pd(_mm_mul_pd((a_), (b_)), (c_))
+    __m128d p = _mm_set1_pd(K_LOG_C6);
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C5));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C4));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C3));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C2));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C1));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C0));
+    __m128d log_m = _mm_mul_pd(_mm_mul_pd(two_v, y), p);
+    __m128d result = _mm_add_pd(_mm_mul_pd(e, ln2hi_v), _mm_add_pd(_mm_mul_pd(e, ln2lo_v), log_m));
+#undef K_FMA128
+    result = _mm_or_pd(_mm_andnot_pd(invalid, result), _mm_and_pd(invalid, neg_inf_v));
+    return result;
+}
+
+[[nodiscard]] static inline __m128d k_exp_pd_sse2(__m128d x) noexcept {
+    const __m128d uflow_v = _mm_set1_pd(K_EXP_UNDERFLOW);
+    const __m128d log2e_v = _mm_set1_pd(K_LOG2E);
+    const __m128d half_v = _mm_set1_pd(0.5);
+    const __m128d ln2hi_v = _mm_set1_pd(K_LN2_HI);
+    const __m128d ln2lo_v = _mm_set1_pd(K_LN2_LO);
+    const __m128d zero_v = _mm_setzero_pd();
+    const __m128d ufl = _mm_cmple_pd(x, uflow_v);
+    x = _mm_max_pd(x, uflow_v);
+    __m128d t = _mm_add_pd(_mm_mul_pd(x, log2e_v), half_v);
+    __m128i ni = _mm_cvttpd_epi32(t);
+    __m128d n = _mm_cvtepi32_pd(ni);
+    n = _mm_sub_pd(n, _mm_and_pd(_mm_cmpgt_pd(n, t), _mm_set1_pd(1.0)));
+    __m128d r = _mm_sub_pd(x, _mm_mul_pd(n, ln2hi_v));
+    r = _mm_sub_pd(r, _mm_mul_pd(n, ln2lo_v));
+#define K_MA128(a_, b_, c_) _mm_add_pd(_mm_mul_pd((a_), (b_)), (c_))
+    __m128d p = _mm_set1_pd(K_EXP_C12);
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C11));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C10));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C9));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C8));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C7));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C6));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C5));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C4));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C3));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C2));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C1));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C0));
+#undef K_MA128
+    __m128i ni32b =
+        _mm_add_epi32(_mm_cvttpd_epi32(n), _mm_set1_epi32(static_cast<int>(K_EXPONENT_BIAS)));
+    __m128i i64 = _mm_slli_epi64(_mm_unpacklo_epi32(ni32b, _mm_setzero_si128()), 52);
+    __m128d result = _mm_mul_pd(p, _mm_castsi128_pd(i64));
+    result = _mm_or_pd(_mm_andnot_pd(ufl, result), _mm_and_pd(ufl, zero_v));
+    return result;
+}
+
+#endif // LIBHMM_HAS_SSE2
+
+// ---------------------------------------------------------------------------
+// NEON helpers
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_NEON)
+
+[[nodiscard]] static inline float64x2_t k_log_pd_neon(float64x2_t x) noexcept {
+    const float64x2_t neg_inf_v = vdupq_n_f64(-std::numeric_limits<double>::infinity());
+    const float64x2_t sqrt2_v = vdupq_n_f64(K_SQRT2);
+    const float64x2_t one_v = vdupq_n_f64(1.0);
+    const float64x2_t half_v = vdupq_n_f64(0.5);
+    const float64x2_t two_v = vdupq_n_f64(2.0);
+    const float64x2_t ln2hi_v = vdupq_n_f64(K_LN2_HI);
+    const float64x2_t ln2lo_v = vdupq_n_f64(K_LN2_LO);
+    const uint64x2_t invalid = vcleq_f64(x, vdupq_n_f64(0.0));
+    uint64x2_t bits = vreinterpretq_u64_f64(x);
+    uint64x2_t eb = vshrq_n_u64(bits, 52);
+    uint64x2_t mbits = vorrq_u64(vandq_u64(bits, vdupq_n_u64(0x000FFFFFFFFFFFFFULL)),
+                                 vdupq_n_u64(0x3FF0000000000000ULL));
+    float64x2_t m = vreinterpretq_f64_u64(mbits);
+    float64x2_t e = vcvtq_f64_s64(vsubq_s64(vreinterpretq_s64_u64(eb), vdupq_n_s64(1023LL)));
+    uint64x2_t adj = vcgtq_f64(m, sqrt2_v);
+    e = vbslq_f64(adj, vaddq_f64(e, one_v), e);
+    m = vbslq_f64(adj, vmulq_f64(m, half_v), m);
+    float64x2_t y = vdivq_f64(vsubq_f64(m, one_v), vaddq_f64(m, one_v));
+    float64x2_t y2 = vmulq_f64(y, y);
+    float64x2_t p = vdupq_n_f64(K_LOG_C6);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C5), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C4), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C3), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C2), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C1), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C0), p, y2);
+    float64x2_t log_m = vmulq_f64(vmulq_f64(two_v, y), p);
+    float64x2_t result = vfmaq_f64(vfmaq_f64(log_m, e, ln2lo_v), e, ln2hi_v);
+    result = vbslq_f64(invalid, neg_inf_v, result);
+    return result;
+}
+
+[[nodiscard]] static inline float64x2_t k_exp_pd_neon(float64x2_t x) noexcept {
+    const float64x2_t uflow_v = vdupq_n_f64(K_EXP_UNDERFLOW);
+    const float64x2_t log2e_v = vdupq_n_f64(K_LOG2E);
+    const float64x2_t half_v = vdupq_n_f64(0.5);
+    const float64x2_t ln2hi_v = vdupq_n_f64(K_LN2_HI);
+    const float64x2_t ln2lo_v = vdupq_n_f64(K_LN2_LO);
+    const float64x2_t zero_v = vdupq_n_f64(0.0);
+    const uint64x2_t valid = vcgtq_f64(x, uflow_v);
+    x = vmaxq_f64(x, uflow_v);
+    float64x2_t n = vrndmq_f64(vfmaq_f64(half_v, x, log2e_v));
+    float64x2_t r = vfmsq_f64(x, n, ln2hi_v);
+    r = vfmsq_f64(r, n, ln2lo_v);
+    float64x2_t p = vdupq_n_f64(K_EXP_C12);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C11), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C10), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C9), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C8), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C7), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C6), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C5), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C4), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C3), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C2), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C1), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C0), p, r);
+    int64x2_t ni64 =
+        vaddq_s64(vcvtq_s64_f64(n), vdupq_n_s64(static_cast<int64_t>(K_EXPONENT_BIAS)));
+    float64x2_t result = vmulq_f64(p, vreinterpretq_f64_s64(vshlq_n_s64(ni64, 52)));
+    result = vbslq_f64(valid, result, zero_v);
+    return result;
+}
+
+#endif // LIBHMM_HAS_NEON
+
+} // namespace kernels
+} // namespace detail
+} // namespace performance
+} // namespace libhmm
diff --git a/include/libhmm/performance/transcendental_kernels.h b/include/libhmm/performance/transcendental_kernels.h
new file mode 100644
index 0000000..8cce072
--- /dev/null
+++ b/include/libhmm/performance/transcendental_kernels.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <cstddef>
+
+/**
+ * @file transcendental_kernels.h
+ * @brief SIMD-accelerated inner-loop kernels for FB max-reduce and BW xi accumulation.
+ *
+ * Declares five static methods on TranscendentalKernels.
Implementations live in + * src/performance/transcendental_kernels.cpp and are compiled with + * LIBHMM_BEST_SIMD_FLAGS, activating the appropriate #if LIBHMM_HAS_* cascade: + * AVX-512 8-wide __m512d + * AVX/AVX2 4-wide __m256d (AVX-1 compatible; AVX2 compiler fuses FMA) + * SSE2 2-wide __m128d + * NEON 2-wide float64x2_t + * scalar tail / fallback + * + * Active ISA diagnostics use libhmm::performance::simd::feature_string() and + * double_vector_width() from simd_platform.h — consistent with the rest of the library. + */ + +namespace libhmm { +namespace performance { +namespace detail { + +/** + * @brief Vectorised inner-loop kernels shared by ForwardBackwardCalculator (max-reduce + * recurrence) and BaumWelchTrainer (dense-xi accumulation). + * + * All methods are noexcept and operate on raw double pointers. Inputs are + * expected to be either finite log-probabilities or LOG_ZERO (-inf); +inf and + * NaN are not produced by any production caller and are not guarded. + */ +class TranscendentalKernels { +public: + /// Element-wise max of (a[i]+b[i]) over [0, size). No exp calls. + [[nodiscard]] static double reduce_max_sum2(const double *a, const double *b, + std::size_t size) noexcept; + + /// Sum of exp(a[i]+b[i] - maxVal) for finite terms, over [0, size). + /// Returns 0 when maxVal is not finite. + [[nodiscard]] static double sum_exp_sum2_minus_max(const double *a, const double *b, + std::size_t size, double maxVal) noexcept; + + /// Element-wise max of (a[i]+b[i]+c[i]) over [0, size). No exp calls. + [[nodiscard]] static double reduce_max_sum3(const double *a, const double *b, const double *c, + std::size_t size) noexcept; + + /// Sum of exp(a[i]+b[i]+c[i] - maxVal) for finite terms, over [0, size). + /// Returns 0 when maxVal is not finite. + [[nodiscard]] static double sum_exp_sum3_minus_max(const double *a, const double *b, + const double *c, std::size_t size, + double maxVal) noexcept; + + /// dst[i] += exp(a[i] + b[i] + bias) for i in [0, size). 
+ static void accumulate_exp_sum2_bias(double *dst, const double *a, const double *b, + std::size_t size, double bias) noexcept; +}; + +} // namespace detail +} // namespace performance +} // namespace libhmm diff --git a/include/libhmm/platform/simd_platform.h b/include/libhmm/platform/simd_platform.h index 6bdcb62..6194c1a 100644 --- a/include/libhmm/platform/simd_platform.h +++ b/include/libhmm/platform/simd_platform.h @@ -26,11 +26,10 @@ * - SINGLE RESPONSIBILITY: This header only handles SIMD platform concerns * - EXTENSIBILITY: Easy to add new SIMD instruction sets or platforms * - * FILES THAT INCLUDE THIS HEADER: - * - src/distributions/gaussian_distribution.cpp (tier-2 SIMD intrinsics) - * - src/distributions/exponential_distribution.cpp (tier-2 SIMD intrinsics) - * - tools/simd_inspection.cpp (ISA capability report + smoke tests) - * - include/libhmm/performance/transcendental_kernels.h (perf branch) + * Included by Tier-2 distribution TUs, performance kernel TUs + * (transcendental_kernels.cpp, forward_backward_calculator.cpp, + * baum_welch_trainer.cpp), and diagnostic tools (simd_inspection.cpp). + * Also included transitively via simd_kernels_internal.h. * * Features: * - Cross-platform SIMD intrinsics inclusion diff --git a/performance/PERFORMANCE_ARCHITECTURE.md b/performance/PERFORMANCE_ARCHITECTURE.md index 7caf353..085d597 100644 --- a/performance/PERFORMANCE_ARCHITECTURE.md +++ b/performance/PERFORMANCE_ARCHITECTURE.md @@ -8,12 +8,12 @@ void getBatchLogProbabilities(std::span observations, ``` The canonical calculators (`ForwardBackwardCalculator`, `ViterbiCalculator`) call this once per state per `compute()`, producing T contiguous log-emission values that the recurrences then consume from a flat row-major buffer. Two tiers of implementation: -- **Tier 2 — explicit intrinsics.** `GaussianDistribution` and `ExponentialDistribution` ship hand-written `detail::` free functions with an AVX-512 → AVX/AVX2 → SSE2 → NEON → scalar dispatch chain. 
See `src/distributions/gaussian_distribution.cpp` `detail::gaussian_logpdf_batch` for the canonical shape. The free-function pattern is deliberately extractable to a separate TU for future runtime dispatch without API changes. -- **Tier 1 — auto-vectorization-friendly loops.** The other 13 distributions implement `getBatchLogProbabilities` as concrete non-virtual loops over plain arrays, compiled with `LIBHMM_BEST_SIMD_FLAGS` (the highest CPU-verified ISA on the build machine). Whether the compiler actually emits vector instructions depends on the loop body — transcendentals like `std::exp` are not auto-vectorized by MSVC even with `/arch:AVX2`, so tier 1 is best read as "well-shaped scalar code" rather than "guaranteed SIMD." +- **Tier 2 — explicit intrinsics.** `GaussianDistribution`, `ExponentialDistribution`, `LogNormalDistribution`, and `ParetoDistribution` ship hand-written `detail::` free functions with an AVX-512 → AVX/AVX2 → SSE2 → NEON → scalar dispatch chain. See `src/distributions/gaussian_distribution.cpp` `detail::gaussian_logpdf_batch` for the canonical shape. The free-function pattern is deliberately extractable to a separate TU for future runtime dispatch without API changes. Tier-2 log-probability kernels share vector log/exp helpers from `include/libhmm/performance/simd_kernels_internal.h`. +- **Tier 1 — auto-vectorization-friendly loops.** The other 11 distributions implement `getBatchLogProbabilities` as concrete non-virtual loops over plain arrays, compiled with `LIBHMM_BEST_SIMD_FLAGS` (the highest CPU-verified ISA on the build machine). Whether the compiler actually emits vector instructions depends on the loop body — transcendentals like `std::exp` are not auto-vectorized by MSVC even with `/arch:AVX2`, so tier 1 is best read as "well-shaped scalar code" rather than "guaranteed SIMD." All 15 distribution TUs are listed in `LIBHMM_SIMD_SOURCES` in the top-level `CMakeLists.txt` and receive the SIMD compile flags. 
## Where SIMD does and doesn't live today - ✅ **Distribution batch emission evaluation** — `getBatchLogProbabilities`. Effective for emission-bound workloads (continuous distributions, large T). Tier 2 in particular delivers measurable speedups; tier 1 depends on compiler heuristics. -- ⚠️ **Recurrence kernels** — FB max-reduce, BW xi accumulation, Viterbi inner loop. These are state×state inner loops dominated by `exp` / `log1p` calls. Currently scalar. The active perf-branch work introduces an internal `TranscendentalKernels` abstraction in `include/libhmm/performance/transcendental_kernels.h` with scalar today and AVX2/NEON backends planned, so future explicit vector-math implementations can plug in without rewriting the call sites. +- ✅ **Recurrence kernels** — FB max-reduce and BW xi accumulation. Five kernels in `src/performance/transcendental_kernels.cpp` with an AVX-512 → AVX/AVX2 → SSE2 → NEON → scalar cascade, consumed by `ForwardBackwardCalculator` and `BaumWelchTrainer`. The `TranscendentalKernels` class in `include/libhmm/performance/transcendental_kernels.h` exposes the public interface; call sites in the two consumer TUs are unchanged. Viterbi inner loop remains scalar. - The runtime `Matrix`/`Vector` typedefs in `common/common.h` resolve to `BasicMatrix`/`BasicVector`. The library no longer ships separate "optimized" container variants (see Historical context). ## Threading: not currently used Production calculators and trainers run single-threaded on every workload. Specifically: @@ -29,6 +29,6 @@ The build system picks the highest CPU-verified ISA per machine and applies it a - **GCC/Clang on all platforms**: `-march=native`. Selects NEON on AArch64, the highest available x86 ISA on Intel/AMD. - **MSVC on x86_64**: probes `/arch:AVX512`, `/arch:AVX2`, `/arch:AVX` via `check_cxx_source_runs` and selects the highest one the build machine can actually execute (not just the highest the compiler accepts). 
Falls back to SSE2 baseline in cross-compilation. - **AArch64**: NEON is the mandatory ISA baseline; no flag needed. -See the `# SIMD DETECTION` block in `CMakeLists.txt` for details. The non-distribution sources (`src/common/`, `src/calculators/`, `src/training/`, `src/io/`, `src/performance/`) compile at the platform baseline ISA so that explicit intrinsics in the distribution TUs are the only place SIMD codegen is committed to. +See the `# SIMD DETECTION` block in `CMakeLists.txt` for details. Most non-distribution sources (`src/common/`, `src/io/`) compile at the platform baseline ISA. The exceptions are the three performance-critical TUs that contain explicit intrinsics: `src/performance/transcendental_kernels.cpp`, `src/calculators/forward_backward_calculator.cpp`, and `src/training/baum_welch_trainer.cpp` — these are listed in `LIBHMM_SIMD_SOURCES` alongside the distribution TUs and receive the full `LIBHMM_BEST_SIMD_FLAGS`. ## Historical context An earlier draft of this document described a four-level hierarchy in which calculators consumed `OptimizedMatrix`/`OptimizedVector` containers and a `WorkStealingPool` provided per-state parallelism. That plan was superseded by the v3.0.0-alpha (Phase 4) refactor (see `CHANGELOG.md`), which removed the per-calculator SIMD variants (`ScaledSIMD*`, `LogSIMD*`, `AdvancedLog*`) in favor of the per-distribution batch interface documented above. The Optimized\* containers, `WorkStealingPool`, the per-library `Benchmark` framework, and the parallel-execution constants/utilities they depended on were retained for several releases as "future hooks" but never wired into the canonical calculator/trainer pipeline; they were removed in a subsequent dead-code cleanup. The SIMD investment in `getBatchLogProbabilities` is the canonical and current strategy. 
diff --git a/scripts/configure_catalina.sh b/scripts/configure_catalina.sh index ee8a257..6bb3fc7 100755 --- a/scripts/configure_catalina.sh +++ b/scripts/configure_catalina.sh @@ -42,6 +42,7 @@ env -u CC \ -DCMAKE_C_COMPILER="${CC_BIN}" \ -DCMAKE_CXX_COMPILER="${CXX_BIN}" \ -DCMAKE_OSX_SYSROOT="${SYSROOT}" \ + -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 \ "$@" diff --git a/scripts/phase_gate.ps1 b/scripts/phase_gate.ps1 new file mode 100644 index 0000000..692aa41 --- /dev/null +++ b/scripts/phase_gate.ps1 @@ -0,0 +1,125 @@ +#Requires -Version 7.0 +<# +.SYNOPSIS + Phase gate: run the required correctness suite before each phase PR. + +.DESCRIPTION + Builds and runs the seven gate tests listed in the plan (Phase D3). + Exits with code 0 on all-pass, 1 on any failure or build error. + +.PARAMETER BuildDir + Path to the CMake binary directory. Defaults to /build. + +.PARAMETER Config + CMake build configuration (Release, Debug, ...). Defaults to Release. + +.PARAMETER Rebuild + If set, rebuild all gate targets before running them. +#> +param( + [string] $BuildDir = "", + [string] $Config = "Release", + [switch] $Rebuild +) + +Set-StrictMode -Version Latest + +$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path +$repoRoot = Split-Path -Parent $scriptDir + +if (-not $BuildDir) { + $BuildDir = Join-Path $repoRoot "build" +} + +if (-not (Test-Path $BuildDir)) { + Write-Error "Build directory not found: $BuildDir" + Write-Error "Run cmake -S . -B build first." + exit 1 +} + +# Gate tests (plan Phase D, acceptance criteria). +$gateTargets = @( + "test_canonical_calculators", + "test_calculator_continuous", + "test_calculator_edge_cases", + "test_canonical_training", + "test_baum_welch_convergence", + "test_fb_mode_parity", + "test_bw_parity" +) + +# ── Optional rebuild ────────────────────────────────────────────────────────── +if ($Rebuild) { + Write-Host "Building gate targets ($Config)..." 
-ForegroundColor Cyan + $buildArgs = @( + "--build", $BuildDir, + "--config", $Config, + "--target" + ) + $gateTargets + cmake @buildArgs + if ($LASTEXITCODE -ne 0) { + Write-Host "" + Write-Host "PHASE GATE FAILED: build error." -ForegroundColor Red + exit 1 + } +} + +# ── Locate executables ──────────────────────────────────────────────────────── +# Multi-config generators (VS, Xcode) put binaries in /tests//. +# Single-config generators (Makefiles, Ninja) put them in /tests/. +$testDir = Join-Path $BuildDir "tests" +$candidates = @( + (Join-Path $testDir $Config), + $testDir +) + +function Find-Exe { + param([string]$name) + foreach ($dir in $candidates) { + $exePath = Join-Path $dir "$name.exe" + if (Test-Path $exePath) { return $exePath } + $exePath = Join-Path $dir $name + if (Test-Path $exePath) { return $exePath } + } + return $null +} + +# ── Run each gate test ──────────────────────────────────────────────────────── +$results = [ordered]@{} +$anyFail = $false + +Write-Host "" +Write-Host "Phase gate — $Config — $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" -ForegroundColor Cyan +Write-Host ("-" * 60) + +foreach ($target in $gateTargets) { + $exe = Find-Exe $target + if (-not $exe) { + Write-Host " SKIP $target (executable not found; run with -Rebuild)" -ForegroundColor Yellow + $results[$target] = "SKIP" + $anyFail = $true + continue + } + + & $exe --gtest_color=no 2>&1 | Out-Null + if ($LASTEXITCODE -eq 0) { + Write-Host " PASS $target" -ForegroundColor Green + $results[$target] = "PASS" + } else { + Write-Host " FAIL $target" -ForegroundColor Red + $results[$target] = "FAIL" + $anyFail = $true + # Re-run with output so the failure is visible. 
+ & $exe --gtest_color=no + } +} + +Write-Host ("-" * 60) + +if ($anyFail) { + Write-Host "PHASE GATE FAILED" -ForegroundColor Red + exit 1 +} else { + Write-Host "PHASE GATE PASSED ($($gateTargets.Count)/$($gateTargets.Count))" -ForegroundColor Green + exit 0 +} diff --git a/src/calculators/forward_backward_calculator.cpp b/src/calculators/forward_backward_calculator.cpp index 1097acc..b0ab429 100755 --- a/src/calculators/forward_backward_calculator.cpp +++ b/src/calculators/forward_backward_calculator.cpp @@ -1,15 +1,35 @@ #include "libhmm/calculators/forward_backward_calculator.h" #include "libhmm/hmm.h" -#include +#include "libhmm/performance/transcendental_kernels.h" #include #include -#include #include +#include namespace libhmm { namespace { constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +} // namespace + +FbRecurrenceMode +ForwardBackwardCalculator::resolveRecurrenceMode(const std::size_t numStates, + const std::size_t sequenceLength) const noexcept { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + // Compile-time forcer: highest priority. Preserves benchmark-build contract. + (void)numStates; + (void)sequenceLength; + return FbRecurrenceMode::MaxReduce; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + // Legacy adaptive forcer: simple N>2 cutoff. Preserves benchmark-build contract. + (void)sequenceLength; + return (numStates > 2) ? FbRecurrenceMode::MaxReduce : FbRecurrenceMode::Pairwise; +#else + if (modeOverride_.has_value()) { + return *modeOverride_; + } + return selectFbRecurrenceMode(numStates, sequenceLength); +#endif } // --------------------------------------------------------------------------- @@ -46,28 +66,36 @@ void ForwardBackwardCalculator::compute() { return; } - // Allocate/resize result matrices + // Allocate/resize result matrices. 
logAlpha_.resize(T, numStates_); logBeta_.resize(T, numStates_); - // Pre-fill the log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) - // Build observation span once; reuse across all N states. + // Build state-major log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t). + // Then derive shared time-major layout: logEmitByTime_[t * N + i] = log b_i(O_t). logEmitBuf_.resize(T * numStates_); - std::vector obsVec(T); - for (std::size_t t = 0; t < T; ++t) - obsVec[t] = observations_(t); - const std::span obsSpan(obsVec.data(), T); + logEmitByTime_.resize(T * numStates_); + const std::span obsSpan(observations_.data(), T); const Hmm &hmm = getHmmRef(); for (std::size_t i = 0; i < numStates_; ++i) { hmm.getDistribution(i).getBatchLogProbabilities( obsSpan, std::span(logEmitBuf_.data() + i * T, T)); } + for (std::size_t i = 0; i < numStates_; ++i) { + const double *stateRow = logEmitBuf_.data() + i * T; + for (std::size_t t = 0; t < T; ++t) { + logEmitByTime_[t * numStates_ + i] = stateRow[t]; + } + } + + // Resolve recurrence mode per the compile-time forcer / instance override / + // static policy pipeline. + currentMode_ = resolveRecurrenceMode(numStates_, T); computeLogForward(); computeLogBackward(); - // log P(O|λ) = log-sum-exp over states at final timestep + // log P(O|lambda) = log-sum-exp over states at final timestep. double lp = LOG_ZERO; for (std::size_t i = 0; i < numStates_; ++i) { lp = logSumExp(lp, logAlpha_(T - 1, i)); @@ -83,70 +111,192 @@ void ForwardBackwardCalculator::precomputeLogTransitions() { const Hmm &hmm = getHmmRef(); const Matrix &trans = hmm.getTrans(); logTrans_.resize(numStates_, numStates_); + logTransT_.resize(numStates_, numStates_); for (std::size_t i = 0; i < numStates_; ++i) { for (std::size_t j = 0; j < numStates_; ++j) { const double a = trans(i, j); - logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + const double logA = (a > 0.0) ? 
std::log(a) : LOG_ZERO; + logTrans_(i, j) = logA; + logTransT_(j, i) = logA; } } } void ForwardBackwardCalculator::computeLogForward() { + if (currentMode_ == FbRecurrenceMode::MaxReduce) { + computeLogForwardMaxReduce(); + return; + } + computeLogForwardPairwise(); +} + +void ForwardBackwardCalculator::computeLogForwardPairwise() { const Hmm &hmm = getHmmRef(); const Vector &pi = hmm.getPi(); const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransTData = logTransT_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *alphaData = logAlpha_.data(); - // t = 0: log alpha(0, i) = log pi_i + log b_i(O_0) - for (std::size_t i = 0; i < numStates_; ++i) { + // t = 0. + const double *emitRow0 = emitByTimeData; + for (std::size_t i = 0; i < N; ++i) { const double logPi = (pi(i) > 0.0) ? std::log(pi(i)) : LOG_ZERO; - logAlpha_(0, i) = logPi + logEmitBuf_[i * T + 0]; + alphaData[i] = logPi + emitRow0[i]; } - // t > 0 + // t > 0. 
for (std::size_t t = 1; t < T; ++t) { - for (std::size_t j = 0; j < numStates_; ++j) { + const double *prevAlphaRow = alphaData + (t - 1) * N; + double *alphaRow = alphaData + t * N; + const double *emitRow = emitByTimeData + t * N; + for (std::size_t j = 0; j < N; ++j) { + const double *transCol = logTransTData + j * N; double logSum = LOG_ZERO; - for (std::size_t i = 0; i < numStates_; ++i) { - logSum = logSumExp(logSum, logAlpha_(t - 1, i) + logTrans_(i, j)); + for (std::size_t i = 0; i < N; ++i) { + logSum = logSumExp(logSum, prevAlphaRow[i] + transCol[i]); } - logAlpha_(t, j) = logEmitBuf_[j * T + t] + logSum; + alphaRow[j] = emitRow[j] + logSum; + } + } +} + +void ForwardBackwardCalculator::computeLogForwardMaxReduce() { + const Hmm &hmm = getHmmRef(); + const Vector &pi = hmm.getPi(); + const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransTData = logTransT_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *alphaData = logAlpha_.data(); + + // t = 0. + const double *emitRow0 = emitByTimeData; + for (std::size_t i = 0; i < N; ++i) { + const double logPi = (pi(i) > 0.0) ? std::log(pi(i)) : LOG_ZERO; + alphaData[i] = logPi + emitRow0[i]; + } + + // t > 0. 
+ for (std::size_t t = 1; t < T; ++t) { + const double *prevAlphaRow = alphaData + (t - 1) * N; + double *alphaRow = alphaData + t * N; + const double *emitRow = emitByTimeData + t * N; + for (std::size_t j = 0; j < N; ++j) { + const double *transCol = logTransTData + j * N; + const double maxTerm = performance::detail::TranscendentalKernels::reduce_max_sum2( + prevAlphaRow, transCol, N); + + double logSum = LOG_ZERO; + if (std::isfinite(maxTerm)) { + const double scaledSum = + performance::detail::TranscendentalKernels::sum_exp_sum2_minus_max( + prevAlphaRow, transCol, N, maxTerm); + if (scaledSum > 0.0) { + logSum = maxTerm + std::log(scaledSum); + } + } + alphaRow[j] = emitRow[j] + logSum; } } } void ForwardBackwardCalculator::computeLogBackward() { + if (currentMode_ == FbRecurrenceMode::MaxReduce) { + computeLogBackwardMaxReduce(); + return; + } + computeLogBackwardPairwise(); +} + +void ForwardBackwardCalculator::computeLogBackwardPairwise() { const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransData = logTrans_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *betaData = logBeta_.data(); - // t = T-1: log beta(T-1, i) = log(1) = 0 - for (std::size_t i = 0; i < numStates_; ++i) { - logBeta_(T - 1, i) = 0.0; + // t = T - 1. + double *finalBetaRow = betaData + (T - 1) * N; + for (std::size_t i = 0; i < N; ++i) { + finalBetaRow[i] = 0.0; } - // t < T-1, working backwards + // t < T - 1. 
if (T > 1) { for (std::size_t t = T - 2;; --t) { - for (std::size_t i = 0; i < numStates_; ++i) { + double *betaRow = betaData + t * N; + const double *nextBetaRow = betaData + (t + 1) * N; + const double *emitNextRow = emitByTimeData + (t + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double *transRow = logTransData + i * N; double logSum = LOG_ZERO; - for (std::size_t j = 0; j < numStates_; ++j) { - logSum = logSumExp(logSum, logTrans_(i, j) + logEmitBuf_[j * T + (t + 1)] + - logBeta_(t + 1, j)); + for (std::size_t j = 0; j < N; ++j) { + logSum = logSumExp(logSum, transRow[j] + emitNextRow[j] + nextBetaRow[j]); } - logBeta_(t, i) = logSum; + betaRow[i] = logSum; } - if (t == 0) + if (t == 0) { break; + } } } } -// Numerically stable log(exp(a) + exp(b)) +void ForwardBackwardCalculator::computeLogBackwardMaxReduce() { + const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransData = logTrans_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *betaData = logBeta_.data(); + + // t = T - 1. + double *finalBetaRow = betaData + (T - 1) * N; + for (std::size_t i = 0; i < N; ++i) { + finalBetaRow[i] = 0.0; + } + + // t < T - 1. 
+ if (T > 1) { + for (std::size_t t = T - 2;; --t) { + double *betaRow = betaData + t * N; + const double *nextBetaRow = betaData + (t + 1) * N; + const double *emitNextRow = emitByTimeData + (t + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double *transRow = logTransData + i * N; + const double maxTerm = performance::detail::TranscendentalKernels::reduce_max_sum3( + transRow, emitNextRow, nextBetaRow, N); + + double logSum = LOG_ZERO; + if (std::isfinite(maxTerm)) { + const double scaledSum = + performance::detail::TranscendentalKernels::sum_exp_sum3_minus_max( + transRow, emitNextRow, nextBetaRow, N, maxTerm); + if (scaledSum > 0.0) { + logSum = maxTerm + std::log(scaledSum); + } + } + betaRow[i] = logSum; + } + if (t == 0) { + break; + } + } + } +} + +// Numerically stable log(exp(a) + exp(b)). double ForwardBackwardCalculator::logSumExp(double a, double b) noexcept { - if (a == LOG_ZERO) + if (a == LOG_ZERO) { return b; - if (b == LOG_ZERO) + } + if (b == LOG_ZERO) { return a; - if (a > b) + } + if (a > b) { return a + std::log1p(std::exp(b - a)); + } return b + std::log1p(std::exp(a - b)); } diff --git a/src/calculators/viterbi_calculator.cpp b/src/calculators/viterbi_calculator.cpp index 3ade510..ae5c18e 100755 --- a/src/calculators/viterbi_calculator.cpp +++ b/src/calculators/viterbi_calculator.cpp @@ -44,15 +44,19 @@ StateSequence ViterbiCalculator::decode() { // Fill log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) logEmitBuf_.resize(T * numStates_); const Hmm &hmm = getHmmRef(); - - std::vector obsVec(T); - for (std::size_t t = 0; t < T; ++t) - obsVec[t] = observations_(t); + const std::span obsSpan(observations_.data(), T); for (std::size_t i = 0; i < numStates_; ++i) { hmm.getDistribution(i).getBatchLogProbabilities( - std::span(obsVec.data(), T), - std::span(logEmitBuf_.data() + i * T, T)); + obsSpan, std::span(logEmitBuf_.data() + i * T, T)); + } + // Build time-major emission buffer once for locality in dynamic programming. 
+ logEmitByTime_.resize(T * numStates_); + for (std::size_t i = 0; i < numStates_; ++i) { + const double *stateRow = logEmitBuf_.data() + i * T; + for (std::size_t t = 0; t < T; ++t) { + logEmitByTime_[t * numStates_ + i] = stateRow[t]; + } } runViterbi(); @@ -68,10 +72,13 @@ void ViterbiCalculator::precomputeLogTransitions() { const Hmm &hmm = getHmmRef(); const Matrix &trans = hmm.getTrans(); logTrans_.resize(numStates_, numStates_); + logTransT_.resize(numStates_, numStates_); for (std::size_t i = 0; i < numStates_; ++i) { for (std::size_t j = 0; j < numStates_; ++j) { const double a = trans(i, j); - logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + const double logA = (a > 0.0) ? std::log(a) : LOG_ZERO; + logTrans_(i, j) = logA; + logTransT_(j, i) = logA; } } } @@ -82,37 +89,48 @@ void ViterbiCalculator::runViterbi() { const std::size_t T = observations_.size(); logDelta_.resize(T, numStates_); - psi_.assign(T, std::vector(numStates_, 0)); + psi_.assign(T * numStates_, 0); + + const double *logTransTData = logTransT_.data(); + const double *logEmitByTimeData = logEmitByTime_.data(); + double *logDeltaData = logDelta_.data(); + const std::size_t N = numStates_; // t = 0: initialise + const double *emitRow0 = logEmitByTimeData; for (std::size_t i = 0; i < numStates_; ++i) { const double logPi = (pi(i) > 0.0) ? 
std::log(pi(i)) : LOG_ZERO; - logDelta_(0, i) = logPi + logEmitBuf_[i * T + 0]; + logDeltaData[i] = logPi + emitRow0[i]; } // t > 0: recursion for (std::size_t t = 1; t < T; ++t) { + const double *prevDeltaRow = logDeltaData + (t - 1) * N; + double *deltaRow = logDeltaData + t * N; + const double *emitRow = logEmitByTimeData + t * N; for (std::size_t j = 0; j < numStates_; ++j) { double maxVal = LOG_ZERO; int maxFrom = 0; + const double *transCol = logTransTData + j * N; for (std::size_t i = 0; i < numStates_; ++i) { - const double val = logDelta_(t - 1, i) + logTrans_(i, j); + const double val = prevDeltaRow[i] + transCol[i]; if (val > maxVal) { maxVal = val; maxFrom = static_cast(i); } } - logDelta_(t, j) = maxVal + logEmitBuf_[j * T + t]; - psi_[t][j] = maxFrom; + deltaRow[j] = maxVal + emitRow[j]; + psi_[t * N + j] = maxFrom; } } // Termination: best last state double bestVal = LOG_ZERO; int bestLast = 0; + const double *finalDeltaRow = logDeltaData + (T - 1) * N; for (std::size_t i = 0; i < numStates_; ++i) { - if (logDelta_(T - 1, i) > bestVal) { - bestVal = logDelta_(T - 1, i); + if (finalDeltaRow[i] > bestVal) { + bestVal = finalDeltaRow[i]; bestLast = static_cast(i); } } @@ -126,9 +144,10 @@ void ViterbiCalculator::backtrack() { const std::size_t T = observations_.size(); if (T <= 1) return; + const std::size_t N = numStates_; for (std::size_t t = T - 2;; --t) { - sequence_(t) = psi_[t + 1][static_cast(sequence_(t + 1))]; + sequence_(t) = psi_[(t + 1) * N + static_cast(sequence_(t + 1))]; if (t == 0) break; } diff --git a/src/distributions/beta_distribution.cpp b/src/distributions/beta_distribution.cpp index b3e7f19..5a5740d 100644 --- a/src/distributions/beta_distribution.cpp +++ b/src/distributions/beta_distribution.cpp @@ -7,7 +7,7 @@ namespace libhmm { /** * Computes the probability density function for the Beta distribution. 
- * + * * @param value The value at which to evaluate the PDF (should be in [0,1]) * @return Probability density, or 0.0 if value is outside [0,1] */ @@ -82,9 +82,9 @@ double BetaDistribution::getProbability(double value) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For Beta distribution: log(f(x)) = (α-1)log(x) + (β-1)log(1-x) - log(B(α,β)) - * + * * @param value The value at which to evaluate the log-PDF (should be in [0,1]) * @return Natural logarithm of the probability density, or -∞ for invalid values */ diff --git a/src/distributions/binomial_distribution.cpp b/src/distributions/binomial_distribution.cpp index a45108a..856fa53 100644 --- a/src/distributions/binomial_distribution.cpp +++ b/src/distributions/binomial_distribution.cpp @@ -9,10 +9,10 @@ namespace libhmm { /** * Computes the probability mass function for the Binomial distribution. - * + * * For discrete distributions, this returns the exact probability mass * P(X = k) = C(n,k) * p^k * (1-p)^(n-k) - * + * * @param value The value at which to evaluate the PMF (rounded to nearest integer) * @return Probability mass for the given value */ @@ -50,13 +50,13 @@ double BinomialDistribution::getProbability(double value) const { /** * Fits the distribution parameters to the given data using maximum likelihood estimation. - * + * * For Binomial distribution with known n, the MLE of p is: * p̂ = sample_mean / n - * + * * If n is unknown, we estimate it as the maximum observed value, then fit p. * This is a common approach when the number of trials is not known a priori. - * + * * @param values Vector of observed data points */ void BinomialDistribution::fit(std::span data) { @@ -131,7 +131,7 @@ void BinomialDistribution::reset() noexcept { /** * Returns a string representation of the distribution following the standardized format. 
- * + * * @return String describing the distribution parameters and statistics */ std::string BinomialDistribution::toString() const { diff --git a/src/distributions/discrete_distribution.cpp b/src/distributions/discrete_distribution.cpp index d8a0723..a26a661 100755 --- a/src/distributions/discrete_distribution.cpp +++ b/src/distributions/discrete_distribution.cpp @@ -7,7 +7,7 @@ namespace libhmm { /** * Gets the probability mass function value for a discrete observation. - * + * * @param x The discrete value (will be cast to integer index) * @return Probability mass for the given value, 0.0 if out of range */ @@ -25,7 +25,7 @@ double DiscreteDistribution::getProbability(double x) const { /** * Fits the distribution to observed data using maximum likelihood estimation. * Computes empirical probabilities: P(X = k) = count(k) / total_count - * + * * @param values Vector of observed discrete values */ void DiscreteDistribution::fit(std::span data) { @@ -90,7 +90,7 @@ void DiscreteDistribution::reset() noexcept { /** * Returns a string representation of the distribution. - * + * * @return String showing all symbol probabilities */ std::string DiscreteDistribution::toString() const { diff --git a/src/distributions/exponential_distribution.cpp b/src/distributions/exponential_distribution.cpp index 4a66d8c..4a5b052 100755 --- a/src/distributions/exponential_distribution.cpp +++ b/src/distributions/exponential_distribution.cpp @@ -10,13 +10,13 @@ namespace libhmm { /** * Computes the probability density function for the Exponential distribution. - * + * * For continuous distributions in discrete sampling contexts, we approximate * the probability as P(x - ε <= X <= x) = F(x) - F(x - ε) where ε is a small tolerance. - * + * * This provides a numerically stable approximation of the PDF scaled by the tolerance, * which is appropriate for discrete sampling of continuous distributions. 
- * + * * @param x The value at which to evaluate the probability * @return Approximated probability for discrete sampling */ @@ -40,9 +40,9 @@ double ExponentialDistribution::getProbability(double value) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For exponential distribution: log(f(x)) = log(λ) - λx for x ≥ 0 - * + * * @param x The value at which to evaluate the log-PDF * @return Natural logarithm of the probability density, or -∞ for invalid values */ @@ -59,9 +59,9 @@ double ExponentialDistribution::getLogProbability(double value) const noexcept { /** * Evaluates the CDF for the Exponential distribution at x. - * + * * Formula: F(x) = 1 - exp(-λx) for x ≥ 0 - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -73,7 +73,7 @@ double ExponentialDistribution::getCumulativeProbability(double x) const noexcep /** * Fits the distribution parameters to the given data using maximum likelihood estimation. - * + * * For the Exponential distribution, the MLE of the rate parameter is: * λ = 1 / sample_mean * diff --git a/src/distributions/gamma_distribution.cpp b/src/distributions/gamma_distribution.cpp index 90d23cd..76cc848 100755 --- a/src/distributions/gamma_distribution.cpp +++ b/src/distributions/gamma_distribution.cpp @@ -8,7 +8,7 @@ namespace libhmm { /** * Computes the probability density function for the Gamma distribution. * PDF: f(x) = (1/(Γ(k)θ^k)) * x^(k-1) * exp(-x/θ) for x ≥ 0 - * + * * @param x The value at which to evaluate the probability * @return Probability density */ @@ -32,7 +32,7 @@ double GammaDistribution::getProbability(double x) const { /** * Evaluates the logarithm of the probability density function for numerical stability. 
* Formula: log PDF(x) = (k-1)*ln(x) - x/θ - k*ln(θ) - ln(Γ(k)) - * + * * @param x The value at which to evaluate the log PDF * @return Log probability density */ @@ -60,7 +60,7 @@ double GammaDistribution::getLogProbability(double x) const noexcept { * Evaluates the CDF at x using the incomplete gamma function * Formula: CDF(x) = P(k, x/θ) = γ(k, x/θ) / Γ(k) * where P is the regularized incomplete gamma function - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -88,15 +88,15 @@ double GammaDistribution::ligamma(double a, double x) noexcept { /** * Fits the distribution parameters to the given data using method of moments estimation. - * + * * Method of moments uses: * sample_mean = k*θ * sample_variance = k*θ² - * + * * Solving: θ = sample_variance/sample_mean, k = sample_mean²/sample_variance - * + * * This is more numerically stable than MLE approximations for the Gamma distribution. - * + * * @param values Vector of observed data points */ void GammaDistribution::fit(std::span data) { diff --git a/src/distributions/gaussian_distribution.cpp b/src/distributions/gaussian_distribution.cpp index ff9f31b..d4c48ef 100755 --- a/src/distributions/gaussian_distribution.cpp +++ b/src/distributions/gaussian_distribution.cpp @@ -10,7 +10,7 @@ using namespace libhmm::constants; namespace libhmm { /** * Returns the probability density function value for the Gaussian distribution. - * + * * Formula: PDF(x) = (1/σ√(2π)) * exp(-½((x-μ)/σ)²) */ double GaussianDistribution::getProbability(double x) const { @@ -83,7 +83,7 @@ double GaussianDistribution::getCumulativeProbability(double x) const noexcept { /* * Fits the distribution parameters using maximum likelihood estimation with optimized algorithm. 
- * + * * Uses single-pass Welford's algorithm for numerically stable variance calculation: * - Better cache locality than two-pass algorithm * - Numerically stable for extreme values diff --git a/src/distributions/log_normal_distribution.cpp b/src/distributions/log_normal_distribution.cpp index 572fb91..e598969 100755 --- a/src/distributions/log_normal_distribution.cpp +++ b/src/distributions/log_normal_distribution.cpp @@ -1,4 +1,5 @@ #include "libhmm/distributions/log_normal_distribution.h" +#include "libhmm/performance/simd_kernels_internal.h" // Header already includes: , , , , , via common.h #include // For std::accumulate (not in common.h) #include // For std::for_each (exists in common.h, included for clarity) @@ -9,13 +10,13 @@ namespace libhmm { /** * Computes the probability density function for the Log-Normal distribution. - * + * * For continuous distributions in discrete sampling contexts, we approximate * the probability as P(x - ε <= X <= x) = F(x) - F(x - ε) where ε is a small tolerance. - * + * * This provides a numerically stable approximation of the PDF scaled by the tolerance, * which is appropriate for discrete sampling of continuous distributions. - * + * * @param x The value at which to evaluate the probability * @return Approximated probability for discrete sampling */ @@ -78,13 +79,13 @@ double LogNormalDistribution::getCumulativeProbability(double value) const noexc /** * Fits the distribution parameters to the given data using maximum likelihood estimation. - * + * * For Log-Normal distribution, the MLE estimators are: * μ = mean(ln(x_i)) for positive x_i * σ = std_dev(ln(x_i)) for positive x_i - * + * * Only positive values are used since Log-Normal distribution has support (0, ∞). 
- * + * * @param values Vector of observed data points */ void LogNormalDistribution::fit(std::span data) { @@ -210,20 +211,111 @@ std::istream &operator>>(std::istream &is, libhmm::LogNormalDistribution &distri return is; } +// ============================================================================= +// Batch log-PDF — explicit SIMD intrinsics (tier 2) +// +// Formula: log f(x) = -log(x) - logNormConst + negHalfInvSigma2*(log(x)-mu)^2 +// Per element: log_x = log(x); then result = -log_x - C + S*(log_x - mu)^2 +// where C = logNormalizationConstant_, S = negHalfSigmaSquaredInv_. +// +// x <= 0 lanes: log(x) is -inf; guard produces -inf output. +// Pattern mirrors gaussian_logpdf_batch (gaussian_distribution.cpp). +// ============================================================================= +namespace detail { + +void lognormal_logpdf_batch(const double *obs, double *out, std::size_t n, double mu, double S, + double C) noexcept { + using namespace performance::detail::kernels; + std::size_t i = 0; + const double neg_inf = -std::numeric_limits::infinity(); + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vmu = _mm512_set1_pd(mu); + const __m512d vS = _mm512_set1_pd(S); + const __m512d vC = _mm512_set1_pd(C); + for (; i + 8 <= n; i += 8) { + __m512d x = _mm512_loadu_pd(obs + i); + __m512d lx = k_log_pd_avx512(x); // -inf where x<=0 + __m512d d = _mm512_sub_pd(lx, vmu); // log(x) - mu + __m512d res = _mm512_fmadd_pd( + d, _mm512_mul_pd(d, vS), + _mm512_sub_pd(_mm512_setzero_pd(), _mm512_add_pd(lx, vC))); // -lx - C + S*d^2 + _mm512_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vmu = _mm256_set1_pd(mu); + const __m256d vS = _mm256_set1_pd(S); + const __m256d vC = _mm256_set1_pd(C); + for (; i + 4 <= n; i += 4) { + __m256d x = _mm256_loadu_pd(obs + i); + __m256d lx = k_log_pd_avx(x); + __m256d d = _mm256_sub_pd(lx, vmu); + __m256d res = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(d, 
d), vS), + _mm256_sub_pd(_mm256_setzero_pd(), _mm256_add_pd(lx, vC))); + _mm256_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vmu = _mm_set1_pd(mu); + const __m128d vS = _mm_set1_pd(S); + const __m128d vC = _mm_set1_pd(C); + for (; i + 2 <= n; i += 2) { + __m128d x = _mm_loadu_pd(obs + i); + __m128d lx = k_log_pd_sse2(x); + __m128d d = _mm_sub_pd(lx, vmu); + __m128d res = _mm_add_pd(_mm_mul_pd(_mm_mul_pd(d, d), vS), + _mm_sub_pd(_mm_setzero_pd(), _mm_add_pd(lx, vC))); + _mm_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vmu = vdupq_n_f64(mu); + const float64x2_t vS = vdupq_n_f64(S); + const float64x2_t vC = vdupq_n_f64(C); + for (; i + 2 <= n; i += 2) { + float64x2_t x = vld1q_f64(obs + i); + float64x2_t lx = k_log_pd_neon(x); + float64x2_t d = vsubq_f64(lx, vmu); + // res = S*d^2 + (-lx - C) = S*d^2 - lx - C + float64x2_t res = vfmaq_f64(vsubq_f64(vnegq_f64(lx), vC), vmulq_f64(d, d), vS); + vst1q_f64(out + i, res); + } + } +#endif + + // Scalar tail. + for (; i < n; ++i) { + const double x = obs[i]; + if (x <= 0.0 || std::isnan(x) || std::isinf(x)) { + out[i] = neg_inf; + } else { + const double lx = std::log(x); + const double d = lx - mu; + out[i] = -lx - C + S * d * d; + } + } +} + +} // namespace detail + void LogNormalDistribution::getBatchLogProbabilities(std::span observations, std::span out) const { - // Tier 1 — concrete non-virtual loop; compiler auto-vectorizes the arithmetic - // terms under -march=native / /arch:AVX512. - // Tier 2 upgrade requires vectorised log(x): the inner loop is essentially - // Gaussian on log(x), so once a vectorised log is available the pattern is - // identical to GaussianDistribution tier 2 but with an extra log-transform - // step. Available via Intel SVML, GNU libmvec, or Apple Accelerate vvlog, - // but not portably without a math-library dependency. 
+ // Tier 2 — explicit SIMD via simd_kernels_internal.h if (!isCacheValid()) updateCache(); - for (std::size_t i = 0; i < observations.size(); ++i) { - out[i] = LogNormalDistribution::getLogProbability(observations[i]); - } + detail::lognormal_logpdf_batch(observations.data(), out.data(), observations.size(), mean_, + negHalfSigmaSquaredInv_, logNormalizationConstant_); } } // namespace libhmm diff --git a/src/distributions/negative_binomial_distribution.cpp b/src/distributions/negative_binomial_distribution.cpp index 24e8e0f..2836602 100644 --- a/src/distributions/negative_binomial_distribution.cpp +++ b/src/distributions/negative_binomial_distribution.cpp @@ -9,10 +9,10 @@ namespace libhmm { /** * Computes the probability mass function for the Negative Binomial distribution. - * + * * For discrete distributions, this returns the exact probability mass * P(X = k) = C(k+r-1, k) * p^r * (1-p)^k - * + * * @param value The value at which to evaluate the PMF (rounded to nearest integer) * @return Probability mass for the given value */ @@ -45,14 +45,14 @@ double NegativeBinomialDistribution::getProbability(double value) const { /** * Fits the distribution parameters to the given data using method of moments. - * + * * For Negative Binomial distribution, the method of moments estimators are: * p̂ = mean / variance (if variance > mean) * r̂ = mean² / (variance - mean) (if variance > mean) - * - * If variance ≤ mean, the negative binomial model is not appropriate + * + * If variance ≤ mean, the negative binomial model is not appropriate * (indicates under-dispersion), so we fall back to default parameters. - * + * * @param values Vector of observed data points */ void NegativeBinomialDistribution::fit(std::span data) { @@ -139,7 +139,7 @@ void NegativeBinomialDistribution::reset() noexcept { /** * Returns a string representation of the distribution following the standardized format. 
- * + * * @return String describing the distribution parameters and statistics */ std::string NegativeBinomialDistribution::toString() const { diff --git a/src/distributions/pareto_distribution.cpp b/src/distributions/pareto_distribution.cpp index aae3b3b..a6e5968 100755 --- a/src/distributions/pareto_distribution.cpp +++ b/src/distributions/pareto_distribution.cpp @@ -1,4 +1,5 @@ #include "libhmm/distributions/pareto_distribution.h" +#include "libhmm/performance/simd_kernels_internal.h" // Header already includes: , , , , , via common.h #include // For std::accumulate (not in common.h) #include // For std::min_element (exists in common.h, included for clarity) @@ -10,11 +11,11 @@ namespace libhmm { /** * Computes the probability density function for the Pareto distribution. - * + * * For Pareto distribution: f(x) = (k * x_m^k) / x^(k+1) for x ≥ x_m - * + * * Uses direct PDF calculation for optimal performance, avoiding expensive CDF differences. - * + * * @param x The value at which to evaluate the probability density * @return Probability density for the given value */ @@ -38,9 +39,9 @@ double ParetoDistribution::getProbability(double x) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For Pareto distribution: log(f(x)) = log(k) + k*log(x_m) - (k+1)*log(x) for x ≥ x_m - * + * * @param value The value at which to evaluate the log-PDF * @return Natural logarithm of the probability density, or -∞ for invalid values */ @@ -65,9 +66,9 @@ double ParetoDistribution::getCumulativeProbability(double value) const noexcept /** * Evaluates the CDF for the Pareto distribution at x. - * + * * Formula: F(x) = 1 - (x_m/x)^k for x ≥ x_m - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -77,11 +78,11 @@ double ParetoDistribution::CDF(double x) const noexcept { /** * Fits the distribution parameters to the given data using maximum likelihood estimation. 
- * + * * For Pareto distribution, the MLE estimators are: * x_m = min(x_i) for all i * k = n / Σ(ln(x_i) - ln(x_m)) for i = 1 to n - * + * * @param values Vector of observed data */ void ParetoDistribution::fit(std::span data) { @@ -196,19 +197,108 @@ std::istream &operator>>(std::istream &is, libhmm::ParetoDistribution &distribut return is; } +// ============================================================================= +// Batch log-PDF — explicit SIMD intrinsics (tier 2) +// +// Formula: log f(x) = logK + kLogXm - kPlus1 * log(x) for x >= xm +// = -inf for x < xm +// ============================================================================= +namespace detail { + +void pareto_logpdf_batch(const double *obs, double *out, std::size_t n, double xm, + double logK_plus_kLogXm, double kPlus1) noexcept { + using namespace performance::detail::kernels; + std::size_t i = 0; + const double neg_inf = -std::numeric_limits::infinity(); + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vxm = _mm512_set1_pd(xm); + const __m512d vconst = _mm512_set1_pd(logK_plus_kLogXm); + const __m512d vkp1 = _mm512_set1_pd(kPlus1); + const __m512d vneg_inf = _mm512_set1_pd(neg_inf); + for (; i + 8 <= n; i += 8) { + __m512d x = _mm512_loadu_pd(obs + i); + // x < xm: -inf + __mmask8 invalid = _mm512_cmp_pd_mask(x, vxm, _CMP_LT_OS); + __m512d lx = k_log_pd_avx512(x); + __m512d res = _mm512_fnmadd_pd(vkp1, lx, vconst); // const - kp1*log(x) + res = _mm512_mask_blend_pd(invalid, res, vneg_inf); + _mm512_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vxm = _mm256_set1_pd(xm); + const __m256d vconst = _mm256_set1_pd(logK_plus_kLogXm); + const __m256d vkp1 = _mm256_set1_pd(kPlus1); + const __m256d vneg_inf = _mm256_set1_pd(neg_inf); + for (; i + 4 <= n; i += 4) { + __m256d x = _mm256_loadu_pd(obs + i); + __m256d inv = _mm256_cmp_pd(x, vxm, _CMP_LT_OS); // all-1s where x < xm + __m256d lx = k_log_pd_avx(x); + __m256d 
res = _mm256_sub_pd(vconst, _mm256_mul_pd(vkp1, lx)); + res = _mm256_blendv_pd(res, vneg_inf, inv); + _mm256_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vxm = _mm_set1_pd(xm); + const __m128d vconst = _mm_set1_pd(logK_plus_kLogXm); + const __m128d vkp1 = _mm_set1_pd(kPlus1); + const __m128d vneg_inf = _mm_set1_pd(neg_inf); + for (; i + 2 <= n; i += 2) { + __m128d x = _mm_loadu_pd(obs + i); + __m128d inv = _mm_cmplt_pd(x, vxm); + __m128d lx = k_log_pd_sse2(x); + __m128d res = _mm_sub_pd(vconst, _mm_mul_pd(vkp1, lx)); + res = _mm_or_pd(_mm_andnot_pd(inv, res), _mm_and_pd(inv, vneg_inf)); + _mm_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vxm = vdupq_n_f64(xm); + const float64x2_t vconst = vdupq_n_f64(logK_plus_kLogXm); + const float64x2_t vkp1 = vdupq_n_f64(kPlus1); + const float64x2_t vneg_inf = vdupq_n_f64(neg_inf); + for (; i + 2 <= n; i += 2) { + float64x2_t x = vld1q_f64(obs + i); + uint64x2_t inv = vcltq_f64(x, vxm); // x < xm + float64x2_t lx = k_log_pd_neon(x); + float64x2_t res = vsubq_f64(vconst, vmulq_f64(vkp1, lx)); + res = vbslq_f64(inv, vneg_inf, res); + vst1q_f64(out + i, res); + } + } +#endif + + // Scalar tail. + for (; i < n; ++i) { + const double x = obs[i]; + out[i] = (std::isnan(x) || std::isinf(x) || x < xm) + ? neg_inf + : logK_plus_kLogXm - kPlus1 * std::log(x); + } +} + +} // namespace detail + void ParetoDistribution::getBatchLogProbabilities(std::span observations, std::span out) const { - // Tier 1 — concrete non-virtual loop; compiler auto-vectorizes the arithmetic - // terms under -march=native / /arch:AVX512. - // Tier 2 upgrade requires vectorised log(x): inner loop is - // log(α) + α*log(x_m) - (α+1)*log(x), so a vectorised log is needed. - // Available via Intel SVML, GNU libmvec, or Apple Accelerate vvlog, but - // not portably without a math-library dependency. 
+ // Tier 2 — explicit SIMD via simd_kernels_internal.h if (!isCacheValid()) updateCache(); - for (std::size_t i = 0; i < observations.size(); ++i) { - out[i] = ParetoDistribution::getLogProbability(observations[i]); - } + // logK_ + kLogXm_ is a single scalar constant — compute once. + detail::pareto_logpdf_batch(observations.data(), out.data(), observations.size(), xm_, + logK_ + kLogXm_, kPlus1_); } } // namespace libhmm diff --git a/src/distributions/rayleigh_distribution.cpp b/src/distributions/rayleigh_distribution.cpp index fa8a4c0..8ab4218 100644 --- a/src/distributions/rayleigh_distribution.cpp +++ b/src/distributions/rayleigh_distribution.cpp @@ -6,9 +6,9 @@ namespace libhmm { /** * Computes the probability density function for the Rayleigh distribution. - * + * * PDF: f(x) = (x/σ²) * exp(-x²/(2σ²)) for x ≥ 0 - * + * * @param value The value at which to evaluate the PDF * @return Probability density */ @@ -24,9 +24,9 @@ double RayleighDistribution::getProbability(double value) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For Rayleigh distribution: log(f(x)) = log(x) - 2*log(σ) - x²/(2σ²) for x > 0 - * + * * @param value The value at which to evaluate the log-PDF * @return Natural logarithm of the probability density, or -∞ for invalid values */ @@ -54,7 +54,7 @@ double RayleighDistribution::getCumulativeProbability(double value) const noexce * Fits the distribution parameters to the given data using maximum likelihood estimation. * This method is efficient as it requires only a single pass through the data * to compute the sum of squares. 
- * + * * @param values Vector of observed data */ void RayleighDistribution::fit(std::span data) { diff --git a/src/distributions/student_t_distribution.cpp b/src/distributions/student_t_distribution.cpp index c463d6e..f7944d2 100644 --- a/src/distributions/student_t_distribution.cpp +++ b/src/distributions/student_t_distribution.cpp @@ -121,7 +121,7 @@ double StudentTDistribution::getLogProbability(double value) const noexcept { /** * Computes the cumulative distribution function for the Student's t-distribution. - * + * * Uses the relationship with the incomplete beta function for numerical accuracy. */ double StudentTDistribution::getCumulativeProbability(double value) const noexcept { diff --git a/src/performance/transcendental_kernels.cpp b/src/performance/transcendental_kernels.cpp new file mode 100644 index 0000000..7d61803 --- /dev/null +++ b/src/performance/transcendental_kernels.cpp @@ -0,0 +1,436 @@ +// src/performance/transcendental_kernels.cpp +// +// SIMD implementations of TranscendentalKernels methods. +// +// Compiled with LIBHMM_BEST_SIMD_FLAGS, activating the ISA cascade: +// AVX-512 8-wide __m512d +// AVX/AVX2 4-wide __m256d (AVX-1 compatible; compiler fuses FMA under AVX2) +// SSE2 2-wide __m128d +// NEON 2-wide float64x2_t +// scalar tail and portable fallback +// +// Vector exp helpers (k_exp_pd_*) and log helpers (k_log_pd_*) are defined +// in simd_kernels_internal.h -- the single source of truth shared with +// Tier-2 distribution TUs (log_normal_distribution.cpp, pareto_distribution.cpp). 
+ +#include "libhmm/performance/transcendental_kernels.h" +#include "libhmm/performance/simd_kernels_internal.h" +#include "libhmm/math/constants.h" +#include "libhmm/platform/simd_platform.h" + +#include +#include +#include +#include + +namespace libhmm { +namespace performance { +namespace detail { + +namespace { + +// --------------------------------------------------------------------------- +// Horizontal reduction helpers +// --------------------------------------------------------------------------- + +// SSE2: horizontal max of 2-lane vector. +#if defined(LIBHMM_HAS_SSE2) +static inline double hmax_pd_sse2(__m128d v) noexcept { + __m128d shuf = _mm_shuffle_pd(v, v, 1); + return _mm_cvtsd_f64(_mm_max_pd(v, shuf)); +} +static inline double hadd_pd_sse2(__m128d v) noexcept { + __m128d shuf = _mm_shuffle_pd(v, v, 1); + return _mm_cvtsd_f64(_mm_add_pd(v, shuf)); +} +#endif + +// AVX: horizontal max/sum of 4-lane vector. +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) +static inline double hmax_pd_avx(__m256d v) noexcept { + __m128d lo = _mm256_castpd256_pd128(v); + __m128d hi = _mm256_extractf128_pd(v, 1); + __m128d m = _mm_max_pd(lo, hi); + return hmax_pd_sse2(m); +} +static inline double hadd_pd_avx(__m256d v) noexcept { + __m128d lo = _mm256_castpd256_pd128(v); + __m128d hi = _mm256_extractf128_pd(v, 1); + __m128d s = _mm_add_pd(lo, hi); + return hadd_pd_sse2(s); +} +#endif + +} // anonymous namespace + +// ============================================================================= +// TranscendentalKernels method implementations +// ============================================================================= + +// ----------------------------------------------------------------------------- +// reduce_max_sum2: max of (a[i] + b[i]) +// ----------------------------------------------------------------------------- +double TranscendentalKernels::reduce_max_sum2(const double *a, const double *b, + std::size_t size) noexcept { + std::size_t i = 0; + 
const double neg_inf = -std::numeric_limits::infinity(); + // maxVal accumulates across ISA blocks; each lower-tier block seeds its + // vector accumulator from the value set by the highest active tier. + double maxVal; +#if defined(LIBHMM_HAS_AVX512) + { + __m512d vmax = _mm512_set1_pd(neg_inf); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + vmax = _mm512_max_pd(vmax, _mm512_add_pd(va, vb)); + } + maxVal = _mm512_reduce_max_pd(vmax); + } +#else + maxVal = neg_inf; +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + __m256d vmax = _mm256_set1_pd(maxVal); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + vmax = _mm256_max_pd(vmax, _mm256_add_pd(va, vb)); + } + maxVal = hmax_pd_avx(vmax); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + __m128d vmax = _mm_set1_pd(maxVal); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + vmax = _mm_max_pd(vmax, _mm_add_pd(va, vb)); + } + maxVal = hmax_pd_sse2(vmax); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + float64x2_t vmax = vdupq_n_f64(maxVal); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + vmax = vmaxq_f64(vmax, vaddq_f64(va, vb)); + } + maxVal = vmaxvq_f64(vmax); + } +#endif + + // Scalar tail. 
+ for (; i < size; ++i) { + const double t = a[i] + b[i]; + if (t > maxVal) + maxVal = t; + } + return maxVal; +} + +// ----------------------------------------------------------------------------- +// sum_exp_sum2_minus_max +// ----------------------------------------------------------------------------- +double TranscendentalKernels::sum_exp_sum2_minus_max(const double *a, const double *b, + std::size_t size, double maxVal) noexcept { + if (!std::isfinite(maxVal)) + return 0.0; + std::size_t i = 0; + double sum = 0.0; + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vmaxv = _mm512_set1_pd(maxVal); + __m512d vsum = _mm512_setzero_pd(); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d term = _mm512_sub_pd(_mm512_add_pd(va, vb), vmaxv); + vsum = _mm512_add_pd(vsum, kernels::k_exp_pd_avx512(term)); + } + sum += _mm512_reduce_add_pd(vsum); + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vmaxv = _mm256_set1_pd(maxVal); + __m256d vsum = _mm256_setzero_pd(); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d term = _mm256_sub_pd(_mm256_add_pd(va, vb), vmaxv); + vsum = _mm256_add_pd(vsum, kernels::k_exp_pd_avx(term)); + } + sum += hadd_pd_avx(vsum); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vmaxv = _mm_set1_pd(maxVal); + __m128d vsum = _mm_setzero_pd(); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d term = _mm_sub_pd(_mm_add_pd(va, vb), vmaxv); + vsum = _mm_add_pd(vsum, kernels::k_exp_pd_sse2(term)); + } + sum += hadd_pd_sse2(vsum); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vmaxv = vdupq_n_f64(maxVal); + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + float64x2_t term = 
vsubq_f64(vaddq_f64(va, vb), vmaxv); + vsum = vaddq_f64(vsum, kernels::k_exp_pd_neon(term)); + } + sum += vaddvq_f64(vsum); + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + const double t = a[i] + b[i]; + if (std::isfinite(t)) + sum += std::exp(t - maxVal); + } + return sum; +} + +// ----------------------------------------------------------------------------- +// reduce_max_sum3: max of (a[i] + b[i] + c[i]) +// ----------------------------------------------------------------------------- +double TranscendentalKernels::reduce_max_sum3(const double *a, const double *b, const double *c, + std::size_t size) noexcept { + std::size_t i = 0; + const double neg_inf = -std::numeric_limits::infinity(); + double maxVal; +#if defined(LIBHMM_HAS_AVX512) + { + __m512d vmax = _mm512_set1_pd(neg_inf); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d vc = _mm512_loadu_pd(c + i); + vmax = _mm512_max_pd(vmax, _mm512_add_pd(_mm512_add_pd(va, vb), vc)); + } + maxVal = _mm512_reduce_max_pd(vmax); + } +#else + maxVal = neg_inf; +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + __m256d vmax = _mm256_set1_pd(maxVal); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d vc = _mm256_loadu_pd(c + i); + vmax = _mm256_max_pd(vmax, _mm256_add_pd(_mm256_add_pd(va, vb), vc)); + } + maxVal = hmax_pd_avx(vmax); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + __m128d vmax = _mm_set1_pd(maxVal); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d vc = _mm_loadu_pd(c + i); + vmax = _mm_max_pd(vmax, _mm_add_pd(_mm_add_pd(va, vb), vc)); + } + maxVal = hmax_pd_sse2(vmax); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + float64x2_t vmax = vdupq_n_f64(maxVal); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); 
+ float64x2_t vc = vld1q_f64(c + i); + vmax = vmaxq_f64(vmax, vaddq_f64(vaddq_f64(va, vb), vc)); + } + maxVal = vmaxvq_f64(vmax); + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + const double t = a[i] + b[i] + c[i]; + if (t > maxVal) + maxVal = t; + } + return maxVal; +} + +// ----------------------------------------------------------------------------- +// sum_exp_sum3_minus_max: sum of exp(a[i]+b[i]+c[i] - maxVal) +// ----------------------------------------------------------------------------- +double TranscendentalKernels::sum_exp_sum3_minus_max(const double *a, const double *b, + const double *c, std::size_t size, + double maxVal) noexcept { + if (!std::isfinite(maxVal)) + return 0.0; + std::size_t i = 0; + double sum = 0.0; + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vmaxv = _mm512_set1_pd(maxVal); + __m512d vsum = _mm512_setzero_pd(); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d vc = _mm512_loadu_pd(c + i); + __m512d term = _mm512_sub_pd(_mm512_add_pd(_mm512_add_pd(va, vb), vc), vmaxv); + vsum = _mm512_add_pd(vsum, kernels::k_exp_pd_avx512(term)); + } + sum += _mm512_reduce_add_pd(vsum); + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vmaxv = _mm256_set1_pd(maxVal); + __m256d vsum = _mm256_setzero_pd(); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d vc = _mm256_loadu_pd(c + i); + __m256d term = _mm256_sub_pd(_mm256_add_pd(_mm256_add_pd(va, vb), vc), vmaxv); + vsum = _mm256_add_pd(vsum, kernels::k_exp_pd_avx(term)); + } + sum += hadd_pd_avx(vsum); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vmaxv = _mm_set1_pd(maxVal); + __m128d vsum = _mm_setzero_pd(); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d vc = _mm_loadu_pd(c + i); + __m128d term = 
_mm_sub_pd(_mm_add_pd(_mm_add_pd(va, vb), vc), vmaxv); + vsum = _mm_add_pd(vsum, kernels::k_exp_pd_sse2(term)); + } + sum += hadd_pd_sse2(vsum); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vmaxv = vdupq_n_f64(maxVal); + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + float64x2_t vc = vld1q_f64(c + i); + float64x2_t term = vsubq_f64(vaddq_f64(vaddq_f64(va, vb), vc), vmaxv); + vsum = vaddq_f64(vsum, kernels::k_exp_pd_neon(term)); + } + sum += vaddvq_f64(vsum); + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + const double t = a[i] + b[i] + c[i]; + if (std::isfinite(t)) + sum += std::exp(t - maxVal); + } + return sum; +} + +// ----------------------------------------------------------------------------- +// accumulate_exp_sum2_bias: dst[i] += exp(a[i] + b[i] + bias) +// ----------------------------------------------------------------------------- +void TranscendentalKernels::accumulate_exp_sum2_bias(double *dst, const double *a, const double *b, + std::size_t size, double bias) noexcept { + std::size_t i = 0; + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vbias = _mm512_set1_pd(bias); + for (; i + 8 <= size; i += 8) { + __m512d vd = _mm512_loadu_pd(dst + i); + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d arg = _mm512_add_pd(_mm512_add_pd(va, vb), vbias); + vd = _mm512_add_pd(vd, kernels::k_exp_pd_avx512(arg)); + _mm512_storeu_pd(dst + i, vd); + } + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vbias = _mm256_set1_pd(bias); + for (; i + 4 <= size; i += 4) { + __m256d vd = _mm256_loadu_pd(dst + i); + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d arg = _mm256_add_pd(_mm256_add_pd(va, vb), vbias); + vd = _mm256_add_pd(vd, kernels::k_exp_pd_avx(arg)); + _mm256_storeu_pd(dst + i, vd); + } + } +#endif + +#if 
defined(LIBHMM_HAS_SSE2) + { + const __m128d vbias = _mm_set1_pd(bias); + for (; i + 2 <= size; i += 2) { + __m128d vd = _mm_loadu_pd(dst + i); + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d arg = _mm_add_pd(_mm_add_pd(va, vb), vbias); + vd = _mm_add_pd(vd, kernels::k_exp_pd_sse2(arg)); + _mm_storeu_pd(dst + i, vd); + } + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vbias = vdupq_n_f64(bias); + for (; i + 2 <= size; i += 2) { + float64x2_t vd = vld1q_f64(dst + i); + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + float64x2_t arg = vaddq_f64(vaddq_f64(va, vb), vbias); + vd = vaddq_f64(vd, kernels::k_exp_pd_neon(arg)); + vst1q_f64(dst + i, vd); + } + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + dst[i] += std::exp(a[i] + b[i] + bias); + } +} + +} // namespace detail +} // namespace performance +} // namespace libhmm diff --git a/src/training/baum_welch_trainer.cpp b/src/training/baum_welch_trainer.cpp index 7ae236f..96b251a 100755 --- a/src/training/baum_welch_trainer.cpp +++ b/src/training/baum_welch_trainer.cpp @@ -1,6 +1,7 @@ #include "libhmm/training/baum_welch_trainer.h" #include "libhmm/calculators/forward_backward_calculator.h" #include "libhmm/hmm.h" +#include "libhmm/performance/transcendental_kernels.h" #include #include #include @@ -26,23 +27,41 @@ BaumWelchTrainer::BaumWelchTrainer(Hmm *hmm, const ObservationLists &obsLists) void BaumWelchTrainer::train() { Hmm &hmm = hmm_ref_.get(); const std::size_t N = static_cast(hmm.getNumStates()); + std::size_t totalExpectedLength = 0; + for (const auto &obs : obsLists_) { + totalExpectedLength += obs.size(); + } // Accumulators (linear space, summed across all sequences) std::vector piNum(N, 0.0); - std::vector> transNum(N, std::vector(N, 0.0)); std::vector transDen(N, 0.0); + // Column-major accumulation: transNumT[j * N + i] stores the expected count + // for transition i->j. 
This matches the t/j/i xi loop for contiguous reads + // from the transposed log-transition matrix. + std::vector transNumT(N * N, 0.0); // Per-state emission data/weights accumulated across sequences std::vector> emisData(N); std::vector> emisWts(N); + for (std::size_t i = 0; i < N; ++i) { + emisData[i].reserve(totalExpectedLength); + emisWts[i].reserve(totalExpectedLength); + } - // Precompute log-transition matrix from the current model + // Precompute transposed log-transition matrix from the current model: + // logTransT[j * N + i] = log a_{ij} const Matrix &curTrans = hmm.getTrans(); - std::vector> logTrans(N, std::vector(N)); + std::vector logTransT(N * N); + bool hasZeroTransitions = false; for (std::size_t i = 0; i < N; ++i) { for (std::size_t j = 0; j < N; ++j) { const double a = curTrans(i, j); - logTrans[i][j] = (a > 0.0) ? std::log(a) : LOG_ZERO; + if (a > 0.0) { + logTransT[j * N + i] = std::log(a); + } else { + logTransT[j * N + i] = LOG_ZERO; + hasZeroTransitions = true; + } } } @@ -60,24 +79,33 @@ void BaumWelchTrainer::train() { const Matrix &logAlpha = fbc.getLogForwardVariables(); const Matrix &logBeta = fbc.getLogBackwardVariables(); - - // Precompute log-emissions for this sequence: logEmit[i * T + t] - std::vector obsVec(T); - for (std::size_t t = 0; t < T; ++t) - obsVec[t] = obs(t); - - std::vector logEmit(N * T); + const double *logAlphaData = logAlpha.data(); + const double *logBetaData = logBeta.data(); + + // Precompute log-emissions for this sequence, then relayout to time-major: + // logEmitByTime[t * N + j] = log b_j(O_t) + std::vector logEmitStateMajor(N * T); + std::vector logEmitByTime(N * T); + const std::span obsSpan(obs.data(), T); for (std::size_t i = 0; i < N; ++i) { hmm.getDistribution(i).getBatchLogProbabilities( - std::span(obsVec.data(), T), - std::span(logEmit.data() + i * T, T)); + obsSpan, std::span(logEmitStateMajor.data() + i * T, T)); + } + for (std::size_t i = 0; i < N; ++i) { + const double *stateRow = 
logEmitStateMajor.data() + i * T; + for (std::size_t t = 0; t < T; ++t) { + logEmitByTime[t * N + i] = stateRow[t]; + } } // Accumulate gamma (per timestep per state) and pi/trans denominators for (std::size_t t = 0; t < T; ++t) { + const double *alphaRow = logAlphaData + t * N; + const double *betaRow = logBetaData + t * N; + const double obsVal = obs(t); for (std::size_t i = 0; i < N; ++i) { - const double g = std::exp(logAlpha(t, i) + logBeta(t, i) - logP); - emisData[i].push_back(obs(t)); + const double g = std::exp(alphaRow[i] + betaRow[i] - logP); + emisData[i].push_back(obsVal); emisWts[i].push_back(g); if (t == 0) piNum[i] += g; @@ -86,13 +114,40 @@ void BaumWelchTrainer::train() { } } - // Accumulate xi (transition counts) - for (std::size_t t = 0; t + 1 < T; ++t) { - for (std::size_t i = 0; i < N; ++i) { + // Accumulate xi (transition counts). Dense models take a branch-free + // path; sparse models keep the zero-transition skip. + // Sparse path is intentionally scalar: masking non-zero transitions in + // a SIMD loop costs more than it saves for the typically small fraction + // of non-zero entries in a sparse model. 
+ if (hasZeroTransitions) { + for (std::size_t t = 0; t + 1 < T; ++t) { + const double *alphaRow = logAlphaData + t * N; + const double *betaNextRow = logBetaData + (t + 1) * N; + const double *emitNextRow = logEmitByTime.data() + (t + 1) * N; + for (std::size_t j = 0; j < N; ++j) { + const double emitBetaNext = emitNextRow[j] + betaNextRow[j] - logP; + const double *transCol = logTransT.data() + j * N; + double *transNumCol = transNumT.data() + j * N; + for (std::size_t i = 0; i < N; ++i) { + if (transCol[i] == LOG_ZERO) { + continue; + } + const double logXi = alphaRow[i] + transCol[i] + emitBetaNext; + transNumCol[i] += std::exp(logXi); + } + } + } + } else { + for (std::size_t t = 0; t + 1 < T; ++t) { + const double *alphaRow = logAlphaData + t * N; + const double *betaNextRow = logBetaData + (t + 1) * N; + const double *emitNextRow = logEmitByTime.data() + (t + 1) * N; for (std::size_t j = 0; j < N; ++j) { - const double logXi = logAlpha(t, i) + logTrans[i][j] + - logEmit[j * T + (t + 1)] + logBeta(t + 1, j) - logP; - transNum[i][j] += std::exp(logXi); + const double emitBetaNext = emitNextRow[j] + betaNextRow[j] - logP; + const double *transCol = logTransT.data() + j * N; + double *transNumCol = transNumT.data() + j * N; + performance::detail::TranscendentalKernels::accumulate_exp_sum2_bias( + transNumCol, alphaRow, transCol, N, emitBetaNext); } } } @@ -122,7 +177,7 @@ void BaumWelchTrainer::train() { Matrix newTrans(N, N); for (std::size_t i = 0; i < N; ++i) { for (std::size_t j = 0; j < N; ++j) { - newTrans(i, j) = (transDen[i] > 0.0) ? transNum[i][j] / transDen[i] + newTrans(i, j) = (transDen[i] > 0.0) ? 
transNumT[j * N + i] / transDen[i] : 1.0 / static_cast(N); } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 29f55ef..1fb97fb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -142,20 +142,32 @@ if(GTest_FOUND OR TARGET gtest) set(ALL_TEST_TARGETS "") # ========================================================================= - # Level 0: Platform - # No tests yet. test_simd_platform will be added in Phase 4.5.2 (tools) - # and referenced here when a portable SIMD-capability test is available. + # Platform Capabilities # ========================================================================= + add_hmm_test(test_simd_platform platform/test_simd_platform.cpp) # ========================================================================= - # Level 1: Math & Numerics + # Math & Numerics # ========================================================================= add_hmm_test(test_modern_constants common/test_modern_constants.cpp) add_hmm_test(test_numerical_stability common/test_numerical_stability.cpp) add_hmm_test(test_common common/test_common.cpp) # ========================================================================= - # Level 3: Distributions + # Performance Primitives + # Cross-cutting SIMD kernels consumed by both calculators and trainers. + # Compiled with LIBHMM_BEST_SIMD_FLAGS so the active SIMD path matches + # the production library -- parity is checked against std::exp. 
+ # ========================================================================= + add_hmm_test(test_transcendental_kernels performance/test_transcendental_kernels.cpp) + if(LIBHMM_BEST_SIMD_FLAGS) + set_source_files_properties( + performance/test_transcendental_kernels.cpp + PROPERTIES COMPILE_FLAGS "${LIBHMM_BEST_SIMD_FLAGS}") + endif() + + # ========================================================================= + # Distributions # ========================================================================= add_hmm_test(test_distribution_traits distributions/test_distribution_traits.cpp) add_hmm_test(test_distributions_header distributions/test_distributions_header.cpp) @@ -178,27 +190,29 @@ if(GTest_FOUND OR TARGET gtest) add_hmm_test(test_weibull_distribution distributions/test_weibull_distribution.cpp) # ========================================================================= - # Level 4: Core HMM + # Core HMM # ========================================================================= add_hmm_test(test_hmm_core test_hmm_core.cpp) # ========================================================================= - # Level 5: Calculators + # Calculators # ========================================================================= add_hmm_test(test_canonical_calculators calculators/test_canonical_calculators.cpp) add_hmm_test(test_calculator_continuous calculators/test_calculator_continuous.cpp) add_hmm_test(test_calculator_edge_cases calculators/test_calculator_edge_cases.cpp) + add_hmm_test(test_fb_mode_parity calculators/test_fb_mode_parity.cpp) # ========================================================================= - # Level 6: Trainers + # Trainers # ========================================================================= add_hmm_test(test_canonical_training training/test_canonical_training.cpp) add_hmm_test(test_training training/test_training.cpp) add_hmm_test(test_training_edge_cases training/test_training_edge_cases.cpp) 
add_hmm_test(test_baum_welch_convergence training/test_baum_welch_convergence.cpp) + add_hmm_test(test_bw_parity training/test_bw_parity.cpp) # ========================================================================= - # Level 7: IO & Integration + # IO & Integration # ========================================================================= add_hmm_test(test_xml_file_io io/test_xml_file_io.cpp) add_hmm_test(test_hmm_stream_io io/test_hmm_stream_io.cpp) diff --git a/tests/calculators/test_fb_mode_parity.cpp b/tests/calculators/test_fb_mode_parity.cpp new file mode 100644 index 0000000..32ce496 --- /dev/null +++ b/tests/calculators/test_fb_mode_parity.cpp @@ -0,0 +1,216 @@ +#include + +#include "libhmm/performance/fb_recurrence_policy.h" +#include "libhmm/calculators/forward_backward_calculator.h" +#include "libhmm/distributions/discrete_distribution.h" +#include "libhmm/distributions/gaussian_distribution.h" + +#include +#include +#include +#include +#include + +using namespace libhmm; + +namespace { + +constexpr double kAbsTol = 1e-9; +constexpr double kRelTol = 1e-12; + +void expectClose(double a, double b, double absTol = kAbsTol, double relTol = kRelTol) { + if (std::isnan(a) || std::isnan(b)) { + FAIL() << "Unexpected NaN: a=" << a << " b=" << b; + } + if (a == b) { + return; + } + const double diff = std::abs(a - b); + if (diff <= absTol) { + return; + } + const double largest = std::max(std::abs(a), std::abs(b)); + EXPECT_LE(diff, relTol * largest) + << "values differ beyond tolerance: a=" << a << " b=" << b << " diff=" << diff; +} + +void expectMatricesClose(const Matrix &a, const Matrix &b) { + ASSERT_EQ(a.size1(), b.size1()); + ASSERT_EQ(a.size2(), b.size2()); + for (std::size_t i = 0; i < a.size1(); ++i) { + for (std::size_t j = 0; j < a.size2(); ++j) { + const double av = a(i, j); + const double bv = b(i, j); + // -inf is a valid log-space value; require an exact match in that + // case so the kernels do not silently disagree on which transitions + // 
are infeasible. + if (std::isinf(av) || std::isinf(bv)) { + EXPECT_EQ(av, bv) << "log-zero mismatch at (" << i << "," << j << ")"; + continue; + } + expectClose(av, bv); + } + } +} + +std::unique_ptr makeDiscreteCasinoHmm(std::size_t numStates) { + auto hmm = std::make_unique(static_cast(numStates)); + + Matrix trans(numStates, numStates); + for (std::size_t i = 0; i < numStates; ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < numStates; ++j) { + const double w = 0.1 + 0.5 * static_cast((i + j + 1) % 7); + trans(i, j) = w; + rowSum += w; + } + for (std::size_t j = 0; j < numStates; ++j) { + trans(i, j) /= rowSum; + } + } + hmm->setTrans(trans); + + Vector pi(numStates); + for (std::size_t i = 0; i < numStates; ++i) { + pi(i) = 1.0 / static_cast(numStates); + } + hmm->setPi(pi); + + constexpr std::size_t kAlphabet = 6; + for (std::size_t i = 0; i < numStates; ++i) { + auto dist = std::make_unique(kAlphabet); + std::array weights{}; + double sum = 0.0; + for (std::size_t s = 0; s < kAlphabet; ++s) { + const double w = 0.05 + 0.2 * static_cast((i * 11 + s * 3 + 1) % 5); + weights[s] = w; + sum += w; + } + for (std::size_t s = 0; s < kAlphabet; ++s) { + dist->setProbability(static_cast(s), weights[s] / sum); + } + hmm->setDistribution(i, std::move(dist)); + } + return hmm; +} + +ObservationSet makeDeterministicObs(std::size_t length, std::size_t alphabet) { + ObservationSet obs(length); + for (std::size_t t = 0; t < length; ++t) { + obs(t) = static_cast((t * 7 + 3) % alphabet); + } + return obs; +} + +std::unique_ptr makeContinuousGaussianHmm(std::size_t numStates) { + auto hmm = std::make_unique(static_cast(numStates)); + + Matrix trans(numStates, numStates); + for (std::size_t i = 0; i < numStates; ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < numStates; ++j) { + const double w = + 0.1 + 0.4 * std::sin(0.7 * static_cast(i) + 1.3 * static_cast(j)); + const double clamped = std::max(w, 0.05); + trans(i, j) = clamped; + rowSum += clamped; + } + 
for (std::size_t j = 0; j < numStates; ++j) { + trans(i, j) /= rowSum; + } + } + hmm->setTrans(trans); + + Vector pi(numStates); + for (std::size_t i = 0; i < numStates; ++i) { + pi(i) = 1.0 / static_cast(numStates); + } + hmm->setPi(pi); + + for (std::size_t i = 0; i < numStates; ++i) { + const double mean = 2.0 * static_cast(i); + const double sigma = 1.0; + hmm->setDistribution(i, std::make_unique(mean, sigma)); + } + return hmm; +} + +ObservationSet makeContinuousObs(std::size_t length, std::size_t numStates) { + ObservationSet obs(length); + for (std::size_t t = 0; t < length; ++t) { + obs(t) = std::sin(0.1 * static_cast(t)) * static_cast(numStates); + } + return obs; +} + +void runParityCheck(const Hmm &hmm, const ObservationSet &obs) { + ForwardBackwardCalculator pair(hmm, obs); + pair.setRecurrenceModeOverride(FbRecurrenceMode::Pairwise); + pair.compute(); + + ForwardBackwardCalculator maxr(hmm, obs); + maxr.setRecurrenceModeOverride(FbRecurrenceMode::MaxReduce); + maxr.compute(); + + ASSERT_EQ(pair.getRecurrenceMode(), FbRecurrenceMode::Pairwise); + ASSERT_EQ(maxr.getRecurrenceMode(), FbRecurrenceMode::MaxReduce); + + expectClose(pair.getLogProbability(), maxr.getLogProbability()); + expectMatricesClose(pair.getLogForwardVariables(), maxr.getLogForwardVariables()); + expectMatricesClose(pair.getLogBackwardVariables(), maxr.getLogBackwardVariables()); +} + +} // namespace + +// --------------------------------------------------------------------------- +// Discrete coverage across N=2..8 with a fixed-length sequence +// --------------------------------------------------------------------------- + +class FbModeParityDiscreteTest : public ::testing::TestWithParam {}; + +TEST_P(FbModeParityDiscreteTest, KernelsAgreeOnDiscreteHmm) { + const std::size_t numStates = GetParam(); + auto hmm = makeDiscreteCasinoHmm(numStates); + const ObservationSet obs = makeDeterministicObs(200, 6); + runParityCheck(*hmm, obs); +} + +INSTANTIATE_TEST_SUITE_P(N2to8, 
FbModeParityDiscreteTest, + ::testing::Values(2, 3, 4, 5, 6, 7, 8)); + +// --------------------------------------------------------------------------- +// Continuous (Gaussian) coverage at the medium-N regime +// --------------------------------------------------------------------------- + +class FbModeParityContinuousTest : public ::testing::TestWithParam {}; + +TEST_P(FbModeParityContinuousTest, KernelsAgreeOnContinuousHmm) { + const std::size_t numStates = GetParam(); + auto hmm = makeContinuousGaussianHmm(numStates); + const ObservationSet obs = makeContinuousObs(500, numStates); + runParityCheck(*hmm, obs); +} + +INSTANTIATE_TEST_SUITE_P(N4_8_16, FbModeParityContinuousTest, + ::testing::Values(4, 8, 16)); + +// --------------------------------------------------------------------------- +// Override accessor sanity +// --------------------------------------------------------------------------- + +TEST(FbModeParityOverride, OverrideSurfacesViaGetter) { + auto hmm = makeDiscreteCasinoHmm(4); + const ObservationSet obs = makeDeterministicObs(50, 6); + + ForwardBackwardCalculator fbc(*hmm, obs); + EXPECT_FALSE(fbc.getRecurrenceModeOverride().has_value()); + + fbc.setRecurrenceModeOverride(FbRecurrenceMode::MaxReduce); + ASSERT_TRUE(fbc.getRecurrenceModeOverride().has_value()); + EXPECT_EQ(*fbc.getRecurrenceModeOverride(), FbRecurrenceMode::MaxReduce); + fbc.compute(); + EXPECT_EQ(fbc.getRecurrenceMode(), FbRecurrenceMode::MaxReduce); + + fbc.setRecurrenceModeOverride(std::nullopt); + EXPECT_FALSE(fbc.getRecurrenceModeOverride().has_value()); +} diff --git a/tests/performance/test_transcendental_kernels.cpp b/tests/performance/test_transcendental_kernels.cpp new file mode 100644 index 0000000..c7e3546 --- /dev/null +++ b/tests/performance/test_transcendental_kernels.cpp @@ -0,0 +1,363 @@ +// tests/performance/test_transcendental_kernels.cpp +// +// Parity tests for TranscendentalKernels: verify that each of the five +// kernel methods agrees with a std::exp-based 
scalar reference to within +// 1e-12 relative / 1e-15 absolute tolerance. +// +// Ground truth is always computed inline here using std::exp directly — NOT +// by calling the kernel's internal scalar variant — so the test is +// independent of any internal refactor. +// +// The test binary is compiled with LIBHMM_BEST_SIMD_FLAGS (see CMakeLists.txt +// Performance Primitives section), so the active SIMD path matches the production library. + +#include "libhmm/performance/transcendental_kernels.h" +#include "libhmm/math/constants.h" + +#include + +#include +#include +#include +#include + +namespace { + +using TK = libhmm::performance::detail::TranscendentalKernels; + +constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +constexpr double REL_TOL = 1e-12; +constexpr double ABS_TOL = 1e-15; + +// Sizes chosen to cover: scalar-only (1), below SSE2 width (1,3), single +// SSE2 block (2), single AVX block (4), non-multiple-of-4 (7,15,31), +// exact AVX-512 block (8), exact double-block (16,32), and large (64). +const std::vector TEST_SIZES = {1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 64}; + +// ------------------------------------------------------------------------- +// Helper: build test input vectors +// ------------------------------------------------------------------------- + +// "Normal" log-probabilities in the range (-50, 0). +static std::vector make_log_probs(std::size_t n, double offset = 0.0) { + std::vector v(n); + for (std::size_t i = 0; i < n; ++i) { + v[i] = -1.0 - static_cast(i % 20) * 2.3 + offset; + } + return v; +} + +// Mix of normal log-probs and LOG_ZERO sentinels (every 5th element). +static std::vector make_mixed(std::size_t n, double offset = 0.0) { + std::vector v = make_log_probs(n, offset); + for (std::size_t i = 4; i < n; i += 5) { + v[i] = LOG_ZERO; + } + return v; +} + +// Comparison helpers. 
+static void check_scalar(double got, double ref, const char *label) { + if (std::isinf(ref) && std::isinf(got)) + return; // both -inf is fine + const double diff = std::abs(got - ref); + if (ref != 0.0) { + EXPECT_LE(diff / std::abs(ref), REL_TOL) + << label << ": relative error too large got=" << got << " ref=" << ref; + } else { + EXPECT_LE(diff, ABS_TOL) << label << ": absolute error too large got=" << got + << " ref=" << ref; + } +} + +static void check_array(const std::vector &got, const std::vector &ref, + const char *label) { + ASSERT_EQ(got.size(), ref.size()); + for (std::size_t i = 0; i < got.size(); ++i) { + check_scalar(got[i], ref[i], label); + } +} + +// ========================================================================= +// 1. reduce_max_sum2 +// ========================================================================= + +static double ref_reduce_max_sum2(const std::vector &a, const std::vector &b) { + double m = -std::numeric_limits::infinity(); + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i]; + if (t > m) + m = t; + } + return m; +} + +TEST(TranscendentalKernels, ReduceMaxSum2_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -3.7); + double got = TK::reduce_max_sum2(a.data(), b.data(), n); + double ref = ref_reduce_max_sum2(a, b); + check_scalar(got, ref, "reduce_max_sum2/normal"); + } +} + +TEST(TranscendentalKernels, ReduceMaxSum2_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -1.5); + double got = TK::reduce_max_sum2(a.data(), b.data(), n); + double ref = ref_reduce_max_sum2(a, b); + // -inf + anything is -inf; max may be -inf if all are LOG_ZERO pairs. 
+ if (std::isinf(ref) && std::isinf(got)) { + EXPECT_EQ(std::signbit(ref), std::signbit(got)); + } else { + check_scalar(got, ref, "reduce_max_sum2/mixed"); + } + } +} + +// ========================================================================= +// 2. sum_exp_sum2_minus_max +// ========================================================================= + +static double ref_sum_exp_sum2_minus_max(const std::vector &a, const std::vector &b, + double maxVal) { + if (!std::isfinite(maxVal)) + return 0.0; + double s = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i]; + if (std::isfinite(t)) + s += std::exp(t - maxVal); + } + return s; +} + +TEST(TranscendentalKernels, SumExpSum2MinusMax_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -3.7); + double maxVal = ref_reduce_max_sum2(a, b); + double got = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, maxVal); + double ref = ref_sum_exp_sum2_minus_max(a, b, maxVal); + check_scalar(got, ref, "sum_exp_sum2_minus_max/normal"); + } +} + +TEST(TranscendentalKernels, SumExpSum2MinusMax_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -1.5); + double maxVal = ref_reduce_max_sum2(a, b); + double got = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, maxVal); + double ref = ref_sum_exp_sum2_minus_max(a, b, maxVal); + check_scalar(got, ref, "sum_exp_sum2_minus_max/mixed"); + } +} + +TEST(TranscendentalKernels, SumExpSum2MinusMax_InfiniteMax) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n); + auto b = make_log_probs(n); + double got = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, + -std::numeric_limits::infinity()); + EXPECT_EQ(got, 0.0) << "should return 0 when maxVal is -inf"; + } +} + +// ========================================================================= +// 3. 
reduce_max_sum3 +// ========================================================================= + +static double ref_reduce_max_sum3(const std::vector &a, const std::vector &b, + const std::vector &c) { + double m = -std::numeric_limits::infinity(); + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i] + c[i]; + if (t > m) + m = t; + } + return m; +} + +TEST(TranscendentalKernels, ReduceMaxSum3_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -2.1); + auto c = make_log_probs(n, -5.3); + double got = TK::reduce_max_sum3(a.data(), b.data(), c.data(), n); + double ref = ref_reduce_max_sum3(a, b, c); + check_scalar(got, ref, "reduce_max_sum3/normal"); + } +} + +TEST(TranscendentalKernels, ReduceMaxSum3_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -2.1); + auto c = make_mixed(n, -5.3); + double got = TK::reduce_max_sum3(a.data(), b.data(), c.data(), n); + double ref = ref_reduce_max_sum3(a, b, c); + if (std::isinf(ref) && std::isinf(got)) { + EXPECT_EQ(std::signbit(ref), std::signbit(got)); + } else { + check_scalar(got, ref, "reduce_max_sum3/mixed"); + } + } +} + +// ========================================================================= +// 4. 
sum_exp_sum3_minus_max +// ========================================================================= + +static double ref_sum_exp_sum3_minus_max(const std::vector &a, const std::vector &b, + const std::vector &c, double maxVal) { + if (!std::isfinite(maxVal)) + return 0.0; + double s = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i] + c[i]; + if (std::isfinite(t)) + s += std::exp(t - maxVal); + } + return s; +} + +TEST(TranscendentalKernels, SumExpSum3MinusMax_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -2.1); + auto c = make_log_probs(n, -5.3); + double maxVal = ref_reduce_max_sum3(a, b, c); + double got = TK::sum_exp_sum3_minus_max(a.data(), b.data(), c.data(), n, maxVal); + double ref = ref_sum_exp_sum3_minus_max(a, b, c, maxVal); + check_scalar(got, ref, "sum_exp_sum3_minus_max/normal"); + } +} + +TEST(TranscendentalKernels, SumExpSum3MinusMax_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -2.1); + auto c = make_mixed(n, -5.3); + double maxVal = ref_reduce_max_sum3(a, b, c); + double got = TK::sum_exp_sum3_minus_max(a.data(), b.data(), c.data(), n, maxVal); + double ref = ref_sum_exp_sum3_minus_max(a, b, c, maxVal); + check_scalar(got, ref, "sum_exp_sum3_minus_max/mixed"); + } +} + +TEST(TranscendentalKernels, SumExpSum3MinusMax_InfiniteMax) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n); + auto b = make_log_probs(n); + auto c = make_log_probs(n); + double got = TK::sum_exp_sum3_minus_max(a.data(), b.data(), c.data(), n, + -std::numeric_limits::infinity()); + EXPECT_EQ(got, 0.0) << "should return 0 when maxVal is -inf"; + } +} + +// ========================================================================= +// 5. 
accumulate_exp_sum2_bias +// ========================================================================= + +static void ref_accumulate_exp_sum2_bias(std::vector &dst, const std::vector &a, + const std::vector &b, double bias) { + for (std::size_t i = 0; i < dst.size(); ++i) { + dst[i] += std::exp(a[i] + b[i] + bias); + } +} + +TEST(TranscendentalKernels, AccumulateExpSum2Bias_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -3.7); + const double bias = -12.5; + + std::vector got_dst(n, 0.5); + std::vector ref_dst(n, 0.5); + + TK::accumulate_exp_sum2_bias(got_dst.data(), a.data(), b.data(), n, bias); + ref_accumulate_exp_sum2_bias(ref_dst, a, b, bias); + + check_array(got_dst, ref_dst, "accumulate_exp_sum2_bias/normal"); + } +} + +TEST(TranscendentalKernels, AccumulateExpSum2Bias_LogZeroInputs) { + // LOG_ZERO inputs: exp(-inf + ...) = 0; dst[i] should be unchanged. + for (std::size_t n : TEST_SIZES) { + std::vector a(n, LOG_ZERO); + std::vector b(n, 0.0); + const double bias = 0.0; + + std::vector got_dst(n, 1.0); + std::vector ref_dst(n, 1.0); + + TK::accumulate_exp_sum2_bias(got_dst.data(), a.data(), b.data(), n, bias); + ref_accumulate_exp_sum2_bias(ref_dst, a, b, bias); + + check_array(got_dst, ref_dst, "accumulate_exp_sum2_bias/log_zero"); + } +} + +TEST(TranscendentalKernels, AccumulateExpSum2Bias_SmallBias) { + // Verify behaviour near the underflow threshold. + // The SIMD kernel intentionally returns 0 for arg <= MIN_LOG_PROBABILITY + // (branch-free mask). std::exp does not underflow to 0 until ~-708.4, so + // inputs in the range (-708.4, -700] produce a discrepancy between raw + // std::exp and the SIMD. The reference must apply the same underflow + // contract as the kernel so the comparison is against the specified + // behaviour, not against an unclamped std::exp. 
+ constexpr double EXP_UNDERFLOW = libhmm::constants::probability::MIN_LOG_PROBABILITY; + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, 0.0); + const double bias = EXP_UNDERFLOW + 5.0; // -695 + + std::vector got_dst(n, 0.0); + std::vector ref_dst(n, 0.0); + + TK::accumulate_exp_sum2_bias(got_dst.data(), a.data(), b.data(), n, bias); + + // Reference: zero for arg <= EXP_UNDERFLOW, std::exp otherwise. + for (std::size_t k = 0; k < n; ++k) { + const double arg = a[k] + b[k] + bias; + if (arg > EXP_UNDERFLOW) + ref_dst[k] += std::exp(arg); + } + + check_array(got_dst, ref_dst, "accumulate_exp_sum2_bias/small_bias"); + } +} + +// ========================================================================= +// 6. Consistency: max-reduce round-trip +// reduce_max then sum_exp should reproduce log-sum-exp. +// ========================================================================= + +TEST(TranscendentalKernels, RoundTrip_LogSumExp2) { + // For finite inputs: log(sum_exp(a+b - max)) + max == log_sum_exp(a, b). + // Just check the intermediate values are consistent with each other. 
+ for (std::size_t n : TEST_SIZES) { + if (n == 0) + continue; + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -2.0); + + double maxVal = TK::reduce_max_sum2(a.data(), b.data(), n); + double scaledSum = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, maxVal); + + EXPECT_TRUE(std::isfinite(maxVal)) + << "reduce_max_sum2 should return finite max for normal inputs (n=" << n << ")"; + EXPECT_GT(scaledSum, 0.0) << "scaled sum should be positive (n=" << n << ")"; + + double logSumExp = maxVal + std::log(scaledSum); + EXPECT_TRUE(std::isfinite(logSumExp)) + << "reconstructed log-sum-exp should be finite (n=" << n << ")"; + } +} + +} // anonymous namespace diff --git a/tests/platform/test_simd_platform.cpp b/tests/platform/test_simd_platform.cpp new file mode 100644 index 0000000..457f5de --- /dev/null +++ b/tests/platform/test_simd_platform.cpp @@ -0,0 +1,169 @@ +// tests/platform/test_simd_platform.cpp +// +// Consistency checks for libhmm/platform/simd_platform.h. +// +// Two layers of verification: +// +// 1. Compile-time (#error) — ISA hierarchy invariants that can only fail if +// simd_platform.h emits a broken macro combination. A violation here is +// a build error, not a test failure. +// +// 2. Runtime (GTest) — contracts on the utility functions: +// feature_string() non-null, non-empty, agrees with active macros +// double_vector_width() power-of-two >= 1 +// float_vector_width() == 2 * double_vector_width() +// optimal_alignment() power-of-two >= 8, covers one SIMD register +// has_simd_support() consistent with double_vector_width() +// supports_vectorization()consistent with has_simd_support() +// compile-time constants DOUBLE_SIMD_WIDTH / FLOAT_SIMD_WIDTH / +// SIMD_ALIGNMENT each agree with their function +// +// Not compiled with LIBHMM_BEST_SIMD_FLAGS — tests the detection +// infrastructure, not the intrinsics. 
+ +#include +#include "libhmm/platform/simd_platform.h" + +#include + +// ============================================================================ +// Compile-time ISA hierarchy invariants +// A #error here means simd_platform.h has emitted a broken macro combination. +// ============================================================================ + +#if defined(LIBHMM_HAS_AVX512) && !defined(LIBHMM_HAS_AVX) +#error "LIBHMM_HAS_AVX512 requires LIBHMM_HAS_AVX" +#endif +#if defined(LIBHMM_HAS_AVX512) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_AVX512 requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_AVX2) && !defined(LIBHMM_HAS_AVX) +#error "LIBHMM_HAS_AVX2 requires LIBHMM_HAS_AVX" +#endif +#if defined(LIBHMM_HAS_AVX2) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_AVX2 requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_AVX) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_AVX requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_SSE4_1) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_SSE4_1 requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_NEON) && defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_NEON and x86 SIMD macros are mutually exclusive" +#endif + +// ============================================================================ +// Helpers +// ============================================================================ + +using namespace libhmm::performance::simd; + +namespace { + +constexpr bool is_power_of_two(std::size_t n) noexcept { + return n >= 1 && (n & (n - 1)) == 0; +} + +} // namespace + +// ============================================================================ +// feature_string +// ============================================================================ + +TEST(SimdPlatformFeatureString, NonNull) { + EXPECT_NE(feature_string(), nullptr); +} + +TEST(SimdPlatformFeatureString, NonEmpty) { + EXPECT_GT(std::strlen(feature_string()), 0u); +} + +// The reported string must match the highest active ISA macro. 
+TEST(SimdPlatformFeatureString, ConsistentWithMacros) { +#if defined(LIBHMM_HAS_AVX512) + EXPECT_STREQ(feature_string(), "AVX-512"); +#elif defined(LIBHMM_HAS_AVX2) + EXPECT_STREQ(feature_string(), "AVX2"); +#elif defined(LIBHMM_HAS_AVX) + EXPECT_STREQ(feature_string(), "AVX"); +#elif defined(LIBHMM_HAS_SSE4_1) + EXPECT_STREQ(feature_string(), "SSE4.1"); +#elif defined(LIBHMM_HAS_SSE2) + EXPECT_STREQ(feature_string(), "SSE2"); +#elif defined(LIBHMM_HAS_NEON) + // Accepts both "ARM NEON" and "ARM NEON (Apple Silicon)". + EXPECT_EQ(std::strncmp(feature_string(), "ARM NEON", 8), 0); +#else + EXPECT_STREQ(feature_string(), "Scalar (No SIMD)"); +#endif +} + +// ============================================================================ +// double_vector_width / float_vector_width +// ============================================================================ + +TEST(SimdPlatformVectorWidth, DoubleWidthAtLeastOne) { + EXPECT_GE(double_vector_width(), 1u); +} + +TEST(SimdPlatformVectorWidth, DoubleWidthIsPowerOfTwo) { + EXPECT_TRUE(is_power_of_two(double_vector_width())); +} + +// float is 32-bit, double is 64-bit: a register holds twice as many floats. +TEST(SimdPlatformVectorWidth, FloatWidthIsTwiceDoubleWidth) { + EXPECT_EQ(float_vector_width(), 2u * double_vector_width()); +} + +// ============================================================================ +// optimal_alignment +// ============================================================================ + +TEST(SimdPlatformAlignment, AtLeastEightBytes) { + EXPECT_GE(optimal_alignment(), 8u); +} + +TEST(SimdPlatformAlignment, IsPowerOfTwo) { + EXPECT_TRUE(is_power_of_two(optimal_alignment())); +} + +// Alignment must be at least enough to hold one full SIMD register of doubles. 
+TEST(SimdPlatformAlignment, CoversOneSimdRegister) { + EXPECT_GE(optimal_alignment(), double_vector_width() * sizeof(double)); +} + +// ============================================================================ +// has_simd_support / supports_vectorization +// ============================================================================ + +TEST(SimdPlatformSupport, HasSimdConsistentWithWidth) { + if (has_simd_support()) { + EXPECT_GE(double_vector_width(), 2u); + } else { + EXPECT_EQ(double_vector_width(), 1u); + } +} + +TEST(SimdPlatformSupport, SupportsVectorizationRequiresHasSimd) { + if (supports_vectorization()) { + EXPECT_TRUE(has_simd_support()); + EXPECT_GE(double_vector_width(), 2u); + } +} + +// ============================================================================ +// Compile-time constants agree with their corresponding functions +// ============================================================================ + +TEST(SimdPlatformConstants, DoubleSimdWidthMatchesFunction) { + EXPECT_EQ(DOUBLE_SIMD_WIDTH, double_vector_width()); +} + +TEST(SimdPlatformConstants, FloatSimdWidthMatchesFunction) { + EXPECT_EQ(FLOAT_SIMD_WIDTH, float_vector_width()); +} + +TEST(SimdPlatformConstants, SimdAlignmentMatchesFunction) { + EXPECT_EQ(SIMD_ALIGNMENT, optimal_alignment()); +} diff --git a/tests/training/test_bw_parity.cpp b/tests/training/test_bw_parity.cpp new file mode 100644 index 0000000..9bc390c --- /dev/null +++ b/tests/training/test_bw_parity.cpp @@ -0,0 +1,232 @@ +#include + +#include "libhmm/calculators/forward_backward_calculator.h" +#include "libhmm/distributions/discrete_distribution.h" +#include "libhmm/distributions/gaussian_distribution.h" +#include "libhmm/training/baum_welch_trainer.h" + +#include +#include +#include +#include + +using namespace libhmm; + +namespace { + +constexpr double kBitExactTol = 0.0; +constexpr double kRelTol = 1e-12; +constexpr double kAbsTol = 1e-14; + +void expectClose(double a, double b, double absTol = kAbsTol, 
double relTol = kRelTol) { + if (std::isnan(a) || std::isnan(b)) { + FAIL() << "Unexpected NaN: a=" << a << " b=" << b; + } + if (a == b) { + return; + } + const double diff = std::abs(a - b); + if (diff <= absTol) { + return; + } + const double largest = std::max(std::abs(a), std::abs(b)); + EXPECT_LE(diff, relTol * largest) + << "values differ beyond tolerance: a=" << a << " b=" << b << " diff=" << diff; +} + +void expectMatricesEqual(const Matrix &a, const Matrix &b, double absTol) { + ASSERT_EQ(a.size1(), b.size1()); + ASSERT_EQ(a.size2(), b.size2()); + for (std::size_t i = 0; i < a.size1(); ++i) { + for (std::size_t j = 0; j < a.size2(); ++j) { + if (absTol == kBitExactTol) { + EXPECT_EQ(a(i, j), b(i, j)) << "mismatch at (" << i << "," << j << ")"; + } else { + expectClose(a(i, j), b(i, j), absTol); + } + } + } +} + +void expectVectorsEqual(const Vector &a, const Vector &b, double absTol) { + ASSERT_EQ(a.size(), b.size()); + for (std::size_t i = 0; i < a.size(); ++i) { + if (absTol == kBitExactTol) { + EXPECT_EQ(a(i), b(i)) << "mismatch at (" << i << ")"; + } else { + expectClose(a(i), b(i), absTol); + } + } +} + +std::unique_ptr makeDiscreteCasino(std::size_t numStates, std::size_t alphabet) { + auto hmm = std::make_unique(static_cast(numStates)); + + Matrix trans(numStates, numStates); + for (std::size_t i = 0; i < numStates; ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < numStates; ++j) { + const double w = 0.1 + 0.4 * static_cast((i + j + 1) % 5); + trans(i, j) = w; + rowSum += w; + } + for (std::size_t j = 0; j < numStates; ++j) { + trans(i, j) /= rowSum; + } + } + hmm->setTrans(trans); + + Vector pi(numStates); + for (std::size_t i = 0; i < numStates; ++i) { + pi(i) = 1.0 / static_cast(numStates); + } + hmm->setPi(pi); + + for (std::size_t i = 0; i < numStates; ++i) { + auto dist = std::make_unique(static_cast(alphabet)); + std::vector weights(alphabet); + double sum = 0.0; + for (std::size_t s = 0; s < alphabet; ++s) { + const double w = 
0.05 + 0.2 * static_cast((i * 11 + s * 3 + 1) % 5); + weights[s] = w; + sum += w; + } + for (std::size_t s = 0; s < alphabet; ++s) { + dist->setProbability(static_cast(s), weights[s] / sum); + } + hmm->setDistribution(i, std::move(dist)); + } + return hmm; +} + +ObservationLists makeDiscreteSequences() { + ObservationLists out; + constexpr std::size_t kAlphabet = 6; + constexpr std::array kLengths{50, 75, 30, 100}; + for (std::size_t s = 0; s < kLengths.size(); ++s) { + ObservationSet seq(kLengths[s]); + for (std::size_t t = 0; t < kLengths[s]; ++t) { + seq(t) = static_cast((t * 7 + s * 13 + 3) % kAlphabet); + } + out.push_back(seq); + } + return out; +} + +double scoreSequencesUnderModel(const Hmm &hmm, const ObservationLists &seqs) { + double total = 0.0; + for (const auto &seq : seqs) { + if (seq.size() == 0) { + continue; + } + ForwardBackwardCalculator fbc(hmm, seq); + const double lp = fbc.getLogProbability(); + if (std::isfinite(lp)) { + total += lp; + } + } + return total; +} + +} // namespace + +// --------------------------------------------------------------------------- +// Determinism: two independent BW runs from the same starting point on the +// same input must produce bit-exact identical updated parameters. 
+// --------------------------------------------------------------------------- + +TEST(BaumWelchParity, OneStepDeterministic_DiscreteN3) { + auto hmmA = makeDiscreteCasino(3, 6); + auto hmmB = makeDiscreteCasino(3, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + BaumWelchTrainer trainerA(*hmmA, seqs); + BaumWelchTrainer trainerB(*hmmB, seqs); + trainerA.train(); + trainerB.train(); + + expectVectorsEqual(hmmA->getPi(), hmmB->getPi(), kBitExactTol); + expectMatricesEqual(hmmA->getTrans(), hmmB->getTrans(), kBitExactTol); + for (int i = 0; i < hmmA->getNumStates(); ++i) { + const auto *distA = dynamic_cast(&hmmA->getDistribution(i)); + const auto *distB = dynamic_cast(&hmmB->getDistribution(i)); + ASSERT_NE(distA, nullptr); + ASSERT_NE(distB, nullptr); + ASSERT_EQ(distA->getNumSymbols(), distB->getNumSymbols()); + for (std::size_t s = 0; s < distA->getNumSymbols(); ++s) { + EXPECT_EQ(distA->getSymbolProbability(s), distB->getSymbolProbability(s)) + << "state " << i << " symbol " << s; + } + } +} + +TEST(BaumWelchParity, OneStepDeterministic_DiscreteN5) { + auto hmmA = makeDiscreteCasino(5, 6); + auto hmmB = makeDiscreteCasino(5, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + BaumWelchTrainer trainerA(*hmmA, seqs); + BaumWelchTrainer trainerB(*hmmB, seqs); + trainerA.train(); + trainerB.train(); + + expectVectorsEqual(hmmA->getPi(), hmmB->getPi(), kBitExactTol); + expectMatricesEqual(hmmA->getTrans(), hmmB->getTrans(), kBitExactTol); +} + +// --------------------------------------------------------------------------- +// EM monotonicity: a single train() step on the supplied sequences must not +// reduce the total observation log-probability under the model. 
+// --------------------------------------------------------------------------- + +TEST(BaumWelchParity, OneStepMonotonic_Discrete) { + auto hmm = makeDiscreteCasino(3, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + const double scoreBefore = scoreSequencesUnderModel(*hmm, seqs); + BaumWelchTrainer trainer(*hmm, seqs); + trainer.train(); + const double scoreAfter = scoreSequencesUnderModel(*hmm, seqs); + + EXPECT_TRUE(std::isfinite(scoreBefore)); + EXPECT_TRUE(std::isfinite(scoreAfter)); + // Allow a small tolerance for floating-point noise around stationary points. + EXPECT_GE(scoreAfter, scoreBefore - 1e-9) + << "BW step should not decrease log-likelihood: before=" << scoreBefore + << " after=" << scoreAfter; +} + +// --------------------------------------------------------------------------- +// Invariants: post-step pi sums to 1, transition rows sum to 1, no NaN/inf. +// --------------------------------------------------------------------------- + +TEST(BaumWelchParity, OneStepInvariants_Discrete) { + auto hmm = makeDiscreteCasino(4, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + BaumWelchTrainer trainer(*hmm, seqs); + trainer.train(); + + const Vector &pi = hmm->getPi(); + double piSum = 0.0; + for (std::size_t i = 0; i < pi.size(); ++i) { + EXPECT_TRUE(std::isfinite(pi(i))); + EXPECT_GE(pi(i), 0.0); + EXPECT_LE(pi(i), 1.0); + piSum += pi(i); + } + EXPECT_NEAR(piSum, 1.0, 1e-12); + + const Matrix &trans = hmm->getTrans(); + for (std::size_t i = 0; i < trans.size1(); ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < trans.size2(); ++j) { + const double v = trans(i, j); + EXPECT_TRUE(std::isfinite(v)); + EXPECT_GE(v, 0.0); + EXPECT_LE(v, 1.0); + rowSum += v; + } + EXPECT_NEAR(rowSum, 1.0, 1e-12); + } +} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 041773c..57d0692 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -36,6 +36,18 @@ add_hmm_tool(debug_parallel debug_parallel.cpp 
${LIBHMM_TOOL_THREADPOOL}) add_hmm_tool(simd_inspection simd_inspection.cpp) add_hmm_tool(batch_performance batch_performance.cpp) add_hmm_tool(hmm_validator hmm_validator.cpp) +add_hmm_tool(hotspot_breakdown hotspot_breakdown.cpp) +add_hmm_tool(fb_contour_sweep fb_contour_sweep.cpp) +add_hmm_tool(bw_hotspot bw_hotspot.cpp) +add_hmm_tool(fb_crossover_sweep fb_crossover_sweep.cpp) +if(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + target_compile_definitions(hotspot_breakdown PRIVATE LIBHMM_EXPERIMENT_FB_MAX_REDUCE=1) + target_compile_definitions(fb_contour_sweep PRIVATE LIBHMM_EXPERIMENT_FB_MAX_REDUCE=1) +endif() +if(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + target_compile_definitions(hotspot_breakdown PRIVATE LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR=1) + target_compile_definitions(fb_contour_sweep PRIVATE LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR=1) +endif() # simd_inspection must be compiled with the same SIMD flags as the distribution # TUs so that LIBHMM_HAS_AVX512 / AVX2 / NEON are correctly defined and the @@ -51,8 +63,11 @@ install(TARGETS simd_inspection batch_performance hmm_validator + hotspot_breakdown + fb_contour_sweep + bw_hotspot RUNTIME DESTINATION bin/tools COMPONENT tools ) -message(STATUS "Tools: analyze_overhead debug_parallel simd_inspection batch_performance hmm_validator") +message(STATUS "Tools: analyze_overhead debug_parallel simd_inspection batch_performance hmm_validator hotspot_breakdown fb_contour_sweep bw_hotspot") diff --git a/tools/bw_hotspot.cpp b/tools/bw_hotspot.cpp new file mode 100644 index 0000000..7109b2a --- /dev/null +++ b/tools/bw_hotspot.cpp @@ -0,0 +1,322 @@ +/** + * @file bw_hotspot.cpp + * @brief Baum-Welch inner-loop cost breakdown. + * + * Profiles the three separable cost centres of one BW E-step: + * 1. FB computation (delegated to ForwardBackwardCalculator) + * 2. Gamma accumulation — N*T exp() calls + * 3. 
Xi accumulation — N^2*(T-1) exp() calls (dominant for N>1)
+ *
+ * Implemented inline here (not through BaumWelchTrainer) so each phase
+ * can be timed independently without modifying the library.
+ *
+ * Usage:
+ *   bw_hotspot                      (default configs)
+ *   bw_hotspot N T [runs [warmup]]  (single config)
+ */
+
+#include "libhmm/calculators/forward_backward_calculator.h"
+#include "libhmm/hmm.h"
+#include "libhmm/distributions/discrete_distribution.h"
+#include "libhmm/distributions/gaussian_distribution.h"
+#include "libhmm/performance/transcendental_kernels.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using namespace libhmm;
+using Clock = std::chrono::high_resolution_clock;
+using Millis = std::chrono::duration<double, std::milli>;
+
+namespace {
+
+constexpr double LOG_ZERO = -std::numeric_limits<double>::infinity();
+
+// Prevent dead-code elimination on accumulated values.
+volatile double g_sink = 0.0;
+
+// ---------------------------------------------------------------------------
+
+double elapsed_ms(const Clock::time_point start) {
+    return Millis(Clock::now() - start).count();
+}
+
+template <typename T>
+double median(std::vector<T> v) {
+    if (v.empty())
+        return 0.0;
+    std::sort(v.begin(), v.end());
+    return static_cast<double>(v[v.size() / 2]);
+}
+
+// ---------------------------------------------------------------------------
+
+std::unique_ptr<Hmm> make_hmm(int n) {
+    auto hmm = std::make_unique<Hmm>(n);
+    Matrix trans(n, n);
+    for (int i = 0; i < n; ++i) {
+        double sum = 0.0;
+        for (int j = 0; j < n; ++j) {
+            trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3));
+            sum += trans(i, j);
+        }
+        for (int j = 0; j < n; ++j)
+            trans(i, j) /= sum;
+    }
+    hmm->setTrans(trans);
+
+    Vector pi(n);
+    for (int i = 0; i < n; ++i)
+        pi(i) = 1.0 / static_cast<double>(n);
+    hmm->setPi(pi);
+
+    for (int i = 0; i < n; ++i)
+        hmm->setDistribution(i, std::make_unique<GaussianDistribution>(i * 2.0, 1.0));
+    return hmm;
+}
+
+ObservationSet make_obs(int t, int n) {
+    ObservationSet obs(t);
+
for (int i = 0; i < t; ++i) + obs(i) = std::sin(i * 0.1) * static_cast(n); + return obs; +} + +// --------------------------------------------------------------------------- +// One E-step with independent phase timers. +// --------------------------------------------------------------------------- + +struct BwBreakdown { + double fb_ms = 0.0; // ForwardBackwardCalculator (construct + compute) + double gamma_ms = 0.0; // gamma accumulation: N*T exp() calls + double xi_ms = 0.0; // xi accumulation: N^2*(T-1) exp() calls + std::uint64_t gamma_exp_calls = 0; + std::uint64_t xi_exp_calls = 0; +}; + +BwBreakdown profile_bw(const Hmm &hmm, const ObservationSet &obs, int warmup, int runs) { + const std::size_t N = static_cast(hmm.getNumStates()); + const std::size_t T = obs.size(); + + // Precompute flat log-transition (row-major N×N) once — same as trainer would do. + std::vector logTrans(N * N); + bool hasZeroTransitions = false; + { + const Matrix &t = hmm.getTrans(); + for (std::size_t i = 0; i < N; ++i) + for (std::size_t j = 0; j < N; ++j) { + const double a = t(i, j); + if (a > 0.0) { + logTrans[i * N + j] = std::log(a); + } else { + logTrans[i * N + j] = LOG_ZERO; + hasZeroTransitions = true; + } + } + } + + // Log-emission: time-major logEmitByTime[t*N+j] = log b_j(O_t). + std::vector logEmitByTime(T * N); + { + std::vector stateMajor(N * T); + const std::span obsSpan(obs.data(), T); + for (std::size_t i = 0; i < N; ++i) + hmm.getDistribution(i).getBatchLogProbabilities( + obsSpan, std::span(stateMajor.data() + i * T, T)); + for (std::size_t i = 0; i < N; ++i) + for (std::size_t t2 = 0; t2 < T; ++t2) + logEmitByTime[t2 * N + i] = stateMajor[i * T + t2]; + } + + std::vector fb_ms_v, gamma_ms_v, xi_ms_v; + fb_ms_v.reserve(static_cast(runs)); + gamma_ms_v.reserve(static_cast(runs)); + xi_ms_v.reserve(static_cast(runs)); + + // Accumulators (reset per run to prevent dead-code elim). 
+ std::vector piNum(N); + std::vector transDen(N); + std::vector transNum(N * N); + std::vector emisWts(N * T); + + for (int iter = 0; iter < warmup + runs; ++iter) { + // Phase 1: FB + auto t0 = Clock::now(); + ForwardBackwardCalculator fbc(hmm, obs); + const double logP = fbc.getLogProbability(); + const double fb_time = elapsed_ms(t0); + + if (!std::isfinite(logP)) + continue; + + const Matrix &logAlpha = fbc.getLogForwardVariables(); + const Matrix &logBeta = fbc.getLogBackwardVariables(); + + // Phase 2: gamma accumulation (N*T exp() calls) + std::fill(piNum.begin(), piNum.end(), 0.0); + std::fill(transDen.begin(), transDen.end(), 0.0); + + t0 = Clock::now(); + for (std::size_t t2 = 0; t2 < T; ++t2) { + for (std::size_t i = 0; i < N; ++i) { + const double g = std::exp(logAlpha(t2, i) + logBeta(t2, i) - logP); + emisWts[t2 * N + i] = g; + if (t2 == 0) + piNum[i] += g; + if (t2 < T - 1) + transDen[i] += g; + } + } + const double gamma_time = elapsed_ms(t0); + + // Phase 3: xi accumulation (N^2*(T-1) exp() calls) + std::fill(transNum.begin(), transNum.end(), 0.0); + + t0 = Clock::now(); + if (hasZeroTransitions) { + for (std::size_t t2 = 0; t2 + 1 < T; ++t2) { + const double *emitNext = logEmitByTime.data() + (t2 + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double logAlphaI = logAlpha(t2, i); + const double *logTransRow = logTrans.data() + i * N; + for (std::size_t j = 0; j < N; ++j) { + if (logTransRow[j] == LOG_ZERO) { + continue; + } + const double logXi = + logAlphaI + logTransRow[j] + emitNext[j] + logBeta(t2 + 1, j) - logP; + transNum[i * N + j] += std::exp(logXi); + } + } + } + } else { + for (std::size_t t2 = 0; t2 + 1 < T; ++t2) { + const double *emitNext = logEmitByTime.data() + (t2 + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double logAlphaI = logAlpha(t2, i); + const double *logTransRow = logTrans.data() + i * N; + const double bias = -logP; + // The hotspot tool keeps the same dense-xi shape as the trainer: + // 
exp(alpha[i] + trans[i,j] + (emitNext[j] + betaNext[j] - logP)). + // Since this tool stores row-major transNum, keep the scalar loop + // here rather than inventing a second helper shape prematurely. + for (std::size_t j = 0; j < N; ++j) { + const double logXi = + logAlphaI + logTransRow[j] + emitNext[j] + logBeta(t2 + 1, j) + bias; + transNum[i * N + j] += std::exp(logXi); + } + } + } + } + const double xi_time = elapsed_ms(t0); + + // Sink to prevent elision. + g_sink += piNum[0] + transDen[0] + transNum[0] + emisWts[0]; + + if (iter >= warmup) { + fb_ms_v.push_back(fb_time); + gamma_ms_v.push_back(gamma_time); + xi_ms_v.push_back(xi_time); + } + } + + BwBreakdown r; + r.fb_ms = median(fb_ms_v); + r.gamma_ms = median(gamma_ms_v); + r.xi_ms = median(xi_ms_v); + r.gamma_exp_calls = static_cast(N) * T; + r.xi_exp_calls = static_cast(N) * N * (T > 0 ? T - 1 : 0); + return r; +} + +int parse_pos(const char *v, const char *name) { + try { + const int x = std::stoi(v); + if (x <= 0) + throw std::invalid_argument("non-positive"); + return x; + } catch (...) 
{ + throw std::invalid_argument(std::string("Invalid ") + name + ": " + v); + } +} + +} // namespace + +int main(int argc, char *argv[]) { + struct Config { + int n; + int t; + }; + std::vector configs = {{4, 500}, {8, 1000}, {16, 500}, {32, 2000}}; + int warmup = 2, runs = 8; + + if (argc == 3 || argc == 4 || argc == 5) { + configs = {{parse_pos(argv[1], "N"), parse_pos(argv[2], "T")}}; + if (argc >= 4) + runs = parse_pos(argv[3], "runs"); + if (argc == 5) + warmup = parse_pos(argv[4], "warmup"); + } else if (argc != 1) { + std::cerr << "Usage: bw_hotspot [N T [runs [warmup]]]\n"; + return 1; + } + + std::cout << "libhmm BW Hotspot Breakdown (median of " << runs << " runs, " << warmup + << " warmup)\n"; + std::cout << std::string(66, '=') << "\n\n"; + std::cout << std::fixed << std::setprecision(3); + + for (const auto &cfg : configs) { + auto hmm = make_hmm(cfg.n); + auto obs = make_obs(cfg.t, cfg.n); + const auto bw = profile_bw(*hmm, obs, warmup, runs); + + const double total = bw.fb_ms + bw.gamma_ms + bw.xi_ms; + auto pct = [&](double v) { + return (total > 0.0) ? 100.0 * v / total : 0.0; + }; + + std::cout << "N=" << cfg.n << " T=" << cfg.t << "\n"; + std::cout << " exp() call volume: gamma=" << static_cast(bw.gamma_exp_calls) / 1e3 + << "K" + << " xi=" << static_cast(bw.xi_exp_calls) / 1e6 << "M" + << " ratio xi/gamma=" + << (bw.gamma_exp_calls > 0 ? 
static_cast(bw.xi_exp_calls) / + static_cast(bw.gamma_exp_calls) + : 0.0) + << "x\n"; + + auto row = [&](const char *label, double ms, std::uint64_t calls) { + std::cout << " " << std::left << std::setw(24) << label << std::right << std::setw(8) + << ms << " ms" + << " " << std::setw(6) << std::setprecision(1) << pct(ms) << "%"; + if (calls > 0) { + const double ns_per = (ms * 1e6) / static_cast(calls); + std::cout << " " << std::setprecision(1) << ns_per << " ns/exp()"; + } + std::cout << "\n"; + std::cout << std::setprecision(3); + }; + + row("FB (fwd+bwd)", bw.fb_ms, 0); + row("Gamma accum", bw.gamma_ms, bw.gamma_exp_calls); + row("Xi accum", bw.xi_ms, bw.xi_exp_calls); + std::cout << " " << std::left << std::setw(24) << "TOTAL (1 BW iter)" << std::right + << std::setw(8) << total << " ms\n"; + std::cout << "\n"; + } + + if (g_sink == 1.23456789) + std::cout << "sink=" << g_sink << "\n"; + return 0; +} diff --git a/tools/fb_contour_sweep.cpp b/tools/fb_contour_sweep.cpp new file mode 100644 index 0000000..5b10626 --- /dev/null +++ b/tools/fb_contour_sweep.cpp @@ -0,0 +1,415 @@ +#include "libhmm/hmm.h" +#include "libhmm/distributions/gaussian_distribution.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace libhmm; +using Clock = std::chrono::high_resolution_clock; +using Millis = std::chrono::duration; +namespace fs = std::filesystem; + +namespace { + +constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +constexpr std::size_t FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES = 2; +volatile double g_sink_double = 0.0; + +struct Config { + int n; + int t; +}; + +struct Timings { + double transition_ms = 0.0; + double obs_copy_ms = 0.0; + double emission_ms = 0.0; + double alloc_ms = 0.0; + double forward_ms = 0.0; + double backward_ms = 0.0; + double reduction_ms = 0.0; + double total_ms = 0.0; +}; + +double elapsed_ms(const Clock::time_point start) { + 
return Millis(Clock::now() - start).count(); +} + +bool should_use_max_reduce(const std::size_t n, const std::size_t t) noexcept { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + (void)n; + (void)t; + return true; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + (void)t; + return n > FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES; +#else + (void)n; + (void)t; + return false; +#endif +} + +double log_sum_exp_pairwise(const double a, const double b) noexcept { + if (a == LOG_ZERO) { + return b; + } + if (b == LOG_ZERO) { + return a; + } + if (a > b) { + return a + std::log1p(std::exp(b - a)); + } + return b + std::log1p(std::exp(a - b)); +} + +template +double median(std::vector values) { + if (values.empty()) { + return 0.0; + } + std::sort(values.begin(), values.end()); + return static_cast(values[values.size() / 2]); +} + +std::unique_ptr make_hmm(const int n) { + auto hmm = std::make_unique(n); + Matrix trans(n, n); + for (int i = 0; i < n; ++i) { + double sum = 0.0; + for (int j = 0; j < n; ++j) { + trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3)); + sum += trans(i, j); + } + for (int j = 0; j < n; ++j) { + trans(i, j) /= sum; + } + } + hmm->setTrans(trans); + + Vector pi(n); + for (int i = 0; i < n; ++i) { + pi(i) = 1.0 / static_cast(n); + } + hmm->setPi(pi); + + for (int i = 0; i < n; ++i) { + hmm->setDistribution(i, std::make_unique(i * 2.0, 1.0)); + } + return hmm; +} + +ObservationSet make_obs(const int t, const int n) { + ObservationSet obs(t); + for (int i = 0; i < t; ++i) { + obs(i) = std::sin(i * 0.1) * static_cast(n); + } + return obs; +} + +Timings run_once(const Hmm &hmm, const ObservationSet &obs) { + Timings out; + const std::size_t n = static_cast(hmm.getNumStates()); + const std::size_t t = obs.size(); + + auto total_start = Clock::now(); + + auto stage_start = Clock::now(); + Matrix log_trans(n, n); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < n; ++j) { + const double a = hmm.getTrans()(i, j); + 
log_trans(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + } + } + out.transition_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector obs_copy(t); + for (std::size_t i = 0; i < t; ++i) { + obs_copy[i] = obs(i); + } + const std::span obs_span(obs_copy.data(), t); + out.obs_copy_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_buf(n * t); + for (std::size_t i = 0; i < n; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obs_span, std::span(log_emit_buf.data() + i * t, t)); + } + out.emission_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + Matrix log_alpha(t, n); + Matrix log_beta(t, n); + out.alloc_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + const double pi = hmm.getPi()(i); + const double log_pi = (pi > 0.0) ? std::log(pi) : LOG_ZERO; + log_alpha(0, i) = log_pi + log_emit_buf[i * t]; + } + const bool use_max_reduce = should_use_max_reduce(n, t); + if (use_max_reduce) { + for (std::size_t ti = 1; ti < t; ++ti) { + for (std::size_t j = 0; j < n; ++j) { + double max_term = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (term > max_term) { + max_term = term; + } + } + double log_sum = LOG_ZERO; + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + log_alpha(ti, j) = log_emit_buf[j * t + ti] + log_sum; + } + } + } else { + for (std::size_t ti = 1; ti < t; ++ti) { + for (std::size_t j = 0; j < n; ++j) { + double log_sum = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + log_sum = log_sum_exp_pairwise(log_sum, log_alpha(ti - 1, i) + log_trans(i, j)); + } + log_alpha(ti, j) = log_emit_buf[j * t + 
ti] + log_sum; + } + } + } + out.forward_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + log_beta(t - 1, i) = 0.0; + } + if (t > 1) { + if (use_max_reduce) { + for (std::size_t ti = t - 2;; --ti) { + for (std::size_t i = 0; i < n; ++i) { + double max_term = LOG_ZERO; + for (std::size_t j = 0; j < n; ++j) { + const double term = + log_trans(i, j) + log_emit_buf[j * t + (ti + 1)] + log_beta(ti + 1, j); + if (term > max_term) { + max_term = term; + } + } + double log_sum = LOG_ZERO; + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t j = 0; j < n; ++j) { + const double term = log_trans(i, j) + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + log_beta(ti, i) = log_sum; + } + if (ti == 0) { + break; + } + } + } else { + for (std::size_t ti = t - 2;; --ti) { + for (std::size_t i = 0; i < n; ++i) { + double log_sum = LOG_ZERO; + for (std::size_t j = 0; j < n; ++j) { + log_sum = log_sum_exp_pairwise(log_sum, log_trans(i, j) + + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j)); + } + log_beta(ti, i) = log_sum; + } + if (ti == 0) { + break; + } + } + } + } + out.backward_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + double log_probability = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + log_probability = log_sum_exp_pairwise(log_probability, log_alpha(t - 1, i)); + } + out.reduction_ms = elapsed_ms(stage_start); + g_sink_double += log_probability; + + out.total_ms = elapsed_ms(total_start); + return out; +} + +Timings profile_config(const Hmm &hmm, const ObservationSet &obs, const int runs, + const int warmup) { + std::vector transition_ms; + std::vector obs_copy_ms; + std::vector emission_ms; + std::vector alloc_ms; + std::vector forward_ms; + std::vector backward_ms; + std::vector reduction_ms; + 
std::vector total_ms; + + transition_ms.reserve(static_cast(runs)); + obs_copy_ms.reserve(static_cast(runs)); + emission_ms.reserve(static_cast(runs)); + alloc_ms.reserve(static_cast(runs)); + forward_ms.reserve(static_cast(runs)); + backward_ms.reserve(static_cast(runs)); + reduction_ms.reserve(static_cast(runs)); + total_ms.reserve(static_cast(runs)); + + for (int iter = 0; iter < warmup + runs; ++iter) { + const Timings t = run_once(hmm, obs); + if (iter >= warmup) { + transition_ms.push_back(t.transition_ms); + obs_copy_ms.push_back(t.obs_copy_ms); + emission_ms.push_back(t.emission_ms); + alloc_ms.push_back(t.alloc_ms); + forward_ms.push_back(t.forward_ms); + backward_ms.push_back(t.backward_ms); + reduction_ms.push_back(t.reduction_ms); + total_ms.push_back(t.total_ms); + } + } + + return { + median(transition_ms), median(obs_copy_ms), median(emission_ms), median(alloc_ms), + median(forward_ms), median(backward_ms), median(reduction_ms), median(total_ms), + }; +} + +int parse_positive_int(const char *value, const char *name) { + try { + const int parsed = std::stoi(value); + if (parsed <= 0) { + throw std::invalid_argument("non-positive"); + } + return parsed; + } catch (...) 
{ + throw std::invalid_argument(std::string("Invalid ") + name + ": " + value); + } +} + +std::string mode_name() { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + return "max_reduce"; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + return "adaptive_static_v1"; +#else + return "pairwise"; +#endif +} + +} // namespace + +int main(int argc, char *argv[]) { + int runs = 5; + int warmup = 1; + + fs::path output_path = + fs::path("benchmark-analysis") / ("fb_contour_sweep_" + mode_name() + ".csv"); + + if (argc >= 2) { + output_path = argv[1]; + } + if (argc >= 3) { + runs = parse_positive_int(argv[2], "runs"); + } + if (argc >= 4) { + warmup = parse_positive_int(argv[3], "warmup"); + } + if (argc > 4) { + std::cerr << "Usage:\n"; + std::cerr << " fb_contour_sweep [output_csv] [runs] [warmup]\n"; + return 1; + } + + const std::vector configs = { + {2, 1000}, {2, 10000}, {2, 100000}, {2, 1000000}, {4, 1000}, {4, 10000}, + {4, 100000}, {8, 1000}, {8, 5000}, {8, 10000}, {16, 1000}, {16, 2000}, + {16, 5000}, {32, 500}, {32, 1000}, {32, 2000}, {64, 200}, {64, 500}, + {64, 1000}, {128, 100}, {128, 250}, {128, 500}, + }; + + const fs::path output_dir = output_path.parent_path(); + if (!output_dir.empty()) { + fs::create_directories(output_dir); + } + std::ofstream csv(output_path); + if (!csv) { + std::cerr << "Failed to open output file: " << output_path << "\n"; + return 1; + } + + csv << "mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms," + "emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms\n"; + + std::cout << "libhmm FB contour sweep\n"; + std::cout << "Mode: " << mode_name() << "\n"; + std::cout << "Runs: " << runs << " (warmup " << warmup << ")\n"; + std::cout << "Output: " << output_path << "\n\n"; + std::cout << std::fixed << std::setprecision(3); + + for (const auto &cfg : configs) { + auto hmm = make_hmm(cfg.n); + auto obs = make_obs(cfg.t, cfg.n); + const Timings timed = profile_config(*hmm, obs, runs, warmup); + 
+ const std::uint64_t recurrence_work = + static_cast(cfg.n) * cfg.n * static_cast(cfg.t - 1); + const std::uint64_t emission_work = + static_cast(cfg.n) * static_cast(cfg.t); + + csv << mode_name() << "," << cfg.n << "," << cfg.t << "," << runs << "," << warmup << "," + << recurrence_work << "," << emission_work << "," << timed.transition_ms << "," + << timed.obs_copy_ms << "," << timed.emission_ms << "," << timed.alloc_ms << "," + << timed.forward_ms << "," << timed.backward_ms << "," << timed.reduction_ms << "," + << timed.total_ms << "\n"; + + const double recurrence_pct = + (timed.total_ms > 0.0) + ? ((timed.forward_ms + timed.backward_ms) * 100.0 / timed.total_ms) + : 0.0; + std::cout << "N=" << std::setw(3) << cfg.n << " T=" << std::setw(8) << cfg.t + << " total=" << std::setw(9) << timed.total_ms << " ms" + << " recur=" << std::setw(6) << recurrence_pct << "%\n"; + } + + csv.close(); + if (g_sink_double == 42.0) { + std::cout << "sink=" << g_sink_double << "\n"; + } + std::cout << "\nDone.\n"; + return 0; +} diff --git a/tools/fb_crossover_sweep.cpp b/tools/fb_crossover_sweep.cpp new file mode 100644 index 0000000..10d6e14 --- /dev/null +++ b/tools/fb_crossover_sweep.cpp @@ -0,0 +1,119 @@ +// tools/fb_crossover_sweep.cpp +// +// Measures ForwardBackwardCalculator runtime for Pairwise vs MaxReduce modes +// at a range of N values using the production calculator (which has SIMD +// transcendental kernels active in the MaxReduce path). +// +// Output: tab-separated table of N, pairwise_ms, maxreduce_ms, ratio. 
+
+#include "libhmm/performance/fb_recurrence_policy.h"
+#include "libhmm/calculators/forward_backward_calculator.h"
+#include "libhmm/distributions/gaussian_distribution.h"
+#include "libhmm/hmm.h"
+#include "libhmm/platform/simd_platform.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+using namespace libhmm;
+using Clock = std::chrono::high_resolution_clock;
+using Millis = std::chrono::duration<double, std::milli>;
+
+namespace {
+
+constexpr int WARMUP_RUNS = 2;
+constexpr int TIMED_RUNS = 8;
+// T large enough that measurement is stable; small enough to finish quickly.
+constexpr int T_DEFAULT = 1000;
+
+std::unique_ptr<Hmm> make_hmm(int n) {
+    auto hmm = std::make_unique<Hmm>(n);
+    Matrix trans(n, n);
+    for (int i = 0; i < n; ++i) {
+        double s = 0.0;
+        for (int j = 0; j < n; ++j) {
+            trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3));
+            s += trans(i, j);
+        }
+        for (int j = 0; j < n; ++j)
+            trans(i, j) /= s;
+    }
+    hmm->setTrans(trans);
+    Vector pi(n);
+    for (int i = 0; i < n; ++i)
+        pi(i) = 1.0 / n;
+    hmm->setPi(pi);
+    for (int i = 0; i < n; ++i)
+        hmm->setDistribution(i, std::make_unique<GaussianDistribution>(i * 2.0, 1.0));
+    return hmm;
+}
+
+ObservationSet make_obs(int t, int n) {
+    ObservationSet obs(t);
+    for (int i = 0; i < t; ++i)
+        obs(i) = std::sin(i * 0.1) * n;
+    return obs;
+}
+
+double time_mode(Hmm &hmm, const ObservationSet &obs, FbRecurrenceMode mode) {
+    ForwardBackwardCalculator fbc(hmm, obs);
+    fbc.setRecurrenceModeOverride(mode);
+
+    // Warmup.
+    for (int r = 0; r < WARMUP_RUNS; ++r)
+        fbc.compute();
+
+    // Timed runs.
+ std::vector samples; + samples.reserve(TIMED_RUNS); + for (int r = 0; r < TIMED_RUNS; ++r) { + auto t0 = Clock::now(); + fbc.compute(); + samples.push_back(Millis(Clock::now() - t0).count()); + } + + std::sort(samples.begin(), samples.end()); + return samples[samples.size() / 2]; // median +} + +} // anonymous namespace + +int main() { + const std::vector N_VALUES = {2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24, 32, 48, 64}; + const int T = T_DEFAULT; + + std::cout << "FB mode crossover sweep (T=" << T << ", median of " << TIMED_RUNS << " runs, " + << WARMUP_RUNS << " warmup)\n"; + std::cout << "Active ISA: " << libhmm::performance::simd::feature_string() << "\n\n"; + + std::cout << std::setw(6) << "N" << std::setw(14) << "Pairwise(ms)" << std::setw(14) + << "MaxReduce(ms)" << std::setw(10) << "MR/PW" << std::setw(12) << "Winner" + << "\n"; + std::cout << std::string(56, '-') << "\n"; + + for (int n : N_VALUES) { + auto hmm = make_hmm(n); + auto obs = make_obs(T, n); + + const double pw = time_mode(*hmm, obs, FbRecurrenceMode::Pairwise); + const double mr = time_mode(*hmm, obs, FbRecurrenceMode::MaxReduce); + const double ratio = mr / pw; + const char *winner = (mr < pw) ? "MaxReduce" : "Pairwise"; + const char *current = + (selectFbRecurrenceMode(n, T) == FbRecurrenceMode::MaxReduce) ? 
" [current]" : ""; + + std::cout << std::setw(6) << n << std::setw(14) << std::fixed << std::setprecision(3) << pw + << std::setw(14) << std::fixed << std::setprecision(3) << mr << std::setw(10) + << std::fixed << std::setprecision(3) << ratio << " " << winner << current + << "\n"; + } + + std::cout << "\n(ratio < 1 = MaxReduce faster; > 1 = Pairwise faster)\n"; + std::cout << "[current] = what selectFbRecurrenceMode() currently picks for this N\n"; + return 0; +} diff --git a/tools/hotspot_breakdown.cpp b/tools/hotspot_breakdown.cpp new file mode 100644 index 0000000..7e59c40 --- /dev/null +++ b/tools/hotspot_breakdown.cpp @@ -0,0 +1,559 @@ +#include "libhmm/hmm.h" +#include "libhmm/distributions/gaussian_distribution.h" +#include "libhmm/math/constants.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace libhmm; +using Clock = std::chrono::high_resolution_clock; +using Millis = std::chrono::duration; + +namespace { + +constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +constexpr std::size_t FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES = 2; +volatile double g_sink_double = 0.0; +volatile int g_sink_int = 0; + +struct Config { + int num_states; + int sequence_length; +}; + +struct ForwardBreakdown { + double transition_ms = 0.0; + double obs_copy_ms = 0.0; + double emission_ms = 0.0; + double buffer_alloc_ms = 0.0; + double forward_ms = 0.0; + double backward_ms = 0.0; + double reduction_ms = 0.0; +}; + +struct ViterbiBreakdown { + double transition_ms = 0.0; + double emission_ms = 0.0; + double emission_relayout_ms = 0.0; + double buffer_alloc_ms = 0.0; + double recursion_ms = 0.0; + double backtrack_ms = 0.0; +}; + +template +double median(std::vector values) { + if (values.empty()) { + return 0.0; + } + std::sort(values.begin(), values.end()); + return static_cast(values[values.size() / 2]); +} + +bool should_use_max_reduce(const std::size_t n, const std::size_t t) 
noexcept { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + (void)n; + (void)t; + return true; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + (void)t; + return n > FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES; +#else + (void)n; + (void)t; + return false; +#endif +} + +double elapsed_ms(const Clock::time_point start) { + return Millis(Clock::now() - start).count(); +} + +double log_sum_exp(const double a, const double b) noexcept { + if (a == LOG_ZERO) { + return b; + } + if (b == LOG_ZERO) { + return a; + } + if (a > b) { + return a + std::log1p(std::exp(b - a)); + } + return b + std::log1p(std::exp(a - b)); +} + +std::unique_ptr make_hmm(const int n) { + auto hmm = std::make_unique(n); + Matrix trans(n, n); + for (int i = 0; i < n; ++i) { + double sum = 0.0; + for (int j = 0; j < n; ++j) { + trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3)); + sum += trans(i, j); + } + for (int j = 0; j < n; ++j) { + trans(i, j) /= sum; + } + } + hmm->setTrans(trans); + + Vector pi(n); + for (int i = 0; i < n; ++i) { + pi(i) = 1.0 / static_cast(n); + } + hmm->setPi(pi); + + for (int i = 0; i < n; ++i) { + hmm->setDistribution(i, std::make_unique(i * 2.0, 1.0)); + } + + return hmm; +} + +ObservationSet make_obs(const int t, const int n) { + ObservationSet obs(t); + for (int i = 0; i < t; ++i) { + obs(i) = std::sin(i * 0.1) * static_cast(n); + } + return obs; +} + +ForwardBreakdown profile_forward_backward(const Hmm &hmm, const ObservationSet &obs, + const int warmup, const int runs) { + const std::size_t n = static_cast(hmm.getNumStates()); + const std::size_t t = obs.size(); + + std::vector transition_ms; + std::vector obs_copy_ms; + std::vector emission_ms; + std::vector buffer_alloc_ms; + std::vector forward_ms; + std::vector backward_ms; + std::vector reduction_ms; + + transition_ms.reserve(static_cast(runs)); + obs_copy_ms.reserve(static_cast(runs)); + emission_ms.reserve(static_cast(runs)); + buffer_alloc_ms.reserve(static_cast(runs)); + 
forward_ms.reserve(static_cast(runs)); + backward_ms.reserve(static_cast(runs)); + reduction_ms.reserve(static_cast(runs)); + + for (int iter = 0; iter < warmup + runs; ++iter) { + auto stage_start = Clock::now(); + Matrix log_trans(n, n); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < n; ++j) { + const double a = hmm.getTrans()(i, j); + log_trans(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + } + } + const double trans_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector obs_copy(t); + for (std::size_t i = 0; i < t; ++i) { + obs_copy[i] = obs(i); + } + const std::span obs_span(obs_copy.data(), t); + const double obs_copy_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_buf(n * t); + for (std::size_t i = 0; i < n; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obs_span, std::span(log_emit_buf.data() + i * t, t)); + } + const double emission_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + Matrix log_alpha(t, n); + Matrix log_beta(t, n); + const double buffer_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + const double pi = hmm.getPi()(i); + const double log_pi = (pi > 0.0) ? 
std::log(pi) : LOG_ZERO; + log_alpha(0, i) = log_pi + log_emit_buf[i * t]; + } + const bool use_max_reduce = should_use_max_reduce(n, t); + for (std::size_t ti = 1; ti < t; ++ti) { + for (std::size_t j = 0; j < n; ++j) { + double log_sum = LOG_ZERO; + if (use_max_reduce) { + double max_term = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (term > max_term) { + max_term = term; + } + } + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + } else { + for (std::size_t i = 0; i < n; ++i) { + log_sum = log_sum_exp(log_sum, log_alpha(ti - 1, i) + log_trans(i, j)); + } + } + log_alpha(ti, j) = log_emit_buf[j * t + ti] + log_sum; + } + } + const double forward_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + log_beta(t - 1, i) = 0.0; + } + if (t > 1) { + for (std::size_t ti = t - 2;; --ti) { + for (std::size_t i = 0; i < n; ++i) { + double log_sum = LOG_ZERO; + if (use_max_reduce) { + double max_term = LOG_ZERO; + for (std::size_t j = 0; j < n; ++j) { + const double term = log_trans(i, j) + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j); + if (term > max_term) { + max_term = term; + } + } + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t j = 0; j < n; ++j) { + const double term = log_trans(i, j) + + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + } else { + for (std::size_t j = 0; j < n; ++j) { + log_sum = log_sum_exp(log_sum, log_trans(i, j) + + log_emit_buf[j * t + (ti + 1)] + + 
log_beta(ti + 1, j)); + } + } + log_beta(ti, i) = log_sum; + } + if (ti == 0) { + break; + } + } + } + const double backward_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + double log_probability = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + log_probability = log_sum_exp(log_probability, log_alpha(t - 1, i)); + } + const double reduction_time = elapsed_ms(stage_start); + g_sink_double += log_probability; + + if (iter >= warmup) { + transition_ms.push_back(trans_time); + obs_copy_ms.push_back(obs_copy_time); + emission_ms.push_back(emission_time); + buffer_alloc_ms.push_back(buffer_time); + forward_ms.push_back(forward_time); + backward_ms.push_back(backward_time); + reduction_ms.push_back(reduction_time); + } + } + + return { + median(transition_ms), median(obs_copy_ms), median(emission_ms), median(buffer_alloc_ms), + median(forward_ms), median(backward_ms), median(reduction_ms), + }; +} + +ViterbiBreakdown profile_viterbi(const Hmm &hmm, const ObservationSet &obs, const int warmup, + const int runs) { + const std::size_t n = static_cast(hmm.getNumStates()); + const std::size_t t = obs.size(); + + std::vector transition_ms; + std::vector emission_ms; + std::vector emission_relayout_ms; + std::vector buffer_alloc_ms; + std::vector recursion_ms; + std::vector backtrack_ms; + + transition_ms.reserve(static_cast(runs)); + emission_ms.reserve(static_cast(runs)); + emission_relayout_ms.reserve(static_cast(runs)); + buffer_alloc_ms.reserve(static_cast(runs)); + recursion_ms.reserve(static_cast(runs)); + backtrack_ms.reserve(static_cast(runs)); + + for (int iter = 0; iter < warmup + runs; ++iter) { + auto stage_start = Clock::now(); + Matrix log_trans(n, n); + Matrix log_trans_t(n, n); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < n; ++j) { + const double a = hmm.getTrans()(i, j); + const double log_a = (a > 0.0) ? 
std::log(a) : LOG_ZERO; + log_trans(i, j) = log_a; + log_trans_t(j, i) = log_a; + } + } + const double trans_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_buf(n * t); + const std::span obs_span(obs.data(), t); + for (std::size_t i = 0; i < n; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obs_span, std::span(log_emit_buf.data() + i * t, t)); + } + const double emission_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_by_time(n * t); + for (std::size_t i = 0; i < n; ++i) { + const double *state_row = log_emit_buf.data() + i * t; + for (std::size_t ti = 0; ti < t; ++ti) { + log_emit_by_time[ti * n + i] = state_row[ti]; + } + } + const double relayout_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + Matrix log_delta(t, n); + std::vector psi(t * n, 0); + std::vector sequence(t, 0); + const double buffer_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + const double *log_trans_t_data = log_trans_t.data(); + const double *log_emit_by_time_data = log_emit_by_time.data(); + double *log_delta_data = log_delta.data(); + + const double *emit_row_0 = log_emit_by_time_data; + for (std::size_t i = 0; i < n; ++i) { + const double pi = hmm.getPi()(i); + const double log_pi = (pi > 0.0) ? 
std::log(pi) : LOG_ZERO; + log_delta_data[i] = log_pi + emit_row_0[i]; + } + + for (std::size_t ti = 1; ti < t; ++ti) { + const double *prev_delta_row = log_delta_data + (ti - 1) * n; + double *delta_row = log_delta_data + ti * n; + const double *emit_row = log_emit_by_time_data + ti * n; + for (std::size_t j = 0; j < n; ++j) { + double max_val = LOG_ZERO; + int max_from = 0; + const double *trans_col = log_trans_t_data + j * n; + for (std::size_t i = 0; i < n; ++i) { + const double value = prev_delta_row[i] + trans_col[i]; + if (value > max_val) { + max_val = value; + max_from = static_cast(i); + } + } + delta_row[j] = max_val + emit_row[j]; + psi[ti * n + j] = max_from; + } + } + + double best_val = LOG_ZERO; + int best_last = 0; + const double *final_delta_row = log_delta_data + (t - 1) * n; + for (std::size_t i = 0; i < n; ++i) { + if (final_delta_row[i] > best_val) { + best_val = final_delta_row[i]; + best_last = static_cast(i); + } + } + sequence[t - 1] = best_last; + const double recursion_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + if (t > 1) { + for (std::size_t ti = t - 2;; --ti) { + sequence[ti] = psi[(ti + 1) * n + static_cast(sequence[ti + 1])]; + if (ti == 0) { + break; + } + } + } + const double backtrack_time = elapsed_ms(stage_start); + g_sink_double += best_val; + g_sink_int += sequence[0]; + + if (iter >= warmup) { + transition_ms.push_back(trans_time); + emission_ms.push_back(emission_time); + emission_relayout_ms.push_back(relayout_time); + buffer_alloc_ms.push_back(buffer_time); + recursion_ms.push_back(recursion_time); + backtrack_ms.push_back(backtrack_time); + } + } + + return { + median(transition_ms), median(emission_ms), median(emission_relayout_ms), + median(buffer_alloc_ms), median(recursion_ms), median(backtrack_ms), + }; +} + +std::size_t estimate_forward_working_set_bytes(const std::size_t n, const std::size_t t) { + const std::size_t doubles = (n * n) + (3 * n * t) + t; + return doubles * sizeof(double); +} + 
+std::size_t estimate_viterbi_working_set_bytes(const std::size_t n, const std::size_t t) { + const std::size_t double_count = (2 * n * n) + (3 * n * t); + const std::size_t int_count = (2 * n * t); + return double_count * sizeof(double) + int_count * sizeof(int); +} + +double bytes_to_mib(const std::size_t bytes) { + return static_cast(bytes) / (1024.0 * 1024.0); +} + +void print_phase(const std::string &label, const double value_ms, const double total_ms) { + const double pct = (total_ms > 0.0) ? (100.0 * value_ms / total_ms) : 0.0; + std::cout << " " << std::left << std::setw(28) << label << std::right << std::setw(10) + << value_ms << " ms " << std::setw(6) << pct << "%\n"; +} + +int parse_positive_int(const char *value, const char *arg_name) { + try { + const int parsed = std::stoi(value); + if (parsed <= 0) { + throw std::invalid_argument("non-positive"); + } + return parsed; + } catch (...) { + throw std::invalid_argument(std::string("Invalid ") + arg_name + ": " + value); + } +} + +} // namespace + +int main(int argc, char *argv[]) { + std::vector configs = { + {8, 1000}, + {32, 2000}, + {64, 1000}, + }; + + int warmup = 2; + int runs = 8; + + if (argc == 3 || argc == 4 || argc == 5) { + const int n = parse_positive_int(argv[1], "N"); + const int t = parse_positive_int(argv[2], "T"); + configs = {{n, t}}; + if (argc >= 4) { + runs = parse_positive_int(argv[3], "runs"); + } + if (argc == 5) { + warmup = parse_positive_int(argv[4], "warmup"); + } + } else if (argc != 1) { + std::cerr << "Usage:\n"; + std::cerr << " hotspot_breakdown\n"; + std::cerr << " hotspot_breakdown [runs] [warmup]\n"; + return 1; + } + + std::cout << "libhmm Hotspot Breakdown Tool\n"; + std::cout << "============================\n"; + std::cout << "Median of " << runs << " timed runs (" << warmup << " warmup).\n\n"; +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + std::cout << "Forward-Backward accumulation mode: max-then-reduce (experimental)\n\n"; +#elif 
defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + std::cout << "Forward-Backward accumulation mode: static adaptive selector (stage-1)\n\n"; +#else + std::cout << "Forward-Backward accumulation mode: pairwise logSumExp (control)\n\n"; +#endif + + std::cout << std::fixed << std::setprecision(3); + + for (const auto &cfg : configs) { + auto hmm = make_hmm(cfg.num_states); + auto obs = make_obs(cfg.sequence_length, cfg.num_states); + + const auto fb = profile_forward_backward(*hmm, obs, warmup, runs); + const auto vt = profile_viterbi(*hmm, obs, warmup, runs); + + const double fb_total = fb.transition_ms + fb.obs_copy_ms + fb.emission_ms + + fb.buffer_alloc_ms + fb.forward_ms + fb.backward_ms + + fb.reduction_ms; + const double vt_total = vt.transition_ms + vt.emission_ms + vt.emission_relayout_ms + + vt.buffer_alloc_ms + vt.recursion_ms + vt.backtrack_ms; + + const std::size_t n = static_cast(cfg.num_states); + const std::size_t t = static_cast(cfg.sequence_length); + const std::uint64_t emission_work = static_cast(n) * t; + const std::uint64_t recurrence_work = + (t > 0) ? 
static_cast(n) * n * (t - 1) : 0ULL; + + std::cout << "Config: N=" << cfg.num_states << ", T=" << cfg.sequence_length << "\n"; + std::cout << " Estimated recurrence work per pass: " + << static_cast(recurrence_work) / 1.0e6 << " M (N^2*(T-1))\n"; + std::cout << " Emission evaluations per pass: " + << static_cast(emission_work) / 1.0e6 << " M (N*T)\n"; + + std::cout << "\nForward-Backward phase breakdown:\n"; + print_phase("Transition log precompute", fb.transition_ms, fb_total); + print_phase("Observation copy", fb.obs_copy_ms, fb_total); + print_phase("Emission batch eval", fb.emission_ms, fb_total); + print_phase("Alpha/Beta buffer alloc", fb.buffer_alloc_ms, fb_total); + print_phase("Forward recursion", fb.forward_ms, fb_total); + print_phase("Backward recursion", fb.backward_ms, fb_total); + print_phase("Final log-sum-exp reduce", fb.reduction_ms, fb_total); + std::cout << " " << std::left << std::setw(28) << "TOTAL" << std::right << std::setw(10) + << fb_total << " ms\n"; + + std::cout << " Estimated FB working set: " + << bytes_to_mib(estimate_forward_working_set_bytes(n, t)) << " MiB\n"; + + std::cout << "\nViterbi phase breakdown:\n"; + print_phase("Transition log precompute", vt.transition_ms, vt_total); + print_phase("Emission batch eval", vt.emission_ms, vt_total); + print_phase("Emission relayout (T-major)", vt.emission_relayout_ms, vt_total); + print_phase("Delta/Psi buffer alloc", vt.buffer_alloc_ms, vt_total); + print_phase("Viterbi recursion", vt.recursion_ms, vt_total); + print_phase("Backtrack", vt.backtrack_ms, vt_total); + std::cout << " " << std::left << std::setw(28) << "TOTAL" << std::right << std::setw(10) + << vt_total << " ms\n"; + + std::cout << " Estimated Viterbi working set: " + << bytes_to_mib(estimate_viterbi_working_set_bytes(n, t)) << " MiB\n"; + std::cout << "\n------------------------------------------------------------\n\n"; + } + + if (g_sink_int == 42) { + std::cout << "sink=" << g_sink_double << "\n"; + } + + return 0; +}