diff --git a/include/core/dispatch_utils.h b/include/core/dispatch_utils.h index 9386e3a..2f93b9e 100644 --- a/include/core/dispatch_utils.h +++ b/include/core/dispatch_utils.h
@@ -35,9 +35,10 @@ namespace detail { // Performance utilities * scalar) Strategy::PARALLEL → parallel_func → ParallelUtils::parallelFor * Strategy::WORK_STEALING → work_stealing_func → WorkStealingPool::parallelFor * - * The GpuAcceleratedFunc template parameter and gpu_accelerated_func argument are - * retained for ABI compatibility and will be removed in Phase 6 when all distribution - * batch implementations are updated. + * The GPU_ACCELERATED strategy slot has been removed. The GpuAcceleratedFunc + * template parameter and corresponding lambda are no longer accepted by these + * templates. See issue #23 for the rationale and prerequisites for any future + * GPU backend. */ class DispatchUtils { public:
@@ -49,7 +50,6 @@ class DispatchUtils { * @tparam BatchFunc Function type for SIMD batch operations * @tparam ParallelFunc Function type for parallel operations * @tparam WorkStealingFunc Function type for work-stealing operations - * @tparam GpuAcceleratedFunc Function type for GPU-accelerated operations * * @param dist Reference to the distribution instance * @param values Input values span
@@ -61,16 +61,14 @@ * @param batch_func Function to call for SIMD batch operations * @param parallel_func Function to call for parallel operations * @param work_stealing_func Function to call for work-stealing operations - * @param gpu_accelerated_func Function to call for GPU-accelerated operations */ template <typename Distribution, typename ScalarFunc, typename BatchFunc, typename ParallelFunc, - typename WorkStealingFunc, typename GpuAcceleratedFunc> + typename WorkStealingFunc> static void autoDispatch(const Distribution& dist, std::span<const double> values, std::span<double> results, const PerformanceHint& hint, DistributionType dist_type, OperationType op_type, ScalarFunc&& scalar_func, BatchFunc&& batch_func, - ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func, - GpuAcceleratedFunc&& gpu_accelerated_func) { + ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func) { // Validate input if (values.size() != results.size()) { throw std::invalid_argument("Input and output spans must have the same size"); }
@@ -103,8 +101,7 @@ class DispatchUtils { executeStrategy(strategy, dist, values, results, count, std::forward<ScalarFunc>(scalar_func), std::forward<BatchFunc>(batch_func), std::forward<ParallelFunc>(parallel_func), - std::forward<WorkStealingFunc>(work_stealing_func), - std::forward<GpuAcceleratedFunc>(gpu_accelerated_func)); + std::forward<WorkStealingFunc>(work_stealing_func)); }
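Note for reviewers: with the GPU slot gone, every batch entry point supplies exactly four lambdas to DispatchUtils. A minimal sketch of the new autoDispatch call shape follows; it is hedged, not code from this repository: MyDist, pdf(), pdfBatch(), and pdfBatchDispatch() are placeholders, and only the four-slot arity and the work-stealing signature (dist, vals, res, pool) are taken from this diff. The scalar, batch, and parallel lambda signatures shown are assumptions.

    #include <cstddef>
    #include <span>

    // Hypothetical call site inside a distribution's batch method.
    void pdfBatchDispatch(const MyDist& dist, std::span<const double> values,
                          std::span<double> results, const PerformanceHint& hint) {
        DispatchUtils::autoDispatch(
            dist, values, results, hint, DistributionType::GAUSSIAN, OperationType::PDF,
            // scalar slot (assumed shape): one value at a time
            [](const MyDist& d, double x) { return d.pdf(x); },
            // SIMD batch slot (assumed shape): whole-span helper
            [](const MyDist& d, std::span<const double> v, std::span<double> r) {
                d.pdfBatch(v, r);
            },
            // parallel slot: would call ParallelUtils::parallelFor over [0, v.size())
            [](const MyDist& d, std::span<const double> v, std::span<double> r) {
                for (std::size_t i = 0; i < v.size(); ++i) r[i] = d.pdf(v[i]);
            },
            // work-stealing slot: signature matches the lambdas in this diff
            [](const MyDist& d, std::span<const double> v, std::span<double> r,
               WorkStealingPool& pool) {
                pool.parallelFor(std::size_t{0}, v.size(),
                                 [&](std::size_t i) { r[i] = d.pdf(v[i]); });
            });
    }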
/**
@@ -140,7 +137,6 @@ * @tparam BatchFunc Function type for SIMD batch operations * @tparam ParallelFunc Function type for parallel operations * @tparam WorkStealingFunc Function type for work-stealing operations - * @tparam GpuAcceleratedFunc Function type for GPU-accelerated operations * * @param dist Reference to the distribution instance * @param values Input values span
@@ -150,16 +146,14 @@ * @param batch_func Function to call for SIMD batch operations * @param parallel_func Function to call for parallel operations * @param work_stealing_func Function to call for work-stealing operations - * @param gpu_accelerated_func Function to call for GPU-accelerated operations */ template <typename Distribution, typename ScalarFunc, typename BatchFunc, typename ParallelFunc, - typename WorkStealingFunc, typename GpuAcceleratedFunc> + typename WorkStealingFunc> static void executeWithStrategy(const Distribution& dist, std::span<const double> values, std::span<double> results, Strategy strategy, ScalarFunc&& scalar_func, BatchFunc&& batch_func, ParallelFunc&& parallel_func, - WorkStealingFunc&& work_stealing_func, - GpuAcceleratedFunc&& gpu_accelerated_func) { + WorkStealingFunc&& work_stealing_func) { // Validate input if (values.size() != results.size()) { throw std::invalid_argument("Input and output spans must have the same size"); }
@@ -173,8 +167,7 @@ class DispatchUtils { executeStrategy(strategy, dist, values, results, count, std::forward<ScalarFunc>(scalar_func), std::forward<BatchFunc>(batch_func), std::forward<ParallelFunc>(parallel_func), - std::forward<WorkStealingFunc>(work_stealing_func), - std::forward<GpuAcceleratedFunc>(gpu_accelerated_func)); + std::forward<WorkStealingFunc>(work_stealing_func)); } private:
@@ -203,12 +196,12 @@ * @brief Executes the selected strategy with appropriate function calls */ template <typename Distribution, typename ScalarFunc, typename BatchFunc, typename ParallelFunc, - typename WorkStealingFunc, typename GpuAcceleratedFunc> + typename WorkStealingFunc> static void executeStrategy(Strategy strategy, const Distribution& dist, std::span<const double> values, std::span<double> results, size_t count, ScalarFunc&& scalar_func, BatchFunc&& batch_func, - ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func, - GpuAcceleratedFunc&& gpu_accelerated_func) { + ParallelFunc&& parallel_func, + WorkStealingFunc&& work_stealing_func) { switch (strategy) { case Strategy::SCALAR: // Use simple loop for tiny batches (< 8 elements)
@@ -240,9 +233,6 @@ break; } } - // gpu_accelerated_func is intentionally unused — GPU_ACCELERATED was removed - // from the Strategy enum. Retained for ABI compatibility until Phase 6. - (void)gpu_accelerated_func; } /**
@@ -319,8 +309,7 @@ class DispatchUtils { } } - // executeBatchGpuAccelerated removed — GPU_ACCELERATED strategy removed from enum. - // Callers should use executeBatchWorkStealing directly. + // GPU_ACCELERATED strategy slot removed. See issue #23 for future GPU backend prerequisites. /** * @brief Common cache validation and parameter extraction pattern
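The "common cache validation" pattern referenced above is the same double-checked scheme that each removed lambda open-coded, and that the surviving work-stealing lambdas still use: take a shared lock, upgrade to an exclusive lock only if the cache is stale, re-check, snapshot the parameters, then drop the lock before looping. A compact sketch, assuming only the member names that appear in this diff (cache_mutex_, cache_valid_, updateCacheUnsafe()); Dist and getParam() are placeholders:

    #include <mutex>
    #include <shared_mutex>

    // Double-checked cache validation: shared fast path, exclusive slow path.
    template <typename Dist>
    double snapshotParam(const Dist& dist) {
        std::shared_lock lock(dist.cache_mutex_);       // optimistic shared read
        if (!dist.cache_valid_) {
            lock.unlock();
            std::unique_lock ulock(dist.cache_mutex_);  // upgrade to exclusive
            if (!dist.cache_valid_) {                   // re-check: another thread may have won
                const_cast<Dist&>(dist).updateCacheUnsafe();
            }
            ulock.unlock();
            lock.lock();                                // resume shared access for the reads
        }
        const double cached = dist.getParam();          // copy parameters under the lock
        return cached;                                  // lock released at scope exit
    }

The snapshot is what lets the hot loop run lock-free: the pool.parallelFor bodies below only ever touch local copies such as cached_a or cached_lambda, never the distribution's mutable state.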
diff --git a/src/beta.cpp b/src/beta.cpp index 05e9ffe..3f062ab 100644 --- a/src/beta.cpp +++ b/src/beta.cpp
@@ -570,35 +570,6 @@ void BetaDistribution::getProbability(std::span<const double> values, std::span<double> results) { dist.getProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, lnc, am1, bm1); } }, - [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) - throw std::invalid_argument("Span size mismatch"); - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) - const_cast<BetaDistribution&>(dist).updateCacheUnsafe(); - ulock.unlock(); - lock.lock(); - } - const double lnc = dist.logNormConst_; - const double am1 = dist.alphaMinus1_; - const double bm1 = dist.betaMinus1_; - lock.unlock(); - constexpr std::size_t CHUNK = 1024; - const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; - pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { - const std::size_t start = ci * CHUNK; - const std::size_t len = std::min(CHUNK, count - start); - dist.getProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, - lnc, am1, bm1); - }); - }, [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, WorkStealingPool& pool) { if (vals.size() != res.size())
@@ -687,35 +658,6 @@ void BetaDistribution::getLogProbability(std::span<const double> values, std::span<double> results) { bm1); } }, - [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) - throw std::invalid_argument("Span size mismatch"); - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) - const_cast<BetaDistribution&>(dist).updateCacheUnsafe(); - ulock.unlock(); - lock.lock(); - } - const double lnc = dist.logNormConst_; - const double am1 = dist.alphaMinus1_; - const double bm1 = dist.betaMinus1_; - lock.unlock(); - constexpr std::size_t CHUNK = 1024; - const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; - pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { - const std::size_t start = ci * CHUNK; - const std::size_t len = std::min(CHUNK, count - start); - dist.getLogProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, - lnc, am1, bm1); - }); - }, [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, WorkStealingPool& pool) { if (vals.size() != res.size())
@@ -787,27 +729,6 @@ void BetaDistribution::getCumulativeProbability(std::span<const double> values, std::span<double> results) { dist.getCumulativeProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, a, b); } }, - [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) - throw std::invalid_argument("Span size mismatch"); - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double a = dist.alpha_, b = dist.beta_; - lock.unlock(); - const double log_prefix = detail::lgamma(a + b) - detail::lgamma(a) - detail::lgamma(b); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0) - res[i] = 0.0; - else if (x >= 1.0) - res[i] = 1.0; - else - res[i] = detail::beta_i(x, a, b, log_prefix); - }); - }, [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, WorkStealingPool& pool) { if (vals.size() != res.size())
diff --git a/src/discrete.cpp b/src/discrete.cpp index 12db5f8..83b5ce0 100644 --- a/src/discrete.cpp +++ b/src/discrete.cpp
@@ -1839,46 +1839,6 @@ void DiscreteDistribution::getProbability(std::span<const double> values, std::span<double> results) { res[i] = detail::ZERO_DOUBLE; } }); - }, - [](const DiscreteDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast<DiscreteDistribution*>(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_prob = dist.probability_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast<int>(x); - res[i] = (k >= cached_a && k <= cached_b) ?
cached_prob : detail::ZERO_DOUBLE; - } else { - res[i] = detail::ZERO_DOUBLE; - } - }); }); } @@ -2017,51 +1977,6 @@ void DiscreteDistribution::getLogProbability(std::span values, res[i] = detail::NEGATIVE_INFINITY; } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_log_prob = dist.logProbability_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast(x); - if (k >= cached_a && k <= cached_b) { - res[i] = cached_is_binary ? -detail::LN2 : cached_log_prob; - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - }); }); } @@ -2208,54 +2123,6 @@ void DiscreteDistribution::getCumulativeProbability(std::span valu } } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const int cached_range = dist.range_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < static_cast(cached_a)) { - res[i] = detail::ZERO_DOUBLE; - } else if (x >= static_cast(cached_b)) { - res[i] = detail::ONE; - } else { - const int k = static_cast(std::floor(x)); - if (cached_is_binary) { - res[i] = (k >= 0) ? 
detail::ONE : detail::ZERO_DOUBLE; - } else { - const int numerator = k - cached_a + detail::ONE_INT; - res[i] = static_cast(numerator) / static_cast(cached_range); - } - } - }); }); } @@ -2367,46 +2234,6 @@ void DiscreteDistribution::getProbabilityWithStrategy(std::span va res[i] = detail::ZERO_DOUBLE; } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_prob = dist.probability_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast(x); - res[i] = (k >= cached_a && k <= cached_b) ? cached_prob : detail::ZERO_DOUBLE; - } else { - res[i] = detail::ZERO_DOUBLE; - } - }); }); } @@ -2527,51 +2354,6 @@ void DiscreteDistribution::getLogProbabilityWithStrategy(std::span res[i] = detail::NEGATIVE_INFINITY; } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_log_prob = dist.logProbability_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast(x); - if (k >= cached_a && k <= cached_b) { - res[i] = cached_is_binary ? 
-detail::LN2 : cached_log_prob; - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - }); }); } @@ -2696,54 +2478,6 @@ void DiscreteDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const int cached_range = dist.range_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < static_cast(cached_a)) { - res[i] = detail::ZERO_DOUBLE; - } else if (x >= static_cast(cached_b)) { - res[i] = detail::ONE; - } else { - const int k = static_cast(std::floor(x)); - if (cached_is_binary) { - res[i] = (k >= 0) ? detail::ONE : detail::ZERO_DOUBLE; - } else { - const int numerator = k - cached_a + detail::ONE_INT; - res[i] = static_cast(numerator) / static_cast(cached_range); - } - } - }); }); } diff --git a/src/exponential.cpp b/src/exponential.cpp index 7806524..aa8e9a5 100644 --- a/src/exponential.cpp +++ b/src/exponential.cpp @@ -1450,49 +1450,6 @@ void ExponentialDistribution::getProbability(std::span values, res[i] = cached_lambda * std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_lambda = dist.lambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = std::exp(-x); - } else { - res[i] = cached_lambda * std::exp(cached_neg_lambda * x); - } - }); }); } @@ -1623,49 +1580,6 @@ void ExponentialDistribution::getLogProbability(std::span values, res[i] = cached_log_lambda + cached_neg_lambda * x; } }); - }, - [](const ExponentialDistribution& dist, std::span vals, 
std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_log_lambda = dist.logLambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else if (cached_is_unit_rate) { - res[i] = -x; - } else { - res[i] = cached_log_lambda + cached_neg_lambda * x; - } - }); }); } @@ -1791,48 +1705,6 @@ void ExponentialDistribution::getCumulativeProbability(std::span v res[i] = detail::ONE - std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = detail::ONE - std::exp(-x); - } else { - res[i] = detail::ONE - std::exp(cached_neg_lambda * x); - } - }); }); } @@ -1950,49 +1822,6 @@ void ExponentialDistribution::getProbabilityWithStrategy(std::span res[i] = cached_lambda * std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock 
lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_lambda = dist.lambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = std::exp(-x); - } else { - res[i] = cached_lambda * std::exp(cached_neg_lambda * x); - } - }); }); } @@ -2107,49 +1936,6 @@ void ExponentialDistribution::getLogProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) processing - const double cached_log_lambda = dist.logLambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else if (cached_is_unit_rate) { - res[i] = -x; - } else { - res[i] = cached_log_lambda + cached_neg_lambda * x; - } - }); }); } @@ -2259,48 +2045,6 @@ void ExponentialDistribution::getCumulativeProbabilityWithStrategy( res[i] = detail::ONE - std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) processing - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - 
const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = detail::ONE - std::exp(-x); - } else { - res[i] = detail::ONE - std::exp(cached_neg_lambda * x); - } - }); }); } diff --git a/src/gamma.cpp b/src/gamma.cpp index ff99dac..504df93 100644 --- a/src/gamma.cpp +++ b/src/gamma.cpp @@ -1579,50 +1579,6 @@ void GammaDistribution::getProbability(std::span values, std::span cached_beta * x - cached_log_gamma_alpha); } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // Cache-Aware lambda: For continuous distributions, caching is counterproductive - // Fallback to parallel execution which is faster and more predictable - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe parallel processing - [[maybe_unused]] const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - // This approach avoids the cache contention issues that caused performance regression - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = std::exp(cached_alpha_log_beta + cached_alpha_minus_one * std::log(x) - - cached_beta * x - cached_log_gamma_alpha); - } - }); }); } @@ -1750,47 +1706,6 @@ void GammaDistribution::getLogProbability(std::span values, std::s cached_alpha_minus_one * std::log(x) - cached_beta * x; } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else { - res[i] = cached_alpha_log_beta - cached_log_gamma_alpha + - cached_alpha_minus_one * std::log(x) - cached_beta * x; - } - }); }); } @@ -1911,44 
+1826,6 @@ void GammaDistribution::getCumulativeProbability(std::span values, res[i] = detail::gamma_p(cached_alpha, cached_beta * x); } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = detail::gamma_p(cached_alpha, cached_beta * x); - } - }); }); } @@ -2070,48 +1947,6 @@ void GammaDistribution::getProbabilityWithStrategy(std::span value cached_beta * x - cached_log_gamma_alpha); } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - [[maybe_unused]] const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = std::exp(cached_alpha_log_beta + cached_alpha_minus_one * std::log(x) - - cached_beta * x - cached_log_gamma_alpha); - } - }); }); } @@ -2229,48 +2064,6 @@ void GammaDistribution::getLogProbabilityWithStrategy(std::span va cached_alpha_minus_one * std::log(x) - cached_beta * x; } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - 
const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - [[maybe_unused]] const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else { - res[i] = cached_alpha_log_beta - cached_log_gamma_alpha + - cached_alpha_minus_one * std::log(x) - cached_beta * x; - } - }); }); } @@ -2378,44 +2171,6 @@ void GammaDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = detail::gamma_p(cached_alpha, cached_beta * x); - } - }); }); } diff --git a/src/gaussian.cpp b/src/gaussian.cpp index c1a714f..9e8aac5 100644 --- a/src/gaussian.cpp +++ b/src/gaussian.cpp @@ -1908,48 +1908,6 @@ void GaussianDistribution::getProbability(std::span values, std::s res[i] = cached_norm_constant * std::exp(cached_neg_half_inv_var * sq_diff); } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_norm_constant = dist.normalizationConstant_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::INV_SQRT_2PI * std::exp(detail::NEG_HALF * 
sq_diff); - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = cached_norm_constant * std::exp(cached_neg_half_inv_var * sq_diff); - } - }); }); } @@ -2086,49 +2044,6 @@ void GaussianDistribution::getLogProbability(std::span values, cached_neg_half_inv_var * sq_diff; } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_log_std = dist.logStandardDeviation_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::NEG_HALF_LN_2PI + detail::NEG_HALF * sq_diff; - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = detail::NEG_HALF_LN_2PI - cached_log_std + - cached_neg_half_inv_var * sq_diff; - } - }); }); } @@ -2254,45 +2169,6 @@ void GaussianDistribution::getCumulativeProbability(std::span valu res[i] = detail::HALF * (detail::ONE + std::erf(normalized)); } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_sigma_sqrt2 = dist.sigmaSqrt2_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - res[i] = detail::HALF * (detail::ONE + std::erf(vals[i] * detail::INV_SQRT_2)); - } else { - const double normalized = (vals[i] - cached_mean) / cached_sigma_sqrt2; - res[i] = detail::HALF * (detail::ONE + std::erf(normalized)); - } - }); }); } @@ -2412,49 +2288,6 @@ void GaussianDistribution::getProbabilityWithStrategy(std::span va res[i] = cached_norm_constant * std::exp(cached_neg_half_inv_var * sq_diff); } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: 
should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_log_std = dist.logStandardDeviation_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::NEG_HALF_LN_2PI + detail::NEG_HALF * sq_diff; - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = detail::NEG_HALF_LN_2PI - cached_log_std + - cached_neg_half_inv_var * sq_diff; - } - }); }); } @@ -2575,49 +2408,6 @@ void GaussianDistribution::getLogProbabilityWithStrategy(std::span cached_neg_half_inv_var * sq_diff; } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_log_std = dist.logStandardDeviation_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::NEG_HALF_LN_2PI + detail::NEG_HALF * sq_diff; - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = detail::NEG_HALF_LN_2PI - cached_log_std + - cached_neg_half_inv_var * sq_diff; - } - }); }); } @@ -2728,45 +2518,6 @@ void GaussianDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); 
- } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_sigma_sqrt2 = dist.sigmaSqrt2_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - res[i] = detail::HALF * (detail::ONE + std::erf(vals[i] * detail::INV_SQRT_2)); - } else { - const double normalized = (vals[i] - cached_mean) / cached_sigma_sqrt2; - res[i] = detail::HALF * (detail::ONE + std::erf(normalized)); - } - }); }); } diff --git a/src/poisson.cpp b/src/poisson.cpp index 2b01b49..3cd0d97 100644 --- a/src/poisson.cpp +++ b/src/poisson.cpp @@ -1789,62 +1789,6 @@ void PoissonDistribution::getProbability(std::span values, std::sp return; } - if (k == 0) { - res[i] = cached_exp_neg_lambda; - } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && - k < static_cast(PoissonDistribution::FACTORIAL_CACHE.size())) { - res[i] = std::pow(cached_lambda, k) * cached_exp_neg_lambda / - PoissonDistribution::FACTORIAL_CACHE[static_cast(k)]; - } else { - double log_result = k * cached_log_lambda - cached_lambda - - PoissonDistribution::logFactorial(k); - res[i] = std::exp(log_result); - } - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - const double cached_exp_neg_lambda = dist.expNegLambda_; - [[maybe_unused]] const bool cached_is_small_lambda = dist.isSmallLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ZERO_DOUBLE; - return; - } - if (k == 0) { res[i] = cached_exp_neg_lambda; } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && @@ -1995,52 +1939,6 @@ void PoissonDistribution::getLogProbability(std::span values, return; } - // log P(X = k) = k * log(λ) - λ - log(k!) 
- res[i] = - k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - // log P(X = k) = k * log(λ) - λ - log(k!) res[i] = k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); @@ -2178,50 +2076,6 @@ void PoissonDistribution::getCumulativeProbability(std::span value return; } - // Use regularized incomplete gamma function: P(X ≤ k) = Q(k+1, λ) - res[i] = detail::gamma_q(k + 1, cached_lambda); - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: Use pool.parallelFor for GPU acceleration pattern - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe parallel processing - const double cached_lambda = dist.lambda_; - lock.unlock(); - - // Use pool.parallelFor for GPU acceleration pattern - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ONE; - return; - } - // Use regularized incomplete gamma function: P(X ≤ k) = Q(k+1, λ) res[i] = detail::gamma_q(k + 1, cached_lambda); }); @@ -2360,62 +2214,6 @@ void PoissonDistribution::getProbabilityWithStrategy(std::span val return; } - if (k == 0) { - res[i] = cached_exp_neg_lambda; - } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && - k < static_cast(PoissonDistribution::FACTORIAL_CACHE.size())) { - res[i] = std::pow(cached_lambda, k) * cached_exp_neg_lambda / - PoissonDistribution::FACTORIAL_CACHE[static_cast(k)]; - } else { - double log_result = k * cached_log_lambda - cached_lambda - - PoissonDistribution::logFactorial(k); - res[i] = std::exp(log_result); - } - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span 
res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - const double cached_exp_neg_lambda = dist.expNegLambda_; - [[maybe_unused]] const bool cached_is_small_lambda = dist.isSmallLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ZERO_DOUBLE; - return; - } - if (k == 0) { res[i] = cached_exp_neg_lambda; } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && @@ -2545,52 +2343,6 @@ void PoissonDistribution::getLogProbabilityWithStrategy(std::span return; } - // log P(X = k) = k * log(λ) - λ - log(k!) - res[i] = - k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - // log P(X = k) = k * log(λ) - λ - log(k!) 
res[i] = k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); @@ -2708,50 +2460,6 @@ void PoissonDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ONE; - return; - } - // Use regularized incomplete gamma function: P(X ≤ k) = Q(k+1, λ) res[i] = detail::gamma_q(k + 1, cached_lambda); }); diff --git a/src/student_t.cpp b/src/student_t.cpp index efb3fd7..32b1a86 100644 --- a/src/student_t.cpp +++ b/src/student_t.cpp @@ -478,23 +478,6 @@ void StudentTDistribution::getProbability(std::span values, std::s } } }, - [](const StudentTDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double lnc = dist.logNormConst_; - const double nhnpo = dist.negHalfNuPlusOne_; - const double inv_nu = dist.invNu_; - lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = std::exp(lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu)); - }); - }, [](const StudentTDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { if (vals.size() != res.size()) { @@ -562,23 +545,6 @@ void StudentTDistribution::getLogProbability(std::span values, } } }, - [](const StudentTDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double lnc = dist.logNormConst_; - const double nhnpo = dist.negHalfNuPlusOne_; - const double inv_nu = dist.invNu_; - lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu); - }); - }, [](const StudentTDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { if (vals.size() != res.size()) { @@ -627,20 +593,6 @@ void StudentTDistribution::getCumulativeProbability(std::span valu res[i] = detail::t_cdf(vals[i], cached_nu); } }, - [](const StudentTDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same 
size"); - } - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double cached_nu = dist.nu_; - lock.unlock(); - pool.parallelFor(std::size_t{0}, count, - [&](std::size_t i) { res[i] = detail::t_cdf(vals[i], cached_nu); }); - }, [](const StudentTDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { if (vals.size() != res.size()) { diff --git a/src/uniform.cpp b/src/uniform.cpp index 1abb12e..df2e649 100644 --- a/src/uniform.cpp +++ b/src/uniform.cpp @@ -1434,44 +1434,6 @@ void UniformDistribution::getProbability(std::span values, std::sp const double x = vals[i]; res[i] = (x >= cached_a && x <= cached_b) ? cached_inv_width : detail::ZERO_DOUBLE; }); - }, - [](const UniformDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: Use work-stealing pool for optimal performance - // This replaces the previous cache-aware implementation that caused 100x performance - // regression - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe parallel access - const double cached_a = dist.a_; - const double cached_b = dist.b_; - const double cached_inv_width = dist.invWidth_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - // This approach avoids the cache contention issues that caused performance regression - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - res[i] = (x >= cached_a && x <= cached_b) ? 
diff --git a/src/uniform.cpp b/src/uniform.cpp
index 1abb12e..df2e649 100644
--- a/src/uniform.cpp
+++ b/src/uniform.cpp
@@ -1434,44 +1434,6 @@ void UniformDistribution::getProbability(std::span values, std::sp
             const double x = vals[i];
             res[i] = (x >= cached_a && x <= cached_b) ? cached_inv_width : detail::ZERO_DOUBLE;
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: Use work-stealing pool for optimal performance
-        // This replaces the previous cache-aware implementation that caused 100x performance
-        // regression
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe parallel access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        // This approach avoids the cache contention issues that caused performance regression
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            res[i] = (x >= cached_a && x <= cached_b) ?
-                cached_inv_width : detail::ZERO_DOUBLE;
-        });
     });
 }
@@ -1597,48 +1559,6 @@ void UniformDistribution::getLogProbability(std::span values,
                 res[i] = cached_log_inv_width;
             }
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated processing
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_log_inv_width = -std::log(dist.width_);
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            if (x < cached_a || x > cached_b) {
-                res[i] = detail::NEGATIVE_INFINITY;
-            } else if (cached_is_unit_interval) {
-                res[i] = detail::ZERO_DOUBLE;  // log(1) = 0
-            } else {
-                res[i] = cached_log_inv_width;
-            }
-        });
     });
 }
@@ -1772,50 +1692,6 @@ void UniformDistribution::getCumulativeProbability(std::span value
                 res[i] = (x - cached_a) * cached_inv_width;
             }
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated processing
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            if (x < cached_a) {
-                res[i] = detail::ZERO_DOUBLE;
-            } else if (x > cached_b) {
-                res[i] = detail::ONE;
-            } else if (cached_is_unit_interval) {
-                res[i] = x;  // CDF(x) = x for U(0,1)
-            } else {
-                res[i] = (x - cached_a) * cached_inv_width;
-            }
-        });
     });
 }
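Note: every removed lambda above repeats the same shared-mutex, double-checked cache refresh before snapshotting parameters into locals. A minimal standalone sketch of that pattern; the struct and member names below are illustrative stand-ins, not the library's types:

    // Sketch: double-checked cache validation under std::shared_mutex, then a
    // parameter snapshot taken while holding only the shared lock.
    #include <shared_mutex>

    struct UniformCacheSketch {
        mutable std::shared_mutex mutex;
        mutable bool valid = false;
        mutable double inv_width = 0.0;
        double a = 0.0, b = 1.0;

        void refreshUnsafe() const {  // caller must hold the unique lock
            inv_width = 1.0 / (b - a);
            valid = true;
        }

        double snapshotInvWidth() const {
            std::shared_lock lock(mutex);
            if (!valid) {
                lock.unlock();
                std::unique_lock ulock(mutex);
                if (!valid)  // re-check: another thread may have refreshed
                    refreshUnsafe();
                ulock.unlock();
                lock.lock();
            }
            return inv_width;  // copied out under the shared lock
        }
    };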
@@ -1914,41 +1790,6 @@ void UniformDistribution::getProbabilityWithStrategy(std::span val
             res[i] = (vals[i] >= cached_a && vals[i] <= cached_b) ?
                 cached_inv_width : detail::ZERO_DOUBLE;
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            res[i] = (vals[i] >= cached_a && vals[i] <= cached_b) ? cached_inv_width
-                                                                  : detail::ZERO_DOUBLE;
-        });
     });
 }
@@ -2055,47 +1896,6 @@ void UniformDistribution::getLogProbabilityWithStrategy(std::span
                 res[i] = cached_log_inv_width;
             }
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_log_inv_width = -std::log(dist.width_);
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            if (vals[i] < cached_a || vals[i] > cached_b) {
-                res[i] = detail::NEGATIVE_INFINITY;
-            } else if (cached_is_unit_interval) {
-                res[i] = detail::ZERO_DOUBLE;  // log(1) = 0
-            } else {
-                res[i] = cached_log_inv_width;
-            }
-        });
     });
 }
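Note: these removed log-probability lambdas recomputed -std::log(dist.width_) on every call instead of reading a cached member. The branch ladder itself is easy to state standalone; a sketch with illustrative names, mirroring the deleted logic:

    // Sketch: uniform log-pdf branch ladder from the removed lambdas;
    // the unit-interval case short-circuits to log(1) = 0.
    #include <cmath>
    #include <limits>

    double uniform_log_pdf(double x, double a, double b) {
        if (x < a || x > b)
            return -std::numeric_limits<double>::infinity();  // detail::NEGATIVE_INFINITY
        if (a == 0.0 && b == 1.0)
            return 0.0;                                       // log(1) = 0 on U(0,1)
        return -std::log(b - a);                              // log(1 / width)
    }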
@@ -2208,50 +2008,6 @@ void UniformDistribution::getCumulativeProbabilityWithStrategy(std::span
     [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            if (x < cached_a) {
-                res[i] = detail::ZERO_DOUBLE;
-            } else if (x > cached_b) {
-                res[i] = detail::ONE;
-            } else if (cached_is_unit_interval) {
-                res[i] = x;  // CDF(x) = x for U(0,1)
-            } else {
-                res[i] = (x - cached_a) * cached_inv_width;
-            }
-        });
     });
 }
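Note: the CDF branch ladder these removed lambdas shared survives unchanged in the work-stealing path. For completeness, a standalone reference sketch with illustrative names:

    // Sketch: uniform CDF branch ladder mirrored from the removed lambdas.
    double uniform_cdf(double x, double a, double b) {
        if (x < a)
            return 0.0;               // detail::ZERO_DOUBLE
        if (x > b)
            return 1.0;               // detail::ONE
        if (a == 0.0 && b == 1.0)
            return x;                 // CDF(x) = x for U(0,1)
        return (x - a) / (b - a);     // (x - a) * inv_width
    }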