diff --git a/include/core/dispatch_utils.h b/include/core/dispatch_utils.h index 9386e3a..2f93b9e 100644 --- a/include/core/dispatch_utils.h +++ b/include/core/dispatch_utils.h
@@ -35,9 +35,10 @@ namespace detail { // Performance utilities * scalar) Strategy::PARALLEL → parallel_func → ParallelUtils::parallelFor * Strategy::WORK_STEALING → work_stealing_func → WorkStealingPool::parallelFor * - * The GpuAcceleratedFunc template parameter and gpu_accelerated_func argument are - * retained for ABI compatibility and will be removed in Phase 6 when all distribution - * batch implementations are updated. + * The GPU_ACCELERATED strategy slot has been removed. The GpuAcceleratedFunc + * template parameter and corresponding lambda are no longer accepted by these + * templates. See issue #23 for the rationale and prerequisites for any future + * GPU backend. */ class DispatchUtils { public:
@@ -49,7 +50,6 @@ class DispatchUtils { * @tparam BatchFunc Function type for SIMD batch operations * @tparam ParallelFunc Function type for parallel operations * @tparam WorkStealingFunc Function type for work-stealing operations - * @tparam GpuAcceleratedFunc Function type for GPU-accelerated operations * * @param dist Reference to the distribution instance * @param values Input values span
@@ -61,16 +61,14 @@ * @param batch_func Function to call for SIMD batch operations * @param parallel_func Function to call for parallel operations * @param work_stealing_func Function to call for work-stealing operations - * @param gpu_accelerated_func Function to call for GPU-accelerated operations */ template <typename Distribution, typename ScalarFunc, typename BatchFunc, typename ParallelFunc, - typename WorkStealingFunc, typename GpuAcceleratedFunc> + typename WorkStealingFunc> static void autoDispatch(const Distribution& dist, std::span<const double> values, std::span<double> results, const PerformanceHint& hint, DistributionType dist_type, OperationType op_type, ScalarFunc&& scalar_func, BatchFunc&& batch_func, - ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func, - GpuAcceleratedFunc&& gpu_accelerated_func) { + ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func) { // Validate input if (values.size() != results.size()) { throw std::invalid_argument("Input and output spans must have the same size"); }
@@ -103,8 +101,7 @@ class DispatchUtils { executeStrategy(strategy, dist, values, results, count, std::forward<ScalarFunc>(scalar_func), std::forward<BatchFunc>(batch_func), std::forward<ParallelFunc>(parallel_func), - std::forward<WorkStealingFunc>(work_stealing_func), - std::forward<GpuAcceleratedFunc>(gpu_accelerated_func)); + std::forward<WorkStealingFunc>(work_stealing_func)); }
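Note for reviewers: with the GPU slot gone, every batch entry point supplies exactly four lambdas to DispatchUtils. A minimal sketch of the new autoDispatch call shape follows; it is hedged, not code from this repository: MyDist, pdf(), pdfBatch(), and pdfBatchDispatch() are placeholders, and only the four-slot arity and the work-stealing signature (dist, vals, res, pool) are taken from this diff. The scalar, batch, and parallel lambda signatures shown are assumptions.

    #include <cstddef>
    #include <span>

    // Hypothetical call site inside a distribution's batch method.
    void pdfBatchDispatch(const MyDist& dist, std::span<const double> values,
                          std::span<double> results, const PerformanceHint& hint) {
        DispatchUtils::autoDispatch(
            dist, values, results, hint, DistributionType::GAUSSIAN, OperationType::PDF,
            // scalar slot (assumed shape): one value at a time
            [](const MyDist& d, double x) { return d.pdf(x); },
            // SIMD batch slot (assumed shape): whole-span helper
            [](const MyDist& d, std::span<const double> v, std::span<double> r) {
                d.pdfBatch(v, r);
            },
            // parallel slot: would call ParallelUtils::parallelFor over [0, v.size())
            [](const MyDist& d, std::span<const double> v, std::span<double> r) {
                for (std::size_t i = 0; i < v.size(); ++i) r[i] = d.pdf(v[i]);
            },
            // work-stealing slot: signature matches the lambdas in this diff
            [](const MyDist& d, std::span<const double> v, std::span<double> r,
               WorkStealingPool& pool) {
                pool.parallelFor(std::size_t{0}, v.size(),
                                 [&](std::size_t i) { r[i] = d.pdf(v[i]); });
            });
    }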
/**
@@ -140,7 +137,6 @@ * @tparam BatchFunc Function type for SIMD batch operations * @tparam ParallelFunc Function type for parallel operations * @tparam WorkStealingFunc Function type for work-stealing operations - * @tparam GpuAcceleratedFunc Function type for GPU-accelerated operations * * @param dist Reference to the distribution instance * @param values Input values span
@@ -150,16 +146,14 @@ * @param batch_func Function to call for SIMD batch operations * @param parallel_func Function to call for parallel operations * @param work_stealing_func Function to call for work-stealing operations - * @param gpu_accelerated_func Function to call for GPU-accelerated operations */ template <typename Distribution, typename ScalarFunc, typename BatchFunc, typename ParallelFunc, - typename WorkStealingFunc, typename GpuAcceleratedFunc> + typename WorkStealingFunc> static void executeWithStrategy(const Distribution& dist, std::span<const double> values, std::span<double> results, Strategy strategy, ScalarFunc&& scalar_func, BatchFunc&& batch_func, ParallelFunc&& parallel_func, - WorkStealingFunc&& work_stealing_func, - GpuAcceleratedFunc&& gpu_accelerated_func) { + WorkStealingFunc&& work_stealing_func) { // Validate input if (values.size() != results.size()) { throw std::invalid_argument("Input and output spans must have the same size"); }
@@ -173,8 +167,7 @@ class DispatchUtils { executeStrategy(strategy, dist, values, results, count, std::forward<ScalarFunc>(scalar_func), std::forward<BatchFunc>(batch_func), std::forward<ParallelFunc>(parallel_func), - std::forward<WorkStealingFunc>(work_stealing_func), - std::forward<GpuAcceleratedFunc>(gpu_accelerated_func)); + std::forward<WorkStealingFunc>(work_stealing_func)); } private:
@@ -203,12 +196,12 @@ * @brief Executes the selected strategy with appropriate function calls */ template <typename Distribution, typename ScalarFunc, typename BatchFunc, typename ParallelFunc, - typename WorkStealingFunc, typename GpuAcceleratedFunc> + typename WorkStealingFunc> static void executeStrategy(Strategy strategy, const Distribution& dist, std::span<const double> values, std::span<double> results, size_t count, ScalarFunc&& scalar_func, BatchFunc&& batch_func, - ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func, - GpuAcceleratedFunc&& gpu_accelerated_func) { + ParallelFunc&& parallel_func, + WorkStealingFunc&& work_stealing_func) { switch (strategy) { case Strategy::SCALAR: // Use simple loop for tiny batches (< 8 elements)
@@ -240,9 +233,6 @@ break; } } - // gpu_accelerated_func is intentionally unused — GPU_ACCELERATED was removed - // from the Strategy enum. Retained for ABI compatibility until Phase 6. - (void)gpu_accelerated_func; } /**
@@ -319,8 +309,7 @@ class DispatchUtils { } } - // executeBatchGpuAccelerated removed — GPU_ACCELERATED strategy removed from enum. - // Callers should use executeBatchWorkStealing directly. + // GPU_ACCELERATED strategy slot removed. See issue #23 for future GPU backend prerequisites. /** * @brief Common cache validation and parameter extraction pattern
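The "common cache validation" pattern referenced above is the same double-checked scheme that each removed lambda open-coded, and that the surviving work-stealing lambdas still use: take a shared lock, upgrade to an exclusive lock only if the cache is stale, re-check, snapshot the parameters, then drop the lock before looping. A compact sketch, assuming only the member names that appear in this diff (cache_mutex_, cache_valid_, updateCacheUnsafe()); Dist and getParam() are placeholders:

    #include <mutex>
    #include <shared_mutex>

    // Double-checked cache validation: shared fast path, exclusive slow path.
    template <typename Dist>
    double snapshotParam(const Dist& dist) {
        std::shared_lock lock(dist.cache_mutex_);       // optimistic shared read
        if (!dist.cache_valid_) {
            lock.unlock();
            std::unique_lock ulock(dist.cache_mutex_);  // upgrade to exclusive
            if (!dist.cache_valid_) {                   // re-check: another thread may have won
                const_cast<Dist&>(dist).updateCacheUnsafe();
            }
            ulock.unlock();
            lock.lock();                                // resume shared access for the reads
        }
        const double cached = dist.getParam();          // copy parameters under the lock
        return cached;                                  // lock released at scope exit
    }

The snapshot is what lets the hot loop run lock-free: the pool.parallelFor bodies below only ever touch local copies such as cached_a or cached_lambda, never the distribution's mutable state.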
diff --git a/src/beta.cpp b/src/beta.cpp index 05e9ffe..3f062ab 100644 --- a/src/beta.cpp +++ b/src/beta.cpp
@@ -570,35 +570,6 @@ void BetaDistribution::getProbability(std::span<const double> values, std::span<double> results) { dist.getProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, lnc, am1, bm1); } }, - [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) - throw std::invalid_argument("Span size mismatch"); - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) - const_cast<BetaDistribution&>(dist).updateCacheUnsafe(); - ulock.unlock(); - lock.lock(); - } - const double lnc = dist.logNormConst_; - const double am1 = dist.alphaMinus1_; - const double bm1 = dist.betaMinus1_; - lock.unlock(); - constexpr std::size_t CHUNK = 1024; - const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; - pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { - const std::size_t start = ci * CHUNK; - const std::size_t len = std::min(CHUNK, count - start); - dist.getProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, - lnc, am1, bm1); - }); - }, [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, WorkStealingPool& pool) { if (vals.size() != res.size())
@@ -687,35 +658,6 @@ void BetaDistribution::getLogProbability(std::span<const double> values, std::span<double> results) { bm1); } }, - [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) - throw std::invalid_argument("Span size mismatch"); - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) - const_cast<BetaDistribution&>(dist).updateCacheUnsafe(); - ulock.unlock(); - lock.lock(); - } - const double lnc = dist.logNormConst_; - const double am1 = dist.alphaMinus1_; - const double bm1 = dist.betaMinus1_; - lock.unlock(); - constexpr std::size_t CHUNK = 1024; - const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; - pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { - const std::size_t start = ci * CHUNK; - const std::size_t len = std::min(CHUNK, count - start); - dist.getLogProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, - lnc, am1, bm1); - }); - }, [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, WorkStealingPool& pool) { if (vals.size() != res.size())
@@ -787,27 +729,6 @@ void BetaDistribution::getCumulativeProbability(std::span<const double> values, std::span<double> results) { dist.getCumulativeProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, a, b); } }, - [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) - throw std::invalid_argument("Span size mismatch"); - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double a = dist.alpha_, b = dist.beta_; - lock.unlock(); - const double log_prefix = detail::lgamma(a + b) - detail::lgamma(a) - detail::lgamma(b); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0) - res[i] = 0.0; - else if (x >= 1.0) - res[i] = 1.0; - else - res[i] = detail::beta_i(x, a, b, log_prefix); - }); - }, [](const BetaDistribution& dist, std::span<const double> vals, std::span<double> res, WorkStealingPool& pool) { if (vals.size() != res.size())
diff --git a/src/discrete.cpp b/src/discrete.cpp index 12db5f8..83b5ce0 100644 --- a/src/discrete.cpp +++ b/src/discrete.cpp
@@ -1839,46 +1839,6 @@ void DiscreteDistribution::getProbability(std::span<const double> values, std::span<double> results) { res[i] = detail::ZERO_DOUBLE; } }); - }, - [](const DiscreteDistribution& dist, std::span<const double> vals, std::span<double> res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast<DiscreteDistribution*>(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_prob = dist.probability_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast<int>(x); - res[i] = (k >= cached_a && k <= cached_b) ?
cached_prob : detail::ZERO_DOUBLE; - } else { - res[i] = detail::ZERO_DOUBLE; - } - }); }); } @@ -2017,51 +1977,6 @@ void DiscreteDistribution::getLogProbability(std::span values, res[i] = detail::NEGATIVE_INFINITY; } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_log_prob = dist.logProbability_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast(x); - if (k >= cached_a && k <= cached_b) { - res[i] = cached_is_binary ? -detail::LN2 : cached_log_prob; - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - }); }); } @@ -2208,54 +2123,6 @@ void DiscreteDistribution::getCumulativeProbability(std::span valu } } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const int cached_range = dist.range_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < static_cast(cached_a)) { - res[i] = detail::ZERO_DOUBLE; - } else if (x >= static_cast(cached_b)) { - res[i] = detail::ONE; - } else { - const int k = static_cast(std::floor(x)); - if (cached_is_binary) { - res[i] = (k >= 0) ? 
detail::ONE : detail::ZERO_DOUBLE; - } else { - const int numerator = k - cached_a + detail::ONE_INT; - res[i] = static_cast(numerator) / static_cast(cached_range); - } - } - }); }); } @@ -2367,46 +2234,6 @@ void DiscreteDistribution::getProbabilityWithStrategy(std::span va res[i] = detail::ZERO_DOUBLE; } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_prob = dist.probability_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast(x); - res[i] = (k >= cached_a && k <= cached_b) ? cached_prob : detail::ZERO_DOUBLE; - } else { - res[i] = detail::ZERO_DOUBLE; - } - }); }); } @@ -2527,51 +2354,6 @@ void DiscreteDistribution::getLogProbabilityWithStrategy(std::span res[i] = detail::NEGATIVE_INFINITY; } }); - }, - [](const DiscreteDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const double cached_log_prob = dist.logProbability_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (std::floor(x) == x && DiscreteDistribution::isValidIntegerValue(x)) { - const int k = static_cast(x); - if (k >= cached_a && k <= cached_b) { - res[i] = cached_is_binary ? 
-detail::LN2 : cached_log_prob; - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - } else { - res[i] = detail::NEGATIVE_INFINITY; - } - }); }); } @@ -2696,54 +2478,6 @@ void DiscreteDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(&dist)->updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const int cached_a = dist.a_; - const int cached_b = dist.b_; - const int cached_range = dist.range_; - const bool cached_is_binary = dist.isBinary_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < static_cast(cached_a)) { - res[i] = detail::ZERO_DOUBLE; - } else if (x >= static_cast(cached_b)) { - res[i] = detail::ONE; - } else { - const int k = static_cast(std::floor(x)); - if (cached_is_binary) { - res[i] = (k >= 0) ? detail::ONE : detail::ZERO_DOUBLE; - } else { - const int numerator = k - cached_a + detail::ONE_INT; - res[i] = static_cast(numerator) / static_cast(cached_range); - } - } - }); }); } diff --git a/src/exponential.cpp b/src/exponential.cpp index 7806524..aa8e9a5 100644 --- a/src/exponential.cpp +++ b/src/exponential.cpp @@ -1450,49 +1450,6 @@ void ExponentialDistribution::getProbability(std::span values, res[i] = cached_lambda * std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_lambda = dist.lambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = std::exp(-x); - } else { - res[i] = cached_lambda * std::exp(cached_neg_lambda * x); - } - }); }); } @@ -1623,49 +1580,6 @@ void ExponentialDistribution::getLogProbability(std::span values, res[i] = cached_log_lambda + cached_neg_lambda * x; } }); - }, - [](const ExponentialDistribution& dist, std::span vals, 
std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_log_lambda = dist.logLambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else if (cached_is_unit_rate) { - res[i] = -x; - } else { - res[i] = cached_log_lambda + cached_neg_lambda * x; - } - }); }); } @@ -1791,48 +1705,6 @@ void ExponentialDistribution::getCumulativeProbability(std::span v res[i] = detail::ONE - std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = detail::ONE - std::exp(-x); - } else { - res[i] = detail::ONE - std::exp(cached_neg_lambda * x); - } - }); }); } @@ -1950,49 +1822,6 @@ void ExponentialDistribution::getProbabilityWithStrategy(std::span res[i] = cached_lambda * std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock 
lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) access - const double cached_lambda = dist.lambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = std::exp(-x); - } else { - res[i] = cached_lambda * std::exp(cached_neg_lambda * x); - } - }); }); } @@ -2107,49 +1936,6 @@ void ExponentialDistribution::getLogProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) processing - const double cached_log_lambda = dist.logLambda_; - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else if (cached_is_unit_rate) { - res[i] = -x; - } else { - res[i] = cached_log_lambda + cached_neg_lambda * x; - } - }); }); } @@ -2259,48 +2045,6 @@ void ExponentialDistribution::getCumulativeProbabilityWithStrategy( res[i] = detail::ONE - std::exp(cached_neg_lambda * x); } }); - }, - [](const ExponentialDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: forwards to work-stealing until GPU implementation available - // NOTE: GPU acceleration not yet implemented - using work-stealing for optimal CPU - // performance - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated (work-stealing fallback) processing - const double cached_neg_lambda = dist.negLambda_; - const bool cached_is_unit_rate = dist.isUnitRate_; - lock.unlock(); - - // Use work-stealing pool for dynamic load balancing (GPU fallback) - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - 
const double x = vals[i]; - if (x < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else if (cached_is_unit_rate) { - res[i] = detail::ONE - std::exp(-x); - } else { - res[i] = detail::ONE - std::exp(cached_neg_lambda * x); - } - }); }); } diff --git a/src/gamma.cpp b/src/gamma.cpp index ff99dac..504df93 100644 --- a/src/gamma.cpp +++ b/src/gamma.cpp @@ -1579,50 +1579,6 @@ void GammaDistribution::getProbability(std::span values, std::span cached_beta * x - cached_log_gamma_alpha); } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // Cache-Aware lambda: For continuous distributions, caching is counterproductive - // Fallback to parallel execution which is faster and more predictable - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe parallel processing - [[maybe_unused]] const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - // This approach avoids the cache contention issues that caused performance regression - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = std::exp(cached_alpha_log_beta + cached_alpha_minus_one * std::log(x) - - cached_beta * x - cached_log_gamma_alpha); - } - }); }); } @@ -1750,47 +1706,6 @@ void GammaDistribution::getLogProbability(std::span values, std::s cached_alpha_minus_one * std::log(x) - cached_beta * x; } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else { - res[i] = cached_alpha_log_beta - cached_log_gamma_alpha + - cached_alpha_minus_one * std::log(x) - cached_beta * x; - } - }); }); } @@ -1911,44 
+1826,6 @@ void GammaDistribution::getCumulativeProbability(std::span values, res[i] = detail::gamma_p(cached_alpha, cached_beta * x); } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = detail::gamma_p(cached_alpha, cached_beta * x); - } - }); }); } @@ -2070,48 +1947,6 @@ void GammaDistribution::getProbabilityWithStrategy(std::span value cached_beta * x - cached_log_gamma_alpha); } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - [[maybe_unused]] const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = std::exp(cached_alpha_log_beta + cached_alpha_minus_one * std::log(x) - - cached_beta * x - cached_log_gamma_alpha); - } - }); }); } @@ -2229,48 +2064,6 @@ void GammaDistribution::getLogProbabilityWithStrategy(std::span va cached_alpha_minus_one * std::log(x) - cached_beta * x; } }); - }, - [](const GammaDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - 
const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - [[maybe_unused]] const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - const double cached_log_gamma_alpha = dist.logGammaAlpha_; - const double cached_alpha_log_beta = dist.alphaLogBeta_; - const double cached_alpha_minus_one = dist.alphaMinusOne_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::NEGATIVE_INFINITY; - } else { - res[i] = cached_alpha_log_beta - cached_log_gamma_alpha + - cached_alpha_minus_one * std::log(x) - cached_beta * x; - } - }); }); } @@ -2378,44 +2171,6 @@ void GammaDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_alpha = dist.alpha_; - const double cached_beta = dist.beta_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - } else { - res[i] = detail::gamma_p(cached_alpha, cached_beta * x); - } - }); }); } diff --git a/src/gaussian.cpp b/src/gaussian.cpp index c1a714f..9e8aac5 100644 --- a/src/gaussian.cpp +++ b/src/gaussian.cpp @@ -1908,48 +1908,6 @@ void GaussianDistribution::getProbability(std::span values, std::s res[i] = cached_norm_constant * std::exp(cached_neg_half_inv_var * sq_diff); } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_norm_constant = dist.normalizationConstant_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::INV_SQRT_2PI * std::exp(detail::NEG_HALF * 
sq_diff); - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = cached_norm_constant * std::exp(cached_neg_half_inv_var * sq_diff); - } - }); }); } @@ -2086,49 +2044,6 @@ void GaussianDistribution::getLogProbability(std::span values, cached_neg_half_inv_var * sq_diff; } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_log_std = dist.logStandardDeviation_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::NEG_HALF_LN_2PI + detail::NEG_HALF * sq_diff; - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = detail::NEG_HALF_LN_2PI - cached_log_std + - cached_neg_half_inv_var * sq_diff; - } - }); }); } @@ -2254,45 +2169,6 @@ void GaussianDistribution::getCumulativeProbability(std::span valu res[i] = detail::HALF * (detail::ONE + std::erf(normalized)); } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_sigma_sqrt2 = dist.sigmaSqrt2_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - res[i] = detail::HALF * (detail::ONE + std::erf(vals[i] * detail::INV_SQRT_2)); - } else { - const double normalized = (vals[i] - cached_mean) / cached_sigma_sqrt2; - res[i] = detail::HALF * (detail::ONE + std::erf(normalized)); - } - }); }); } @@ -2412,49 +2288,6 @@ void GaussianDistribution::getProbabilityWithStrategy(std::span va res[i] = cached_norm_constant * std::exp(cached_neg_half_inv_var * sq_diff); } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: 
should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_log_std = dist.logStandardDeviation_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::NEG_HALF_LN_2PI + detail::NEG_HALF * sq_diff; - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = detail::NEG_HALF_LN_2PI - cached_log_std + - cached_neg_half_inv_var * sq_diff; - } - }); }); } @@ -2575,49 +2408,6 @@ void GaussianDistribution::getLogProbabilityWithStrategy(std::span cached_neg_half_inv_var * sq_diff; } }); - }, - [](const GaussianDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_log_std = dist.logStandardDeviation_; - const double cached_neg_half_inv_var = dist.negHalfSigmaSquaredInv_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - const double sq_diff = vals[i] * vals[i]; - res[i] = detail::NEG_HALF_LN_2PI + detail::NEG_HALF * sq_diff; - } else { - const double diff = vals[i] - cached_mean; - const double sq_diff = diff * diff; - res[i] = detail::NEG_HALF_LN_2PI - cached_log_std + - cached_neg_half_inv_var * sq_diff; - } - }); }); } @@ -2728,45 +2518,6 @@ void GaussianDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); 
- } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated processing - const double cached_mean = dist.mean_; - const double cached_sigma_sqrt2 = dist.sigmaSqrt2_; - const bool cached_is_standard_normal = dist.isStandardNormal_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (cached_is_standard_normal) { - res[i] = detail::HALF * (detail::ONE + std::erf(vals[i] * detail::INV_SQRT_2)); - } else { - const double normalized = (vals[i] - cached_mean) / cached_sigma_sqrt2; - res[i] = detail::HALF * (detail::ONE + std::erf(normalized)); - } - }); }); } diff --git a/src/poisson.cpp b/src/poisson.cpp index 2b01b49..3cd0d97 100644 --- a/src/poisson.cpp +++ b/src/poisson.cpp @@ -1789,62 +1789,6 @@ void PoissonDistribution::getProbability(std::span values, std::sp return; } - if (k == 0) { - res[i] = cached_exp_neg_lambda; - } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && - k < static_cast(PoissonDistribution::FACTORIAL_CACHE.size())) { - res[i] = std::pow(cached_lambda, k) * cached_exp_neg_lambda / - PoissonDistribution::FACTORIAL_CACHE[static_cast(k)]; - } else { - double log_result = k * cached_log_lambda - cached_lambda - - PoissonDistribution::logFactorial(k); - res[i] = std::exp(log_result); - } - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - const double cached_exp_neg_lambda = dist.expNegLambda_; - [[maybe_unused]] const bool cached_is_small_lambda = dist.isSmallLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ZERO_DOUBLE; - return; - } - if (k == 0) { res[i] = cached_exp_neg_lambda; } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && @@ -1995,52 +1939,6 @@ void PoissonDistribution::getLogProbability(std::span values, return; } - // log P(X = k) = k * log(λ) - λ - log(k!) 
- res[i] = - k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - // log P(X = k) = k * log(λ) - λ - log(k!) res[i] = k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); @@ -2178,50 +2076,6 @@ void PoissonDistribution::getCumulativeProbability(std::span value return; } - // Use regularized incomplete gamma function: P(X ≤ k) = Q(k+1, λ) - res[i] = detail::gamma_q(k + 1, cached_lambda); - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: Use pool.parallelFor for GPU acceleration pattern - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe parallel processing - const double cached_lambda = dist.lambda_; - lock.unlock(); - - // Use pool.parallelFor for GPU acceleration pattern - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ONE; - return; - } - // Use regularized incomplete gamma function: P(X ≤ k) = Q(k+1, λ) res[i] = detail::gamma_q(k + 1, cached_lambda); }); @@ -2360,62 +2214,6 @@ void PoissonDistribution::getProbabilityWithStrategy(std::span val return; } - if (k == 0) { - res[i] = cached_exp_neg_lambda; - } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && - k < static_cast(PoissonDistribution::FACTORIAL_CACHE.size())) { - res[i] = std::pow(cached_lambda, k) * cached_exp_neg_lambda / - PoissonDistribution::FACTORIAL_CACHE[static_cast(k)]; - } else { - double log_result = k * cached_log_lambda - cached_lambda - - PoissonDistribution::logFactorial(k); - res[i] = std::exp(log_result); - } - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span 
res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - const double cached_exp_neg_lambda = dist.expNegLambda_; - [[maybe_unused]] const bool cached_is_small_lambda = dist.isSmallLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ZERO_DOUBLE; - return; - } - if (k == 0) { res[i] = cached_exp_neg_lambda; } else if (cached_lambda < detail::SMALL_LAMBDA_THRESHOLD && @@ -2545,52 +2343,6 @@ void PoissonDistribution::getLogProbabilityWithStrategy(std::span return; } - // log P(X = k) = k * log(λ) - λ - log(k!) - res[i] = - k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); - }); - }, - [](const PoissonDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - const double cached_log_lambda = dist.logLambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::MIN_LOG_PROBABILITY; - return; - } - // log P(X = k) = k * log(λ) - λ - log(k!) 
res[i] = k * cached_log_lambda - cached_lambda - PoissonDistribution::logFactorial(k); @@ -2708,50 +2460,6 @@ void PoissonDistribution::getCumulativeProbabilityWithStrategy(std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe GPU-accelerated access - const double cached_lambda = dist.lambda_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - if (vals[i] < detail::ZERO_DOUBLE) { - res[i] = detail::ZERO_DOUBLE; - return; - } - - int k = PoissonDistribution::roundToNonNegativeInt(vals[i]); - if (!PoissonDistribution::isValidCount(vals[i])) { - res[i] = detail::ONE; - return; - } - // Use regularized incomplete gamma function: P(X ≤ k) = Q(k+1, λ) res[i] = detail::gamma_q(k + 1, cached_lambda); }); diff --git a/src/student_t.cpp b/src/student_t.cpp index efb3fd7..32b1a86 100644 --- a/src/student_t.cpp +++ b/src/student_t.cpp @@ -478,23 +478,6 @@ void StudentTDistribution::getProbability(std::span values, std::s } } }, - [](const StudentTDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double lnc = dist.logNormConst_; - const double nhnpo = dist.negHalfNuPlusOne_; - const double inv_nu = dist.invNu_; - lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = std::exp(lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu)); - }); - }, [](const StudentTDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { if (vals.size() != res.size()) { @@ -562,23 +545,6 @@ void StudentTDistribution::getLogProbability(std::span values, } } }, - [](const StudentTDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double lnc = dist.logNormConst_; - const double nhnpo = dist.negHalfNuPlusOne_; - const double inv_nu = dist.invNu_; - lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu); - }); - }, [](const StudentTDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { if (vals.size() != res.size()) { @@ -627,20 +593,6 @@ void StudentTDistribution::getCumulativeProbability(std::span valu res[i] = detail::t_cdf(vals[i], cached_nu); } }, - [](const StudentTDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same 
size"); - } - const std::size_t count = vals.size(); - if (count == 0) - return; - std::shared_lock lock(dist.cache_mutex_); - const double cached_nu = dist.nu_; - lock.unlock(); - pool.parallelFor(std::size_t{0}, count, - [&](std::size_t i) { res[i] = detail::t_cdf(vals[i], cached_nu); }); - }, [](const StudentTDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { if (vals.size() != res.size()) { diff --git a/src/uniform.cpp b/src/uniform.cpp index 1abb12e..df2e649 100644 --- a/src/uniform.cpp +++ b/src/uniform.cpp @@ -1434,44 +1434,6 @@ void UniformDistribution::getProbability(std::span values, std::sp const double x = vals[i]; res[i] = (x >= cached_a && x <= cached_b) ? cached_inv_width : detail::ZERO_DOUBLE; }); - }, - [](const UniformDistribution& dist, std::span vals, std::span res, - WorkStealingPool& pool) { - // GPU-Accelerated lambda: Use work-stealing pool for optimal performance - // This replaces the previous cache-aware implementation that caused 100x performance - // regression - if (vals.size() != res.size()) { - throw std::invalid_argument("Input and output spans must have the same size"); - } - - const std::size_t count = vals.size(); - if (count == 0) - return; - - // Ensure cache is valid - std::shared_lock lock(dist.cache_mutex_); - if (!dist.cache_valid_) { - lock.unlock(); - std::unique_lock ulock(dist.cache_mutex_); - if (!dist.cache_valid_) { - const_cast(dist).updateCacheUnsafe(); - } - ulock.unlock(); - lock.lock(); - } - - // Cache parameters for thread-safe parallel access - const double cached_a = dist.a_; - const double cached_b = dist.b_; - const double cached_inv_width = dist.invWidth_; - lock.unlock(); - - // Use work-stealing pool for optimal load balancing and performance - // This approach avoids the cache contention issues that caused performance regression - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - res[i] = (x >= cached_a && x <= cached_b) ? 
diff --git a/src/uniform.cpp b/src/uniform.cpp
index 1abb12e..df2e649 100644
--- a/src/uniform.cpp
+++ b/src/uniform.cpp
@@ -1434,44 +1434,6 @@ void UniformDistribution::getProbability(std::span values, std::sp
             const double x = vals[i];
             res[i] = (x >= cached_a && x <= cached_b) ? cached_inv_width : detail::ZERO_DOUBLE;
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: Use work-stealing pool for optimal performance
-        // This replaces the previous cache-aware implementation that caused 100x performance
-        // regression
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe parallel access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        // This approach avoids the cache contention issues that caused performance regression
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            res[i] = (x >= cached_a && x <= cached_b) ?
-                cached_inv_width : detail::ZERO_DOUBLE;
-        });
     });
 }
@@ -1597,48 +1559,6 @@ void UniformDistribution::getLogProbability(std::span values,
                 res[i] = cached_log_inv_width;
             }
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated processing
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_log_inv_width = -std::log(dist.width_);
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            if (x < cached_a || x > cached_b) {
-                res[i] = detail::NEGATIVE_INFINITY;
-            } else if (cached_is_unit_interval) {
-                res[i] = detail::ZERO_DOUBLE;  // log(1) = 0
-            } else {
-                res[i] = cached_log_inv_width;
-            }
-        });
     });
 }
@@ -1772,50 +1692,6 @@ void UniformDistribution::getCumulativeProbability(std::span value
                 res[i] = (x - cached_a) * cached_inv_width;
             }
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated processing
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            if (x < cached_a) {
-                res[i] = detail::ZERO_DOUBLE;
-            } else if (x > cached_b) {
-                res[i] = detail::ONE;
-            } else if (cached_is_unit_interval) {
-                res[i] = x;  // CDF(x) = x for U(0,1)
-            } else {
-                res[i] = (x - cached_a) * cached_inv_width;
-            }
-        });
     });
 }
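Note: every removed lambda above repeats the same shared-mutex, double-checked cache refresh before snapshotting parameters into locals. A minimal standalone sketch of that pattern; the struct and member names below are illustrative stand-ins, not the library's types:

    // Sketch: double-checked cache validation under std::shared_mutex, then a
    // parameter snapshot taken while holding only the shared lock.
    #include <shared_mutex>

    struct UniformCacheSketch {
        mutable std::shared_mutex mutex;
        mutable bool valid = false;
        mutable double inv_width = 0.0;
        double a = 0.0, b = 1.0;

        void refreshUnsafe() const {  // caller must hold the unique lock
            inv_width = 1.0 / (b - a);
            valid = true;
        }

        double snapshotInvWidth() const {
            std::shared_lock lock(mutex);
            if (!valid) {
                lock.unlock();
                std::unique_lock ulock(mutex);
                if (!valid)  // re-check: another thread may have refreshed
                    refreshUnsafe();
                ulock.unlock();
                lock.lock();
            }
            return inv_width;  // copied out under the shared lock
        }
    };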
@@ -1914,41 +1790,6 @@ void UniformDistribution::getProbabilityWithStrategy(std::span val
             res[i] = (vals[i] >= cached_a && vals[i] <= cached_b) ?
                 cached_inv_width : detail::ZERO_DOUBLE;
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            res[i] = (vals[i] >= cached_a && vals[i] <= cached_b) ? cached_inv_width
-                                                                  : detail::ZERO_DOUBLE;
-        });
     });
 }
@@ -2055,47 +1896,6 @@ void UniformDistribution::getLogProbabilityWithStrategy(std::span
                 res[i] = cached_log_inv_width;
             }
         });
-    },
-    [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_log_inv_width = -std::log(dist.width_);
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            if (vals[i] < cached_a || vals[i] > cached_b) {
-                res[i] = detail::NEGATIVE_INFINITY;
-            } else if (cached_is_unit_interval) {
-                res[i] = detail::ZERO_DOUBLE;  // log(1) = 0
-            } else {
-                res[i] = cached_log_inv_width;
-            }
-        });
     });
 }
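Note: these removed log-probability lambdas recomputed -std::log(dist.width_) on every call instead of reading a cached member. The branch ladder itself is easy to state standalone; a sketch with illustrative names, mirroring the deleted logic:

    // Sketch: uniform log-pdf branch ladder from the removed lambdas;
    // the unit-interval case short-circuits to log(1) = 0.
    #include <cmath>
    #include <limits>

    double uniform_log_pdf(double x, double a, double b) {
        if (x < a || x > b)
            return -std::numeric_limits<double>::infinity();  // detail::NEGATIVE_INFINITY
        if (a == 0.0 && b == 1.0)
            return 0.0;                                       // log(1) = 0 on U(0,1)
        return -std::log(b - a);                              // log(1 / width)
    }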
@@ -2208,50 +2008,6 @@ void UniformDistribution::getCumulativeProbabilityWithStrategy(std::span
     [](const UniformDistribution& dist, std::span<const double> vals, std::span<double> res,
-       WorkStealingPool& pool) {
-        // GPU-Accelerated lambda: should use pool.parallelFor for dynamic load balancing
-        if (vals.size() != res.size()) {
-            throw std::invalid_argument("Input and output spans must have the same size");
-        }
-
-        const std::size_t count = vals.size();
-        if (count == 0)
-            return;
-
-        // Ensure cache is valid
-        std::shared_lock lock(dist.cache_mutex_);
-        if (!dist.cache_valid_) {
-            lock.unlock();
-            std::unique_lock ulock(dist.cache_mutex_);
-            if (!dist.cache_valid_) {
-                const_cast<UniformDistribution&>(dist).updateCacheUnsafe();
-            }
-            ulock.unlock();
-            lock.lock();
-        }
-
-        // Cache parameters for thread-safe GPU-accelerated access
-        const double cached_a = dist.a_;
-        const double cached_b = dist.b_;
-        const double cached_inv_width = dist.invWidth_;
-        const bool cached_is_unit_interval = dist.isUnitInterval_;
-        lock.unlock();
-
-        // Use work-stealing pool for optimal load balancing and performance
-        pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) {
-            const double x = vals[i];
-            if (x < cached_a) {
-                res[i] = detail::ZERO_DOUBLE;
-            } else if (x > cached_b) {
-                res[i] = detail::ONE;
-            } else if (cached_is_unit_interval) {
-                res[i] = x;  // CDF(x) = x for U(0,1)
-            } else {
-                res[i] = (x - cached_a) * cached_inv_width;
-            }
-        });
     });
 }
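Note: the CDF branch ladder these removed lambdas shared survives unchanged in the work-stealing path. For completeness, a standalone reference sketch with illustrative names:

    // Sketch: uniform CDF branch ladder mirrored from the removed lambdas.
    double uniform_cdf(double x, double a, double b) {
        if (x < a)
            return 0.0;               // detail::ZERO_DOUBLE
        if (x > b)
            return 1.0;               // detail::ONE
        if (a == 0.0 && b == 1.0)
            return x;                 // CDF(x) = x for U(0,1)
        return (x - a) / (b - a);     // (x - a) * inv_width
    }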