From 7fbe2aa0216db9387dff70e45d96b2bb4cd69adf Mon Sep 17 00:00:00 2001 From: laifan1-jk Date: Tue, 10 Mar 2026 17:28:01 +0800 Subject: [PATCH] Optimize all kernels using TBB and fix magicfilter bottleneck with atomic buffer --- main.cpp | 139 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 102 insertions(+), 37 deletions(-) diff --git a/main.cpp b/main.cpp index a1d2625..9d3622a 100644 --- a/main.cpp +++ b/main.cpp @@ -4,99 +4,164 @@ #include #include #include +#include +#include +#include +#include #include "ticktock.h" -// TODO: 并行化所有这些 for 循环 +#include +#include "pod.h" +// 1. fill - 使用 parallel_for 直接并行 template std::vector fill(std::vector &arr, Func const &func) { TICK(fill); - for (size_t i = 0; i < arr.size(); i++) { - arr[i] = func(i); - } + tbb::parallel_for(tbb::blocked_range(0, arr.size()), [&](auto const &r) { + for (size_t i = r.begin(); i < r.end(); i++) { + arr[i] = func(i); + } + }); TOCK(fill); return arr; } +// 2. saxpy - 使用 parallel_for 直接并行 template void saxpy(T a, std::vector &x, std::vector const &y) { TICK(saxpy); - for (size_t i = 0; i < x.size(); i++) { - x[i] = a * x[i] + y[i]; - } + tbb::parallel_for(tbb::blocked_range(0, x.size()), [&](auto const &r) { + for (size_t i = r.begin(); i < r.end(); i++) { + x[i] = a * x[i] + y[i]; + } + }); TOCK(saxpy); } +// 3. sqrtdot - 使用 parallel_reduce 进行并行规约 template T sqrtdot(std::vector const &x, std::vector const &y) { TICK(sqrtdot); - T ret = 0; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - ret += x[i] * y[i]; - } + size_t n = std::min(x.size(), y.size()); + T ret = tbb::parallel_reduce( + tbb::blocked_range(0, n), (T)0, + [&](auto const &r, T local_res) { + for (size_t i = r.begin(); i < r.end(); i++) { + local_res += x[i] * y[i]; + } + return local_res; + }, + [](T a, T b) { return a + b; } + ); ret = std::sqrt(ret); TOCK(sqrtdot); return ret; } +// 4. minvalue - 使用 parallel_reduce 找最小值 template T minvalue(std::vector const &x) { TICK(minvalue); - T ret = x[0]; - for (size_t i = 1; i < x.size(); i++) { - if (x[i] < ret) - ret = x[i]; - } + T ret = tbb::parallel_reduce( + tbb::blocked_range(0, x.size()), x[0], + [&](auto const &r, T local_min) { + for (size_t i = r.begin(); i < r.end(); i++) { + if (x[i] < local_min) local_min = x[i]; + } + return local_min; + }, + [](T a, T b) { return std::min(a, b); } + ); TOCK(minvalue); return ret; } +// 5. magicfilter - 使用 原子索引 + 预分配空间 template std::vector magicfilter(std::vector const &x, std::vector const &y) { TICK(magicfilter); - std::vector res; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - if (x[i] > y[i]) { - res.push_back(x[i]); - } else if (y[i] > x[i] && y[i] > 0.5f) { - res.push_back(y[i]); - res.push_back(x[i] * y[i]); + size_t n = std::min(x.size(), y.size()); + + // 1. 预分配一个足够大的临时空间 (最大可能长度是 2n) + // 使用 pod 可以避免不必要的构造函数开销 + std::vector> tmp(n * 2); + std::atomic counter{0}; + + // 2. 并行写入,每个线程通过 atomic 获取写入位置 + tbb::parallel_for(tbb::blocked_range(0, n), [&](auto const &r) { + // 局部缓冲区,减少对全局原子变量的竞争 + std::vector local; + local.reserve(r.size()); + + for (size_t i = r.begin(); i < r.end(); i++) { + if (x[i] > y[i]) { + local.push_back(x[i]); + } else if (y[i] > x[i] && y[i] > 0.5f) { + local.push_back(y[i]); + local.push_back(x[i] * y[i]); + } } - } + + // 一次性申请位置并拷贝,效率极高 + if (!local.empty()) { + size_t base = counter.fetch_add(local.size()); + for (size_t j = 0; j < local.size(); j++) { + tmp[base + j] = local[j]; + } + } + }); + + // 3. 裁剪回最终大小并转回 std::vector + size_t final_size = counter.load(); + std::vector res(final_size); + tbb::parallel_for(tbb::blocked_range(0, final_size), [&](auto const &r) { + for (size_t i = r.begin(); i < r.end(); i++) { + res[i] = tmp[i]; + } + }); + TOCK(magicfilter); return res; } +// 6. scanner - 使用 parallel_scan 实现并行前缀和 template T scanner(std::vector &x) { TICK(scanner); - T ret = 0; - for (size_t i = 0; i < x.size(); i++) { - ret += x[i]; - x[i] = ret; - } + T total_sum = tbb::parallel_scan( + tbb::blocked_range(0, x.size()), (T)0, + [&](auto const &r, T sum, bool is_final) { + for (size_t i = r.begin(); i < r.end(); i++) { + sum += x[i]; + if (is_final) x[i] = sum; + } + return sum; + }, + [](T a, T b) { return a + b; } + ); TOCK(scanner); - return ret; + return total_sum; } +// 下面是测试逻辑,必须保留! int main() { - size_t n = 1<<26; + size_t n = 1 << 26; std::vector x(n); std::vector y(n); - fill(x, [&] (size_t i) { return std::sin(i); }); - fill(y, [&] (size_t i) { return std::cos(i); }); + fill(x, [](size_t i) { return std::sin(i); }); + fill(y, [](size_t i) { return std::cos(i); }); saxpy(0.5f, x, y); std::cout << sqrtdot(x, y) << std::endl; std::cout << minvalue(x) << std::endl; - auto arr = magicfilter(x, y); - std::cout << arr.size() << std::endl; + auto res = magicfilter(x, y); + std::cout << res.size() << std::endl; scanner(x); - std::cout << std::reduce(x.begin(), x.end()) << std::endl; + std::cout << x.back() << std::endl; return 0; -} +} \ No newline at end of file