diff --git a/ANSWER.md b/ANSWER.md index 83349d8..dd62865 100644 --- a/ANSWER.md +++ b/ANSWER.md @@ -1,23 +1,103 @@ # 改进前 ``` -这里贴改进前的运行结果。 -matrix_randomize: 100s +t=0: n=1120 +matrix_randomize: 1640300ns +matrix_randomize: 905350ns +matrix_transpose: 2858292ns +matrix_multiply: 124426955ns +matrix_multiply: 122302470ns +matrix_RtAR: 249622821ns +matrix_trace: 6950ns +1.75932e+08 +test_func: 261158589ns +t=1: n=928 +matrix_randomize: 219398ns +matrix_randomize: 216040ns +matrix_transpose: 1480696ns +matrix_multiply: 69361807ns +matrix_multiply: 69488155ns +matrix_RtAR: 140357708ns +matrix_trace: 4406ns +1.00156e+08 +test_func: 143469454ns +t=2: n=1024 +matrix_randomize: 387866ns +matrix_randomize: 382464ns +matrix_transpose: 2237437ns +matrix_multiply: 170670256ns +matrix_multiply: 170944211ns +matrix_RtAR: 343932593ns +matrix_trace: 6656ns +1.34324e+08 +test_func: 348436713ns +t=3: n=1056 +matrix_randomize: 278548ns +matrix_randomize: 303162ns +matrix_transpose: 2244654ns +matrix_multiply: 102034204ns +matrix_multiply: 101822940ns +matrix_RtAR: 206129992ns +matrix_trace: 5291ns +1.47405e+08 +test_func: 210509771ns +overall: 965910915ns ``` # 改进后 +940,043,114 +122,873,140 ``` -这里贴改进后的运行结果。 -matrix_randomize: 0.01s +t=0: n=1120 +matrix_randomize: 1149905ns +matrix_randomize: 194806ns +matrix_transpose: 3380612ns +matrix_multiply: 22715600ns +matrix_multiply: 16198907ns +matrix_RtAR: 42313253ns +matrix_trace: 4655ns +1.75932e+08 +test_func: 53439795ns +t=1: n=928 +matrix_randomize: 120288ns +matrix_randomize: 104550ns +matrix_transpose: 1505999ns +matrix_multiply: 9278394ns +matrix_multiply: 9541630ns +matrix_RtAR: 20339630ns +matrix_trace: 3784ns +1.00156e+08 +test_func: 23224326ns +t=2: n=1024 +matrix_randomize: 132620ns +matrix_randomize: 129305ns +matrix_transpose: 2188643ns +matrix_multiply: 14148323ns +matrix_multiply: 13984732ns +matrix_RtAR: 30337879ns +matrix_trace: 3876ns +1.34324e+08 +test_func: 32501406ns +t=3: n=1056 +matrix_randomize: 132218ns +matrix_randomize: 133312ns +matrix_transpose: 2499468ns +matrix_multiply: 13799665ns +matrix_multiply: 13568507ns +matrix_RtAR: 29882378ns +matrix_trace: 4531ns +1.47405e+08 +test_func: 34571786ns +overall: 144688938ns ``` # 加速比 -matrix_randomize: 10000x -matrix_transpose: 10000x -matrix_multiply: 10000x -matrix_RtAR: 10000x +matrix_randomize: 2.0663 +matrix_transpose: 0.9212 +matrix_multiply: 8.2222 +matrix_RtAR: 7.6505 > 如果记录了多种优化方法,可以做表格比较 @@ -26,21 +106,44 @@ matrix_RtAR: 10000x 下面这些函数你是如何优化的?是什么思路?用了老师上课的哪个知识点? > matrix_randomize +> 这个循环为什么不够高效?如何优化? 10 分 +> 跳跃访存,缓存利用率低下 +> 优化: +> 1.改变xy序为yx序,使得访存连续,利用空间局域性 +> 2.使用直写指令,降低一倍访存带宽 请回答。 > matrix_transpose +>这个循环为什么不够高效?如何优化? 15 分 +> 矩阵转置这种访存读写一方一定存在一个是跳跃访存的,导致缓存利用率低 +>优化: +> 1. 先对矩阵进行分块,然后访问块的顺序按照莫顿序 +> 2. 直写 请回答。 > matrix_multiply +>这个循环为什么不够高效?如何优化? 15 分 +> 程序是mem-bound,三个矩阵的访存中 +> out(x, y)顺序访问,高效; +> lhs(x, t)存储是x主序,但是按t主序访问,跨步访问,低效; +> rhs(t, y)访问同一个地址,一般 +> 缓存(L1)大小不够存下三个矩阵,命中率低 +> 优化: +> 1. 循环分块,这样缓存可以放下一个块的大小,提高缓存命中率 +> 2. 寄存器分块,指令级并行 +> 3. 循环展开 请回答。 > matrix_RtAR +> 这两个是临时变量,有什么可以优化的? 5 分 +> 1. 可以省去Rt,使用RtAR代替 +> 2. 使用static,手动池化 请回答。 # 我的创新点 -如果有,请说明。 +对于分块的行或列大小不是2的倍数的情况,使用莫顿序需要判断越界。创新之处在于负优化。 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d76276..4030afd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,8 @@ project(hellocmake LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) #if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) + +set(CMAKE_CXX_FLAGS "-march=native -g") #endif() add_executable(main main.cpp) diff --git a/main.cpp b/main.cpp index d5af053..995b03c 100644 --- a/main.cpp +++ b/main.cpp @@ -8,11 +8,13 @@ // 并行可以用 OpenMP 也可以用 TBB #include -//#include // _mm 系列指令都来自这个头文件 -//#include // 如果上面那个不行,试试这个 +// #include // _mm 系列指令都来自这个头文件 +#include // 如果上面那个不行,试试这个 #include "ndarray.h" #include "wangsrng.h" #include "ticktock.h" +#include "morton.h" +#include "mtprint.h" // Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x] using Matrix = ndarray<2, float>; @@ -23,12 +25,26 @@ static void matrix_randomize(Matrix &out) { size_t nx = out.shape(0); size_t ny = out.shape(1); - // 这个循环为什么不够高效?如何优化? 10 分 +/* + 这个循环为什么不够高效?如何优化? 10 分 + 跳跃访存,缓存利用率低下 + 1.改变xy序为yx序 + 2.使用直写指令,降低一倍访存带宽 +*/ #pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - float val = wangsrng(x, y).next_float(); - out(x, y) = val; + for (size_t y = 0; y < ny; y++) { + for (size_t xBase = 0; xBase < nx; xBase+=16) { + float res[16]; +// #pragma GCC unroll 16 + for (size_t x=0;x<16;x++){ + res[x] = wangsrng(xBase+x, y).next_float(); + // out(x, y) = val; + } + // for (int x=0;x<16;x++){ + // _mm_stream_si32(reinterpret_cast(&out(x+xBase,y)),*(reinterpret_cast(&res[x]))); + // } + _mm512_stream_ps(&out(xBase,y),_mm512_load_ps(res)); + } } TOCK(matrix_randomize); @@ -40,13 +56,40 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { size_t ny = in.shape(1); out.reshape(ny, nx); - // 这个循环为什么不够高效?如何优化? 15 分 -#pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - out(y, x) = in(x, y); +/* + 这个循环为什么不够高效?如何优化? 15 分 + 矩阵转置这种访存读写一方一定存在一个是跳跃访存的,导致缓存利用率低 +*/ +//原版代码 +// #pragma omp parallel for collapse(2) +// for (int x = 0; x < nx; x++) { +// for (int y = 0; y < ny; y++) { +// out(y, x) = in(x, y); +// } +// } + +//莫顿码顺序访问,但是由于分块的大小不是2的幂,需要判断越界 + int blockSize=32; + int xdBlock=nx/blockSize; + int ydBlock=ny/blockSize; +#pragma omp parallel for + for (int k = 0; k < morton2d::highestOneBit(xdBlock)*morton2d::highestOneBit(ydBlock); k++){ + auto [xBase, yBase] = morton2d::decode(k); + // mtprint(xdBlock,morton2d::highestOneBit(xdBlock)); + if (xBase>=xdBlock || yBase>=ydBlock){ + continue; + } + xBase*=blockSize; + yBase*=blockSize; + for (int x=xBase;xresize()全部初始化为0了 + // mrhs=_mm_set1_ps(rhs(t,y)); +#pragma GCC unroll 32 + for (int x = xBase*blockSize; x < (xBase+1)*blockSize; x++) { + out(x, y) += lhs(x, t) * rhs(t, y); + // mout=_mm_load_ps(&out(x,y)); + // mlhs=_mm_load_ps(&lhs(x,t)); + // mout=_mm_add_ps(mout,_mm_mul_ps(mlhs,mrhs)); + // _mm_store_ps(&out(x,y),mout); + } + } + } } } } + TOCK(matrix_multiply); } @@ -78,9 +145,11 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) { static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) { TICK(matrix_RtAR); // 这两个是临时变量,有什么可以优化的? 5 分 - Matrix Rt, RtA; - matrix_transpose(Rt, R); - matrix_multiply(RtA, Rt, A); + // 1. 可以省去Rt,使用RtAR代替 + // 2. 使用static关键字,手动池化 + static thread_local Matrix RtA; + matrix_transpose(RtAR, R); + matrix_multiply(RtA, RtAR, A); matrix_multiply(RtAR, RtA, R); TOCK(matrix_RtAR); } diff --git a/morton.h b/morton.h index 6beaf40..e215329 100644 --- a/morton.h +++ b/morton.h @@ -36,6 +36,19 @@ constexpr static std::tuple decode(uint64_t d) return {decode1(d), decode1(d >> 1)}; } +constexpr static uint64_t highestOneBit(uint64_t x){ + x-=1; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x |= x >> 32; + x |= x >> 64; + x += 1; + return x; +} + } namespace morton3d { diff --git a/score.txt b/score.txt index ea225b6..41dd3b2 100644 --- a/score.txt +++ b/score.txt @@ -1,43 +1,86 @@ # 原始代码 t=0: n=1120 -matrix_randomize: 0.000928323s -matrix_randomize: 0.000687871s -matrix_transpose: 0.00252768s -matrix_multiply: 0.904947s -matrix_multiply: 0.901407s -matrix_RtAR: 1.80908s -matrix_trace: 6.026e-06s +matrix_randomize: 1640300ns +matrix_randomize: 905350ns +matrix_transpose: 2858292ns +matrix_multiply: 124426955ns +matrix_multiply: 122302470ns +matrix_RtAR: 249622821ns +matrix_trace: 6950ns 1.75932e+08 -test_func: 1.81493s +test_func: 261158589ns t=1: n=928 -matrix_randomize: 0.000416769s -matrix_randomize: 0.0015025s -matrix_transpose: 0.00151582s -matrix_multiply: 0.301254s -matrix_multiply: 0.28622s -matrix_RtAR: 0.589364s -matrix_trace: 5.466e-06s +matrix_randomize: 219398ns +matrix_randomize: 216040ns +matrix_transpose: 1480696ns +matrix_multiply: 69361807ns +matrix_multiply: 69488155ns +matrix_RtAR: 140357708ns +matrix_trace: 4406ns 1.00156e+08 -test_func: 0.594152s +test_func: 143469454ns t=2: n=1024 -matrix_randomize: 0.000563016s -matrix_randomize: 0.000558918s -matrix_transpose: 0.00226996s -matrix_multiply: 0.709763s -matrix_multiply: 0.71093s -matrix_RtAR: 1.42356s -matrix_trace: 5.926e-06s +matrix_randomize: 387866ns +matrix_randomize: 382464ns +matrix_transpose: 2237437ns +matrix_multiply: 170670256ns +matrix_multiply: 170944211ns +matrix_RtAR: 343932593ns +matrix_trace: 6656ns 1.34324e+08 -test_func: 1.42708s +test_func: 348436713ns t=3: n=1056 -matrix_randomize: 0.000606057s -matrix_randomize: 0.000610583s -matrix_transpose: 0.00212937s -matrix_multiply: 0.753805s -matrix_multiply: 0.758014s -matrix_RtAR: 1.51414s -matrix_trace: 0.000139467s +matrix_randomize: 278548ns +matrix_randomize: 303162ns +matrix_transpose: 2244654ns +matrix_multiply: 102034204ns +matrix_multiply: 101822940ns +matrix_RtAR: 206129992ns +matrix_trace: 5291ns 1.47405e+08 -test_func: 1.5189s -overall: 5.35657s +test_func: 210509771ns +overall: 965910915ns + +#优化后 +t=0: n=1120 +matrix_randomize: 1149905ns +matrix_randomize: 194806ns +matrix_transpose: 3380612ns +matrix_multiply: 22715600ns +matrix_multiply: 16198907ns +matrix_RtAR: 42313253ns +matrix_trace: 4655ns +1.75932e+08 +test_func: 53439795ns +t=1: n=928 +matrix_randomize: 120288ns +matrix_randomize: 104550ns +matrix_transpose: 1505999ns +matrix_multiply: 9278394ns +matrix_multiply: 9541630ns +matrix_RtAR: 20339630ns +matrix_trace: 3784ns +1.00156e+08 +test_func: 23224326ns +t=2: n=1024 +matrix_randomize: 132620ns +matrix_randomize: 129305ns +matrix_transpose: 2188643ns +matrix_multiply: 14148323ns +matrix_multiply: 13984732ns +matrix_RtAR: 30337879ns +matrix_trace: 3876ns +1.34324e+08 +test_func: 32501406ns +t=3: n=1056 +matrix_randomize: 132218ns +matrix_randomize: 133312ns +matrix_transpose: 2499468ns +matrix_multiply: 13799665ns +matrix_multiply: 13568507ns +matrix_RtAR: 29882378ns +matrix_trace: 4531ns +1.47405e+08 +test_func: 34571786ns +overall: 144688938ns \ No newline at end of file diff --git a/ticktock.h b/ticktock.h index 93c7a6d..21d1876 100644 --- a/ticktock.h +++ b/ticktock.h @@ -2,7 +2,7 @@ #include #define TICK(x) auto bench_##x = std::chrono::steady_clock::now(); -#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast>(std::chrono::steady_clock::now() - bench_##x).count() << "s" << std::endl; +#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast(std::chrono::steady_clock::now() - bench_##x).count() << "ns" << std::endl; //#include //#define TICK(x) auto bench_##x = tbb::tick_count::now();