Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 112 additions & 9 deletions ANSWER.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,103 @@
# 改进前

```
这里贴改进前的运行结果。
matrix_randomize: 100s
t=0: n=1120
matrix_randomize: 1640300ns
matrix_randomize: 905350ns
matrix_transpose: 2858292ns
matrix_multiply: 124426955ns
matrix_multiply: 122302470ns
matrix_RtAR: 249622821ns
matrix_trace: 6950ns
1.75932e+08
test_func: 261158589ns
t=1: n=928
matrix_randomize: 219398ns
matrix_randomize: 216040ns
matrix_transpose: 1480696ns
matrix_multiply: 69361807ns
matrix_multiply: 69488155ns
matrix_RtAR: 140357708ns
matrix_trace: 4406ns
1.00156e+08
test_func: 143469454ns
t=2: n=1024
matrix_randomize: 387866ns
matrix_randomize: 382464ns
matrix_transpose: 2237437ns
matrix_multiply: 170670256ns
matrix_multiply: 170944211ns
matrix_RtAR: 343932593ns
matrix_trace: 6656ns
1.34324e+08
test_func: 348436713ns
t=3: n=1056
matrix_randomize: 278548ns
matrix_randomize: 303162ns
matrix_transpose: 2244654ns
matrix_multiply: 102034204ns
matrix_multiply: 101822940ns
matrix_RtAR: 206129992ns
matrix_trace: 5291ns
1.47405e+08
test_func: 210509771ns
overall: 965910915ns
```

# 改进后

940,043,114
122,873,140
```
这里贴改进后的运行结果。
matrix_randomize: 0.01s
t=0: n=1120
matrix_randomize: 1149905ns
matrix_randomize: 194806ns
matrix_transpose: 3380612ns
matrix_multiply: 22715600ns
matrix_multiply: 16198907ns
matrix_RtAR: 42313253ns
matrix_trace: 4655ns
1.75932e+08
test_func: 53439795ns
t=1: n=928
matrix_randomize: 120288ns
matrix_randomize: 104550ns
matrix_transpose: 1505999ns
matrix_multiply: 9278394ns
matrix_multiply: 9541630ns
matrix_RtAR: 20339630ns
matrix_trace: 3784ns
1.00156e+08
test_func: 23224326ns
t=2: n=1024
matrix_randomize: 132620ns
matrix_randomize: 129305ns
matrix_transpose: 2188643ns
matrix_multiply: 14148323ns
matrix_multiply: 13984732ns
matrix_RtAR: 30337879ns
matrix_trace: 3876ns
1.34324e+08
test_func: 32501406ns
t=3: n=1056
matrix_randomize: 132218ns
matrix_randomize: 133312ns
matrix_transpose: 2499468ns
matrix_multiply: 13799665ns
matrix_multiply: 13568507ns
matrix_RtAR: 29882378ns
matrix_trace: 4531ns
1.47405e+08
test_func: 34571786ns
overall: 144688938ns
```

# 加速比

matrix_randomize: 10000x
matrix_transpose: 10000x
matrix_multiply: 10000x
matrix_RtAR: 10000x
matrix_randomize: 2.0663
matrix_transpose: 0.9212
matrix_multiply: 8.2222
matrix_RtAR: 7.6505

> 如果记录了多种优化方法,可以做表格比较

Expand All @@ -26,21 +106,44 @@ matrix_RtAR: 10000x
下面这些函数你是如何优化的?是什么思路?用了老师上课的哪个知识点?

> matrix_randomize
> 这个循环为什么不够高效?如何优化? 10 分
> 跳跃访存,缓存利用率低下
> 优化:
> 1.改变xy序为yx序,使得访存连续,利用空间局域性
> 2.使用直写指令,降低一倍访存带宽

请回答。

> matrix_transpose
>这个循环为什么不够高效?如何优化? 15 分
> 矩阵转置这种访存读写一方一定存在一个是跳跃访存的,导致缓存利用率低
>优化:
> 1. 先对矩阵进行分块,然后访问块的顺序按照莫顿序
> 2. 直写

请回答。

> matrix_multiply
>这个循环为什么不够高效?如何优化? 15 分
> 程序是mem-bound,三个矩阵的访存中
> out(x, y)顺序访问,高效;
> lhs(x, t)存储是x主序,但是按t主序访问,跨步访问,低效;
> rhs(t, y)访问同一个地址,一般
> 缓存(L1)大小不够存下三个矩阵,命中率低
> 优化:
> 1. 循环分块,这样缓存可以放下一个块的大小,提高缓存命中率
> 2. 寄存器分块,指令级并行
> 3. 循环展开

请回答。

> matrix_RtAR
> 这两个是临时变量,有什么可以优化的? 5 分
> 1. 可以省去Rt,使用RtAR代替
> 2. 使用static,手动池化

请回答。

# 我的创新点

如果有,请说明
对于分块的行或列大小不是2的倍数的情况,使用莫顿序需要判断越界。创新之处在于负优化
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ project(hellocmake LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
#if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)

set(CMAKE_CXX_FLAGS "-march=native -g")
#endif()

add_executable(main main.cpp)
Expand Down
115 changes: 92 additions & 23 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
// 并行可以用 OpenMP 也可以用 TBB

#include <iostream>
//#include <x86intrin.h> // _mm 系列指令都来自这个头文件
//#include <xmmintrin.h> // 如果上面那个不行,试试这个
// #include <x86intrin.h> // _mm 系列指令都来自这个头文件
#include <immintrin.h> // 如果上面那个不行,试试这个
#include "ndarray.h"
#include "wangsrng.h"
#include "ticktock.h"
#include "morton.h"
#include "mtprint.h"

// Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x]
using Matrix = ndarray<2, float>;
Expand All @@ -23,12 +25,26 @@ static void matrix_randomize(Matrix &out) {
size_t nx = out.shape(0);
size_t ny = out.shape(1);

// 这个循环为什么不够高效?如何优化? 10 分
/*
这个循环为什么不够高效?如何优化? 10 分
跳跃访存,缓存利用率低下
1.改变xy序为yx序
2.使用直写指令,降低一倍访存带宽
*/
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
for (size_t y = 0; y < ny; y++) {
for (size_t xBase = 0; xBase < nx; xBase+=16) {
float res[16];
// #pragma GCC unroll 16
for (size_t x=0;x<16;x++){
res[x] = wangsrng(xBase+x, y).next_float();
// out(x, y) = val;
}
// for (int x=0;x<16;x++){
// _mm_stream_si32(reinterpret_cast<int *>(&out(x+xBase,y)),*(reinterpret_cast<int *>(&res[x])));
// }
_mm512_stream_ps(&out(xBase,y),_mm512_load_ps(res));

}
}
TOCK(matrix_randomize);
Expand All @@ -40,13 +56,40 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
size_t ny = in.shape(1);
out.reshape(ny, nx);

// 这个循环为什么不够高效?如何优化? 15 分
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
out(y, x) = in(x, y);
/*
这个循环为什么不够高效?如何优化? 15 分
矩阵转置这种访存读写一方一定存在一个是跳跃访存的,导致缓存利用率低
*/
//原版代码
// #pragma omp parallel for collapse(2)
// for (int x = 0; x < nx; x++) {
// for (int y = 0; y < ny; y++) {
// out(y, x) = in(x, y);
// }
// }

//莫顿码顺序访问,但是由于分块的大小不是2的幂,需要判断越界
int blockSize=32;
int xdBlock=nx/blockSize;
int ydBlock=ny/blockSize;
#pragma omp parallel for
for (int k = 0; k < morton2d::highestOneBit(xdBlock)*morton2d::highestOneBit(ydBlock); k++){
auto [xBase, yBase] = morton2d::decode(k);
// mtprint(xdBlock,morton2d::highestOneBit(xdBlock));
if (xBase>=xdBlock || yBase>=ydBlock){
continue;
}
xBase*=blockSize;
yBase*=blockSize;
for (int x=xBase;x<xBase+blockSize;x++){
for(int y=yBase;y<yBase+blockSize;y++){
// 用stream指令反而变慢了,不知道为何
// _mm_stream_si32((int *)(&out(y,x)),(int &)in(x,y));
out(y, x) = in(x, y);
}
}
}

TOCK(matrix_transpose);
}

Expand All @@ -60,27 +103,53 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
throw;
}
out.reshape(nx, ny);

// 这个循环为什么不够高效?如何优化? 15 分
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
out(x, y) = 0; // 有没有必要手动初始化? 5 分
for (int t = 0; t < nt; t++) {
out(x, y) += lhs(x, t) * rhs(t, y);
/*
这个循环为什么不够高效?如何优化? 15 分
程序是mem-bound,三个矩阵的访存中
out(x, y)顺序访问,高效;
lhs(x, t)存储是x主序,但是按t主序访问,跨步访问,低效;
rhs(t, y)访问同一个地址,一般
缓存(L1)大小不够存下三个矩阵,命中率低
1. 循环分块,这样缓存可以放下一个块的大小,提高缓存命中率
2. 寄存器分块,指令级并行
3. 循环展开
*/
int blockSize=32;
// __m128 mout,mlhs,mrhs;
#pragma omp parallel for collapse(3)
for(int yBase = 0; yBase < ny / blockSize;yBase++){
for (int xBase = 0; xBase < nx / blockSize; xBase++) {
for ( int tBase=0 ; tBase< nt / blockSize; tBase ++){
for (int y = yBase*blockSize; y < (yBase+1)*blockSize; y++) {
for (int t = tBase*blockSize; t < (tBase+1)*blockSize; t++) {
// out(x, y) = 0; // 有没有必要手动初始化? 5 分 答:没必要,clear()->resize()全部初始化为0了
// mrhs=_mm_set1_ps(rhs(t,y));
#pragma GCC unroll 32
for (int x = xBase*blockSize; x < (xBase+1)*blockSize; x++) {
out(x, y) += lhs(x, t) * rhs(t, y);
// mout=_mm_load_ps(&out(x,y));
// mlhs=_mm_load_ps(&lhs(x,t));
// mout=_mm_add_ps(mout,_mm_mul_ps(mlhs,mrhs));
// _mm_store_ps(&out(x,y),mout);
}
}
}
}
}
}

TOCK(matrix_multiply);
}

// 求出 R^T A R
static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
TICK(matrix_RtAR);
// 这两个是临时变量,有什么可以优化的? 5 分
Matrix Rt, RtA;
matrix_transpose(Rt, R);
matrix_multiply(RtA, Rt, A);
// 1. 可以省去Rt,使用RtAR代替
// 2. 使用static关键字,手动池化
static thread_local Matrix RtA;
matrix_transpose(RtAR, R);
matrix_multiply(RtA, RtAR, A);
matrix_multiply(RtAR, RtA, R);
TOCK(matrix_RtAR);
}
Expand Down
13 changes: 13 additions & 0 deletions morton.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ constexpr static std::tuple<uint64_t, uint64_t> decode(uint64_t d)
return {decode1(d), decode1(d >> 1)};
}

constexpr static uint64_t highestOneBit(uint64_t x){
x-=1;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x |= x >> 32;
x |= x >> 64;
x += 1;
return x;
}

}

namespace morton3d {
Expand Down
Loading