Skip to content

Commit 6bbd44d

Browse files
committed
1
1 parent 43b2203 commit 6bbd44d

1 file changed

Lines changed: 78 additions & 0 deletions

File tree

  • excuter/op-mem-cuda/src/deepx/tensorfunc
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#include "deepx/tensorfunc/cuda.hpp"

// NOTE(review): <cuda_fp64.h> and <cuda_fp32.h> do not look like CUDA Toolkit
// headers (the toolkit ships <cuda_fp16.h> / <cuda_bf16.h>) - confirm that
// they actually resolve in this build.
#include <cuda_fp64.h>
#include <cuda_fp32.h>
#include <cuda_runtime.h>

#include <cstddef>
#include <stdexcept>
#include <string>

#include "deepx/tensor.hpp"

#include "deepx/tensorfunc/matmul.hpp"
#include "deepx/tensorfunc/authors.hpp"
#include "deepx/tensorfunc/cuda.hpp"
10+
11+
namespace deepx::tensorfunc {

// Edge length of the square thread block and of the shared-memory tiles.
#define BLOCK_SIZE 32

// Tiled double-precision matrix multiplication: C = A * B.
//
// Indexing used by the kernel (all row-major):
//   A is read    as M x K : A[row * K + k]
//   B is read    as K x N : B[k * N + col]
//   C is written as M x N : C[row * N + col]
//
// Launch contract:
//   * blockDim must be (BLOCK_SIZE, BLOCK_SIZE); threadIdx.x / threadIdx.y are
//     used directly as tile coordinates.
//   * blockIdx.x selects a BLOCK_SIZE-wide column strip of C and blockIdx.y a
//     BLOCK_SIZE-high row strip, so the grid has to cover
//     ceil(N / BLOCK_SIZE) x ceil(M / BLOCK_SIZE) blocks.
//   * Uses two static BLOCK_SIZE x BLOCK_SIZE double tiles of shared memory.
//
// M, N and K need not be multiples of BLOCK_SIZE: out-of-range tile elements
// are zero-filled and out-of-range threads never write to C.
__global__ void fp64MatmulKernel(double *C, const double *A, const double *B,
                                 int M, int N, int K) {
    // Shared-memory staging buffers for the current tile of A and of B.
    __shared__ double tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ double tileB[BLOCK_SIZE][BLOCK_SIZE];

    // Element of C this thread is responsible for.
    const int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    double sum = 0.0;

    // Walk the K dimension one BLOCK_SIZE-wide tile at a time.
    const int numTiles = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t) {
        // First k index covered by this tile.
        const int tiledK = t * BLOCK_SIZE;

        // Stage one element of A's tile; zero-fill outside the matrix.
        // The flat offset is computed in size_t so that row * K cannot
        // overflow a 32-bit int for large matrices.
        const int loadA_col = tiledK + threadIdx.x;
        if (row < M && loadA_col < K) {
            tileA[threadIdx.y][threadIdx.x] =
                A[static_cast<size_t>(row) * K + loadA_col];
        } else {
            tileA[threadIdx.y][threadIdx.x] = 0.0;
        }

        // Stage one element of B's tile; zero-fill outside the matrix.
        const int loadB_row = tiledK + threadIdx.y;
        if (col < N && loadB_row < K) {
            tileB[threadIdx.y][threadIdx.x] =
                B[static_cast<size_t>(loadB_row) * N + col];
        } else {
            tileB[threadIdx.y][threadIdx.x] = 0.0;
        }

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        // Accumulate this tile's contribution to the dot product.
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        // Every thread must be done reading before the tiles are overwritten
        // by the next iteration.
        __syncthreads();
    }

    // Only threads that map to a real element of C store their result.
    if (row < M && col < N) {
        C[static_cast<size_t>(row) * N + col] = sum;
    }
}

// Host-side launcher for fp64MatmulKernel.
//
// Parameters:
//   d_C        output buffer, written as M x N (row-major)
//   d_A, d_B   input buffers, read as M x K and K x N (row-major)
//   M, N, K    matrix dimensions
// The pointers are handed straight to the kernel, so they must be
// dereferenceable from device code.
//
// The kernel is launched without an explicit stream and this function does
// not synchronize, so it may return before the result is available.
//
// If M <= 0 or N <= 0 there is nothing to compute and the function returns
// without launching (a zero-sized grid would be an invalid configuration).
//
// Throws std::runtime_error if the CUDA runtime reports an error after the
// launch.
void fp64Matmul(double *d_C, const double *d_A, const double *d_B,
                int M, int N, int K) {
    if (M <= 0 || N <= 0) {
        return;
    }

    const dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 dimGrid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
                       (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

    fp64MatmulKernel<<<dimGrid, dimBlock>>>(d_C, d_A, d_B, M, N, K);

    // A kernel launch has no return value; configuration errors only become
    // visible through cudaGetLastError().
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(
            std::string("fp64Matmul: kernel launch failed: ") +
            cudaGetErrorString(err));
    }
}

} // namespace deepx::tensorfunc

0 commit comments

Comments
 (0)