Skip to content

Commit d9551a1

Browse files
committed
matmul:qwang
1 parent 6bbd44d commit d9551a1

8 files changed

Lines changed: 201 additions & 83 deletions

File tree

excuter/cpp-common/src/deepx/tensorfunc/authors.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ namespace deepx::tensorfunc{
2424
public:
2525
static std::string name() { return "cublas"; }
2626
};
27+
28+
// Author tag for the hand-written tiled shared-memory CUDA matmul
// (implemented in matmul_qwang.cuh / matmul.cu). Used as the Author
// template parameter of matmulDispatcher.
class qwang{
    public:
        // Fixed: previously returned the transposed spelling "wqing",
        // which matched neither the class name nor the matmul_qwang file names.
        static std::string name() { return "qwang"; }
};
2732
}
2833

2934
#endif

excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "deepx/tensor.hpp"
55
#include "deepx/tensorfunc/authors.hpp"
6+
#include "stdutil/error.hpp"
67

78
namespace deepx::tensorfunc
89
{
@@ -29,7 +30,9 @@ namespace deepx::tensorfunc
2930
template <typename Author, typename T>
3031
struct matmulDispatcher
3132
{
32-
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
33+
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) {
34+
throw NotImplementError("matmul");
35+
};
3336
};
3437

3538
template <typename Author, typename T>

excuter/op-mem-cuda/src/deepx/tensorfunc/matmul.cu

Lines changed: 0 additions & 78 deletions
This file was deleted.
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#include "deepx/tensorfunc/cuda.hpp"
2+
3+
// #include <cuda_fp64.h>
4+
// #include <cuda_fp32.h>
5+
#include "deepx/tensor.hpp"
6+
7+
#include "deepx/tensorfunc/matmul_qwang.cuh"
8+
#include "deepx/tensorfunc/authors.hpp"
9+
#include "deepx/tensorfunc/cuda.hpp"
10+
11+
namespace deepx::tensorfunc
12+
{
13+
14+
// Tile width for the shared-memory blocked matmul. Each thread block is
// BLOCK_SIZE x BLOCK_SIZE threads and computes one BLOCK_SIZE x BLOCK_SIZE
// tile of C. (matmul_qwang.cuh defines the same value; identical macro
// redefinition is well-formed.)
#define BLOCK_SIZE 32

// C = A * B with A (M x K), B (K x N), C (M x N), all row-major, on device.
// Launch contract: blockDim = (BLOCK_SIZE, BLOCK_SIZE),
// gridDim = (ceil(N / BLOCK_SIZE), ceil(M / BLOCK_SIZE)).
// half / nv_bfloat16 instantiations rely on the device arithmetic operators
// from cuda_fp16.h / cuda_bf16.h.
template <typename T>
__global__ void matmul_kernel(T *C, const T *A, const T *B,
                              int M, int N, int K)
{
    // Shared-memory staging tiles for the current blocks of A and B.
    __shared__ T tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T tileB[BLOCK_SIZE][BLOCK_SIZE];

    // Global element of C this thread is responsible for.
    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    // Accumulate in T. Fixed: use T(0) instead of the double literal 0.0 so
    // half / nv_bfloat16 / integer instantiations avoid double conversions.
    T sum = T(0);

    // March over the K dimension one BLOCK_SIZE-wide tile at a time.
    for (int t = 0; t < (K + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t)
    {
        // First column/row of A/B covered by this tile.
        int tiledK = t * BLOCK_SIZE;

        // Stage the A tile (row-major); zero-fill outside the matrix so the
        // inner product below needs no per-element bounds checks.
        int loadA_col = tiledK + threadIdx.x;
        if (row < M && loadA_col < K)
        {
            tileA[threadIdx.y][threadIdx.x] = A[row * K + loadA_col];
        }
        else
        {
            tileA[threadIdx.y][threadIdx.x] = T(0); // pad the ragged edge
        }

        // Stage the B tile; same zero-fill for the ragged edge.
        int loadB_row = tiledK + threadIdx.y;
        if (col < N && loadB_row < K)
        {
            tileB[threadIdx.y][threadIdx.x] = B[loadB_row * N + col];
        }
        else
        {
            tileB[threadIdx.y][threadIdx.x] = T(0); // pad the ragged edge
        }

        __syncthreads(); // both tiles fully staged before any thread reads

        // Partial dot product contributed by this tile.
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        __syncthreads(); // all reads done before tiles are overwritten
    }

    // Only threads inside the valid range of C write back.
    if (row < M && col < N)
    {
        C[row * N + col] = sum;
    }
}

// Explicit instantiations so the kernel definitions live in this TU.
template __global__ void matmul_kernel<double>(double *C, const double *A, const double *B,
                                               int M, int N, int K);
template __global__ void matmul_kernel<float>(float *C, const float *A, const float *B,
                                              int M, int N, int K);
template __global__ void matmul_kernel<half>(half *C, const half *A, const half *B,
                                             int M, int N, int K);
template __global__ void matmul_kernel<nv_bfloat16>(nv_bfloat16 *C, const nv_bfloat16 *A, const nv_bfloat16 *B,
                                                    int M, int N, int K);
template __global__ void matmul_kernel<int64_t>(int64_t *C, const int64_t *A, const int64_t *B,
                                                int M, int N, int K);
template __global__ void matmul_kernel<int32_t>(int32_t *C, const int32_t *A, const int32_t *B,
                                                int M, int N, int K);
template __global__ void matmul_kernel<int16_t>(int16_t *C, const int16_t *A, const int16_t *B,
                                                int M, int N, int K);
template __global__ void matmul_kernel<int8_t>(int8_t *C, const int8_t *A, const int8_t *B,
                                               int M, int N, int K);
92+
// Host-side launcher: derives the 2D launch configuration for
// matmul_kernel and enqueues it on the default stream (asynchronous;
// the caller is responsible for any needed synchronization).
// d_A (M x K), d_B (K x N), d_C (M x N) must be device pointers.
template <typename T>
void launch_matmul(T *d_C, const T *d_A, const T *d_B,
                   int M, int N, int K)
{
    const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    // Ceiling division so the grid also covers the ragged right/bottom edges.
    const dim3 blocksPerGrid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
                             (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

    matmul_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, M, N, K);
}

// Explicit instantiations matching the extern declarations in matmul_qwang.cuh.
template void launch_matmul<double>(double *d_C, const double *d_A, const double *d_B,
                                    int M, int N, int K);
template void launch_matmul<float>(float *d_C, const float *d_A, const float *d_B,
                                   int M, int N, int K);
template void launch_matmul<half>(half *d_C, const half *d_A, const half *d_B,
                                  int M, int N, int K);
template void launch_matmul<nv_bfloat16>(nv_bfloat16 *d_C, const nv_bfloat16 *d_A, const nv_bfloat16 *d_B,
                                         int M, int N, int K);
template void launch_matmul<int64_t>(int64_t *d_C, const int64_t *d_A, const int64_t *d_B,
                                     int M, int N, int K);
template void launch_matmul<int32_t>(int32_t *d_C, const int32_t *d_A, const int32_t *d_B,
                                     int M, int N, int K);
template void launch_matmul<int16_t>(int16_t *d_C, const int16_t *d_A, const int16_t *d_B,
                                     int M, int N, int K);
template void launch_matmul<int8_t>(int8_t *d_C, const int8_t *d_A, const int8_t *d_B,
                                    int M, int N, int K);
119+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#ifndef DEEPX_TENSORFUNC_MATMUL_QWANG_CUH
#define DEEPX_TENSORFUNC_MATMUL_QWANG_CUH

#include "deepx/tensorfunc/cuda.hpp"
#include "deepx/tensorfunc/matmul.hpp"

namespace deepx::tensorfunc
{
    // Tile width shared by matmul_kernel and launch_matmul.
    // Fixed: guarded with #ifndef — this public header is also included by
    // non-CUDA code (matmul_qwang.hpp), and an unconditional, unprefixed
    // #define would collide with any other BLOCK_SIZE definition in an
    // includer's translation unit.
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

    // C = A * B with A (M x K), B (K x N), C (M x N), row-major, on device.
    // Launch contract: blockDim = (BLOCK_SIZE, BLOCK_SIZE),
    // gridDim = (ceil(N / BLOCK_SIZE), ceil(M / BLOCK_SIZE)).
    template <typename T>
    __global__ void matmul_kernel(T *C, const T *A, const T *B,
                                  int M, int N, int K);

    // Host launcher: computes the grid/block configuration and enqueues
    // matmul_kernel on the default stream (asynchronous).
    template <typename T>
    void launch_matmul(T *d_C, const T *d_A, const T *d_B,
                       int M, int N, int K);

    // Instantiated in matmul.cu; declared extern to prevent implicit
    // re-instantiation in every includer.
    extern template void launch_matmul<double>(double *d_C, const double *d_A, const double *d_B,
                                               int M, int N, int K);
    extern template void launch_matmul<float>(float *d_C, const float *d_A, const float *d_B,
                                              int M, int N, int K);
    extern template void launch_matmul<half>(half *d_C, const half *d_A, const half *d_B,
                                             int M, int N, int K);
    extern template void launch_matmul<nv_bfloat16>(nv_bfloat16 *d_C, const nv_bfloat16 *d_A, const nv_bfloat16 *d_B,
                                                    int M, int N, int K);
    extern template void launch_matmul<int64_t>(int64_t *d_C, const int64_t *d_A, const int64_t *d_B,
                                                int M, int N, int K);
    extern template void launch_matmul<int32_t>(int32_t *d_C, const int32_t *d_A, const int32_t *d_B,
                                                int M, int N, int K);
    extern template void launch_matmul<int16_t>(int16_t *d_C, const int16_t *d_A, const int16_t *d_B,
                                                int M, int N, int K);
    extern template void launch_matmul<int8_t>(int8_t *d_C, const int8_t *d_A, const int8_t *d_B,
                                               int M, int N, int K);
}
#endif
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#ifndef DEEPX_TENSORFUNC_MATMUL_QWANG_HPP
#define DEEPX_TENSORFUNC_MATMUL_QWANG_HPP

#include "deepx/tensorfunc/cuda.hpp"
#include "deepx/tensor.hpp"
#include "deepx/tensorfunc/matmul.hpp"
#include "deepx/tensorfunc/authors.hpp"
#include "deepx/tensorfunc/matmul_qwang.cuh"

namespace deepx::tensorfunc
{
    using namespace deepx;

    // Dispatcher specialization for the hand-written tiled CUDA matmul
    // (Author = qwang). Computes C = A * B for row-major 2D operands.
    template <typename T>
    struct matmulDispatcher<qwang, T>
    {
        static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
        {
            // matmul_kernel expects (M, N, K) with A (M x K), B (K x N),
            // C (M x N): M = rows of A, N = cols of B, K = cols of A.
            // BUG FIX: the previous call passed A.shape[-1] as N and
            // B.shape[-1] as K — N and K swapped — which only produced
            // correct results for square operands.
            // NOTE(review): assumes shape[] supports negative
            // (from-the-end) indexing and that .data points to device
            // memory — confirm against the Tensor implementation.
            launch_matmul(C.data, A.data, B.data,
                          A.shape[-2], B.shape[-1], A.shape[-1]);
        }
    };
}
#endif

excuter/op-mem-cuda/test/tensorfunc/1_cublas_matmul.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
#include "deepx/tensorfunc/print_miaobyte.hpp"
55
#include "deepx/tensorfunc/matmul.hpp"
66
#include "deepx/tensorfunc/matmul_cublas.hpp"
7-
7+
#include "deepx/tensorfunc/matmul_qwang.hpp"
88
using namespace deepx::tensorfunc;
99
using namespace deepx;
1010

11+
template <typename Author>
1112
void test_matmul()
1213
{
1314
// 创建矩阵 A (2x3)
@@ -30,7 +31,7 @@ void test_matmul()
3031
print<miaobyte>(b, "%.2f");
3132

3233
// 执行矩阵乘法 C = A × B
33-
matmul<deepx::tensorfunc::cublas,float>(a, b, c);
34+
matmul<Author,float>(a, b, c);
3435

3536
// 打印结果
3637
print<miaobyte>(c, "%.2f");
@@ -69,7 +70,11 @@ int main(int argc, char **argv)
6970
}
7071
switch (casei) {
7172
case 0:
72-
test_matmul();
73+
74+
printf("test qwang matmul\n");
75+
test_matmul<deepx::tensorfunc::qwang>();
76+
printf("test cublas matmul\n");
77+
test_matmul<deepx::tensorfunc::cublas>();
7378
break;
7479
case 1:
7580
test_matmul_batch();

excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ add_executable(1_cublas_add 1_cublas_add.cpp)
55
target_link_libraries(1_cublas_add deepx CUDA::cudart)
66

77
add_executable(1_cublas_matmul 1_cublas_matmul.cpp)
8-
target_link_libraries(1_cublas_matmul deepx CUDA::cudart)
8+
target_link_libraries(1_cublas_matmul deepx CUDA::cudart CUDA::cublas)

0 commit comments

Comments
 (0)