Skip to content

Commit 280fad9

Browse files
committed
tensorfunc dispatcher: init, matmul
1 parent 30b5a44 commit 280fad9

5 files changed

Lines changed: 176 additions & 12 deletions

File tree

excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ namespace deepx::tensorfunc
1717
}
1818
}
1919

20+
21+
2022
// 实现特化版本的成员函数
2123
void _constant_func<miaobyte, float>::func(Tensor<float> &tensor, const float value)
2224
{
@@ -59,4 +61,57 @@ namespace deepx::tensorfunc
5961
throw std::runtime_error("Failed to launch constant kernel");
6062
}
6163
}
64+
65+
// 添加kernel函数
66+
template <typename T>
67+
__global__ void kernel_arange(T *data, int size, T start, T step)
68+
{
69+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
70+
if (idx < size)
71+
{
72+
data[idx] = start + step * static_cast<T>(idx);
73+
}
74+
}
75+
76+
void _arange_func<miaobyte, float>::func(Tensor<float> &tensor, const float start, const float step)
77+
{
78+
int size = tensor.shape.size;
79+
int blockSize = 256;
80+
int numBlocks = (size + blockSize - 1) / blockSize;
81+
82+
kernel_arange<<<numBlocks, blockSize>>>(tensor.data, size, start, step);
83+
84+
cudaError_t err = cudaGetLastError();
85+
if (err != cudaSuccess) {
86+
throw std::runtime_error("Failed to launch arange kernel");
87+
}
88+
}
89+
90+
void _arange_func<miaobyte, double>::func(Tensor<double> &tensor, const double start, const double step)
91+
{
92+
int size = tensor.shape.size;
93+
int blockSize = 256;
94+
int numBlocks = (size + blockSize - 1) / blockSize;
95+
96+
kernel_arange<<<numBlocks, blockSize>>>(tensor.data, size, start, step);
97+
98+
cudaError_t err = cudaGetLastError();
99+
if (err != cudaSuccess) {
100+
throw std::runtime_error("Failed to launch arange kernel");
101+
}
102+
}
103+
104+
void _arange_func<miaobyte, __half>::func(Tensor<__half> &tensor, const __half start, const __half step)
105+
{
106+
int size = tensor.shape.size;
107+
int blockSize = 256;
108+
int numBlocks = (size + blockSize - 1) / blockSize;
109+
110+
kernel_arange<<<numBlocks, blockSize>>>(tensor.data, size, start, step);
111+
112+
cudaError_t err = cudaGetLastError();
113+
if (err != cudaSuccess) {
114+
throw std::runtime_error("Failed to launch arange kernel");
115+
}
116+
}
62117
}

excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,32 @@ namespace deepx::tensorfunc
4343
}
4444
};
4545

46+
template <typename Author, typename T>
47+
struct _arange_func {
48+
static void func(Tensor<T> &tensor, const T start, const T step);
49+
};
50+
51+
template <>
52+
struct _arange_func<miaobyte, float> {
53+
static void func(Tensor<float> &tensor, const float start, const float step);
54+
};
55+
56+
template <>
57+
struct _arange_func<miaobyte, double> {
58+
static void func(Tensor<double> &tensor, const double start, const double step);
59+
};
60+
61+
template <>
62+
struct _arange_func<miaobyte, __half> {
63+
static void func(Tensor<__half> &tensor, const __half start, const __half step);
64+
};
65+
66+
// 使用实现结构体
4667
template <typename T>
4768
struct arangeDispatcher<miaobyte, T>
4869
{
49-
static void arange(Tensor<T> &tensor, const T start, const T step)
50-
{
51-
//todo
70+
static void arange(Tensor<T> &tensor, const T start, const T step) {
71+
_arange_func<miaobyte, T>::func(tensor, start, step);
5272
}
5373
};
5474

excuter/op-mem-cuda/src/deepx/tensorfunc/matmul_cublas.hpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,8 @@ namespace deepx::tensorfunc
173173
B.data, n, stride_b, // B在前
174174
A.data, k, stride_a, // A在后
175175
&beta,
176-
C.data, n, stride_c); // 调整leading dimension
176+
C.data, n, stride_c, // 调整leading dimension
177+
batch_size); // 添加缺失的batch_size参数
177178

178179
if (status != CUBLAS_STATUS_SUCCESS)
179180
{
@@ -218,19 +219,25 @@ namespace deepx::tensorfunc
218219

219220
if (batch_size > 1)
220221
{
222+
// 计算步长
223+
int64_t stride_a = m * k;
224+
int64_t stride_b = k * n;
225+
int64_t stride_c = m * n;
226+
221227
auto status = cublasDgemmStridedBatched(handle.get(),
222228
CUBLAS_OP_N,
223229
CUBLAS_OP_N,
224-
m, n, k,
230+
n, m, k, // 交换m,n处理行主序
225231
&alpha,
226-
A.data, m,
227-
B.data, k,
232+
B.data, n, stride_b, // B在前
233+
A.data, k, stride_a, // A在后
228234
&beta,
229-
C.data, m);
235+
C.data, n, stride_c, // 输出维度对应调整
236+
batch_size);
230237

231238
if (status != CUBLAS_STATUS_SUCCESS)
232239
{
233-
throw std::runtime_error("cublasDgemm failed");
240+
throw std::runtime_error("cublasDgemmStridedBatched failed");
234241
}
235242
}
236243
else
@@ -251,5 +258,6 @@ namespace deepx::tensorfunc
251258
}
252259
}
253260
};
254-
}
261+
};
262+
};
255263
#endif // DEEPX_TENSORFUNC_MATMUL_HPP
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#include <cstdio>
#include <cstdlib>

#include "deepx/tensorfunc/init_miaobyte.hpp"
#include "deepx/tensor.hpp"
#include "deepx/tensorfunc/new.hpp"
#include "deepx/tensorfunc/print.hpp"
#include "deepx/tensorfunc/matmul.hpp"
#include "deepx/tensorfunc/matmul_cublas.hpp"
7+
8+
using namespace deepx::tensorfunc;
9+
using namespace deepx;
10+
11+
void test_matmul()
12+
{
13+
// 创建矩阵 A (2x3)
14+
Tensor<float> a = New<float>({2, 3});
15+
arange<miaobyte,float>(a, 1.0f, 1.0f); // 1,2,3
16+
// 4,5,6
17+
18+
// 创建矩阵 B (3x2)
19+
Tensor<float> b = New<float>({3, 2});
20+
arange<miaobyte,float>(b, 1.0f, 1.0f); // 1,2
21+
// 3,4
22+
// 5,6
23+
24+
// 创建结果矩阵 C (2x2)
25+
Tensor<float> c = New<float>({2, 2});
26+
constant<miaobyte,float>(c, 0.0f);
27+
28+
// 打印输入矩阵
29+
print(a, "%.2f");
30+
print(b, "%.2f");
31+
32+
// 执行矩阵乘法 C = A × B
33+
matmul<deepx::tensorfunc::cublas,float>(a, b, c);
34+
35+
// 打印结果
36+
print(c, "%.2f");
37+
}
38+
39+
void test_matmul_batch()
40+
{
41+
// 创建矩阵 A
42+
Tensor<float> a = New<float>({2, 3,4,5});
43+
arange<miaobyte,float>(a, 1.0f, 1.0f);
44+
45+
// 创建矩阵 B
46+
Tensor<float> b = New<float>({2,3,5,6});
47+
arange<miaobyte,float>(b, 1.0f, 1.0f);
48+
49+
// 创建结果矩阵 C
50+
Tensor<float> c = New<float>({2, 3,4,6});
51+
constant<miaobyte,float>(c, 0.0f);
52+
53+
// 打印输入矩阵
54+
print(a, "%.2f");
55+
print(b, "%.2f");
56+
57+
// 执行矩阵乘法 C = A × B
58+
matmul<deepx::tensorfunc::cublas,float>(a, b, c);
59+
60+
// 打印结果
61+
print(c, "%.2f");
62+
}
63+
64+
int main(int argc, char **argv)
65+
{
66+
int casei = 0;
67+
if (argc > 1) {
68+
casei = atoi(argv[1]);
69+
}
70+
switch (casei) {
71+
case 0:
72+
test_matmul();
73+
break;
74+
case 1:
75+
test_matmul_batch();
76+
break;
77+
}
78+
return 0;
79+
}
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
add_executable(0_new 0_new.cpp)
2-
target_link_libraries(0_new deepx CUDA::cudart)
2+
target_link_libraries(0_new deepx CUDA::cudart)
33

44
add_executable(1_cublas_add 1_cublas_add.cpp)
5-
target_link_libraries(1_cublas_add deepx CUDA::cudart)
5+
target_link_libraries(1_cublas_add deepx CUDA::cudart)
66

7+
add_executable(1_cublas_matmul 1_cublas_matmul.cpp)
8+
target_link_libraries(1_cublas_matmul deepx CUDA::cudart)

0 commit comments

Comments
 (0)