
Commit 53df05e

excuter(cpu/cuda): add
1 parent e4880e3 commit 53df05e

16 files changed

Lines changed: 575 additions & 134 deletions


Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
## Level-1 BLAS operations

+ Maximum of vector x
返回向量中绝对值最大的元素的索引 → Returns the index of the element with the largest absolute value.
```
cublasI[s|d|c|z]amax(cublasHandle_t handle, int n, const T *x, int incx, int *result)
```

+ Minimum of vector x
Returns the index of the element with the smallest absolute value.
```
cublasI[s|d|c|z]amin(cublasHandle_t handle, int n, const T *x, int incx, int *result)
```

+ Sum of absolute values of vector x
```
cublas[S|D|Sc|Dz]asum(cublasHandle_t handle, int n, const T *x, int incx, T *result)
```

+ Scaled vector addition (axpy), see the sketch after this list

y = alpha * x + y

```
cublas[S|D|C|Z]axpy(cublasHandle_t handle, int n,
                    const T *alpha,
                    const T *x, int incx,
                    T *y, int incy)
```

+ Vector copy

y = x

```
cublas[S|D|C|Z]copy(cublasHandle_t handle, int n, const T *x, int incx, T *y, int incy)
```

+ Dot product

result = sum_i x[i] * y[i]  (for complex types the variants are dotu/dotc)

```
cublas[S|D|C|Z]dot(cublasHandle_t handle, int n,
                   const T *x, int incx,
                   const T *y, int incy,
                   T *result)
```

+ Euclidean norm (nrm2)

Computes the Euclidean norm of vector x.

```
cublas[S|D|Sc|Dz]nrm2(cublasHandle_t handle, int n, const T *x, int incx, T *result)
```

+ Givens rotation (rot)
Rotates points in the (x, y) plane counter-clockwise by the angle defined by cos(alpha)=c, sin(alpha)=s.

```
cublas[S|D|C|Z]rot(cublasHandle_t handle, int n,
                   T *x, int incx,
                   T *y, int incy,
                   const T *c, const T *s)
```

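For orientation, a minimal host-side sketch of calling one of the Level-1 routines above (cublasSaxpy). The vector length and values are illustrative only, and status checking is trimmed to the essentials:

```cpp
// Minimal sketch: y = alpha * x + y with cublasSaxpy (illustrative values).
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int n = 4;
    const float alpha = 2.0f;
    std::vector<float> hx = {1, 2, 3, 4}, hy = {10, 20, 30, 40};

    float *dx = nullptr, *dy = nullptr;
    cudaMalloc(&dx, n * sizeof(float));
    cudaMalloc(&dy, n * sizeof(float));
    cudaMemcpy(dx, hx.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSaxpy(handle, n, &alpha, dx, 1, dy, 1);   // y = 2*x + y
    cublasDestroy(handle);

    cudaMemcpy(hy.data(), dy, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (float v : hy) std::printf("%g ", v);       // expected: 12 24 36 48
    cudaFree(dx);
    cudaFree(dy);
    return 0;
}
```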
## Level-3 BLAS operations

Matrices are stored in column-major order.

+ gemm (general matrix multiplication)
Computes C = α * op(A) * op(B) + β * C
where op(X) is either X or X^T (a minimal call sketch follows this list).

```
cublas[S|D|C|Z]gemm(cublasHandle_t handle,
                    cublasOperation_t transa, cublasOperation_t transb,
                    int m, int n, int k,
                    const T *alpha, const T *A, int lda, const T *B, int ldb, const T *beta, T *C, int ldc)
```

+ gemmBatched (batched matrix multiplication)
Performs many independent matrix multiplications in one call.
Math: C[i] = α * op(A[i]) * op(B[i]) + β * C[i], i ∈ [0, batchCount)

+ gemmStridedBatched (strided batched matrix multiplication)
Batched multiplication over matrices laid out contiguously in memory; a stride gives the offset between consecutive matrices.
Math: same as gemmBatched, but the matrices sit at a fixed stride in memory.

+ gemmGroupedBatched (grouped batched matrix multiplication)
Batched multiplication processed in groups; each group may use different dimension parameters.
Math: same as gemmBatched, but m, n, k can differ per group.

+ geam (matrix addition and transposition)
Computes C = α * op(A) + β * op(B)
where op(X) is either X or X^T.
Can be used to implement matrix transpose, addition, and scaling.

+ dgmm (diagonal matrix multiplication)
Multiplies a matrix by a diagonal matrix:
- left mode:  C = diag(x) * A
- right mode: C = A * diag(x)

+ gemmEx (mixed-precision matrix multiplication)
Supports mixed-precision computation, e.g.:
- FP16 inputs with FP32 accumulation and output
- INT8 inputs, INT32 accumulation, FP32 output
Math: same as gemm, but with differing data types.

+ GemmBatchedEx (mixed-precision batched matrix multiplication)
Batched version of gemmEx with mixed-precision support.

+ cublasGemmStridedBatchedEx (mixed-precision strided batched matrix multiplication)
Strided batched version of gemmEx with mixed-precision support.

+ cublasGemmGroupedBatchedEx (mixed-precision grouped batched matrix multiplication)
Grouped batched version of gemmEx with mixed-precision support.

+ Csyrk3mEx (symmetric rank-k update)
Computes a symmetric rank-k update, using the 3M algorithm to reduce complex arithmetic.
Math: C = α * op(A) * op(A)^T + β * C
where C is a symmetric matrix.
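A minimal column-major sketch of the plain gemm entry above (referenced in that item); matrix sizes and contents are illustrative, and status checks are omitted for brevity:

```cpp
// Minimal sketch: C = alpha*A*B + beta*C with cublasSgemm, column-major layout.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int m = 2, k = 3, n = 2;             // A: m x k, B: k x n, C: m x n
    const float alpha = 1.0f, beta = 0.0f;
    // Column-major data: A = [[1,2,3],[4,5,6]], B = [[1,0],[0,1],[1,1]]
    std::vector<float> A = {1, 4, 2, 5, 3, 6};
    std::vector<float> B = {1, 0, 1, 0, 1, 1};
    std::vector<float> C(m * n, 0.0f);

    float *dA, *dB, *dC;
    cudaMalloc(&dA, A.size() * sizeof(float));
    cudaMalloc(&dB, B.size() * sizeof(float));
    cudaMalloc(&dC, C.size() * sizeof(float));
    cudaMemcpy(dA, A.data(), A.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B.data(), B.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    // Leading dimensions equal the row counts because the matrices are unpadded.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alpha, dA, m, dB, k, &beta, dC, m);
    cublasDestroy(handle);

    cudaMemcpy(C.data(), dC, C.size() * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // column-major: 4 10 5 11
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}
```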
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+ cublasLtMatmul()
Supports some low-precision data types.

+ cublasLtMatmulEx()

+ cublasLtMatmulBatched()
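For orientation, a heavily simplified sketch of a single cublasLtMatmul call with FP16 inputs and FP32 accumulation. Descriptor attributes, algorithm/heuristic selection, and workspace handling are omitted, and the wrapper name `lt_matmul_fp16` is illustrative, not part of this commit:

```cpp
// Minimal sketch: D = alpha*A*B + beta*C via cublasLt, FP16 data with FP32 compute.
#include <cublasLt.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

void lt_matmul_fp16(const __half *dA, const __half *dB, __half *dC,
                    int m, int n, int k) {
    cublasLtHandle_t lt;
    cublasLtCreate(&lt);

    cublasLtMatmulDesc_t op;
    cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

    // Column-major layouts; leading dimension = number of rows (no padding).
    cublasLtMatrixLayout_t aL, bL, cL;
    cublasLtMatrixLayoutCreate(&aL, CUDA_R_16F, m, k, m);
    cublasLtMatrixLayoutCreate(&bL, CUDA_R_16F, k, n, k);
    cublasLtMatrixLayoutCreate(&cL, CUDA_R_16F, m, n, m);

    const float alpha = 1.0f, beta = 0.0f;
    // C doubles as the output D here; no algo or workspace is supplied.
    cublasLtMatmul(lt, op, &alpha, dA, aL, dB, bL, &beta, dC, cL, dC, cL,
                   nullptr, nullptr, 0, /*stream=*/0);

    cublasLtMatrixLayoutDestroy(aL);
    cublasLtMatrixLayoutDestroy(bL);
    cublasLtMatrixLayoutDestroy(cL);
    cublasLtMatmulDescDestroy(op);
    cublasLtDestroy(lt);
}
```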

doc/excuter/op-mem-cuda/list.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() |
+| add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
+| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | print(T1) | constant(tensor<any> t, var<any> value)->() |

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

Lines changed: 4 additions & 1 deletion
@@ -9,7 +9,10 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct addDispatcher
     {
-        static void add(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
+        static void add(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
+        {
+            throw NotImplementError("add");
+        }
     };
 
     template <typename Author, typename T>
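For context, the default addDispatcher now throws at run time instead of being `= delete`, so a missing author/type specialization fails when called rather than at compile time. A minimal sketch of how such an author-tagged dispatcher is typically consumed follows; the free function `add` and the commented `cblas` specialization are illustrative assumptions (the deepx headers are assumed to be included), not the repository's exact code:

```cpp
// Illustrative sketch (not the repository's exact code): a thin wrapper that
// forwards to the author-specific specialization, falling back to the default
// addDispatcher above, which now throws NotImplementError.
template <typename Author, typename T>
void add(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
{
    addDispatcher<Author, T>::add(A, B, C);
}

// Hypothetical CPU specialization for an author tag "cblas" and float:
// template <>
// struct addDispatcher<cblas, float>
// {
//     static void add(const Tensor<float> &A, const Tensor<float> &B, Tensor<float> &C)
//     {
//         cblas_scopy(A.shape.size, A.data, 1, C.data, 1);       // C = A
//         cblas_saxpy(A.shape.size, 1.0f, B.data, 1, C.data, 1); // C += B
//     }
// };
```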

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 17 additions & 7 deletions
@@ -3,6 +3,7 @@
 #include "deepx/tf/new.hpp"
 #include "deepx/tf/print.hpp"
 #include "deepx/tf/init.hpp"
+#include "deepx/tf/elementwise_basic.hpp"
 #include "deepx/dtype.hpp"
 #include "deepx/tf/tffactory.hpp"
 #include "deepx/tensorfunc/authors.hpp"
@@ -78,23 +79,32 @@ namespace deepx::tf
     // io
     void register_util(TfFactory &opfactory)
     {
-        opfactory.add_tf(std::make_shared<Print<tensorfunc::miaobyte>>(vector<Param>(
+        opfactory.add_tf(std::make_shared<Print< miaobyte>>(vector<Param>(
             {
                 Param("", DataCategory::Tensor, Precision::Any),
             }),
             vector<Param>()));
 
-        opfactory.add_tf(std::make_shared<Print<tensorfunc::miaobyte>>(vector<Param>(
+        opfactory.add_tf(std::make_shared<Print< miaobyte>>(vector<Param>(
             {
                 Param("", DataCategory::Tensor, Precision::Any),
                 Param("", DataCategory::Var, Precision::String),
             }),
             vector<Param>()));
     }
 
-    // // elementwise
-    // void register_elementwise(OpFactory &opfactory)
-    // {
+    // elementwise
+    void register_elementwise(TfFactory &tffactory)
+    {
+        tffactory.add_tf(std::make_shared<Add<miaobyte>>(vector<Param>(
+            {
+                Param("a", DataCategory::Tensor, Precision::Any),
+                Param("b", DataCategory::Tensor, Precision::Any),
+            }),
+            vector<Param>(
+            {
+                Param("c", DataCategory::Tensor, Precision::Any),
+            })));
         // opfactory.add_op(Add_miaobyte<float>());
         // opfactory.add_op(Add_miaobyte<double>());
         // opfactory.add_op(Add_miaobyte<int8_t>());
@@ -140,7 +150,7 @@ namespace deepx::tf
 
         // opfactory.add_op(Powscalar_miaobyte<float>());
         // opfactory.add_op(Powscalar_miaobyte<double>());
-    // }
+    }
     // // matmul
     // void register_matmul(OpFactory &opfactory)
     // {
@@ -174,7 +184,7 @@ namespace deepx::tf
         register_lifecycle(tffactory);
         register_init(tffactory);
         register_util(tffactory);
-        // register_elementwise(opfactory);
+        register_elementwise(tffactory);
         // register_matmul(opfactory);
         register_changeshape(tffactory);
         // register_reduce(opfactory);

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_cublas_basic.hpp

Lines changed: 24 additions & 26 deletions
@@ -13,71 +13,69 @@
 #include "deepx/tensorfunc/cuda.hpp"
 namespace deepx::tensorfunc
 {
-
-    // float specialization
+
+    // double specialization
     template <>
-    struct addDispatcher<cublas, float>
+    struct addDispatcher<cublas, double>
     {
-        static void add(const Tensor<float> &A, const Tensor<float> &B, Tensor<float> &C)
+        static void add(const Tensor<double> &A, const Tensor<double> &B, Tensor<double> &C)
         {
             if (A.shape.size != B.shape.size || A.shape.size != C.shape.size)
             {
                 throw std::runtime_error("Tensor shapes must match for addition");
             }
 
             static CublasHandle handle;
-            const float alpha = 1.0f;
-            const float beta = 1.0f;
-
-            // use cublasSgeam to compute C = alpha*A + beta*B directly
-            auto status = cublasSgeam(handle.get(),
-                                      CUBLAS_OP_N, // do not transpose A
-                                      CUBLAS_OP_N, // do not transpose B
-                                      A.shape.size, 1, // treated as a flattened vector
+            const double alpha = 1.0;
+            const double beta = 1.0;
+            auto status = cublasDgeam(handle.get(),
+                                      CUBLAS_OP_N,
+                                      CUBLAS_OP_N,
+                                      A.shape.size, 1,
                                       &alpha,
                                       A.data, A.shape.size,
                                       &beta,
                                       B.data, B.shape.size,
                                       C.data, C.shape.size);
-
             if (status != CUBLAS_STATUS_SUCCESS)
             {
-                throw std::runtime_error("cuBLAS Sgeam failed");
+                throw std::runtime_error("cuBLAS Dgeam failed");
             }
         }
     };
-
-    // double specialization
+    // float specialization
     template <>
-    struct addDispatcher<cublas, double>
+    struct addDispatcher<cublas, float>
    {
-        static void add(const Tensor<double> &A, const Tensor<double> &B, Tensor<double> &C)
+        static void add(const Tensor<float> &A, const Tensor<float> &B, Tensor<float> &C)
         {
             if (A.shape.size != B.shape.size || A.shape.size != C.shape.size)
             {
                 throw std::runtime_error("Tensor shapes must match for addition");
             }
 
             static CublasHandle handle;
-            const double alpha = 1.0;
-            const double beta = 1.0;
-            auto status = cublasDgeam(handle.get(),
-                                      CUBLAS_OP_N,
-                                      CUBLAS_OP_N,
-                                      A.shape.size, 1,
+            const float alpha = 1.0f;
+            const float beta = 1.0f;
+
+            // use cublasSgeam to compute C = alpha*A + beta*B directly
+            auto status = cublasSgeam(handle.get(),
+                                      CUBLAS_OP_N, // do not transpose A
+                                      CUBLAS_OP_N, // do not transpose B
+                                      A.shape.size, 1, // treated as a flattened vector
                                       &alpha,
                                       A.data, A.shape.size,
                                       &beta,
                                       B.data, B.shape.size,
                                       C.data, C.shape.size);
+
             if (status != CUBLAS_STATUS_SUCCESS)
             {
-                throw std::runtime_error("cuBLAS Dgeam failed");
+                throw std::runtime_error("cuBLAS Sgeam failed");
             }
         }
     };
 
-
 }
 
 #endif
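The same flattening trick in isolation, as a self-contained sketch (device pointers and a cuBLAS handle are assumed to be set up by the caller): element-wise addition of two length-n buffers by treating them as n x 1 column-major matrices for cublasSgeam:

```cpp
// Minimal standalone sketch: C = 1*A + 1*B over flat device buffers via cublasSgeam.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdexcept>

void add_flat(cublasHandle_t handle, const float *dA, const float *dB, float *dC, int n) {
    const float alpha = 1.0f, beta = 1.0f;
    cublasStatus_t status = cublasSgeam(handle,
                                        CUBLAS_OP_N, CUBLAS_OP_N,
                                        n, 1,            // n rows, 1 column
                                        &alpha, dA, n,
                                        &beta,  dB, n,
                                        dC, n);
    if (status != CUBLAS_STATUS_SUCCESS) {
        throw std::runtime_error("cuBLAS Sgeam failed");
    }
}
```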
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

#include "deepx/tensorfunc/elementwise.hpp"
#include "deepx/tensorfunc/cuda.hpp"
#include "deepx/tensorfunc/authors.hpp"

namespace deepx::tensorfunc
{
    template <typename T>
    __global__ void add_kernel(const T* A, const T* B, T* C, int size) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) {
            C[idx] = A[idx] + B[idx];
        }
    }
    template __global__ void add_kernel<double>(const double* A, const double* B, double* C, int size);
    template __global__ void add_kernel<float>(const float* A, const float* B, float* C, int size);
    template __global__ void add_kernel<half>(const half* A, const half* B, half* C, int size);
    template __global__ void add_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, int size);
    template __global__ void add_kernel<int64_t>(const int64_t* A, const int64_t* B, int64_t* C, int size);
    template __global__ void add_kernel<int32_t>(const int32_t* A, const int32_t* B, int32_t* C, int size);
    template __global__ void add_kernel<int16_t>(const int16_t* A, const int16_t* B, int16_t* C, int size);
    template __global__ void add_kernel<int8_t>(const int8_t* A, const int8_t* B, int8_t* C, int size);

    template <typename T>
    void launch_add(int numBlocks, int blockSize, const T* a, const T* b, T* c, int size)
    {
        // launch the kernel
        add_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
        // check whether the kernel launch succeeded
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Failed to launch add kernel: " +
                                     std::string(cudaGetErrorString(err)));
        }
    }

    template void launch_add<double>(int numBlocks, int blockSize, const double* a, const double* b, double* c, int size);
    template void launch_add<float>(int numBlocks, int blockSize, const float* a, const float* b, float* c, int size);
    template void launch_add<half>(int numBlocks, int blockSize, const half* a, const half* b, half* c, int size);
    template void launch_add<nv_bfloat16>(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, int size);
    template void launch_add<int64_t>(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c, int size);
    template void launch_add<int32_t>(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c, int size);
    template void launch_add<int16_t>(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c, int size);
    template void launch_add<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c, int size);
}

#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
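A short illustrative host-side wrapper (not part of the commit) showing the usual way to derive the launch configuration that launch_add expects; `dA`, `dB`, `dC` are assumed to be device pointers and the header above is assumed to be included:

```cpp
// Illustrative usage sketch: one thread per element, grid size rounded up.
#include <cuda_runtime.h>

void add_on_device(const float* dA, const float* dB, float* dC, int size)
{
    const int blockSize = 256;                                 // threads per block
    const int numBlocks = (size + blockSize - 1) / blockSize;  // ceiling division
    deepx::tensorfunc::launch_add<float>(numBlocks, blockSize, dA, dB, dC, size);
    // launch_add only checks the launch itself; synchronize to surface
    // asynchronous execution errors as well.
    cudaDeviceSynchronize();
}
```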
