Skip to content

Commit 6a18f1f

Browse files
committed
excuter(cpu/cuda): sub fix
1 parent fafd98f commit 6a18f1f

11 files changed

Lines changed: 298 additions & 88 deletions

File tree

doc/excuter/op-mem-cuda/list.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@
1414
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
1515
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
1616
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
17+
| sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
1718
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,11 @@
1010
| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
1111
| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
1212
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
13-
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | print(T1) | constant(tensor<any> t, var<any> value)->() |
13+
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
1414
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
1515
| print | miaobyte | print(tensor<any> , var<string> )->() | print(T1) | print(tensor<any> , var<string> )->() |
16-
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
17-
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
16+
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
17+
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(var<string> shape)->(tensor<any> tensor1) |
1818
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
19+
| sub | miaobyte | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1-T2 | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) |
1920
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,15 @@ namespace deepx::tf
105105
{
106106
Param("c", DataCategory::Tensor, Precision::Any),
107107
})));
108-
109-
// opfactory.add_op(Add_cblas<float>());
110-
// opfactory.add_op(Add_cblas<double>());
111-
108+
tffactory.add_tf(std::make_shared<Add<cublas>>(vector<Param>(
109+
{
110+
Param("a", DataCategory::Tensor, Precision::Any),
111+
Param("b", DataCategory::Tensor, Precision::Any),
112+
}),
113+
vector<Param>(
114+
{
115+
Param("c", DataCategory::Tensor, Precision::Any),
116+
})));
112117
tffactory.add_tf(std::make_shared<Addscalar<miaobyte>>(vector<Param>(
113118
{
114119
Param("A", DataCategory::Tensor, Precision::Any),
@@ -118,9 +123,17 @@ namespace deepx::tf
118123
{
119124
Param("C", DataCategory::Tensor, Precision::Any),
120125
})));
121-
// opfactory.add_op(Sub_miaobyte<float>());
122-
// opfactory.add_op(Sub_miaobyte<double>());
123126

127+
tffactory.add_tf(std::make_shared<Sub<miaobyte>>(vector<Param>(
128+
{
129+
Param("A", DataCategory::Tensor, Precision::Any),
130+
Param("B", DataCategory::Tensor, Precision::Any),
131+
}),
132+
vector<Param>(
133+
{
134+
Param("C", DataCategory::Tensor, Precision::Any),
135+
})));
136+
124137
// opfactory.add_op(Sub_cblas<float>());
125138
// opfactory.add_op(Sub_cblas<double>());
126139

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

Lines changed: 59 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,24 @@
77
namespace deepx::tensorfunc
88
{
99
template <typename T>
10-
__global__ void add_kernel(const T* A, const T* B, T* C, int size) {
10+
__global__ void add_kernel(const T* A, const T* B, T* C,const int size) {
1111
int idx = blockIdx.x * blockDim.x + threadIdx.x;
1212
if (idx < size) {
1313
C[idx] = A[idx] + B[idx];
1414
}
1515
}
16-
template __global__ void add_kernel<double>(const double* A, const double* B, double* C, int size);
17-
template __global__ void add_kernel<float>(const float* A, const float* B, float* C, int size);
18-
template __global__ void add_kernel<half>(const half* A, const half* B, half* C, int size);
19-
template __global__ void add_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, int size);
20-
template __global__ void add_kernel<int64_t>(const int64_t* A, const int64_t* B, int64_t* C, int size);
21-
template __global__ void add_kernel<int32_t>(const int32_t* A, const int32_t* B, int32_t* C, int size);
22-
template __global__ void add_kernel<int16_t>(const int16_t* A, const int16_t* B, int16_t* C, int size);
23-
template __global__ void add_kernel<int8_t>(const int8_t* A, const int8_t* B, int8_t* C, int size);
16+
template __global__ void add_kernel<double>(const double* A, const double* B, double* C,const int size);
17+
template __global__ void add_kernel<float>(const float* A, const float* B, float* C,const int size);
18+
template __global__ void add_kernel<half>(const half* A, const half* B, half* C,const int size);
19+
template __global__ void add_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size);
20+
template __global__ void add_kernel<int64_t>(const int64_t* A, const int64_t* B, int64_t* C,const int size);
21+
template __global__ void add_kernel<int32_t>(const int32_t* A, const int32_t* B, int32_t* C,const int size);
22+
template __global__ void add_kernel<int16_t>(const int16_t* A, const int16_t* B, int16_t* C,const int size);
23+
template __global__ void add_kernel<int8_t>(const int8_t* A, const int8_t* B, int8_t* C,const int size);
2424

2525

2626
template <typename T>
27-
void launch_add(int numBlocks, int blockSize,const T* a, const T* b, T* c, int size)
27+
void launch_add(int numBlocks, int blockSize,const T* a, const T* b, T* c,const int size)
2828
{
2929
// 启动kernel
3030
add_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
@@ -36,31 +36,31 @@ namespace deepx::tensorfunc
3636
}
3737
}
3838

39-
template void launch_add<double>(int numBlocks, int blockSize,const double* a, const double* b, double* c, int size);
40-
template void launch_add<float>(int numBlocks, int blockSize,const float* a, const float* b, float* c, int size);
41-
template void launch_add<half>(int numBlocks, int blockSize,const half* a, const half* b, half* c, int size);
42-
template void launch_add<nv_bfloat16>(int numBlocks, int blockSize,const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, int size);
43-
template void launch_add<int64_t>(int numBlocks, int blockSize,const int64_t* a, const int64_t* b, int64_t* c, int size);
44-
template void launch_add<int32_t>(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c, int size);
45-
template void launch_add<int16_t>(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c, int size);
46-
template void launch_add<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c, int size);
39+
template void launch_add<double>(int numBlocks, int blockSize,const double* a, const double* b, double* c,const int size);
40+
template void launch_add<float>(int numBlocks, int blockSize,const float* a, const float* b, float* c,const int size);
41+
template void launch_add<half>(int numBlocks, int blockSize,const half* a, const half* b, half* c,const int size);
42+
template void launch_add<nv_bfloat16>(int numBlocks, int blockSize,const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size);
43+
template void launch_add<int64_t>(int numBlocks, int blockSize,const int64_t* a, const int64_t* b, int64_t* c,const int size);
44+
template void launch_add<int32_t>(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c,const int size);
45+
template void launch_add<int16_t>(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c,const int size);
46+
template void launch_add<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size);
4747

4848

4949
template <typename T>
50-
__global__ void addscalar_kernel(const T* A, const T scalar, T* C, int size) {
50+
__global__ void addscalar_kernel(const T* A, const T scalar, T* C,const int size) {
5151
int idx = blockIdx.x * blockDim.x + threadIdx.x;
5252
if (idx < size) {
5353
C[idx] = A[idx] + scalar;
5454
}
5555
}
56-
template __global__ void addscalar_kernel<double>(const double* A, const double scalar, double* C, int size);
57-
template __global__ void addscalar_kernel<float>(const float* A, const float scalar, float* C, int size);
58-
template __global__ void addscalar_kernel<half>(const half* A, const half scalar, half* C, int size);
59-
template __global__ void addscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, int size);
60-
template __global__ void addscalar_kernel<int64_t>(const int64_t* A, const int64_t scalar, int64_t* C, int size);
61-
template __global__ void addscalar_kernel<int32_t>(const int32_t* A, const int32_t scalar, int32_t* C, int size);
62-
template __global__ void addscalar_kernel<int16_t>(const int16_t* A, const int16_t scalar, int16_t* C, int size);
63-
template __global__ void addscalar_kernel<int8_t>(const int8_t* A, const int8_t scalar, int8_t* C, int size);
56+
template __global__ void addscalar_kernel<double>(const double* A, const double scalar, double* C,const int size);
57+
template __global__ void addscalar_kernel<float>(const float* A, const float scalar, float* C,const int size);
58+
template __global__ void addscalar_kernel<half>(const half* A, const half scalar, half* C,const int size);
59+
template __global__ void addscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size);
60+
template __global__ void addscalar_kernel<int64_t>(const int64_t* A, const int64_t scalar, int64_t* C,const int size);
61+
template __global__ void addscalar_kernel<int32_t>(const int32_t* A, const int32_t scalar, int32_t* C,const int size);
62+
template __global__ void addscalar_kernel<int16_t>(const int16_t* A, const int16_t scalar, int16_t* C,const int size);
63+
template __global__ void addscalar_kernel<int8_t>(const int8_t* A, const int8_t scalar, int8_t* C,const int size);
6464

6565
template <typename T>
6666
void launch_addscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) {
@@ -74,6 +74,38 @@ namespace deepx::tensorfunc
7474
template void launch_addscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size);
7575
template void launch_addscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size);
7676
template void launch_addscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size);
77+
78+
79+
template <typename T>
80+
__global__ void sub_kernel(const T* A, const T* B, T* C,const int size){
81+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
82+
if (idx < size) {
83+
C[idx] = A[idx] - B[idx];
84+
}
85+
}
86+
template __global__ void sub_kernel<double>(const double* A, const double* B, double* C, const int size);
87+
template __global__ void sub_kernel<float>(const float* A, const float* B, float* C, const int size);
88+
template __global__ void sub_kernel<half>(const half* A, const half* B, half* C, const int size);
89+
template __global__ void sub_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size);
90+
template __global__ void sub_kernel<int64_t>(const int64_t* A, const int64_t* B, int64_t* C, const int size);
91+
template __global__ void sub_kernel<int32_t>(const int32_t* A, const int32_t* B, int32_t* C, const int size);
92+
template __global__ void sub_kernel<int16_t>(const int16_t* A, const int16_t* B, int16_t* C, const int size);
93+
template __global__ void sub_kernel<int8_t>(const int8_t* A, const int8_t* B, int8_t* C, const int size);
94+
95+
template <typename T>
96+
void launch_sub(const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) {
97+
sub_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
98+
}
99+
template void launch_sub<double>(const int numBlocks, const int blockSize, const double* a, const double* b, double* c, const int size);
100+
template void launch_sub<float>(const int numBlocks, const int blockSize, const float* a, const float* b, float* c, const int size);
101+
template void launch_sub<half>(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size);
102+
template void launch_sub<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size);
103+
template void launch_sub<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t* b, int64_t* c, const int size);
104+
template void launch_sub<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t* b, int32_t* c, const int size);
105+
template void launch_sub<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size);
106+
template void launch_sub<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size);
107+
108+
77109
}
78110

79111
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

0 commit comments

Comments
 (0)