Skip to content

Commit f71f809

Browse files
committed
excuter(cpu/cuda):subscalar
1 parent d184866 commit f71f809

10 files changed

Lines changed: 320 additions & 98 deletions

File tree

doc/excuter/op-mem-cuda/list.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
| Operation | Author | Func Def | Math Formula | IR Instruction |
66
|-----------|--------|------------|--------------|----------------|
77
| addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
8+
| add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
89
| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
910
| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
11+
| subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
1012
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
1113
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
1214
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
| add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
1010
| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
1111
| uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
12+
| subscalar | miaobyte | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1-scalar | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
1213
| arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
1314
| constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
1415
| print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ namespace deepx::tensorfunc
2424
template <typename Author, typename T>
2525
struct addscalarDispatcher
2626
{
27-
static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
27+
static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output){
28+
throw NotImplementError("addscalar");
29+
}
2830
};
2931

3032
template <typename Author, typename T>
@@ -36,7 +38,9 @@ namespace deepx::tensorfunc
3638
template <typename Author, typename T>
3739
struct subDispatcher
3840
{
39-
static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
41+
static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C){
42+
throw NotImplementError("sub");
43+
}
4044
};
4145

4246
template <typename Author, typename T>
@@ -48,7 +52,9 @@ namespace deepx::tensorfunc
4852
template <typename Author, typename T>
4953
struct subscalarDispatcher
5054
{
51-
static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
55+
static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output){
56+
throw NotImplementError("subscalar");
57+
}
5258
};
5359

5460
template <typename Author, typename T>

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ namespace deepx::tf
114114
{
115115
Param("c", DataCategory::Tensor, Precision::Any),
116116
})));
117-
tffactory.add_tf(std::make_shared<Addscalar<miaobyte>>(vector<Param>(
117+
tffactory.add_tf(std::make_shared<AddScalar<miaobyte>>(vector<Param>(
118118
{
119119
Param("A", DataCategory::Tensor, Precision::Any),
120120
Param("b", DataCategory::Var, Precision::Any),
@@ -133,7 +133,16 @@ namespace deepx::tf
133133
{
134134
Param("C", DataCategory::Tensor, Precision::Any),
135135
})));
136-
136+
tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
137+
{
138+
Param("A", DataCategory::Tensor, Precision::Any),
139+
Param("b", DataCategory::Var, Precision::Any),
140+
}),
141+
vector<Param>(
142+
{
143+
Param("C", DataCategory::Tensor, Precision::Any),
144+
})));
145+
137146
// opfactory.add_op(Sub_cblas<float>());
138147
// opfactory.add_op(Sub_cblas<double>());
139148

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,34 @@ namespace deepx::tensorfunc
105105
template void launch_sub<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size);
106106
template void launch_sub<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size);
107107

108-
108+
// Element-wise "tensor minus scalar" kernel.
//
// Every thread derives one global index from its block and thread
// coordinates and, when that index is below `size`, stores
// A[index] - scalar into C[index]. Threads whose index is not below
// `size` do nothing.
template <typename T>
__global__ void subscalar_kernel(const T* A, const T scalar, T* C, const int size)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= size)
    {
        return;
    }
    C[gid] = A[gid] - scalar;
}

// Explicit instantiations, one per supported element type.
template __global__ void subscalar_kernel<double>(
    const double* A, const double scalar, double* C, const int size);
template __global__ void subscalar_kernel<float>(
    const float* A, const float scalar, float* C, const int size);
template __global__ void subscalar_kernel<half>(
    const half* A, const half scalar, half* C, const int size);
template __global__ void subscalar_kernel<nv_bfloat16>(
    const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, const int size);
template __global__ void subscalar_kernel<int64_t>(
    const int64_t* A, const int64_t scalar, int64_t* C, const int size);
template __global__ void subscalar_kernel<int32_t>(
    const int32_t* A, const int32_t scalar, int32_t* C, const int size);
template __global__ void subscalar_kernel<int16_t>(
    const int16_t* A, const int16_t scalar, int16_t* C, const int size);
template __global__ void subscalar_kernel<int8_t>(
    const int8_t* A, const int8_t scalar, int8_t* C, const int size);
123+
124+
// Host-side entry point for the subscalar kernel.
//
// Launches subscalar_kernel<T> with `numBlocks` blocks of `blockSize`
// threads, forwarding the input pointer `a`, the scalar, the output
// pointer `c` and the element count `size` unchanged.
template <typename T>
void launch_subscalar(const int numBlocks, const int blockSize,
                      const T* a, const T scalar, T* c, const int size)
{
    subscalar_kernel<T><<<numBlocks, blockSize>>>(a, scalar, c, size);
}

// Explicit instantiations, one per supported element type.
template void launch_subscalar<double>(
    const int numBlocks, const int blockSize,
    const double* a, const double scalar, double* c, const int size);
template void launch_subscalar<float>(
    const int numBlocks, const int blockSize,
    const float* a, const float scalar, float* c, const int size);
template void launch_subscalar<half>(
    const int numBlocks, const int blockSize,
    const half* a, const half scalar, half* c, const int size);
template void launch_subscalar<nv_bfloat16>(
    const int numBlocks, const int blockSize,
    const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size);
template void launch_subscalar<int64_t>(
    const int numBlocks, const int blockSize,
    const int64_t* a, const int64_t scalar, int64_t* c, const int size);
template void launch_subscalar<int32_t>(
    const int numBlocks, const int blockSize,
    const int32_t* a, const int32_t scalar, int32_t* c, const int size);
template void launch_subscalar<int16_t>(
    const int numBlocks, const int blockSize,
    const int16_t* a, const int16_t scalar, int16_t* c, const int size);
template void launch_subscalar<int8_t>(
    const int numBlocks, const int blockSize,
    const int8_t* a, const int8_t scalar, int8_t* c, const int size);
109136
}
110137

111138
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,38 @@ namespace deepx::tensorfunc
103103

104104
template <>
105105
void launch_sub<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size);
106-
106+
107+
// subscalar
108+
// Kernel prototype: C[i] = A[i] - scalar for the first `size` elements.
template <typename T>
__global__ void subscalar_kernel(const T* A, const T scalar, T* C, const int size);

// Host-side launcher prototype for subscalar_kernel.
template <typename T>
void launch_subscalar(const int numBlocks, const int blockSize,
                      const T* a, const T scalar, T* c, const int size);

// Per-type declarations of the launcher, one for each supported element type.
template <>
void launch_subscalar<double>(const int numBlocks, const int blockSize,
                              const double* a, const double scalar, double* c, const int size);

template <>
void launch_subscalar<float>(const int numBlocks, const int blockSize,
                             const float* a, const float scalar, float* c, const int size);

template <>
void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize,
                                   const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size);

template <>
void launch_subscalar<__half>(const int numBlocks, const int blockSize,
                              const __half* a, const __half scalar, __half* c, const int size);

template <>
void launch_subscalar<int64_t>(const int numBlocks, const int blockSize,
                               const int64_t* a, const int64_t scalar, int64_t* c, const int size);

template <>
void launch_subscalar<int32_t>(const int numBlocks, const int blockSize,
                               const int32_t* a, const int32_t scalar, int32_t* c, const int size);

template <>
void launch_subscalar<int16_t>(const int numBlocks, const int blockSize,
                               const int16_t* a, const int16_t scalar, int16_t* c, const int size);

template <>
void launch_subscalar<int8_t>(const int numBlocks, const int blockSize,
                              const int8_t* a, const int8_t scalar, int8_t* c, const int size);
137+
107138
}
108139

109140
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,20 @@ namespace deepx::tensorfunc
5555
launch_sub(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size);
5656
}
5757
};
58+
59+
template <typename T>
60+
struct subscalarDispatcher<miaobyte, T>
61+
{
62+
static void subscalar(const Tensor<T> &A, const T scalar, Tensor<T> &C)
63+
{
64+
if (A.shape.size != C.shape.size) {
65+
throw TensorShapeError("subscalar");
66+
}
67+
const int blockSize = A.shape.size > 256 ? 256 : A.shape.size;
68+
int numBlocks = (A.shape.size + blockSize - 1) / blockSize;
69+
launch_subscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size);
70+
}
71+
};
5872
}
5973

6074
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP

excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,18 @@ namespace deepx::tf
8383
};
8484

8585
template <typename Author>
86-
class Addscalar : public TF
86+
class AddScalar : public TF
8787
{
8888
public:
89-
Addscalar(const vector<Param> &args, const vector<Param> &returns)
89+
AddScalar(const vector<Param> &args, const vector<Param> &returns)
9090
{
9191
this->name = "addscalar";
9292
this->author = Author::name();
9393
this->args = args;
9494
this->returns = returns;
9595
}
9696

97-
Addscalar(string text)
97+
AddScalar(string text)
9898
{
9999
this->parse(text);
100100
this->author = Author::name();
@@ -109,7 +109,7 @@ namespace deepx::tf
109109
}
110110
shared_ptr<TF> clone() const override
111111
{
112-
return make_shared<Addscalar<Author>>(*this);
112+
return make_shared<AddScalar<Author>>(*this);
113113
}
114114
int run(shared_ptr<MemBase> mem, string &error) override
115115
{
@@ -226,6 +226,80 @@ namespace deepx::tf
226226
return 0;
227227
}
228228
};
229+
230+
template <typename Author>
231+
class SubScalar : public TF
232+
{
233+
public:
234+
SubScalar(const vector<Param> &args, const vector<Param> &returns)
235+
{
236+
this->name = "subscalar";
237+
this->author = Author::name();
238+
this->args = args;
239+
this->returns = returns;
240+
}
241+
242+
SubScalar(string text)
243+
{
244+
this->parse(text);
245+
this->author = Author::name();
246+
if (this->name != "subscalar")
247+
{
248+
throw std::runtime_error("Invalid name: " + this->name);
249+
}
250+
}
251+
string math_formula() const override
252+
{
253+
return "T3=T1-scalar";
254+
}
255+
shared_ptr<TF> clone() const override
256+
{
257+
return make_shared<SubScalar<Author>>(*this);
258+
}
259+
int run(shared_ptr<MemBase> mem, string &error) override
260+
{
261+
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
262+
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
263+
if (a_type != c_type)
264+
{
265+
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
266+
return 1;
267+
}
268+
switch (a_type)
269+
{
270+
case Precision::Float64:
271+
tensorfunc::subscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
272+
break;
273+
case Precision::Float32:
274+
tensorfunc::subscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
275+
break;
276+
case Precision::Float16:
277+
tensorfunc::subscalar<Author, half>(*mem->gettensor<half>(this->args[0].textvalue), this->getvar<half>(1, mem), *mem->gettensor<half>(this->returns[0].textvalue));
278+
break;
279+
case Precision::BFloat16:
280+
tensorfunc::subscalar<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), this->getvar<nv_bfloat16>(1, mem), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
281+
break;
282+
case Precision::Int64:
283+
tensorfunc::subscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
284+
break;
285+
case Precision::Int32:
286+
tensorfunc::subscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
287+
break;
288+
case Precision::Int16:
289+
tensorfunc::subscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
290+
break;
291+
case Precision::Int8:
292+
tensorfunc::subscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
293+
break;
294+
default:
295+
error = "Unsupported dtype: " + precision_str(a_type);
296+
return 1;
297+
}
298+
return 0;
299+
}
300+
};
301+
302+
229303
};
230304

231305
#endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP

excuter/op-mem-ompsimd/src/client/tfs.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,16 @@ namespace deepx::tf
140140
{
141141
Param("c", DataCategory::Tensor, Precision::Any),
142142
})));
143+
144+
tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
145+
{
146+
Param("a", DataCategory::Tensor, Precision::Any),
147+
Param("scalar", DataCategory::Var, Precision::Any),
148+
}),
149+
vector<Param>(
150+
{
151+
Param("c", DataCategory::Tensor, Precision::Any),
152+
})));
143153
// opfactory.add_op(Addscalar_miaobyte<float>());
144154
// opfactory.add_op(Addscalar_miaobyte<double>());
145155

0 commit comments

Comments
 (0)