77namespace deepx ::tensorfunc
88{
99 template <typename T>
10- __global__ void add_kernel (const T* A, const T* B, T* C, int size) {
10+ __global__ void add_kernel (const T* A, const T* B, T* C,const int size) {
1111 int idx = blockIdx .x * blockDim .x + threadIdx .x ;
1212 if (idx < size) {
1313 C[idx] = A[idx] + B[idx];
1414 }
1515 }
16- template __global__ void add_kernel<double >(const double * A, const double * B, double * C, int size);
17- template __global__ void add_kernel<float >(const float * A, const float * B, float * C, int size);
18- template __global__ void add_kernel<half>(const half* A, const half* B, half* C, int size);
19- template __global__ void add_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, int size);
20- template __global__ void add_kernel<int64_t >(const int64_t * A, const int64_t * B, int64_t * C, int size);
21- template __global__ void add_kernel<int32_t >(const int32_t * A, const int32_t * B, int32_t * C, int size);
22- template __global__ void add_kernel<int16_t >(const int16_t * A, const int16_t * B, int16_t * C, int size);
23- template __global__ void add_kernel<int8_t >(const int8_t * A, const int8_t * B, int8_t * C, int size);
16+ template __global__ void add_kernel<double >(const double * A, const double * B, double * C,const int size);
17+ template __global__ void add_kernel<float >(const float * A, const float * B, float * C,const int size);
18+ template __global__ void add_kernel<half>(const half* A, const half* B, half* C,const int size);
19+ template __global__ void add_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C,const int size);
20+ template __global__ void add_kernel<int64_t >(const int64_t * A, const int64_t * B, int64_t * C,const int size);
21+ template __global__ void add_kernel<int32_t >(const int32_t * A, const int32_t * B, int32_t * C,const int size);
22+ template __global__ void add_kernel<int16_t >(const int16_t * A, const int16_t * B, int16_t * C,const int size);
23+ template __global__ void add_kernel<int8_t >(const int8_t * A, const int8_t * B, int8_t * C,const int size);
2424
2525
    // Host-side launcher for add_kernel: C = A + B element-wise on the device,
    // using the caller-supplied grid/block configuration.
    // NOTE(review): this span is diff residue — BOTH the pre-image and
    // post-image signature lines are still present below, and the "@@" hunk
    // header hides lines 31-35 of the real file (the two trailing braces
    // suggest an elided error-check block, presumably around cudaGetLastError
    // — confirm against the repository). Preserved verbatim rather than
    // reconstructed by guesswork.
    template <typename T>
    void launch_add (int numBlocks, int blockSize,const T* a, const T* b, T* c, int size)
    void launch_add (int numBlocks, int blockSize,const T* a, const T* b, T* c,const int size)
    {
        // launch the kernel
        add_kernel<<<numBlocks, blockSize>>> (a, b, c, size);
@@ -36,31 +36,31 @@ namespace deepx::tensorfunc
        }
    }
3838
39- template void launch_add<double >(int numBlocks, int blockSize,const double * a, const double * b, double * c, int size);
40- template void launch_add<float >(int numBlocks, int blockSize,const float * a, const float * b, float * c, int size);
41- template void launch_add<half>(int numBlocks, int blockSize,const half* a, const half* b, half* c, int size);
42- template void launch_add<nv_bfloat16>(int numBlocks, int blockSize,const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, int size);
43- template void launch_add<int64_t >(int numBlocks, int blockSize,const int64_t * a, const int64_t * b, int64_t * c, int size);
44- template void launch_add<int32_t >(int numBlocks, int blockSize, const int32_t * a, const int32_t * b, int32_t * c, int size);
45- template void launch_add<int16_t >(int numBlocks, int blockSize, const int16_t * a, const int16_t * b, int16_t * c, int size);
46- template void launch_add<int8_t >(int numBlocks, int blockSize, const int8_t * a, const int8_t * b, int8_t * c, int size);
39+ template void launch_add<double >(int numBlocks, int blockSize,const double * a, const double * b, double * c,const int size);
40+ template void launch_add<float >(int numBlocks, int blockSize,const float * a, const float * b, float * c,const int size);
41+ template void launch_add<half>(int numBlocks, int blockSize,const half* a, const half* b, half* c,const int size);
42+ template void launch_add<nv_bfloat16>(int numBlocks, int blockSize,const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c,const int size);
43+ template void launch_add<int64_t >(int numBlocks, int blockSize,const int64_t * a, const int64_t * b, int64_t * c,const int size);
44+ template void launch_add<int32_t >(int numBlocks, int blockSize, const int32_t * a, const int32_t * b, int32_t * c,const int size);
45+ template void launch_add<int16_t >(int numBlocks, int blockSize, const int16_t * a, const int16_t * b, int16_t * c,const int size);
46+ template void launch_add<int8_t >(int numBlocks, int blockSize, const int8_t * a, const int8_t * b, int8_t * c,const int size);
4747
4848
4949 template <typename T>
50- __global__ void addscalar_kernel (const T* A, const T scalar, T* C, int size) {
50+ __global__ void addscalar_kernel (const T* A, const T scalar, T* C,const int size) {
5151 int idx = blockIdx .x * blockDim .x + threadIdx .x ;
5252 if (idx < size) {
5353 C[idx] = A[idx] + scalar;
5454 }
5555 }
56- template __global__ void addscalar_kernel<double >(const double * A, const double scalar, double * C, int size);
57- template __global__ void addscalar_kernel<float >(const float * A, const float scalar, float * C, int size);
58- template __global__ void addscalar_kernel<half>(const half* A, const half scalar, half* C, int size);
59- template __global__ void addscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C, int size);
60- template __global__ void addscalar_kernel<int64_t >(const int64_t * A, const int64_t scalar, int64_t * C, int size);
61- template __global__ void addscalar_kernel<int32_t >(const int32_t * A, const int32_t scalar, int32_t * C, int size);
62- template __global__ void addscalar_kernel<int16_t >(const int16_t * A, const int16_t scalar, int16_t * C, int size);
63- template __global__ void addscalar_kernel<int8_t >(const int8_t * A, const int8_t scalar, int8_t * C, int size);
56+ template __global__ void addscalar_kernel<double >(const double * A, const double scalar, double * C,const int size);
57+ template __global__ void addscalar_kernel<float >(const float * A, const float scalar, float * C,const int size);
58+ template __global__ void addscalar_kernel<half>(const half* A, const half scalar, half* C,const int size);
59+ template __global__ void addscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size);
60+ template __global__ void addscalar_kernel<int64_t >(const int64_t * A, const int64_t scalar, int64_t * C,const int size);
61+ template __global__ void addscalar_kernel<int32_t >(const int32_t * A, const int32_t scalar, int32_t * C,const int size);
62+ template __global__ void addscalar_kernel<int16_t >(const int16_t * A, const int16_t scalar, int16_t * C,const int size);
63+ template __global__ void addscalar_kernel<int8_t >(const int8_t * A, const int8_t scalar, int8_t * C,const int size);
6464
    // Host-side launcher for addscalar_kernel: C = A + scalar element-wise.
    // NOTE(review): the function body and the first five explicit
    // instantiations are hidden behind the "@@ -74,6 +74,38 @@" hunk header
    // below — only the signature and the last three instantiations are
    // visible here. Preserved verbatim; do not reconstruct without the full
    // file.
    template <typename T>
    void launch_addscalar (const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) {
@@ -74,6 +74,38 @@ namespace deepx::tensorfunc
    template void launch_addscalar<int32_t >(const int numBlocks, const int blockSize, const int32_t * a, const int32_t scalar, int32_t * c, const int size);
    template void launch_addscalar<int16_t >(const int numBlocks, const int blockSize, const int16_t * a, const int16_t scalar, int16_t * c, const int size);
    template void launch_addscalar<int8_t >(const int numBlocks, const int blockSize, const int8_t * a, const int8_t scalar, int8_t * c, const int size);
77+
78+
79+ template <typename T>
80+ __global__ void sub_kernel (const T* A, const T* B, T* C,const int size){
81+ int idx = blockIdx .x * blockDim .x + threadIdx .x ;
82+ if (idx < size) {
83+ C[idx] = A[idx] - B[idx];
84+ }
85+ }
    // Explicit instantiations of sub_kernel for every element type the
    // library ships (floating point, CUDA half precision, signed integers),
    // so the definitions in this translation unit are usable from code that
    // only sees the declarations.
    template __global__ void sub_kernel<double >(const double * A, const double * B, double * C, const int size);
    template __global__ void sub_kernel<float >(const float * A, const float * B, float * C, const int size);
    template __global__ void sub_kernel<half>(const half* A, const half* B, half* C, const int size);
    template __global__ void sub_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, const int size);
    template __global__ void sub_kernel<int64_t >(const int64_t * A, const int64_t * B, int64_t * C, const int size);
    template __global__ void sub_kernel<int32_t >(const int32_t * A, const int32_t * B, int32_t * C, const int size);
    template __global__ void sub_kernel<int16_t >(const int16_t * A, const int16_t * B, int16_t * C, const int size);
    template __global__ void sub_kernel<int8_t >(const int8_t * A, const int8_t * B, int8_t * C, const int size);
94+
95+ template <typename T>
96+ void launch_sub (const int numBlocks, const int blockSize, const T* a, const T* b, T* c, const int size) {
97+ sub_kernel<<<numBlocks, blockSize>>> (a, b, c, size);
98+ }
    // Explicit instantiations of launch_sub, mirroring the sub_kernel
    // instantiation list above so host code can link against any supported
    // element type.
    template void launch_sub<double >(const int numBlocks, const int blockSize, const double * a, const double * b, double * c, const int size);
    template void launch_sub<float >(const int numBlocks, const int blockSize, const float * a, const float * b, float * c, const int size);
    template void launch_sub<half>(const int numBlocks, const int blockSize, const half* a, const half* b, half* c, const int size);
    template void launch_sub<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, const int size);
    template void launch_sub<int64_t >(const int numBlocks, const int blockSize, const int64_t * a, const int64_t * b, int64_t * c, const int size);
    template void launch_sub<int32_t >(const int numBlocks, const int blockSize, const int32_t * a, const int32_t * b, int32_t * c, const int size);
    template void launch_sub<int16_t >(const int numBlocks, const int blockSize, const int16_t * a, const int16_t * b, int16_t * c, const int size);
    template void launch_sub<int8_t >(const int numBlocks, const int blockSize, const int8_t * a, const int8_t * b, int8_t * c, const int size);
107+
108+
77109}
78110
79111#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH