Skip to content

Commit da13e9f

Browse files
committed
drop->tftype = "init"
1 parent b00f9f5 commit da13e9f

26 files changed

Lines changed: 387 additions & 393 deletions

File tree

doc/design.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,11 @@ deepIR{
1818
excuter执行deepxIR的规则
1919

2020
+ excuter执行deepxIR时,不得修改args中的tensor
21-
+ 但deepIR不限制args和returns中的Param同名,这样可以实现类似inplace的操作
21+
+ 但deepIR不限制args和returns中的Param同名,这样可以实现类似inplace的操作
22+
23+
24+
## front/python规则
25+
26+
### 1.命名规则
27+
+ inplace操作的函数,其名以_为后缀,返回值为空
28+
+ 非inplace操作的函数,其名无_后缀

doc/excuter/op-mem-cuda/list.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,25 +48,25 @@
4848

4949
| Operation | Author | Math Formula | IR Instruction |
5050
|-----------|--------|--------------|----------------|
51-
| dropout | miaobyte | T1.dropout(p,seed)->T3 | dropout(tensor<any> A, var<float32> p, var<int32> seed)->(tensor<any> C) |
5251
| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor<any> tensors, tensor<int8> cases)->(tensor<any> result) |
5352
| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
5453
| equalscalar | miaobyte | mask=compare(T1, scalar) | equalscalar(tensor<any> A, var<any> scalar, var<float64> epsilon)->(tensor<bool> mask) |
5554
| min | miaobyte | T3=min(T1, T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
5655
| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
5756
| tan | miaobyte | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
5857
| sin | miaobyte | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
58+
| dropout | miaobyte | dropout(p,seed)->A | dropout(var<float32> p, var<int32> seed)->(tensor<any> A) |
5959
| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6060
| log | miaobyte | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
6161
| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
6262
| greater | miaobyte | mask=compare(T1, T2) | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
6363
| lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
6464
| cos | miaobyte | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
65-
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
66-
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
6765
| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
68-
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
6966
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
67+
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
68+
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
69+
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
7070
| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> a)->(tensor<any> b) |
7171
| add | cublas | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
7272
| add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,22 +49,22 @@
4949

5050
| Operation | Author | Math Formula | IR Instruction |
5151
|-----------|--------|--------------|----------------|
52-
| dropout | miaobyte | A.dropout(p,seed)->C | dropout(tensor<any> A, var<float32> p, var<int32> seed)->(tensor<any> C) |
5352
| switch | miaobyte | C=switch([tensors],case) | switch(listtensor<any> tensors, tensor<int8> cases)->(tensor<any> C) |
5453
| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
5554
| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
5655
| min | miaobyte | T3=min(T1,T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
5756
| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
57+
| dropout | miaobyte | dropout(p,seed)->A | dropout(var<float32> p, var<int32> seed)->(tensor<any> A) |
5858
| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
5959
| log | miaobyte | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
6060
| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
6161
| greater | miaobyte | mask=greater(T1,T2) | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
6262
| lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
63-
| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
64-
| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6563
| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
66-
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
6764
| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
65+
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
66+
| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
67+
| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6868
| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> A)->(tensor<any> C) |
6969
| add | cblas | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
7070
| add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -410,20 +410,7 @@ namespace deepx::tensorfunc
410410
void invert(const Tensor<T> &input, Tensor<T> &output)
411411
{
412412
invertDispatcher<Author, T>::invert(input, output);
413-
}
414-
415-
//dropout(A,p)=>C
416-
template <typename Author, typename T>
417-
struct dropoutDispatcher
418-
{
419-
static void dropout(const Tensor<T> &input, const float p,const unsigned int seed, Tensor<T> &output) = delete;
420413
};
421-
422-
template <typename Author, typename T>
423-
void dropout(const Tensor<T> &input, const float p,const unsigned int seed, Tensor<T> &output)
424-
{
425-
dropoutDispatcher<Author, T>::dropout(input, p, seed, output);
426-
}
427414

428415
} // namespace deepx::tensorfunc
429416

excuter/cpp-common/src/deepx/tensorfunc/init.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,20 @@ namespace deepx::tensorfunc
1919
constantDispatcher<Author, T>::constant(tensor, value);
2020
}
2121

22+
//dropout(A,p)=>C
23+
template <typename Author, typename T>
24+
struct dropoutDispatcher
25+
{
26+
static void dropout(Tensor<T> &input, const float p,const unsigned int seed) = delete;
27+
};
28+
29+
template <typename Author, typename T>
30+
void dropout(Tensor<T> &input, const float p,const unsigned int seed)
31+
{
32+
dropoutDispatcher<Author, T>::dropout(input, p, seed);
33+
}
34+
35+
2236
//arange
2337
template <typename Author, typename T>
2438
struct arangeDispatcher

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,17 @@ namespace deepx::tf
9191
vector<Param>({
9292
Param("t", DataCategory::Tensor, Precision::Any),
9393
})));
94-
94+
// dropout
95+
tffactory.add_tf(std::make_shared<Dropout<miaobyte>>(vector<Param>(
96+
{
97+
Param("p", DataCategory::Var, Precision::Float32),
98+
Param("seed", DataCategory::Var, Precision::Int32),
99+
}),
100+
vector<Param>(
101+
{
102+
Param("A", DataCategory::Tensor, Precision::Any),
103+
})));
104+
95105
tffactory.add_tf(std::make_shared<Arange<miaobyte>>(vector<Param>(
96106
{
97107

@@ -462,17 +472,7 @@ namespace deepx::tf
462472
{
463473
Param("result", DataCategory::Tensor, Precision::Any),
464474
})));
465-
// dropout
466-
tffactory.add_tf(std::make_shared<Dropout<miaobyte>>(vector<Param>(
467-
{
468-
Param("A", DataCategory::Tensor, Precision::Any),
469-
Param("p", DataCategory::Var, Precision::Float32),
470-
Param("seed", DataCategory::Var, Precision::Int32),
471-
}),
472-
vector<Param>(
473-
{
474-
Param("C", DataCategory::Tensor, Precision::Any),
475-
})));
475+
476476
}
477477
// matmul
478478
void register_matmul(TfFactory &tffactory)

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -407,48 +407,6 @@ namespace deepx::tensorfunc
407407
template void launch_invert<int16_t>(const int16_t *a, int16_t *c, const int size);
408408
template void launch_invert<int8_t>(const int8_t *a, int8_t *c, const int size);
409409

410-
//dropout
411-
template <typename T>
412-
__global__ void dropout_kernel(const T *A, const float p,const unsigned int seed, T *C, const int size)
413-
{
414-
int stride = blockDim.x * gridDim.x;
415-
curandState state;
416-
curand_init(seed, threadIdx.x, 0, &state); // 仅初始化一次
417-
418-
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
419-
{
420-
float rand = curand_uniform(&state);
421-
if (rand < p)
422-
{
423-
C[idx] = 0;
424-
}
425-
else
426-
{
427-
C[idx] = A[idx];
428-
}
429-
}
430-
}
431-
432-
template <typename T>
433-
void launch_dropout(const T *a, const float p,const unsigned int seed, T *c, const int size)
434-
{
435-
auto [numBlocks, blockSize] = BestDims(size);
436-
dropout_kernel<<<numBlocks, blockSize>>>(a, p, seed, c, size);
437-
cudaError_t err = cudaGetLastError();
438-
if (err != cudaSuccess)
439-
{
440-
throw std::runtime_error("Failed to launch dropout kernel: " +
441-
std::string(cudaGetErrorString(err)));
442-
}
443-
}
444-
template void launch_dropout<double>(const double *a, const float p,const unsigned int seed, double *c, const int size);
445-
template void launch_dropout<float>(const float *a, const float p,const unsigned int seed, float *c, const int size);
446-
template void launch_dropout<half>(const half *a, const float p,const unsigned int seed, half *c, const int size);
447-
template void launch_dropout<nv_bfloat16>(const nv_bfloat16 *a, const float p,const unsigned int seed, nv_bfloat16 *c, const int size);
448-
template void launch_dropout<int64_t>(const int64_t *a, const float p,const unsigned int seed, int64_t *c, const int size);
449-
template void launch_dropout<int32_t>(const int32_t *a, const float p,const unsigned int seed, int32_t *c, const int size);
450-
template void launch_dropout<int16_t>(const int16_t *a, const float p,const unsigned int seed, int16_t *c, const int size);
451-
template void launch_dropout<int8_t>(const int8_t *a, const float p,const unsigned int seed, int8_t *c, const int size);
452410
}
453411

454412
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,6 @@ namespace deepx::tensorfunc
8989
template <typename T>
9090
void launch_invert(const T* a, T* c,const int size);
9191

92-
//dropout
93-
template <typename T>
94-
__global__ void dropout_kernel(const T* A, const float p,const unsigned int seed, T* C,const int size);
95-
96-
template <typename T>
97-
void launch_dropout(const T* a, const float p,const unsigned int seed, T* c,const int size);
9892
}
9993

10094
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -140,15 +140,6 @@ namespace deepx::tensorfunc
140140
launch_invert( A.data, C.data, A.shape.size);
141141
}
142142
};
143-
144-
template <typename T>
145-
struct dropoutDispatcher<miaobyte, T>
146-
{
147-
static void dropout(const Tensor<T> &A, const float p,const unsigned int seed, Tensor<T> &C)
148-
{
149-
launch_dropout(A.data, p, seed, C.data, A.shape.size);
150-
}
151-
};
152143
}
153144

154145
#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP

excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include "deepx/tensorfunc/cuda.hpp"
88
namespace deepx::tensorfunc
99
{
10+
// 填充
11+
// constant
1012
template <typename T>
1113
__global__ void kernel_constant(T *data, const T value, const int size)
1214
{
@@ -40,7 +42,47 @@ namespace deepx::tensorfunc
4042
template void launch_constant<int8_t>(int8_t *a, const int8_t value, const int size);
4143
template void launch_constant<bool>(bool *a, const bool value, const int size);
4244

43-
// 添加kernel函数
45+
// dropout
46+
// In-place dropout: every element of A is set to 0 when a uniform random draw
// is below p; all other elements are left untouched (kept elements are not
// rescaled).
//
//   A    : buffer of `size` elements, modified in place
//   p    : threshold compared against the curand_uniform draw
//   seed : cuRAND seed, shared by every thread of the launch
//   size : number of elements in A
//
// Works with any 1D launch configuration: each thread starts at its global
// index and advances by the total number of launched threads.
template <typename T>
__global__ void dropout_kernel(T *A, const float p, const unsigned int seed, const int size)
{
    const int stride = blockDim.x * gridDim.x;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads with no element to process skip the (costly) RNG setup.
    if (tid >= size)
    {
        return;
    }

    // The cuRAND sequence number must be unique per thread across the whole
    // grid. Using threadIdx.x alone gives the same sequence to the same lane
    // of every block, so the drop mask would repeat from block to block.
    curandState state;
    curand_init(seed, tid, 0, &state); // initialised once per thread

    for (int idx = tid; idx < size; idx += stride)
    {
        const float draw = curand_uniform(&state);
        if (draw < p)
        {
            A[idx] = 0;
        }
    }
}
62+
63+
// Host-side launcher for dropout_kernel.
//
//   a    : buffer of `size` elements handed straight to the kernel, so it must
//          be addressable from device code; modified in place
//   p    : threshold forwarded to the kernel
//   seed : cuRAND seed forwarded to the kernel
//   size : number of elements in `a`
//
// Throws std::runtime_error when the launch itself is rejected. Only launch
// errors are checked here (cudaGetLastError right after the launch); the
// function does not synchronize, so faults raised while the kernel runs are
// reported by a later CUDA call.
template <typename T>
void launch_dropout(T *a, const float p, const unsigned int seed, const int size)
{
    // An empty buffer has nothing to drop. Returning early also avoids
    // depending on what BestDims yields for 0 elements (a zero-block grid is
    // an invalid launch configuration).
    if (size <= 0)
    {
        return;
    }

    auto [numBlocks, blockSize] = BestDims(size);
    dropout_kernel<<<numBlocks, blockSize>>>(a, p, seed, size);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        throw std::runtime_error("Failed to launch dropout kernel: " +
                                 std::string(cudaGetErrorString(err)));
    }
}

// Explicit instantiations for every element type supported by dropout.
template void launch_dropout<double>(double *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<float>(float *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<half>(half *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<nv_bfloat16>(nv_bfloat16 *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int64_t>(int64_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int32_t>(int32_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int16_t>(int16_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int8_t>(int8_t *a, const float p, const unsigned int seed, const int size);
83+
84+
// 初始化
85+
// arange
4486
template <typename T>
4587
__global__ void kernel_arange(T *data, const float start, const float step, const int size)
4688
{
@@ -133,7 +175,7 @@ namespace deepx::tensorfunc
133175
void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size)
134176
{
135177
auto [numBlocks, blockSize] = BestDims(size);
136-
kernel_normal<<<numBlocks, blockSize>>>(a,float(mean), float(stddev), seed, size);
178+
kernel_normal<<<numBlocks, blockSize>>>(a, float(mean), float(stddev), seed, size);
137179
cudaError_t err = cudaGetLastError();
138180
if (err != cudaSuccess)
139181
throw std::runtime_error("Failed to launch normal kernel");

0 commit comments

Comments
 (0)