Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion doc/design.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,11 @@ deepIR{
excuter执行deepxIR的规则

+ excuter执行deepxIR时,不得修改args中的tensor
+ 但deepIR不限制args和returns中的Param同名,这样可以实现类似inplace的操作


## front/python规则

### 1.命名规则
+ inplace操作的函数,其名为_后缀, 返回值为空
+ 非inplace操作的函数,其名无_后缀
8 changes: 4 additions & 4 deletions doc/excuter/op-mem-cuda/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,25 @@

| Operation | Author | Math Formula | IR Instruction |
|-----------|--------|--------------|----------------|
| dropout | miaobyte | T1.dropout(p,seed)->T3 | dropout(tensor<any> A, var<float32> p, var<int32> seed)->(tensor<any> C) |
| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor<any> tensors, tensor<int8> cases)->(tensor<any> result) |
| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
| equalscalar | miaobyte | mask=compare(T1, scalar) | equalscalar(tensor<any> A, var<any> scalar, var<float64> epsilon)->(tensor<bool> mask) |
| min | miaobyte | T3=min(T1, T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| tan | miaobyte | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
| sin | miaobyte | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| dropout | miaobyte | dropout(p,seed)->A | dropout(var<float32> p, var<int32> seed)->(tensor<any> A) |
| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| log | miaobyte | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
| greater | miaobyte | mask=compare(T1, T2) | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
| cos | miaobyte | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> a)->(tensor<any> b) |
| add | cublas | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
| add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
Expand Down
8 changes: 4 additions & 4 deletions doc/excuter/op-mem-ompsimd/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,22 @@

| Operation | Author | Math Formula | IR Instruction |
|-----------|--------|--------------|----------------|
| dropout | miaobyte | A.dropout(p,seed)->C | dropout(tensor<any> A, var<float32> p, var<int32> seed)->(tensor<any> C) |
| switch | miaobyte | C=switch([tensors],case) | switch(listtensor<any> tensors, tensor<int8> cases)->(tensor<any> C) |
| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
| min | miaobyte | T3=min(T1,T2) | min(tensor<any> A, tensor<any> B)->(tensor<any> C) |
| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| dropout | miaobyte | dropout(p,seed)->A | dropout(var<float32> p, var<int32> seed)->(tensor<any> A) |
| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| log | miaobyte | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
| greater | miaobyte | mask=greater(T1,T2) | greater(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> A)->(tensor<any> C) |
| add | cblas | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
| add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
Expand Down
13 changes: 0 additions & 13 deletions excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,20 +410,7 @@ namespace deepx::tensorfunc
void invert(const Tensor<T> &input, Tensor<T> &output)
{
invertDispatcher<Author, T>::invert(input, output);
}

//dropout(A,p)=>C
template <typename Author, typename T>
struct dropoutDispatcher
{
static void dropout(const Tensor<T> &input, const float p,const unsigned int seed, Tensor<T> &output) = delete;
};

template <typename Author, typename T>
void dropout(const Tensor<T> &input, const float p,const unsigned int seed, Tensor<T> &output)
{
dropoutDispatcher<Author, T>::dropout(input, p, seed, output);
}

} // namespace deepx::tensorfunc

Expand Down
14 changes: 14 additions & 0 deletions excuter/cpp-common/src/deepx/tensorfunc/init.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,20 @@ namespace deepx::tensorfunc
constantDispatcher<Author, T>::constant(tensor, value);
}

//dropout(A,p,seed): in-place — zeroes elements of `input` with probability p using `seed`.
// (The old out-of-place form dropout(A,p)=>C was removed; there is no separate output tensor.)
// Primary template is deleted: each Author must provide a specialization of dropoutDispatcher.
template <typename Author, typename T>
struct dropoutDispatcher
{
static void dropout(Tensor<T> &input, const float p,const unsigned int seed) = delete;
};

// Convenience wrapper: forwards to the Author-specific dispatcher specialization.
template <typename Author, typename T>
void dropout(Tensor<T> &input, const float p,const unsigned int seed)
{
dropoutDispatcher<Author, T>::dropout(input, p, seed);
}


//arange
template <typename Author, typename T>
struct arangeDispatcher
Expand Down
24 changes: 12 additions & 12 deletions excuter/op-mem-cuda/src/client/tfs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,17 @@ namespace deepx::tf
vector<Param>({
Param("t", DataCategory::Tensor, Precision::Any),
})));

// dropout
tffactory.add_tf(std::make_shared<Dropout<miaobyte>>(vector<Param>(
{
Param("p", DataCategory::Var, Precision::Float32),
Param("seed", DataCategory::Var, Precision::Int32),
}),
vector<Param>(
{
Param("A", DataCategory::Tensor, Precision::Any),
})));

tffactory.add_tf(std::make_shared<Arange<miaobyte>>(vector<Param>(
{

Expand Down Expand Up @@ -462,17 +472,7 @@ namespace deepx::tf
{
Param("result", DataCategory::Tensor, Precision::Any),
})));
// dropout
tffactory.add_tf(std::make_shared<Dropout<miaobyte>>(vector<Param>(
{
Param("A", DataCategory::Tensor, Precision::Any),
Param("p", DataCategory::Var, Precision::Float32),
Param("seed", DataCategory::Var, Precision::Int32),
}),
vector<Param>(
{
Param("C", DataCategory::Tensor, Precision::Any),
})));

}
// matmul
void register_matmul(TfFactory &tffactory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,48 +407,6 @@ namespace deepx::tensorfunc
template void launch_invert<int16_t>(const int16_t *a, int16_t *c, const int size);
template void launch_invert<int8_t>(const int8_t *a, int8_t *c, const int size);

//dropout
template <typename T>
__global__ void dropout_kernel(const T *A, const float p,const unsigned int seed, T *C, const int size)
{
int stride = blockDim.x * gridDim.x;
curandState state;
curand_init(seed, threadIdx.x, 0, &state); // 仅初始化一次

for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
{
float rand = curand_uniform(&state);
if (rand < p)
{
C[idx] = 0;
}
else
{
C[idx] = A[idx];
}
}
}

template <typename T>
void launch_dropout(const T *a, const float p,const unsigned int seed, T *c, const int size)
{
auto [numBlocks, blockSize] = BestDims(size);
dropout_kernel<<<numBlocks, blockSize>>>(a, p, seed, c, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
throw std::runtime_error("Failed to launch dropout kernel: " +
std::string(cudaGetErrorString(err)));
}
}
template void launch_dropout<double>(const double *a, const float p,const unsigned int seed, double *c, const int size);
template void launch_dropout<float>(const float *a, const float p,const unsigned int seed, float *c, const int size);
template void launch_dropout<half>(const half *a, const float p,const unsigned int seed, half *c, const int size);
template void launch_dropout<nv_bfloat16>(const nv_bfloat16 *a, const float p,const unsigned int seed, nv_bfloat16 *c, const int size);
template void launch_dropout<int64_t>(const int64_t *a, const float p,const unsigned int seed, int64_t *c, const int size);
template void launch_dropout<int32_t>(const int32_t *a, const float p,const unsigned int seed, int32_t *c, const int size);
template void launch_dropout<int16_t>(const int16_t *a, const float p,const unsigned int seed, int16_t *c, const int size);
template void launch_dropout<int8_t>(const int8_t *a, const float p,const unsigned int seed, int8_t *c, const int size);
}

#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,6 @@ namespace deepx::tensorfunc
template <typename T>
void launch_invert(const T* a, T* c,const int size);

//dropout
template <typename T>
__global__ void dropout_kernel(const T* A, const float p,const unsigned int seed, T* C,const int size);

template <typename T>
void launch_dropout(const T* a, const float p,const unsigned int seed, T* c,const int size);
}

#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,6 @@ namespace deepx::tensorfunc
launch_invert( A.data, C.data, A.shape.size);
}
};

template <typename T>
struct dropoutDispatcher<miaobyte, T>
{
static void dropout(const Tensor<T> &A, const float p,const unsigned int seed, Tensor<T> &C)
{
launch_dropout(A.data, p, seed, C.data, A.shape.size);
}
};
}

#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP
46 changes: 44 additions & 2 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "deepx/tensorfunc/cuda.hpp"
namespace deepx::tensorfunc
{
// 填充
// constant
template <typename T>
__global__ void kernel_constant(T *data, const T value, const int size)
{
Expand Down Expand Up @@ -40,7 +42,47 @@ namespace deepx::tensorfunc
template void launch_constant<int8_t>(int8_t *a, const int8_t value, const int size);
template void launch_constant<bool>(bool *a, const bool value, const int size);

// 添加kernel函数
// dropout (in-place): zeroes A[idx] with probability p, leaves it unchanged otherwise.
// Grid-stride loop, so any launch configuration covers all `size` elements.
// NOTE(review): survivors are NOT rescaled by 1/(1-p) (i.e. this is not "inverted
// dropout") — confirm that callers apply any scaling they need.
template <typename T>
__global__ void dropout_kernel(T *A, const float p, const unsigned int seed, const int size)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // One curand state per thread, initialized once per launch. The subsequence must
    // be the GLOBAL thread id: seeding with threadIdx.x alone makes every block's
    // thread k draw the identical random sequence, correlating the dropout mask
    // across blocks.
    curandState state;
    curand_init(seed, gid, 0, &state);

    for (int idx = gid; idx < size; idx += stride)
    {
        float rand = curand_uniform(&state);
        if (rand < p)
        {
            A[idx] = 0;
        }
    }
}

// Host wrapper for the in-place dropout kernel over `size` elements of `a`.
// Launch geometry comes from BestDims(size). Throws std::runtime_error if the
// launch itself fails; asynchronous execution errors surface at the next
// synchronizing CUDA call, as with the other launchers in this file.
template <typename T>
void launch_dropout(T *a, const float p, const unsigned int seed, const int size)
{
    const auto [grid, block] = BestDims(size);
    dropout_kernel<<<grid, block>>>(a, p, seed, size);
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        throw std::runtime_error("Failed to launch dropout kernel: " +
                                 std::string(cudaGetErrorString(status)));
    }
}
template void launch_dropout<double>(double *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<float>(float *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<half>(half *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<nv_bfloat16>(nv_bfloat16 *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int64_t>(int64_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int32_t>(int32_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int16_t>(int16_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int8_t>(int8_t *a, const float p, const unsigned int seed, const int size);

// 初始化
// arange
template <typename T>
__global__ void kernel_arange(T *data, const float start, const float step, const int size)
{
Expand Down Expand Up @@ -133,7 +175,7 @@ namespace deepx::tensorfunc
void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size)
{
auto [numBlocks, blockSize] = BestDims(size);
kernel_normal<<<numBlocks, blockSize>>>(a,float(mean), float(stddev), seed, size);
kernel_normal<<<numBlocks, blockSize>>>(a, float(mean), float(stddev), seed, size);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
throw std::runtime_error("Failed to launch normal kernel");
Expand Down
16 changes: 14 additions & 2 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,36 @@

namespace deepx::tensorfunc
{
//填充
template <typename T>
__global__ void kernel_constant(T *data, const T value, const int size);

template <typename T>
void launch_constant(T *a, const T value, const int size);


//dropout
template <typename T>
__global__ void dropout_kernel(T* A, const float p,const unsigned int seed,const int size);

template <typename T>
void launch_dropout(T* a, const float p,const unsigned int seed,const int size);

//初始化
//arange
template <typename T>
__global__ void kernel_arange(T *data, const float start, const float step, const int size);

template <typename T>
void launch_arange(T *a, const T start, const T step, const int size);


//uniform
template <typename T>
__global__ void kernel_uniform(T *data, const float low, const float high, const unsigned int seed, const int size);

template <typename T>
void launch_uniform(T *a, const T low, const T high, const unsigned int seed, const int size);

//normal
template <typename T>
__global__ void kernel_normal(T *data, const float mean, const float stddev, const unsigned int seed, const int size);

Expand Down
11 changes: 11 additions & 0 deletions excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ namespace deepx::tensorfunc
}
};



// miaobyte backend for in-place dropout: zeroes elements of A with probability p
// using `seed`, by forwarding the raw data pointer and element count to the CUDA
// launcher. A is modified in place; no separate output tensor is produced.
template <typename T>
struct dropoutDispatcher<miaobyte, T>
{
static void dropout(Tensor<T> &A, const float p,const unsigned int seed)
{
launch_dropout(A.data, p, seed, A.shape.size);
}
};

// arange
template <typename T>
struct arangeDispatcher<miaobyte, T>
Expand Down
Loading
Loading