From 4992e78448c24cbfa1f1fb4780c7c4b7ed57a920 Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Mon, 28 Apr 2025 21:36:27 +0800 Subject: [PATCH 1/2] =?UTF-8?q?dropout:ompsimd+cuda=E7=9A=84=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CODE_OF_CONDUCT.md | 45 +++ CONTRIBUTING.md | 28 ++ README.md | 23 -- doc/excuter/op-mem-cuda/list.md | 1 + .../src/deepx/tensorfunc/elementwise.hpp | 13 + excuter/op-mem-cuda/src/client/tfs.cpp | 11 + .../tensorfunc/elementwise_miaobyte_basic.cu | 45 +++ .../tensorfunc/elementwise_miaobyte_basic.cuh | 6 + .../tensorfunc/elementwise_miaobyte_basic.hpp | 9 + .../src/deepx/tf/elementwise_basic.hpp | 68 +++++ .../deepx/tensorfunc/elementwise_miaobyte.hpp | 256 ++++++++++-------- .../src/deepx/tf/elementwise.hpp | 61 ++++- 12 files changed, 428 insertions(+), 138 deletions(-) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..96620939 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,45 @@ +# DeepX 行为准则 + +## 我们的承诺 + +作为贡献者和维护者,我们承诺为每个人提供一个开放和欢迎的环境。 + +## 我们的标准 + +有助于创造积极环境的行为包括但不限于: + +- 使用友好和包容的语言 +- 尊重不同的观点和经验 +- 耐心地接受建设性的批评 +- 关注对社区最有利的事情 +- 友善对待其他社区成员 + +不可接受的行为包括但不限于: + +- 使用性化的语言或图像以及不受欢迎的性关注或挑逗 +- 捣乱/煽动/侮辱性/贬损的评论,人身攻击或政治攻击 +- 公开或私下的骚扰 +- 未经明确许可,发布他人的私人信息,如物理或电子地址 +- 其他可以合理地被认为不符合专业行为的行为 + +## 我们的责任 + +项目维护者有责任澄清可接受行为的标准,并应对任何不可接受的行为采取适当和公平的纠正措施。 + +项目维护者有权利和责任删除、编辑或拒绝与本行为准则不符的评论、提交、代码、wiki编辑、问题和其他贡献,并可暂时或永久禁止任何他们认为不适合、威胁、冒犯或有害的贡献者。 + +## 适用范围 + +当个人代表项目或其社区时,本行为准则适用于项目空间和公共空间。 + +## 执行 + +如有滥用、骚扰或其他不可接受的行为,请通过以下方式联系项目团队。所有投诉都将被审查和调查,并将导致认为必要和适当的回应。 + +## 联系信息 + +请通过 [您的联系信息] 联系我们。 + +## 归属 + +本行为准则改编自[贡献者公约](https://www.contributor-covenant.org),版本1.4。 \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..15b9f801 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,28 @@ +# 
deepx 贡献指南 + +deepx框架的发展,主要包括六大类方向 + ++ front: 新增模型、module、python类函数等 ++ 中间层:包括计算图优化器,插件系统(自动KVcache系统),自动分布式化,栈tensor自动释放,自动Inplace化等操作 ++ 新增或修改excuter ++ 增加或修改算子,进一步可以分为leaftensorfunc(不可分割的基础算子),fusedtensorfunc(融合算子) ++ 文档丰富: ++ 运维自动化方向 + +大家可以选择一个方向 + +## 步骤 + +第一次提交 + 1. Fork本仓库(github.com/array2d/deepx)的main分支,到你的github/yourname/deepx + 2. 本地clone github/yourname/deepx + 3. 提交并推送您的更改到你的github:`git commit -m 'Add some feature'` + 4. 创建一个Pull Request。 + +第N次提交 + + 1. 保障你的本地和github/yourname/deepx中均已提pull request并得到merge + 2. 在github/yourname/deepx中sync fork【危险操作,会删除你新增的代码】,拉取(github.com/array2d/deepx) main分支的最新代码 + 3. 本地clone github/yourname/deepx + 4. 提交并推送您的更改到你的github:`git commit -m 'Add some feature'` + 5. 创建一个Pull Request。 \ No newline at end of file diff --git a/README.md b/README.md index 4f53405c..a590faa9 100644 --- a/README.md +++ b/README.md @@ -66,29 +66,6 @@ DeepX可以集成现有的张量计算框架作为执行器,充分利用现有 这种架构使得DeepX可以整合各类先进的计算框架作为执行引擎,同时提供统一的分布式调度和执行能力,为用户提供更灵活的选择和更高的性能。 -## 二.贡献指南 - -也可以参考官方文档的指南 - -https://deepx.array2d.com - -欢迎通过以下方式参与项目共建: - -1. **代码贡献** - - 提交PR前请先创建Issue说明修改内容 - - front项目当前以py为核心 - - excuter:目前规划开发的3类执行器,参考这里如何给excuter添加一个新算子[excuter](doc/excuter/excuter.md) - - cpu: - - cuda: - - jax: - -2. **文档改进** - - 提交文档更新到`doc/`目录 - -3. 
**问题反馈** - - 当前处于高速迭代中,可通过issue反馈问题 - - ### 官方文档 diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index 0fa979ae..b177d4e9 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -48,6 +48,7 @@ | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| +| dropout | miaobyte | T1.dropout(p,seed)->T3 | dropout(tensor A, var p, var seed)->(tensor C) | | switch | miaobyte | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | | greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index 6e3b2072..52fd04bc 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -412,6 +412,19 @@ namespace deepx::tensorfunc invertDispatcher::invert(input, output); } + //dropout(A,p)=>C + template + struct dropoutDispatcher + { + static void dropout(const Tensor &input, const float p,const unsigned int seed, Tensor &output) = delete; + }; + + template + void dropout(const Tensor &input, const float p,const unsigned int seed, Tensor &output) + { + dropoutDispatcher::dropout(input, p, seed, output); + } + } // namespace deepx::tensorfunc #endif // DEEPX_TENSORFUNC_ELEMENTWISE_HPP diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index 4aeaa5ad..5e304d29 100644 --- a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -462,6 +462,17 @@ namespace deepx::tf { Param("result", DataCategory::Tensor, Precision::Any), }))); + // dropout + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", 
DataCategory::Tensor, Precision::Any), + Param("p", DataCategory::Var, Precision::Float32), + Param("seed", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); } // matmul void register_matmul(TfFactory &tffactory) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index 772865f3..8550e6b0 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -3,6 +3,9 @@ #include #include +#include + + #include "deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" #include "deepx/tensorfunc/cuda_math.cuh" @@ -404,6 +407,48 @@ namespace deepx::tensorfunc template void launch_invert(const int16_t *a, int16_t *c, const int size); template void launch_invert(const int8_t *a, int8_t *c, const int size); + // dropout: writes 0 to C[idx] when the uniform draw is below p, otherwise copies A[idx] (no 1/(1-p) rescaling) + template + __global__ void dropout_kernel(const T *A, const float p,const unsigned int seed, T *C, const int size) + { + int stride = blockDim.x * gridDim.x; + curandState state; + curand_init(seed, blockIdx.x * blockDim.x + threadIdx.x, 0, &state); // init once per thread; the subsequence is the global thread index, otherwise every block replays the same random stream + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) + { + float rand = curand_uniform(&state); + if (rand < p) + { + C[idx] = 0; + } + else + { + C[idx] = A[idx]; + } + } + } + + template + void launch_dropout(const T *a, const float p,const unsigned int seed, T *c, const int size) + { + auto [numBlocks, blockSize] = BestDims(size); + dropout_kernel<<>>(a, p, seed, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch dropout kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_dropout(const double *a, const float p,const unsigned int seed, double *c, const int size); + template void launch_dropout(const 
float p,const unsigned int seed, float *c, const int size); + template void launch_dropout(const half *a, const float p,const unsigned int seed, half *c, const int size); + template void launch_dropout(const nv_bfloat16 *a, const float p,const unsigned int seed, nv_bfloat16 *c, const int size); + template void launch_dropout(const int64_t *a, const float p,const unsigned int seed, int64_t *c, const int size); + template void launch_dropout(const int32_t *a, const float p,const unsigned int seed, int32_t *c, const int size); + template void launch_dropout(const int16_t *a, const float p,const unsigned int seed, int16_t *c, const int size); + template void launch_dropout(const int8_t *a, const float p,const unsigned int seed, int8_t *c, const int size); } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh index 4100f38d..c85cce81 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh @@ -89,6 +89,12 @@ namespace deepx::tensorfunc template void launch_invert(const T* a, T* c,const int size); + //dropout + template + __global__ void dropout_kernel(const T* A, const float p,const unsigned int seed, T* C,const int size); + + template + void launch_dropout(const T* a, const float p,const unsigned int seed, T* c,const int size); } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp index b7d3a680..65f7a82d 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp @@ -140,6 +140,15 @@ namespace deepx::tensorfunc launch_invert( A.data, 
C.data, A.shape.size); } }; + + template + struct dropoutDispatcher + { + static void dropout(const Tensor &A, const float p,const unsigned int seed, Tensor &C) + { + launch_dropout(A.data, p, seed, C.data, A.shape.size); + } + }; } #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp index 709a899c..885bb5e6 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp @@ -1033,6 +1033,74 @@ namespace deepx::tf return 0; } }; + + // dropout + template + class Dropout : public TF + { + public: + Dropout(const vector &args, const vector &returns) + { + this->name = "dropout"; + this->metadata.author = Author::name(); + this->tftype = "elementwise"; + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "T1.dropout(p,seed)->T3"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != c_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + 
tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1, mem), this->getvar(2, mem), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; }; #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index f8f0302f..b36902a4 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -1,6 +1,7 @@ #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_HPP #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_HPP +#include #include #include #include @@ -18,8 +19,8 @@ namespace deepx::tensorfunc { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, &B, &C, &scalar_op, &simd_op](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, &B, &C, &scalar_op, 
&simd_op](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); @@ -62,8 +63,8 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, &b, &C, &scalar_op, &simd_op](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, &b, &C, &scalar_op, &simd_op](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j = 0; @@ -98,20 +99,18 @@ namespace deepx::tensorfunc } } - //todtype - template + // todtype + template static void todtype(const Tensor &A, Tensor &C) { - C.shape.rangeElementwiseParallel([&A, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, &C](int i, int i_end) + { for (int j = 0; j < i_end; j++) { C.data[i + j] = static_cast(A.data[i + j]); - } - }); + } }); } - // add template struct addDispatcher @@ -302,23 +301,22 @@ namespace deepx::tensorfunc struct invertDispatcher { static void invert(const Tensor &A, Tensor &C) - { + { if (A.shape == C.shape) { - A.shape.rangeElementwiseParallel([&A, &C](int idx,int idx_end) - { + A.shape.rangeElementwiseParallel([&A, &C](int idx, int idx_end) + { for (int j=0;j struct sqrtDispatcher>> @@ -327,8 +325,8 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -367,8 +365,8 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { size_t j = 0; while (j < i_end) @@ -392,8 +390,8 @@ namespace deepx::tensorfunc { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, &B, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, &B, &C](int i, int i_end) + { for 
(int j = 0; j < i_end; j++) C.data[i+j] = std::pow(A.data[i+j], B.data[i+j]); }); } @@ -413,11 +411,10 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output, &value](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output, &value](int i, int i_end) + { for (int j = 0; j < i_end; j++) - output.data[i+j] = std::pow(input.data[i+j], value); - }); + output.data[i+j] = std::pow(input.data[i+j], value); }); } else { @@ -434,18 +431,17 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output, &value](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output, &value](int i, int i_end) + { for (int j = 0; j < i_end; j++) - output.data[i+j] = std::pow(value, input.data[i+j]); - }); + output.data[i+j] = std::pow(value, input.data[i+j]); }); } else { throw std::invalid_argument("shape mismatch"); } } - }; + }; template struct logDispatcher @@ -455,10 +451,9 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { for (int j = 0; j < i_end; j++) - output.data[i+j] = std::log(input.data[i+j]); - }); + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { for (int j = 0; j < i_end; j++) + output.data[i+j] = std::log(input.data[i+j]); }); } else { @@ -475,10 +470,9 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { for (int j = 0; j < i_end; j++) - output.data[i+j] = std::exp(input.data[i+j]); - }); + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { for (int j = 0; j < i_end; j++) + output.data[i+j] = std::exp(input.data[i+j]); }); } else { @@ -495,8 +489,8 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - 
output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -537,8 +531,8 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -579,8 +573,8 @@ namespace deepx::tensorfunc { if (input.shape == output.shape) { - output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end) - { + output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -620,8 +614,8 @@ namespace deepx::tensorfunc { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, &B, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, &B, &C](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -662,8 +656,8 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, b, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, b, &C](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -704,8 +698,8 @@ namespace deepx::tensorfunc { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, &B, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, &B, &C](int i, int i_end) + { const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j=0; @@ -746,8 +740,8 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - C.shape.rangeElementwiseParallel([&A, b, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&A, b, &C](int i, int i_end) + { const ScalableTag tag; const size_t lanes = 
Lanes(tag); size_t j=0; @@ -780,16 +774,16 @@ namespace deepx::tensorfunc } }; - //equal - template - struct equalDispatcher + // equal + template + struct equalDispatcher { - static void equal(const Tensor &A, const Tensor &B,const float epsilon, Tensor &mask) + static void equal(const Tensor &A, const Tensor &B, const float epsilon, Tensor &mask) { if (A.shape == B.shape && mask.shape == A.shape) - { - A.shape.rangeElementwiseParallel([&A, &B, &mask,epsilon](int i,int i_end) - { + { + A.shape.rangeElementwiseParallel([&A, &B, &mask, epsilon](int i, int i_end) + { for (int j = 0; j < i_end; j++) { if (epsilon == 0) @@ -799,8 +793,7 @@ namespace deepx::tensorfunc else{ mask.data[i+j]=std::abs(A.data[i+j]-B.data[i+j])<=epsilon; } - } - }); + } }); } else { @@ -809,16 +802,16 @@ namespace deepx::tensorfunc } }; - //equalscalar - template - struct equalscalarDispatcher + // equalscalar + template + struct equalscalarDispatcher { - static void equalscalar(const Tensor &A, const T scalar,const float epsilon, Tensor &mask) + static void equalscalar(const Tensor &A, const T scalar, const float epsilon, Tensor &mask) { if (A.shape == mask.shape) { - A.shape.rangeElementwiseParallel([&A, &mask, &scalar,epsilon](int i,int i_end) - { + A.shape.rangeElementwiseParallel([&A, &mask, &scalar, epsilon](int i, int i_end) + { for (int j = 0; j < i_end; j++) { if (epsilon == 0) @@ -828,8 +821,7 @@ namespace deepx::tensorfunc else{ mask.data[i+j]=std::abs(A.data[i+j]-scalar)<=epsilon; } - } - }); + } }); } else { @@ -838,67 +830,64 @@ namespace deepx::tensorfunc }; }; - //less - template - struct lessDispatcher + // less + template + struct lessDispatcher { static void less(const Tensor &A, const Tensor &B, Tensor &mask) { if (A.shape == B.shape && mask.shape == A.shape) { - A.shape.rangeElementwiseParallel([&A, &B, &mask](int i,int i_end) - { + A.shape.rangeElementwiseParallel([&A, &B, &mask](int i, int i_end) + { for (int j = 0; j < i_end; j++) { mask.data[i+j]=A.data[i+j] - struct 
lessscalarDispatcher + // lessscalar + template + struct lessscalarDispatcher { static void lessscalar(const Tensor &A, const T scalar, Tensor &mask) { if (A.shape == mask.shape) { - A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i,int i_end) - { + A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i, int i_end) + { for (int j = 0; j < i_end; j++) { mask.data[i+j]=A.data[i+j] - struct greaterDispatcher + + // greater + template + struct greaterDispatcher { static void greater(const Tensor &A, const Tensor &B, Tensor &mask) { if (A.shape == B.shape && mask.shape == A.shape) { - A.shape.rangeElementwiseParallel([&A, &B, &mask](int i,int i_end) - { + A.shape.rangeElementwiseParallel([&A, &B, &mask](int i, int i_end) + { for (int j = 0; j < i_end; j++) { mask.data[i+j]=A.data[i+j]>B.data[i+j]; - } - }); + } }); } else { @@ -907,52 +896,91 @@ namespace deepx::tensorfunc } }; - //greaterscalar - template - struct greaterscalarDispatcher + // greaterscalar + template + struct greaterscalarDispatcher { static void greaterscalar(const Tensor &A, const T scalar, Tensor &mask) { if (A.shape == mask.shape) { - A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i,int i_end) - { + A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i, int i_end) + { for (int j = 0; j < i_end; j++) { mask.data[i+j]=A.data[i+j]>scalar; - } - }); + } }); } else { throw std::invalid_argument("shape mismatch"); } - } - }; + } + }; - //switch - template - struct switchDispatcher + // switch + template + struct switchDispatcher { - static void Switch(const vector*> tensors,const Tensor &cases, Tensor &C) + static void Switch(const vector *> tensors, const Tensor &cases, Tensor &C) { if (cases.shape == C.shape) { - C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i,int i_end) - { + C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i, int i_end) + { for (int j = 0; j < i_end; j++) { int which_tensor=cases.data[i]; 
C.data[i+j]=tensors[which_tensor]->data[i]; + } }); + } + else + { + throw std::invalid_argument("shape mismatch"); + } + } + }; + + // dropout + template + struct dropoutDispatcher + { + static void dropout(const Tensor &A, const float p, const unsigned int seed, Tensor &C) + { + if (A.shape == C.shape) + { + std::uniform_real_distribution distribution(0, 1); + std::default_random_engine generator; + if (seed != 0) + { + generator.seed(seed); + } + else + { + std::random_device rd; + generator.seed(rd()); } - }); + + A.shape.rangeElementwiseParallel([&A, &C, &p, &distribution, &generator](int i, int i_end) + { + for (int j = 0; j < i_end; j++) + { + double rand = distribution(generator); + if (rand < p) + { + C.data[i+j]=0; + } + else + { + C.data[i+j]=A.data[i+j]; + } + } }); } else { throw std::invalid_argument("shape mismatch"); - } + } } - }; - + }; }; #endif // DEEPX_OP_CPU_ELEMENTWISE_HPP \ No newline at end of file diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 58768784..7c29ddab 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -1900,5 +1900,64 @@ namespace deepx::tf return 0; } }; + + //dropout + template + class Dropout : public TF + { + public: + Dropout(vector args, vector returns) + { + this->name = "dropout"; + this->metadata.author = Author::name(); + this->tftype = "elementwise"; + this->args = args; + this->returns = returns; + } + string math_formula() const override + { + return "A.dropout(p,seed)->C"; + } + shared_ptr clone() const override + { + return make_shared>(*this); + } + int run(shared_ptr mem, string &error) override + { + Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype; + Precision C_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype; + if (a_type != C_type) + { + error = "Type mismatch: " + precision_str(a_type) + " != " + 
precision_str(C_type); + return 1; + } + switch (a_type) + { + case Precision::Float64: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), this->getvar(2,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), this->getvar(2,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), this->getvar(2,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), this->getvar(2,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), this->getvar(2,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::dropout(*mem->gettensor(this->args[0].textvalue), this->getvar(1,mem,true), this->getvar(2,mem,true), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(a_type); + return 1; + } + return 0; + } + }; + }; -#endif +#endif // DEEPX_TF_ELEMENTWISE_HPP From b46fe51cef93c01f466b40e772fa5fcdf578bbcc Mon Sep 17 00:00:00 2001 From: lipeng <734991033@qq.com> Date: Mon, 28 Apr 2025 21:36:30 +0800 Subject: [PATCH 2/2] =?UTF-8?q?dropout:ompsimd+cuda=E7=9A=84=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/excuter/op-mem-ompsimd/list.md | 1 + excuter/op-mem-ompsimd/src/client/tfs.cpp | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 7d676562..58772944 100644 --- 
a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -49,6 +49,7 @@ | Operation | Author | Math Formula | IR Instruction | |-----------|--------|--------------|----------------| +| dropout | miaobyte | A.dropout(p,seed)->C | dropout(tensor A, var p, var seed)->(tensor C) | | switch | miaobyte | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | | greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | | equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index d2893cfc..32553db4 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -465,6 +465,17 @@ namespace deepx::tf { Param("C", DataCategory::Tensor, Precision::Any), }))); + // dropout author=miaobyte + tffactory.add_tf(std::make_shared>(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + Param("p", DataCategory::Var, Precision::Float32), + Param("seed", DataCategory::Var, Precision::Int32), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }) )); } // matmul void register_matmul(TfFactory &tffactory)