From 4992e78448c24cbfa1f1fb4780c7c4b7ed57a920 Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Mon, 28 Apr 2025 21:36:27 +0800
Subject: [PATCH 1/2] =?UTF-8?q?dropout:ompsimd+cuda=E7=9A=84=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CODE_OF_CONDUCT.md                            |  45 +++
 CONTRIBUTING.md                               |  28 ++
 README.md                                     |  23 --
 doc/excuter/op-mem-cuda/list.md               |   1 +
 .../src/deepx/tensorfunc/elementwise.hpp      |  13 +
 excuter/op-mem-cuda/src/client/tfs.cpp        |  11 +
 .../tensorfunc/elementwise_miaobyte_basic.cu  |  45 +++
 .../tensorfunc/elementwise_miaobyte_basic.cuh |   6 +
 .../tensorfunc/elementwise_miaobyte_basic.hpp |   9 +
 .../src/deepx/tf/elementwise_basic.hpp        |  68 +++++
 .../deepx/tensorfunc/elementwise_miaobyte.hpp | 256 ++++++++++--------
 .../src/deepx/tf/elementwise.hpp              |  61 ++++-
 12 files changed, 428 insertions(+), 138 deletions(-)
 create mode 100644 CODE_OF_CONDUCT.md
 create mode 100644 CONTRIBUTING.md

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..96620939
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,45 @@
+# DeepX 行为准则
+
+## 我们的承诺
+
+作为贡献者和维护者，我们承诺为每个人提供一个开放和欢迎的环境。
+
+## 我们的标准
+
+有助于创造积极环境的行为包括但不限于：
+
+- 使用友好和包容的语言
+- 尊重不同的观点和经验
+- 耐心地接受建设性的批评
+- 关注对社区最有利的事情
+- 友善对待其他社区成员
+
+不可接受的行为包括但不限于：
+
+- 使用性化的语言或图像以及不受欢迎的性关注或挑逗
+- 捣乱/煽动/侮辱性/贬损的评论，人身攻击或政治攻击
+- 公开或私下的骚扰
+- 未经明确许可，发布他人的私人信息，如物理或电子地址
+- 其他可以合理地被认为不符合专业行为的行为
+
+## 我们的责任
+
+项目维护者有责任澄清可接受行为的标准，并应对任何不可接受的行为采取适当和公平的纠正措施。
+
+项目维护者有权利和责任删除、编辑或拒绝与本行为准则不符的评论、提交、代码、wiki编辑、问题和其他贡献，并可暂时或永久禁止任何他们认为不适合、威胁、冒犯或有害的贡献者。
+
+## 适用范围
+
+当个人代表项目或其社区时，本行为准则适用于项目空间和公共空间。
+
+## 执行
+
+如有滥用、骚扰或其他不可接受的行为，请通过以下方式联系项目团队。所有投诉都将被审查和调查，并将导致认为必要和适当的回应。
+
+## 联系信息
+
+请通过项目仓库（github.com/array2d/deepx）的 Issue 联系项目维护者。
+
+## 归属
+
+本行为准则改编自[贡献者公约](https://www.contributor-covenant.org)，版本1.4。
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..15b9f801
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,28 @@
+# deepx 贡献指南
+
+deepx框架的发展，主要包括以下六大类方向：
+
++ front: 新增模型、module、python类函数等
++ 中间层：包括计算图优化器，插件系统(自动KVcache系统)，自动分布式化，栈tensor自动释放，自动Inplace化等操作
++ 新增或修改excuter
++ 增加或修改算子，进一步可以分为leaftensorfunc(不可分割的基础算子)，fusedtensorfunc（融合算子）
++ 文档丰富：补充和完善`doc/`目录下的文档
++ 运维自动化方向
+
+大家可以选择一个方向
+
+## 步骤
+
+第一次提交
+  1. Fork本仓库（github.com/array2d/deepx）的main分支，到你的github/yourname/deepx
+  2. 本地clone github/yourname/deepx
+  3. 提交并推送你的更改到你的github：先`git commit -m 'Add some feature'`，再`git push`
+  4. 创建一个Pull Request。
+
+第N次提交 
+
+  1. 确保你本地和github/yourname/deepx中的改动均已提交pull request并已被merge
+  2. 在github/yourname/deepx中sync fork【危险操作，会删除你新增的代码】，拉取（github.com/array2d/deepx） main分支的最新代码
+  3. 在本地拉取（`git pull`）github/yourname/deepx的最新代码
+  4. 提交并推送你的更改到你的github：先`git commit -m 'Add some feature'`，再`git push`
+  5. 创建一个Pull Request。
\ No newline at end of file
diff --git a/README.md b/README.md
index 4f53405c..a590faa9 100644
--- a/README.md
+++ b/README.md
@@ -66,29 +66,6 @@ DeepX可以集成现有的张量计算框架作为执行器，充分利用现有
 
 这种架构使得DeepX可以整合各类先进的计算框架作为执行引擎，同时提供统一的分布式调度和执行能力，为用户提供更灵活的选择和更高的性能。
 
-## 二.贡献指南
-
-也可以参考官方文档的指南
-
-https://deepx.array2d.com 
-
-欢迎通过以下方式参与项目共建：
-
-1. **代码贡献**
-   - 提交PR前请先创建Issue说明修改内容
-   - front项目当前以py为核心
-   - excuter:目前规划开发的3类执行器,参考这里如何给excuter添加一个新算子[excuter](doc/excuter/excuter.md)
-      - cpu:
-      - cuda:
-      - jax:
- 
-2. **文档改进**
-   - 提交文档更新到`doc/`目录
-
-3. **问题反馈**
-   - 当前处于高速迭代中，可通过issue反馈问题
- 
-
  
  ### 官方文档
  
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
index 0fa979ae..b177d4e9 100644
--- a/doc/excuter/op-mem-cuda/list.md
+++ b/doc/excuter/op-mem-cuda/list.md
@@ -48,6 +48,7 @@
 
 | Operation | Author |  Math Formula | IR Instruction |
 |-----------|--------|--------------|----------------|
+| dropout | miaobyte | T1.dropout(p,seed)->T3 | dropout(tensor<any> A, var<float32> p, var<int32> seed)->(tensor<any> C) |
 | switch | miaobyte | C=switch(tensors,cases) | switch(listtensor<any> tensors, tensor<int8> cases)->(tensor<any> result) |
 | greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
 | equalscalar | miaobyte | mask=compare(T1, scalar) | equalscalar(tensor<any> A, var<any> scalar, var<float64> epsilon)->(tensor<bool> mask) |
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
index 6e3b2072..52fd04bc 100644
--- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
+++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
@@ -412,6 +412,19 @@ namespace deepx::tensorfunc
         invertDispatcher<Author, T>::invert(input, output);
     }
 
+    // dropout(A, p, seed) => C. Primary template: its dropout() is deleted, so it is only callable through an Author/T specialization.
+    template <typename Author, typename T>
+    struct dropoutDispatcher
+    {
+        static void dropout(const Tensor<T> &input, const float p,const unsigned int seed, Tensor<T> &output) = delete;
+    };
+    // Entry point: forwards input, p, seed and output unchanged to dropoutDispatcher<Author, T>::dropout.
+    template <typename Author, typename T>
+    void dropout(const Tensor<T> &input, const float p,const unsigned int seed, Tensor<T> &output)
+    {
+        dropoutDispatcher<Author, T>::dropout(input, p, seed, output);
+    }
+    
 } // namespace deepx::tensorfunc
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_HPP
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
index 4aeaa5ad..5e304d29 100644
--- a/excuter/op-mem-cuda/src/client/tfs.cpp
+++ b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -462,6 +462,17 @@ namespace deepx::tf
                                                                 {
                                                                     Param("result", DataCategory::Tensor, Precision::Any),
                                                                 })));
+        // dropout
+        tffactory.add_tf(std::make_shared<Dropout<miaobyte>>(vector<Param>(
+                                                                 {
+                                                                     Param("A", DataCategory::Tensor, Precision::Any),
+                                                                     Param("p", DataCategory::Var, Precision::Float32),
+                                                                     Param("seed", DataCategory::Var, Precision::Int32),
+                                                                 }),
+                                                                 vector<Param>(
+                                                                     {
+                                                                         Param("C", DataCategory::Tensor, Precision::Any),
+                                                                     })));
     }
     // matmul
     void register_matmul(TfFactory &tffactory)
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
index 772865f3..8550e6b0 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -3,6 +3,9 @@
 
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+#include <curand_kernel.h>
+
+
 #include "deepx/tensorfunc/cuda.hpp"
 #include "deepx/tensorfunc/authors.hpp"
 #include "deepx/tensorfunc/cuda_math.cuh"
@@ -404,6 +407,48 @@ namespace deepx::tensorfunc
     template void launch_invert<int16_t>(const int16_t *a, int16_t *c, const int size);
     template void launch_invert<int8_t>(const int8_t *a, int8_t *c, const int size);
 
+    // dropout: C[i] = 0 with probability p, otherwise C[i] = A[i] (kept values are copied as-is, not rescaled).
+    template <typename T>
+    __global__ void dropout_kernel(const T *A, const float p,const unsigned int seed, T *C, const int size)
+    {
+        const int tid = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
+        const int stride = blockDim.x * gridDim.x;             // total number of launched threads
+        curandState state;
+        curand_init(seed, tid, 0, &state); // once per thread; global id as subsequence (threadIdx.x alone repeats the same stream in every block)
+        for (int idx = tid; idx < size; idx += stride)
+        {
+            // curand_uniform returns values in (0, 1]: `<=` drops with probability exactly p (p == 0 keeps all, p == 1 drops all).
+            if (curand_uniform(&state) <= p)
+            {
+                C[idx] = 0;
+            }
+            else
+            {
+                C[idx] = A[idx];
+            }
+        }
+    }
+    // Host-side launcher: takes launch dimensions from BestDims(size), starts dropout_kernel and throws std::runtime_error if cudaGetLastError() reports a failure.
+    template <typename T>
+    void launch_dropout(const T *a, const float p,const unsigned int seed, T *c, const int size)
+    {
+        auto [numBlocks, blockSize] = BestDims(size);
+        dropout_kernel<<<numBlocks, blockSize>>>(a, p, seed, c, size);
+        cudaError_t err = cudaGetLastError(); // queried right after the launch; this function does not synchronize
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch dropout kernel: " +
+                                     std::string(cudaGetErrorString(err)));
+        }
+    }
+    template void launch_dropout<double>(const double *a, const float p,const unsigned int seed, double *c, const int size); // explicit instantiations, one per element type listed below
+    template void launch_dropout<float>(const float *a, const float p,const unsigned int seed, float *c, const int size);
+    template void launch_dropout<half>(const half *a, const float p,const unsigned int seed, half *c, const int size);
+    template void launch_dropout<nv_bfloat16>(const nv_bfloat16 *a, const float p,const unsigned int seed, nv_bfloat16 *c, const int size);
+    template void launch_dropout<int64_t>(const int64_t *a, const float p,const unsigned int seed, int64_t *c, const int size);
+    template void launch_dropout<int32_t>(const int32_t *a, const float p,const unsigned int seed, int32_t *c, const int size);
+    template void launch_dropout<int16_t>(const int16_t *a, const float p,const unsigned int seed, int16_t *c, const int size);
+    template void launch_dropout<int8_t>(const int8_t *a, const float p,const unsigned int seed, int8_t *c, const int size);
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_BASIC_CU
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
index 4100f38d..c85cce81 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
@@ -89,6 +89,12 @@ namespace deepx::tensorfunc
     template <typename T>
     void launch_invert(const T* a, T* c,const int size);
 
+    // dropout kernel declaration: A = input, p = drop probability, seed = RNG seed, C = output, size = element count
+    template <typename T>
+    __global__ void dropout_kernel(const T* A, const float p,const unsigned int seed, T* C,const int size);
+    // dropout host launcher declaration (same argument meaning as dropout_kernel)
+    template <typename T>
+    void launch_dropout(const T* a, const float p,const unsigned int seed, T* c,const int size);
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
index b7d3a680..65f7a82d 100644
--- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
@@ -140,6 +140,15 @@ namespace deepx::tensorfunc
             launch_invert( A.data, C.data, A.shape.size);
         }
     };
+    // dropout (miaobyte): hands A.data, C.data and A.shape.size to launch_dropout. NOTE(review): C's shape is not compared with A's here - confirm callers guarantee they match.
+    template <typename T>
+    struct dropoutDispatcher<miaobyte, T>
+    {
+        static void dropout(const Tensor<T> &A, const float p,const unsigned int seed, Tensor<T> &C)
+        {
+            launch_dropout(A.data, p, seed, C.data, A.shape.size);
+        }           
+    };
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
index 709a899c..885bb5e6 100644
--- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
+++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -1033,6 +1033,74 @@ namespace deepx::tf
             return 0;
         }
     };
+    // dropout TF: run() validates args[0] (input) and returns[0] (output) with checktensors, requires both tensors to share a dtype,
+    // then dispatches on that dtype to tensorfunc::dropout<Author>, reading p from arg 1 as float and seed from arg 2 as unsigned int.
+    template <typename Author>
+    class Dropout : public TF
+    {
+    public:
+        Dropout(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "dropout";
+            this->metadata.author = Author::name();
+            this->tftype = "elementwise";
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override
+        {
+            return "T1.dropout(p,seed)->T3";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Dropout<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error))
+            {
+                return 1; // failure: checktensors rejected the input or output tensor
+            }
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1; // failure: input and output dtypes differ
+            }
+            switch (a_type) // one case per element type; each case only differs in the tensor element type
+            {
+            case Precision::Float64:
+                tensorfunc::dropout<Author>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::dropout<Author>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                tensorfunc::dropout<Author>(*mem->gettensor<half>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<half>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                tensorfunc::dropout<Author>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;  
+            case Precision::Int64:
+                tensorfunc::dropout<Author>(*mem->gettensor<int64_t>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::dropout<Author>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::dropout<Author>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::dropout<Author>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<float>(1, mem), this->getvar<unsigned int>(2, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1; // failure: dtype has no case above
+            }
+            return 0; // success
+        }
+    };
 };
 
 #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
index f8f0302f..b36902a4 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
@@ -1,6 +1,7 @@
 #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_HPP
 #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_HPP
 
+#include <random>
 #include <cblas.h>
 #include <cmath>
 #include <hwy/highway.h>
@@ -18,8 +19,8 @@ namespace deepx::tensorfunc
     {
         if (A.shape == B.shape && A.shape == C.shape)
         {
-            C.shape.rangeElementwiseParallel([&A, &B, &C, &scalar_op, &simd_op](int i,int i_end)
-                                  {
+            C.shape.rangeElementwiseParallel([&A, &B, &C, &scalar_op, &simd_op](int i, int i_end)
+                                             {
                                       
                                       const ScalableTag<T> tag;
                                       const size_t lanes = Lanes(tag);
@@ -62,8 +63,8 @@ namespace deepx::tensorfunc
     {
         if (A.shape == C.shape)
         {
-            C.shape.rangeElementwiseParallel([&A, &b, &C, &scalar_op, &simd_op](int i,int i_end)
-                                  {
+            C.shape.rangeElementwiseParallel([&A, &b, &C, &scalar_op, &simd_op](int i, int i_end)
+                                             {
                                       const ScalableTag<T> tag;
                                       const size_t lanes = Lanes(tag);
                                       size_t j = 0;
@@ -98,20 +99,18 @@ namespace deepx::tensorfunc
         }
     }
 
-    //todtype
-    template <typename T,typename Dtype>
+    // todtype
+    template <typename T, typename Dtype>
     static void todtype(const Tensor<T> &A, Tensor<Dtype> &C)
     {
-        C.shape.rangeElementwiseParallel([&A, &C](int i,int i_end)
-        {
+        C.shape.rangeElementwiseParallel([&A, &C](int i, int i_end)
+                                         {
             for (int j = 0; j < i_end; j++)
             {
                 C.data[i + j] = static_cast<Dtype>(A.data[i + j]);
-            }
-        });
+            } });
     }
 
-
     // add
     template <typename T>
     struct addDispatcher<miaobyte, T>
@@ -302,23 +301,22 @@ namespace deepx::tensorfunc
     struct invertDispatcher<miaobyte, T>
     {
         static void invert(const Tensor<T> &A, Tensor<T> &C)
-        {   
+        {
             if (A.shape == C.shape)
             {
-                A.shape.rangeElementwiseParallel([&A, &C](int idx,int idx_end)
-                                      {
+                A.shape.rangeElementwiseParallel([&A, &C](int idx, int idx_end)
+                                                 {
                                            for (int j=0;j<idx_end;j++)
                                            {
                                                 C.data[idx+j]=~A.data[idx+j];
-                                           } 
-                                      });
+                                           } });
             }
             else
             {
                 throw std::invalid_argument("shape mismatch");
             }
         }
-    };  
+    };
 
     template <typename T>
     struct sqrtDispatcher<miaobyte, T, std::enable_if_t<std::is_floating_point_v<T>>>
@@ -327,8 +325,8 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -367,8 +365,8 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      {
                                                size_t j = 0;
 
                                                while (j < i_end)
@@ -392,8 +390,8 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && A.shape == C.shape)
             {
-                C.shape.rangeElementwiseParallel([&A, &B, &C](int i,int i_end)
-                                      {
+                C.shape.rangeElementwiseParallel([&A, &B, &C](int i, int i_end)
+                                                 {
                                          for (int j = 0; j < i_end; j++)
                                          C.data[i+j] = std::pow(A.data[i+j], B.data[i+j]); });
             }
@@ -413,11 +411,10 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output, &value](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output, &value](int i, int i_end)
+                                                      {
                                              for (int j = 0; j < i_end; j++)
-                                                output.data[i+j] = std::pow(input.data[i+j], value);
-                                            });
+                                                output.data[i+j] = std::pow(input.data[i+j], value); });
             }
             else
             {
@@ -434,18 +431,17 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output, &value](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output, &value](int i, int i_end)
+                                                      {
                                                 for (int j = 0; j < i_end; j++)
-                                                output.data[i+j] = std::pow(value, input.data[i+j]);
-                                           });
+                                                output.data[i+j] = std::pow(value, input.data[i+j]); });
             }
             else
             {
                 throw std::invalid_argument("shape mismatch");
             }
         }
-    };  
+    };
 
     template <typename T>
     struct logDispatcher<miaobyte, T>
@@ -455,10 +451,9 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           { for (int j = 0; j < i_end; j++)
-                                                output.data[i+j] = std::log(input.data[i+j]);
-                                           });
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      { for (int j = 0; j < i_end; j++)
+                                                output.data[i+j] = std::log(input.data[i+j]); });
             }
             else
             {
@@ -475,10 +470,9 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           { for (int j = 0; j < i_end; j++)
-                                                output.data[i+j] = std::exp(input.data[i+j]);
-                                           });
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      { for (int j = 0; j < i_end; j++)
+                                                output.data[i+j] = std::exp(input.data[i+j]); });
             }
             else
             {
@@ -495,8 +489,8 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -537,8 +531,8 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -579,8 +573,8 @@ namespace deepx::tensorfunc
         {
             if (input.shape == output.shape)
             {
-                output.shape.rangeElementwiseParallel([&input, &output](int i,int i_end)
-                                           {
+                output.shape.rangeElementwiseParallel([&input, &output](int i, int i_end)
+                                                      {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -620,8 +614,8 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && A.shape == C.shape)
             {
-                C.shape.rangeElementwiseParallel([&A, &B, &C](int i,int i_end)
-                                      {
+                C.shape.rangeElementwiseParallel([&A, &B, &C](int i, int i_end)
+                                                 {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -662,8 +656,8 @@ namespace deepx::tensorfunc
         {
             if (A.shape == C.shape)
             {
-                C.shape.rangeElementwiseParallel([&A, b, &C](int i,int i_end)
-                                      {
+                C.shape.rangeElementwiseParallel([&A, b, &C](int i, int i_end)
+                                                 {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -704,8 +698,8 @@ namespace deepx::tensorfunc
         {
             if (A.shape == B.shape && A.shape == C.shape)
             {
-                C.shape.rangeElementwiseParallel([&A, &B, &C](int i,int i_end)
-                                      {
+                C.shape.rangeElementwiseParallel([&A, &B, &C](int i, int i_end)
+                                                 {
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -746,8 +740,8 @@ namespace deepx::tensorfunc
         {
             if (A.shape == C.shape)
             {
-                C.shape.rangeElementwiseParallel([&A, b, &C](int i,int i_end)
-                                      {   
+                C.shape.rangeElementwiseParallel([&A, b, &C](int i, int i_end)
+                                                 {   
                 const ScalableTag<T> tag;
                 const size_t lanes = Lanes(tag);
                 size_t j=0;
@@ -780,16 +774,16 @@ namespace deepx::tensorfunc
         }
     };
 
-    //equal
-    template <typename T,typename MaskT>
-    struct equalDispatcher<miaobyte, T,MaskT>
+    // equal
+    template <typename T, typename MaskT>
+    struct equalDispatcher<miaobyte, T, MaskT>
     {
-        static void equal(const Tensor<T> &A, const Tensor<T> &B,const float epsilon, Tensor<MaskT> &mask)
+        static void equal(const Tensor<T> &A, const Tensor<T> &B, const float epsilon, Tensor<MaskT> &mask)
         {
             if (A.shape == B.shape && mask.shape == A.shape)
-            {   
-                A.shape.rangeElementwiseParallel([&A, &B, &mask,epsilon](int i,int i_end)
-                                      {
+            {
+                A.shape.rangeElementwiseParallel([&A, &B, &mask, epsilon](int i, int i_end)
+                                                 {
                                             for (int j = 0; j < i_end; j++)
                                             {
                                                 if (epsilon == 0)
@@ -799,8 +793,7 @@ namespace deepx::tensorfunc
                                                 else{
                                                     mask.data[i+j]=std::abs(A.data[i+j]-B.data[i+j])<=epsilon;
                                                 }
-                                            }
-                                            });
+                                            } });
             }
             else
             {
@@ -809,16 +802,16 @@ namespace deepx::tensorfunc
         }
     };
 
-    //equalscalar
-    template <typename T,typename MaskT>
-    struct equalscalarDispatcher<miaobyte, T,MaskT>
+    // equalscalar
+    template <typename T, typename MaskT>
+    struct equalscalarDispatcher<miaobyte, T, MaskT>
     {
-        static void equalscalar(const Tensor<T> &A, const T scalar,const float epsilon, Tensor<MaskT> &mask)
+        static void equalscalar(const Tensor<T> &A, const T scalar, const float epsilon, Tensor<MaskT> &mask)
         {
             if (A.shape == mask.shape)
             {
-                A.shape.rangeElementwiseParallel([&A, &mask, &scalar,epsilon](int i,int i_end)
-                                      {
+                A.shape.rangeElementwiseParallel([&A, &mask, &scalar, epsilon](int i, int i_end)
+                                                 {
                 for (int j = 0; j < i_end; j++)
                 {
                     if (epsilon == 0)
@@ -828,8 +821,7 @@ namespace deepx::tensorfunc
                     else{
                         mask.data[i+j]=std::abs(A.data[i+j]-scalar)<=epsilon;
                     }
-                }
-                });
+                } });
             }
             else
             {
@@ -838,67 +830,64 @@ namespace deepx::tensorfunc
         };
     };
 
-    //less
-    template <typename T,typename MaskT>
-    struct lessDispatcher<miaobyte, T,MaskT>
+    // less
+    template <typename T, typename MaskT>
+    struct lessDispatcher<miaobyte, T, MaskT>
     {
         static void less(const Tensor<T> &A, const Tensor<T> &B, Tensor<MaskT> &mask)
         {
             if (A.shape == B.shape && mask.shape == A.shape)
             {
-                A.shape.rangeElementwiseParallel([&A, &B, &mask](int i,int i_end)
-                                      {
+                A.shape.rangeElementwiseParallel([&A, &B, &mask](int i, int i_end)
+                                                 {
                 for (int j = 0; j < i_end; j++)
                 {
                     mask.data[i+j]=A.data[i+j]<B.data[i+j];
-                }   
-                });
+                } });
             }
             else
             {
                 throw std::invalid_argument("shape mismatch");
             }
-        }   
+        }
     };
 
-    //lessscalar
-    template <typename T,typename MaskT>
-    struct lessscalarDispatcher<miaobyte, T,MaskT>
+    // lessscalar
+    template <typename T, typename MaskT>
+    struct lessscalarDispatcher<miaobyte, T, MaskT>
     {
         static void lessscalar(const Tensor<T> &A, const T scalar, Tensor<MaskT> &mask)
         {
             if (A.shape == mask.shape)
             {
-                A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i,int i_end)
-                                      {
+                A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i, int i_end)
+                                                 {
                 for (int j = 0; j < i_end; j++)
                 {
                     mask.data[i+j]=A.data[i+j]<scalar;
-                }
-                });
+                } });
             }
             else
             {
                 throw std::invalid_argument("shape mismatch");
             }
-        }   
+        }
     };
-    
-    //greater
-    template <typename T,typename MaskT>
-    struct greaterDispatcher<miaobyte, T,MaskT>
+
+    // greater
+    template <typename T, typename MaskT>
+    struct greaterDispatcher<miaobyte, T, MaskT>
     {
         static void greater(const Tensor<T> &A, const Tensor<T> &B, Tensor<MaskT> &mask)
         {
             if (A.shape == B.shape && mask.shape == A.shape)
             {
-                A.shape.rangeElementwiseParallel([&A, &B, &mask](int i,int i_end)
-                                      {
+                A.shape.rangeElementwiseParallel([&A, &B, &mask](int i, int i_end)
+                                                 {
                 for (int j = 0; j < i_end; j++)
                 {
                     mask.data[i+j]=A.data[i+j]>B.data[i+j];
-                }
-                });
+                } });
             }
             else
             {
@@ -907,52 +896,91 @@ namespace deepx::tensorfunc
         }
     };
 
-    //greaterscalar
-    template <typename T,typename MaskT>
-    struct greaterscalarDispatcher<miaobyte, T,MaskT>
+    // greaterscalar: mask[k] = (A[k] > scalar) for each index k the range callback visits; throws std::invalid_argument when A and mask differ in shape.
+    template <typename T, typename MaskT>
+    struct greaterscalarDispatcher<miaobyte, T, MaskT>
     {
         static void greaterscalar(const Tensor<T> &A, const T scalar, Tensor<MaskT> &mask)
         {
             if (A.shape == mask.shape)
             {
-                A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i,int i_end)
-                                      {
+                A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i, int i_end)
+                                                 {
                 for (int j = 0; j < i_end; j++)
                 {
                     mask.data[i+j]=A.data[i+j]>scalar;
-                }
-                });
+                } });
             }
             else
             {
                 throw std::invalid_argument("shape mismatch");
             }
-        }   
-    };      
+        }
+    };
 
-    //switch
-    template <typename T,typename casesT>
-    struct switchDispatcher<miaobyte, T,casesT>
+    // switch: C[k] = tensors[cases[k]]->data[k]; throws std::invalid_argument when cases and C differ in shape.
+    template <typename T, typename casesT>
+    struct switchDispatcher<miaobyte, T, casesT>
     {
-        static void Switch(const vector<Tensor<T>*> tensors,const Tensor<casesT> &cases, Tensor<T> &C)
+        static void Switch(const vector<Tensor<T> *> tensors, const Tensor<casesT> &cases, Tensor<T> &C)
         {
             if (cases.shape == C.shape)
             {
-                C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i,int i_end)
-                                      {
+                C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i, int i_end)
+                                                 {
                 for (int j = 0; j < i_end; j++)
                 {   
-                    int which_tensor=cases.data[i];
-                    C.data[i+j]=tensors[which_tensor]->data[i];
+                    int which_tensor=cases.data[i+j]; // index with i+j so each element reads its own case, not the range's first one
+                    C.data[i+j]=tensors[which_tensor]->data[i+j]; // source offset must match the destination offset
+                } });
+            }
+            else
+            {
+                throw std::invalid_argument("shape mismatch");
+            }
+        }
+    };
+
+    // dropout: C[k] = 0 with probability p, otherwise C[k] = A[k] (kept values are not rescaled by 1/(1-p)).
+    // seed == 0 draws the seed from std::random_device; throws std::invalid_argument when A and C differ in shape.
+    template <typename T>
+    struct dropoutDispatcher<miaobyte, T>
+    {
+        static void dropout(const Tensor<T> &A, const float p, const unsigned int seed, Tensor<T> &C)
+        {
+            if (A.shape == C.shape)
+            {
+                unsigned int base_seed = seed;
+                if (base_seed == 0)
+                {
+                    std::random_device rd;
+                    base_seed = rd();
                 }
-                });
+
+                A.shape.rangeElementwiseParallel([&A, &C, &p, &base_seed](int i, int i_end)
+                                                 {
+                                        // One engine per range, seeded from (base_seed, i): a single engine shared by all ranges would be
+                                        // mutated without synchronization if ranges run concurrently (presumably they do - TODO confirm).
+                                        std::seed_seq range_seed{base_seed, static_cast<unsigned int>(i)};
+                                        std::default_random_engine generator(range_seed);
+                                        std::uniform_real_distribution<double> distribution(0, 1);
+                                        for (int j = 0; j < i_end; j++)
+                                        {
+                                            if (distribution(generator) < p)
+                                            {
+                                                C.data[i+j]=0;
+                                            }
+                                            else
+                                            {
+                                                C.data[i+j]=A.data[i+j];
+                                            }
+                                        } });
             }
             else
             {
                 throw std::invalid_argument("shape mismatch");
-            }   
+            }
         }
-    };      
-    
+    };
 };
 #endif // DEEPX_OP_CPU_ELEMENTWISE_HPP
\ No newline at end of file
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
index 58768784..7c29ddab 100644
--- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
+++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
@@ -1900,5 +1900,64 @@ namespace deepx::tf
             return 0;
         }
     };
+
+    // dropout: TF wrapper around tensorfunc::dropout. args[0] names the input tensor, var 1 supplies p,
+    // var 2 supplies the seed, and returns[0] names the output tensor.
+    template <typename Author>
+    class Dropout : public TF
+    {
+    public:
+        Dropout(vector<Param> args, vector<Param> returns)
+        {
+            this->args = args;
+            this->returns = returns;
+            this->name = "dropout";
+            this->tftype = "elementwise";
+            this->metadata.author = Author::name();
+        }
+        string math_formula() const override
+        {
+            return "A.dropout(p,seed)->C";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<Dropout<Author>>(*this);
+        }
+        // Returns 0 on success; returns 1 and fills `error` when the two dtypes differ or the dtype is unsupported.
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            const Precision in_dtype = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            const Precision out_dtype = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (in_dtype != out_dtype)
+            {
+                error = "Type mismatch: " + precision_str(in_dtype) + " != " + precision_str(out_dtype);
+                return 1;
+            }
+            switch (in_dtype)
+            {
+            case Precision::Float64: run_typed<double>(mem); break;
+            case Precision::Float32: run_typed<float>(mem); break;
+            case Precision::Int64: run_typed<int64_t>(mem); break;
+            case Precision::Int32: run_typed<int32_t>(mem); break;
+            case Precision::Int16: run_typed<int16_t>(mem); break;
+            case Precision::Int8: run_typed<int8_t>(mem); break;
+            default:
+                error = "Unsupported dtype: " + precision_str(in_dtype);
+                return 1;
+            }
+            return 0;
+        }
+
+    private:
+        // Looks up both tensors as Tensor<T> plus the p / seed vars, then forwards them to tensorfunc::dropout.
+        template <typename T>
+        void run_typed(shared_ptr<MemBase> &mem)
+        {
+            const float p = this->getvar<float>(1, mem, true);
+            const unsigned int seed = this->getvar<unsigned int>(2, mem, true);
+            tensorfunc::dropout<Author, T>(*mem->gettensor<T>(this->args[0].textvalue), p, seed, *mem->gettensor<T>(this->returns[0].textvalue));
+        }
+    };
+
 };
-#endif
+#endif // DEEPX_TF_ELEMENTWISE_HPP

From b46fe51cef93c01f466b40e772fa5fcdf578bbcc Mon Sep 17 00:00:00 2001
From: lipeng <734991033@qq.com>
Date: Mon, 28 Apr 2025 21:36:30 +0800
Subject: [PATCH 2/2] =?UTF-8?q?dropout:ompsimd+cuda=E7=9A=84=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/excuter/op-mem-ompsimd/list.md        |  1 +
 excuter/op-mem-ompsimd/src/client/tfs.cpp | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
index 7d676562..58772944 100644
--- a/doc/excuter/op-mem-ompsimd/list.md
+++ b/doc/excuter/op-mem-ompsimd/list.md
@@ -49,6 +49,7 @@
 
 | Operation | Author |  Math Formula | IR Instruction |
 |-----------|--------|--------------|----------------|
+| dropout | miaobyte | A.dropout(p,seed)->C | dropout(tensor<any> A, var<float32> p, var<int32> seed)->(tensor<any> C) |
 | switch | miaobyte | C=switch([tensors],case) | switch(listtensor<any> tensors, tensor<int8> cases)->(tensor<any> C) |
 | greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
 | equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor<any> A, var<any> scalar)->(tensor<bool> mask) |
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
index d2893cfc..32553db4 100644
--- a/excuter/op-mem-ompsimd/src/client/tfs.cpp
+++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -465,6 +465,17 @@ namespace deepx::tf
                                                                 {
                                                                     Param("C", DataCategory::Tensor, Precision::Any),
                                                                 })));
+        // dropout author=miaobyte
+        tffactory.add_tf(std::make_shared<Dropout<miaobyte>>(vector<Param>(
+                                                                 {
+                                                                     Param("A", DataCategory::Tensor, Precision::Any),
+                                                                     Param("p", DataCategory::Var, Precision::Float32),
+                                                                     Param("seed", DataCategory::Var, Precision::Int32),
+                                                                 }),
+                                                                 vector<Param>(
+                                                                     {
+                                                                         Param("C", DataCategory::Tensor, Precision::Any),
+                                                                     }) ));
     }
     // matmul
     void register_matmul(TfFactory &tffactory)