
Commit 53df05e

excuter(cpu/cuda): add
1 parent e4880e3 commit 53df05e

16 files changed

Lines changed: 575 additions & 134 deletions


Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
## Level-1 BLAS operations

+ Maximum of vector x
返回向量中绝对值最大的元素的索引 → Returns the index of the element with the largest absolute value.
```
cublasI[s|d|c|z]amax(cublasHandle_t handle, int n, const T *x, int incx, int *result)
```

+ Minimum of vector x
Returns the index of the element with the smallest absolute value.
```
cublasI[s|d|c|z]amin(cublasHandle_t handle, int n, const T *x, int incx, int *result)
```

+ Sum of absolute values of vector x
```
cublas[S|D|Sc|Dz]asum(cublasHandle_t handle, int n, const T *x, int incx, T *result)
```

+ Scaled vector addition (axpy), see the sketch after this list

y = alpha * x + y

```
cublas[S|D|C|Z]axpy(cublasHandle_t handle, int n,
                    const T *alpha,
                    const T *x, int incx,
                    T *y, int incy)
```

+ Vector copy

y = x

```
cublas[S|D|C|Z]copy(cublasHandle_t handle, int n, const T *x, int incx, T *y, int incy)
```

+ Dot product

result = sum_i x[i] * y[i]  (for complex types the variants are dotu/dotc)

```
cublas[S|D|C|Z]dot(cublasHandle_t handle, int n,
                   const T *x, int incx,
                   const T *y, int incy,
                   T *result)
```

+ Euclidean norm (nrm2)

Computes the Euclidean norm of vector x.

```
cublas[S|D|Sc|Dz]nrm2(cublasHandle_t handle, int n, const T *x, int incx, T *result)
```

+ Givens rotation (rot)
Rotates points in the (x, y) plane counter-clockwise by the angle defined by cos(alpha)=c, sin(alpha)=s.

```
cublas[S|D|C|Z]rot(cublasHandle_t handle, int n,
                   T *x, int incx,
                   T *y, int incy,
                   const T *c, const T *s)
```

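For orientation, a minimal host-side sketch of calling one of the Level-1 routines above (cublasSaxpy). The vector length and values are illustrative only, and status checking is trimmed to the essentials:

```cpp
// Minimal sketch: y = alpha * x + y with cublasSaxpy (illustrative values).
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int n = 4;
    const float alpha = 2.0f;
    std::vector<float> hx = {1, 2, 3, 4}, hy = {10, 20, 30, 40};

    float *dx = nullptr, *dy = nullptr;
    cudaMalloc(&dx, n * sizeof(float));
    cudaMalloc(&dy, n * sizeof(float));
    cudaMemcpy(dx, hx.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSaxpy(handle, n, &alpha, dx, 1, dy, 1);   // y = 2*x + y
    cublasDestroy(handle);

    cudaMemcpy(hy.data(), dy, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (float v : hy) std::printf("%g ", v);       // expected: 12 24 36 48
    cudaFree(dx);
    cudaFree(dy);
    return 0;
}
```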
## Level-3 BLAS operations

Matrices are stored in column-major order.

+ gemm (general matrix multiplication)
Computes C = α * op(A) * op(B) + β * C
where op(X) is either X or X^T (a minimal call sketch follows this list).

```
cublas[S|D|C|Z]gemm(cublasHandle_t handle,
                    cublasOperation_t transa, cublasOperation_t transb,
                    int m, int n, int k,
                    const T *alpha, const T *A, int lda, const T *B, int ldb, const T *beta, T *C, int ldc)
```

+ gemmBatched (batched matrix multiplication)
Performs many independent matrix multiplications in one call.
Math: C[i] = α * op(A[i]) * op(B[i]) + β * C[i], i ∈ [0, batchCount)

+ gemmStridedBatched (strided batched matrix multiplication)
Batched multiplication over matrices laid out contiguously in memory; a stride gives the offset between consecutive matrices.
Math: same as gemmBatched, but the matrices sit at a fixed stride in memory.

+ gemmGroupedBatched (grouped batched matrix multiplication)
Batched multiplication processed in groups; each group may use different dimension parameters.
Math: same as gemmBatched, but m, n, k can differ per group.

+ geam (matrix addition and transposition)
Computes C = α * op(A) + β * op(B)
where op(X) is either X or X^T.
Can be used to implement matrix transpose, addition, and scaling.

+ dgmm (diagonal matrix multiplication)
Multiplies a matrix by a diagonal matrix:
- left mode:  C = diag(x) * A
- right mode: C = A * diag(x)

+ gemmEx (mixed-precision matrix multiplication)
Supports mixed-precision computation, e.g.:
- FP16 inputs with FP32 accumulation and output
- INT8 inputs, INT32 accumulation, FP32 output
Math: same as gemm, but with differing data types.

+ GemmBatchedEx (mixed-precision batched matrix multiplication)
Batched version of gemmEx with mixed-precision support.

+ cublasGemmStridedBatchedEx (mixed-precision strided batched matrix multiplication)
Strided batched version of gemmEx with mixed-precision support.

+ cublasGemmGroupedBatchedEx (mixed-precision grouped batched matrix multiplication)
Grouped batched version of gemmEx with mixed-precision support.

+ Csyrk3mEx (symmetric rank-k update)
Computes a symmetric rank-k update, using the 3M algorithm to reduce complex arithmetic.
Math: C = α * op(A) * op(A)^T + β * C
where C is a symmetric matrix.
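A minimal column-major sketch of the plain gemm entry above (referenced in that item); matrix sizes and contents are illustrative, and status checks are omitted for brevity:

```cpp
// Minimal sketch: C = alpha*A*B + beta*C with cublasSgemm, column-major layout.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int m = 2, k = 3, n = 2;             // A: m x k, B: k x n, C: m x n
    const float alpha = 1.0f, beta = 0.0f;
    // Column-major data: A = [[1,2,3],[4,5,6]], B = [[1,0],[0,1],[1,1]]
    std::vector<float> A = {1, 4, 2, 5, 3, 6};
    std::vector<float> B = {1, 0, 1, 0, 1, 1};
    std::vector<float> C(m * n, 0.0f);

    float *dA, *dB, *dC;
    cudaMalloc(&dA, A.size() * sizeof(float));
    cudaMalloc(&dB, B.size() * sizeof(float));
    cudaMalloc(&dC, C.size() * sizeof(float));
    cudaMemcpy(dA, A.data(), A.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B.data(), B.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    // Leading dimensions equal the row counts because the matrices are unpadded.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alpha, dA, m, dB, k, &beta, dC, m);
    cublasDestroy(handle);

    cudaMemcpy(C.data(), dC, C.size() * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // column-major: 4 10 5 11
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}
```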
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+ cublasLtMatmul()
Supports some low-precision data types.

+ cublasLtMatmulEx()

+ cublasLtMatmulBatched()
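For orientation, a heavily simplified sketch of a single cublasLtMatmul call with FP16 inputs and FP32 accumulation. Descriptor attributes, algorithm/heuristic selection, and workspace handling are omitted, and the wrapper name `lt_matmul_fp16` is illustrative, not part of this commit:

```cpp
// Minimal sketch: D = alpha*A*B + beta*C via cublasLt, FP16 data with FP32 compute.
#include <cublasLt.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

void lt_matmul_fp16(const __half *dA, const __half *dB, __half *dC,
                    int m, int n, int k) {
    cublasLtHandle_t lt;
    cublasLtCreate(&lt);

    cublasLtMatmulDesc_t op;
    cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

    // Column-major layouts; leading dimension = number of rows (no padding).
    cublasLtMatrixLayout_t aL, bL, cL;
    cublasLtMatrixLayoutCreate(&aL, CUDA_R_16F, m, k, m);
    cublasLtMatrixLayoutCreate(&bL, CUDA_R_16F, k, n, k);
    cublasLtMatrixLayoutCreate(&cL, CUDA_R_16F, m, n, m);

    const float alpha = 1.0f, beta = 0.0f;
    // C doubles as the output D here; no algo or workspace is supplied.
    cublasLtMatmul(lt, op, &alpha, dA, aL, dB, bL, &beta, dC, cL, dC, cL,
                   nullptr, nullptr, 0, /*stream=*/0);

    cublasLtMatrixLayoutDestroy(aL);
    cublasLtMatrixLayoutDestroy(bL);
    cublasLtMatrixLayoutDestroy(cL);
    cublasLtMatmulDescDestroy(op);
    cublasLtDestroy(lt);
}
```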

doc/excuter/op-mem-cuda/list.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
+| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | concat | none | concat()->() | Tresult = concat([T1, T2...], axis=3) | concat()->() |
+| add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
+| add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | print(T1) | constant(tensor<any> t, var<any> value)->() |

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

Lines changed: 4 additions & 1 deletion
@@ -9,7 +9,10 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct addDispatcher
     {
-        static void add(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
+        static void add(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
+        {
+            throw NotImplementError("add");
+        }
     };
 
     template <typename Author, typename T>
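For context, the default addDispatcher now throws at run time instead of being `= delete`, so a missing author/type specialization fails when called rather than at compile time. A minimal sketch of how such an author-tagged dispatcher is typically consumed follows; the free function `add` and the commented `cblas` specialization are illustrative assumptions (the deepx headers are assumed to be included), not the repository's exact code:

```cpp
// Illustrative sketch (not the repository's exact code): a thin wrapper that
// forwards to the author-specific specialization, falling back to the default
// addDispatcher above, which now throws NotImplementError.
template <typename Author, typename T>
void add(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
{
    addDispatcher<Author, T>::add(A, B, C);
}

// Hypothetical CPU specialization for an author tag "cblas" and float:
// template <>
// struct addDispatcher<cblas, float>
// {
//     static void add(const Tensor<float> &A, const Tensor<float> &B, Tensor<float> &C)
//     {
//         cblas_scopy(A.shape.size, A.data, 1, C.data, 1);       // C = A
//         cblas_saxpy(A.shape.size, 1.0f, B.data, 1, C.data, 1); // C += B
//     }
// };
```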

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 17 additions & 7 deletions
@@ -3,6 +3,7 @@
 #include "deepx/tf/new.hpp"
 #include "deepx/tf/print.hpp"
 #include "deepx/tf/init.hpp"
+#include "deepx/tf/elementwise_basic.hpp"
 #include "deepx/dtype.hpp"
 #include "deepx/tf/tffactory.hpp"
 #include "deepx/tensorfunc/authors.hpp"
@@ -78,23 +79,32 @@ namespace deepx::tf
     // io
     void register_util(TfFactory &opfactory)
     {
-        opfactory.add_tf(std::make_shared<Print<tensorfunc::miaobyte>>(vector<Param>(
+        opfactory.add_tf(std::make_shared<Print< miaobyte>>(vector<Param>(
             {
                 Param("", DataCategory::Tensor, Precision::Any),
             }),
             vector<Param>()));
 
-        opfactory.add_tf(std::make_shared<Print<tensorfunc::miaobyte>>(vector<Param>(
+        opfactory.add_tf(std::make_shared<Print< miaobyte>>(vector<Param>(
             {
                 Param("", DataCategory::Tensor, Precision::Any),
                 Param("", DataCategory::Var, Precision::String),
             }),
             vector<Param>()));
     }
 
-    // // elementwise
-    // void register_elementwise(OpFactory &opfactory)
-    // {
+    // elementwise
+    void register_elementwise(TfFactory &tffactory)
+    {
+        tffactory.add_tf(std::make_shared<Add<miaobyte>>(vector<Param>(
+            {
+                Param("a", DataCategory::Tensor, Precision::Any),
+                Param("b", DataCategory::Tensor, Precision::Any),
+            }),
+            vector<Param>(
+            {
+                Param("c", DataCategory::Tensor, Precision::Any),
+            })));
         // opfactory.add_op(Add_miaobyte<float>());
         // opfactory.add_op(Add_miaobyte<double>());
         // opfactory.add_op(Add_miaobyte<int8_t>());
@@ -140,7 +150,7 @@ namespace deepx::tf
 
         // opfactory.add_op(Powscalar_miaobyte<float>());
         // opfactory.add_op(Powscalar_miaobyte<double>());
-    // }
+    }
     // // matmul
     // void register_matmul(OpFactory &opfactory)
     // {
@@ -174,7 +184,7 @@ namespace deepx::tf
         register_lifecycle(tffactory);
         register_init(tffactory);
         register_util(tffactory);
-        // register_elementwise(opfactory);
+        register_elementwise(tffactory);
         // register_matmul(opfactory);
         register_changeshape(tffactory);
         // register_reduce(opfactory);

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_cublas_basic.hpp

Lines changed: 24 additions & 26 deletions
@@ -13,71 +13,69 @@
 #include "deepx/tensorfunc/cuda.hpp"
 namespace deepx::tensorfunc
 {
-
-    // float specialization
+
+    // double specialization
     template <>
-    struct addDispatcher<cublas, float>
+    struct addDispatcher<cublas, double>
     {
-        static void add(const Tensor<float> &A, const Tensor<float> &B, Tensor<float> &C)
+        static void add(const Tensor<double> &A, const Tensor<double> &B, Tensor<double> &C)
         {
             if (A.shape.size != B.shape.size || A.shape.size != C.shape.size)
             {
                 throw std::runtime_error("Tensor shapes must match for addition");
             }
 
             static CublasHandle handle;
-            const float alpha = 1.0f;
-            const float beta = 1.0f;
-
-            // use cublasSgeam to compute C = alpha*A + beta*B directly
-            auto status = cublasSgeam(handle.get(),
-                                      CUBLAS_OP_N, // do not transpose A
-                                      CUBLAS_OP_N, // do not transpose B
-                                      A.shape.size, 1, // treated as a flattened vector
+            const double alpha = 1.0;
+            const double beta = 1.0;
+            auto status = cublasDgeam(handle.get(),
+                                      CUBLAS_OP_N,
+                                      CUBLAS_OP_N,
+                                      A.shape.size, 1,
                                       &alpha,
                                       A.data, A.shape.size,
                                       &beta,
                                       B.data, B.shape.size,
                                       C.data, C.shape.size);
-
             if (status != CUBLAS_STATUS_SUCCESS)
             {
-                throw std::runtime_error("cuBLAS Sgeam failed");
+                throw std::runtime_error("cuBLAS Dgeam failed");
             }
         }
     };
-
-    // double specialization
+    // float specialization
     template <>
-    struct addDispatcher<cublas, double>
+    struct addDispatcher<cublas, float>
    {
-        static void add(const Tensor<double> &A, const Tensor<double> &B, Tensor<double> &C)
+        static void add(const Tensor<float> &A, const Tensor<float> &B, Tensor<float> &C)
         {
             if (A.shape.size != B.shape.size || A.shape.size != C.shape.size)
             {
                 throw std::runtime_error("Tensor shapes must match for addition");
             }
 
             static CublasHandle handle;
-            const double alpha = 1.0;
-            const double beta = 1.0;
-            auto status = cublasDgeam(handle.get(),
-                                      CUBLAS_OP_N,
-                                      CUBLAS_OP_N,
-                                      A.shape.size, 1,
+            const float alpha = 1.0f;
+            const float beta = 1.0f;
+
+            // use cublasSgeam to compute C = alpha*A + beta*B directly
+            auto status = cublasSgeam(handle.get(),
+                                      CUBLAS_OP_N, // do not transpose A
+                                      CUBLAS_OP_N, // do not transpose B
+                                      A.shape.size, 1, // treated as a flattened vector
                                       &alpha,
                                       A.data, A.shape.size,
                                       &beta,
                                       B.data, B.shape.size,
                                       C.data, C.shape.size);
+
             if (status != CUBLAS_STATUS_SUCCESS)
             {
-                throw std::runtime_error("cuBLAS Dgeam failed");
+                throw std::runtime_error("cuBLAS Sgeam failed");
             }
         }
     };
 
-
 }
 
 #endif
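The same flattening trick in isolation, as a self-contained sketch (device pointers and a cuBLAS handle are assumed to be set up by the caller): element-wise addition of two length-n buffers by treating them as n x 1 column-major matrices for cublasSgeam:

```cpp
// Minimal standalone sketch: C = 1*A + 1*B over flat device buffers via cublasSgeam.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdexcept>

void add_flat(cublasHandle_t handle, const float *dA, const float *dB, float *dC, int n) {
    const float alpha = 1.0f, beta = 1.0f;
    cublasStatus_t status = cublasSgeam(handle,
                                        CUBLAS_OP_N, CUBLAS_OP_N,
                                        n, 1,            // n rows, 1 column
                                        &alpha, dA, n,
                                        &beta,  dB, n,
                                        dC, n);
    if (status != CUBLAS_STATUS_SUCCESS) {
        throw std::runtime_error("cuBLAS Sgeam failed");
    }
}
```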
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
#ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
#define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH

#include "deepx/tensorfunc/elementwise.hpp"
#include "deepx/tensorfunc/cuda.hpp"
#include "deepx/tensorfunc/authors.hpp"

namespace deepx::tensorfunc
{
    template <typename T>
    __global__ void add_kernel(const T* A, const T* B, T* C, int size) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) {
            C[idx] = A[idx] + B[idx];
        }
    }
    template __global__ void add_kernel<double>(const double* A, const double* B, double* C, int size);
    template __global__ void add_kernel<float>(const float* A, const float* B, float* C, int size);
    template __global__ void add_kernel<half>(const half* A, const half* B, half* C, int size);
    template __global__ void add_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16* B, nv_bfloat16* C, int size);
    template __global__ void add_kernel<int64_t>(const int64_t* A, const int64_t* B, int64_t* C, int size);
    template __global__ void add_kernel<int32_t>(const int32_t* A, const int32_t* B, int32_t* C, int size);
    template __global__ void add_kernel<int16_t>(const int16_t* A, const int16_t* B, int16_t* C, int size);
    template __global__ void add_kernel<int8_t>(const int8_t* A, const int8_t* B, int8_t* C, int size);

    template <typename T>
    void launch_add(int numBlocks, int blockSize, const T* a, const T* b, T* c, int size)
    {
        // launch the kernel
        add_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
        // check whether the kernel launch succeeded
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            throw std::runtime_error("Failed to launch add kernel: " +
                                     std::string(cudaGetErrorString(err)));
        }
    }

    template void launch_add<double>(int numBlocks, int blockSize, const double* a, const double* b, double* c, int size);
    template void launch_add<float>(int numBlocks, int blockSize, const float* a, const float* b, float* c, int size);
    template void launch_add<half>(int numBlocks, int blockSize, const half* a, const half* b, half* c, int size);
    template void launch_add<nv_bfloat16>(int numBlocks, int blockSize, const nv_bfloat16* a, const nv_bfloat16* b, nv_bfloat16* c, int size);
    template void launch_add<int64_t>(int numBlocks, int blockSize, const int64_t* a, const int64_t* b, int64_t* c, int size);
    template void launch_add<int32_t>(int numBlocks, int blockSize, const int32_t* a, const int32_t* b, int32_t* c, int size);
    template void launch_add<int16_t>(int numBlocks, int blockSize, const int16_t* a, const int16_t* b, int16_t* c, int size);
    template void launch_add<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c, int size);
}

#endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
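A short illustrative host-side wrapper (not part of the commit) showing the usual way to derive the launch configuration that launch_add expects; `dA`, `dB`, `dC` are assumed to be device pointers and the header above is assumed to be included:

```cpp
// Illustrative usage sketch: one thread per element, grid size rounded up.
#include <cuda_runtime.h>

void add_on_device(const float* dA, const float* dB, float* dC, int size)
{
    const int blockSize = 256;                                 // threads per block
    const int numBlocks = (size + blockSize - 1) / blockSize;  // ceiling division
    deepx::tensorfunc::launch_add<float>(numBlocks, blockSize, dA, dB, dC, size);
    // launch_add only checks the launch itself; synchronize to surface
    // asynchronous execution errors as well.
    cudaDeviceSynchronize();
}
```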
