excuter(cpu/cuda):subscalar

miaobyte · miaobyte · commit f71f809bfd2b · 2025-03-20T16:05:36.000+08:00
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
@@ -5,8 +5,10 @@
 | Operation | Author | Func Def | Math Formula | IR Instruction |
 |-----------|--------|------------|--------------|----------------|
 | addscalar | miaobyte | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
+| add | cublas | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+| subscalar | miaobyte | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) | T3=T1-scalar | subscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1) | constant(tensor<any> t, var<any> value)->() |
 | print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
@@ -9,6 +9,7 @@
 | add | cblas | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
 | add | miaobyte | add(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | uniform | miaobyte | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() | uniform(T1,low,high,seed) | uniform(tensor<any> t, var<any> low, var<any> high, var<int32> seed)->() |
+| subscalar | miaobyte | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) | T3=T1-scalar | subscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
 | arange | miaobyte | arange(tensor<any> t, var<any> start, var<any> step)->() | arange(T1,start,step) | arange(tensor<any> t, var<any> start, var<any> step)->() |
 | constant | miaobyte | constant(tensor<any> t, var<any> value)->() | constant(T1,value) | constant(tensor<any> t, var<any> value)->() |
 | print | miaobyte | print(tensor<any> )->() | print(T1) | print(tensor<any> )->() |
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
@@ -24,7 +24,9 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct addscalarDispatcher
     {
-        static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
+        static void addscalar(const Tensor<T> &input, const T value, Tensor<T> &output){  // primary template: used only when no specialization exists for Author/T
+            throw NotImplementError("addscalar");
+        }
     };
 
     template <typename Author, typename T>
@@ -36,7 +38,9 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct subDispatcher
     {
-        static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
+        static void sub(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C){  // primary template: used only when no specialization exists for Author/T
+            throw NotImplementError("sub");
+        }
     };
 
     template <typename Author, typename T>
@@ -48,7 +52,9 @@ namespace deepx::tensorfunc
     template <typename Author, typename T>
     struct subscalarDispatcher
     {
-        static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output) = delete;
+        static void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output){  // primary template: used only when no specialization exists for Author/T
+            throw NotImplementError("subscalar");
+        }
     };
 
     template <typename Author, typename T>
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -114,7 +114,7 @@ namespace deepx::tf
                                                              {
                                                                  Param("c", DataCategory::Tensor, Precision::Any),
                                                              })));
-        tffactory.add_tf(std::make_shared<Addscalar<miaobyte>>(vector<Param>(
+        tffactory.add_tf(std::make_shared<AddScalar<miaobyte>>(vector<Param>(
                                                                    {
                                                                        Param("A", DataCategory::Tensor, Precision::Any),
                                                                        Param("b", DataCategory::Var, Precision::Any),
@@ -133,7 +133,16 @@ namespace deepx::tf
                                                                    {
                                                                        Param("C", DataCategory::Tensor, Precision::Any),
                                                                    })));
-        
+        tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
+                                                                   {
+                                                                       Param("A", DataCategory::Tensor, Precision::Any),
+                                                                       Param("b", DataCategory::Var, Precision::Any),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("C", DataCategory::Tensor, Precision::Any),
+                                                                   })));
+
         //     opfactory.add_op(Sub_cblas<float>());
         //     opfactory.add_op(Sub_cblas<double>());
 
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -105,7 +105,34 @@ namespace deepx::tensorfunc
     template void launch_sub<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t* b, int16_t* c, const int size);
     template void launch_sub<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t* b, int8_t* c, const int size);    
     
-    
+    template <typename T>
+    __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size){  // writes C[i] = A[i] - scalar for every i < size
+        int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global index of this thread, using only the x dimension
+        if (idx < size) {  // threads whose index falls past the end do nothing
+            C[idx] = A[idx] - scalar;
+        }
+    }   
+    template __global__ void subscalar_kernel<double>(const double* A, const double scalar, double* C,const int size);
+    template __global__ void subscalar_kernel<float>(const float* A, const float scalar, float* C,const int size);
+    template __global__ void subscalar_kernel<half>(const half* A, const half scalar, half* C,const int size);
+    template __global__ void subscalar_kernel<nv_bfloat16>(const nv_bfloat16* A, const nv_bfloat16 scalar, nv_bfloat16* C,const int size);
+    template __global__ void subscalar_kernel<int64_t>(const int64_t* A, const int64_t scalar, int64_t* C,const int size);  
+    template __global__ void subscalar_kernel<int32_t>(const int32_t* A, const int32_t scalar, int32_t* C,const int size);  
+    template __global__ void subscalar_kernel<int16_t>(const int16_t* A, const int16_t scalar, int16_t* C,const int size);  
+    template __global__ void subscalar_kernel<int8_t>(const int8_t* A, const int8_t scalar, int8_t* C,const int size);  
+
+    template <typename T>
+    void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c, const int size) { // host-side wrapper around subscalar_kernel
+        subscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);  // numBlocks blocks of blockSize threads; the launch result is not checked here
+    }
+    template void launch_subscalar<double>(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c, const int size);
+    template void launch_subscalar<float>(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c, const int size);
+    template void launch_subscalar<half>(const int numBlocks, const int blockSize, const half* a, const half scalar, half* c, const int size);
+    template void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c, const int size);  
+    template void launch_subscalar<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c, const int size);  
+    template void launch_subscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c, const int size);  
+    template void launch_subscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c, const int size);  
+    template void launch_subscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c, const int size);    
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
@@ -103,7 +103,38 @@ namespace deepx::tensorfunc
 
     template <> 
     void launch_sub<int8_t>(int numBlocks, int blockSize, const int8_t* a, const int8_t* b, int8_t* c,const int size);
-    
+
+    // subscalar: C[i] = A[i] - scalar for i < size (kernel and host launcher declarations)
+    template <typename T>
+    __global__ void subscalar_kernel(const T* A, const T scalar, T* C,const int size);
+
+    template <typename T>
+    void launch_subscalar(const int numBlocks, const int blockSize, const T* a, const T scalar, T* c,const int size);
+
+    template <>
+    void launch_subscalar<double>(const int numBlocks, const int blockSize, const double* a, const double scalar, double* c,const int size);
+
+    template <>
+    void launch_subscalar<float>(const int numBlocks, const int blockSize, const float* a, const float scalar, float* c,const int size);
+
+    template <>
+    void launch_subscalar<nv_bfloat16>(const int numBlocks, const int blockSize, const nv_bfloat16* a, const nv_bfloat16 scalar, nv_bfloat16* c,const int size);
+
+    template <>
+    void launch_subscalar<__half>(const int numBlocks, const int blockSize, const __half* a, const __half scalar, __half* c,const int size);
+
+    template <>
+    void launch_subscalar<int64_t>(const int numBlocks, const int blockSize, const int64_t* a, const int64_t scalar, int64_t* c,const int size);
+
+    template <>
+    void launch_subscalar<int32_t>(const int numBlocks, const int blockSize, const int32_t* a, const int32_t scalar, int32_t* c,const int size);
+
+    template <>
+    void launch_subscalar<int16_t>(const int numBlocks, const int blockSize, const int16_t* a, const int16_t scalar, int16_t* c,const int size);
+
+    template <>
+    void launch_subscalar<int8_t>(const int numBlocks, const int blockSize, const int8_t* a, const int8_t scalar, int8_t* c,const int size);    
+ 
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_CUH
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
@@ -55,6 +55,27 @@ namespace deepx::tensorfunc
             launch_sub(numBlocks, blockSize, A.data, B.data, C.data, A.shape.size);
         }
     };
+
+    // subscalar for the miaobyte author on CUDA: C = A - scalar, element-wise.
+    template <typename T>
+    struct subscalarDispatcher<miaobyte, T>
+    {
+        // Throws TensorShapeError when A and C differ in element count.
+        // An empty tensor is a no-op.
+        static void subscalar(const Tensor<T> &A, const T scalar, Tensor<T> &C)
+        {
+            if (A.shape.size != C.shape.size) {
+                throw TensorShapeError("subscalar");
+            }
+            if (A.shape.size == 0) {
+                // blockSize would be 0 below, so computing numBlocks would divide by zero.
+                return;
+            }
+            const int blockSize = A.shape.size > 256 ? 256 : A.shape.size;
+            int numBlocks = (A.shape.size + blockSize - 1) / blockSize;
+            launch_subscalar(numBlocks, blockSize, A.data, scalar, C.data, A.shape.size);
+        }
+    };
 }
 
 #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -83,18 +83,18 @@ namespace deepx::tf
     };
 
     template <typename Author>
-    class Addscalar : public TF
+    class AddScalar : public TF
     {
     public:
-        Addscalar(const vector<Param> &args, const vector<Param> &returns)
+        AddScalar(const vector<Param> &args, const vector<Param> &returns)
         {
             this->name = "addscalar";
             this->author = Author::name();
             this->args = args;
             this->returns = returns;
         }
 
-        Addscalar(string text)
+        AddScalar(string text)
         {
             this->parse(text);
             this->author = Author::name();
@@ -109,7 +109,7 @@ namespace deepx::tf
         }
         shared_ptr<TF> clone() const override
         {
-            return make_shared<Addscalar<Author>>(*this);
+            return make_shared<AddScalar<Author>>(*this);
         }
         int run(shared_ptr<MemBase> mem, string &error) override
         {
@@ -226,6 +226,89 @@ namespace deepx::tf
             return 0;
         }
     };
+
+    // TF for the "subscalar" op: T3 = T1 - scalar, element-wise.
+    // args[0] names the input tensor, args[1] is the scalar var and
+    // returns[0] names the output tensor.
+    template <typename Author>
+    class SubScalar : public TF
+    {
+    public:
+        // Builds the op from explicit argument and return descriptors.
+        SubScalar(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "subscalar";
+            this->author = Author::name();
+            this->args = args;
+            this->returns = returns;
+        }
+
+        // Builds the op by parsing text; throws std::runtime_error if the parsed name is not "subscalar".
+        SubScalar(string text)
+        {
+            this->parse(text);
+            this->author = Author::name();
+            if (this->name != "subscalar")
+            {
+                throw std::runtime_error("Invalid name: " + this->name);
+            }
+        }
+        string math_formula() const override
+        {
+            return "T3=T1-scalar";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<SubScalar<Author>>(*this);
+        }
+        // Checks that the input and output tensors share a dtype, then calls
+        // tensorfunc::subscalar with the matching element type.
+        // Returns 0 on success; on a dtype mismatch or an unsupported dtype
+        // it sets error and returns 1.
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type)
+            {
+            case Precision::Float64:
+                tensorfunc::subscalar<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), this->getvar<double>(1, mem), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::subscalar<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), this->getvar<float>(1, mem), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                tensorfunc::subscalar<Author, half>(*mem->gettensor<half>(this->args[0].textvalue), this->getvar<half>(1, mem), *mem->gettensor<half>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                tensorfunc::subscalar<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), this->getvar<nv_bfloat16>(1, mem), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                tensorfunc::subscalar<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), this->getvar<int64_t>(1, mem), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::subscalar<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::subscalar<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::subscalar<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+ 
+    
 };
 
 #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -140,6 +140,16 @@ namespace deepx::tf
                                                                  {
                                                                      Param("c", DataCategory::Tensor, Precision::Any),
                                                                  }))); 
+
+        tffactory.add_tf(std::make_shared<SubScalar<miaobyte>>(vector<Param>(
+                                                                 {
+                                                                     Param("a", DataCategory::Tensor, Precision::Any),
+                                                                     Param("scalar", DataCategory::Var, Precision::Any),
+                                                                 }),
+                                                             vector<Param>(
+                                                                 {
+                                                                     Param("c", DataCategory::Tensor, Precision::Any),
+                                                                 }))); 
         //     opfactory.add_op(Addscalar_miaobyte<float>());
         //     opfactory.add_op(Addscalar_miaobyte<double>());
 
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp