Skip to content

Commit 510a09f

Browse files
front&excuter:联合调试matmul.cblas,cublas (#9)
* excuter(cpu/cuda):subscalar * front:newtensor,print 联合调试 * front:newtensor,print 联合调试 * Fix build error in gcc compiler. (#5) In gcc/++13 compiler, it shows error: ``` dtype.hpp:8:29: error: found ‘:’ in nested-name-specifier, expected ‘::’ 8 | enum class DataCategory : uint8_t ``` * front&excuter:联合调试,修复init、elementwise的IR * front&excuter:联合调试matmul.cblas,cublas --------- Co-authored-by: harryharrygo <harryharrygogogo@gmail.com>
1 parent 26c8fc3 commit 510a09f

12 files changed

Lines changed: 358 additions & 26 deletions

File tree

doc/excuter/op-mem-cuda/list.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@
1616
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
1717
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 = zeros(shape) | newtensor(var<string> shape)->(tensor<any> tensor1) |
1818
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
19+
| matmul | cublas | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
1920
| sub | miaobyte | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1-T2 | sub(tensor<any> A, tensor<any> B)->(tensor<any> C) |
2021
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,7 @@
1717
| newtensor | none | newtensor(vector<int32> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(vector<int32> shape)->(tensor<any> tensor1) |
1818
| newtensor | none | newtensor(var<string> shape)->(tensor<any> tensor1) | T1 =Tensor(shape=[...]) | newtensor(var<string> shape)->(tensor<any> tensor1) |
1919
| vecset | none | vecset(vector<any> value)->(vector<any> name) | shape = [3 4 5] | vecset(vector<any> value)->(vector<any> name) |
20+
| matmul | cblas | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) | T3=T1 @ T2 | matmul(tensor<float64|float32> A, tensor<float64|float32> B)->(tensor<float64|float32> C) |
21+
| matmul | miaobyte | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) | T3=T1 @ T2 | matmul(tensor<any> A, tensor<any> B)->(tensor<any> C) |
2022
| sub | miaobyte | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) | T3=T1-T2 | sub(tensor<any> a, tensor<any> b)->(tensor<any> c) |
2123
| argset | none | argset(var<any> value)->(var<any> name) | var argname = argvalue | argset(var<any> value)->(var<any> name) |

excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include "deepx/tensor.hpp"
55
#include "deepx/tensorfunc/authors.hpp"
6-
6+
#include "stdutil/error.hpp"
77
namespace deepx::tensorfunc
88
{
99
bool check_matmul_shape(const Shape &a, const Shape &b)
@@ -29,7 +29,10 @@ namespace deepx::tensorfunc
2929
template <typename Author, typename T>
3030
struct matmulDispatcher
3131
{
32-
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
32+
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
33+
{
34+
throw NotImplementError("matmul");
35+
}
3336
};
3437

3538
template <typename Author, typename T>

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "deepx/tf/print.hpp"
55
#include "deepx/tf/init.hpp"
66
#include "deepx/tf/elementwise_basic.hpp"
7+
#include "deepx/tf/matmul.hpp"
78
#include "deepx/dtype.hpp"
89
#include "deepx/tf/tffactory.hpp"
910
#include "deepx/tensorfunc/authors.hpp"
@@ -173,12 +174,19 @@ namespace deepx::tf
173174
// opfactory.add_op(Powscalar_miaobyte<float>());
174175
// opfactory.add_op(Powscalar_miaobyte<double>());
175176
}
176-
// // matmul
177-
// void register_matmul(OpFactory &opfactory)
178-
// {
179-
// opfactory.add_op(MatMul<float>());
180-
// opfactory.add_op(MatMul<double>());
181-
// }
177+
// matmul
178+
void register_matmul(TfFactory &tffactory)
179+
{
180+
tffactory.add_tf(std::make_shared<MatMul<cublas>>(vector<Param>(
181+
{
182+
Param("A", DataCategory::Tensor, Precision::Any),
183+
Param("B", DataCategory::Tensor, Precision::Any),
184+
}),
185+
vector<Param>(
186+
{
187+
Param("C", DataCategory::Tensor, Precision::Any),
188+
})));
189+
}
182190
// // changeshape
183191
void register_changeshape(TfFactory &tffactory)
184192
{
@@ -207,7 +215,7 @@ namespace deepx::tf
207215
register_init(tffactory);
208216
register_util(tffactory);
209217
register_elementwise(tffactory);
210-
// register_matmul(opfactory);
218+
register_matmul(tffactory);
211219
register_changeshape(tffactory);
212220
// register_reduce(opfactory);
213221
return 0;
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#ifndef DEEPX_TF_MATMUL_HPP
2+
#define DEEPX_TF_MATMUL_HPP
3+
4+
#include <cuda_fp16.h>
5+
#include <cuda_bf16.h>
6+
7+
#include "deepx/tf/tf.hpp"
8+
#include "deepx/dtype.hpp"
9+
#include "deepx/dtype_cuda.hpp"
10+
#include "deepx/tensorfunc/matmul_cublas.hpp"
11+
12+
namespace deepx::tf
13+
{
14+
template <typename Author>
15+
class MatMul : public TF
16+
{
17+
public:
18+
MatMul(const vector<Param> &args, const vector<Param> &returns)
19+
{
20+
this->name = "matmul";
21+
this->author = Author::name();
22+
this->args = args;
23+
this->returns = returns;
24+
}
25+
26+
MatMul(string text)
27+
{
28+
this->parse(text);
29+
this->author = Author::name();
30+
if (this->name != "matmul")
31+
{
32+
throw std::runtime_error("Invalid name: " + this->name);
33+
}
34+
}
35+
string math_formula() const override
36+
{
37+
return "T3=T1 @ T2";
38+
}
39+
shared_ptr<TF> clone() const override
40+
{
41+
return make_shared<MatMul<Author>>(*this);
42+
}
43+
int run(shared_ptr<MemBase> mem, string &error) override
44+
{
45+
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
46+
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
47+
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
48+
if (a_type != b_type || a_type != c_type)
49+
{
50+
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
51+
return 1;
52+
}
53+
switch (a_type)
54+
{
55+
case Precision::Float64:
56+
tensorfunc::matmul<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
57+
break;
58+
case Precision::Float32:
59+
tensorfunc::matmul<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
60+
break;
61+
case Precision::Float16:
62+
tensorfunc::matmul<Author, half>(*mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<half>(this->args[1].textvalue), *mem->gettensor<half>(this->returns[0].textvalue));
63+
break;
64+
case Precision::BFloat16:
65+
tensorfunc::matmul<Author, nv_bfloat16>(*mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<nv_bfloat16>(this->args[1].textvalue), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
66+
break;
67+
case Precision::Int64:
68+
tensorfunc::matmul<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), *mem->gettensor<int64_t>(this->returns[0].textvalue));
69+
break;
70+
case Precision::Int32:
71+
tensorfunc::matmul<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
72+
break;
73+
case Precision::Int16:
74+
tensorfunc::matmul<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
75+
break;
76+
case Precision::Int8:
77+
tensorfunc::matmul<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
78+
break;
79+
default:
80+
error = "Unsupported dtype: " + precision_str(a_type);
81+
return 1;
82+
}
83+
return 0;
84+
}
85+
};
86+
}
87+
88+
#endif

excuter/op-mem-ompsimd/src/client/tfs.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include "deepx/tf/changeshape.hpp"
99
#include "deepx/tf/elementwise.hpp"
1010
#include "deepx/tf/tffactory.hpp"
11-
11+
#include "deepx/tf/matmul.hpp"
1212
#include "deepx/tensorfunc/authors.hpp"
1313
namespace deepx::tf
1414
{
@@ -186,12 +186,28 @@ namespace deepx::tf
186186
// opfactory.add_op(Powscalar_miaobyte<float>());
187187
// opfactory.add_op(Powscalar_miaobyte<double>());
188188
}
189-
// // matmul
190-
// void register_matmul(OpFactory &opfactory)
191-
// {
192-
// opfactory.add_op(MatMul<float>());
193-
// opfactory.add_op(MatMul<double>());
194-
// }
189+
// matmul
190+
void register_matmul(TfFactory &tffactory)
191+
{
192+
tffactory.add_tf(std::make_shared<MatMul<miaobyte>>(vector<Param>(
193+
{
194+
Param("A", DataCategory::Tensor, Precision::Any),
195+
Param("B", DataCategory::Tensor, Precision::Any),
196+
}),
197+
vector<Param>(
198+
{
199+
Param("C", DataCategory::Tensor, Precision::Any),
200+
})));
201+
tffactory.add_tf(std::make_shared<MatMul<cblas>>(vector<Param>(
202+
{
203+
Param("A", DataCategory::Tensor, Precision::Float64|Precision::Float32),
204+
Param("B", DataCategory::Tensor, Precision::Float64|Precision::Float32),
205+
}),
206+
vector<Param>(
207+
{
208+
Param("C", DataCategory::Tensor, Precision::Float64|Precision::Float32),
209+
})));
210+
}
195211
// // changeshape
196212
void register_changeshape(TfFactory &tffactory)
197213
{
@@ -220,7 +236,7 @@ namespace deepx::tf
220236
register_init(tffactory);
221237
register_util(tffactory);
222238
register_elementwise(tffactory);
223-
// register_matmul(opfactory);
239+
register_matmul(tffactory);
224240
register_changeshape(tffactory);
225241
// register_reduce(opfactory);
226242
return 0;

excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_cblas.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
#ifndef DEEPX_TENSORFUNC_MATMUL_HPP
2-
#define DEEPX_TENSORFUNC_MATMUL_HPP
1+
#ifndef DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP
2+
#define DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP
33

44
#include <cblas.h> // 如果使用 OpenBLAS
55
#include "deepx/tensor.hpp"
@@ -64,7 +64,7 @@ namespace deepx::tensorfunc
6464
{
6565
static void matmul(const Tensor<double> &a, const Tensor<double> &b, Tensor<double> &c)
6666
{
67-
if (!check_shape(a.shape, b.shape))
67+
if (!check_matmul_shape(a.shape, b.shape))
6868
{
6969
throw std::invalid_argument("a.shape could matmul with b.shape");
7070
}
@@ -150,7 +150,7 @@ namespace deepx::tensorfunc
150150
{
151151
static void matmuladd(const Tensor<float> &a, const Tensor<float> &b, const float &alpha, const float &beta, Tensor<float> &c)
152152
{
153-
if (!check_shape(a.shape, b.shape))
153+
if (!check_matmul_shape(a.shape, b.shape))
154154
{
155155
throw std::invalid_argument("a.shape could matmul with b.shape");
156156
}
@@ -208,7 +208,7 @@ namespace deepx::tensorfunc
208208
{
209209
static void matmuladd(const Tensor<double> &a, const Tensor<double> &b, const double &alpha, const double &beta, Tensor<double> &c)
210210
{
211-
if (!check_shape(a.shape, b.shape))
211+
if (!check_matmul_shape(a.shape, b.shape))
212212
{
213213
throw std::invalid_argument("a.shape could matmul with b.shape");
214214
}
@@ -261,4 +261,4 @@ namespace deepx::tensorfunc
261261
}
262262
};
263263
}
264-
#endif // DEEPX_TENSORFUNC_MATMUL_HPP
264+
#endif // DEEPX_TENSORFUNC_MATMUL_CBLAS_HPP
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#ifndef DEEPX_TF_MATMUL_HPP
2+
#define DEEPX_TF_MATMUL_HPP
3+
4+
#include "deepx/tf/tf.hpp"
5+
#include "deepx/dtype.hpp"
6+
#include "deepx/dtype_ompsimd.hpp"
7+
#include "deepx/tensorfunc/matmul.hpp"
8+
#include "deepx/tensorfunc/matmul_cblas.hpp"
9+
#include "deepx/tensorfunc/matmul_miaobyte.hpp"
10+
namespace deepx::tf
11+
{
12+
template <typename Author>
13+
class MatMul : public TF
14+
{
15+
public:
16+
MatMul(const vector<Param> &args, const vector<Param> &returns)
17+
{
18+
this->name = "matmul";
19+
this->author = Author::name();
20+
this->args = args;
21+
this->returns = returns;
22+
}
23+
24+
MatMul(string text)
25+
{
26+
this->parse(text);
27+
this->author = Author::name();
28+
if (this->name != "matmul")
29+
{
30+
throw std::runtime_error("Invalid name: " + this->name);
31+
}
32+
}
33+
string math_formula() const override
34+
{
35+
return "T3=T1 @ T2";
36+
}
37+
shared_ptr<TF> clone() const override
38+
{
39+
return make_shared<MatMul<Author>>(*this);
40+
}
41+
int run(shared_ptr<MemBase> mem, string &error) override
42+
{
43+
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
44+
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
45+
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
46+
if (a_type != b_type || a_type != c_type)
47+
{
48+
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
49+
return 1;
50+
}
51+
switch (a_type)
52+
{
53+
case Precision::Float64:
54+
tensorfunc::matmul<Author, double>(*mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
55+
break;
56+
case Precision::Float32:
57+
tensorfunc::matmul<Author, float>(*mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
58+
break;
59+
case Precision::Int64:
60+
tensorfunc::matmul<Author, int64_t>(*mem->gettensor<int64_t>(this->args[0].textvalue), *mem->gettensor<int64_t>(this->args[1].textvalue), *mem->gettensor<int64_t>(this->returns[0].textvalue));
61+
break;
62+
case Precision::Int32:
63+
tensorfunc::matmul<Author, int32_t>(*mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
64+
break;
65+
case Precision::Int16:
66+
tensorfunc::matmul<Author, int16_t>(*mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
67+
break;
68+
case Precision::Int8:
69+
tensorfunc::matmul<Author, int8_t>(*mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
70+
break;
71+
default:
72+
error = "Unsupported dtype: " + precision_str(a_type);
73+
return 1;
74+
}
75+
return 0;
76+
}
77+
};
78+
}
79+
80+
#endif

front/py/deepx/nn/functional/matmul.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
def matmul(
1212
a:Tensor,
1313
b: Tensor,
14-
out:Union[Tensor,str]='')->Tensor:
14+
out:Union[Tensor,str]='',
15+
author:str='cublas'):
1516
opnode = a.graph.add_op("matmul")
1617
opnode.add_input(a.node)
1718
opnode.add_input(b.node)
@@ -25,6 +26,6 @@ def matmul(
2526
outtensor=out
2627
outtensor.node.add_input(opnode)
2728
if a.graph.eager:
28-
ir=DeepxIR("matmul", a.dtype, [a.node.name,b.node.name], [outtensor.node.name])
29+
ir=DeepxIR("matmul", [a.node.name,b.node.name], [outtensor.node.name], author=author)
2930
send(ir)
3031
return outtensor

front/py/deepx/scheduler/client/udpconn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import select
44

55
class UDPConn:
6-
def __init__(self, endpoint: str = "localhost:8080"):
6+
def __init__(self, endpoint: str = "localhost:9090"):
77
# 解析endpoint
88
self._host, port_str = endpoint.split(':')
99
self._port = int(port_str)

0 commit comments

Comments (0)