Skip to content

Commit d9551a1

Browse files
committed
matmul:qwang
1 parent 6bbd44d commit d9551a1

8 files changed

Lines changed: 201 additions & 83 deletions

File tree

excuter/cpp-common/src/deepx/tensorfunc/authors.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ namespace deepx::tensorfunc{
2424
public:
2525
static std::string name() { return "cublas"; }
2626
};
27+
28+
// Author tag for the hand-written tiled shared-memory CUDA matmul
// (implemented in matmul_qwang.cuh / matmul.cu). Used as the Author
// template parameter of matmulDispatcher.
class qwang{
    public:
        // Fixed: previously returned the transposed spelling "wqing",
        // which matched neither the class name nor the matmul_qwang file names.
        static std::string name() { return "qwang"; }
};
2732
}
2833

2934
#endif

excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "deepx/tensor.hpp"
55
#include "deepx/tensorfunc/authors.hpp"
6+
#include "stdutil/error.hpp"
67

78
namespace deepx::tensorfunc
89
{
@@ -29,7 +30,9 @@ namespace deepx::tensorfunc
2930
template <typename Author, typename T>
3031
struct matmulDispatcher
3132
{
32-
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
33+
static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) {
34+
throw NotImplementError("matmul");
35+
};
3336
};
3437

3538
template <typename Author, typename T>

excuter/op-mem-cuda/src/deepx/tensorfunc/matmul.cu

Lines changed: 0 additions & 78 deletions
This file was deleted.
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#include "deepx/tensorfunc/cuda.hpp"
2+
3+
// #include <cuda_fp64.h>
4+
// #include <cuda_fp32.h>
5+
#include "deepx/tensor.hpp"
6+
7+
#include "deepx/tensorfunc/matmul_qwang.cuh"
8+
#include "deepx/tensorfunc/authors.hpp"
9+
#include "deepx/tensorfunc/cuda.hpp"
10+
11+
namespace deepx::tensorfunc
12+
{
13+
14+
// Tile width for the shared-memory blocked matmul. Each thread block is
// BLOCK_SIZE x BLOCK_SIZE threads and computes one BLOCK_SIZE x BLOCK_SIZE
// tile of C. (matmul_qwang.cuh defines the same value; identical macro
// redefinition is well-formed.)
#define BLOCK_SIZE 32

// C = A * B with A (M x K), B (K x N), C (M x N), all row-major, on device.
// Launch contract: blockDim = (BLOCK_SIZE, BLOCK_SIZE),
// gridDim = (ceil(N / BLOCK_SIZE), ceil(M / BLOCK_SIZE)).
// half / nv_bfloat16 instantiations rely on the device arithmetic operators
// from cuda_fp16.h / cuda_bf16.h.
template <typename T>
__global__ void matmul_kernel(T *C, const T *A, const T *B,
                              int M, int N, int K)
{
    // Shared-memory staging tiles for the current blocks of A and B.
    __shared__ T tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T tileB[BLOCK_SIZE][BLOCK_SIZE];

    // Global element of C this thread is responsible for.
    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    // Accumulate in T. Fixed: use T(0) instead of the double literal 0.0 so
    // half / nv_bfloat16 / integer instantiations avoid double conversions.
    T sum = T(0);

    // March over the K dimension one BLOCK_SIZE-wide tile at a time.
    for (int t = 0; t < (K + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t)
    {
        // First column/row of A/B covered by this tile.
        int tiledK = t * BLOCK_SIZE;

        // Stage the A tile (row-major); zero-fill outside the matrix so the
        // inner product below needs no per-element bounds checks.
        int loadA_col = tiledK + threadIdx.x;
        if (row < M && loadA_col < K)
        {
            tileA[threadIdx.y][threadIdx.x] = A[row * K + loadA_col];
        }
        else
        {
            tileA[threadIdx.y][threadIdx.x] = T(0); // pad the ragged edge
        }

        // Stage the B tile; same zero-fill for the ragged edge.
        int loadB_row = tiledK + threadIdx.y;
        if (col < N && loadB_row < K)
        {
            tileB[threadIdx.y][threadIdx.x] = B[loadB_row * N + col];
        }
        else
        {
            tileB[threadIdx.y][threadIdx.x] = T(0); // pad the ragged edge
        }

        __syncthreads(); // both tiles fully staged before any thread reads

        // Partial dot product contributed by this tile.
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        __syncthreads(); // all reads done before tiles are overwritten
    }

    // Only threads inside the valid range of C write back.
    if (row < M && col < N)
    {
        C[row * N + col] = sum;
    }
}

// Explicit instantiations so the kernel definitions live in this TU.
template __global__ void matmul_kernel<double>(double *C, const double *A, const double *B,
                                               int M, int N, int K);
template __global__ void matmul_kernel<float>(float *C, const float *A, const float *B,
                                              int M, int N, int K);
template __global__ void matmul_kernel<half>(half *C, const half *A, const half *B,
                                             int M, int N, int K);
template __global__ void matmul_kernel<nv_bfloat16>(nv_bfloat16 *C, const nv_bfloat16 *A, const nv_bfloat16 *B,
                                                    int M, int N, int K);
template __global__ void matmul_kernel<int64_t>(int64_t *C, const int64_t *A, const int64_t *B,
                                                int M, int N, int K);
template __global__ void matmul_kernel<int32_t>(int32_t *C, const int32_t *A, const int32_t *B,
                                                int M, int N, int K);
template __global__ void matmul_kernel<int16_t>(int16_t *C, const int16_t *A, const int16_t *B,
                                                int M, int N, int K);
template __global__ void matmul_kernel<int8_t>(int8_t *C, const int8_t *A, const int8_t *B,
                                               int M, int N, int K);
92+
// Host-side launcher: derives the 2D launch configuration for
// matmul_kernel and enqueues it on the default stream (asynchronous;
// the caller is responsible for any needed synchronization).
// d_A (M x K), d_B (K x N), d_C (M x N) must be device pointers.
template <typename T>
void launch_matmul(T *d_C, const T *d_A, const T *d_B,
                   int M, int N, int K)
{
    const dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    // Ceiling division so the grid also covers the ragged right/bottom edges.
    const dim3 blocksPerGrid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
                             (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

    matmul_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, M, N, K);
}

// Explicit instantiations matching the extern declarations in matmul_qwang.cuh.
template void launch_matmul<double>(double *d_C, const double *d_A, const double *d_B,
                                    int M, int N, int K);
template void launch_matmul<float>(float *d_C, const float *d_A, const float *d_B,
                                   int M, int N, int K);
template void launch_matmul<half>(half *d_C, const half *d_A, const half *d_B,
                                  int M, int N, int K);
template void launch_matmul<nv_bfloat16>(nv_bfloat16 *d_C, const nv_bfloat16 *d_A, const nv_bfloat16 *d_B,
                                         int M, int N, int K);
template void launch_matmul<int64_t>(int64_t *d_C, const int64_t *d_A, const int64_t *d_B,
                                     int M, int N, int K);
template void launch_matmul<int32_t>(int32_t *d_C, const int32_t *d_A, const int32_t *d_B,
                                     int M, int N, int K);
template void launch_matmul<int16_t>(int16_t *d_C, const int16_t *d_A, const int16_t *d_B,
                                     int M, int N, int K);
template void launch_matmul<int8_t>(int8_t *d_C, const int8_t *d_A, const int8_t *d_B,
                                    int M, int N, int K);
119+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#ifndef DEEPX_TENSORFUNC_MATMUL_QWANG_CUH
#define DEEPX_TENSORFUNC_MATMUL_QWANG_CUH

#include "deepx/tensorfunc/cuda.hpp"
#include "deepx/tensorfunc/matmul.hpp"

namespace deepx::tensorfunc
{
    // Tile width shared by matmul_kernel and launch_matmul.
    // Fixed: guarded with #ifndef — this public header is also included by
    // non-CUDA code (matmul_qwang.hpp), and an unconditional, unprefixed
    // #define would collide with any other BLOCK_SIZE definition in an
    // includer's translation unit.
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

    // C = A * B with A (M x K), B (K x N), C (M x N), row-major, on device.
    // Launch contract: blockDim = (BLOCK_SIZE, BLOCK_SIZE),
    // gridDim = (ceil(N / BLOCK_SIZE), ceil(M / BLOCK_SIZE)).
    template <typename T>
    __global__ void matmul_kernel(T *C, const T *A, const T *B,
                                  int M, int N, int K);

    // Host launcher: computes the grid/block configuration and enqueues
    // matmul_kernel on the default stream (asynchronous).
    template <typename T>
    void launch_matmul(T *d_C, const T *d_A, const T *d_B,
                       int M, int N, int K);

    // Instantiated in matmul.cu; declared extern to prevent implicit
    // re-instantiation in every includer.
    extern template void launch_matmul<double>(double *d_C, const double *d_A, const double *d_B,
                                               int M, int N, int K);
    extern template void launch_matmul<float>(float *d_C, const float *d_A, const float *d_B,
                                              int M, int N, int K);
    extern template void launch_matmul<half>(half *d_C, const half *d_A, const half *d_B,
                                             int M, int N, int K);
    extern template void launch_matmul<nv_bfloat16>(nv_bfloat16 *d_C, const nv_bfloat16 *d_A, const nv_bfloat16 *d_B,
                                                    int M, int N, int K);
    extern template void launch_matmul<int64_t>(int64_t *d_C, const int64_t *d_A, const int64_t *d_B,
                                                int M, int N, int K);
    extern template void launch_matmul<int32_t>(int32_t *d_C, const int32_t *d_A, const int32_t *d_B,
                                                int M, int N, int K);
    extern template void launch_matmul<int16_t>(int16_t *d_C, const int16_t *d_A, const int16_t *d_B,
                                                int M, int N, int K);
    extern template void launch_matmul<int8_t>(int8_t *d_C, const int8_t *d_A, const int8_t *d_B,
                                               int M, int N, int K);
}
#endif
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#ifndef DEEPX_TENSORFUNC_MATMUL_QWANG_HPP
#define DEEPX_TENSORFUNC_MATMUL_QWANG_HPP

#include "deepx/tensorfunc/cuda.hpp"
#include "deepx/tensor.hpp"
#include "deepx/tensorfunc/matmul.hpp"
#include "deepx/tensorfunc/authors.hpp"
#include "deepx/tensorfunc/matmul_qwang.cuh"

namespace deepx::tensorfunc
{
    using namespace deepx;

    // Dispatcher specialization for the hand-written tiled CUDA matmul
    // (Author = qwang). Computes C = A * B for row-major 2D operands.
    template <typename T>
    struct matmulDispatcher<qwang, T>
    {
        static void matmul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
        {
            // matmul_kernel expects (M, N, K) with A (M x K), B (K x N),
            // C (M x N): M = rows of A, N = cols of B, K = cols of A.
            // BUG FIX: the previous call passed A.shape[-1] as N and
            // B.shape[-1] as K — N and K swapped — which only produced
            // correct results for square operands.
            // NOTE(review): assumes shape[] supports negative
            // (from-the-end) indexing and that .data points to device
            // memory — confirm against the Tensor implementation.
            launch_matmul(C.data, A.data, B.data,
                          A.shape[-2], B.shape[-1], A.shape[-1]);
        }
    };
}
#endif

excuter/op-mem-cuda/test/tensorfunc/1_cublas_matmul.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
#include "deepx/tensorfunc/print_miaobyte.hpp"
55
#include "deepx/tensorfunc/matmul.hpp"
66
#include "deepx/tensorfunc/matmul_cublas.hpp"
7-
7+
#include "deepx/tensorfunc/matmul_qwang.hpp"
88
using namespace deepx::tensorfunc;
99
using namespace deepx;
1010

11+
template <typename Author>
1112
void test_matmul()
1213
{
1314
// 创建矩阵 A (2x3)
@@ -30,7 +31,7 @@ void test_matmul()
3031
print<miaobyte>(b, "%.2f");
3132

3233
// 执行矩阵乘法 C = A × B
33-
matmul<deepx::tensorfunc::cublas,float>(a, b, c);
34+
matmul<Author,float>(a, b, c);
3435

3536
// 打印结果
3637
print<miaobyte>(c, "%.2f");
@@ -69,7 +70,11 @@ int main(int argc, char **argv)
6970
}
7071
switch (casei) {
7172
case 0:
72-
test_matmul();
73+
74+
printf("test qwang matmul\n");
75+
test_matmul<deepx::tensorfunc::qwang>();
76+
printf("test cublas matmul\n");
77+
test_matmul<deepx::tensorfunc::cublas>();
7378
break;
7479
case 1:
7580
test_matmul_batch();

excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ add_executable(1_cublas_add 1_cublas_add.cpp)
55
target_link_libraries(1_cublas_add deepx CUDA::cudart)
66

77
add_executable(1_cublas_matmul 1_cublas_matmul.cpp)
8-
target_link_libraries(1_cublas_matmul deepx CUDA::cudart)
8+
target_link_libraries(1_cublas_matmul deepx CUDA::cudart CUDA::cublas)

0 commit comments

Comments
 (0)