
Commit 80e2354

sync the progress (not finished)
1 parent 99596b0 commit 80e2354

4 files changed

Lines changed: 106 additions & 15 deletions

File tree:
  mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp
  mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp
  mlir/cuda-tile/Toy/toyc.cpp
  mlir/cuda-tile/sample/matmul.toy

mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp

Lines changed: 6 additions & 6 deletions
@@ -518,11 +518,11 @@ cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
 extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); }
 
 // only for debugging
-// extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) {
-//   auto *p = reinterpret_cast<const float *>(static_cast<uintptr_t>(dptr));
-//   for (uint32_t i = 0; i < n; ++i) {
-//     fprintf(stderr, "i=%u v=%f\n", i, p[i]);
-//   }
-// }
+extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) {
+  auto *p = reinterpret_cast<const float *>(static_cast<uintptr_t>(dptr));
+  for (uint32_t i = 0; i < n; ++i) {
+    fprintf(stderr, "i=%u v=%f\n", i, p[i]);
+  }
+}
 
 #endif
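Note: the commit re-enables the previously commented-out `cuda_debug_dump_float` helper. It reinterprets the 64-bit value as a host pointer and prints `n` floats, so it is only safe when the pointer refers to host-accessible memory (for example a host-shared allocation made through the shim). A minimal, hypothetical smoke test, assuming the shim library is linked in (the buffer and its contents below are made up for illustration):

// Hypothetical caller; `buf` is ordinary host memory, which satisfies the
// helper's assumption that the pointer can be dereferenced on the CPU.
#include <cstdint>

extern "C" void cuda_debug_dump_float(uint64_t dptr, int n);

int main() {
  float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  cuda_debug_dump_float(reinterpret_cast<uint64_t>(buf), 4); // prints i=0..3
  return 0;
}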

mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp

Lines changed: 93 additions & 3 deletions
@@ -12,19 +12,24 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "cuda_shim/CudaShimBuilder.hpp"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinDialect.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/ValueRange.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/TypeID.h"
 #include "toy/Dialect.h"
 #include "toy/Passes.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DebugLog.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -36,9 +41,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Casting.h"
-#include <algorithm>
+#include "llvm/Support/Debug.h"
 #include <cstdint>
 #include <functional>
 #include <memory>
@@ -390,6 +394,92 @@ struct MatMulOpLowering : public ConversionPattern {
   }
 };
 
+memref::GlobalOp createGlobalForStringAttr(mlir::PatternRewriter &rewriter,
+                                           Operation *op,
+                                           llvm::StringRef sym_name,
+                                           StringAttr attr) {
+  auto loc = op->getLoc();
+  auto moduleOp = op->getParentOfType<ModuleOp>();
+
+  if (auto global = moduleOp.lookupSymbol<memref::GlobalOp>(sym_name); global) {
+    return global;
+  }
+
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPointToStart(moduleOp.getBody());
+
+  auto str = attr.getValue();
+  std::vector<uint8_t> bytes(str.begin(), str.end());
+  bytes.push_back(0);
+
+  auto memrefType =
+      MemRefType::get({(int64_t)bytes.size()}, rewriter.getIntegerType(8));
+
+  auto denseAttr =
+      DenseElementsAttr::get(memrefType, llvm::ArrayRef<uint8_t>(bytes));
+
+  auto global = memref::GlobalOp::create(
+      rewriter, loc, sym_name,
+      /*sym_visibility=*/rewriter.getStringAttr("private"), memrefType,
+      denseAttr,
+      /*constant=*/true,
+      /*alignment=*/nullptr);
+
+  return global;
+}
+
+struct LanchGpuLowering : public ConversionPattern {
+  LanchGpuLowering(MLIRContext *ctx)
+      : ConversionPattern(toy::LaunchGpuOp::getOperationName(), 1, ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    auto loc = op->getLoc();
+    CudaShimRegistry registry(op->getParentOfType<ModuleOp>());
+
+    toy::LaunchGpuOp launchGpuOp = llvm::cast<toy::LaunchGpuOp>(op);
+    for (auto ranked_tensor_type : launchGpuOp->getOperands()) {
+      if (!llvm::isa<RankedTensorType>(ranked_tensor_type.getType())) {
+        return rewriter.notifyMatchFailure(op, "expected operand to be a "
+                                               "ranked tensor type");
+      }
+    }
+
+    auto cudaBinaryPathAttr =
+        launchGpuOp->getDiscardableAttr("cuda_binary_path");
+    if (!cudaBinaryPathAttr) {
+      return rewriter.notifyMatchFailure(
+          op, "expected 'cuda_binary_path' attribute to be present");
+    }
+
+    auto cudaBinaryPathStr = llvm::dyn_cast<StringAttr>(cudaBinaryPathAttr);
+    if (!cudaBinaryPathStr) {
+      return rewriter.notifyMatchFailure(
+          op, "expected 'cuda_binary_path' attribute to be a string");
+    }
+
+    auto cuda_blob_memref = createGlobalForStringAttr(
+        rewriter, launchGpuOp, "cuda_blob", cudaBinaryPathStr);
+
+    auto kernelName = launchGpuOp.getCallee();
+
+    auto kernel_name_memref = createGlobalForStringAttr(
+        rewriter, launchGpuOp, "kname", rewriter.getStringAttr(kernelName));
+
+    auto nbytesVal = arith::ConstantIndexOp::create(rewriter, loc, 1);
+    auto streamVal = arith::ConstantIndexOp::create(rewriter, loc, 0);
+    auto isHostSharedVal = arith::ConstantIntOp::create(rewriter, loc, 0, 1);
+
+    auto callee =
+        registry.call(rewriter, launchGpuOp, CudaShimFn::Malloc,
+                      ValueRange{nbytesVal, streamVal, isHostSharedVal});
+
+    rewriter.replaceOp(op, callee);
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -442,7 +532,7 @@ void ToyToAffineLoweringPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
   patterns.add<AddOpLowering, ConstantOpLowering, FuncOpLowering, MulOpLowering,
                PrintOpLowering, ReturnOpLowering, TransposeOpLowering,
-               MatMulOpLowering>(&getContext());
+               MatMulOpLowering, LanchGpuLowering>(&getContext());
 
   // With the target and rewrite patterns defined, we can now attempt the
   // conversion. The conversion will signal failure if any of our `illegal`
mlir/cuda-tile/Toy/toyc.cpp

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ static int loadAndProcessMLIRGPU(mlir::MLIRContext &context,
 
   // mlir::OpPassManager &gpuOptPM = pm.nest<mlir::toy::FuncOp>();
   // // Partially lower the toy dialect.
-  // pm.addPass(mlir::toy::createLowerToAffinePass());
+  pm.addPass(mlir::toy::createLowerToAffinePass());
 
   // // Add a few cleanups post lowering.
   // mlir::OpPassManager &optPM = pm.nest<mlir::func::FuncOp>();
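The surrounding commented-out lines hint at the standard post-lowering cleanups from the Toy tutorial pipeline. Re-enabling them would presumably look like the sketch below, using the stock MLIR canonicalizer and CSE passes; this is not part of the commit, only the conventional follow-up:

  // Cleanups after the partial lowering (hypothetical; not in this commit).
  mlir::OpPassManager &optPM = pm.nest<mlir::func::FuncOp>();
  optPM.addPass(mlir::createCanonicalizerPass()); // fold/clean up lowered ops
  optPM.addPass(mlir::createCSEPass());           // common-subexpression elimination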

mlir/cuda-tile/sample/matmul.toy

Lines changed: 6 additions & 5 deletions
@@ -1,16 +1,17 @@
 def main() {
   # Define a variable `a` with shape <2, 3>, initialized with the literal value.
   # The shape is inferred from the supplied literal.
-  var a = [[1, 2, 3], [4, 5, 6]];
+  var a = [[1, 2, 3, 9], [4, 5, 6, 10]];
 
   # b is identical to a, the literal tensor is implicitly reshaped: defining new
   # variables is the way to reshape tensors (element count must match).
-  var b<2, 3> = [11, 12, 13, 14, 15, 16];
+  var b<2, 4> = [11, 12, 13, 14, 15, 16, 17, 18];
 
   # transpose() and print() are the only builtin, the following will transpose
   # a and b and perform an element-wise multiplication before printing the result.
   # print(a * b + b);
-  print(matmul(a, transpose(b)));
-  var c<2, 3> = [[7, 8, 9], [10, 11, 12]];
-  print(a * c + b);
+  # print(matmul(a, transpose(b)));
+  var c<2, 4> = [[7, 8, 9, 13], [10, 11, 12, 14]];
+  var d<2, 4> = [[7, 8, 9, 13], [10, 11, 12, 14]];
+  print(a * c + b * d);
 }
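With these shapes every operand of the final expression is a <2, 4> tensor, so the element-wise `*` and `+` are well defined. Assuming Toy's usual element-wise semantics, the printed value of `a * c + b * d` works out to (shown as integers; Toy prints the elements as doubles):

  a * c       = [[7, 16, 27, 117], [40, 55, 72, 140]]
  b * d       = [[77, 96, 117, 182], [150, 176, 204, 252]]
  a*c + b*d   = [[84, 112, 144, 299], [190, 231, 276, 392]]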
