Skip to content

Commit b2e34d0

Browse files
committed
Move the helper functions into the CUDA shim builder
1 parent 96a122c commit b2e34d0

File tree

3 files changed

+103
-84
lines changed

3 files changed

+103
-84
lines changed

mlir/cuda-tile/Toy/include/cuda_shim/CudaShimBuilder.hpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,22 @@
66
#include "mlir/IR/Value.h"
77
#include "llvm/ADT/DenseMap.h"
88

9+
#include "mlir/Dialect/Arith/IR/Arith.h"
10+
#include "mlir/Dialect/Func/IR/FuncOps.h"
11+
#include "mlir/Dialect/MemRef/IR/MemRef.h"
12+
#include "mlir/IR/Builders.h"
13+
#include "mlir/IR/BuiltinAttributes.h"
14+
#include "mlir/IR/BuiltinOps.h"
15+
#include "mlir/IR/BuiltinTypes.h"
16+
#include "mlir/IR/Diagnostics.h"
17+
#include "mlir/IR/Operation.h"
18+
#include "mlir/IR/PatternMatch.h"
19+
#include "mlir/IR/Types.h"
20+
#include "mlir/IR/Value.h"
21+
#include "mlir/IR/ValueRange.h"
22+
#include "mlir/Support/LLVM.h"
23+
#include "mlir/Transforms/DialectConversion.h"
24+
925
enum class CudaShimFn {
1026
// ----- Module -----
1127
LoadModuleFromImage,
@@ -199,3 +215,84 @@ class CudaShimRegistry {
199215
mlir::ModuleOp module;
200216
llvm::DenseMap<unsigned, mlir::func::FuncOp> cache;
201217
};
218+
219+
/// Creates (or reuses) a module-level private constant memref global holding
/// the bytes of `attr` as a NUL-terminated i8 array.
///
/// If a global named `sym_name` already exists in the enclosing module it is
/// returned unchanged; otherwise a new one is inserted at the start of the
/// module body. The rewriter's insertion point is restored before returning.
///
/// \param rewriter  pattern rewriter used to create the ops.
/// \param op        any op inside the module that should own the global.
/// \param sym_name  symbol name for the global (deduplication key).
/// \param attr      string contents to materialize.
/// \return the existing or newly created memref::GlobalOp.
inline mlir::memref::GlobalOp
createGlobalForStringAttr(mlir::PatternRewriter &rewriter, mlir::Operation *op,
                          llvm::StringRef sym_name, mlir::StringAttr attr) {
  auto loc = op->getLoc();
  auto moduleOp = op->getParentOfType<mlir::ModuleOp>();

  // Reuse an existing global with the same symbol name, if any.
  if (auto global = moduleOp.lookupSymbol<mlir::memref::GlobalOp>(sym_name);
      global) {
    return global;
  }

  // Globals live at module scope; the guard restores the caller's insertion
  // point when this function returns.
  mlir::OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointToStart(moduleOp.getBody());

  // Copy the string contents and append a trailing NUL byte.
  auto str = attr.getValue();
  std::vector<uint8_t> bytes(str.begin(), str.end());
  bytes.push_back(0);

  const auto size = static_cast<int64_t>(bytes.size());

  // DenseElementsAttr requires a shaped (tensor) type; the global itself is
  // typed as a memref of the same shape.
  auto type = mlir::RankedTensorType::get({size}, rewriter.getIntegerType(8));
  auto memrefType = mlir::MemRefType::get({size}, rewriter.getIntegerType(8));

  auto denseAttr =
      mlir::DenseElementsAttr::get(type, llvm::ArrayRef<uint8_t>(bytes));

  auto global = mlir::memref::GlobalOp::create(
      rewriter, loc, sym_name,
      /*sym_visibility=*/rewriter.getStringAttr("private"), memrefType,
      denseAttr,
      /*constant=*/true,
      /*alignment=*/nullptr);

  return global;
}
255+
256+
/// Extracts the aligned base pointer of memref `value` as an `index` and
/// casts it to i64, returning the index_cast op.
inline mlir::arith::IndexCastOp
getIndexFromValue(mlir::PatternRewriter &rewriter, mlir::Location loc,
                  mlir::Value value) {
  auto ptrAsIndex = mlir::memref::ExtractAlignedPointerAsIndexOp::create(
      rewriter, loc, rewriter.getIndexType(), value);
  return mlir::arith::IndexCastOp::create(rewriter, loc, rewriter.getI64Type(),
                                          ptrAsIndex.getResult());
}
265+
266+
/// Materializes `global` as an SSA value via memref.get_global, then returns
/// its aligned base pointer cast to i64 (see getIndexFromValue).
inline mlir::arith::IndexCastOp
getIndexFromGlobalMemref(mlir::PatternRewriter &rewriter, mlir::Location loc,
                         mlir::memref::GlobalOp global) {
  auto globalRef = mlir::memref::GetGlobalOp::create(
      rewriter, loc, global.getType(), global.getName());
  return getIndexFromValue(rewriter, loc, globalRef.getResult());
}
275+
276+
/// Emits a call to the CUDA shim's Malloc entry point.
///
/// \param rewriter      pattern rewriter used to create the ops.
/// \param loc           location for the created constant.
/// \param registry      shim registry used to resolve/emit the call.
/// \param stream        call op whose first result is the stream handle.
/// \param nbytesVal     constant holding the allocation size in bytes.
/// \param isHostShared  whether the allocation is host-shared.
/// \return the emitted func::CallOp for the Malloc shim.
inline mlir::func::CallOp createCallToCudaShimMalloc(
    mlir::PatternRewriter &rewriter, mlir::Location loc,
    CudaShimRegistry &registry, mlir::func::CallOp stream,
    mlir::arith::ConstantIntOp nbytesVal, bool isHostShared) {
  // Lower the host-shared flag to an i1 constant (branches collapsed).
  auto isHostSharedVal = mlir::arith::ConstantIntOp::create(
      rewriter, loc, isHostShared ? 1 : 0, 1);
  // Fixed typo: was `sreamVal`.
  auto streamVal = stream.getResult(0);
  return registry.call(rewriter, stream, CudaShimFn::Malloc,
                       mlir::ValueRange{nbytesVal, streamVal, isHostSharedVal});
}
292+
293+
/// Returns the size in bytes of the buffer described by `tensorType`.
///
/// NOTE(review): despite the parameter name, the type is cast to
/// mlir::MemRefType — llvm::cast asserts if anything else is passed.
inline unsigned long getNbytes(mlir::Type tensorType) {
  auto memrefType = llvm::cast<mlir::MemRefType>(tensorType);
  // Round up so sub-byte element widths (e.g. i1) still occupy whole bytes.
  return llvm::divideCeil(memrefType.getNumElements() *
                              memrefType.getElementTypeBitWidth(),
                          8);
}

mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -397,84 +397,6 @@ struct MatMulOpLowering : public ConversionPattern {
397397
}
398398
};
399399

400-
memref::GlobalOp createGlobalForStringAttr(mlir::PatternRewriter &rewriter,
401-
Operation *op,
402-
llvm::StringRef sym_name,
403-
StringAttr attr) {
404-
auto loc = op->getLoc();
405-
auto moduleOp = op->getParentOfType<ModuleOp>();
406-
407-
if (auto global = moduleOp.lookupSymbol<memref::GlobalOp>(sym_name); global) {
408-
return global;
409-
}
410-
411-
OpBuilder::InsertionGuard guard(rewriter);
412-
rewriter.setInsertionPointToStart(moduleOp.getBody());
413-
414-
auto str = attr.getValue();
415-
std::vector<uint8_t> bytes(str.begin(), str.end());
416-
bytes.push_back(0);
417-
418-
auto type = RankedTensorType::get({(int64_t)bytes.size()},
419-
rewriter.getIntegerType(8));
420-
421-
auto memrefType =
422-
MemRefType::get({(int64_t)bytes.size()}, rewriter.getIntegerType(8));
423-
424-
auto denseAttr = DenseElementsAttr::get(type, llvm::ArrayRef<uint8_t>(bytes));
425-
426-
auto global = memref::GlobalOp::create(
427-
rewriter, loc, sym_name,
428-
/*sym_visibility=*/rewriter.getStringAttr("private"), memrefType,
429-
denseAttr,
430-
/*constant=*/true,
431-
/*alignment=*/nullptr);
432-
433-
return global;
434-
}
435-
436-
arith::IndexCastOp getIndexFromValue(mlir::PatternRewriter &rewriter,
437-
Location loc, Value value) {
438-
auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
439-
rewriter, loc, rewriter.getIndexType(), value);
440-
auto indexCastOp = arith::IndexCastOp::create(
441-
rewriter, loc, rewriter.getI64Type(), extractOp.getResult());
442-
return indexCastOp;
443-
}
444-
445-
arith::IndexCastOp getIndexFromGlobalMemref(mlir::PatternRewriter &rewriter,
446-
Location loc,
447-
memref::GlobalOp global) {
448-
449-
auto getGlobalOp = memref::GetGlobalOp::create(
450-
rewriter, loc, global.getType(), global.getName());
451-
452-
return getIndexFromValue(rewriter, loc, getGlobalOp.getResult());
453-
}
454-
455-
func::CallOp
456-
createCallToCudaShimMalloc(mlir::PatternRewriter &rewriter, Location loc,
457-
CudaShimRegistry &registry, func::CallOp stream,
458-
arith::ConstantIntOp nbytesVal, bool isHostShared) {
459-
arith::ConstantIntOp isHostSharedVal;
460-
if (isHostShared) {
461-
isHostSharedVal = arith::ConstantIntOp::create(rewriter, loc, 1, 1);
462-
} else {
463-
isHostSharedVal = arith::ConstantIntOp::create(rewriter, loc, 0, 1);
464-
}
465-
auto sreamVal = stream.getResult(0);
466-
auto callee = registry.call(rewriter, stream, CudaShimFn::Malloc,
467-
ValueRange{nbytesVal, sreamVal, isHostSharedVal});
468-
return callee;
469-
}
470-
471-
unsigned long getNbytes(Type tensorType) {
472-
auto ranked_tensor_type = llvm::cast<MemRefType>(tensorType);
473-
return llvm::divideCeil(ranked_tensor_type.getNumElements() *
474-
ranked_tensor_type.getElementTypeBitWidth(),
475-
8);
476-
}
477-
478400
struct LanchGpuLowering : public OpConversionPattern<toy::LaunchGpuOp> {
479401
using OpConversionPattern<toy::LaunchGpuOp>::OpConversionPattern;
480402

mlir/cuda-tile/sample/test.mlir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,15 +127,15 @@ module {
127127
memref.store %12, %alloc_35[%c3_52] : memref<4xi64>
128128
%c8_i64_53 = arith.constant 8 : i64
129129
memref.store %c8_i64_53, %alloc_36[%c3_52] : memref<4xi64>
130+
%c8_i32 = arith.constant 8 : i32
130131
%c1_i32 = arith.constant 1 : i32
131132
%c1_i32_54 = arith.constant 1 : i32
132-
%c1_i32_55 = arith.constant 1 : i32
133133
%c4_i32 = arith.constant 4 : i32
134-
%intptr_56 = memref.extract_aligned_pointer_as_index %alloc_35 : memref<4xi64> -> index
135-
%14 = arith.index_cast %intptr_56 : index to i64
136-
%intptr_57 = memref.extract_aligned_pointer_as_index %alloc_36 : memref<4xi64> -> index
137-
%15 = arith.index_cast %intptr_57 : index to i64
138-
call @cuda_shim_launch_block_packed(%4, %3, %c1_i32, %c1_i32_54, %c1_i32_55, %5, %14, %15, %c4_i32) : (i64, i64, i32, i32, i32, i64, i64, i64, i32) -> ()
134+
%intptr_55 = memref.extract_aligned_pointer_as_index %alloc_35 : memref<4xi64> -> index
135+
%14 = arith.index_cast %intptr_55 : index to i64
136+
%intptr_56 = memref.extract_aligned_pointer_as_index %alloc_36 : memref<4xi64> -> index
137+
%15 = arith.index_cast %intptr_56 : index to i64
138+
call @cuda_shim_launch_block_packed(%4, %3, %c8_i32, %c1_i32, %c1_i32_54, %5, %14, %15, %c4_i32) : (i64, i64, i32, i32, i32, i64, i64, i64, i32) -> ()
139139
call @cuda_shim_stream_synchronize(%5) : (i64) -> ()
140140
call @cuda_shim_memcpy_d2h(%13, %12, %c32_i64_49) : (i64, i64, i64) -> ()
141141
memref.dealloc %alloc_35 : memref<4xi64>

0 commit comments

Comments
 (0)