Skip to content

Commit 99596b0

Browse files
committed
Verified the CUDA shim API; the proof-of-concept for the CUDA shim is ready
1 parent f2d06f6 commit 99596b0

12 files changed

Lines changed: 1266 additions & 47 deletions

File tree

mlir/cuda-tile/.devcontainer/devcontainer.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@
6868
"llvm-vs-code-extensions.vscode-clangd",
6969
"llvm-vs-code-extensions.lldb-dap",
7070
"mutantdino.resourcemonitor",
71-
"hoovercj.vscode-power-mode"
71+
"hoovercj.vscode-power-mode",
72+
"GitHub.copilot-chat",
73+
"Codereviewforgithubcopilot.github-copilot-code-review"
7274
]
7375
}
7476
}

mlir/cuda-tile/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*.ptx
2+
*.cubin
3+
*.fatbin

mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,17 @@
1212
//
1313
//===----------------------------------------------------------------------===//
1414

15+
#include <cstdlib>
1516
#include <cuda.h>
1617
#include <cuda_runtime_api.h>
1718
#include <stdio.h>
1819
#include <stdlib.h>
20+
#include <sys/types.h>
1921

2022
#include "cuda.h"
2123
#include "cuda_bf16.h"
2224
#include "cuda_fp16.h"
25+
#include <vector>
2326

2427
// We assume the program runs on the linux platform if not on Windows.
2528
// Copy from
@@ -246,6 +249,8 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
246249
defaultDevice = device;
247250
}
248251

252+
// ===----------------------------------------------------------------------===//
253+
249254
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCtxSynchronize() {
250255
ScopedContext scopedContext;
251256
CUDA_REPORT_IF_ERROR(cuCtxSynchronize());
@@ -263,4 +268,261 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyDtoH(void *dst, void *src,
263268
cuMemcpyDtoH(dst, reinterpret_cast<CUdeviceptr>(src), sizeBytes));
264269
}
265270

271+
//===----------------------------------------------------------------------===//
272+
273+
static inline CUdeviceptr asDevPtr(uint64_t h) {
274+
return static_cast<CUdeviceptr>(h);
275+
}
276+
static inline uint64_t asHandle(CUdeviceptr p) {
277+
return static_cast<uint64_t>(p);
278+
}
279+
280+
static inline CUstream asStream(uint64_t h) {
281+
return reinterpret_cast<CUstream>(static_cast<uintptr_t>(h));
282+
}
283+
static inline uint64_t asStreamHandle(CUstream s) {
284+
return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(s));
285+
}
286+
287+
static inline CUevent asEvent(uint64_t h) {
288+
return reinterpret_cast<CUevent>(static_cast<uintptr_t>(h));
289+
}
290+
static inline uint64_t asEventHandle(CUevent e) {
291+
return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(e));
292+
}
293+
294+
// Reinterpret a 64-bit handle as a host pointer (mutable and const views).
// Rounding through uintptr_t keeps the conversion well-defined on both
// LP64 and LLP64 platforms.
static inline void *asHostPtr(uint64_t h) {
  const uintptr_t bits = static_cast<uintptr_t>(h);
  return reinterpret_cast<void *>(bits);
}
static inline const void *asHostCPtr(uint64_t h) {
  const uintptr_t bits = static_cast<uintptr_t>(h);
  return reinterpret_cast<const void *>(bits);
}
300+
301+
// Round `x` up to the next multiple of `a`.
// Precondition: `a` is a nonzero power of two (the mask trick below is only
// correct in that case; all callers in this file pass 8).
static inline uint64_t alignUp(uint64_t x, uint64_t a) {
  const uint64_t mask = a - 1;
  return (x + mask) & ~mask;
}
305+
306+
// Load module from PTX or CUBIN image in memory.
307+
// Driver API supports cuModuleLoadDataEx for both PTX and cubin (it
308+
// auto-detects).
309+
extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr,
310+
uint64_t image_nbytes) {
311+
312+
(void)image_nbytes;
313+
auto data = const_cast<void *>(asHostCPtr(image_ptr));
314+
CUmodule mod = mgpuModuleLoad(data, image_nbytes);
315+
return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(mod));
316+
}
317+
318+
// JIT-load a module image at the given optimization level.
// image_nbytes is accepted for ABI symmetry with the non-JIT loader but is
// not consumed here (mgpuModuleLoadJIT takes only the image and opt level).
extern "C" uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr,
                                                         uint64_t image_nbytes,
                                                         int opt_level) {
  (void)image_nbytes; // intentionally unused on the JIT path
  void *image = const_cast<void *>(asHostCPtr(image_ptr));
  CUmodule jitModule = mgpuModuleLoadJIT(image, opt_level);
  return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(jitModule));
}
327+
328+
// Load a module (PTX/cubin/fatbin) from a file on disk.
// The second argument (path length) is ignored: cuModuleLoad expects a
// NUL-terminated path string.
extern "C" uint64_t
cuda_shim_load_module_from_file(uint64_t file_path_ptr,
                                uint64_t /*file_path_nbytes*/) {
  const char *path = reinterpret_cast<const char *>(asHostCPtr(file_path_ptr));
  CUmodule module = nullptr;
  ScopedContext scopedContext; // ensure a current CUDA context for the driver call
  CUDA_REPORT_IF_ERROR(cuModuleLoad(&module, path));
  return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(module));
}
339+
340+
// Release a module previously returned by one of the load entry points above.
extern "C" void cuda_shim_unload_module(uint64_t module_handle) {
  auto module =
      reinterpret_cast<CUmodule>(static_cast<uintptr_t>(module_handle));
  mgpuModuleUnload(module);
}
345+
346+
// Allocate device (or host-shared) memory via mgpuMemAlloc and return the
// pointer as an opaque uint64_t. A stream handle of 0 selects the null
// (legacy default) stream.
extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream,
                                     bool is_host_shared) {
  CUstream cuStream = (stream == 0) ? nullptr : asStream(stream);
  void *devPtr = mgpuMemAlloc(nbytes, /*stream=*/cuStream,
                              /*isHostShared=*/is_host_shared);
  return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(devPtr));
}
355+
356+
// Free memory allocated by cuda_shim_malloc. A stream handle of 0 selects
// the null (legacy default) stream.
extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream) {
  void *ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(dptr));
  CUstream cuStream = (stream == 0) ? nullptr : asStream(stream);
  mgpuMemFree(ptr, /*stream=*/cuStream);
}
364+
365+
// Fill `count_dwords` 32-bit words at device address `dptr` with `value`,
// queued on the given stream (0 → null stream via asStream).
extern "C" void cuda_shim_memset32(uint64_t dptr, uint32_t value,
                                   uint64_t count_dwords, uint64_t stream) {
  auto *dst = reinterpret_cast<void *>(static_cast<uintptr_t>(dptr));
  mgpuMemset32(dst, value, count_dwords, asStream(stream));
}
371+
372+
// Fill a run of 16-bit elements at device address `dptr` with `value`,
// queued on the given stream (0 → null stream via asStream).
// NOTE(review): despite its name, `count_dwords` is forwarded to
// mgpuMemset16 and so presumably counts 16-bit elements, not 32-bit dwords —
// likely a copy/paste from cuda_shim_memset32; confirm with callers before
// renaming. Likewise only the low 16 bits of `value` can matter for a 16-bit
// fill — verify mgpuMemset16 truncates as expected.
extern "C" void cuda_shim_memset16(uint64_t dptr, uint32_t value,
                                   uint64_t count_dwords, uint64_t stream) {
  void *ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(dptr));
  CUstream cu_stream = asStream(stream);
  mgpuMemset16(ptr, value, count_dwords, cu_stream);
}
378+
379+
// Create a new CUDA stream and return it as an opaque uint64_t handle.
extern "C" uint64_t cuda_shim_stream_create(void) {
  return asStreamHandle(mgpuStreamCreate());
}
383+
384+
// Destroy a stream created by cuda_shim_stream_create.
extern "C" void cuda_shim_stream_destroy(uint64_t stream) {
  mgpuStreamDestroy(asStream(stream));
}
388+
389+
// Block the host until all work queued on `stream` has completed.
extern "C" void cuda_shim_stream_synchronize(uint64_t stream) {
  mgpuStreamSynchronize(asStream(stream));
}
393+
394+
// Create a new CUDA event and return it as an opaque uint64_t handle.
extern "C" uint64_t cuda_shim_event_create(void) {
  return asEventHandle(mgpuEventCreate());
}
398+
399+
// Destroy an event created by cuda_shim_event_create.
extern "C" void cuda_shim_event_destroy(uint64_t ev) {
  mgpuEventDestroy(asEvent(ev));
}
403+
404+
// Record event `ev` on `stream` (a handle of 0 maps to the null stream
// through asStream).
extern "C" void cuda_shim_event_record(uint64_t ev, uint64_t stream) {
  mgpuEventRecord(asEvent(ev), asStream(stream));
}
409+
410+
// Block the host until event `ev` has been reached on its stream.
extern "C" void cuda_shim_event_synchronize(uint64_t ev) {
  mgpuEventSynchronize(asEvent(ev));
}
414+
415+
// Make all future work on `stream` wait until event `ev` has completed.
extern "C" void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev) {
  mgpuStreamWaitEvent(asStream(stream), asEvent(ev));
}
420+
421+
// ----------------------------- Memcpy (raw ABI) --------------------------
422+
// Host and device addresses are both passed as uint64_t so the MLIR side
// never has to materialize pointer types (design option "2A" of the shim ABI).
423+
424+
// Synchronous host→device copy of `nbytes` bytes. dst_dptr is a device
// address and src_hptr a host address, both carried as raw uint64_t.
extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr,
                                     uint64_t nbytes) {
  ScopedContext scopedContext; // ensure a current CUDA context
  mgpuMemcpyHtoD(asHostPtr(dst_dptr), asHostPtr(src_hptr),
                 static_cast<size_t>(nbytes));
}
431+
432+
// Synchronous device→host copy of `nbytes` bytes. dst_hptr is a host
// address and src_dptr a device address, both carried as raw uint64_t.
extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr,
                                     uint64_t nbytes) {
  ScopedContext scopedContext; // ensure a current CUDA context
  mgpuMemcpyDtoH(asHostPtr(dst_hptr), asHostPtr(src_dptr),
                 static_cast<size_t>(nbytes));
}
439+
440+
// ----------------------------- Kernel launch -----------------------------
441+
// The hardest part is kernelParams (void**).
442+
// We avoid building it in MLIR. Instead MLIR passes:
443+
// - arg_data_ptr: host pointer to a packed buffer containing raw argument bytes
444+
// - arg_sizes_ptr: host pointer to uint64_t[num_args], each is the byte-size of
445+
// that argument The shim constructs kernelParams[i] = &arg_data[offset_i] with
446+
// 8-byte alignment. This matches typical ABI expectations for scalar/pointer
447+
// args. If you have special alignment requirements, extend this (e.g., per-arg
448+
// alignment array).
449+
450+
extern "C" void cuda_shim_launch_packed(
451+
uint64_t module_handle, uint64_t kernel_name_ptr, uint32_t gridX,
452+
uint32_t gridY, uint32_t gridZ, uint32_t blockX, uint32_t blockY,
453+
uint32_t blockZ, uint32_t sharedMemBytes, uint64_t stream,
454+
uint64_t arg_data_ptr, uint64_t arg_sizes_ptr, uint32_t num_args) {
455+
456+
auto mh = reinterpret_cast<CUmodule>(static_cast<uintptr_t>(module_handle));
457+
if (!mh) {
458+
fprintf(stderr, "[cuda_shim] launch_packed: invalid module handle\n");
459+
abort();
460+
}
461+
462+
const char *kname =
463+
reinterpret_cast<const char *>(asHostCPtr(kernel_name_ptr));
464+
if (!kname) {
465+
fprintf(stderr, "[cuda_shim] launch_packed: null kernel name\n");
466+
abort();
467+
}
468+
469+
CUfunction fn = mgpuModuleGetFunction(mh, kname);
470+
471+
auto *argData = reinterpret_cast<uint8_t *>(asHostPtr(arg_data_ptr));
472+
auto *argSizes =
473+
reinterpret_cast<const uint64_t *>(asHostCPtr(arg_sizes_ptr));
474+
475+
if (num_args > 0 && (!argData || !argSizes)) {
476+
fprintf(stderr, "[cuda_shim] launch_packed: argData/argSizes null\n");
477+
abort();
478+
}
479+
480+
// Build kernelParams array on heap (safe for large num_args).
481+
std::vector<void *> params;
482+
params.resize(num_args);
483+
484+
uint64_t off = 0;
485+
for (uint32_t i = 0; i < num_args; ++i) {
486+
// 8-byte align each argument start (common safe default).
487+
off = alignUp(off, 8);
488+
params[i] = argData + off;
489+
off += argSizes[i];
490+
}
491+
492+
auto cu_stream = asStream(stream);
493+
494+
if (stream == 0) {
495+
cu_stream = nullptr;
496+
}
497+
498+
mgpuLaunchKernel(fn, static_cast<intptr_t>(gridX),
499+
static_cast<intptr_t>(gridY), static_cast<intptr_t>(gridZ),
500+
static_cast<intptr_t>(blockX), static_cast<intptr_t>(blockY),
501+
static_cast<intptr_t>(blockZ),
502+
static_cast<int32_t>(sharedMemBytes), cu_stream,
503+
params.data(), nullptr, static_cast<size_t>(num_args));
504+
}
505+
506+
// Convenience: 1D launch, shared=0, stream optional
507+
extern "C" void
508+
cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
509+
uint32_t blockX, uint32_t blockY, uint32_t blockZ,
510+
uint64_t stream, uint64_t arg_data_ptr,
511+
uint64_t arg_sizes_ptr, uint32_t num_args) {
512+
cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX,
513+
blockY, blockZ, 0, stream, arg_data_ptr,
514+
arg_sizes_ptr, num_args);
515+
}
516+
517+
// Optional: global sync (avoid in async pipeline; prefer event/stream sync)
518+
extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); }
519+
520+
// Only for debugging. NOTE(review): as written, the commented-out dump below
// dereferences a device pointer on the host, which is invalid unless the
// allocation is host-accessible (unified/host-shared memory); it also
// compares signed `n` against unsigned `i`.
// extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) {
522+
// auto *p = reinterpret_cast<const float *>(static_cast<uintptr_t>(dptr));
523+
// for (uint32_t i = 0; i < n; ++i) {
524+
// fprintf(stderr, "i=%u v=%f\n", i, p[i]);
525+
// }
526+
// }
527+
266528
#endif

mlir/cuda-tile/Toy/toyc.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,10 @@ static int loadAndProcessMLIRGPU(mlir::MLIRContext &context,
342342
if (isLoweringToAffine) {
343343
pm.addPass(mlir::toy::createEmbedCudaTileBinaryPass(
344344
"/usr/local/cuda/bin/tileiras", "sm_120"));
345-
// // Partially lower the toy dialect.
346-
// optPM.addPass(mlir::toy::createLowerToAffinePass());
345+
346+
// mlir::OpPassManager &gpuOptPM = pm.nest<mlir::toy::FuncOp>();
347+
// // Partially lower the toy dialect.
348+
// pm.addPass(mlir::toy::createLowerToAffinePass());
347349

348350
// // Add a few cleanups post lowering.
349351
// mlir::OpPassManager &optPM = pm.nest<mlir::func::FuncOp>();

0 commit comments

Comments
 (0)