From fe8fc5f8f41a2a8aaf5b2589e6118705dc0aa5ae Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 4 Nov 2022 15:03:30 -0700 Subject: [PATCH 01/27] fix typos --- compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 2 +- .../iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 2 +- .../src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index e6dd8cd20809..221bcec8d1c3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -410,7 +410,7 @@ static LogicalResult setRootDefaultConfig(func::FuncOp entryPoint, } auto linalgOp = dyn_cast(op); - // Pick a vectorSize of 1 for op that we know won't get vectorizedd. + // Pick a vectorSize of 1 for op that we know won't get vectorized. // Also skip vectorization for linalg on memref (no result) as the pipeline // relies on tensor level tiling. // TODO(thomasraoux): This could be improved by checking if the linalg op diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 23f76a9779cc..2e2776778348 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -191,7 +191,7 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; - // Promote C matrix and propagate the potential fill producer into the temp + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. 
{ RewritePatternSet promotionPatterns(&getContext()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index ae0ef3e6edc4..6348ddf1d05a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -69,7 +69,7 @@ static void populateTilingPatterns(RewritePatternSet &patterns, LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction) { { - // Tile again at the workgroup level since redution dimension were + // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. RewritePatternSet wgTilingPatterns(funcOp.getContext()); From a2da08e031fb19ab33ad6a28aa9417a66af37c6e Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Mon, 7 Nov 2022 08:58:43 -0800 Subject: [PATCH 02/27] WIP: allow unaligned tensor sizes for tensor core --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 5 ++-- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 6 ++++ .../compiler/Codegen/LLVMGPU/Verifiers.cpp | 28 +++++++++---------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 221bcec8d1c3..fbd0cf41264f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,9 +227,8 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // tile size. 
for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - if (sizeK % config.tileSize[2] == 0 && - sizeN % config.tileSize[1] == 0 && - sizeM % config.tileSize[0] == 0) { + if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && + sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], config.workgroupSize, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8d05e0d179a8..8322ba5a36ec 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -199,6 +199,12 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm, tileAndBufferize(pm); auto &nestedModulePM = pm.nest(); + // Do specialization + nestedModulePM.addNestedPass( + createWorkgroupSpecializationPass()); + nestedModulePM.addPass(createCanonicalizerPass()); + nestedModulePM.addPass(createCSEPass()); + // Distribute linalg onto warps within the workgroup. 
nestedModulePM.addNestedPass( createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true)); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp index c04186c75ea6..8395a1adf329 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp @@ -188,20 +188,20 @@ LogicalResult verifyGPUMatmulTensorCorePipeline( // Verify the first level tile size divides the matmul // inputs A [M x K] & B [K x N] - if (lhsShape[0] % firstLevelTileSizes[0] != 0 || - lhsShape[1] % firstLevelTileSizes[2] != 0) { - return op->emitOpError( - "lhsShape doesn't factor into first level tile size for ") - << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1] - << "]"; - } - if (rhsShape[0] % firstLevelTileSizes[2] != 0 || - rhsShape[1] % firstLevelTileSizes[1] != 0) { - return op->emitOpError( - "rhsShape doesn't factor into first level tile size for ") - << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1] - << "]"; - } + // if (lhsShape[0] % firstLevelTileSizes[0] != 0 || + // lhsShape[1] % firstLevelTileSizes[2] != 0) { + // return op->emitOpError( + // "lhsShape doesn't factor into first level tile size for ") + // << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1] + // << "]"; + // } + // if (rhsShape[0] % firstLevelTileSizes[2] != 0 || + // rhsShape[1] % firstLevelTileSizes[1] != 0) { + // return op->emitOpError( + // "rhsShape doesn't factor into first level tile size for ") + // << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1] + // << "]"; + // } // Verify shared memory usage of operands after tiling requires <= 64Kb // combined space. 
From c2793ea0626b38f929e949f897391ad79e33c3ff Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Tue, 8 Nov 2022 18:37:37 -0800 Subject: [PATCH 03/27] filter out ops with unaligned tensor sizes --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 2e2776778348..2f5d17132d3e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -103,6 +103,29 @@ static SmallVector calculateDistributedTileSize( return tileSizesVal; } +/// Returns true if a contract op has a multiple of its tile sizes. +static LogicalResult alignedContractOpFilter(Operation *op) { + auto linalgOp = dyn_cast(op); + if (!linalgOp) return failure(); + + if (linalgOp.hasDynamicShape()) return failure(); + + // matmul or batch matmul + bool isElible = linalg::isaContractionOpInterface(op) && + linalgOp.getNumParallelLoops() >= 2 && + linalgOp.getNumParallelLoops() <= 3; + if (!isElible) return failure(); + + SmallVector wgTileSizes = getTileSizes(op, 0); + if (wgTileSizes.empty()) return failure(); + + SmallVector sizes = linalgOp.getStaticLoopRanges(); + for (unsigned i = 0, e = sizes.size(); i != e; ++i) { + if (sizes[i] % wgTileSizes[i] != 0) return failure(); + } + return success(); +} + /// Patterns for warp level tiling. 
static void populateTilingToWarpPatterns( RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { @@ -120,6 +143,7 @@ static void populateTilingToWarpPatterns( return getSubgroupIdsAndCounts(builder, loc, /*warpSize=*/32u, parallelLoopRanges.size(), warpPerWorkgroup); }; + linalg::LinalgLoopDistributionOptions warpDistributionOptions; warpDistributionOptions.procInfo = getWarpProcInfoFn; @@ -133,6 +157,8 @@ static void populateTilingToWarpPatterns( StringAttr::get(context, getWorkgroupMemoryMarker())}, StringAttr::get(context, getVectorizeMarker())); filter.setMatchByDefault(); + // Bail out the case where the tensor sizes are not a multiple of tile sizes. + filter.addFilter(alignedContractOpFilter); TilingPatterns::insert(patterns, tilingOptions, filter); } From cf7b50d62c919fbedcbba2663f18ec7a126dd27e Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 10:50:25 -0800 Subject: [PATCH 04/27] add a marker for workgroup specialization --- .../compiler/Codegen/Common/WorkgroupSpecializationPass.cpp | 2 ++ compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp | 4 ++++ compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp index 5a1d1da2386f..09d2b9105a3a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp @@ -36,6 +36,7 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/PassDetail.h" #include "iree/compiler/Codegen/Passes.h" +#include "iree/compiler/Codegen/Utils/MarkerUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -220,6 +221,7 @@ static void specializeDistributionLoops( // generate scf.if 
%cond auto ifOp = builder.create(loc, cond, /*withElseRegion=*/true); + setMarker(ifOp, getWorkgroupSpecializationMarker()); // Transfer the original body to the scf.else body. auto origBodyBegin = ++Block::iterator(ifOp); diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index e62239e9f81f..f76dbf39a75c 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -36,6 +36,10 @@ StringRef getWorkgroupMemoryNumItemsGENumItersMarker() { return "workgroup_memory_numprocs_ge_numiters"; } +StringRef getWorkgroupSpecializationMarker() { + return "workgroup_specialization"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 2057ede82128..50178cf62ae7 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -33,6 +33,10 @@ StringRef getWorkgroupMemoryMarker(); /// to workgroups L1 tiles. StringRef getWorkgroupL1TileMarker(); +/// Marker to scf.IfOp to denote that the op is inserted for workgroup +/// specialization. +StringRef getWorkgroupSpecializationMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. 
StringRef getCopyToWorkgroupMemoryMarker(); From dab6480ca25dd424fe7c62d01a50a92e1ae191cd Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 10:50:57 -0800 Subject: [PATCH 05/27] add findAncestorWithMarker(op, marker) --- compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp | 8 ++++++++ compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index f76dbf39a75c..0399f119891d 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -71,5 +71,13 @@ void setMarker(Operation *op, StringRef marker) { StringAttr::get(op->getContext(), marker)); } +Operation *findAncestorWithMarker(Operation *op, StringRef marker) { + while (op) { + if (hasMarker(op, marker)) return op; + op = op->getParentOp(); + }; + return nullptr; +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 50178cf62ae7..80e11b7d2115 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -63,6 +63,9 @@ bool hasMarker(Operation *, ArrayRef markers = {}); /// Sets a given marker on an operation. void setMarker(Operation *, StringRef); +/// Find an ancestor with the given marker. 
+Operation *findAncestorWithMarker(Operation *op, StringRef marker); + } // namespace iree_compiler } // namespace mlir From 9c742712f8dbe58b04b08f03604d50fad0810276 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 10:51:29 -0800 Subject: [PATCH 06/27] Do warp distribution only when an op is aligned --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 2f5d17132d3e..24e5aa68e066 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -24,6 +24,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Matchers.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -103,27 +104,21 @@ static SmallVector calculateDistributedTileSize( return tileSizesVal; } -/// Returns true if a contract op has a multiple of its tile sizes. -static LogicalResult alignedContractOpFilter(Operation *op) { - auto linalgOp = dyn_cast(op); - if (!linalgOp) return failure(); - - if (linalgOp.hasDynamicShape()) return failure(); - - // matmul or batch matmul - bool isElible = linalg::isaContractionOpInterface(op) && - linalgOp.getNumParallelLoops() >= 2 && - linalgOp.getNumParallelLoops() <= 3; - if (!isElible) return failure(); - - SmallVector wgTileSizes = getTileSizes(op, 0); - if (wgTileSizes.empty()) return failure(); - - SmallVector sizes = linalgOp.getStaticLoopRanges(); - for (unsigned i = 0, e = sizes.size(); i != e; ++i) { - if (sizes[i] % wgTileSizes[i] != 0) return failure(); +/// Returns true if an op is aligned by checking if +/// 1. 
the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-spcialized region because +/// it is already aligned. +static LogicalResult alignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); + } else { + return success(); } - return success(); } /// Patterns for warp level tiling. @@ -158,7 +153,7 @@ static void populateTilingToWarpPatterns( StringAttr::get(context, getVectorizeMarker())); filter.setMatchByDefault(); // Bail out the case where the tensor sizes are not a multiple of tile sizes. - filter.addFilter(alignedContractOpFilter); + filter.addFilter(alignedOpFilter); TilingPatterns::insert(patterns, tilingOptions, filter); } From 3e5622698f1863dab96990b17d5cffe4044b2cd3 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 20:22:04 -0800 Subject: [PATCH 07/27] apply thread-level tiling for unaligned matmul op --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 24e5aa68e066..3433a6b6c9d2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -121,6 +121,25 @@ static LogicalResult alignedOpFilter(Operation *op) { } } +/// Returns true if an op is unaligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-spcialized region because +/// it is already aligned. 
+static LogicalResult unalignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); + } else { + // When there is no workgroup specialization, it means the op is already + // aligned. + return failure(); + } +} + /// Patterns for warp level tiling. static void populateTilingToWarpPatterns( RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { @@ -158,9 +177,12 @@ static void populateTilingToWarpPatterns( linalg::GenericOp>::insert(patterns, tilingOptions, filter); } +using FilterFunction = std::function; + /// Patterns for thread level tiling. static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, + const FilterFunction &ff = nullptr) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -189,6 +211,8 @@ static void populateTilingToInvocationPatterns( // FFT doesn't support second level of tiling yet. return success(!isa(op)); }).setMatchByDefault(); + // Add the user provided filter if available. + if (ff) f.addFilter(ff); patterns.insert( context, tilingOptions, f); @@ -271,7 +295,7 @@ struct LLVMGPUTileAndDistributePass }); if (distributeToWarp) { - // Apply last level of tiling and distribute to warps. + // Apply last level of tiling and distribute to warps for aligned ops. RewritePatternSet warpLevelTilingPatterns(context); populateTilingToWarpPatterns(warpLevelTilingPatterns, workgroupSize); if (failed(applyPatternsAndFoldGreedily( @@ -279,6 +303,14 @@ struct LLVMGPUTileAndDistributePass return signalPassFailure(); } + // Apply last level of tiling and distribute to threads for unaligned ops. 
+ RewritePatternSet threadLevelTilingPatterns(context); + populateTilingToInvocationPatterns(threadLevelTilingPatterns, + workgroupSize, unalignedOpFilter); + if (failed(applyPatternsAndFoldGreedily( + funcOp, std::move(threadLevelTilingPatterns)))) { + return signalPassFailure(); + } } else { // Apply last level of tiling and distribute to threads. RewritePatternSet threadLevelTilingPatterns(context); From 846d488c25200c415f8eab40d309b0e5666a6038 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 20:44:06 -0800 Subject: [PATCH 08/27] Do not fail for LLVMGPUMultiBuffering There are cases where it can fail, so skip reporting it as a pass failure. --- .../iree/compiler/Codegen/Common/GPUMultiBuffering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp index e8d8e18a1a7b..1dbc93ed1623 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp @@ -38,10 +38,10 @@ struct GPUMultiBufferingPass }); // Apply multi-buffering to all of them. for (memref::AllocOp alloc : allocs) { - if (failed(memref::multiBuffer(alloc, numBuffers))) - // Stop if any buffer cannot be multi buffered as pipelining will assume - // this happened. - return signalPassFailure(); + if (failed(memref::multiBuffer(alloc, numBuffers))) { + // There can be a failing case. Continue processing eligible ones. 
+ continue; + } } } From 9395678f14bac475c7a806eed36bb7625aa42e3f Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 12:00:18 -0800 Subject: [PATCH 09/27] move aligned and unaligned op filters to Utils --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 36 ------------------- .../src/iree/compiler/Codegen/Utils/Utils.cpp | 26 ++++++++++++++ .../src/iree/compiler/Codegen/Utils/Utils.h | 14 ++++++++ 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 3433a6b6c9d2..d9c3ba271745 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -104,42 +104,6 @@ static SmallVector calculateDistributedTileSize( return tileSizesVal; } -/// Returns true if an op is aligned by checking if -/// 1. the op is inside the workgroup-specialized region, or -/// 2. the op's parent is not the workgroup-specialized region. -/// The second case does not have a workgroup-spcialized region because -/// it is already aligned. -static LogicalResult alignedOpFilter(Operation *op) { - Operation *opWithMarker = - findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); - - if (opWithMarker) { - auto ifOp = cast(opWithMarker); - return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); - } else { - return success(); - } -} - -/// Returns true if an op is unaligned by checking if -/// 1. the op is inside the workgroup-specialized region, or -/// 2. the op's parent is not the workgroup-specialized region. -/// The second case does not have a workgroup-spcialized region because -/// it is already aligned. 
-static LogicalResult unalignedOpFilter(Operation *op) { - Operation *opWithMarker = - findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); - - if (opWithMarker) { - auto ifOp = cast(opWithMarker); - return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); - } else { - // When there is no workgroup specialization, it means the op is already - // aligned. - return failure(); - } -} - /// Patterns for warp level tiling. static void populateTilingToWarpPatterns( RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp index 03ae9f65fa53..600ba9bbcafe 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp @@ -677,5 +677,31 @@ void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val, for (Operation *op : opToDelete) op->erase(); } +LogicalResult alignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); + } else { + return success(); + } +} + +LogicalResult unalignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); + } else { + // When there is no workgroup specialization, it means the op is already + // aligned. 
+ return failure(); + } +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h index 506f707e9217..37479dfca7b5 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h @@ -175,6 +175,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(); void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val, OpBuilder &builder); +/// Returns true if an op is aligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-specialized region because +/// it is already aligned. +LogicalResult alignedOpFilter(Operation *op); + +/// Returns true if an op is unaligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-specialized region because +/// it is already aligned. 
+LogicalResult unalignedOpFilter(Operation *op); + } // namespace iree_compiler } // namespace mlir From e5d4f83b8aa9b338d3a6fcd89b2d4fa469f12df1 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 12:04:56 -0800 Subject: [PATCH 10/27] do tensorcore vectorization only when the candidate op is aligned --- .../compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp index 27b0ccd92a9c..5bb7bdfb6022 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp @@ -31,6 +31,8 @@ extern llvm::cl::opt llvmgpuUseMMASync; static void populateVectorizationPatterns(RewritePatternSet &patterns) { IREE::LinalgExt::LinalgTransformationFilter f( StringAttr::get(patterns.getContext(), getVectorizeMarker())); + // only handle aligned ops + f.addFilter(alignedOpFilter); VectorizationPatterns::insert(patterns, f); patterns.add( patterns.getContext(), f.addOpFilter()); From 38b578779a54a88630c64f9d6dc0dcf632e9ba01 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 15:23:50 -0800 Subject: [PATCH 11/27] Bail out unaligned K dimension from tensorcore specialization --- compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index fbd0cf41264f..69d75cda0ee0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,7 +227,8 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // 
tile size. for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && + // TODO: support unaligned K size + if (sizeK % config.tileSize[2] == 0 && sizeN >= config.tileSize[1] && sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], From 8211f9944cd2126759895ddae41579b6425f9546 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 16:24:18 -0800 Subject: [PATCH 12/27] update the unit tests --- .../LLVMGPU/test/illegal_configuration.mlir | 64 ------------------- ...orkgroup_specialization_pipeline_test.mlir | 13 ++-- 2 files changed, 7 insertions(+), 70 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir index 5164192a28e1..a9136ba3430b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir @@ -224,70 +224,6 @@ hal.executable private @matmul_tensors { // ----- -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @matmul_tensors { - hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { - hal.executable.export @illegal layout(#pipeline_layout) attributes { - translation_info = #translation, - workgroup_size = [64 : index, 2 : index, 1 : index] - } - builtin.module { - func.func @illegal() { - %c0 = arith.constant 0 : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32> - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %result = hal.interface.binding.subspan 
set(0) binding(2) type(storage_buffer) : memref<48x32xf32> - // expected-error @+1 {{lhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<48x16xf32>, memref<16x32xf32>) - outs(%result: memref<48x32xf32>) - return - } - } - } -} - -// ----- - -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @matmul_tensors { - hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { - hal.executable.export @illegal layout(#pipeline_layout) attributes { - translation_info = #translation, - workgroup_size = [64 : index, 2 : index, 1 : index] - } - builtin.module { - func.func @illegal() { - %c0 = arith.constant 0 : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32> - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32> - // expected-error @+1 {{rhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x48xf32>) - outs(%result: memref<32x48xf32>) - return - } - } - } -} - -// ----- - #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir index 3703d9f75930..ce78695b3e6e 100644 --- 
a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir @@ -42,11 +42,12 @@ module attributes {hal.device.targets = [#hal.device.target<"cuda", {executable_ // CHECK-LABEL: func.func @forward_dispatch_116_matmul_128x30522x768 // CHECK: arith.cmpi eq // CHECK: scf.if -// CHECK: vector.transfer_read -// CHECK: vector.transfer_read -// CHECK: vector.contract -// CHECK: vector.transfer_read -// CHECK: vector.broadcast -// CHECK: vector.transfer_write +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_compute +// CHECK: gpu.subgroup_mma_compute +// CHECK: gpu.subgroup_mma_store_matrix // CHECK: else // CHECK-NOT: vector.transfer From 157d3f1aa035a1bed852385bc5cb8349e4f947a2 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 14:22:39 -0800 Subject: [PATCH 13/27] Add a filter param to populateContractPromotionPatterns() --- .../compiler/Codegen/Common/GPUPatterns.cpp | 17 +++++++++++------ .../iree/compiler/Codegen/Common/GPUPatterns.h | 7 +++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp index a69b187c00ce..8f4936c8e79b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Codegen/Common/GPUPatterns.h" +#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" #include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/MarkerUtils.h" @@ -165,8 +166,11 @@ void populateVectorTransferToGPUMMAPreparationPatterns( 
patterns.add(patterns.getContext()); } +using LinalgTransformationFilter = IREE::LinalgExt::LinalgTransformationFilter; + void populateContractPromotionPatterns(RewritePatternSet &patterns, - ArrayRef operandsToPromote) { + ArrayRef operandsToPromote, + LinalgTransformationFilter *filter) { MLIRContext *context = patterns.getContext(); patterns.insert, LinalgPromotionPattern, @@ -178,11 +182,12 @@ void populateContractPromotionPatterns(RewritePatternSet &patterns, .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory) .setOperandsToPromote(operandsToPromote) .setUseFullTileBuffers({false, false}), - IREE::LinalgExt::LinalgTransformationFilter( - {StringAttr::get(context, getWorkgroupKTiledMarker())}, - StringAttr::get(context, getWorkgroupMemoryMarker())) - .setMatchByDefault() - .addFilter(contractOpFilter)); + filter ? *filter + : IREE::LinalgExt::LinalgTransformationFilter( + {StringAttr::get(context, getWorkgroupKTiledMarker())}, + StringAttr::get(context, getWorkgroupMemoryMarker())) + .setMatchByDefault() + .addFilter(contractOpFilter)); } } // namespace iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h index deefb38740ac..7f9afc865d44 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h @@ -7,7 +7,9 @@ #ifndef IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ #define IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ +#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" #include "mlir/IR/PatternMatch.h" + namespace mlir { namespace iree_compiler { @@ -18,8 +20,9 @@ void populateVectorTransferToGPUMMAPreparationPatterns( /// Adds patterns for promoting Linalg contract op's operands to use GPU shared /// memory. 
-void populateContractPromotionPatterns(RewritePatternSet &patterns, - ArrayRef operandsToPromote); +void populateContractPromotionPatterns( + RewritePatternSet &patterns, ArrayRef operandsToPromote, + IREE::LinalgExt::LinalgTransformationFilter *filter = nullptr); } // namespace iree_compiler } // namespace mlir From 1309942c485208f940baa8801bca81c9e9132175 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 14:23:32 -0800 Subject: [PATCH 14/27] Enable unaligned K for tensorcore specialization --- compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 69d75cda0ee0..fbd0cf41264f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,8 +227,7 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // tile size. 
for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - // TODO: support unaligned K size - if (sizeK % config.tileSize[2] == 0 && sizeN >= config.tileSize[1] && + if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], From d5c6cb01bf47c2bed750171639283f62f97315d1 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 15:10:04 -0800 Subject: [PATCH 15/27] Mark tensorcore and SIMT lowering candidates --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 21 +++++++++++++++++++ .../compiler/Codegen/Utils/MarkerUtils.cpp | 6 ++++++ .../iree/compiler/Codegen/Utils/MarkerUtils.h | 6 ++++++ 3 files changed, 33 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index d9c3ba271745..1301a705e40d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -24,8 +24,11 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/Visitors.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -182,6 +185,20 @@ static void populateTilingToInvocationPatterns( context, tilingOptions, f); } +static void markCandidates(func::FuncOp funcOp) { + funcOp.walk([](linalg::LinalgOp op) { + if (!isa(op)) + return WalkResult::skip(); + + if (succeeded(alignedOpFilter(op))) { + setMarker(op, getGPUTensorCoreLoweringReqMarker()); + } else { + setMarker(op, getGPUSimtLoweringReqMarker()); + } + return 
WalkResult::advance(); + }); +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -200,6 +217,10 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; + // Mark lowering candidates. An op can be a tensorcore or SIMT lowering + // candidate. + markCandidates(funcOp); + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. { diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index 0399f119891d..d357d742a651 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -40,6 +40,12 @@ StringRef getWorkgroupSpecializationMarker() { return "workgroup_specialization"; } +StringRef getGPUSimtLoweringReqMarker() { return "gpu_simt_lowering_req"; } + +StringRef getGPUTensorCoreLoweringReqMarker() { + return "gpu_tensorcore_lowering_req"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 80e11b7d2115..2b94cad19222 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -37,6 +37,12 @@ StringRef getWorkgroupL1TileMarker(); /// specialization. StringRef getWorkgroupSpecializationMarker(); +/// Marker to denote that the op is marked for SIMT lowering +StringRef getGPUSimtLoweringReqMarker(); + +/// Marker to denote that the op is marked for tensorcore lowering +StringRef getGPUTensorCoreLoweringReqMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. 
StringRef getCopyToWorkgroupMemoryMarker(); From fd4b182b1aa0a987950a3d4178bb6686ef793c00 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 16:43:57 -0800 Subject: [PATCH 16/27] Use the tensorcore marker to serialize the K loop --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 65 ++++++++++++++++++- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 1301a705e40d..e246074e895b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -27,7 +27,9 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/OperationSupport.h" #include "mlir/IR/Visitors.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -199,6 +201,58 @@ static void markCandidates(func::FuncOp funcOp) { }); } +static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { + // mark which linarg op is a tensorcore + markCandidates(funcOp); + + auto context = funcOp.getContext(); + RewritePatternSet patterns(context); + auto tileSizesFn = [](OpBuilder &builder, + Operation *op) -> SmallVector { + auto interfaceOp = cast(*op); + auto partitionedLoops = + interfaceOp.getPartitionableLoops(kNumMaxParallelDims); + SmallVector tileSizes = getTileSizes(builder, op, 0); + auto zero = builder.create(op->getLoc(), 0); + for (unsigned depth : partitionedLoops) { + if (depth < tileSizes.size()) { + tileSizes[depth] = zero; + } + } + return tileSizes; + }; + + auto tilingOptions = + linalg::LinalgTilingOptions() + .setLoopType(linalg::LinalgTilingLoopType::Loops) + .setTileSizeComputationFunction(tileSizesFn) + 
.setPeeledLoops({0}); // peel off the partial iterations + + IREE::LinalgExt::LinalgTransformationFilter filter( + ArrayRef{ + StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}, + StringAttr::get(context, getWorkgroupKTiledMarker())); + + TilingPatterns::insert(patterns, tilingOptions, filter); + + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { + return failure(); + } + + RewritePatternSet wgTilingCanonicalizationPatterns = + linalg::getLinalgTilingCanonicalizationPatterns(funcOp.getContext()); + populateAffineMinSCFCanonicalizationPattern(wgTilingCanonicalizationPatterns); + scf::populateSCFForLoopCanonicalizationPatterns( + wgTilingCanonicalizationPatterns); + if (failed(applyPatternsAndFoldGreedily( + funcOp, std::move(wgTilingCanonicalizationPatterns)))) { + return failure(); + } + + return success(); +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -221,11 +275,15 @@ struct LLVMGPUTileAndDistributePass // candidate. markCandidates(funcOp); + auto tensorcoreFilter = IREE::LinalgExt::LinalgTransformationFilter( + {StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}); + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. { RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {2}); + populateContractPromotionPatterns(promotionPatterns, {2}, + &tensorcoreFilter); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); @@ -236,7 +294,7 @@ struct LLVMGPUTileAndDistributePass // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. 
- if (failed(tileToSerialLoops(funcOp))) { + if (failed(tileTensorCoreKDim(funcOp))) { return signalPassFailure(); } @@ -255,7 +313,8 @@ struct LLVMGPUTileAndDistributePass if (flatWorkgroupSize > kWarpSize) { RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {0, 1}); + populateContractPromotionPatterns(promotionPatterns, {0, 1}, + &tensorcoreFilter); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { From bccc04cf5cc66b60fc2f2854df38312185486855 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 16:55:30 -0800 Subject: [PATCH 17/27] do not handle genericOp in the tensorcore path --- .../compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index e246074e895b..d80e505a0d64 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -189,7 +189,7 @@ static void populateTilingToInvocationPatterns( static void markCandidates(func::FuncOp funcOp) { funcOp.walk([](linalg::LinalgOp op) { - if (!isa(op)) + if (!isa(op)) return WalkResult::skip(); if (succeeded(alignedOpFilter(op))) { @@ -233,8 +233,8 @@ static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}, StringAttr::get(context, getWorkgroupKTiledMarker())); - TilingPatterns::insert(patterns, tilingOptions, filter); + TilingPatterns::insert( + patterns, tilingOptions, filter); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { return failure(); From 74a23ee7ef9be6a12b316eae7a10c4bbe7cda664 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 11:50:06 -0800 Subject: [PATCH 18/27] Do warp-level 
tiling selectively using a marker --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 51 ++++++++++++++++--- .../compiler/Codegen/Utils/MarkerUtils.cpp | 4 ++ .../iree/compiler/Codegen/Utils/MarkerUtils.h | 3 ++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index d80e505a0d64..140d3194859c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -28,6 +28,7 @@ #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/Visitors.h" @@ -123,7 +124,7 @@ static void populateTilingToWarpPatterns( auto getWarpProcInfoFn = [warpPerWorkgroup]( OpBuilder &builder, Location loc, ArrayRef parallelLoopRanges) { - return getSubgroupIdsAndCounts(builder, loc, /*warpSize=*/32u, + return getSubgroupIdsAndCounts(builder, loc, kWarpSize, parallelLoopRanges.size(), warpPerWorkgroup); }; @@ -136,12 +137,8 @@ static void populateTilingToWarpPatterns( .setDistributionOptions(warpDistributionOptions); MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( - {StringAttr::get(context, getWorkgroupKTiledMarker()), - StringAttr::get(context, getWorkgroupMemoryMarker())}, + {StringAttr::get(context, getGPUWarpLevelTilingReqMarker())}, StringAttr::get(context, getVectorizeMarker())); - filter.setMatchByDefault(); - // Bail out the case where the tensor sizes are not a multiple of tile sizes. 
- filter.addFilter(alignedOpFilter); TilingPatterns::insert(patterns, tilingOptions, filter); } @@ -253,6 +250,28 @@ static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { return success(); } +// Get K dimension size. It returns kDynamicSize for unknown cases. +static int64_t getSizeK(linalg::LinalgOp op) { + int64_t sizeK = ShapedType::kDynamicSize; + + if (!isa(op)) return sizeK; + + auto lhsShape = + op.getDpsInputOperand(0)->get().getType().cast().getShape(); + SmallVector exprs; + op.getReductionDims(exprs); + if (exprs.size() == 1) { + for (unsigned i = 0; i < lhsShape.size(); i++) { + if (op.getMatchingIndexingMap(op.getDpsInputOperand(0)) + .getDimPosition(i) == exprs[0]) { + sizeK = lhsShape[i]; + break; + } + } + } + return sizeK; +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -339,6 +358,26 @@ struct LLVMGPUTileAndDistributePass }); if (distributeToWarp) { + // mark candidates for the warp level tiling + funcOp.walk([&](linalg::LinalgOp op) { + if (failed(alignedOpFilter(op))) return WalkResult::skip(); + if (!isa(op)) + return WalkResult::skip(); + + // check if K is a multiple of Tile-K. + int64_t sizeK = getSizeK(op); + if (sizeK != ShapedType::kDynamicSize) { + // WG tile sizes + auto wgTileSizes = getTileSizes(op, 0); + + if (sizeK % wgTileSizes[wgTileSizes.size() - 1] != 0) + return WalkResult::skip(); + } + + setMarker(op, getGPUWarpLevelTilingReqMarker()); + return WalkResult::advance(); + }); + // Apply last level of tiling and distribute to warps for aligned ops. 
RewritePatternSet warpLevelTilingPatterns(context); populateTilingToWarpPatterns(warpLevelTilingPatterns, workgroupSize); diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index d357d742a651..358e4c048168 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -46,6 +46,10 @@ StringRef getGPUTensorCoreLoweringReqMarker() { return "gpu_tensorcore_lowering_req"; } +StringRef getGPUWarpLevelTilingReqMarker() { + return "gpu_warp_level_tiling_req"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 2b94cad19222..89bb8a953e0c 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -43,6 +43,9 @@ StringRef getGPUSimtLoweringReqMarker(); /// Marker to denote that the op is marked for tensorcore lowering StringRef getGPUTensorCoreLoweringReqMarker(); +/// Marker to denote that the op is marked for warp-level tiling +StringRef getGPUWarpLevelTilingReqMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. 
StringRef getCopyToWorkgroupMemoryMarker(); From 04dc39b3f2a21e4d4cf06a7ad3cef6f434f63e70 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 14:13:17 -0800 Subject: [PATCH 19/27] do thread-level tiling for the partial K and unspecialized ops --- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 140d3194859c..9162c520685a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -171,12 +171,13 @@ static void populateTilingToInvocationPatterns( MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter f( {StringAttr::get(context, getWorkgroupKTiledMarker()), - StringAttr::get(context, getWorkgroupMemoryMarker())}, - StringAttr::get(context, getVectorizeMarker())); + StringAttr::get(context, getWorkgroupMemoryMarker()), + StringAttr::get(context, getGPUSimtLoweringReqMarker()) + }); f.addFilter([](Operation *op) { // FFT doesn't support second level of tiling yet. return success(!isa(op)); - }).setMatchByDefault(); + }); // Add the user provided filter if available. 
if (ff) f.addFilter(ff); patterns.insert Date: Thu, 17 Nov 2022 21:21:49 -0800 Subject: [PATCH 20/27] promote inputs and output always --- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 9162c520685a..b7a19467bd34 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -291,19 +291,11 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; - // Mark lowering candidates. An op can be a tensorcore or SIMT lowering - // candidate. - markCandidates(funcOp); - - auto tensorcoreFilter = IREE::LinalgExt::LinalgTransformationFilter( - {StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}); - // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. 
{ RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {2}, - &tensorcoreFilter); + populateContractPromotionPatterns(promotionPatterns, {2}); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); @@ -333,9 +325,7 @@ struct LLVMGPUTileAndDistributePass if (flatWorkgroupSize > kWarpSize) { RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {0, 1}, - &tensorcoreFilter); - + populateContractPromotionPatterns(promotionPatterns, {0, 1}); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); From 3b67ab89bf59d8ccce1ab9b79e95569b89a2bf98 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 21:43:40 -0800 Subject: [PATCH 21/27] NFC: remove default value from tileToSerialLoops --- .../src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp | 2 +- .../iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 2 +- compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp index d2f43b69896e..551eba1d66ff 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp @@ -83,7 +83,7 @@ struct LLVMGPUTensorAllocPass auto funcOp = getOperation(); // Tile the reduction first to reduce the alloc size. 
- if (failed(tileToSerialLoops(funcOp))) { + if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index b7a19467bd34..77f851815455 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -306,7 +306,7 @@ struct LLVMGPUTileAndDistributePass // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. - if (failed(tileTensorCoreKDim(funcOp))) { + if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h index 7ef1b93324d4..6303ee86d79f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h @@ -16,7 +16,7 @@ class FuncOp; namespace iree_compiler { /// Apply tiling to reduction dimensions based on op attributes. -LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction = true); +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction); } // namespace iree_compiler } // namespace mlir From c8daa2b18d1f7987fddc9424ef76ac3745ee57c2 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 21:55:18 -0800 Subject: [PATCH 22/27] Remove onlyReduction from tileToSerialLoops and add peel The function is always used for reduction cases, so let's remove the flag. In addition, add `peel` to peel the serial loop when the warp-level tiling is requested. 
--- .../Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp | 2 +- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 5 ++-- .../Codegen/LLVMGPU/LLVMGPUTileTensor.cpp | 23 +++++++++---------- .../compiler/Codegen/LLVMGPU/TilingUtils.h | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp index 551eba1d66ff..3164f3d3dfae 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp @@ -83,7 +83,7 @@ struct LLVMGPUTensorAllocPass auto funcOp = getOperation(); // Tile the reduction first to reduce the alloc size. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 77f851815455..a1b9656a4a64 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -305,8 +305,9 @@ struct LLVMGPUTileAndDistributePass // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the - // same size. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + // same size. For distributing to warps, peel the partial iterations as + // a separate loop, since the warp distribution is requested for wmma. 
+ if (failed(tileToSerialLoops(funcOp, /*peel=*/distributeToWarp))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index 6348ddf1d05a..aeb1140b7675 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -32,20 +32,18 @@ namespace iree_compiler { /// Patterns for workgroup level tiling. Workgroup tiling is done at the flow /// level but we may have extra tiling for the reduction dimension. Therefore we /// tile again without distributing. -static void populateTilingPatterns(RewritePatternSet &patterns, - bool onlyReduction) { - auto tileSizesFn = [onlyReduction](OpBuilder &builder, +static void populateReductionTilingPatterns(RewritePatternSet &patterns, + bool peel) { + auto tileSizesFn = [](OpBuilder &builder, Operation *op) -> SmallVector { auto interfaceOp = cast(*op); auto partitionedLoops = interfaceOp.getPartitionableLoops(kNumMaxParallelDims); SmallVector tileSizes = getTileSizes(builder, op, 0); - if (onlyReduction) { - auto zero = builder.create(op->getLoc(), 0); - for (unsigned depth : partitionedLoops) { - if (depth < tileSizes.size()) { - tileSizes[depth] = zero; - } + auto zero = builder.create(op->getLoc(), 0); + for (unsigned depth : partitionedLoops) { + if (depth < tileSizes.size()) { + tileSizes[depth] = zero; } } return tileSizes; @@ -54,6 +52,7 @@ static void populateTilingPatterns(RewritePatternSet &patterns, auto tilingOptions = linalg::LinalgTilingOptions() .setLoopType(linalg::LinalgTilingLoopType::Loops) .setTileSizeComputationFunction(tileSizesFn); + if (peel) tilingOptions.setPeeledLoops({0}); MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( @@ -67,13 +66,13 @@ static void populateTilingPatterns(RewritePatternSet &patterns, filter); } -LogicalResult 
tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction) { +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel) { { // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. RewritePatternSet wgTilingPatterns(funcOp.getContext()); - populateTilingPatterns(wgTilingPatterns, onlyReduction); + populateReductionTilingPatterns(wgTilingPatterns, peel); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(wgTilingPatterns)))) { return failure(); @@ -223,7 +222,7 @@ struct LLVMGPUTileTensorPass // Tile to serial loops to the wg tile size to handle reductions and other // dimension that have not been distributed. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h index 6303ee86d79f..8616a92d6fc0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h @@ -16,7 +16,7 @@ class FuncOp; namespace iree_compiler { /// Apply tiling to reduction dimensions based on op attributes. 
-LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction); +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel); } // namespace iree_compiler } // namespace mlir From a4cb7b5669c06c43a82bf1d2316dfc3e9a1b82c4 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Nov 2022 13:49:13 -0800 Subject: [PATCH 23/27] Use `vectorize_for_tensorcore` marker --- .../LLVMGPU/LLVMGPUTensorCoreVectorization.cpp | 6 ++---- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 15 ++++++--------- .../Codegen/LLVMGPU/LLVMGPUTileTensor.cpp | 2 +- .../LLVMGPU/test/tensorcore_vectorization.mlir | 4 ++-- .../iree/compiler/Codegen/Utils/MarkerUtils.cpp | 4 ++++ .../src/iree/compiler/Codegen/Utils/MarkerUtils.h | 3 +++ 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp index 5bb7bdfb6022..64dc7e65b1a3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp @@ -29,10 +29,8 @@ extern llvm::cl::opt llvmgpuUseMMASync; //====---------------------------------------------------------------------===// static void populateVectorizationPatterns(RewritePatternSet &patterns) { - IREE::LinalgExt::LinalgTransformationFilter f( - StringAttr::get(patterns.getContext(), getVectorizeMarker())); - // only handle aligned ops - f.addFilter(alignedOpFilter); + IREE::LinalgExt::LinalgTransformationFilter f(StringAttr::get( + patterns.getContext(), getVectorizeForTensorCoreMarker())); VectorizationPatterns::insert(patterns, f); patterns.add( patterns.getContext(), f.addOpFilter()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index a1b9656a4a64..56db23351023 100644 --- 
a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -138,7 +138,7 @@ static void populateTilingToWarpPatterns( MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( {StringAttr::get(context, getGPUWarpLevelTilingReqMarker())}, - StringAttr::get(context, getVectorizeMarker())); + StringAttr::get(context, getVectorizeForTensorCoreMarker())); TilingPatterns::insert(patterns, tilingOptions, filter); } @@ -147,8 +147,7 @@ using FilterFunction = std::function; /// Patterns for thread level tiling. static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, - const FilterFunction &ff = nullptr) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -172,14 +171,12 @@ static void populateTilingToInvocationPatterns( IREE::LinalgExt::LinalgTransformationFilter f( {StringAttr::get(context, getWorkgroupKTiledMarker()), StringAttr::get(context, getWorkgroupMemoryMarker()), - StringAttr::get(context, getGPUSimtLoweringReqMarker()) - }); + StringAttr::get(context, getGPUSimtLoweringReqMarker())}, + StringAttr::get(context, getVectorizeMarker())); f.addFilter([](Operation *op) { - // FFT doesn't support second level of tiling yet. - return success(!isa(op)); + // FFT doesn't support second level of tiling yet. + return success(!isa(op)); }); - // Add the user provided filter if available. 
- if (ff) f.addFilter(ff); patterns.insert( context, tilingOptions, f); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index aeb1140b7675..a9c2a7810f16 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -35,7 +35,7 @@ namespace iree_compiler { static void populateReductionTilingPatterns(RewritePatternSet &patterns, bool peel) { auto tileSizesFn = [](OpBuilder &builder, - Operation *op) -> SmallVector { + Operation *op) -> SmallVector { auto interfaceOp = cast(*op); auto partitionedLoops = interfaceOp.getPartitionableLoops(kNumMaxParallelDims); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir index 425e406127ae..ba6ed805fe55 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir @@ -20,7 +20,7 @@ func.func @dot() { %10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9] %11 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 32)>(%8) %12 = memref.subview %7[%10, %11] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> - linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) + linalg.fill {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) scf.for %arg0 = %c0 to %c1024 step %c16 { %13 = memref.subview %5[0, %arg0] [64, 16] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x16xf32, 
affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> %14 = memref.subview %6[%arg0, 0] [16, 64] [1, 1] : memref<1024x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> @@ -28,7 +28,7 @@ func.func @dot() { %16 = memref.subview %13[%10, 0] [32, 16] [1, 1] : memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> %17 = memref.subview %14[0, %15] [16, 32] [1, 1] : memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> %18 = memref.subview %7[%10, %15] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> - linalg.matmul {__internal_linalg_transform__ = "vectorize"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) + linalg.matmul {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) } return } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index 358e4c048168..bb44b7681187 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -58,6 +58,10 @@ StringRef getTileReductionMarker() { return "tile_reduction"; } StringRef getVectorizeMarker() { return "vectorize"; } +StringRef getVectorizeForTensorCoreMarker() { + return "vectorize_for_tensorcore"; +} + 
StringRef getDeleteMarker() { return "delete"; } StringRef getMarkerOrNull(Operation *op) { diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 89bb8a953e0c..f62ddbba6cd7 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -56,6 +56,9 @@ StringRef getTileReductionMarker(); /// Marker for operations that are going to be vectorized. StringRef getVectorizeMarker(); +/// Marker for operations that are going to be vectorized for tensorcore. +StringRef getVectorizeForTensorCoreMarker(); + /// Marker for tagging an operation for deletion. Tile and fuse pattern does /// not delete the original operation to not invalidate the /// `linalg::LinalgDependenceGraph` data structure. Instead it is marked with From 6d7305bad311ddb9a69da5fce37f485b9fe38c09 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Nov 2022 15:04:37 -0800 Subject: [PATCH 24/27] match by default for the non-tensorcore flow --- .../compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 56db23351023..75b3a874ef80 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -147,7 +147,8 @@ using FilterFunction = std::function; /// Patterns for thread level tiling. 
static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, + bool matchByDefault = true) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -177,6 +178,7 @@ static void populateTilingToInvocationPatterns( // FFT doesn't support second level of tiling yet. return success(!isa(op)); }); + if (matchByDefault) f.setMatchByDefault(); patterns.insert( context, tilingOptions, f); @@ -378,7 +380,8 @@ struct LLVMGPUTileAndDistributePass // Apply last level of tiling and distribute to threads for unaligned ops. RewritePatternSet threadLevelTilingPatterns(context); populateTilingToInvocationPatterns(threadLevelTilingPatterns, - workgroupSize); + workgroupSize, + /*matchByDefault=*/false); if (failed(applyPatternsAndFoldGreedily( funcOp, std::move(threadLevelTilingPatterns)))) { return signalPassFailure(); From d7fe191e5cfc642814f13e65071c3e7d6e29f718 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Nov 2022 15:40:45 -0800 Subject: [PATCH 25/27] add missing genericOp support for tensorcore lowering --- .../iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 75b3a874ef80..e4dd64cdbc79 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -352,7 +352,8 @@ struct LLVMGPUTileAndDistributePass // mark candidates for the warp level tiling funcOp.walk([&](linalg::LinalgOp op) { if (failed(alignedOpFilter(op))) return WalkResult::skip(); - if (!isa(op)) + if (!isa(op)) return WalkResult::skip(); // 
check if K is a multiple of Tile-K. From 23e68dbedb1aa95b1e4c654cd0d20520b223c478 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Sat, 19 Nov 2022 16:28:17 -0800 Subject: [PATCH 26/27] add more debug prints to LLVMGPUTileAndDistribute --- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index e4dd64cdbc79..c6f393f9a9bb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -299,7 +299,17 @@ struct LLVMGPUTileAndDistributePass std::move(promotionPatterns)))) { return signalPassFailure(); } + LLVM_DEBUG({ + llvm::dbgs() << "After promote C:\n"; + funcOp.dump(); + }); + propagateSharedMemoryCopy(funcOp); + + LLVM_DEBUG({ + llvm::dbgs() << "After propagateSharedMemoryCopy():\n"; + funcOp.dump(); + }); } // Tile again at the workgroup level since reduction dimension were @@ -353,7 +363,7 @@ struct LLVMGPUTileAndDistributePass funcOp.walk([&](linalg::LinalgOp op) { if (failed(alignedOpFilter(op))) return WalkResult::skip(); if (!isa(op)) + linalg::GenericOp>(op)) return WalkResult::skip(); // check if K is a multiple of Tile-K. 
From 1f8c8c9a8adc8682ba9f192646b473d7bbf34a3b Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Sat, 19 Nov 2022 16:37:25 -0800 Subject: [PATCH 27/27] Do not handle the fused op with warp-level tiling --- .../compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index c6f393f9a9bb..64973d2123a9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -366,6 +366,12 @@ struct LLVMGPUTileAndDistributePass linalg::GenericOp>(op)) return WalkResult::skip(); + if (isa(op) && + hasMarker(op, getCopyToWorkgroupMemoryMarker())) { + // The GPUDistributeSharedMemoryCopy pass will handle it later. + return WalkResult::skip(); + } + // check if K is a multiple of Tile-K. int64_t sizeK = getSizeK(op); if (sizeK != ShapedType::kDynamicSize) {