diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp
index e8d8e18a1a7b..1dbc93ed1623 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp
@@ -38,10 +38,10 @@ struct GPUMultiBufferingPass
     });
     // Apply multi-buffering to all of them.
     for (memref::AllocOp alloc : allocs) {
-      if (failed(memref::multiBuffer(alloc, numBuffers)))
-        // Stop if any buffer cannot be multi buffered as pipelining will assume
-        // this happened.
-        return signalPassFailure();
+      if (failed(memref::multiBuffer(alloc, numBuffers))) {
+        // There can be a failing case. Continue processing eligible ones.
+        continue;
+      }
     }
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp
index a69b187c00ce..8f4936c8e79b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/Common/GPUPatterns.h"
 
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
 #include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/MarkerUtils.h"
@@ -165,8 +166,11 @@ void populateVectorTransferToGPUMMAPreparationPatterns(
   patterns.add<FlattenTransferReadOp>(patterns.getContext());
 }
 
+using LinalgTransformationFilter = IREE::LinalgExt::LinalgTransformationFilter;
+
 void populateContractPromotionPatterns(RewritePatternSet &patterns,
-                                       ArrayRef<int64_t> operandsToPromote) {
+                                       ArrayRef<int64_t> operandsToPromote,
+                                       LinalgTransformationFilter *filter) {
   MLIRContext *context = patterns.getContext();
   patterns.insert<LinalgPromotionPattern<linalg::MatmulOp>,
                   LinalgPromotionPattern<linalg::BatchMatmulOp>,
@@ -178,11 +182,12 @@ void populateContractPromotionPatterns(RewritePatternSet &patterns,
           .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory)
           .setOperandsToPromote(operandsToPromote)
           .setUseFullTileBuffers({false, false}),
-      IREE::LinalgExt::LinalgTransformationFilter(
-          {StringAttr::get(context, getWorkgroupKTiledMarker())},
-          StringAttr::get(context, getWorkgroupMemoryMarker()))
-          .setMatchByDefault()
-          .addFilter(contractOpFilter));
+      filter ? *filter
+             : IREE::LinalgExt::LinalgTransformationFilter(
+                   {StringAttr::get(context, getWorkgroupKTiledMarker())},
+                   StringAttr::get(context, getWorkgroupMemoryMarker()))
+                   .setMatchByDefault()
+                   .addFilter(contractOpFilter));
 }
 
 }  // namespace iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h
index deefb38740ac..7f9afc865d44 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h
@@ -7,7 +7,9 @@
 #ifndef IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_
 #define IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_
 
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
 #include "mlir/IR/PatternMatch.h"
+
 namespace mlir {
 namespace iree_compiler {
 
@@ -18,8 +20,9 @@ void populateVectorTransferToGPUMMAPreparationPatterns(
 
 /// Adds patterns for promoting Linalg contract op's operands to use GPU shared
 /// memory.
-void populateContractPromotionPatterns(RewritePatternSet &patterns,
-                                       ArrayRef<int64_t> operandsToPromote);
+void populateContractPromotionPatterns(
+    RewritePatternSet &patterns, ArrayRef<int64_t> operandsToPromote,
+    IREE::LinalgExt::LinalgTransformationFilter *filter = nullptr);
 
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp
index 5a1d1da2386f..09d2b9105a3a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp
@@ -36,6 +36,7 @@
 #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
 #include "iree/compiler/Codegen/PassDetail.h"
 #include "iree/compiler/Codegen/Passes.h"
+#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -220,6 +221,7 @@ static void specializeDistributionLoops(
 
   // generate scf.if %cond
   auto ifOp = builder.create<scf::IfOp>(loc, cond, /*withElseRegion=*/true);
+  setMarker(ifOp, getWorkgroupSpecializationMarker());
 
   // Transfer the original body to the scf.else body.
   auto origBodyBegin = ++Block::iterator(ifOp);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index e6dd8cd20809..fbd0cf41264f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -227,9 +227,8 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint,
       // Pick the best configuration where the original shape is aligned on the
       // tile size.
       for (TileWorkgroupSizePair &config : TCtileSizeConfig) {
-        if (sizeK % config.tileSize[2] == 0 &&
-            sizeN % config.tileSize[1] == 0 &&
-            sizeM % config.tileSize[0] == 0) {
+        if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] &&
+            sizeM >= config.tileSize[0]) {
           return setMatmulConfig(
               config.tileSize[0], config.tileSize[1], config.tileSize[2],
               config.workgroupSize,
@@ -410,7 +409,7 @@ static LogicalResult setRootDefaultConfig(func::FuncOp entryPoint,
   }
 
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
-  // Pick a vectorSize of 1 for op that we know won't get vectorizedd.
+  // Pick a vectorSize of 1 for op that we know won't get vectorized.
   // Also skip vectorization for linalg on memref (no result) as the pipeline
   // relies on tensor level tiling.
   // TODO(thomasraoux): This could be improved by checking if the linalg op
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp
index d2f43b69896e..3164f3d3dfae 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp
@@ -83,7 +83,7 @@ struct LLVMGPUTensorAllocPass
     auto funcOp = getOperation();
 
     // Tile the reduction first to reduce the alloc size.
-    if (failed(tileToSerialLoops(funcOp))) {
+    if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) {
       return signalPassFailure();
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index 27b0ccd92a9c..64dc7e65b1a3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -29,8 +29,8 @@ extern llvm::cl::opt<bool> llvmgpuUseMMASync;
 //====---------------------------------------------------------------------===//
 
 static void populateVectorizationPatterns(RewritePatternSet &patterns) {
-  IREE::LinalgExt::LinalgTransformationFilter f(
-      StringAttr::get(patterns.getContext(), getVectorizeMarker()));
+  IREE::LinalgExt::LinalgTransformationFilter f(StringAttr::get(
+      patterns.getContext(), getVectorizeForTensorCoreMarker()));
   VectorizationPatterns<linalg::FillOp, linalg::GenericOp>::insert(patterns, f);
   patterns.add<LinalgVectorizationPattern>(
       patterns.getContext(), f.addOpFilter<linalg::ContractionOpInterface>());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
index 23f76a9779cc..64973d2123a9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
@@ -24,7 +24,14 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/Visitors.h"
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
@@ -117,9 +124,10 @@ static void populateTilingToWarpPatterns(
   auto getWarpProcInfoFn = [warpPerWorkgroup](
                                OpBuilder &builder, Location loc,
                                ArrayRef<Range> parallelLoopRanges) {
-    return getSubgroupIdsAndCounts(builder, loc, /*warpSize=*/32u,
+    return getSubgroupIdsAndCounts(builder, loc, kWarpSize,
                                    parallelLoopRanges.size(), warpPerWorkgroup);
   };
+
   linalg::LinalgLoopDistributionOptions warpDistributionOptions;
   warpDistributionOptions.procInfo = getWarpProcInfoFn;
 
@@ -129,17 +137,18 @@ static void populateTilingToWarpPatterns(
                            .setDistributionOptions(warpDistributionOptions);
   MLIRContext *context = patterns.getContext();
   IREE::LinalgExt::LinalgTransformationFilter filter(
-      {StringAttr::get(context, getWorkgroupKTiledMarker()),
-       StringAttr::get(context, getWorkgroupMemoryMarker())},
-      StringAttr::get(context, getVectorizeMarker()));
-  filter.setMatchByDefault();
+      {StringAttr::get(context, getGPUWarpLevelTilingReqMarker())},
+      StringAttr::get(context, getVectorizeForTensorCoreMarker()));
   TilingPatterns<linalg::MatmulOp, linalg::FillOp, linalg::BatchMatmulOp,
                  linalg::GenericOp>::insert(patterns, tilingOptions, filter);
 }
 
+using FilterFunction = std::function<LogicalResult(Operation *)>;
+
 /// Patterns for thread level tiling.
 static void populateTilingToInvocationPatterns(
-    RewritePatternSet &patterns, SmallVectorImpl<int64_t> &workgroupSize) {
+    RewritePatternSet &patterns, SmallVectorImpl<int64_t> &workgroupSize,
+    bool matchByDefault = true) {
   linalg::TileSizeComputationFunction getInnerTileSizeFn =
       [&](OpBuilder &builder, Operation *operation) {
         return calculateDistributedTileSize(workgroupSize, builder, operation);
@@ -162,17 +171,107 @@ static void populateTilingToInvocationPatterns(
   MLIRContext *context = patterns.getContext();
   IREE::LinalgExt::LinalgTransformationFilter f(
       {StringAttr::get(context, getWorkgroupKTiledMarker()),
-       StringAttr::get(context, getWorkgroupMemoryMarker())},
+       StringAttr::get(context, getWorkgroupMemoryMarker()),
+       StringAttr::get(context, getGPUSimtLoweringReqMarker())},
       StringAttr::get(context, getVectorizeMarker()));
   f.addFilter([](Operation *op) {
-     // FFT doesn't support second level of tiling yet.
-     return success(!isa<IREE::LinalgExt::FftOp>(op));
-   }).setMatchByDefault();
+    // FFT doesn't support second level of tiling yet.
+    return success(!isa<IREE::LinalgExt::FftOp>(op));
+  });
+  if (matchByDefault) f.setMatchByDefault();
   patterns.insert<IREE::LinalgExt::LinalgTilingPattern,
                   IREE::LinalgExt::TilingInterfaceTilingPattern>(
       context, tilingOptions, f);
 }
 
+static void markCandidates(func::FuncOp funcOp) {
+  funcOp.walk([](linalg::LinalgOp op) {
+    if (!isa<linalg::BatchMatmulOp, linalg::MatmulOp>(op))
+      return WalkResult::skip();
+
+    if (succeeded(alignedOpFilter(op))) {
+      setMarker(op, getGPUTensorCoreLoweringReqMarker());
+    } else {
+      setMarker(op, getGPUSimtLoweringReqMarker());
+    }
+    return WalkResult::advance();
+  });
+}
+
+static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) {
+  // mark which linarg op is a tensorcore
+  markCandidates(funcOp);
+
+  auto context = funcOp.getContext();
+  RewritePatternSet patterns(context);
+  auto tileSizesFn = [](OpBuilder &builder,
+                        Operation *op) -> SmallVector<Value, 4> {
+    auto interfaceOp = cast<PartitionableLoopsInterface>(*op);
+    auto partitionedLoops =
+        interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+    SmallVector<Value, 4> tileSizes = getTileSizes(builder, op, 0);
+    auto zero = builder.create<arith::ConstantIndexOp>(op->getLoc(), 0);
+    for (unsigned depth : partitionedLoops) {
+      if (depth < tileSizes.size()) {
+        tileSizes[depth] = zero;
+      }
+    }
+    return tileSizes;
+  };
+
+  auto tilingOptions =
+      linalg::LinalgTilingOptions()
+          .setLoopType(linalg::LinalgTilingLoopType::Loops)
+          .setTileSizeComputationFunction(tileSizesFn)
+          .setPeeledLoops({0});  // peel off the partial iterations
+
+  IREE::LinalgExt::LinalgTransformationFilter filter(
+      ArrayRef<StringAttr>{
+          StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())},
+      StringAttr::get(context, getWorkgroupKTiledMarker()));
+
+  TilingPatterns<linalg::MatmulOp, linalg::BatchMatmulOp>::insert(
+      patterns, tilingOptions, filter);
+
+  if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
+    return failure();
+  }
+
+  RewritePatternSet wgTilingCanonicalizationPatterns =
+      linalg::getLinalgTilingCanonicalizationPatterns(funcOp.getContext());
+  populateAffineMinSCFCanonicalizationPattern(wgTilingCanonicalizationPatterns);
+  scf::populateSCFForLoopCanonicalizationPatterns(
+      wgTilingCanonicalizationPatterns);
+  if (failed(applyPatternsAndFoldGreedily(
+          funcOp, std::move(wgTilingCanonicalizationPatterns)))) {
+    return failure();
+  }
+
+  return success();
+}
+
+// Get K dimension size. It returns kDynamicSize for unknown cases.
+static int64_t getSizeK(linalg::LinalgOp op) {
+  int64_t sizeK = ShapedType::kDynamicSize;
+
+  if (!isa<linalg::BatchMatmulOp, linalg::MatmulOp>(op)) return sizeK;
+
+  auto lhsShape =
+      op.getDpsInputOperand(0)->get().getType().cast<ShapedType>().getShape();
+  SmallVector<unsigned> exprs;
+  op.getReductionDims(exprs);
+  if (exprs.size() == 1) {
+    for (unsigned i = 0; i < lhsShape.size(); i++) {
+      if (op.getMatchingIndexingMap(op.getDpsInputOperand(0))
+              .getDimPosition(i) == exprs[0]) {
+        sizeK = lhsShape[i];
+        break;
+      }
+    }
+  }
+  return sizeK;
+}
+
 namespace {
 struct LLVMGPUTileAndDistributePass
     : public LLVMGPUTileAndDistributeBase<LLVMGPUTileAndDistributePass> {
@@ -191,7 +290,7 @@ struct LLVMGPUTileAndDistributePass
     auto funcOp = getOperation();
     if (!isEntryPoint(funcOp)) return;
 
-    // Promote C matrix and propagate the potential  fill producer into the temp
+    // Promote C matrix and propagate the potential fill producer into the temp
     // allocation. This needs to be done before reduction tiling.
     {
       RewritePatternSet promotionPatterns(&getContext());
@@ -200,13 +299,24 @@ struct LLVMGPUTileAndDistributePass
                                               std::move(promotionPatterns)))) {
         return signalPassFailure();
       }
+      LLVM_DEBUG({
+        llvm::dbgs() << "After promote C:\n";
+        funcOp.dump();
+      });
+
       propagateSharedMemoryCopy(funcOp);
+
+      LLVM_DEBUG({
+        llvm::dbgs() << "After propagateSharedMemoryCopy():\n";
+        funcOp.dump();
+      });
     }
 
     // Tile again at the workgroup level since reduction dimension were
     // ignored. Dimensions already tiled will be ignore since we tile to the
-    // same size.
-    if (failed(tileToSerialLoops(funcOp))) {
+    // same size. For distributing to warps, peel the partial iterations as
+    // a separate loop, since the warp distribution is requested for wmma.
+    if (failed(tileToSerialLoops(funcOp, /*peel=*/distributeToWarp))) {
       return signalPassFailure();
     }
 
@@ -226,7 +336,6 @@ struct LLVMGPUTileAndDistributePass
       RewritePatternSet promotionPatterns(&getContext());
 
       populateContractPromotionPatterns(promotionPatterns, {0, 1});
-
       if (failed(applyPatternsAndFoldGreedily(funcOp,
                                               std::move(promotionPatterns)))) {
         return signalPassFailure();
@@ -250,7 +359,34 @@ struct LLVMGPUTileAndDistributePass
     });
 
     if (distributeToWarp) {
-      // Apply last level of tiling and distribute to warps.
+      // mark candidates for the warp level tiling
+      funcOp.walk([&](linalg::LinalgOp op) {
+        if (failed(alignedOpFilter(op))) return WalkResult::skip();
+        if (!isa<linalg::BatchMatmulOp, linalg::MatmulOp, linalg::FillOp,
+                 linalg::GenericOp>(op))
+          return WalkResult::skip();
+
+        if (isa<linalg::GenericOp>(op) &&
+            hasMarker(op, getCopyToWorkgroupMemoryMarker())) {
+          // The GPUDistributeSharedMemoryCopy pass will handle it later.
+          return WalkResult::skip();
+        }
+
+        // check if K is a multiple of Tile-K.
+        int64_t sizeK = getSizeK(op);
+        if (sizeK != ShapedType::kDynamicSize) {
+          // WG tile sizes
+          auto wgTileSizes = getTileSizes(op, 0);
+
+          if (sizeK % wgTileSizes[wgTileSizes.size() - 1] != 0)
+            return WalkResult::skip();
+        }
+
+        setMarker(op, getGPUWarpLevelTilingReqMarker());
+        return WalkResult::advance();
+      });
+
+      // Apply last level of tiling and distribute to warps for aligned ops.
       RewritePatternSet warpLevelTilingPatterns(context);
       populateTilingToWarpPatterns(warpLevelTilingPatterns, workgroupSize);
       if (failed(applyPatternsAndFoldGreedily(
@@ -258,6 +394,15 @@ struct LLVMGPUTileAndDistributePass
         return signalPassFailure();
       }
 
+      // Apply last level of tiling and distribute to threads for unaligned ops.
+      RewritePatternSet threadLevelTilingPatterns(context);
+      populateTilingToInvocationPatterns(threadLevelTilingPatterns,
+                                         workgroupSize,
+                                         /*matchByDefault=*/false);
+      if (failed(applyPatternsAndFoldGreedily(
+              funcOp, std::move(threadLevelTilingPatterns)))) {
+        return signalPassFailure();
+      }
     } else {
       // Apply last level of tiling and distribute to threads.
       RewritePatternSet threadLevelTilingPatterns(context);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp
index ae0ef3e6edc4..a9c2a7810f16 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp
@@ -32,20 +32,18 @@ namespace iree_compiler {
 /// Patterns for workgroup level tiling. Workgroup tiling is done at the flow
 /// level but we may have extra tiling for the reduction dimension. Therefore we
 /// tile again without distributing.
-static void populateTilingPatterns(RewritePatternSet &patterns,
-                                   bool onlyReduction) {
-  auto tileSizesFn = [onlyReduction](OpBuilder &builder,
-                                     Operation *op) -> SmallVector<Value, 4> {
+static void populateReductionTilingPatterns(RewritePatternSet &patterns,
+                                            bool peel) {
+  auto tileSizesFn = [](OpBuilder &builder,
+                        Operation *op) -> SmallVector<Value, 4> {
     auto interfaceOp = cast<PartitionableLoopsInterface>(*op);
     auto partitionedLoops =
         interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
     SmallVector<Value, 4> tileSizes = getTileSizes(builder, op, 0);
-    if (onlyReduction) {
-      auto zero = builder.create<arith::ConstantIndexOp>(op->getLoc(), 0);
-      for (unsigned depth : partitionedLoops) {
-        if (depth < tileSizes.size()) {
-          tileSizes[depth] = zero;
-        }
+    auto zero = builder.create<arith::ConstantIndexOp>(op->getLoc(), 0);
+    for (unsigned depth : partitionedLoops) {
+      if (depth < tileSizes.size()) {
+        tileSizes[depth] = zero;
       }
     }
     return tileSizes;
@@ -54,6 +52,7 @@ static void populateTilingPatterns(RewritePatternSet &patterns,
   auto tilingOptions = linalg::LinalgTilingOptions()
                            .setLoopType(linalg::LinalgTilingLoopType::Loops)
                            .setTileSizeComputationFunction(tileSizesFn);
+  if (peel) tilingOptions.setPeeledLoops({0});
   MLIRContext *context = patterns.getContext();
 
   IREE::LinalgExt::LinalgTransformationFilter filter(
@@ -67,13 +66,13 @@ static void populateTilingPatterns(RewritePatternSet &patterns,
                                                    filter);
 }
 
-LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction) {
+LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel) {
   {
-    // Tile again at the workgroup level since redution dimension were
+    // Tile again at the workgroup level since reduction dimension were
     // ignored. Dimensions already tiled will be ignore since we tile to the
     // same size.
     RewritePatternSet wgTilingPatterns(funcOp.getContext());
-    populateTilingPatterns(wgTilingPatterns, onlyReduction);
+    populateReductionTilingPatterns(wgTilingPatterns, peel);
     if (failed(applyPatternsAndFoldGreedily(funcOp,
                                             std::move(wgTilingPatterns)))) {
       return failure();
@@ -223,7 +222,7 @@ struct LLVMGPUTileTensorPass
 
     // Tile to serial loops to the wg tile size to handle reductions and other
     // dimension that have not been distributed.
-    if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) {
+    if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) {
       return signalPassFailure();
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 8d05e0d179a8..8322ba5a36ec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -199,6 +199,12 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm,
   tileAndBufferize(pm);
 
   auto &nestedModulePM = pm.nest<ModuleOp>();
+  // Do specialization
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createWorkgroupSpecializationPass());
+  nestedModulePM.addPass(createCanonicalizerPass());
+  nestedModulePM.addPass(createCSEPass());
+
   // Distribute linalg onto warps within the workgroup.
   nestedModulePM.addNestedPass<func::FuncOp>(
       createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h
index 7ef1b93324d4..8616a92d6fc0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h
@@ -16,7 +16,7 @@ class FuncOp;
 namespace iree_compiler {
 
 /// Apply tiling to reduction dimensions based on op attributes.
-LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction = true);
+LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel);
 
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
index c04186c75ea6..8395a1adf329 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
@@ -188,20 +188,20 @@ LogicalResult verifyGPUMatmulTensorCorePipeline(
 
   // Verify the first level tile size divides the matmul
   // inputs A [M x K] & B [K x N]
-  if (lhsShape[0] % firstLevelTileSizes[0] != 0 ||
-      lhsShape[1] % firstLevelTileSizes[2] != 0) {
-    return op->emitOpError(
-               "lhsShape doesn't factor into first level tile size for ")
-           << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1]
-           << "]";
-  }
-  if (rhsShape[0] % firstLevelTileSizes[2] != 0 ||
-      rhsShape[1] % firstLevelTileSizes[1] != 0) {
-    return op->emitOpError(
-               "rhsShape doesn't factor into first level tile size for ")
-           << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1]
-           << "]";
-  }
+  // if (lhsShape[0] % firstLevelTileSizes[0] != 0 ||
+  //     lhsShape[1] % firstLevelTileSizes[2] != 0) {
+  //   return op->emitOpError(
+  //              "lhsShape doesn't factor into first level tile size for ")
+  //          << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1]
+  //          << "]";
+  // }
+  // if (rhsShape[0] % firstLevelTileSizes[2] != 0 ||
+  //     rhsShape[1] % firstLevelTileSizes[1] != 0) {
+  //   return op->emitOpError(
+  //              "rhsShape doesn't factor into first level tile size for ")
+  //          << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1]
+  //          << "]";
+  // }
 
   // Verify shared memory usage of operands after tiling requires <= 64Kb
   // combined space.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 5164192a28e1..a9136ba3430b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -224,70 +224,6 @@ hal.executable private @matmul_tensors {
 
 // -----
 
-#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-hal.executable private @matmul_tensors {
-  hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
-    hal.executable.export @illegal layout(#pipeline_layout) attributes {
-      translation_info = #translation,
-      workgroup_size = [64 : index, 2 : index, 1 : index]
-    }
-    builtin.module {
-      func.func @illegal() {
-        %c0 = arith.constant 0 : index
-        %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32>
-        %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
-        %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x32xf32>
-        // expected-error @+1 {{lhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}}
-        linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<48x16xf32>, memref<16x32xf32>)
-          outs(%result: memref<48x32xf32>)
-        return
-      }
-    }
-  }
-}
-
-// -----
-
-#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-hal.executable private @matmul_tensors {
-  hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
-    hal.executable.export @illegal layout(#pipeline_layout) attributes {
-      translation_info = #translation,
-      workgroup_size = [64 : index, 2 : index, 1 : index]
-    }
-    builtin.module {
-      func.func @illegal() {
-        %c0 = arith.constant 0 : index
-        %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
-        %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32>
-        %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32>
-        // expected-error @+1 {{rhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}}
-        linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x48xf32>)
-          outs(%result: memref<32x48xf32>)
-        return
-      }
-    }
-  }
-}
-
-// -----
-
 #config = #iree_codegen.lowering_config<tile_sizes = [[2, 32, 32, 16]]>
 #translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
index 425e406127ae..ba6ed805fe55 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
@@ -20,7 +20,7 @@ func.func @dot() {
   %10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9]
   %11 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 32)>(%8)
   %12 = memref.subview %7[%10, %11] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
-  linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
+  linalg.fill {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
   scf.for %arg0 = %c0 to %c1024 step %c16 {
     %13 = memref.subview %5[0, %arg0] [64, 16] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
     %14 = memref.subview %6[%arg0, 0] [16, 64] [1, 1] : memref<1024x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
@@ -28,7 +28,7 @@ func.func @dot() {
     %16 = memref.subview %13[%10, 0] [32, 16] [1, 1] : memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
     %17 = memref.subview %14[0, %15] [16, 32] [1, 1] : memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
     %18 = memref.subview %7[%10, %15] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
-    linalg.matmul {__internal_linalg_transform__ = "vectorize"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
+    linalg.matmul {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
   }
   return
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
index 3703d9f75930..ce78695b3e6e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
@@ -42,11 +42,12 @@ module attributes {hal.device.targets = [#hal.device.target<"cuda", {executable_
 // CHECK-LABEL: func.func @forward_dispatch_116_matmul_128x30522x768
 //       CHECK: arith.cmpi eq
 //       CHECK: scf.if
-//       CHECK:   vector.transfer_read
-//       CHECK:   vector.transfer_read
-//       CHECK:   vector.contract
-//       CHECK:   vector.transfer_read
-//       CHECK:   vector.broadcast
-//       CHECK:   vector.transfer_write
+//       CHECK:   gpu.subgroup_mma_load_matrix
+//       CHECK:   gpu.subgroup_mma_load_matrix
+//       CHECK:   gpu.subgroup_mma_load_matrix
+//       CHECK:   gpu.subgroup_mma_load_matrix
+//       CHECK:   gpu.subgroup_mma_compute
+//       CHECK:   gpu.subgroup_mma_compute
+//       CHECK:   gpu.subgroup_mma_store_matrix
 //       CHECK: else
 //   CHECK-NOT:   vector.transfer
diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp
index e62239e9f81f..bb44b7681187 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp
@@ -36,6 +36,20 @@ StringRef getWorkgroupMemoryNumItemsGENumItersMarker() {
   return "workgroup_memory_numprocs_ge_numiters";
 }
 
+StringRef getWorkgroupSpecializationMarker() {
+  return "workgroup_specialization";
+}
+
+StringRef getGPUSimtLoweringReqMarker() { return "gpu_simt_lowering_req"; }
+
+StringRef getGPUTensorCoreLoweringReqMarker() {
+  return "gpu_tensorcore_lowering_req";
+}
+
+StringRef getGPUWarpLevelTilingReqMarker() {
+  return "gpu_warp_level_tiling_req";
+}
+
 StringRef getCopyToWorkgroupMemoryMarker() {
   return "copy_to_workgroup_memory";
 }
@@ -44,6 +58,10 @@ StringRef getTileReductionMarker() { return "tile_reduction"; }
 
 StringRef getVectorizeMarker() { return "vectorize"; }
 
+StringRef getVectorizeForTensorCoreMarker() {
+  return "vectorize_for_tensorcore";
+}
+
 StringRef getDeleteMarker() { return "delete"; }
 
 StringRef getMarkerOrNull(Operation *op) {
@@ -67,5 +85,13 @@ void setMarker(Operation *op, StringRef marker) {
               StringAttr::get(op->getContext(), marker));
 }
 
+Operation *findAncestorWithMarker(Operation *op, StringRef marker) {
+  while (op) {
+    if (hasMarker(op, marker)) return op;
+    op = op->getParentOp();
+  };
+  return nullptr;
+}
+
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h
index 2057ede82128..f62ddbba6cd7 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h
@@ -33,6 +33,19 @@ StringRef getWorkgroupMemoryMarker();
 /// to workgroups L1 tiles.
 StringRef getWorkgroupL1TileMarker();
 
+/// Marker to scf.IfOp to denote that the op is inserted for workgroup
+/// specialization.
+StringRef getWorkgroupSpecializationMarker();
+
+/// Marker to denote that the op is marked for SIMT lowering
+StringRef getGPUSimtLoweringReqMarker();
+
+/// Marker to denote that the op is marked for tensorcore lowering
+StringRef getGPUTensorCoreLoweringReqMarker();
+
+/// Marker to denote that the op is marked for warp-level tiling
+StringRef getGPUWarpLevelTilingReqMarker();
+
 /// Marker for copy operations that are moving data from StorageClass to
 /// Workgroup memory.
 StringRef getCopyToWorkgroupMemoryMarker();
@@ -43,6 +56,9 @@ StringRef getTileReductionMarker();
 /// Marker for operations that are going to be vectorized.
 StringRef getVectorizeMarker();
 
+/// Marker for operations that are going to be vectorized for tensorcore.
+StringRef getVectorizeForTensorCoreMarker();
+
 /// Marker for tagging an operation for deletion. Tile and fuse pattern does
 /// not delete the original operation to not invalidate the
 /// `linalg::LinalgDependenceGraph` data structure. Instead it is marked with
@@ -59,6 +75,9 @@ bool hasMarker(Operation *, ArrayRef<StringRef> markers = {});
 /// Sets a given marker on an operation.
 void setMarker(Operation *, StringRef);
 
+/// Find an ancester with the given marker.
+Operation *findAncestorWithMarker(Operation *op, StringRef marker);
+
 }  // namespace iree_compiler
 }  // namespace mlir
 
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index 03ae9f65fa53..600ba9bbcafe 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -677,5 +677,31 @@ void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val,
   for (Operation *op : opToDelete) op->erase();
 }
 
+LogicalResult alignedOpFilter(Operation *op) {
+  Operation *opWithMarker =
+      findAncestorWithMarker(op, getWorkgroupSpecializationMarker());
+
+  if (opWithMarker) {
+    auto ifOp = cast<scf::IfOp>(opWithMarker);
+    return success(ifOp.getThenRegion().isAncestor(op->getParentRegion()));
+  } else {
+    return success();
+  }
+}
+
+LogicalResult unalignedOpFilter(Operation *op) {
+  Operation *opWithMarker =
+      findAncestorWithMarker(op, getWorkgroupSpecializationMarker());
+
+  if (opWithMarker) {
+    auto ifOp = cast<scf::IfOp>(opWithMarker);
+    return success(ifOp.getElseRegion().isAncestor(op->getParentRegion()));
+  } else {
+    // When there is no workgroup specialization, it means the op is already
+    // aligned.
+    return failure();
+  }
+}
+
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
index 506f707e9217..37479dfca7b5 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
@@ -175,6 +175,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions();
 void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val,
                                        OpBuilder &builder);
 
+/// Returns true if an op is aligned by checking if
+///   1. the op is inside the workgroup-specialized region, or
+///   2. the op's parent is not the workgroup-specialized region.
+/// The second case does not have a workgroup-spcialized region because
+/// it is already aligned.
+LogicalResult alignedOpFilter(Operation *op);
+
+/// Returns true if an op is unaligned by checking if
+///   1. the op is inside the workgroup-specialized region, or
+///   2. the op's parent is not the workgroup-specialized region.
+/// The second case does not have a workgroup-spcialized region because
+/// it is already aligned.
+LogicalResult unalignedOpFilter(Operation *op);
+
 }  // namespace iree_compiler
 }  // namespace mlir