From fe8fc5f8f41a2a8aaf5b2589e6118705dc0aa5ae Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 4 Nov 2022 15:03:30 -0700 Subject: [PATCH 01/27] fix typos --- compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 2 +- .../iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 2 +- .../src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index e6dd8cd20809..221bcec8d1c3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -410,7 +410,7 @@ static LogicalResult setRootDefaultConfig(func::FuncOp entryPoint, } auto linalgOp = dyn_cast(op); - // Pick a vectorSize of 1 for op that we know won't get vectorizedd. + // Pick a vectorSize of 1 for op that we know won't get vectorized. // Also skip vectorization for linalg on memref (no result) as the pipeline // relies on tensor level tiling. // TODO(thomasraoux): This could be improved by checking if the linalg op diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 23f76a9779cc..2e2776778348 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -191,7 +191,7 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; - // Promote C matrix and propagate the potential fill producer into the temp + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. 
{ RewritePatternSet promotionPatterns(&getContext()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index ae0ef3e6edc4..6348ddf1d05a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -69,7 +69,7 @@ static void populateTilingPatterns(RewritePatternSet &patterns, LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction) { { - // Tile again at the workgroup level since redution dimension were + // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. RewritePatternSet wgTilingPatterns(funcOp.getContext()); From a2da08e031fb19ab33ad6a28aa9417a66af37c6e Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Mon, 7 Nov 2022 08:58:43 -0800 Subject: [PATCH 02/27] WIP: allow unaligned tensor sizes for tensor core --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 5 ++-- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 6 ++++ .../compiler/Codegen/LLVMGPU/Verifiers.cpp | 28 +++++++++---------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 221bcec8d1c3..fbd0cf41264f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,9 +227,8 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // tile size. 
for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - if (sizeK % config.tileSize[2] == 0 && - sizeN % config.tileSize[1] == 0 && - sizeM % config.tileSize[0] == 0) { + if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && + sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], config.workgroupSize, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8d05e0d179a8..8322ba5a36ec 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -199,6 +199,12 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm, tileAndBufferize(pm); auto &nestedModulePM = pm.nest(); + // Do specialization + nestedModulePM.addNestedPass( + createWorkgroupSpecializationPass()); + nestedModulePM.addPass(createCanonicalizerPass()); + nestedModulePM.addPass(createCSEPass()); + // Distribute linalg onto warps within the workgroup. 
nestedModulePM.addNestedPass( createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true)); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp index c04186c75ea6..8395a1adf329 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp @@ -188,20 +188,20 @@ LogicalResult verifyGPUMatmulTensorCorePipeline( // Verify the first level tile size divides the matmul // inputs A [M x K] & B [K x N] - if (lhsShape[0] % firstLevelTileSizes[0] != 0 || - lhsShape[1] % firstLevelTileSizes[2] != 0) { - return op->emitOpError( - "lhsShape doesn't factor into first level tile size for ") - << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1] - << "]"; - } - if (rhsShape[0] % firstLevelTileSizes[2] != 0 || - rhsShape[1] % firstLevelTileSizes[1] != 0) { - return op->emitOpError( - "rhsShape doesn't factor into first level tile size for ") - << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1] - << "]"; - } + // if (lhsShape[0] % firstLevelTileSizes[0] != 0 || + // lhsShape[1] % firstLevelTileSizes[2] != 0) { + // return op->emitOpError( + // "lhsShape doesn't factor into first level tile size for ") + // << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1] + // << "]"; + // } + // if (rhsShape[0] % firstLevelTileSizes[2] != 0 || + // rhsShape[1] % firstLevelTileSizes[1] != 0) { + // return op->emitOpError( + // "rhsShape doesn't factor into first level tile size for ") + // << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1] + // << "]"; + // } // Verify shared memory usage of operands after tiling requires <= 64Kb // combined space. 
From c2793ea0626b38f929e949f897391ad79e33c3ff Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Tue, 8 Nov 2022 18:37:37 -0800 Subject: [PATCH 03/27] filter out ops with unaligned tensor sizes --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 2e2776778348..2f5d17132d3e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -103,6 +103,29 @@ static SmallVector calculateDistributedTileSize( return tileSizesVal; } +/// Returns true if a contract op has a multiple of its tile sizes. +static LogicalResult alignedContractOpFilter(Operation *op) { + auto linalgOp = dyn_cast(op); + if (!linalgOp) return failure(); + + if (linalgOp.hasDynamicShape()) return failure(); + + // matmul or batch matmul + bool isElible = linalg::isaContractionOpInterface(op) && + linalgOp.getNumParallelLoops() >= 2 && + linalgOp.getNumParallelLoops() <= 3; + if (!isElible) return failure(); + + SmallVector wgTileSizes = getTileSizes(op, 0); + if (wgTileSizes.empty()) return failure(); + + SmallVector sizes = linalgOp.getStaticLoopRanges(); + for (unsigned i = 0, e = sizes.size(); i != e; ++i) { + if (sizes[i] % wgTileSizes[i] != 0) return failure(); + } + return success(); +} + /// Patterns for warp level tiling. 
static void populateTilingToWarpPatterns( RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { @@ -120,6 +143,7 @@ static void populateTilingToWarpPatterns( return getSubgroupIdsAndCounts(builder, loc, /*warpSize=*/32u, parallelLoopRanges.size(), warpPerWorkgroup); }; + linalg::LinalgLoopDistributionOptions warpDistributionOptions; warpDistributionOptions.procInfo = getWarpProcInfoFn; @@ -133,6 +157,8 @@ static void populateTilingToWarpPatterns( StringAttr::get(context, getWorkgroupMemoryMarker())}, StringAttr::get(context, getVectorizeMarker())); filter.setMatchByDefault(); + // Bail out the case where the tensor sizes are not a multiple of tile sizes. + filter.addFilter(alignedContractOpFilter); TilingPatterns::insert(patterns, tilingOptions, filter); } From cf7b50d62c919fbedcbba2663f18ec7a126dd27e Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 10:50:25 -0800 Subject: [PATCH 04/27] add a marker for workgroup specialization --- .../compiler/Codegen/Common/WorkgroupSpecializationPass.cpp | 2 ++ compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp | 4 ++++ compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp index 5a1d1da2386f..09d2b9105a3a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp @@ -36,6 +36,7 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/PassDetail.h" #include "iree/compiler/Codegen/Passes.h" +#include "iree/compiler/Codegen/Utils/MarkerUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -220,6 +221,7 @@ static void specializeDistributionLoops( // generate scf.if 
%cond auto ifOp = builder.create(loc, cond, /*withElseRegion=*/true); + setMarker(ifOp, getWorkgroupSpecializationMarker()); // Transfer the original body to the scf.else body. auto origBodyBegin = ++Block::iterator(ifOp); diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index e62239e9f81f..f76dbf39a75c 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -36,6 +36,10 @@ StringRef getWorkgroupMemoryNumItemsGENumItersMarker() { return "workgroup_memory_numprocs_ge_numiters"; } +StringRef getWorkgroupSpecializationMarker() { + return "workgroup_specialization"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 2057ede82128..50178cf62ae7 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -33,6 +33,10 @@ StringRef getWorkgroupMemoryMarker(); /// to workgroups L1 tiles. StringRef getWorkgroupL1TileMarker(); +/// Marker to scf.IfOp to denote that the op is inserted for workgroup +/// specialization. +StringRef getWorkgroupSpecializationMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. 
StringRef getCopyToWorkgroupMemoryMarker(); From dab6480ca25dd424fe7c62d01a50a92e1ae191cd Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 10:50:57 -0800 Subject: [PATCH 05/27] add findAncestorWithMarker(op, marker) --- compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp | 8 ++++++++ compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index f76dbf39a75c..0399f119891d 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -71,5 +71,13 @@ void setMarker(Operation *op, StringRef marker) { StringAttr::get(op->getContext(), marker)); } +Operation *findAncestorWithMarker(Operation *op, StringRef marker) { + while (op) { + if (hasMarker(op, marker)) return op; + op = op->getParentOp(); + }; + return nullptr; +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 50178cf62ae7..80e11b7d2115 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -63,6 +63,9 @@ bool hasMarker(Operation *, ArrayRef markers = {}); /// Sets a given marker on an operation. void setMarker(Operation *, StringRef); +/// Find an ancestor with the given marker. 
+Operation *findAncestorWithMarker(Operation *op, StringRef marker); + } // namespace iree_compiler } // namespace mlir From 9c742712f8dbe58b04b08f03604d50fad0810276 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 10:51:29 -0800 Subject: [PATCH 06/27] Do warp distribution only when an op is aligned --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 2f5d17132d3e..24e5aa68e066 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -24,6 +24,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Matchers.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -103,27 +104,21 @@ static SmallVector calculateDistributedTileSize( return tileSizesVal; } -/// Returns true if a contract op has a multiple of its tile sizes. -static LogicalResult alignedContractOpFilter(Operation *op) { - auto linalgOp = dyn_cast(op); - if (!linalgOp) return failure(); - - if (linalgOp.hasDynamicShape()) return failure(); - - // matmul or batch matmul - bool isElible = linalg::isaContractionOpInterface(op) && - linalgOp.getNumParallelLoops() >= 2 && - linalgOp.getNumParallelLoops() <= 3; - if (!isElible) return failure(); - - SmallVector wgTileSizes = getTileSizes(op, 0); - if (wgTileSizes.empty()) return failure(); - - SmallVector sizes = linalgOp.getStaticLoopRanges(); - for (unsigned i = 0, e = sizes.size(); i != e; ++i) { - if (sizes[i] % wgTileSizes[i] != 0) return failure(); +/// Returns true if an op is aligned by checking if +/// 1. 
the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-spcialized region because +/// it is already aligned. +static LogicalResult alignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); + } else { + return success(); } - return success(); } /// Patterns for warp level tiling. @@ -158,7 +153,7 @@ static void populateTilingToWarpPatterns( StringAttr::get(context, getVectorizeMarker())); filter.setMatchByDefault(); // Bail out the case where the tensor sizes are not a multiple of tile sizes. - filter.addFilter(alignedContractOpFilter); + filter.addFilter(alignedOpFilter); TilingPatterns::insert(patterns, tilingOptions, filter); } From 3e5622698f1863dab96990b17d5cffe4044b2cd3 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 20:22:04 -0800 Subject: [PATCH 07/27] apply thread-level tiling for unaligned matmul op --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 24e5aa68e066..3433a6b6c9d2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -121,6 +121,25 @@ static LogicalResult alignedOpFilter(Operation *op) { } } +/// Returns true if an op is unaligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-spcialized region because +/// it is already aligned. 
+static LogicalResult unalignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); + } else { + // When there is no workgroup specialization, it means the op is already + // aligned. + return failure(); + } +} + /// Patterns for warp level tiling. static void populateTilingToWarpPatterns( RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { @@ -158,9 +177,12 @@ static void populateTilingToWarpPatterns( linalg::GenericOp>::insert(patterns, tilingOptions, filter); } +using FilterFunction = std::function; + /// Patterns for thread level tiling. static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, + const FilterFunction &ff = nullptr) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -189,6 +211,8 @@ static void populateTilingToInvocationPatterns( // FFT doesn't support second level of tiling yet. return success(!isa(op)); }).setMatchByDefault(); + // Add the user provided filter if available. + if (ff) f.addFilter(ff); patterns.insert( context, tilingOptions, f); @@ -271,7 +295,7 @@ struct LLVMGPUTileAndDistributePass }); if (distributeToWarp) { - // Apply last level of tiling and distribute to warps. + // Apply last level of tiling and distribute to warps for aligned ops. RewritePatternSet warpLevelTilingPatterns(context); populateTilingToWarpPatterns(warpLevelTilingPatterns, workgroupSize); if (failed(applyPatternsAndFoldGreedily( @@ -279,6 +303,14 @@ struct LLVMGPUTileAndDistributePass return signalPassFailure(); } + // Apply last level of tiling and distribute to threads for unaligned ops. 
+ RewritePatternSet threadLevelTilingPatterns(context); + populateTilingToInvocationPatterns(threadLevelTilingPatterns, + workgroupSize, unalignedOpFilter); + if (failed(applyPatternsAndFoldGreedily( + funcOp, std::move(threadLevelTilingPatterns)))) { + return signalPassFailure(); + } } else { // Apply last level of tiling and distribute to threads. RewritePatternSet threadLevelTilingPatterns(context); From 846d488c25200c415f8eab40d309b0e5666a6038 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 9 Nov 2022 20:44:06 -0800 Subject: [PATCH 08/27] Do not fail for LLVMGPUMultiBuffering There are cases where it can fail, so skip reporting it as a pass failure. --- .../iree/compiler/Codegen/Common/GPUMultiBuffering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp index e8d8e18a1a7b..1dbc93ed1623 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp @@ -38,10 +38,10 @@ struct GPUMultiBufferingPass }); // Apply multi-buffering to all of them. for (memref::AllocOp alloc : allocs) { - if (failed(memref::multiBuffer(alloc, numBuffers))) - // Stop if any buffer cannot be multi buffered as pipelining will assume - // this happened. - return signalPassFailure(); + if (failed(memref::multiBuffer(alloc, numBuffers))) { + // There can be a failing case. Continue processing eligible ones. 
+ continue; + } } } From 9395678f14bac475c7a806eed36bb7625aa42e3f Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 12:00:18 -0800 Subject: [PATCH 09/27] move aligned and unaligned op filters to Utils --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 36 ------------------- .../src/iree/compiler/Codegen/Utils/Utils.cpp | 26 ++++++++++++++ .../src/iree/compiler/Codegen/Utils/Utils.h | 14 ++++++++ 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 3433a6b6c9d2..d9c3ba271745 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -104,42 +104,6 @@ static SmallVector calculateDistributedTileSize( return tileSizesVal; } -/// Returns true if an op is aligned by checking if -/// 1. the op is inside the workgroup-specialized region, or -/// 2. the op's parent is not the workgroup-specialized region. -/// The second case does not have a workgroup-spcialized region because -/// it is already aligned. -static LogicalResult alignedOpFilter(Operation *op) { - Operation *opWithMarker = - findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); - - if (opWithMarker) { - auto ifOp = cast(opWithMarker); - return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); - } else { - return success(); - } -} - -/// Returns true if an op is unaligned by checking if -/// 1. the op is inside the workgroup-specialized region, or -/// 2. the op's parent is not the workgroup-specialized region. -/// The second case does not have a workgroup-spcialized region because -/// it is already aligned. 
-static LogicalResult unalignedOpFilter(Operation *op) { - Operation *opWithMarker = - findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); - - if (opWithMarker) { - auto ifOp = cast(opWithMarker); - return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); - } else { - // When there is no workgroup specialization, it means the op is already - // aligned. - return failure(); - } -} - /// Patterns for warp level tiling. static void populateTilingToWarpPatterns( RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp index 03ae9f65fa53..600ba9bbcafe 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp @@ -677,5 +677,31 @@ void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val, for (Operation *op : opToDelete) op->erase(); } +LogicalResult alignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); + } else { + return success(); + } +} + +LogicalResult unalignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); + } else { + // When there is no workgroup specialization, it means the op is already + // aligned. 
+ return failure(); + } +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h index 506f707e9217..37479dfca7b5 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h @@ -175,6 +175,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(); void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val, OpBuilder &builder); +/// Returns true if an op is aligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-specialized region because +/// it is already aligned. +LogicalResult alignedOpFilter(Operation *op); + +/// Returns true if an op is unaligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-specialized region because +/// it is already aligned. 
+LogicalResult unalignedOpFilter(Operation *op); + } // namespace iree_compiler } // namespace mlir From e5d4f83b8aa9b338d3a6fcd89b2d4fa469f12df1 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 12:04:56 -0800 Subject: [PATCH 10/27] do tensorcore vectorization only when the candidate op is aligned --- .../compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp index 27b0ccd92a9c..5bb7bdfb6022 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp @@ -31,6 +31,8 @@ extern llvm::cl::opt llvmgpuUseMMASync; static void populateVectorizationPatterns(RewritePatternSet &patterns) { IREE::LinalgExt::LinalgTransformationFilter f( StringAttr::get(patterns.getContext(), getVectorizeMarker())); + // only handle aligned ops + f.addFilter(alignedOpFilter); VectorizationPatterns::insert(patterns, f); patterns.add( patterns.getContext(), f.addOpFilter()); From 38b578779a54a88630c64f9d6dc0dcf632e9ba01 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 15:23:50 -0800 Subject: [PATCH 11/27] Bail out unaligned K dimension from tensorcore specialization --- compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index fbd0cf41264f..69d75cda0ee0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,7 +227,8 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // 
tile size. for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && + // TODO: support unaligned K size + if (sizeK % config.tileSize[2] == 0 && sizeN >= config.tileSize[1] && sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], From 8211f9944cd2126759895ddae41579b6425f9546 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 10 Nov 2022 16:24:18 -0800 Subject: [PATCH 12/27] update the unit tests --- .../LLVMGPU/test/illegal_configuration.mlir | 64 ------------------- ...orkgroup_specialization_pipeline_test.mlir | 13 ++-- 2 files changed, 7 insertions(+), 70 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir index 5164192a28e1..a9136ba3430b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir @@ -224,70 +224,6 @@ hal.executable private @matmul_tensors { // ----- -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @matmul_tensors { - hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { - hal.executable.export @illegal layout(#pipeline_layout) attributes { - translation_info = #translation, - workgroup_size = [64 : index, 2 : index, 1 : index] - } - builtin.module { - func.func @illegal() { - %c0 = arith.constant 0 : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32> - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %result = hal.interface.binding.subspan 
set(0) binding(2) type(storage_buffer) : memref<48x32xf32> - // expected-error @+1 {{lhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<48x16xf32>, memref<16x32xf32>) - outs(%result: memref<48x32xf32>) - return - } - } - } -} - -// ----- - -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @matmul_tensors { - hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { - hal.executable.export @illegal layout(#pipeline_layout) attributes { - translation_info = #translation, - workgroup_size = [64 : index, 2 : index, 1 : index] - } - builtin.module { - func.func @illegal() { - %c0 = arith.constant 0 : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32> - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32> - // expected-error @+1 {{rhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x48xf32>) - outs(%result: memref<32x48xf32>) - return - } - } - } -} - -// ----- - #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir index 3703d9f75930..ce78695b3e6e 100644 --- 
a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir @@ -42,11 +42,12 @@ module attributes {hal.device.targets = [#hal.device.target<"cuda", {executable_ // CHECK-LABEL: func.func @forward_dispatch_116_matmul_128x30522x768 // CHECK: arith.cmpi eq // CHECK: scf.if -// CHECK: vector.transfer_read -// CHECK: vector.transfer_read -// CHECK: vector.contract -// CHECK: vector.transfer_read -// CHECK: vector.broadcast -// CHECK: vector.transfer_write +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_compute +// CHECK: gpu.subgroup_mma_compute +// CHECK: gpu.subgroup_mma_store_matrix // CHECK: else // CHECK-NOT: vector.transfer From 157d3f1aa035a1bed852385bc5cb8349e4f947a2 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 14:22:39 -0800 Subject: [PATCH 13/27] Add a filter param to populateContractPromotionPatterns() --- .../compiler/Codegen/Common/GPUPatterns.cpp | 17 +++++++++++------ .../iree/compiler/Codegen/Common/GPUPatterns.h | 7 +++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp index a69b187c00ce..8f4936c8e79b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Codegen/Common/GPUPatterns.h" +#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" #include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/MarkerUtils.h" @@ -165,8 +166,11 @@ void populateVectorTransferToGPUMMAPreparationPatterns( 
patterns.add(patterns.getContext()); } +using LinalgTransformationFilter = IREE::LinalgExt::LinalgTransformationFilter; + void populateContractPromotionPatterns(RewritePatternSet &patterns, - ArrayRef operandsToPromote) { + ArrayRef operandsToPromote, + LinalgTransformationFilter *filter) { MLIRContext *context = patterns.getContext(); patterns.insert, LinalgPromotionPattern, @@ -178,11 +182,12 @@ void populateContractPromotionPatterns(RewritePatternSet &patterns, .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory) .setOperandsToPromote(operandsToPromote) .setUseFullTileBuffers({false, false}), - IREE::LinalgExt::LinalgTransformationFilter( - {StringAttr::get(context, getWorkgroupKTiledMarker())}, - StringAttr::get(context, getWorkgroupMemoryMarker())) - .setMatchByDefault() - .addFilter(contractOpFilter)); + filter ? *filter + : IREE::LinalgExt::LinalgTransformationFilter( + {StringAttr::get(context, getWorkgroupKTiledMarker())}, + StringAttr::get(context, getWorkgroupMemoryMarker())) + .setMatchByDefault() + .addFilter(contractOpFilter)); } } // namespace iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h index deefb38740ac..7f9afc865d44 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h @@ -7,7 +7,9 @@ #ifndef IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ #define IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ +#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" #include "mlir/IR/PatternMatch.h" + namespace mlir { namespace iree_compiler { @@ -18,8 +20,9 @@ void populateVectorTransferToGPUMMAPreparationPatterns( /// Adds patterns for promoting Linalg contract op's operands to use GPU shared /// memory. 
-void populateContractPromotionPatterns(RewritePatternSet &patterns, - ArrayRef operandsToPromote); +void populateContractPromotionPatterns( + RewritePatternSet &patterns, ArrayRef operandsToPromote, + IREE::LinalgExt::LinalgTransformationFilter *filter = nullptr); } // namespace iree_compiler } // namespace mlir From 1309942c485208f940baa8801bca81c9e9132175 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 14:23:32 -0800 Subject: [PATCH 14/27] Enable unaligned K for tensorcore specialization --- compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 69d75cda0ee0..fbd0cf41264f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,8 +227,7 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // tile size. 
for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - // TODO: support unaligned K size - if (sizeK % config.tileSize[2] == 0 && sizeN >= config.tileSize[1] && + if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], From d5c6cb01bf47c2bed750171639283f62f97315d1 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 15:10:04 -0800 Subject: [PATCH 15/27] Mark tensorcore and SIMT lowering candidates --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 21 +++++++++++++++++++ .../compiler/Codegen/Utils/MarkerUtils.cpp | 6 ++++++ .../iree/compiler/Codegen/Utils/MarkerUtils.h | 6 ++++++ 3 files changed, 33 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index d9c3ba271745..1301a705e40d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -24,8 +24,11 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/Visitors.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -182,6 +185,20 @@ static void populateTilingToInvocationPatterns( context, tilingOptions, f); } +static void markCandidates(func::FuncOp funcOp) { + funcOp.walk([](linalg::LinalgOp op) { + if (!isa(op)) + return WalkResult::skip(); + + if (succeeded(alignedOpFilter(op))) { + setMarker(op, getGPUTensorCoreLoweringReqMarker()); + } else { + setMarker(op, getGPUSimtLoweringReqMarker()); + } + return 
WalkResult::advance(); + }); +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -200,6 +217,10 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; + // Mark lowering candidates. An op can be a tensorcore or SIMT lowering + // candidate. + markCandidates(funcOp); + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. { diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index 0399f119891d..d357d742a651 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -40,6 +40,12 @@ StringRef getWorkgroupSpecializationMarker() { return "workgroup_specialization"; } +StringRef getGPUSimtLoweringReqMarker() { return "gpu_simt_lowering_req"; } + +StringRef getGPUTensorCoreLoweringReqMarker() { + return "gpu_tensorcore_lowering_req"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 80e11b7d2115..2b94cad19222 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -37,6 +37,12 @@ StringRef getWorkgroupL1TileMarker(); /// specialization. StringRef getWorkgroupSpecializationMarker(); +/// Marker to denote that the op is marked for SIMT lowering +StringRef getGPUSimtLoweringReqMarker(); + +/// Marker to denote that the op is marked for tensorcore lowering +StringRef getGPUTensorCoreLoweringReqMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. 
StringRef getCopyToWorkgroupMemoryMarker(); From fd4b182b1aa0a987950a3d4178bb6686ef793c00 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 16:43:57 -0800 Subject: [PATCH 16/27] Use the tensorcore marker to serialize the K loop --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 65 ++++++++++++++++++- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 1301a705e40d..e246074e895b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -27,7 +27,9 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/OperationSupport.h" #include "mlir/IR/Visitors.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -199,6 +201,58 @@ static void markCandidates(func::FuncOp funcOp) { }); } +static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { + // mark which linarg op is a tensorcore + markCandidates(funcOp); + + auto context = funcOp.getContext(); + RewritePatternSet patterns(context); + auto tileSizesFn = [](OpBuilder &builder, + Operation *op) -> SmallVector { + auto interfaceOp = cast(*op); + auto partitionedLoops = + interfaceOp.getPartitionableLoops(kNumMaxParallelDims); + SmallVector tileSizes = getTileSizes(builder, op, 0); + auto zero = builder.create(op->getLoc(), 0); + for (unsigned depth : partitionedLoops) { + if (depth < tileSizes.size()) { + tileSizes[depth] = zero; + } + } + return tileSizes; + }; + + auto tilingOptions = + linalg::LinalgTilingOptions() + .setLoopType(linalg::LinalgTilingLoopType::Loops) + .setTileSizeComputationFunction(tileSizesFn) + 
.setPeeledLoops({0}); // peel off the partial iterations + + IREE::LinalgExt::LinalgTransformationFilter filter( + ArrayRef{ + StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}, + StringAttr::get(context, getWorkgroupKTiledMarker())); + + TilingPatterns::insert(patterns, tilingOptions, filter); + + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { + return failure(); + } + + RewritePatternSet wgTilingCanonicalizationPatterns = + linalg::getLinalgTilingCanonicalizationPatterns(funcOp.getContext()); + populateAffineMinSCFCanonicalizationPattern(wgTilingCanonicalizationPatterns); + scf::populateSCFForLoopCanonicalizationPatterns( + wgTilingCanonicalizationPatterns); + if (failed(applyPatternsAndFoldGreedily( + funcOp, std::move(wgTilingCanonicalizationPatterns)))) { + return failure(); + } + + return success(); +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -221,11 +275,15 @@ struct LLVMGPUTileAndDistributePass // candidate. markCandidates(funcOp); + auto tensorcoreFilter = IREE::LinalgExt::LinalgTransformationFilter( + {StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}); + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. { RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {2}); + populateContractPromotionPatterns(promotionPatterns, {2}, + &tensorcoreFilter); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); @@ -236,7 +294,7 @@ struct LLVMGPUTileAndDistributePass // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. 
- if (failed(tileToSerialLoops(funcOp))) { + if (failed(tileTensorCoreKDim(funcOp))) { return signalPassFailure(); } @@ -255,7 +313,8 @@ struct LLVMGPUTileAndDistributePass if (flatWorkgroupSize > kWarpSize) { RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {0, 1}); + populateContractPromotionPatterns(promotionPatterns, {0, 1}, + &tensorcoreFilter); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { From bccc04cf5cc66b60fc2f2854df38312185486855 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Wed, 16 Nov 2022 16:55:30 -0800 Subject: [PATCH 17/27] do not handle genericOp in the tensorcore path --- .../compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index e246074e895b..d80e505a0d64 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -189,7 +189,7 @@ static void populateTilingToInvocationPatterns( static void markCandidates(func::FuncOp funcOp) { funcOp.walk([](linalg::LinalgOp op) { - if (!isa(op)) + if (!isa(op)) return WalkResult::skip(); if (succeeded(alignedOpFilter(op))) { @@ -233,8 +233,8 @@ static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}, StringAttr::get(context, getWorkgroupKTiledMarker())); - TilingPatterns::insert(patterns, tilingOptions, filter); + TilingPatterns::insert( + patterns, tilingOptions, filter); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { return failure(); From 74a23ee7ef9be6a12b316eae7a10c4bbe7cda664 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 11:50:06 -0800 Subject: [PATCH 18/27] Do warp-level 
tiling selectively using a marker --- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 51 ++++++++++++++++--- .../compiler/Codegen/Utils/MarkerUtils.cpp | 4 ++ .../iree/compiler/Codegen/Utils/MarkerUtils.h | 3 ++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index d80e505a0d64..140d3194859c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -28,6 +28,7 @@ #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/Visitors.h" @@ -123,7 +124,7 @@ static void populateTilingToWarpPatterns( auto getWarpProcInfoFn = [warpPerWorkgroup]( OpBuilder &builder, Location loc, ArrayRef parallelLoopRanges) { - return getSubgroupIdsAndCounts(builder, loc, /*warpSize=*/32u, + return getSubgroupIdsAndCounts(builder, loc, kWarpSize, parallelLoopRanges.size(), warpPerWorkgroup); }; @@ -136,12 +137,8 @@ static void populateTilingToWarpPatterns( .setDistributionOptions(warpDistributionOptions); MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( - {StringAttr::get(context, getWorkgroupKTiledMarker()), - StringAttr::get(context, getWorkgroupMemoryMarker())}, + {StringAttr::get(context, getGPUWarpLevelTilingReqMarker())}, StringAttr::get(context, getVectorizeMarker())); - filter.setMatchByDefault(); - // Bail out the case where the tensor sizes are not a multiple of tile sizes. 
- filter.addFilter(alignedOpFilter); TilingPatterns::insert(patterns, tilingOptions, filter); } @@ -253,6 +250,28 @@ static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { return success(); } +// Get K dimension size. It returns kDynamicSize for unknown cases. +static int64_t getSizeK(linalg::LinalgOp op) { + int64_t sizeK = ShapedType::kDynamicSize; + + if (!isa(op)) return sizeK; + + auto lhsShape = + op.getDpsInputOperand(0)->get().getType().cast().getShape(); + SmallVector exprs; + op.getReductionDims(exprs); + if (exprs.size() == 1) { + for (unsigned i = 0; i < lhsShape.size(); i++) { + if (op.getMatchingIndexingMap(op.getDpsInputOperand(0)) + .getDimPosition(i) == exprs[0]) { + sizeK = lhsShape[i]; + break; + } + } + } + return sizeK; +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -339,6 +358,26 @@ struct LLVMGPUTileAndDistributePass }); if (distributeToWarp) { + // mark candidates for the warp level tiling + funcOp.walk([&](linalg::LinalgOp op) { + if (failed(alignedOpFilter(op))) return WalkResult::skip(); + if (!isa(op)) + return WalkResult::skip(); + + // check if K is a multiple of Tile-K. + int64_t sizeK = getSizeK(op); + if (sizeK != ShapedType::kDynamicSize) { + // WG tile sizes + auto wgTileSizes = getTileSizes(op, 0); + + if (sizeK % wgTileSizes[wgTileSizes.size() - 1] != 0) + return WalkResult::skip(); + } + + setMarker(op, getGPUWarpLevelTilingReqMarker()); + return WalkResult::advance(); + }); + // Apply last level of tiling and distribute to warps for aligned ops. 
RewritePatternSet warpLevelTilingPatterns(context); populateTilingToWarpPatterns(warpLevelTilingPatterns, workgroupSize); diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index d357d742a651..358e4c048168 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -46,6 +46,10 @@ StringRef getGPUTensorCoreLoweringReqMarker() { return "gpu_tensorcore_lowering_req"; } +StringRef getGPUWarpLevelTilingReqMarker() { + return "gpu_warp_level_tiling_req"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 2b94cad19222..89bb8a953e0c 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -43,6 +43,9 @@ StringRef getGPUSimtLoweringReqMarker(); /// Marker to denote that the op is marked for tensorcore lowering StringRef getGPUTensorCoreLoweringReqMarker(); +/// Marker to denote that the op is marked for warp-level tiling +StringRef getGPUWarpLevelTilingReqMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. 
StringRef getCopyToWorkgroupMemoryMarker(); From 04dc39b3f2a21e4d4cf06a7ad3cef6f434f63e70 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 14:13:17 -0800 Subject: [PATCH 19/27] do thread-level tiling for the partial K and unspecialized ops --- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 140d3194859c..9162c520685a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -171,12 +171,13 @@ static void populateTilingToInvocationPatterns( MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter f( {StringAttr::get(context, getWorkgroupKTiledMarker()), - StringAttr::get(context, getWorkgroupMemoryMarker())}, - StringAttr::get(context, getVectorizeMarker())); + StringAttr::get(context, getWorkgroupMemoryMarker()), + StringAttr::get(context, getGPUSimtLoweringReqMarker()) + }); f.addFilter([](Operation *op) { // FFT doesn't support second level of tiling yet. return success(!isa(op)); - }).setMatchByDefault(); + }); // Add the user provided filter if available. 
if (ff) f.addFilter(ff); patterns.insert Date: Thu, 17 Nov 2022 21:21:49 -0800 Subject: [PATCH 20/27] promote inputs and output always --- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 9162c520685a..b7a19467bd34 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -291,19 +291,11 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; - // Mark lowering candidates. An op can be a tensorcore or SIMT lowering - // candidate. - markCandidates(funcOp); - - auto tensorcoreFilter = IREE::LinalgExt::LinalgTransformationFilter( - {StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}); - // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. 
{ RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {2}, - &tensorcoreFilter); + populateContractPromotionPatterns(promotionPatterns, {2}); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); @@ -333,9 +325,7 @@ struct LLVMGPUTileAndDistributePass if (flatWorkgroupSize > kWarpSize) { RewritePatternSet promotionPatterns(&getContext()); - populateContractPromotionPatterns(promotionPatterns, {0, 1}, - &tensorcoreFilter); - + populateContractPromotionPatterns(promotionPatterns, {0, 1}); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); From 3b67ab89bf59d8ccce1ab9b79e95569b89a2bf98 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 21:43:40 -0800 Subject: [PATCH 21/27] NFC: remove default value from tileToSerialLoops --- .../src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp | 2 +- .../iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 2 +- compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp index d2f43b69896e..551eba1d66ff 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp @@ -83,7 +83,7 @@ struct LLVMGPUTensorAllocPass auto funcOp = getOperation(); // Tile the reduction first to reduce the alloc size. 
- if (failed(tileToSerialLoops(funcOp))) { + if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index b7a19467bd34..77f851815455 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -306,7 +306,7 @@ struct LLVMGPUTileAndDistributePass // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. - if (failed(tileTensorCoreKDim(funcOp))) { + if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h index 7ef1b93324d4..6303ee86d79f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h @@ -16,7 +16,7 @@ class FuncOp; namespace iree_compiler { /// Apply tiling to reduction dimensions based on op attributes. -LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction = true); +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction); } // namespace iree_compiler } // namespace mlir From c8daa2b18d1f7987fddc9424ef76ac3745ee57c2 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Thu, 17 Nov 2022 21:55:18 -0800 Subject: [PATCH 22/27] Remove onlyReduction from tileToSerialLoops and add peel The function is always used for reduction cases, so let's remove the flag. In addition, add `peel` to peel the serial loop when the warp-level tiling is requested. 
--- .../Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp | 2 +- .../LLVMGPU/LLVMGPUTileAndDistribute.cpp | 5 ++-- .../Codegen/LLVMGPU/LLVMGPUTileTensor.cpp | 23 +++++++++---------- .../compiler/Codegen/LLVMGPU/TilingUtils.h | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp index 551eba1d66ff..3164f3d3dfae 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp @@ -83,7 +83,7 @@ struct LLVMGPUTensorAllocPass auto funcOp = getOperation(); // Tile the reduction first to reduce the alloc size. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 77f851815455..a1b9656a4a64 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -305,8 +305,9 @@ struct LLVMGPUTileAndDistributePass // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the - // same size. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + // same size. For distributing to warps, peel the partial iterations as + // a separate loop, since the warp distribution is requested for wmma. 
+ if (failed(tileToSerialLoops(funcOp, /*peel=*/distributeToWarp))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index 6348ddf1d05a..aeb1140b7675 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -32,20 +32,18 @@ namespace iree_compiler { /// Patterns for workgroup level tiling. Workgroup tiling is done at the flow /// level but we may have extra tiling for the reduction dimension. Therefore we /// tile again without distributing. -static void populateTilingPatterns(RewritePatternSet &patterns, - bool onlyReduction) { - auto tileSizesFn = [onlyReduction](OpBuilder &builder, +static void populateReductionTilingPatterns(RewritePatternSet &patterns, + bool peel) { + auto tileSizesFn = [](OpBuilder &builder, Operation *op) -> SmallVector { auto interfaceOp = cast(*op); auto partitionedLoops = interfaceOp.getPartitionableLoops(kNumMaxParallelDims); SmallVector tileSizes = getTileSizes(builder, op, 0); - if (onlyReduction) { - auto zero = builder.create(op->getLoc(), 0); - for (unsigned depth : partitionedLoops) { - if (depth < tileSizes.size()) { - tileSizes[depth] = zero; - } + auto zero = builder.create(op->getLoc(), 0); + for (unsigned depth : partitionedLoops) { + if (depth < tileSizes.size()) { + tileSizes[depth] = zero; } } return tileSizes; @@ -54,6 +52,7 @@ static void populateTilingPatterns(RewritePatternSet &patterns, auto tilingOptions = linalg::LinalgTilingOptions() .setLoopType(linalg::LinalgTilingLoopType::Loops) .setTileSizeComputationFunction(tileSizesFn); + if (peel) tilingOptions.setPeeledLoops({0}); MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( @@ -67,13 +66,13 @@ static void populateTilingPatterns(RewritePatternSet &patterns, filter); } -LogicalResult 
tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction) { +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel) { { // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. RewritePatternSet wgTilingPatterns(funcOp.getContext()); - populateTilingPatterns(wgTilingPatterns, onlyReduction); + populateReductionTilingPatterns(wgTilingPatterns, peel); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(wgTilingPatterns)))) { return failure(); @@ -223,7 +222,7 @@ struct LLVMGPUTileTensorPass // Tile to serial loops to the wg tile size to handle reductions and other // dimension that have not been distributed. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h index 6303ee86d79f..8616a92d6fc0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h @@ -16,7 +16,7 @@ class FuncOp; namespace iree_compiler { /// Apply tiling to reduction dimensions based on op attributes. 
-LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction); +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel); } // namespace iree_compiler } // namespace mlir From a4cb7b5669c06c43a82bf1d2316dfc3e9a1b82c4 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Nov 2022 13:49:13 -0800 Subject: [PATCH 23/27] Use `vectorize_for_tensorcore` marker --- .../LLVMGPU/LLVMGPUTensorCoreVectorization.cpp | 6 ++---- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 15 ++++++--------- .../Codegen/LLVMGPU/LLVMGPUTileTensor.cpp | 2 +- .../LLVMGPU/test/tensorcore_vectorization.mlir | 4 ++-- .../iree/compiler/Codegen/Utils/MarkerUtils.cpp | 4 ++++ .../src/iree/compiler/Codegen/Utils/MarkerUtils.h | 3 +++ 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp index 5bb7bdfb6022..64dc7e65b1a3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp @@ -29,10 +29,8 @@ extern llvm::cl::opt llvmgpuUseMMASync; //====---------------------------------------------------------------------===// static void populateVectorizationPatterns(RewritePatternSet &patterns) { - IREE::LinalgExt::LinalgTransformationFilter f( - StringAttr::get(patterns.getContext(), getVectorizeMarker())); - // only handle aligned ops - f.addFilter(alignedOpFilter); + IREE::LinalgExt::LinalgTransformationFilter f(StringAttr::get( + patterns.getContext(), getVectorizeForTensorCoreMarker())); VectorizationPatterns::insert(patterns, f); patterns.add( patterns.getContext(), f.addOpFilter()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index a1b9656a4a64..56db23351023 100644 --- 
a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -138,7 +138,7 @@ static void populateTilingToWarpPatterns( MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( {StringAttr::get(context, getGPUWarpLevelTilingReqMarker())}, - StringAttr::get(context, getVectorizeMarker())); + StringAttr::get(context, getVectorizeForTensorCoreMarker())); TilingPatterns::insert(patterns, tilingOptions, filter); } @@ -147,8 +147,7 @@ using FilterFunction = std::function; /// Patterns for thread level tiling. static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, - const FilterFunction &ff = nullptr) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -172,14 +171,12 @@ static void populateTilingToInvocationPatterns( IREE::LinalgExt::LinalgTransformationFilter f( {StringAttr::get(context, getWorkgroupKTiledMarker()), StringAttr::get(context, getWorkgroupMemoryMarker()), - StringAttr::get(context, getGPUSimtLoweringReqMarker()) - }); + StringAttr::get(context, getGPUSimtLoweringReqMarker())}, + StringAttr::get(context, getVectorizeMarker())); f.addFilter([](Operation *op) { - // FFT doesn't support second level of tiling yet. - return success(!isa(op)); + // FFT doesn't support second level of tiling yet. + return success(!isa(op)); }); - // Add the user provided filter if available. 
- if (ff) f.addFilter(ff); patterns.insert( context, tilingOptions, f); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index aeb1140b7675..a9c2a7810f16 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -35,7 +35,7 @@ namespace iree_compiler { static void populateReductionTilingPatterns(RewritePatternSet &patterns, bool peel) { auto tileSizesFn = [](OpBuilder &builder, - Operation *op) -> SmallVector { + Operation *op) -> SmallVector { auto interfaceOp = cast(*op); auto partitionedLoops = interfaceOp.getPartitionableLoops(kNumMaxParallelDims); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir index 425e406127ae..ba6ed805fe55 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir @@ -20,7 +20,7 @@ func.func @dot() { %10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9] %11 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 32)>(%8) %12 = memref.subview %7[%10, %11] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> - linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) + linalg.fill {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) scf.for %arg0 = %c0 to %c1024 step %c16 { %13 = memref.subview %5[0, %arg0] [64, 16] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x16xf32, 
affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> %14 = memref.subview %6[%arg0, 0] [16, 64] [1, 1] : memref<1024x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> @@ -28,7 +28,7 @@ func.func @dot() { %16 = memref.subview %13[%10, 0] [32, 16] [1, 1] : memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> %17 = memref.subview %14[0, %15] [16, 32] [1, 1] : memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> %18 = memref.subview %7[%10, %15] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> - linalg.matmul {__internal_linalg_transform__ = "vectorize"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) + linalg.matmul {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) } return } diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index 358e4c048168..bb44b7681187 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -58,6 +58,10 @@ StringRef getTileReductionMarker() { return "tile_reduction"; } StringRef getVectorizeMarker() { return "vectorize"; } +StringRef getVectorizeForTensorCoreMarker() { + return "vectorize_for_tensorcore"; +} + 
StringRef getDeleteMarker() { return "delete"; } StringRef getMarkerOrNull(Operation *op) { diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 89bb8a953e0c..f62ddbba6cd7 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -56,6 +56,9 @@ StringRef getTileReductionMarker(); /// Marker for operations that are going to be vectorized. StringRef getVectorizeMarker(); +/// Marker for operations that are going to be vectorized for tensorcore. +StringRef getVectorizeForTensorCoreMarker(); + /// Marker for tagging an operation for deletion. Tile and fuse pattern does /// not delete the original operation to not invalidate the /// `linalg::LinalgDependenceGraph` data structure. Instead it is marked with From 6d7305bad311ddb9a69da5fce37f485b9fe38c09 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Nov 2022 15:04:37 -0800 Subject: [PATCH 24/27] match by default for the non-tensorcore flow --- .../compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 56db23351023..75b3a874ef80 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -147,7 +147,8 @@ using FilterFunction = std::function; /// Patterns for thread level tiling. 
static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, + bool matchByDefault = true) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -177,6 +178,7 @@ static void populateTilingToInvocationPatterns( // FFT doesn't support second level of tiling yet. return success(!isa(op)); }); + if (matchByDefault) f.setMatchByDefault(); patterns.insert( context, tilingOptions, f); @@ -378,7 +380,8 @@ struct LLVMGPUTileAndDistributePass // Apply last level of tiling and distribute to threads for unaligned ops. RewritePatternSet threadLevelTilingPatterns(context); populateTilingToInvocationPatterns(threadLevelTilingPatterns, - workgroupSize); + workgroupSize, + /*matchByDefault=*/false); if (failed(applyPatternsAndFoldGreedily( funcOp, std::move(threadLevelTilingPatterns)))) { return signalPassFailure(); From d7fe191e5cfc642814f13e65071c3e7d6e29f718 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Nov 2022 15:40:45 -0800 Subject: [PATCH 25/27] add missing genericOp support for tensorcore lowering --- .../iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 75b3a874ef80..e4dd64cdbc79 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -352,7 +352,8 @@ struct LLVMGPUTileAndDistributePass // mark candidates for the warp level tiling funcOp.walk([&](linalg::LinalgOp op) { if (failed(alignedOpFilter(op))) return WalkResult::skip(); - if (!isa(op)) + if (!isa(op)) return WalkResult::skip(); // 
check if K is a multiple of Tile-K. From 23e68dbedb1aa95b1e4c654cd0d20520b223c478 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Sat, 19 Nov 2022 16:28:17 -0800 Subject: [PATCH 26/27] add more debug prints to LLVMGPUTileAndDistribute --- .../Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index e4dd64cdbc79..c6f393f9a9bb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -299,7 +299,17 @@ struct LLVMGPUTileAndDistributePass std::move(promotionPatterns)))) { return signalPassFailure(); } + LLVM_DEBUG({ + llvm::dbgs() << "After promote C:\n"; + funcOp.dump(); + }); + propagateSharedMemoryCopy(funcOp); + + LLVM_DEBUG({ + llvm::dbgs() << "After propagateSharedMemoryCopy():\n"; + funcOp.dump(); + }); } // Tile again at the workgroup level since reduction dimension were @@ -353,7 +363,7 @@ struct LLVMGPUTileAndDistributePass funcOp.walk([&](linalg::LinalgOp op) { if (failed(alignedOpFilter(op))) return WalkResult::skip(); if (!isa(op)) + linalg::GenericOp>(op)) return WalkResult::skip(); // check if K is a multiple of Tile-K. 
From 1f8c8c9a8adc8682ba9f192646b473d7bbf34a3b Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Sat, 19 Nov 2022 16:37:25 -0800 Subject: [PATCH 27/27] Do not handle the fused op with warp-level tiling --- .../compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index c6f393f9a9bb..64973d2123a9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -366,6 +366,12 @@ struct LLVMGPUTileAndDistributePass linalg::GenericOp>(op)) return WalkResult::skip(); + if (isa(op) && + hasMarker(op, getCopyToWorkgroupMemoryMarker())) { + // The GPUDistributeSharedMemoryCopy pass will handle it later. + return WalkResult::skip(); + } + // check if K is a multiple of Tile-K. int64_t sizeK = getSizeK(op); if (sizeK != ShapedType::kDynamicSize) {