diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp index e8d8e18a1a7b..1dbc93ed1623 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPUMultiBuffering.cpp @@ -38,10 +38,10 @@ struct GPUMultiBufferingPass }); // Apply multi-buffering to all of them. for (memref::AllocOp alloc : allocs) { - if (failed(memref::multiBuffer(alloc, numBuffers))) - // Stop if any buffer cannot be multi buffered as pipelining will assume - // this happened. - return signalPassFailure(); + if (failed(memref::multiBuffer(alloc, numBuffers))) { + // There can be a failing case. Continue processing eligible ones. + continue; + } } } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp index a69b187c00ce..8f4936c8e79b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Codegen/Common/GPUPatterns.h" +#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" #include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/MarkerUtils.h" @@ -165,8 +166,11 @@ void populateVectorTransferToGPUMMAPreparationPatterns( patterns.add(patterns.getContext()); } +using LinalgTransformationFilter = IREE::LinalgExt::LinalgTransformationFilter; + void populateContractPromotionPatterns(RewritePatternSet &patterns, - ArrayRef operandsToPromote) { + ArrayRef operandsToPromote, + LinalgTransformationFilter *filter) { MLIRContext *context = patterns.getContext(); patterns.insert, LinalgPromotionPattern, @@ -178,11 +182,12 @@ void populateContractPromotionPatterns(RewritePatternSet &patterns, .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory) .setOperandsToPromote(operandsToPromote) .setUseFullTileBuffers({false, false}), - IREE::LinalgExt::LinalgTransformationFilter( - {StringAttr::get(context, getWorkgroupKTiledMarker())}, - StringAttr::get(context, getWorkgroupMemoryMarker())) - .setMatchByDefault() - .addFilter(contractOpFilter)); + filter ? *filter + : IREE::LinalgExt::LinalgTransformationFilter( + {StringAttr::get(context, getWorkgroupKTiledMarker())}, + StringAttr::get(context, getWorkgroupMemoryMarker())) + .setMatchByDefault() + .addFilter(contractOpFilter)); } } // namespace iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h index deefb38740ac..7f9afc865d44 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPUPatterns.h @@ -7,7 +7,9 @@ #ifndef IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ #define IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ +#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" #include "mlir/IR/PatternMatch.h" + namespace mlir { namespace iree_compiler { @@ -18,8 +20,9 @@ void populateVectorTransferToGPUMMAPreparationPatterns( /// Adds patterns for promoting Linalg contract op's operands to use GPU shared /// memory. -void populateContractPromotionPatterns(RewritePatternSet &patterns, - ArrayRef operandsToPromote); +void populateContractPromotionPatterns( + RewritePatternSet &patterns, ArrayRef operandsToPromote, + IREE::LinalgExt::LinalgTransformationFilter *filter = nullptr); } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp index 5a1d1da2386f..09d2b9105a3a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/WorkgroupSpecializationPass.cpp @@ -36,6 +36,7 @@ #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/PassDetail.h" #include "iree/compiler/Codegen/Passes.h" +#include "iree/compiler/Codegen/Utils/MarkerUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -220,6 +221,7 @@ static void specializeDistributionLoops( // generate scf.if %cond auto ifOp = builder.create(loc, cond, /*withElseRegion=*/true); + setMarker(ifOp, getWorkgroupSpecializationMarker()); // Transfer the original body to the scf.else body. auto origBodyBegin = ++Block::iterator(ifOp); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index e6dd8cd20809..fbd0cf41264f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -227,9 +227,8 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // Pick the best configuration where the original shape is aligned on the // tile size. for (TileWorkgroupSizePair &config : TCtileSizeConfig) { - if (sizeK % config.tileSize[2] == 0 && - sizeN % config.tileSize[1] == 0 && - sizeM % config.tileSize[0] == 0) { + if (sizeK >= config.tileSize[2] && sizeN >= config.tileSize[1] && + sizeM >= config.tileSize[0]) { return setMatmulConfig( config.tileSize[0], config.tileSize[1], config.tileSize[2], config.workgroupSize, @@ -410,7 +409,7 @@ static LogicalResult setRootDefaultConfig(func::FuncOp entryPoint, } auto linalgOp = dyn_cast(op); - // Pick a vectorSize of 1 for op that we know won't get vectorizedd. + // Pick a vectorSize of 1 for op that we know won't get vectorized. // Also skip vectorization for linalg on memref (no result) as the pipeline // relies on tensor level tiling. // TODO(thomasraoux): This could be improved by checking if the linalg op diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp index d2f43b69896e..3164f3d3dfae 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp @@ -83,7 +83,7 @@ struct LLVMGPUTensorAllocPass auto funcOp = getOperation(); // Tile the reduction first to reduce the alloc size. - if (failed(tileToSerialLoops(funcOp))) { + if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp index 27b0ccd92a9c..64dc7e65b1a3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp @@ -29,8 +29,8 @@ extern llvm::cl::opt llvmgpuUseMMASync; //====---------------------------------------------------------------------===// static void populateVectorizationPatterns(RewritePatternSet &patterns) { - IREE::LinalgExt::LinalgTransformationFilter f( - StringAttr::get(patterns.getContext(), getVectorizeMarker())); + IREE::LinalgExt::LinalgTransformationFilter f(StringAttr::get( + patterns.getContext(), getVectorizeForTensorCoreMarker())); VectorizationPatterns::insert(patterns, f); patterns.add( patterns.getContext(), f.addOpFilter()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp index 23f76a9779cc..64973d2123a9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp @@ -24,7 +24,14 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Transforms.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/Visitors.h" #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -117,9 +124,10 @@ static void populateTilingToWarpPatterns( auto getWarpProcInfoFn = [warpPerWorkgroup]( OpBuilder &builder, Location loc, ArrayRef parallelLoopRanges) { - return getSubgroupIdsAndCounts(builder, loc, /*warpSize=*/32u, + return getSubgroupIdsAndCounts(builder, loc, kWarpSize, parallelLoopRanges.size(), warpPerWorkgroup); }; + linalg::LinalgLoopDistributionOptions warpDistributionOptions; warpDistributionOptions.procInfo = getWarpProcInfoFn; @@ -129,17 +137,18 @@ static void populateTilingToWarpPatterns( .setDistributionOptions(warpDistributionOptions); MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( - {StringAttr::get(context, getWorkgroupKTiledMarker()), - StringAttr::get(context, getWorkgroupMemoryMarker())}, - StringAttr::get(context, getVectorizeMarker())); - filter.setMatchByDefault(); + {StringAttr::get(context, getGPUWarpLevelTilingReqMarker())}, + StringAttr::get(context, getVectorizeForTensorCoreMarker())); TilingPatterns::insert(patterns, tilingOptions, filter); } +using FilterFunction = std::function; + /// Patterns for thread level tiling. static void populateTilingToInvocationPatterns( - RewritePatternSet &patterns, SmallVectorImpl &workgroupSize) { + RewritePatternSet &patterns, SmallVectorImpl &workgroupSize, + bool matchByDefault = true) { linalg::TileSizeComputationFunction getInnerTileSizeFn = [&](OpBuilder &builder, Operation *operation) { return calculateDistributedTileSize(workgroupSize, builder, operation); @@ -162,17 +171,107 @@ static void populateTilingToInvocationPatterns( MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter f( {StringAttr::get(context, getWorkgroupKTiledMarker()), - StringAttr::get(context, getWorkgroupMemoryMarker())}, + StringAttr::get(context, getWorkgroupMemoryMarker()), + StringAttr::get(context, getGPUSimtLoweringReqMarker())}, StringAttr::get(context, getVectorizeMarker())); f.addFilter([](Operation *op) { - // FFT doesn't support second level of tiling yet. - return success(!isa(op)); - }).setMatchByDefault(); + // FFT doesn't support second level of tiling yet. + return success(!isa(op)); + }); + if (matchByDefault) f.setMatchByDefault(); patterns.insert( context, tilingOptions, f); } +static void markCandidates(func::FuncOp funcOp) { + funcOp.walk([](linalg::LinalgOp op) { + if (!isa(op)) + return WalkResult::skip(); + + if (succeeded(alignedOpFilter(op))) { + setMarker(op, getGPUTensorCoreLoweringReqMarker()); + } else { + setMarker(op, getGPUSimtLoweringReqMarker()); + } + return WalkResult::advance(); + }); +} + +static LogicalResult tileTensorCoreKDim(func::FuncOp funcOp) { + // mark which linarg op is a tensorcore + markCandidates(funcOp); + + auto context = funcOp.getContext(); + RewritePatternSet patterns(context); + auto tileSizesFn = [](OpBuilder &builder, + Operation *op) -> SmallVector { + auto interfaceOp = cast(*op); + auto partitionedLoops = + interfaceOp.getPartitionableLoops(kNumMaxParallelDims); + SmallVector tileSizes = getTileSizes(builder, op, 0); + auto zero = builder.create(op->getLoc(), 0); + for (unsigned depth : partitionedLoops) { + if (depth < tileSizes.size()) { + tileSizes[depth] = zero; + } + } + return tileSizes; + }; + + auto tilingOptions = + linalg::LinalgTilingOptions() + .setLoopType(linalg::LinalgTilingLoopType::Loops) + .setTileSizeComputationFunction(tileSizesFn) + .setPeeledLoops({0}); // peel off the partial iterations + + IREE::LinalgExt::LinalgTransformationFilter filter( + ArrayRef{ + StringAttr::get(context, getGPUTensorCoreLoweringReqMarker())}, + StringAttr::get(context, getWorkgroupKTiledMarker())); + + TilingPatterns::insert( + patterns, tilingOptions, filter); + + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { + return failure(); + } + + RewritePatternSet wgTilingCanonicalizationPatterns = + linalg::getLinalgTilingCanonicalizationPatterns(funcOp.getContext()); + populateAffineMinSCFCanonicalizationPattern(wgTilingCanonicalizationPatterns); + scf::populateSCFForLoopCanonicalizationPatterns( + wgTilingCanonicalizationPatterns); + if (failed(applyPatternsAndFoldGreedily( + funcOp, std::move(wgTilingCanonicalizationPatterns)))) { + return failure(); + } + + return success(); +} + +// Get K dimension size. It returns kDynamicSize for unknown cases. +static int64_t getSizeK(linalg::LinalgOp op) { + int64_t sizeK = ShapedType::kDynamicSize; + + if (!isa(op)) return sizeK; + + auto lhsShape = + op.getDpsInputOperand(0)->get().getType().cast().getShape(); + SmallVector exprs; + op.getReductionDims(exprs); + if (exprs.size() == 1) { + for (unsigned i = 0; i < lhsShape.size(); i++) { + if (op.getMatchingIndexingMap(op.getDpsInputOperand(0)) + .getDimPosition(i) == exprs[0]) { + sizeK = lhsShape[i]; + break; + } + } + } + return sizeK; +} + namespace { struct LLVMGPUTileAndDistributePass : public LLVMGPUTileAndDistributeBase { @@ -191,7 +290,7 @@ struct LLVMGPUTileAndDistributePass auto funcOp = getOperation(); if (!isEntryPoint(funcOp)) return; - // Promote C matrix and propagate the potential fill producer into the temp + // Promote C matrix and propagate the potential fill producer into the temp // allocation. This needs to be done before reduction tiling. { RewritePatternSet promotionPatterns(&getContext()); @@ -200,13 +299,24 @@ struct LLVMGPUTileAndDistributePass std::move(promotionPatterns)))) { return signalPassFailure(); } + LLVM_DEBUG({ + llvm::dbgs() << "After promote C:\n"; + funcOp.dump(); + }); + propagateSharedMemoryCopy(funcOp); + + LLVM_DEBUG({ + llvm::dbgs() << "After propagateSharedMemoryCopy():\n"; + funcOp.dump(); + }); } // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the - // same size. - if (failed(tileToSerialLoops(funcOp))) { + // same size. For distributing to warps, peel the partial iterations as + // a separate loop, since the warp distribution is requested for wmma. + if (failed(tileToSerialLoops(funcOp, /*peel=*/distributeToWarp))) { return signalPassFailure(); } @@ -226,7 +336,6 @@ struct LLVMGPUTileAndDistributePass RewritePatternSet promotionPatterns(&getContext()); populateContractPromotionPatterns(promotionPatterns, {0, 1}); - if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(promotionPatterns)))) { return signalPassFailure(); @@ -250,7 +359,34 @@ struct LLVMGPUTileAndDistributePass }); if (distributeToWarp) { - // Apply last level of tiling and distribute to warps. + // mark candidates for the warp level tiling + funcOp.walk([&](linalg::LinalgOp op) { + if (failed(alignedOpFilter(op))) return WalkResult::skip(); + if (!isa(op)) + return WalkResult::skip(); + + if (isa(op) && + hasMarker(op, getCopyToWorkgroupMemoryMarker())) { + // The GPUDistributeSharedMemoryCopy pass will handle it later. + return WalkResult::skip(); + } + + // check if K is a multiple of Tile-K. + int64_t sizeK = getSizeK(op); + if (sizeK != ShapedType::kDynamicSize) { + // WG tile sizes + auto wgTileSizes = getTileSizes(op, 0); + + if (sizeK % wgTileSizes[wgTileSizes.size() - 1] != 0) + return WalkResult::skip(); + } + + setMarker(op, getGPUWarpLevelTilingReqMarker()); + return WalkResult::advance(); + }); + + // Apply last level of tiling and distribute to warps for aligned ops. RewritePatternSet warpLevelTilingPatterns(context); populateTilingToWarpPatterns(warpLevelTilingPatterns, workgroupSize); if (failed(applyPatternsAndFoldGreedily( @@ -258,6 +394,15 @@ struct LLVMGPUTileAndDistributePass return signalPassFailure(); } + // Apply last level of tiling and distribute to threads for unaligned ops. + RewritePatternSet threadLevelTilingPatterns(context); + populateTilingToInvocationPatterns(threadLevelTilingPatterns, + workgroupSize, + /*matchByDefault=*/false); + if (failed(applyPatternsAndFoldGreedily( + funcOp, std::move(threadLevelTilingPatterns)))) { + return signalPassFailure(); + } } else { // Apply last level of tiling and distribute to threads. RewritePatternSet threadLevelTilingPatterns(context); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp index ae0ef3e6edc4..a9c2a7810f16 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp @@ -32,20 +32,18 @@ namespace iree_compiler { /// Patterns for workgroup level tiling. Workgroup tiling is done at the flow /// level but we may have extra tiling for the reduction dimension. Therefore we /// tile again without distributing. -static void populateTilingPatterns(RewritePatternSet &patterns, - bool onlyReduction) { - auto tileSizesFn = [onlyReduction](OpBuilder &builder, - Operation *op) -> SmallVector { +static void populateReductionTilingPatterns(RewritePatternSet &patterns, + bool peel) { + auto tileSizesFn = [](OpBuilder &builder, + Operation *op) -> SmallVector { auto interfaceOp = cast(*op); auto partitionedLoops = interfaceOp.getPartitionableLoops(kNumMaxParallelDims); SmallVector tileSizes = getTileSizes(builder, op, 0); - if (onlyReduction) { - auto zero = builder.create(op->getLoc(), 0); - for (unsigned depth : partitionedLoops) { - if (depth < tileSizes.size()) { - tileSizes[depth] = zero; - } + auto zero = builder.create(op->getLoc(), 0); + for (unsigned depth : partitionedLoops) { + if (depth < tileSizes.size()) { + tileSizes[depth] = zero; } } return tileSizes; @@ -54,6 +52,7 @@ static void populateTilingPatterns(RewritePatternSet &patterns, auto tilingOptions = linalg::LinalgTilingOptions() .setLoopType(linalg::LinalgTilingLoopType::Loops) .setTileSizeComputationFunction(tileSizesFn); + if (peel) tilingOptions.setPeeledLoops({0}); MLIRContext *context = patterns.getContext(); IREE::LinalgExt::LinalgTransformationFilter filter( @@ -67,13 +66,13 @@ static void populateTilingPatterns(RewritePatternSet &patterns, filter); } -LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction) { +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel) { { - // Tile again at the workgroup level since redution dimension were + // Tile again at the workgroup level since reduction dimension were // ignored. Dimensions already tiled will be ignore since we tile to the // same size. RewritePatternSet wgTilingPatterns(funcOp.getContext()); - populateTilingPatterns(wgTilingPatterns, onlyReduction); + populateReductionTilingPatterns(wgTilingPatterns, peel); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(wgTilingPatterns)))) { return failure(); @@ -223,7 +222,7 @@ struct LLVMGPUTileTensorPass // Tile to serial loops to the wg tile size to handle reductions and other // dimension that have not been distributed. - if (failed(tileToSerialLoops(funcOp, /*onlyReduction=*/true))) { + if (failed(tileToSerialLoops(funcOp, /*peel=*/false))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8d05e0d179a8..8322ba5a36ec 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -199,6 +199,12 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm, tileAndBufferize(pm); auto &nestedModulePM = pm.nest(); + // Do specialization + nestedModulePM.addNestedPass( + createWorkgroupSpecializationPass()); + nestedModulePM.addPass(createCanonicalizerPass()); + nestedModulePM.addPass(createCSEPass()); + // Distribute linalg onto warps within the workgroup. nestedModulePM.addNestedPass( createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true)); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h index 7ef1b93324d4..8616a92d6fc0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TilingUtils.h @@ -16,7 +16,7 @@ class FuncOp; namespace iree_compiler { /// Apply tiling to reduction dimensions based on op attributes. -LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool onlyReduction = true); +LogicalResult tileToSerialLoops(func::FuncOp funcOp, bool peel); } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp index c04186c75ea6..8395a1adf329 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp @@ -188,20 +188,20 @@ LogicalResult verifyGPUMatmulTensorCorePipeline( // Verify the first level tile size divides the matmul // inputs A [M x K] & B [K x N] - if (lhsShape[0] % firstLevelTileSizes[0] != 0 || - lhsShape[1] % firstLevelTileSizes[2] != 0) { - return op->emitOpError( - "lhsShape doesn't factor into first level tile size for ") - << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1] - << "]"; - } - if (rhsShape[0] % firstLevelTileSizes[2] != 0 || - rhsShape[1] % firstLevelTileSizes[1] != 0) { - return op->emitOpError( - "rhsShape doesn't factor into first level tile size for ") - << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1] - << "]"; - } + // if (lhsShape[0] % firstLevelTileSizes[0] != 0 || + // lhsShape[1] % firstLevelTileSizes[2] != 0) { + // return op->emitOpError( + // "lhsShape doesn't factor into first level tile size for ") + // << pipelineName << " [ " << lhsShape[0] << ", " << lhsShape[1] + // << "]"; + // } + // if (rhsShape[0] % firstLevelTileSizes[2] != 0 || + // rhsShape[1] % firstLevelTileSizes[1] != 0) { + // return op->emitOpError( + // "rhsShape doesn't factor into first level tile size for ") + // << pipelineName << " [ " << rhsShape[0] << ", " << rhsShape[1] + // << "]"; + // } // Verify shared memory usage of operands after tiling requires <= 64Kb // combined space. diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir index 5164192a28e1..a9136ba3430b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir @@ -224,70 +224,6 @@ hal.executable private @matmul_tensors { // ----- -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @matmul_tensors { - hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { - hal.executable.export @illegal layout(#pipeline_layout) attributes { - translation_info = #translation, - workgroup_size = [64 : index, 2 : index, 1 : index] - } - builtin.module { - func.func @illegal() { - %c0 = arith.constant 0 : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32> - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x32xf32> - // expected-error @+1 {{lhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<48x16xf32>, memref<16x32xf32>) - outs(%result: memref<48x32xf32>) - return - } - } - } -} - -// ----- - -#config = #iree_codegen.lowering_config -#translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @matmul_tensors { - hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { - hal.executable.export @illegal layout(#pipeline_layout) attributes { - translation_info = #translation, - workgroup_size = [64 : index, 2 : index, 1 : index] - } - builtin.module { - func.func @illegal() { - %c0 = arith.constant 0 : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32> - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32> - // expected-error @+1 {{rhsShape doesn't factor into first level tile size for LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x48xf32>) - outs(%result: memref<32x48xf32>) - return - } - } - } -} - -// ----- - #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir index 425e406127ae..ba6ed805fe55 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir @@ -20,7 +20,7 @@ func.func @dot() { %10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9] %11 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 32)>(%8) %12 = memref.subview %7[%10, %11] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> - linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) + linalg.fill {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) scf.for %arg0 = %c0 to %c1024 step %c16 { %13 = memref.subview %5[0, %arg0] [64, 16] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> %14 = memref.subview %6[%arg0, 0] [16, 64] [1, 1] : memref<1024x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> @@ -28,7 +28,7 @@ func.func @dot() { %16 = memref.subview %13[%10, 0] [32, 16] [1, 1] : memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> %17 = memref.subview %14[0, %15] [16, 32] [1, 1] : memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> %18 = memref.subview %7[%10, %15] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> - linalg.matmul {__internal_linalg_transform__ = "vectorize"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) + linalg.matmul {__internal_linalg_transform__ = "vectorize_for_tensorcore"} ins(%16, %17 : memref<32x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<16x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%18 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) } return } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir index 3703d9f75930..ce78695b3e6e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir @@ -42,11 +42,12 @@ module attributes {hal.device.targets = [#hal.device.target<"cuda", {executable_ // CHECK-LABEL: func.func @forward_dispatch_116_matmul_128x30522x768 // CHECK: arith.cmpi eq // CHECK: scf.if -// CHECK: vector.transfer_read -// CHECK: vector.transfer_read -// CHECK: vector.contract -// CHECK: vector.transfer_read -// CHECK: vector.broadcast -// CHECK: vector.transfer_write +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_load_matrix +// CHECK: gpu.subgroup_mma_compute +// CHECK: gpu.subgroup_mma_compute +// CHECK: gpu.subgroup_mma_store_matrix // CHECK: else // CHECK-NOT: vector.transfer diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp index e62239e9f81f..bb44b7681187 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.cpp @@ -36,6 +36,20 @@ StringRef getWorkgroupMemoryNumItemsGENumItersMarker() { return "workgroup_memory_numprocs_ge_numiters"; } +StringRef getWorkgroupSpecializationMarker() { + return "workgroup_specialization"; +} + +StringRef getGPUSimtLoweringReqMarker() { return "gpu_simt_lowering_req"; } + +StringRef getGPUTensorCoreLoweringReqMarker() { + return "gpu_tensorcore_lowering_req"; +} + +StringRef getGPUWarpLevelTilingReqMarker() { + return "gpu_warp_level_tiling_req"; +} + StringRef getCopyToWorkgroupMemoryMarker() { return "copy_to_workgroup_memory"; } @@ -44,6 +58,10 @@ StringRef getTileReductionMarker() { return "tile_reduction"; } StringRef getVectorizeMarker() { return "vectorize"; } +StringRef getVectorizeForTensorCoreMarker() { + return "vectorize_for_tensorcore"; +} + StringRef getDeleteMarker() { return "delete"; } StringRef getMarkerOrNull(Operation *op) { @@ -67,5 +85,13 @@ void setMarker(Operation *op, StringRef marker) { StringAttr::get(op->getContext(), marker)); } +Operation *findAncestorWithMarker(Operation *op, StringRef marker) { + while (op) { + if (hasMarker(op, marker)) return op; + op = op->getParentOp(); + }; + return nullptr; +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h index 2057ede82128..f62ddbba6cd7 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/MarkerUtils.h @@ -33,6 +33,19 @@ StringRef getWorkgroupMemoryMarker(); /// to workgroups L1 tiles. StringRef getWorkgroupL1TileMarker(); +/// Marker to scf.IfOp to denote that the op is inserted for workgroup +/// specialization. +StringRef getWorkgroupSpecializationMarker(); + +/// Marker to denote that the op is marked for SIMT lowering +StringRef getGPUSimtLoweringReqMarker(); + +/// Marker to denote that the op is marked for tensorcore lowering +StringRef getGPUTensorCoreLoweringReqMarker(); + +/// Marker to denote that the op is marked for warp-level tiling +StringRef getGPUWarpLevelTilingReqMarker(); + /// Marker for copy operations that are moving data from StorageClass to /// Workgroup memory. StringRef getCopyToWorkgroupMemoryMarker(); @@ -43,6 +56,9 @@ StringRef getTileReductionMarker(); /// Marker for operations that are going to be vectorized. StringRef getVectorizeMarker(); +/// Marker for operations that are going to be vectorized for tensorcore. +StringRef getVectorizeForTensorCoreMarker(); + /// Marker for tagging an operation for deletion. Tile and fuse pattern does /// not delete the original operation to not invalidate the /// `linalg::LinalgDependenceGraph` data structure. Instead it is marked with @@ -59,6 +75,9 @@ bool hasMarker(Operation *, ArrayRef markers = {}); /// Sets a given marker on an operation. void setMarker(Operation *, StringRef); +/// Find an ancester with the given marker. +Operation *findAncestorWithMarker(Operation *op, StringRef marker); + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp index 03ae9f65fa53..600ba9bbcafe 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp @@ -677,5 +677,31 @@ void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val, for (Operation *op : opToDelete) op->erase(); } +LogicalResult alignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getThenRegion().isAncestor(op->getParentRegion())); + } else { + return success(); + } +} + +LogicalResult unalignedOpFilter(Operation *op) { + Operation *opWithMarker = + findAncestorWithMarker(op, getWorkgroupSpecializationMarker()); + + if (opWithMarker) { + auto ifOp = cast(opWithMarker); + return success(ifOp.getElseRegion().isAncestor(op->getParentRegion())); + } else { + // When there is no workgroup specialization, it means the op is already + // aligned. + return failure(); + } +} + } // namespace iree_compiler } // namespace mlir diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h index 506f707e9217..37479dfca7b5 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h @@ -175,6 +175,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(); void replaceMemrefUsesAndPropagateType(Operation *oldOp, Value val, OpBuilder &builder); +/// Returns true if an op is aligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-spcialized region because +/// it is already aligned. +LogicalResult alignedOpFilter(Operation *op); + +/// Returns true if an op is unaligned by checking if +/// 1. the op is inside the workgroup-specialized region, or +/// 2. the op's parent is not the workgroup-specialized region. +/// The second case does not have a workgroup-spcialized region because +/// it is already aligned. +LogicalResult unalignedOpFilter(Operation *op); + } // namespace iree_compiler } // namespace mlir