diff --git a/docs/PTO_IR_manual.md b/docs/PTO_IR_manual.md index 6bc72a8d..0ee4936e 100644 --- a/docs/PTO_IR_manual.md +++ b/docs/PTO_IR_manual.md @@ -65,7 +65,7 @@ Common element categories include: Element type constraints are operation-specific: - **Shape/type consistency**: most elementwise ops require all operands and results to have the same element type. -- **Numeric domain**: reductions, math ops, and division typically restrict element types to floating-point or a subset of integer types. +- **Numeric domain**: reductions, math ops, and division typically restrict element types to floating-point or a limited set of integer types. - **Bitwise ops**: require integer element types. - **Conversions**: `pto.tcvt` defines explicit element type changes and is controlled by `RoundMode` when converting between numeric domains. @@ -467,14 +467,16 @@ result = alloc_tile(base_addr, valid_row, valid_col) // operands are optional %tb3 = pto.alloc_tile addr = %ad : !pto.tile_buf ``` -##### `pto.subset` - Subview Tile View +##### `pto.subview` - Tile SubView -**Summary:** Create a strided view from a parent tile. The result tile buffer is a logical subset of the input tile buffer. +**Summary:** Create a logical subview from a parent tile. The subview window is expressed by `offsets + sizes`, while the result tile type keeps the parent tile shape. **Semantics:** ``` result = source[offsets] with static sizes +result.shape = source.shape +result.valid = clip(explicit_valid_or_sizes, sizes) ``` **Arguments:** @@ -484,27 +486,35 @@ result = source[offsets] with static sizes | `source` | `pto.tile_buf` | Parent tile buffer | | `offsets` | `Variadic` | Runtime dynamic offsets [i, j] | | `sizes` | `I64ArrayAttr` | Static shape [rows, cols] | +| `valid_row` | `Optional` | Optional explicit valid row | +| `valid_col` | `Optional` | Optional explicit valid col | **Results:** `pto.tile_buf` **Constraints & Verification:** - The verifier derives boxed-vs-non-boxed behavior from `source`'s tile config (`blayout`, `slayout`, `fractal`) and element type. -- For non-boxed layouts (`slayout=none_box`), no additional subset-specific structural checks are enforced. +- For non-boxed layouts (`slayout=none_box`), no additional subview-specific structural checks are enforced. - For boxed layouts (`slayout != none_box`): - - The tile layout must be one of the subset layouts supported by the current implementation; otherwise verification fails. - - `sizes` must be present, must have length 2, and both subset sizes must be positive. - - The subset sizes must be multiples of the inferred inner boxed shape. + - The tile layout must be one of the subview layouts supported by the current implementation; otherwise verification fails. + - `sizes` must be present, must have length 2, and both subview sizes must be positive. + - The subview sizes must be multiples of the inferred inner boxed shape. - `offsets` must have length 2. - If an offset is compile-time constant, it must be non-negative and must be a multiple of the inferred inner boxed shape in that dimension. - The source tile shape must be statically known. - - For boxed row-major tiles, the subset must keep the full source column extent, and the column offset must be the constant `0`. - - For boxed col-major tiles, the subset must keep the full source row extent, and the row offset must be the constant `0`. + - For boxed row-major tiles, the subview must keep the full source column extent, and the column offset must be the constant `0`. + - For boxed col-major tiles, the subview must keep the full source row extent, and the row offset must be the constant `0`. +- `valid_row` and `valid_col` must be both present or both absent. +- If `valid_row/valid_col` are omitted, result `valid_shape` defaults to `sizes`. +- If `valid_row/valid_col` are provided: + - constant values must be positive and `<= sizes` in each dimension + - non-constant values are represented as dynamic valid dims in the result type - The inferred result type uses: - - `shape = sizes` + - `shape = source.shape` (parent shape is preserved) - the same element type and address space as `source` - the same tile config as `source` - - a `valid_shape` derived from the parent `valid_shape` and constant offsets when possible, otherwise dynamic in that dimension + - `valid_shape` defaults to `sizes` + - if explicit `valid_row/valid_col` are provided, `valid_shape` is clipped by `sizes` **Hardware Mapping:** @@ -513,7 +523,8 @@ result = source[offsets] with static sizes **Basic Example:** ```mlir -%sub = pto.subset %src[%i, %j] sizes [32, 32] : !pto.tile_buf +%sub = pto.subview %src[%i, %j] sizes [32, 32] : !pto.tile_buf +%sub2 = pto.subview %src[%i, %j] sizes [32, 32] valid [%vr, %vc] : !pto.tile_buf ``` ##### `pto.set_validshape` - Update Dynamic Tile Valid Row/Col In Place diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 39a7e678..33fef71d 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -270,16 +270,20 @@ def BindTileOp : PTO_Op<"bind_tile", [ }]; } -def SubsetOp : PTO_Op<"subset", [ +def SubViewOp : PTO_Op<"subview", [ Pure, ViewLikeOpInterface, + AttrSizedOperandSegments, DeclareOpInterfaceMethods // 启用 C++ 推导 ]> { - let summary = "Create a strided view (subset) from a parent tile."; + let summary = "Create a tile subview from a parent tile (parent-shape + valid dims)."; let description = [{ Creates a view into the source tile. - - Result Shape: Defined by static `sizes`. + - Logical subview window: defined by `offsets` + static `sizes`. + - Result tile type shape: inherited from parent `source`. + - Effective subview extent: represented by inferred `valid_row/valid_col` + (i.e. `valid_shape`), clipped by subview `sizes`. - Result Strides: Inherited from `source`. - Result Offset: Represented as multi-dimensional symbols (s0, s1...) in the layout map. }]; @@ -287,16 +291,20 @@ def SubsetOp : PTO_Op<"subset", [ let arguments = (ins TileBufType:$source, Variadic:$offsets, // 运行时动态偏移 [i, j] - I64ArrayAttr:$sizes // 静态形状 [32, 32] + I64ArrayAttr:$sizes, // 静态形状 [32, 32] + Optional:$valid_row, + Optional:$valid_col ); let results = (outs TileBufType:$result); let hasVerifier = 1; - // 语法示例: %sub = pto.subset %src[%i, %j] sizes [32, 32] : !type + // 语法示例: %sub = pto.subview %src[%i, %j] sizes [32, 32] : !type // 注意:没有 -> qualified(type($result)) let assemblyFormat = [{ - $source `[` $offsets `]` `sizes` $sizes attr-dict `:` qualified(type($source)) + $source `[` $offsets `]` `sizes` $sizes + (`valid` `[` $valid_row^ `,` $valid_col `]`)? + attr-dict `:` qualified(type($source)) }]; // [新增] 显式实现 ViewLikeOpInterface 缺失的方法 diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 08fd63ac..320dbce9 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -6895,13 +6895,13 @@ static void printLayout(AsmPrinter &printer, Attribute layoutAttr) { // ---- TileBuf --- -// Tile subset 相关实现 +// Tile subview 相关实现 // ============================================================================= -// Op Interface Implementation: SubsetOp +// Op Interface Implementation: SubViewOp // ============================================================================= -LogicalResult SubsetOp::inferReturnTypes( +LogicalResult SubViewOp::inferReturnTypes( MLIRContext *context, std::optional location, ValueRange operands, DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { @@ -6911,10 +6911,10 @@ LogicalResult SubsetOp::inferReturnTypes( auto sourceType = llvm::dyn_cast(operands[0].getType()); if (!sourceType) return failure(); - // 2. 获取 Result Shape (Sizes) + // 2. 获取 subview 逻辑窗口(sizes) ArrayAttr sizeAttr; if (properties) { - const auto *prop = properties.as(); + const auto *prop = properties.as(); if (prop) sizeAttr = prop->sizes; } if (!sizeAttr && attributes) { @@ -6922,60 +6922,38 @@ LogicalResult SubsetOp::inferReturnTypes( } if (!sizeAttr) return failure(); - SmallVector resultShape; + SmallVector subviewShape; for (auto attr : sizeAttr) { int64_t dim = llvm::cast(attr).getInt(); - resultShape.push_back(dim); + subviewShape.push_back(dim); } - // Derive valid shape from parent valid dims when possible. + // Design: subview 的结果 tile 类型继承父 tile 的 shape;subview 的逻辑大小 + // 通过 valid_row/valid_col(valid_shape)表达,而不是改写 type-level shape。 + ArrayRef parentShape = sourceType.getShape(); + if (subviewShape.size() != parentShape.size()) + return failure(); + + // Derive valid shape from explicit valid_row/valid_col when provided. + // Otherwise default to subview shape (no parent valid-shape inheritance). SmallVector validShape; constexpr int64_t kDynamicValidDim = -1; - ArrayRef parentValid = sourceType.getValidShape(); - for (size_t i = 0, e = resultShape.size(); i < e; ++i) { - int64_t sizeDim = resultShape[i]; - int64_t vdim = sizeDim; - - if (parentValid.size() == resultShape.size()) { - int64_t pv = parentValid[i]; - if (pv < 0) { - vdim = kDynamicValidDim; - } else { - int64_t off = 0; - // operands: [source, offsets...] - if (operands.size() > 1 + i) { - auto offOpt = getConstIndexValue(operands[1 + i]); - if (!offOpt) { - vdim = kDynamicValidDim; - validShape.push_back(vdim); - continue; - } - off = *offOpt; - // Interpret parent valid dims as a per-tile "period" when the parent - // buffer is wider than the valid region (e.g. ping/pong workspace). - // This avoids inferring a zero valid dim when taking a view at an - // offset equal to the parent valid dim. - // - // Example: - // parent: shape 32x64, valid 32x32 - // subset: offset [0,32], sizes [32,32] - // should infer v_col=32 (not 0). - int64_t diff = 0; - if (pv > 0) { - int64_t offMod = off % pv; - if (offMod < 0) - offMod += pv; - diff = pv - offMod; // in [1, pv] when pv>0 - } - if (diff < 0) - diff = 0; - vdim = std::min(sizeDim, diff); - } else { - vdim = kDynamicValidDim; - } - } + int64_t rank = static_cast(subviewShape.size()); + size_t expectedWithoutValid = static_cast(1 + rank); + Value explicitVRow; + Value explicitVCol; + if (rank == 2 && operands.size() >= expectedWithoutValid + 2) { + explicitVRow = operands[expectedWithoutValid]; + explicitVCol = operands[expectedWithoutValid + 1]; + } + + for (size_t i = 0, e = subviewShape.size(); i < e; ++i) { + int64_t vdim = subviewShape[i]; + Value explicitV = (i == 0) ? explicitVRow : (i == 1 ? explicitVCol : Value()); + if (explicitV) { + auto cst = getConstIndexValue(explicitV); + vdim = cst ? std::min(*cst, subviewShape[i]) : kDynamicValidDim; } - validShape.push_back(vdim); } @@ -6986,7 +6964,7 @@ LogicalResult SubsetOp::inferReturnTypes( // 4. 构建 Result Type auto canonicalValidShape = canonicalizeTileBufValidShape(validShape); auto resultType = TileBufType::get( - context, resultShape, sourceType.getElementType(), + context, parentShape, sourceType.getElementType(), sourceType.getMemorySpace(), canonicalValidShape, cfg); inferredReturnTypes.push_back(resultType); @@ -6994,7 +6972,7 @@ LogicalResult SubsetOp::inferReturnTypes( } // ============================================================================= -// SubsetOp verifier +// SubViewOp verifier // ============================================================================= static bool getConstIndex(Value v, int64_t &out) { if (auto cOp = v.getDefiningOp()) { @@ -7097,7 +7075,7 @@ static LogicalResult computeInnerShape(TileBufConfigAttr cfg, Type elemTy, return failure(); } -mlir::LogicalResult mlir::pto::SubsetOp::verify() { +mlir::LogicalResult mlir::pto::SubViewOp::verify() { if (shouldBypassDecodedMemrefVerifier(getOperation())) return success(); auto srcTy = llvm::dyn_cast(getSource().getType()); @@ -7107,6 +7085,73 @@ mlir::LogicalResult mlir::pto::SubsetOp::verify() { if (srcTy.getRank() != 2 || dstTy.getRank() != 2) return emitOpError("expects rank-2 tilebuf for src/dst"); + auto sizesAttr = getSizes(); + if (!sizesAttr || sizesAttr.size() != 2) + return emitOpError("subview expects 2D sizes"); + int64_t sizeR = cast(sizesAttr[0]).getInt(); + int64_t sizeC = cast(sizesAttr[1]).getInt(); + if (sizeR <= 0 || sizeC <= 0) + return emitOpError("subview sizes must be positive"); + if (getOffsets().size() != 2) + return emitOpError("subview expects 2D offsets"); + + int64_t offR = 0, offC = 0; + bool offRConst = getConstIndex(getOffsets()[0], offR); + bool offCConst = getConstIndex(getOffsets()[1], offC); + if (offRConst && offR < 0) + return emitOpError("subview offsets must be non-negative"); + if (offCConst && offC < 0) + return emitOpError("subview offsets must be non-negative"); + + bool hasValidRow = static_cast(getValidRow()); + bool hasValidCol = static_cast(getValidCol()); + if (hasValidRow != hasValidCol) + return emitOpError( + "subview expects valid_row and valid_col to be both present or both absent"); + + if (hasValidRow) { + int64_t vRow = 0, vCol = 0; + if (getConstIndex(getValidRow(), vRow)) { + if (vRow <= 0) + return emitOpError("valid_row must be positive when constant"); + if (vRow > sizeR) + return emitOpError("valid_row must be <= subview row size"); + } + if (getConstIndex(getValidCol(), vCol)) { + if (vCol <= 0) + return emitOpError("valid_col must be positive when constant"); + if (vCol > sizeC) + return emitOpError("valid_col must be <= subview col size"); + } + } + + auto dstShape = dstTy.getShape(); + if (dstShape.size() != 2) + return emitOpError("expects result to be rank-2"); + auto srcShape = srcTy.getShape(); + if (srcShape.size() != 2) + return emitOpError("expects source to be rank-2"); + if (dstShape[0] != srcShape[0] || dstShape[1] != srcShape[1]) + return emitOpError("expects result shape to match source shape"); + + auto expectedValidDim = [&](Value explicitValid, int64_t defaultSize) { + if (!explicitValid) + return defaultSize; + int64_t c = 0; + if (getConstIndex(explicitValid, c)) + return std::min(c, defaultSize); + return ShapedType::kDynamic; + }; + int64_t expectedVRow = expectedValidDim(getValidRow(), sizeR); + int64_t expectedVCol = expectedValidDim(getValidCol(), sizeC); + auto dstValid = dstTy.getValidShape(); + if (dstValid.size() != 2) + return emitOpError("expects result to have rank-2 valid_shape"); + if (dstValid[0] != expectedVRow) + return emitOpError("expects result valid_shape[0] to match inferred/explicit valid_row"); + if (dstValid[1] != expectedVCol) + return emitOpError("expects result valid_shape[1] to match inferred/explicit valid_col"); + auto cfg = srcTy.getConfigAttr(); if (!cfg) cfg = TileBufConfigAttr::getDefault(getContext()); @@ -7115,62 +7160,41 @@ mlir::LogicalResult mlir::pto::SubsetOp::verify() { int32_t bl = 0, sl = 0; if (failed(computeInnerShape(cfg, srcTy.getElementType(), innerRows, innerCols, boxed, bl, sl))) - return emitOpError("unsupported tile layout for subset"); + return emitOpError("unsupported tile layout for subview"); if (!boxed) return success(); // Boxed layout: require static 2D sizes with inner alignment. Offsets may be // dynamic, but static offsets must be aligned. - auto sizesAttr = getSizes(); - if (!sizesAttr || sizesAttr.size() != 2) - return emitOpError("boxed layout subset expects 2D sizes"); - - int64_t sizeR = cast(sizesAttr[0]).getInt(); - int64_t sizeC = cast(sizesAttr[1]).getInt(); - if (sizeR <= 0 || sizeC <= 0) - return emitOpError("subset sizes must be positive"); - if (sizeR % innerRows != 0 || sizeC % innerCols != 0) - return emitOpError("boxed layout subset sizes must be multiples of inner shape"); - - if (getOffsets().size() != 2) - return emitOpError("boxed layout subset expects 2D offsets"); - - int64_t offR = 0, offC = 0; - bool offRConst = getConstIndex(getOffsets()[0], offR); - bool offCConst = getConstIndex(getOffsets()[1], offC); + return emitOpError("boxed layout subview sizes must be multiples of inner shape"); if (offRConst) { - if (offR < 0) - return emitOpError("subset offsets must be non-negative"); if (offR % innerRows != 0) - return emitOpError("boxed layout subset offsets must be multiples of inner shape"); + return emitOpError("boxed layout subview offsets must be multiples of inner shape"); } if (offCConst) { - if (offC < 0) - return emitOpError("subset offsets must be non-negative"); if (offC % innerCols != 0) - return emitOpError("boxed layout subset offsets must be multiples of inner shape"); + return emitOpError("boxed layout subview offsets must be multiples of inner shape"); } - auto srcShape = srcTy.getShape(); if (srcShape.size() == 2 && srcShape[0] != ShapedType::kDynamic && srcShape[1] != ShapedType::kDynamic) { if (bl == 0) { if (sizeC != srcShape[1]) - return emitOpError("boxed RowMajor subset must keep full cols"); + return emitOpError("boxed RowMajor subview must keep full cols"); if (!offCConst || offC != 0) - return emitOpError("boxed RowMajor subset requires static col offset = 0"); + return emitOpError("boxed RowMajor subview requires static col offset = 0"); } else if (bl == 1) { if (sizeR != srcShape[0]) - return emitOpError("boxed ColMajor subset must keep full rows"); + return emitOpError("boxed ColMajor subview must keep full rows"); if (!offRConst || offR != 0) - return emitOpError("boxed ColMajor subset requires static row offset = 0"); + return emitOpError("boxed ColMajor subview requires static row offset = 0"); } } else { - return emitOpError("boxed layout subset requires static source shape"); + return emitOpError("boxed layout subview requires static source shape"); } return success(); diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index 9ad62e11..9501bbf1 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -3366,6 +3366,8 @@ struct PointerCastConversion : public OpConversionPattern { auto *ctx = rewriter.getContext(); auto selfType = mlir::cast(op.getType()); ArrayRef shape = selfType.getShape(); + int64_t physRows = shape.size() > 0 ? shape[0] : ShapedType::kDynamic; + int64_t physCols = shape.size() > 1 ? shape[1] : ShapedType::kDynamic; Type elemType = selfType.getElementType(); // 1. 推导 Tile Role @@ -3386,10 +3388,10 @@ struct PointerCastConversion : public OpConversionPattern { return (dim == ShapedType::kDynamic) ? std::string(symbol) : std::to_string(dim); }; - if (role == TileRole::Left) dimStr = dimToString(shape[0], "M") + ", " + dimToString(shape[1], "K"); - else if (role == TileRole::Right) dimStr = dimToString(shape[0], "K") + ", " + dimToString(shape[1], "N"); - else if (role == TileRole::Bias) dimStr = "1, " + dimToString(shape[1], "N"); - else dimStr = dimToString(shape[0], "M") + ", " + dimToString(shape[1], "N"); + if (role == TileRole::Left) dimStr = dimToString(physRows, "M") + ", " + dimToString(physCols, "K"); + else if (role == TileRole::Right) dimStr = dimToString(physRows, "K") + ", " + dimToString(physCols, "N"); + else if (role == TileRole::Bias) dimStr = "1, " + dimToString(physCols, "N"); + else dimStr = dimToString(physRows, "M") + ", " + dimToString(physCols, "N"); // 3. Role Token const char *roleTok = "TileType::Vec"; @@ -3470,9 +3472,9 @@ struct PointerCastConversion : public OpConversionPattern { vcolTok = "-1"; useConstructor = true; constructorArgs.push_back( - makeCtorDimValue(vRowEmitC, rowIsConst ? cRow : shape[0])); + makeCtorDimValue(vRowEmitC, rowIsConst ? cRow : physRows)); constructorArgs.push_back( - makeCtorDimValue(vColEmitC, colIsConst ? cCol : shape[1])); + makeCtorDimValue(vColEmitC, colIsConst ? cCol : physCols)); } else { if (rowIsConst) { vrowTok = std::to_string(cRow); @@ -3481,7 +3483,7 @@ struct PointerCastConversion : public OpConversionPattern { rowIsDynamic = true; useConstructor = true; } else { - vrowTok = std::to_string(shape[0]); + vrowTok = std::to_string(physRows); } if (colIsConst) { @@ -3491,7 +3493,7 @@ struct PointerCastConversion : public OpConversionPattern { colIsDynamic = true; useConstructor = true; } else { - vcolTok = std::to_string(shape[1]); + vcolTok = std::to_string(physCols); } if (useConstructor) { @@ -7475,6 +7477,7 @@ struct PTOBindTileToEmitC : public OpConversionPattern { auto *ctx = rewriter.getContext(); auto configAttr = op.getConfigAttr(); auto viewSemantics = op->getAttrOfType("pto.view_semantics"); + bool isSubView = viewSemantics && viewSemantics.getValue() == "subview"; auto peelAllCasts = [](Value v) { while (auto castOp = v.getDefiningOp()) @@ -7758,6 +7761,25 @@ struct PTOBindTileToEmitC : public OpConversionPattern { return success(); } + // Subview origins are kept distinct from generic tile rebinding: + // even when source/destination C++ tile types match, subview may carry + // shifted base address semantics and should materialize a fresh handle. + if (isSubView) { + FailureOr tileSpec = buildTileSpec(); + if (failed(tileSpec)) + return failure(); + Value dstTile = buildTileValue(*tileSpec); + FailureOr addr = buildIntegralAddress(tileCandidate); + if (failed(addr)) + return failure(); + + rewriter.create(loc, TypeRange{}, "TASSIGN", + ArrayAttr{}, ArrayAttr{}, + ValueRange{dstTile, *addr}); + rewriter.replaceOp(op, dstTile); + return success(); + } + // Generic tile-to-tile rebind path: preserve the same backing storage and // rebuild a sibling tile with updated metadata/valid dims. if (isTileLike(tileCandidate)) { @@ -7805,6 +7827,8 @@ struct PTOBindTileToEmitC : public OpConversionPattern { auto newCast = rewriter.create( loc, op.getType(), physAddrs, vRow ? vRow : Value(), vCol ? vCol : Value(), configAttr); + if (viewSemantics) + newCast->setAttr("pto.view_semantics", viewSemantics); if (op->hasAttr(kForceDynamicValidShapeAttrName)) newCast->setAttr(kForceDynamicValidShapeAttrName, op->getAttr(kForceDynamicValidShapeAttrName)); diff --git a/lib/PTO/Transforms/PTOViewToMemref.cpp b/lib/PTO/Transforms/PTOViewToMemref.cpp index a8d9f6c7..1f0e89b2 100644 --- a/lib/PTO/Transforms/PTOViewToMemref.cpp +++ b/lib/PTO/Transforms/PTOViewToMemref.cpp @@ -30,6 +30,8 @@ #include #include #include +#include +#include using namespace mlir; @@ -311,46 +313,21 @@ static Value ensureIndex(IRRewriter &rewriter, Location loc, Value v, return Value(); } -static Value computeSubsetValidDim(IRRewriter &rewriter, Location loc, - Value parentValid, Value offset, - int64_t size, Operation *anchorOp) { +static Value clampSubViewValidDim(IRRewriter &rewriter, Location loc, + Value explicitValid, int64_t size, + Operation *anchorOp) { Value sizeVal = rewriter.create(loc, size); - if (!parentValid) + if (!explicitValid) return sizeVal; - int64_t pvConst = 0, offConst = 0; - if (getConstIndexValue(parentValid, pvConst) && - getConstIndexValue(offset, offConst)) { - int64_t diff = 0; - if (pvConst > 0) { - int64_t offMod = offConst % pvConst; - if (offMod < 0) - offMod += pvConst; - diff = pvConst - offMod; // in [1, pvConst] when pvConst>0 - } - if (diff < 0) - diff = 0; - int64_t clipped = std::min(size, diff); - return rewriter.create(loc, clipped); - } + int64_t cst = 0; + if (getConstIndexValue(explicitValid, cst)) + return rewriter.create(loc, std::min(cst, size)); - Value pv = ensureIndex(rewriter, loc, parentValid, anchorOp); - Value off = ensureIndex(rewriter, loc, offset, anchorOp); - - // Use the same "periodic valid dims" rule as SubsetOp::inferReturnTypes: - // diff = pv - (off % pv), so offsets that land on the next tile (off == pv) - // still produce a full valid dim (diff == pv), instead of 0. - Type i64Ty = rewriter.getI64Type(); - Value pvI64 = rewriter.create(loc, i64Ty, pv); - Value offI64 = rewriter.create(loc, i64Ty, off); - Value remI64 = rewriter.create(loc, offI64, pvI64); - Value diffI64 = rewriter.create(loc, pvI64, remI64); - Value diff = rewriter.create(loc, rewriter.getIndexType(), - diffI64); - - Value lt = rewriter.create(loc, arith::CmpIPredicate::slt, diff, + Value v = ensureIndex(rewriter, loc, explicitValid, anchorOp); + Value lt = rewriter.create(loc, arith::CmpIPredicate::slt, v, sizeVal); - return rewriter.create(loc, lt, diff, sizeVal); + return rewriter.create(loc, lt, v, sizeVal); } static void dumpPretty(Operation *op, llvm::raw_ostream &os) { @@ -931,12 +908,12 @@ struct PTOViewToMemrefPass } // ------------------------------------------------------------------ - // Stage 2.4: lower pto.subset -> memref.subview + bind_tile + // Stage 2.4: lower pto.subview -> memref.subview + bind_tile // ------------------------------------------------------------------ - SmallVector subsets; - func.walk([&](mlir::pto::SubsetOp op) { subsets.push_back(op); }); + SmallVector subViews; + func.walk([&](mlir::pto::SubViewOp op) { subViews.push_back(op); }); - for (auto op : subsets) { + for (auto op : subViews) { IRRewriter rewriter(ctx); rewriter.setInsertionPoint(op); Location loc = op.getLoc(); @@ -947,7 +924,7 @@ struct PTOViewToMemrefPass Value src = op->getOperand(0); auto srcMrTy = dyn_cast(src.getType()); if (!srcMrTy) { - op.emitError("pto.subset source must be lowered to memref first"); + op.emitError("pto.subview source must be lowered to memref first"); signalPassFailure(); return; } @@ -991,14 +968,14 @@ struct PTOViewToMemrefPass computeTileLayoutInfo(configAttr, srcMrTy.getElementType(), srcMrTy.getShape(), layoutInfo); if (!hasLayout) { - op.emitError("unsupported tile layout for pto.subset"); + op.emitError("unsupported tile layout for pto.subview"); signalPassFailure(); return; } if (layoutInfo.boxed) { if (staticSizes.size() != 2 || op.getOffsets().size() != 2) { - op.emitError("boxed layout subset expects 2D sizes/offsets"); + op.emitError("boxed layout subview expects 2D sizes/offsets"); signalPassFailure(); return; } @@ -1042,23 +1019,23 @@ struct PTOViewToMemrefPass if (srcShape.size() == 2) { if (bl == 0) { if (staticSizes[1] != srcShape[1]) { - op.emitError("boxed RowMajor subset must keep full cols"); + op.emitError("boxed RowMajor subview must keep full cols"); signalPassFailure(); return; } if (!off1Const || off1 != 0) { - op.emitError("boxed RowMajor subset requires static col offset = 0"); + op.emitError("boxed RowMajor subview requires static col offset = 0"); signalPassFailure(); return; } } else { if (staticSizes[0] != srcShape[0]) { - op.emitError("boxed ColMajor subset must keep full rows"); + op.emitError("boxed ColMajor subview must keep full rows"); signalPassFailure(); return; } if (!off0Const || off0 != 0) { - op.emitError("boxed ColMajor subset requires static row offset = 0"); + op.emitError("boxed ColMajor subview requires static row offset = 0"); signalPassFailure(); return; } @@ -1066,7 +1043,13 @@ struct PTOViewToMemrefPass } } - // 4. Result layout inherits source strides (offset is dynamic) + // 4. Result layout inherits source strides (offset is dynamic). + // + // Design choice: + // - Keep lowering for compact/non-compact subview unified. + // - Lowered subview tile uses the *parent tile shape*. + // - Sub-tile size is represented through valid_row/valid_col. + // This avoids bifurcating codegen paths based on address compactness. SmallVector srcStrides; int64_t srcOffset = ShapedType::kDynamic; if (failed(getStridesAndOffset(srcMrTy, srcStrides, srcOffset))) { @@ -1082,39 +1065,91 @@ struct PTOViewToMemrefPass (void)srcOffset; auto resultLayout = StridedLayoutAttr::get(ctx, ShapedType::kDynamic, srcStrides); + auto parentShape = srcMrTy.getShape(); auto resultMemRefType = + MemRefType::get(parentShape, srcMrTy.getElementType(), resultLayout, + srcMrTy.getMemorySpace()); + + // 5. Build subview first (base address shifted by offsets). + // The intermediate subview keeps static subview sizes. + auto subViewMemRefType = MemRefType::get(staticSizes, srcMrTy.getElementType(), resultLayout, srcMrTy.getMemorySpace()); - // 5. Strides for subview: keep same stride (use 1) + // Strides for subview: element-wise stepping on each dim. SmallVector mixedStrides; mixedStrides.reserve(staticSizes.size()); for (size_t i = 0; i < staticSizes.size(); ++i) mixedStrides.push_back(rewriter.getIndexAttr(1)); auto sv = rewriter.create( - loc, resultMemRefType, src, mixedOffsets, mixedSizes, mixedStrides); + loc, subViewMemRefType, src, mixedOffsets, mixedSizes, mixedStrides); + + // Reinterpret the subview base as a parent-shaped tile view. + // valid_row/valid_col (below) carries the actual sub-tile extent. + SmallVector parentMixedSizes; + SmallVector parentMixedStrides; + parentMixedSizes.reserve(parentShape.size()); + parentMixedStrides.reserve(srcStrides.size()); + + memref::ExtractStridedMetadataOp srcMd; + bool needDynamicMeta = false; + for (size_t i = 0; i < parentShape.size(); ++i) + needDynamicMeta |= (parentShape[i] == ShapedType::kDynamic); + for (int64_t s : srcStrides) + needDynamicMeta |= (s == ShapedType::kDynamic); + if (needDynamicMeta) + srcMd = rewriter.create(loc, src); + + for (size_t i = 0; i < parentShape.size(); ++i) { + if (parentShape[i] == ShapedType::kDynamic) { + if (!srcMd) { + op.emitError("failed to materialize dynamic parent size for subview"); + signalPassFailure(); + return; + } + parentMixedSizes.push_back(srcMd.getSizes()[i]); + } else { + parentMixedSizes.push_back(rewriter.getIndexAttr(parentShape[i])); + } + } - // 6. Re-bind tile metadata (config + valid dims) - Value parentVRow; - Value parentVCol; - lookupValidDims(src, parentVRow, parentVCol); + for (size_t i = 0; i < srcStrides.size(); ++i) { + if (srcStrides[i] == ShapedType::kDynamic) { + if (!srcMd) { + op.emitError("failed to materialize dynamic parent stride for subview"); + signalPassFailure(); + return; + } + parentMixedStrides.push_back(srcMd.getStrides()[i]); + } else { + parentMixedStrides.push_back(rewriter.getIndexAttr(srcStrides[i])); + } + } + + auto subAsParent = rewriter.create( + loc, resultMemRefType, sv.getResult(), rewriter.getIndexAttr(0), + parentMixedSizes, parentMixedStrides); + // 6. Re-bind tile metadata (config + valid dims). + // subview defaults valid dims to subview shape unless user explicitly + // provides valid_row/valid_col. Value vRow; Value vCol; if (!staticSizes.empty()) - vRow = computeSubsetValidDim(rewriter, loc, parentVRow, - op.getOffsets()[0], staticSizes[0], op); + vRow = clampSubViewValidDim(rewriter, loc, op.getValidRow(), + staticSizes[0], op); if (staticSizes.size() > 1) - vCol = computeSubsetValidDim(rewriter, loc, parentVCol, - op.getOffsets()[1], staticSizes[1], op); + vCol = clampSubViewValidDim(rewriter, loc, op.getValidCol(), + staticSizes[1], op); auto bindOp = rewriter.create( - loc, resultMemRefType, sv.getResult(), + loc, resultMemRefType, subAsParent.getResult(), vRow ? vRow : Value(), vCol ? vCol : Value(), configAttr); markForceDynamicValidShape(bindOp, resultTileTy && resultTileTy.hasDynamicValid(), ctx); + bindOp->setAttr("pto.view_semantics", rewriter.getStringAttr("subview")); rewriter.replaceOp(op, bindOp.getResult()); } diff --git a/test/basic/subview_bind_tile_preserve_stride.pto b/test/basic/subview_bind_tile_preserve_stride.pto new file mode 100644 index 00000000..9b4b888a --- /dev/null +++ b/test/basic/subview_bind_tile_preserve_stride.pto @@ -0,0 +1,41 @@ +// RUN: ptoas %s 2>&1 | FileCheck %s + +module { + func.func @subview_bind_tile_preserve_stride( + %src: memref<16x16xf32, #pto.address_space>, + %dst0: memref<8x8xf32, #pto.address_space>, + %dst1: memref<8x8xf32, #pto.address_space>, + %dst2: memref<8x8xf32, #pto.address_space>, + %dst3: memref<8x8xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + + %tile = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%src : memref<16x16xf32, #pto.address_space>) + outs(%tile : !pto.tile_buf) + + %s00 = pto.subview %tile[%c0, %c0] sizes [8, 8] : + !pto.tile_buf + %s01 = pto.subview %tile[%c0, %c8] sizes [8, 8] : + !pto.tile_buf + %s10 = pto.subview %tile[%c8, %c0] sizes [8, 8] : + !pto.tile_buf + %s11 = pto.subview %tile[%c8, %c8] sizes [8, 8] : + !pto.tile_buf + + pto.tstore ins(%s00 : !pto.tile_buf) + outs(%dst0 : memref<8x8xf32, #pto.address_space>) + pto.tstore ins(%s01 : !pto.tile_buf) + outs(%dst1 : memref<8x8xf32, #pto.address_space>) + pto.tstore ins(%s10 : !pto.tile_buf) + outs(%dst2 : memref<8x8xf32, #pto.address_space>) + pto.tstore ins(%s11 : !pto.tile_buf) + outs(%dst3 : memref<8x8xf32, #pto.address_space>) + return + } +} + +// CHECK-COUNT-4: Tile +// CHECK-COUNT-4: TSTORE( +// CHECK-NOT: Tile +// CHECK-NOT: copy_ubuf_to_gm_align_b32( diff --git a/test/basic/subview_col_major_compact_keeps_normal_shape.pto b/test/basic/subview_col_major_compact_keeps_normal_shape.pto new file mode 100644 index 00000000..43640bc8 --- /dev/null +++ b/test/basic/subview_col_major_compact_keeps_normal_shape.pto @@ -0,0 +1,23 @@ +// RUN: ptoas %s 2>&1 | FileCheck %s + +module { + func.func @subview_col_major_compact_keeps_normal_shape( + %src: memref<16x16xf32, #pto.address_space>, + %dst: memref<16x8xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + + %tile = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%src : memref<16x16xf32, #pto.address_space>) + outs(%tile : !pto.tile_buf) + + %s = pto.subview %tile[%c0, %c0] sizes [16, 8] : + !pto.tile_buf + + pto.tstore ins(%s : !pto.tile_buf) + outs(%dst : memref<16x8xf32, #pto.address_space>) + return + } +} + +// CHECK: Tile +// CHECK-NOT: Tile diff --git a/test/basic/subview_col_major_noncompact_preserve_stride.pto b/test/basic/subview_col_major_noncompact_preserve_stride.pto new file mode 100644 index 00000000..a73da8de --- /dev/null +++ b/test/basic/subview_col_major_noncompact_preserve_stride.pto @@ -0,0 +1,24 @@ +// RUN: ptoas %s 2>&1 | FileCheck %s + +module { + func.func @subview_col_major_noncompact_preserve_stride( + %src: memref<16x16xf32, #pto.address_space>, + %dst: memref<8x8xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + + %tile = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%src : memref<16x16xf32, #pto.address_space>) + outs(%tile : !pto.tile_buf) + + %s = pto.subview %tile[%c8, %c0] sizes [8, 8] : + !pto.tile_buf + + pto.tstore ins(%s : !pto.tile_buf) + outs(%dst : memref<8x8xf32, #pto.address_space>) + return + } +} + +// CHECK: Tile +// CHECK-NOT: Tile diff --git a/test/basic/subview_compact_keeps_normal_shape.pto b/test/basic/subview_compact_keeps_normal_shape.pto new file mode 100644 index 00000000..499b743d --- /dev/null +++ b/test/basic/subview_compact_keeps_normal_shape.pto @@ -0,0 +1,23 @@ +// RUN: ptoas %s 2>&1 | FileCheck %s + +module { + func.func @subview_compact_keeps_normal_shape( + %src: memref<16x16xf32, #pto.address_space>, + %dst: memref<8x16xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + + %tile = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%src : memref<16x16xf32, #pto.address_space>) + outs(%tile : !pto.tile_buf) + + %s0 = pto.subview %tile[%c0, %c0] sizes [8, 16] : + !pto.tile_buf + + pto.tstore ins(%s0 : !pto.tile_buf) + outs(%dst : memref<8x16xf32, #pto.address_space>) + return + } +} + +// CHECK: Tile +// CHECK-NOT: Tile diff --git a/test/basic/subview_validshape_guard.pto b/test/basic/subview_validshape_guard.pto new file mode 100644 index 00000000..c508016f --- /dev/null +++ b/test/basic/subview_validshape_guard.pto @@ -0,0 +1,72 @@ +// RUN: ptoas %s 2>&1 | FileCheck %s + +module { + func.func @subview_validshape_noncompact_default( + %dst: memref<2x2xf32, #pto.address_space>) { + %c2 = arith.constant 2 : index + + %tile = pto.alloc_tile : !pto.tile_buf + %s = pto.subview %tile[%c2, %c2] sizes [2, 2] : + !pto.tile_buf + + pto.tstore ins(%s : !pto.tile_buf) + outs(%dst : memref<2x2xf32, #pto.address_space>) + return + } + + func.func @subview_validshape_compact_default( + %dst: memref<2x4xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + + %tile = pto.alloc_tile : !pto.tile_buf + %s = pto.subview %tile[%c2, %c0] sizes [2, 4] : + !pto.tile_buf + + pto.tstore ins(%s : !pto.tile_buf) + outs(%dst : memref<2x4xf32, #pto.address_space>) + return + } + + func.func @subview_validshape_explicit_override( + %dst: memref<2x2xf32, #pto.address_space>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + %tile = pto.alloc_tile : !pto.tile_buf + %s = pto.subview %tile[%c2, %c2] sizes [2, 2] valid [%c1, %c1] : + !pto.tile_buf + + pto.tstore ins(%s : !pto.tile_buf) + outs(%dst : memref<2x2xf32, #pto.address_space>) + return + } + + func.func @subview_validshape_explicit_dynamic( + %vr: index, %vc: index, + %dst: memref<2x2xf32, #pto.address_space>) { + %c2 = arith.constant 2 : index + + %tile = pto.alloc_tile : !pto.tile_buf + %s = pto.subview %tile[%c2, %c2] sizes [2, 2] valid [%vr, %vc] : + !pto.tile_buf + + pto.tstore ins(%s : !pto.tile_buf) + outs(%dst : memref<2x2xf32, #pto.address_space>) + return + } +} + +// Default non-compact subview keeps parent physical shape (4x4) but valid defaults +// to subview shape (2x2), not clipped from parent valid shape. +// CHECK: Tile +// CHECK-NOT: Tile + +// Default compact subview keeps compact shape and valid defaults to sizes (2x4). +// CHECK: Tile + +// Explicit valid override takes effect. +// CHECK: Tile + +// Dynamic explicit valid keeps dynamic valid dims at lowering. +// CHECK: Tile diff --git a/test/samples/Subset/Subset.pto b/test/samples/SubView/SubView.pto similarity index 87% rename from test/samples/Subset/Subset.pto rename to test/samples/SubView/SubView.pto index 71623bf9..dbc75c5e 100644 --- a/test/samples/Subset/Subset.pto +++ b/test/samples/SubView/SubView.pto @@ -2,8 +2,8 @@ module { func.func @test_double_buffer_step(%arg0: memref<32x32xf32, strided<[?, 1], offset: ?>, #pto.address_space>, %arg1: memref<32x32xf32, strided<[?, 1], offset: ?>, #pto.address_space>, %arg2: !pto.tile_buf<32x64xf32, #pto.address_space>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index - %0 = pto.subset %arg2[%c0, %c0] sizes [32, 32] : <32x64xf32, #pto.address_space> - %1 = pto.subset %arg2[%c0, %c32] sizes [32, 32] : <32x64xf32, #pto.address_space> + %0 = pto.subview %arg2[%c0, %c0] sizes [32, 32] : <32x64xf32, #pto.address_space> + %1 = pto.subview %arg2[%c0, %c32] sizes [32, 32] : <32x64xf32, #pto.address_space> pto.tadd ins(%0, %0 : !pto.tile_buf<32x32xf32, strided<[32, 1], offset: [?, ?]>, #pto.address_space>, !pto.tile_buf<32x32xf32, strided<[32, 1], offset: [?, ?]>, #pto.address_space>) outs(%0 : !pto.tile_buf<32x32xf32, strided<[32, 1], offset: [?, ?]>, #pto.address_space>) pto.tload ins(%arg0 : memref<32x32xf32, strided<[?, 1], offset: ?>, #pto.address_space>) outs(%1 : !pto.tile_buf<32x32xf32, strided<[32, 1], offset: [?, ?]>, #pto.address_space>) pto.tstore ins(%0 : !pto.tile_buf<32x32xf32, strided<[32, 1], offset: [?, ?]>, #pto.address_space>) outs(%arg1 : memref<32x32xf32, strided<[?, 1], offset: ?>, #pto.address_space>) diff --git a/test/samples/SubView/board_validation/CMakeLists.txt b/test/samples/SubView/board_validation/CMakeLists.txt new file mode 100644 index 00000000..64d7ec04 --- /dev/null +++ b/test/samples/SubView/board_validation/CMakeLists.txt @@ -0,0 +1,101 @@ +cmake_minimum_required(VERSION 3.16) + +set(CMAKE_C_COMPILER bisheng) +set(CMAKE_CXX_COMPILER bisheng) + +project(subview_split4_npu_validation) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +if(NOT DEFINED ENV{ASCEND_HOME_PATH}) + message(FATAL_ERROR "Cannot find ASCEND_HOME_PATH, please source the CANN set_env.sh.") +else() + set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH}) +endif() + +set(PTO_ISA_ROOT "" CACHE PATH "Path to pto-isa repo") +if(NOT PTO_ISA_ROOT) + set(_PTO_ISA_CANDIDATES + "${CMAKE_CURRENT_LIST_DIR}/../../../../pto-isa" + "${CMAKE_CURRENT_LIST_DIR}/../../../../../pto-isa" + "${CMAKE_CURRENT_LIST_DIR}/../../../../../../pto-isa" + ) + foreach(_cand IN LISTS _PTO_ISA_CANDIDATES) + if(EXISTS "${_cand}/include" AND EXISTS "${_cand}/tests/common") + set(PTO_ISA_ROOT "${_cand}" CACHE PATH "Path to pto-isa repo" FORCE) + break() + endif() + endforeach() +endif() +if(NOT PTO_ISA_ROOT) + message(FATAL_ERROR "Cannot find PTO_ISA_ROOT, please pass -DPTO_ISA_ROOT=/path/to/pto-isa.") +endif() + +set(ASCEND_DRIVER_PATH /usr/local/Ascend/driver) + +add_compile_options( + -D_FORTIFY_SOURCE=2 + -O2 -std=c++17 + -Wno-macro-redefined -Wno-ignored-attributes + -fstack-protector-strong + -fPIC +) +add_link_options( + -s + -Wl,-z,relro + -Wl,-z,now +) + +set(CMAKE_CCE_COMPILE_OPTIONS + -xcce + -fenable-matrix + --cce-aicore-enable-tl + -fPIC + -Xhost-start -Xhost-end + "SHELL:-mllvm -cce-aicore-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-record-overflow=true" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false" +) + +set(CMAKE_CPP_COMPILE_OPTIONS + -xc++ + "SHELL:-include stdint.h" + "SHELL:-include stddef.h" +) + +include_directories( + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common + ${ASCEND_HOME_PATH}/include + ${ASCEND_DRIVER_PATH}/kernel/inc +) + +add_library(subview_kernel SHARED subview_kernel.cpp launch.cpp) +target_compile_options(subview_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --cce-aicore-arch=dav-c220-vec -DMEMORY_BASE -std=c++17) +target_include_directories(subview_kernel PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/ + ${ASCEND_HOME_PATH}/pkg_inc/profiling/ + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime +) +target_link_options(subview_kernel PRIVATE --cce-fatobj-link) + +add_executable(subview_board main.cpp) +target_compile_options(subview_board PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) +target_include_directories(subview_board PRIVATE + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common +) + +target_link_directories(subview_board PUBLIC + ${ASCEND_HOME_PATH}/lib64 +) + +target_link_libraries(subview_board PRIVATE + subview_kernel + runtime + stdc++ ascendcl m tiling_api platform c_sec dl nnopbase +) diff --git a/test/samples/SubView/board_validation/README.md b/test/samples/SubView/board_validation/README.md new file mode 100644 index 00000000..22c467c6 --- /dev/null +++ b/test/samples/SubView/board_validation/README.md @@ -0,0 +1,10 @@ +# SubView Split Board Validation (A3) + +This case validates PTO subview split correctness on board. + +- Input: one `16x16` f32 parent tile (`src.bin`) +- Kernel: subview into four `8x8` tiles at offsets `(0,0)`, `(0,8)`, `(8,0)`, `(8,8)` +- Outputs: `out0.bin`, `out1.bin`, `out2.bin`, `out3.bin` +- Golden: produced by `golden.py` via NumPy slicing of the input + +Pass condition: all 4 outputs exactly match golden (`np.allclose` with `1e-6`). diff --git a/test/samples/SubView/board_validation/compare.py b/test/samples/SubView/board_validation/compare.py new file mode 100644 index 00000000..69ab96d6 --- /dev/null +++ b/test/samples/SubView/board_validation/compare.py @@ -0,0 +1,55 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import sys +import numpy as np + + +def compare_one(name: str, eps: float = 1e-6) -> bool: + g = f"golden_{name}.bin" + o = f"{name}.bin" + if not os.path.exists(g): + print(f"[ERROR] Golden missing: {g}") + return False + if not os.path.exists(o): + print(f"[ERROR] Output missing: {o}") + return False + + gv = np.fromfile(g, dtype=np.float32) + ov = np.fromfile(o, dtype=np.float32) + if gv.shape != ov.shape: + print(f"[ERROR] Shape mismatch for {name}: golden={gv.shape}, out={ov.shape}") + return False + + if not np.allclose(gv, ov, atol=eps, rtol=eps, equal_nan=True): + diff = np.abs(gv.astype(np.float64) - ov.astype(np.float64)) + idx = int(np.argmax(diff)) + print( + f"[ERROR] {name} mismatch: max_diff={float(diff[idx])} idx={idx} " + f"golden={float(gv[idx])} out={float(ov[idx])}" + ) + return False + + print(f"[INFO] {name} compare passed") + return True + + +def main(): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + ok = True + for n in ["out0", "out1", "out2", "out3"]: + ok = compare_one(n) and ok + + if ok: + print("[INFO] compare passed") + return + + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + + +if __name__ == "__main__": + main() diff --git a/test/samples/SubView/board_validation/golden.py b/test/samples/SubView/board_validation/golden.py new file mode 100644 index 00000000..90be386f --- /dev/null +++ b/test/samples/SubView/board_validation/golden.py @@ -0,0 +1,28 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import numpy as np + + +def main(): + src = np.arange(16 * 16, dtype=np.float32).reshape(16, 16) + + out0 = src[0:8, 0:8].copy() + out1 = src[0:8, 8:16].copy() + out2 = src[8:16, 0:8].copy() + out3 = src[8:16, 8:16].copy() + + src.tofile("src.bin") + np.zeros((8, 8), dtype=np.float32).tofile("out0.bin") + np.zeros((8, 8), dtype=np.float32).tofile("out1.bin") + np.zeros((8, 8), dtype=np.float32).tofile("out2.bin") + np.zeros((8, 8), dtype=np.float32).tofile("out3.bin") + + out0.tofile("golden_out0.bin") + out1.tofile("golden_out1.bin") + out2.tofile("golden_out2.bin") + out3.tofile("golden_out3.bin") + + +if __name__ == "__main__": + main() diff --git a/test/samples/SubView/board_validation/launch.cpp b/test/samples/SubView/board_validation/launch.cpp new file mode 100644 index 00000000..dd8f654f --- /dev/null +++ b/test/samples/SubView/board_validation/launch.cpp @@ -0,0 +1,51 @@ +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif + +#include + +#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) +#include +#endif +#include +#include + +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +#if defined(__CCE_AICORE__) +__global__ AICORE void subview_split4(__gm__ float* src, __gm__ float* out0, + __gm__ float* out1, __gm__ float* out2, + __gm__ float* out3); +#else +__global__ AICORE void subview_split4(__gm__ float* src, __gm__ float* out0, + __gm__ float* out1, __gm__ float* out2, + __gm__ float* out3); +#endif + +void LaunchSubViewSplit4_kernel(float *src, float *out0, float *out1, float *out2, + float *out3, void *stream) { + subview_split4<<<1, nullptr, stream>>>((__gm__ float*)src, (__gm__ float*)out0, + (__gm__ float*)out1, (__gm__ float*)out2, + (__gm__ float*)out3); +} diff --git a/test/samples/SubView/board_validation/main.cpp b/test/samples/SubView/board_validation/main.cpp new file mode 100644 index 00000000..c2b62dfe --- /dev/null +++ b/test/samples/SubView/board_validation/main.cpp @@ -0,0 +1,130 @@ +#include "test_common.h" +#include "acl/acl.h" + +#include +#include + +using namespace PtoTestCommon; + +#define ACL_CHECK(expr) \ + do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ + const char *_recent = aclGetRecentErrMsg(); \ + if (_recent != nullptr && _recent[0] != '\0') { \ + std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ + } \ + rc = 1; \ + goto cleanup; \ + } \ + } while (0) + +void LaunchSubViewSplit4_kernel(float *src, float *out0, float *out1, float *out2, + float *out3, void *stream); + +int main() { + size_t elem_src = 16 * 16; + size_t elem_out = 8 * 8; + size_t bytes_src = elem_src * sizeof(float); + size_t bytes_out = elem_out * sizeof(float); + + float *srcHost = nullptr; + float *out0Host = nullptr; + float *out1Host = nullptr; + float *out2Host = nullptr; + float *out3Host = nullptr; + + float *srcDevice = nullptr; + float *out0Device = nullptr; + float *out1Device = nullptr; + float *out2Device = nullptr; + float *out3Device = nullptr; + + int rc = 0; + bool aclInited = false; + bool deviceSet = false; + int deviceId = 0; + aclrtStream stream = nullptr; + + ACL_CHECK(aclInit(nullptr)); + aclInited = true; + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + ACL_CHECK(aclrtSetDevice(deviceId)); + deviceSet = true; + ACL_CHECK(aclrtCreateStream(&stream)); + + ACL_CHECK(aclrtMallocHost((void **)(&srcHost), bytes_src)); + ACL_CHECK(aclrtMallocHost((void **)(&out0Host), bytes_out)); + ACL_CHECK(aclrtMallocHost((void **)(&out1Host), bytes_out)); + ACL_CHECK(aclrtMallocHost((void **)(&out2Host), bytes_out)); + ACL_CHECK(aclrtMallocHost((void **)(&out3Host), bytes_out)); + + ACL_CHECK(aclrtMalloc((void **)&srcDevice, bytes_src, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&out0Device, bytes_out, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&out1Device, bytes_out, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&out2Device, bytes_out, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&out3Device, bytes_out, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./src.bin", bytes_src, srcHost, bytes_src); + ReadFile("./out0.bin", bytes_out, out0Host, bytes_out); + ReadFile("./out1.bin", bytes_out, out1Host, bytes_out); + ReadFile("./out2.bin", bytes_out, out2Host, bytes_out); + ReadFile("./out3.bin", bytes_out, out3Host, bytes_out); + + ACL_CHECK(aclrtMemcpy(srcDevice, bytes_src, srcHost, bytes_src, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(out0Device, bytes_out, out0Host, bytes_out, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(out1Device, bytes_out, out1Host, bytes_out, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(out2Device, bytes_out, out2Host, bytes_out, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(out3Device, bytes_out, out3Host, bytes_out, + ACL_MEMCPY_HOST_TO_DEVICE)); + + LaunchSubViewSplit4_kernel(srcDevice, out0Device, out1Device, out2Device, + out3Device, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + ACL_CHECK(aclrtMemcpy(out0Host, bytes_out, out0Device, bytes_out, + ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(out1Host, bytes_out, out1Device, bytes_out, + ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(out2Host, bytes_out, out2Device, bytes_out, + ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(out3Host, bytes_out, out3Device, bytes_out, + ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("./out0.bin", out0Host, bytes_out); + WriteFile("./out1.bin", out1Host, bytes_out); + WriteFile("./out2.bin", out2Host, bytes_out); + WriteFile("./out3.bin", out3Host, bytes_out); + +cleanup: + aclrtFree(srcDevice); + aclrtFree(out0Device); + aclrtFree(out1Device); + aclrtFree(out2Device); + aclrtFree(out3Device); + + aclrtFreeHost(srcHost); + aclrtFreeHost(out0Host); + aclrtFreeHost(out1Host); + aclrtFreeHost(out2Host); + aclrtFreeHost(out3Host); + + if (stream != nullptr) { + (void)aclrtDestroyStream(stream); + } + if (deviceSet) { + (void)aclrtResetDevice(deviceId); + } + if (aclInited) { + (void)aclFinalize(); + } + + return rc; +} diff --git a/test/samples/SubView/board_validation/run.sh b/test/samples/SubView/board_validation/run.sh new file mode 100755 index 00000000..2e1f229b --- /dev/null +++ b/test/samples/SubView/board_validation/run.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +set -euo pipefail + +SOC_VERSION="${SOC_VERSION:-Ascend910}" +BUILD_DIR="${BUILD_DIR:-build}" +ACL_DEVICE_ID_NPU="${ACL_DEVICE_ID:-}" + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${ROOT_DIR}" + +python3 "${ROOT_DIR}/golden.py" + +if [[ -z "${PTO_ISA_ROOT:-}" ]]; then + search_dir="${ROOT_DIR}" + for _ in {1..8}; do + if [[ -d "${search_dir}/pto-isa/include" && -d "${search_dir}/pto-isa/tests/common" ]]; then + PTO_ISA_ROOT="${search_dir}/pto-isa" + break + fi + if [[ "${search_dir}" == "/" ]]; then + break + fi + search_dir="$(dirname "${search_dir}")" + done + export PTO_ISA_ROOT="${PTO_ISA_ROOT:-}" +fi + +if [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/cann/set_env.sh" ]]; then + echo "[INFO] Sourcing /usr/local/Ascend/cann/set_env.sh" + set +e + set +u + set +o pipefail + source "/usr/local/Ascend/cann/set_env.sh" || true + set -o pipefail + set -u + set -e +fi + +if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then + export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" +fi + +mkdir -p "${ROOT_DIR}/${BUILD_DIR}" +cd "${ROOT_DIR}/${BUILD_DIR}" +if [[ -n "${PTO_ISA_ROOT:-}" ]]; then + cmake -DSOC_VERSION="${SOC_VERSION}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" .. +else + cmake -DSOC_VERSION="${SOC_VERSION}" .. +fi +make -j + +cd "${ROOT_DIR}" +if [[ -n "${ACL_DEVICE_ID_NPU}" ]]; then + ACL_DEVICE_ID="${ACL_DEVICE_ID_NPU}" "${ROOT_DIR}/${BUILD_DIR}/subview_board" +else + "${ROOT_DIR}/${BUILD_DIR}/subview_board" +fi + +COMPARE_STRICT=1 python3 "${ROOT_DIR}/compare.py" diff --git a/test/samples/SubView/board_validation/subview_kernel.cpp b/test/samples/SubView/board_validation/subview_kernel.cpp new file mode 100644 index 00000000..c72d4ebf --- /dev/null +++ b/test/samples/SubView/board_validation/subview_kernel.cpp @@ -0,0 +1,129 @@ +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif +#include + +#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) +#include +#endif +#include +#include + +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void subview_split4(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, __gm__ float* v5) { + unsigned v6 = 8; + unsigned v7 = 16; + unsigned v8 = 1; + unsigned v9 = 0; + int64_t v10 = 0; + int32_t v11 = 16; + int32_t v12 = 8; + using T = float; + Tile v13; + TASSIGN(v13, v10); + using GTShape_5736857712 = pto::Shape<1, 1, 1, 16, 16>; + using GTStride_5736857712 = pto::Stride<256, 256, 256, 16, 1>; + constexpr pto::Layout GT_5736857712_layout = pto::Layout::ND; + GTShape_5736857712 v14 = GTShape_5736857712(); + GTStride_5736857712 v15 = GTStride_5736857712(); + using GT_5736857712 = GlobalTensor; + GT_5736857712 v16 = GT_5736857712(v1, v14, v15); + TLOAD(v13, v16); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + __ubuf__ float* v17 = v13.data(); + Tile v18; + uint64_t v19 = reinterpret_cast((__ubuf__ float*) (v17 + (v9 + v9 * v7) + v9 * v8)); + TASSIGN(v18, v19); + __ubuf__ float* v20 = v13.data(); + Tile v21; + uint64_t v22 = reinterpret_cast((__ubuf__ float*) (v20 + (v9 + v9 * v7) + v6 * v8)); + TASSIGN(v21, v22); + __ubuf__ float* v23 = v13.data(); + Tile v24; + uint64_t v25 = reinterpret_cast((__ubuf__ float*) (v23 + (v9 + v6 * v7) + v9 * v8)); + TASSIGN(v24, v25); + __ubuf__ float* v26 = v13.data(); + Tile v27; + uint64_t v28 = reinterpret_cast((__ubuf__ float*) (v26 + (v9 + v6 * v7) + v6 * v8)); + TASSIGN(v27, v28); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + using GTShape_5736826112 = pto::Shape<1, 1, 1, 8, 8>; + using GTStride_5736826112 = pto::Stride<64, 64, 64, 8, 1>; + constexpr pto::Layout GT_5736826112_layout = pto::Layout::ND; + GTShape_5736826112 v29 = GTShape_5736826112(); + GTStride_5736826112 v30 = GTStride_5736826112(); + using GT_5736826112 = GlobalTensor; + GT_5736826112 v31 = GT_5736826112(v2, v29, v30); + TSTORE(v31, v18); + using GTShape_5736826480 = pto::Shape<1, 1, 1, 8, 8>; + using GTStride_5736826480 = pto::Stride<64, 64, 64, 8, 1>; + constexpr pto::Layout GT_5736826480_layout = pto::Layout::ND; + GTShape_5736826480 v32 = GTShape_5736826480(); + GTStride_5736826480 v33 = GTStride_5736826480(); + using GT_5736826480 = GlobalTensor; + GT_5736826480 v34 = GT_5736826480(v3, v32, v33); + TSTORE(v34, v21); + using GTShape_5736826624 = pto::Shape<1, 1, 1, 8, 8>; + using GTStride_5736826624 = pto::Stride<64, 64, 64, 8, 1>; + constexpr pto::Layout GT_5736826624_layout = pto::Layout::ND; + GTShape_5736826624 v35 = GTShape_5736826624(); + GTStride_5736826624 v36 = GTStride_5736826624(); + using GT_5736826624 = GlobalTensor; + GT_5736826624 v37 = GT_5736826624(v4, v35, v36); + TSTORE(v37, v24); + using GTShape_5736826768 = pto::Shape<1, 1, 1, 8, 8>; + using GTStride_5736826768 = pto::Stride<64, 64, 64, 8, 1>; + constexpr pto::Layout GT_5736826768_layout = pto::Layout::ND; + GTShape_5736826768 v38 = GTShape_5736826768(); + GTStride_5736826768 v39 = GTStride_5736826768(); + using GT_5736826768 = GlobalTensor; + GT_5736826768 v40 = GT_5736826768(v5, v38, v39); + TSTORE(v40, v27); + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + diff --git a/test/samples/Subset/subset.cpp b/test/samples/SubView/subview.cpp similarity index 100% rename from test/samples/Subset/subset.cpp rename to test/samples/SubView/subview.cpp diff --git a/test/samples/Subset/subset.py b/test/samples/SubView/subview.py similarity index 87% rename from test/samples/Subset/subset.py rename to test/samples/SubView/subview.py index eaf4cd7d..fe8a381b 100644 --- a/test/samples/Subset/subset.py +++ b/test/samples/SubView/subview.py @@ -25,7 +25,7 @@ def build(): fn_ty = func.FunctionType.get([], []) with InsertionPoint(m.body): - fn = func.FuncOp("subset_pingpong_demo", fn_ty) + fn = func.FuncOp("subview_pingpong_demo", fn_ty) entry = fn.add_entry_block() with InsertionPoint(entry): @@ -33,8 +33,8 @@ def build(): c32 = arith.ConstantOp(idx, 32).result workspace = pto.AllocTileOp(ws_type).result - ping = pto.SubsetOp(workspace, [c0, c0], sizes=[32, 32]).result - pong = pto.SubsetOp(workspace, [c0, c32], sizes=[32, 32]).result + ping = pto.SubViewOp(workspace, [c0, c0], sizes=[32, 32]).result + pong = pto.SubViewOp(workspace, [c0, c32], sizes=[32, 32]).result pto.TAddOp(ping, ping, ping) pto.TAddOp(pong, pong, pong) diff --git a/test/samples/Subset/subset_boxed_dynamic.py b/test/samples/SubView/subview_boxed_dynamic.py similarity index 91% rename from test/samples/Subset/subset_boxed_dynamic.py rename to test/samples/SubView/subview_boxed_dynamic.py index 250fb691..e7882d66 100644 --- a/test/samples/Subset/subset_boxed_dynamic.py +++ b/test/samples/SubView/subview_boxed_dynamic.py @@ -26,7 +26,7 @@ def build(): fn_ty = func.FunctionType.get([idx], []) with InsertionPoint(m.body): - fn = func.FuncOp("subset_boxed_dynamic", fn_ty) + fn = func.FuncOp("subview_boxed_dynamic", fn_ty) entry = fn.add_entry_block() with InsertionPoint(entry): @@ -36,7 +36,7 @@ def build(): row_off = arith.MulIOp(i0, c16).result t0 = pto.AllocTileOp(tile_ty).result - _sub = pto.SubsetOp(t0, [row_off, c0], sizes=[16, 32]).result + _sub = pto.SubViewOp(t0, [row_off, c0], sizes=[16, 32]).result func.ReturnOp([]) diff --git a/test/samples/Subset/subset_boxed_invalid.py b/test/samples/SubView/subview_boxed_invalid.py similarity index 84% rename from test/samples/Subset/subset_boxed_invalid.py rename to test/samples/SubView/subview_boxed_invalid.py index 28a185f1..2a177e5e 100644 --- a/test/samples/Subset/subset_boxed_invalid.py +++ b/test/samples/SubView/subview_boxed_invalid.py @@ -21,12 +21,12 @@ def build(): cfg = pto.TileBufConfigAttr.get(bl, sl, fractal_ab_size, pd, ctx) # Boxed layout: innerRows=16, innerCols=32/2=16 (f16). - # Invalid subset: column offset not aligned (offC=8). + # Invalid subview: column offset not aligned (offC=8). tile_ty = pto.TileBufType.get([32, 32], f16, vec, [32, 32], cfg, ctx) fn_ty = func.FunctionType.get([], []) with InsertionPoint(m.body): - fn = func.FuncOp("subset_invalid_boxed", fn_ty) + fn = func.FuncOp("subview_invalid_boxed", fn_ty) entry = fn.add_entry_block() with InsertionPoint(entry): @@ -35,14 +35,14 @@ def build(): t0 = pto.AllocTileOp(tile_ty).result # Expect verifier failure: offC=8 not multiple of innerCols=16. - _bad = pto.SubsetOp(t0, [c0, c8], sizes=[16, 16]).result + _bad = pto.SubViewOp(t0, [c0, c8], sizes=[16, 16]).result func.ReturnOp([]) ok = m.operation.verify() if ok: return m - # Expected failure for invalid subset; make python exit non-zero. + # Expected failure for invalid subview; make python exit non-zero. raise SystemExit(1) diff --git a/test/samples/Subset/subset_tsubs.py b/test/samples/SubView/subview_tsubs.py similarity index 85% rename from test/samples/Subset/subset_tsubs.py rename to test/samples/SubView/subview_tsubs.py index b93ced59..55c0c780 100644 --- a/test/samples/Subset/subset_tsubs.py +++ b/test/samples/SubView/subview_tsubs.py @@ -24,7 +24,7 @@ def build(): fn_ty = func.FunctionType.get([], []) with InsertionPoint(m.body): - fn = func.FuncOp("subset_tsubs_demo", fn_ty) + fn = func.FuncOp("subview_tsubs_demo", fn_ty) entry = fn.add_entry_block() with InsertionPoint(entry): @@ -32,9 +32,9 @@ def build(): scale = arith.ConstantOp(f32, 1.0).result workspace = pto.AllocTileOp(tile_8x128).result - sub0 = pto.SubsetOp(workspace, [c0, c0], sizes=[8, 64]).result + sub0 = pto.SubViewOp(workspace, [c0, c0], sizes=[8, 64]).result - # Use subset as both src and dst to ensure tile lowering is preserved. + # Use subview as both src and dst to ensure tile lowering is preserved. pto.TSubSOp(sub0, scale, sub0) func.ReturnOp([]) diff --git a/test/samples/Subset/vadd_pto_pingpong.py b/test/samples/SubView/vadd_pto_pingpong.py similarity index 92% rename from test/samples/Subset/vadd_pto_pingpong.py rename to test/samples/SubView/vadd_pto_pingpong.py index 1211e056..93085fc5 100644 --- a/test/samples/Subset/vadd_pto_pingpong.py +++ b/test/samples/SubView/vadd_pto_pingpong.py @@ -53,11 +53,11 @@ def build_pingpong(): sv_dst = pto.PartitionViewOp(pto.PartitionTensorViewType.get([32,32], f32, ctx), tv_dst, offsets=[c0, c0], sizes=[c32, c32]).result - # Subset: Ping [0,0], Pong [0,32] + # SubView: Ping [0,0], Pong [0,32] # 这里不需要指定 Result 类型,C++ 会自动推导 - # Subset sizes must be static (I64ArrayAttr); offsets must be SSA. - ping = pto.SubsetOp(workspace, [c0, c0], sizes=[32, 32]).result - pong = pto.SubsetOp(workspace, [c0, c32], sizes=[32, 32]).result + # SubView sizes must be static (I64ArrayAttr); offsets must be SSA. + ping = pto.SubViewOp(workspace, [c0, c0], sizes=[32, 32]).result + pong = pto.SubViewOp(workspace, [c0, c32], sizes=[32, 32]).result # DPS: Compute, Prefetch, WriteBack pto.TLoadOp(None, sv_src, pong) diff --git a/test/samples/Sync/syncHigh.py b/test/samples/Sync/syncHigh.py index 406057f0..8c317f14 100755 --- a/test/samples/Sync/syncHigh.py +++ b/test/samples/Sync/syncHigh.py @@ -27,7 +27,7 @@ def build(): # A real kernel that manually inserts set_flag/wait_flag. # - # NOTE(A5): `set_flag/wait_flag` only accept a subset of PIPE enums on A5. + # NOTE(A5): `set_flag/wait_flag` only accept a limited set of PIPE enums on A5. # Keep this sample in the supported set by using MTE2/V/MTE3 only. fn_ty = func.FunctionType.get([ptr_f32, ptr_f32], []) with InsertionPoint(m.body): diff --git a/test/samples/runop.sh b/test/samples/runop.sh index b0c3c024..506b4d8d 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -302,11 +302,11 @@ process_one_dir() { continue fi - # Regression guard: SubsetOp valid-shape inference must not produce 0. + # Regression guard: SubViewOp valid-shape inference must not produce 0. # This breaks downstream NPU compilation (e.g. vadd_pto_pingpong workspace ping/pong). if [[ "$base" == "vadd_pto_pingpong" ]]; then if grep -Fq ", 0, SLayout" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tgenerated tile has valid dim 0 (subset valid-shape bug)" + echo -e "${A}(${base}.py)\tFAIL\tgenerated tile has valid dim 0 (subview valid-shape bug)" overall=1 continue fi