From 8aced45e772bb09b0ed78b0809d07aae805b1ac6 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 11:01:42 +0800 Subject: [PATCH 1/6] [InsertSync] add PIPE_S scalar sync and addptr alias-through --- include/PTO/IR/PTOOps.td | 10 +++ .../InsertSync/InsertSyncAnalysis.cpp | 11 ++- .../Transforms/InsertSync/PTOIRTranslator.cpp | 76 +++++++++++++++---- .../test_inject_sync_scalar_cross_pipe.py | 68 +++++++++++++++++ ...t_inject_sync_scalar_intra_pipe_barrier.py | 39 ++++++++++ test/samples/runop.sh | 35 +++++++++ 6 files changed, 223 insertions(+), 16 deletions(-) create mode 100644 test/samples/Sync/test_inject_sync_scalar_cross_pipe.py create mode 100644 test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index c967a75e..28b1c85d 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -100,6 +100,7 @@ def AddPtrOp : PTO_Op<"addptr", [ //===----------------------------------------------------------------------===// def LoadScalarOp : PTO_Op<"load_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods ]> { let summary = "Load a single scalar element from a pointer at offset."; @@ -116,9 +117,14 @@ def LoadScalarOp : PTO_Op<"load_scalar", [ let assemblyFormat = [{ $ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def StoreScalarOp : PTO_Op<"store_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods ]> { let summary = "Store a single scalar element to a pointer at offset."; @@ -136,6 +142,10 @@ def StoreScalarOp : PTO_Op<"store_scalar", [ let assemblyFormat = [{ $value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> { diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index a33bf889..e236f3c4 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -1,4 +1,5 @@ #include "PTO/Transforms/InsertSync/InsertSyncAnalysis.h" +#include "PTO/IR/PTO.h" #include "PTO/Transforms/InsertSync/SyncCommon.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -23,6 +24,10 @@ static bool isValidPipeIndex(PipelineType pipe) { return static_cast(pipe) < kPipeStateSize; } +static bool isScalarMemoryOp(Operation *op) { + return isa(op); +} + // ============================================================================== // 1. Entry Point // ============================================================================== @@ -122,7 +127,11 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync( const PipelineType nowPipe = nowCompound->kPipeValue; if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) { - return true; + Operation *nowOp = nowCompound->elementOp; + Operation *frontOp = frontCompound->elementOp; + if (!isScalarMemoryOp(nowOp) && !isScalarMemoryOp(frontOp)) { + return true; + } } if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) { diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp index 33aec28b..b5e3b463 100644 --- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp +++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp @@ -7,6 +7,7 @@ #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Matchers.h" +#include // [P0 新增] 引入副作用接口和 PTO 接口 #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -15,18 +16,41 @@ using namespace mlir; using namespace mlir::pto; +static int64_t getElementSizeInBytes(Type elemType) { + if (auto intTy = dyn_cast(elemType)) { + return std::max(1, intTy.getWidth() / 8); + } + if (auto floatTy = dyn_cast(elemType)) { + return std::max(1, floatTy.getWidth() / 8); + } + if (isa(elemType)) { + return 8; + } + return 1; +} + // [辅助函数] 尝试从 Operation 中计算相对于 Source 的字节偏移量和新大小 // 返回值: pair // 如果无法计算静态值,返回 {-1, -1} 表示这是动态的 static std::pair getStaticOffsetAndSize(Operation *op, Value src) { - auto srcType = dyn_cast(src.getType()); - if (!srcType) return {0, 0}; - - int64_t elemSize = srcType.getElementType().getIntOrFloatBitWidth() / 8; - if (elemSize == 0) elemSize = 1; + Type srcElemType = nullptr; + if (auto srcType = dyn_cast(src.getType())) { + srcElemType = srcType.getElementType(); + } else if (auto ptrType = dyn_cast(src.getType())) { + srcElemType = ptrType.getElementType(); + } else { + return {0, 0}; + } + + const int64_t elemSize = getElementSizeInBytes(srcElemType); // === Case 1: memref.subview === if (auto subView = dyn_cast(op)) { + auto srcType = dyn_cast(src.getType()); + if (!srcType) { + return {-1, -1}; + } + int64_t baseOffset; SmallVector strides; if (failed(mlir::getStridesAndOffset(srcType, strides, baseOffset))) { @@ -71,6 +95,15 @@ static std::pair getStaticOffsetAndSize(Operation *op, Value s } return {staticOffsets[0] * elemSize, 0}; } + + // === Case 3: pto.addptr === + if (auto addPtrOp = dyn_cast(op)) { + llvm::APInt apIntValue; + if (!matchPattern(addPtrOp.getOffset(), m_ConstantInt(&apIntValue))) { + return {-1, -1}; + } + return {apIntValue.getSExtValue() * elemSize, 0}; + } return {0, 0}; } @@ -138,6 +171,9 @@ void PTOIRTranslator::RecursionIR(Region *region) { else if (auto bindTileOp = dyn_cast(op)) { UpdateAliasBufferInfo(bindTileOp.getResult(), bindTileOp.getSource()); } + else if (auto addPtrOp = dyn_cast(op)) { + UpdateAliasBufferInfo(addPtrOp.getResult(), addPtrOp.getPtr()); + } else if (auto subViewOp = dyn_cast(op)) { UpdateAliasBufferInfo(subViewOp.getResult(), subViewOp.getSource()); } @@ -496,28 +532,38 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) { if (!buffer2MemInfoMap_.contains(source)) return; int64_t deltaOffset = 0; - int64_t newSize = -1; + int64_t newSize = -1; + bool hasUnknownAliasRange = false; if (auto op = result.getDefiningOp()) { auto info = getStaticOffsetAndSize(op, source); - if (info.first != -1) { - deltaOffset = info.first; - if (info.second > 0) newSize = info.second; - } + if (info.first == -1) { + hasUnknownAliasRange = true; + } else { + deltaOffset = info.first; + if (info.second > 0) newSize = info.second; + } } auto &resultMemInfoVec = buffer2MemInfoMap_[result]; for (auto &parentInfo : buffer2MemInfoMap_[source]) { auto newInfo = parentInfo->clone(result); - - if (!newInfo->baseAddresses.empty()) { - newInfo->baseAddresses[0] += deltaOffset; + + if (hasUnknownAliasRange) { + // Dynamic pointer arithmetic cannot be modeled precisely here. + // Keep root/scope aliasing, but drop concrete range info conservatively. + newInfo->baseAddresses.clear(); + newInfo->allocateSize = 0; } else { + if (!newInfo->baseAddresses.empty()) { + newInfo->baseAddresses[0] += deltaOffset; + } else { newInfo->baseAddresses.push_back(deltaOffset); + } } - - if (newSize > 0) { + + if (!hasUnknownAliasRange && newSize > 0) { newInfo->allocateSize = newSize; } diff --git a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py new file mode 100644 index 00000000..306642dd --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py @@ -0,0 +1,68 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def _idx_const(v: int): + return arith.ConstantOp(IndexType.get(), v).result + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + tv2 = pto.TensorViewType.get(2, f32, ctx) + tile_view = pto.PartitionTensorViewType.get([16, 16], f32, ctx) + + vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) + bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) + cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) + tile_buf = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx) + + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_cross_pipe", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + src, dst = entry.arguments + c0 = _idx_const(0) + c1 = _idx_const(1) + c4 = _idx_const(4) + c16 = _idx_const(16) + + src_tv = pto.MakeTensorViewOp(tv2, src, [c16, c16], [c16, c1]).result + dst_tv = pto.MakeTensorViewOp(tv2, dst, [c16, c16], [c16, c1]).result + src_part = pto.PartitionViewOp( + tile_view, src_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + dst_part = pto.PartitionViewOp( + tile_view, dst_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + ub = pto.AllocTileOp(tile_buf).result + + src_off = pto.addptr(src, c4) + one = arith.ConstantOp(f32, 1.0).result + pto.store_scalar(src_off, c0, one) + + pto.TLoadOp(None, src_part, ub) + pto.TStoreOp(None, ub, dst_part) + + dst_off = pto.addptr(dst, c4) + loaded = pto.load_scalar(f32, dst_off, c0) + pto.store_scalar(dst_off, c1, loaded) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py new file mode 100644 index 00000000..6c61ae26 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py @@ -0,0 +1,39 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + idx = IndexType.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + + fn_ty = func.FunctionType.get([ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_intra_pipe_barrier", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + ptr = entry.arguments[0] + c0 = arith.ConstantOp(idx, 0).result + c4 = arith.ConstantOp(idx, 4).result + one = arith.ConstantOp(f32, 1.0).result + two = arith.ConstantOp(f32, 2.0).result + + ptr_off = pto.addptr(ptr, c4) + pto.store_scalar(ptr_off, c0, one) + pto.store_scalar(ptr_off, c0, two) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ef71560d..60ea4688 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -292,6 +292,41 @@ process_one_dir() { fi fi + # Scalar sync regression: scalar store/load should participate in PIPE_S + # auto-sync and correctly connect with DMA pipelines. + if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then + if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE2" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE2" + overall=1 + continue + fi + if ! grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE3->PIPE_S" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE3->PIPE_S" + overall=1 + continue + fi + fi + + # Scalar intra-pipe regression: dependent scalar PIPE_S accesses should be + # serialized by a per-pipe barrier. + if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then + if ! grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + overall=1 + continue + fi + fi + # Regression guard for issue #185: barrier_sync must support op types # beyond TMATMUL/TVEC and lower to the expected per-pipe barrier. if [[ "$base" == "test_barrier_sync" ]]; then From 40cee3b694f2789fd4eeb25bc0dd58f053cb578f Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 16:27:36 +0800 Subject: [PATCH 2/6] [InsertSync] align scalar sync tests and add A3-safe PIPE_ALL fallback --- .../InsertSync/InsertSyncAnalysis.cpp | 29 ++++++++++++- .../test_inject_sync_scalar_cross_pipe.py | 8 ++-- test/samples/runop.sh | 42 ++++++++++++------- 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index e236f3c4..92c81f3e 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -28,6 +28,21 @@ static bool isScalarMemoryOp(Operation *op) { return isa(op); } +static bool needsPipeAllBarrier(PipelineType srcPipe, PipelineType dstPipe) { + // A3 runtime is unstable for these scalar synchronization forms: + // 1) PIPE_S local barrier + // 2) PIPE_S -> PIPE_MTE2 + // 3) PIPE_MTE3 -> PIPE_S + // Conservatively fall back to PIPE_ALL barrier to preserve correctness. + if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_S) + return true; + if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_MTE2) + return true; + if (srcPipe == PipelineType::PIPE_MTE3 && dstPipe == PipelineType::PIPE_S) + return true; + return false; +} + // ============================================================================== // 1. Entry Point // ============================================================================== @@ -338,7 +353,19 @@ void InsertSyncAnalysis::InsertSyncOperation( PipelineType nowPipe = nowCompound->kPipeValue; PipelineType frontPipe = frontCompound->kPipeValue; - if (nowPipe == frontPipe) { + if (needsPipeAllBarrier(frontPipe, nowPipe)) { + unsigned insertBarrierId = nowCompound->GetIndex(); + auto barrierOp = std::make_unique( + SyncOperation::TYPE::PIPE_BARRIER, PipelineType::PIPE_ALL, + PipelineType::PIPE_ALL, syncIndex_, insertBarrierId, forEndIndex); + barrierOp->SetDepSyncIRIndex(frontCompound->GetIndex()); + syncIR_[insertBarrierId]->pipeBefore.push_back(barrierOp.get()); + barrierOp->SetSyncIRIndex(insertBarrierId); + + SmallVector> newSync; + newSync.emplace_back(std::move(barrierOp)); + syncOperations_.emplace_back(std::move(newSync)); + } else if (nowPipe == frontPipe) { unsigned insertBarrierId = nowCompound->GetIndex(); auto barrierOp = std::make_unique( SyncOperation::TYPE::PIPE_BARRIER, frontPipe, nowPipe, syncIndex_, diff --git a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py index 306642dd..47310dc3 100644 --- a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py +++ b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py @@ -48,15 +48,13 @@ def build(): ub = pto.AllocTileOp(tile_buf).result src_off = pto.addptr(src, c4) + dst_off = pto.addptr(dst, c4) one = arith.ConstantOp(f32, 1.0).result - pto.store_scalar(src_off, c0, one) - pto.TLoadOp(None, src_part, ub) - pto.TStoreOp(None, ub, dst_part) - - dst_off = pto.addptr(dst, c4) + pto.store_scalar(src_off, c0, one) loaded = pto.load_scalar(f32, dst_off, c0) pto.store_scalar(dst_off, c1, loaded) + pto.TStoreOp(None, ub, dst_part) func.ReturnOp([]) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 60ea4688..896ed443 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -292,36 +292,50 @@ process_one_dir() { fi fi - # Scalar sync regression: scalar store/load should participate in PIPE_S - # auto-sync and correctly connect with DMA pipelines. + # Scalar sync regression: scalar load/store should participate in PIPE_S + # auto-sync and correctly connect with supported DMA directions. if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then - if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE2" + if ! grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE2->PIPE_S" overall=1 continue fi - if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE2" + if ! grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE2->PIPE_S" overall=1 continue fi - if ! grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE3->PIPE_S" + if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE3" overall=1 continue fi - if ! grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE3->PIPE_S" + if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE3" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S->PIPE_MTE2 event in scalar cross-pipe case" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_MTE3->PIPE_S event in scalar cross-pipe case" overall=1 continue fi fi - # Scalar intra-pipe regression: dependent scalar PIPE_S accesses should be - # serialized by a per-pipe barrier. + # Scalar intra-pipe regression: dependent scalar accesses should be + # serialized by an extra safety barrier (beyond the function-tail PIPE_ALL). if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then - if ! grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + local bar_all_cnt + bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)" + if [[ "${bar_all_cnt}" -lt 2 ]]; then + echo -e "${A}(${base}.py)\tFAIL\tmissing extra pipe_barrier(PIPE_ALL) for scalar intra-pipe dependency" overall=1 continue fi From 3938e52f23345db1411f739d3e1d73605d4b5f48 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 17:08:11 +0800 Subject: [PATCH 3/6] [InsertSync] skip S->S sync and align scalar intra-pipe check --- .../InsertSync/InsertSyncAnalysis.cpp | 21 +++++-------------- test/samples/runop.sh | 19 +++++++++++++---- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index 92c81f3e..3bb3b034 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -24,18 +24,11 @@ static bool isValidPipeIndex(PipelineType pipe) { return static_cast(pipe) < kPipeStateSize; } -static bool isScalarMemoryOp(Operation *op) { - return isa(op); -} - static bool needsPipeAllBarrier(PipelineType srcPipe, PipelineType dstPipe) { // A3 runtime is unstable for these scalar synchronization forms: - // 1) PIPE_S local barrier - // 2) PIPE_S -> PIPE_MTE2 - // 3) PIPE_MTE3 -> PIPE_S + // 1) PIPE_S -> PIPE_MTE2 + // 2) PIPE_MTE3 -> PIPE_S // Conservatively fall back to PIPE_ALL barrier to preserve correctness. - if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_S) - return true; if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_MTE2) return true; if (srcPipe == PipelineType::PIPE_MTE3 && dstPipe == PipelineType::PIPE_S) @@ -141,13 +134,9 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync( const PipelineType frontPipe = frontCompound->kPipeValue; const PipelineType nowPipe = nowCompound->kPipeValue; - if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) { - Operation *nowOp = nowCompound->elementOp; - Operation *frontOp = frontCompound->elementOp; - if (!isScalarMemoryOp(nowOp) && !isScalarMemoryOp(frontOp)) { - return true; - } - } + // Scalar pipe is in-order on target hardware; skip same-pipe sync. + if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) + return true; if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) { return true; diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 896ed443..933bfb2f 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -329,13 +329,24 @@ process_one_dir() { fi fi - # Scalar intra-pipe regression: dependent scalar accesses should be - # serialized by an extra safety barrier (beyond the function-tail PIPE_ALL). + # Scalar intra-pipe regression: PIPE_S local dependency should not inject + # extra sync (PIPE_S is in-order); only function-tail PIPE_ALL remains. if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then local bar_all_cnt bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)" - if [[ "${bar_all_cnt}" -lt 2 ]]; then - echo -e "${A}(${base}.py)\tFAIL\tmissing extra pipe_barrier(PIPE_ALL) for scalar intra-pipe dependency" + if grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S<->PIPE_S event sync for scalar intra-pipe dependency" + overall=1 + continue + fi + if [[ "${bar_all_cnt}" -ne 1 ]]; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_ALL barrier count=${bar_all_cnt} (expect 1 tail barrier)" overall=1 continue fi From 6c8c44e6cf641e96c8a41adbb26ea178a13757b7 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 17:14:09 +0800 Subject: [PATCH 4/6] [InsertSync] remove scalar cross-pipe PIPE_ALL fallback --- .../InsertSync/InsertSyncAnalysis.cpp | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index 3bb3b034..b91e9f68 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -24,18 +24,6 @@ static bool isValidPipeIndex(PipelineType pipe) { return static_cast(pipe) < kPipeStateSize; } -static bool needsPipeAllBarrier(PipelineType srcPipe, PipelineType dstPipe) { - // A3 runtime is unstable for these scalar synchronization forms: - // 1) PIPE_S -> PIPE_MTE2 - // 2) PIPE_MTE3 -> PIPE_S - // Conservatively fall back to PIPE_ALL barrier to preserve correctness. - if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_MTE2) - return true; - if (srcPipe == PipelineType::PIPE_MTE3 && dstPipe == PipelineType::PIPE_S) - return true; - return false; -} - // ============================================================================== // 1. Entry Point // ============================================================================== @@ -342,19 +330,7 @@ void InsertSyncAnalysis::InsertSyncOperation( PipelineType nowPipe = nowCompound->kPipeValue; PipelineType frontPipe = frontCompound->kPipeValue; - if (needsPipeAllBarrier(frontPipe, nowPipe)) { - unsigned insertBarrierId = nowCompound->GetIndex(); - auto barrierOp = std::make_unique( - SyncOperation::TYPE::PIPE_BARRIER, PipelineType::PIPE_ALL, - PipelineType::PIPE_ALL, syncIndex_, insertBarrierId, forEndIndex); - barrierOp->SetDepSyncIRIndex(frontCompound->GetIndex()); - syncIR_[insertBarrierId]->pipeBefore.push_back(barrierOp.get()); - barrierOp->SetSyncIRIndex(insertBarrierId); - - SmallVector> newSync; - newSync.emplace_back(std::move(barrierOp)); - syncOperations_.emplace_back(std::move(newSync)); - } else if (nowPipe == frontPipe) { + if (nowPipe == frontPipe) { unsigned insertBarrierId = nowCompound->GetIndex(); auto barrierOp = std::make_unique( SyncOperation::TYPE::PIPE_BARRIER, frontPipe, nowPipe, syncIndex_, From 8be599bccfb2c57d482a70ec589836b678deac2e Mon Sep 17 00:00:00 2001 From: lishengtao Date: Tue, 17 Mar 2026 16:35:24 +0800 Subject: [PATCH 5/6] [InsertSync] preserve unknown local alias range conservatively --- .../PTO/Transforms/InsertSync/SyncCommon.h | 19 +++++++--- .../InsertSync/MemoryDependentAnalyzer.cpp | 3 ++ .../Transforms/InsertSync/PTOIRTranslator.cpp | 10 +++-- ..._inject_sync_unknown_alias_local_chain.pto | 37 +++++++++++++++++++ ...t_inject_sync_unknown_alias_local_chain.py | 6 +++ test/samples/runop.sh | 15 ++++++++ 6 files changed, 81 insertions(+), 9 deletions(-) create mode 100644 test/samples/Sync/test_inject_sync_unknown_alias_local_chain.pto create mode 100644 test/samples/Sync/test_inject_sync_unknown_alias_local_chain.py diff --git a/include/PTO/Transforms/InsertSync/SyncCommon.h b/include/PTO/Transforms/InsertSync/SyncCommon.h index 25a02551..587766c4 100644 --- a/include/PTO/Transforms/InsertSync/SyncCommon.h +++ b/include/PTO/Transforms/InsertSync/SyncCommon.h @@ -73,9 +73,11 @@ enum class TCoreType { struct BaseMemInfo { BaseMemInfo( Value baseBuffer, Value rootBuffer, pto::AddressSpace scope, - SmallVector baseAddresses, uint64_t allocateSize) + SmallVector baseAddresses, uint64_t allocateSize, + bool unknownRange = false) : baseBuffer(baseBuffer), rootBuffer(rootBuffer), scope(scope), - baseAddresses(std::move(baseAddresses)), allocateSize(allocateSize) {} + baseAddresses(std::move(baseAddresses)), allocateSize(allocateSize), + unknownRange(unknownRange) {} /// baseBuffer: 当前操作直接使用的 Buffer (可能是 View 或 Alias) Value baseBuffer; @@ -85,6 +87,8 @@ struct BaseMemInfo { pto::AddressSpace scope; SmallVector baseAddresses; // 用于 Offset 分析 uint64_t allocateSize; + // True means alias range is unknown and must be treated conservatively. + bool unknownRange{false}; bool areVectorEqual(const SmallVector& vec1, const SmallVector& vec2) const { @@ -99,6 +103,7 @@ struct BaseMemInfo { if (!areVectorEqual(baseAddresses, other.baseAddresses)) return false; if (rootBuffer != other.rootBuffer) return false; if (scope != other.scope) return false; + if (unknownRange != other.unknownRange) return false; // allocateSize 和 baseBuffer 的严格相等性在某些别名分析中可能太强了, // 但为了保持原有逻辑,先保留。重点是 rootBuffer 必须一致。 if (allocateSize != other.allocateSize) return false; @@ -108,12 +113,14 @@ struct BaseMemInfo { std::unique_ptr clone() const { return std::make_unique( - baseBuffer, rootBuffer, scope, baseAddresses, allocateSize); + baseBuffer, rootBuffer, scope, baseAddresses, allocateSize, + unknownRange); } - + std::unique_ptr clone(Value cloneBaseBuffer) const { return std::make_unique( - cloneBaseBuffer, rootBuffer, scope, baseAddresses, allocateSize); + cloneBaseBuffer, rootBuffer, scope, baseAddresses, allocateSize, + unknownRange); } }; @@ -355,4 +362,4 @@ void checkCondition(bool condition, const std::string &message); } // namespace pto } // namespace mlir -#endif // MLIR_DIALECT_PTO_TRANSFORMS_SYNC_COMMON_H \ No newline at end of file +#endif // MLIR_DIALECT_PTO_TRANSFORMS_SYNC_COMMON_H diff --git a/lib/PTO/Transforms/InsertSync/MemoryDependentAnalyzer.cpp b/lib/PTO/Transforms/InsertSync/MemoryDependentAnalyzer.cpp index 50135748..a8afe1f2 100644 --- a/lib/PTO/Transforms/InsertSync/MemoryDependentAnalyzer.cpp +++ b/lib/PTO/Transforms/InsertSync/MemoryDependentAnalyzer.cpp @@ -144,7 +144,9 @@ bool MemoryDependentAnalyzer::MemAlias(const BaseMemInfo *a, // 2. Local Memory (UB/L1) if (a->rootBuffer == b->rootBuffer) { + if (a->unknownRange || b->unknownRange) return true; if (a->baseAddresses.empty() || b->baseAddresses.empty()) return true; + if (a->allocateSize == 0 || b->allocateSize == 0) return true; return isBufferAddressRangeOverlap(a, b); } @@ -182,6 +184,7 @@ bool MemoryDependentAnalyzer::isGMBufferOverlap(const BaseMemInfo *a, return true; } + if (a->unknownRange || b->unknownRange) return true; if (a->baseAddresses.empty() || b->baseAddresses.empty()) return true; if (a->allocateSize == 0 || b->allocateSize == 0) return true; diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp index b5e3b463..25be8897 100644 --- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp +++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp @@ -549,13 +549,17 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) { for (auto &parentInfo : buffer2MemInfoMap_[source]) { auto newInfo = parentInfo->clone(result); + bool unknownRange = parentInfo->unknownRange || hasUnknownAliasRange; - if (hasUnknownAliasRange) { + if (unknownRange) { // Dynamic pointer arithmetic cannot be modeled precisely here. - // Keep root/scope aliasing, but drop concrete range info conservatively. + // Keep root/scope aliasing and preserve unknown-range state across + // descendant aliases to avoid dropping real dependencies. + newInfo->unknownRange = true; newInfo->baseAddresses.clear(); newInfo->allocateSize = 0; } else { + newInfo->unknownRange = false; if (!newInfo->baseAddresses.empty()) { newInfo->baseAddresses[0] += deltaOffset; } else { @@ -563,7 +567,7 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) { } } - if (!hasUnknownAliasRange && newSize > 0) { + if (!unknownRange && newSize > 0) { newInfo->allocateSize = newSize; } diff --git a/test/samples/Sync/test_inject_sync_unknown_alias_local_chain.pto b/test/samples/Sync/test_inject_sync_unknown_alias_local_chain.pto new file mode 100644 index 00000000..2b20b9cb --- /dev/null +++ b/test/samples/Sync/test_inject_sync_unknown_alias_local_chain.pto @@ -0,0 +1,37 @@ +module attributes {"pto.device-spec" = "Ascend910B1"} { + func.func @test_unknown_alias_local_chain(%src: !pto.ptr, %dst: !pto.ptr, %dyn_off: index) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + + %src_tv = pto.make_tensor_view %src, shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst_tv = pto.make_tensor_view %dst, shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %src_part = pto.partition_view %src_tv, offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + %dst_part = pto.partition_view %dst_tv, offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + %local = memref.alloc() : memref<1x32xf32, strided<[32, 1]>, #pto.address_space> + %tile_load = pto.bind_tile %local, %c1, %c32 + {config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value>} + : memref<1x32xf32, strided<[32, 1]>, #pto.address_space> + -> memref<1x32xf32, strided<[32, 1], offset: ?>, #pto.address_space> + + %sub = memref.subview %local[%c0, %dyn_off] [1, 32] [1, 1] + : memref<1x32xf32, strided<[32, 1]>, #pto.address_space> + to memref<1x32xf32, strided<[32, 1], offset: ?>, #pto.address_space> + %cast = memref.reinterpret_cast %sub to offset: [0], sizes: [%c1, %c32], strides: [%c32, %c1] + : memref<1x32xf32, strided<[32, 1], offset: ?>, #pto.address_space> + to memref<1x32xf32, strided<[?, ?], offset: ?>, #pto.address_space> + %tile_store = pto.bind_tile %cast, %c1, %c32 + {config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value>} + : memref<1x32xf32, strided<[?, ?], offset: ?>, #pto.address_space> + -> memref<1x32xf32, strided<[32, 1], offset: ?>, #pto.address_space> + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x32xf32>) + outs(%tile_load : memref<1x32xf32, strided<[32, 1], offset: ?>, #pto.address_space>) + pto.tstore ins(%tile_store : memref<1x32xf32, strided<[32, 1], offset: ?>, #pto.address_space>) + outs(%dst_part : !pto.partition_tensor_view<1x32xf32>) + return + } +} diff --git a/test/samples/Sync/test_inject_sync_unknown_alias_local_chain.py b/test/samples/Sync/test_inject_sync_unknown_alias_local_chain.py new file mode 100644 index 00000000..7baa3c4c --- /dev/null +++ b/test/samples/Sync/test_inject_sync_unknown_alias_local_chain.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +from pathlib import Path + + +if __name__ == "__main__": + print(Path(__file__).with_suffix(".pto").read_text(encoding="utf-8")) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 933bfb2f..b7073ce2 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -352,6 +352,21 @@ process_one_dir() { fi fi + # Alias regression: dynamic local alias chains must stay conservative. + # Unknown-range local aliases should still preserve MTE2->MTE3 dependency. + if [[ "$base" == "test_inject_sync_unknown_alias_local_chain" ]]; then + if ! grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE2->PIPE_MTE3 for unknown local alias chain" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE2->PIPE_MTE3 for unknown local alias chain" + overall=1 + continue + fi + fi + # Regression guard for issue #185: barrier_sync must support op types # beyond TMATMUL/TVEC and lower to the expected per-pipe barrier. if [[ "$base" == "test_barrier_sync" ]]; then From 3d668c6fde1f3c5547748c57eb0e4b03aafb57ff Mon Sep 17 00:00:00 2001 From: lishengtao Date: Wed, 18 Mar 2026 18:37:21 +0800 Subject: [PATCH 6/6] [InsertSync] model scalar access ranges and add disjoint-range regression --- .../Transforms/InsertSync/PTOIRTranslator.h | 8 ++ .../Transforms/InsertSync/PTOIRTranslator.cpp | 108 +++++++++++++++--- ...test_inject_sync_scalar_disjoint_range.pto | 37 ++++++ .../test_inject_sync_scalar_disjoint_range.py | 6 + test/samples/runop.sh | 21 ++++ 5 files changed, 161 insertions(+), 19 deletions(-) create mode 100644 test/samples/Sync/test_inject_sync_scalar_disjoint_range.pto create mode 100644 test/samples/Sync/test_inject_sync_scalar_disjoint_range.py diff --git a/include/PTO/Transforms/InsertSync/PTOIRTranslator.h b/include/PTO/Transforms/InsertSync/PTOIRTranslator.h index 179531a0..7f38bd2f 100644 --- a/include/PTO/Transforms/InsertSync/PTOIRTranslator.h +++ b/include/PTO/Transforms/InsertSync/PTOIRTranslator.h @@ -8,6 +8,7 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinOps.h" #include "llvm/Support/raw_ostream.h" +#include namespace mlir { namespace pto { @@ -76,12 +77,19 @@ class PTOIRTranslator { // 根据 Values 填充 Def/Use 列表 void UpdateDefUseVec(ValueRange values, SmallVector &vec); + + // scalar 访问切片建模:按 ptr+offset 构建访问级依赖信息。 + void UpdateScalarDefUseVec(Value ptr, Value offset, Type scalarType, + SmallVector &vec); // 调试辅助 std::string getPipelineName(PipelineType pipe); void printMemInfoList(llvm::raw_ostream &os, const SmallVector &list, AsmState &state); + + // 持久化 scalar 访问切片,保证 def/use 指针在分析期间有效。 + std::vector> scalarAccessMemInfoPool_; }; } // namespace pto diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp index 25be8897..cc9c346d 100644 --- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp +++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp @@ -8,6 +8,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Matchers.h" #include +#include // [P0 新增] 引入副作用接口和 PTO 接口 #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -112,6 +113,7 @@ static std::pair getStaticOffsetAndSize(Operation *op, Value s // 1. 构建入口 // ============================================================================ void PTOIRTranslator::Build() { + scalarAccessMemInfoPool_.clear(); Region &funcRegion = func_.getBody(); UpdateKernelArgMemInfo(); RecursionIR(&funcRegion); @@ -335,28 +337,34 @@ void PTOIRTranslator::UpdatePTOOpInfo(Operation *op) { SmallVector defVec; SmallVector useVec; - - // 2. [关键] 使用 MemoryEffects 接口自动获取读写依赖 - if (auto memEffect = dyn_cast(op)) { - SmallVector, 4> effects; - memEffect.getEffects(effects); - - for (auto &effect : effects) { - Value val = effect.getValue(); - if (!val) continue; - - // 只有当 Value 在我们的 BufferMap 中有记录时,才视为有效依赖 - // (过滤掉比如 Loop Iterator 或其他标量) - if (isa(effect.getEffect())) { - UpdateDefUseVec({val}, useVec); - } else if (isa(effect.getEffect())) { - UpdateDefUseVec({val}, defVec); - } - } + + // 2. scalar 走访问级建模,其余 op 继续走通用 MemoryEffects 路径。 + if (auto loadScalar = dyn_cast(op)) { + UpdateScalarDefUseVec(loadScalar.getPtr(), loadScalar.getOffset(), + loadScalar.getValue().getType(), useVec); + } else if (auto storeScalar = dyn_cast(op)) { + UpdateScalarDefUseVec(storeScalar.getPtr(), storeScalar.getOffset(), + storeScalar.getValue().getType(), defVec); + } else if (auto memEffect = dyn_cast(op)) { + SmallVector, 4> effects; + memEffect.getEffects(effects); + + for (auto &effect : effects) { + Value val = effect.getValue(); + if (!val) continue; + + // 只有当 Value 在我们的 BufferMap 中有记录时,才视为有效依赖 + // (过滤掉比如 Loop Iterator 或其他标量) + if (isa(effect.getEffect())) { + UpdateDefUseVec({val}, useVec); + } else if (isa(effect.getEffect())) { + UpdateDefUseVec({val}, defVec); + } + } } else { // 如果算子有 Pipe 属性但没实现 MemoryEffects,这是一个定义错误 // 我们可以打印个 Warning 或者保持为空 (认为无副作用) - LLVM_DEBUG(llvm::dbgs() << "Warning: Op " << op->getName() + LLVM_DEBUG(llvm::dbgs() << "Warning: Op " << op->getName() << " has Pipe but no MemoryEffects interface.\n"); } @@ -628,6 +636,68 @@ void PTOIRTranslator::UpdateDefUseVec(ValueRange values, SmallVector &vec) { + if (!ptr) return; + + if (!buffer2MemInfoMap_.contains(ptr)) { + // 保留历史行为:当 ptr 未建模时退回指针级依赖。 + UpdateDefUseVec({ptr}, vec); + return; + } + + const uint64_t elemBytes = + static_cast(std::max(1, getElementSizeInBytes(scalarType))); + + bool hasPreciseOffset = false; + uint64_t offsetBytes = 0; + llvm::APInt offsetApInt; + if (matchPattern(offset, m_ConstantInt(&offsetApInt)) && + !offsetApInt.isNegative() && offsetApInt.getActiveBits() <= 64) { + const uint64_t offsetElems = offsetApInt.getZExtValue(); + const unsigned __int128 wideOffsetBytes = + static_cast(offsetElems) * elemBytes; + if (wideOffsetBytes <= std::numeric_limits::max()) { + hasPreciseOffset = true; + offsetBytes = static_cast(wideOffsetBytes); + } + } + + for (auto &parentInfo : buffer2MemInfoMap_[ptr]) { + auto sliceInfo = parentInfo->clone(ptr); + bool unknownRange = parentInfo->unknownRange || !hasPreciseOffset; + + if (!unknownRange) { + if (sliceInfo->baseAddresses.empty()) { + unknownRange = true; + } else { + for (uint64_t baseAddr : sliceInfo->baseAddresses) { + if (offsetBytes > std::numeric_limits::max() - baseAddr) { + unknownRange = true; + break; + } + } + } + } + + if (unknownRange) { + sliceInfo->unknownRange = true; + sliceInfo->baseAddresses.clear(); + sliceInfo->allocateSize = 0; + } else { + sliceInfo->unknownRange = false; + for (uint64_t &baseAddr : sliceInfo->baseAddresses) { + baseAddr += offsetBytes; + } + sliceInfo->allocateSize = elemBytes; + } + + scalarAccessMemInfoPool_.emplace_back(std::move(sliceInfo)); + vec.push_back(scalarAccessMemInfoPool_.back().get()); + } +} // ============================================================================ // 9. 调试与打印支持 diff --git a/test/samples/Sync/test_inject_sync_scalar_disjoint_range.pto b/test/samples/Sync/test_inject_sync_scalar_disjoint_range.pto new file mode 100644 index 00000000..275a2865 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_disjoint_range.pto @@ -0,0 +1,37 @@ +module attributes {"pto.device-spec" = "Ascend910B1"} { + func.func @test_scalar_disjoint_range( + %src: memref<1024xf32, #pto.address_space>, + %dst: memref<1024xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c0_i64 = arith.constant 0 : i64 + %f1 = arith.constant 1.000000e+00 : f32 + + %src_dma = memref.subview %src[0] [128] [1] + : memref<1024xf32, #pto.address_space> + to memref<128xf32, strided<[1]>, #pto.address_space> + %dst_dma = memref.subview %dst[0] [128] [1] + : memref<1024xf32, #pto.address_space> + to memref<128xf32, strided<[1]>, #pto.address_space> + + %src_scalar = memref.subview %src[512] [8] [1] + : memref<1024xf32, #pto.address_space> + to memref<8xf32, strided<[1], offset: 512>, #pto.address_space> + %dst_scalar = memref.subview %dst[512] [8] [1] + : memref<1024xf32, #pto.address_space> + to memref<8xf32, strided<[1], offset: 512>, #pto.address_space> + + %ub = pto.pointer_cast(%c0_i64) : memref<128xf32, #pto.address_space> + + pto.tload ins(%src_dma : memref<128xf32, strided<[1]>, #pto.address_space>) + outs(%ub : memref<128xf32, #pto.address_space>) + + pto.store_scalar %f1, %src_scalar[%c0] : memref<8xf32, strided<[1], offset: 512>, #pto.address_space>, f32 + %loaded = pto.load_scalar %dst_scalar[%c0] : memref<8xf32, strided<[1], offset: 512>, #pto.address_space> -> f32 + pto.store_scalar %loaded, %dst_scalar[%c1] : memref<8xf32, strided<[1], offset: 512>, #pto.address_space>, f32 + + pto.tstore ins(%ub : memref<128xf32, #pto.address_space>) + outs(%dst_dma : memref<128xf32, strided<[1]>, #pto.address_space>) + return + } +} diff --git a/test/samples/Sync/test_inject_sync_scalar_disjoint_range.py b/test/samples/Sync/test_inject_sync_scalar_disjoint_range.py new file mode 100644 index 00000000..7baa3c4c --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_disjoint_range.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +from pathlib import Path + + +if __name__ == "__main__": + print(Path(__file__).with_suffix(".pto").read_text(encoding="utf-8")) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index b7073ce2..eb6b6b44 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -352,6 +352,27 @@ process_one_dir() { fi fi + # Scalar disjoint-range regression: scalar accesses on the same root buffer + # but non-overlapping static ranges must not trigger scalar<->DMA events. + if [[ "$base" == "test_inject_sync_scalar_disjoint_range" ]]; then + if grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_MTE2<->PIPE_S event sync for scalar disjoint-range case" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S<->PIPE_MTE3 event sync for scalar disjoint-range case" + overall=1 + continue + fi + fi + # Alias regression: dynamic local alias chains must stay conservative. # Unknown-range local aliases should still preserve MTE2->MTE3 dependency. if [[ "$base" == "test_inject_sync_unknown_alias_local_chain" ]]; then