From 1716c29530d3a4a9b95b7919a74e3796771ca370 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 10:30:50 +0800 Subject: [PATCH 01/16] [A5][Sync] remove identity tmov before insert-sync --- include/PTO/Transforms/Passes.h | 1 + include/PTO/Transforms/Passes.td | 14 ++++ lib/PTO/Transforms/CMakeLists.txt | 1 + .../Transforms/PTORemoveIdentityTMovPass.cpp | 75 +++++++++++++++++++ test/basic/identity_tmov_autosync_a5_only.pto | 33 ++++++++ tools/ptoas/ptoas.cpp | 5 +- 6 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp create mode 100644 test/basic/identity_tmov_autosync_a5_only.pto diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index cafdb784c..4861b155d 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -38,6 +38,7 @@ std::unique_ptr createPTOLowerFrontendPipeOpsPass(); std::unique_ptr createPTOResolveReservedBuffersPass(); std::unique_ptr createPTOWrapFunctionsInSectionsPass(); std::unique_ptr createPTOVerifyTFreePass(); +std::unique_ptr createPTORemoveIdentityTMovPass(); // Creates a pass for ... std::unique_ptr createPTOInsertSyncPass(); diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index 37979bf21..96521685b 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -38,6 +38,20 @@ def PTOInsertSync : Pass<"pto-insert-sync", "func::FuncOp"> { ]; } +def PTORemoveIdentityTMov : Pass<"pto-remove-identity-tmov", "func::FuncOp"> { + let summary = "Remove identity pto.tmov before auto-sync on A5"; + let description = [{ + Erases `pto.tmov` operations where source and destination are the same SSA + value. The pass is gated by `pto.target_arch = "a5"` and is intended to run + before `pto-insert-sync` to avoid generating synchronization edges for a + no-op move. + }]; + let constructor = "mlir::pto::createPTORemoveIdentityTMovPass()"; + let dependentDialects = [ + "mlir::pto::PTODialect" + ]; +} + def ConvertToPTOOp : Pass<"convert-to-pto-op"> { let summary = "Convert Ops from other dialects to PTO Ops"; let constructor = "mlir::pto::createConvertToPTOOpPass()"; diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index b82d227fe..755c2a1bb 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -23,6 +23,7 @@ add_mlir_dialect_library(PTOTransforms PTOPlanMemory.cpp PTORemoveRedundantBarrier.cpp InferPTOLayout.cpp + PTORemoveIdentityTMovPass.cpp BufferizableOpInterfaceImpl.cpp ConvertToPTOOp.cpp PTOLowerFrontendPipeOpsPass.cpp diff --git a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp new file mode 100644 index 000000000..88f359ba7 --- /dev/null +++ b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp @@ -0,0 +1,75 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace pto { +namespace func = ::mlir::func; +#define GEN_PASS_DEF_PTOREMOVEIDENTITYTMOV +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +static bool isA5Target(func::FuncOp funcOp) { + ModuleOp module = funcOp->getParentOfType(); + if (!module) + return false; + auto arch = module->getAttrOfType("pto.target_arch"); + return arch && arch.getValue() == "a5"; +} + +static bool canEraseIdentityTMov(TMovOp op) { + if (op.getSrc() != op.getDst()) + return false; + + Value result = op.getResult(); + if (!result || result.use_empty()) + return true; + + return result.getType() == op.getDst().getType(); +} + +struct PTORemoveIdentityTMovPass + : public mlir::pto::impl::PTORemoveIdentityTMovBase< + PTORemoveIdentityTMovPass> { + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + if (!isA5Target(funcOp)) + return; + + SmallVector toErase; + funcOp.walk([&](TMovOp op) { + if (canEraseIdentityTMov(op)) + toErase.push_back(op); + }); + + for (TMovOp op : toErase) { + Value result = op.getResult(); + if (result && !result.use_empty()) + result.replaceAllUsesWith(op.getDst()); + op.erase(); + } + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTORemoveIdentityTMovPass() { + return std::make_unique(); +} diff --git a/test/basic/identity_tmov_autosync_a5_only.pto b/test/basic/identity_tmov_autosync_a5_only.pto new file mode 100644 index 000000000..e02a33014 --- /dev/null +++ b/test/basic/identity_tmov_autosync_a5_only.pto @@ -0,0 +1,33 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_autosync_a5_only( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%ub : memref<1x64xf16, #pto.address_space>) + + // Identity move: should be removed on A5 before sync insertion. + pto.tmov ins(%ub : memref<1x64xf16, #pto.address_space>) + outs(%ub : memref<1x64xf16, #pto.address_space>) + + pto.tstore ins(%ub : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_autosync_a5_only( +// A5: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// A5-NOT: set_flag(PIPE_MTE2, PIPE_V +// A5-NOT: wait_flag(PIPE_MTE2, PIPE_V +// A5-NOT: set_flag(PIPE_V, PIPE_MTE3 +// A5-NOT: wait_flag(PIPE_V, PIPE_MTE3 +// A5-NOT: TMOV( +// A5: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + +// A3-LABEL: __global__ AICORE void identity_tmov_autosync_a5_only( +// A3: TMOV( diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index e7034ecab..1d71acd87 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -1111,8 +1111,11 @@ int main(int argc, char **argv) { pm.addPass(pto::createPTOResolveReservedBuffersPass()); // Conditionally add Sync pass based on flag. - if (enableInsertSync) + if (enableInsertSync) { + pm.addNestedPass( + pto::createPTORemoveIdentityTMovPass()); pm.addNestedPass(pto::createPTOInsertSyncPass()); + } pm.addPass(createCSEPass()); if (arch == "a3") { From 18d0db052448b3edaec12b80cb1a1f28a0035e66 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 14:23:44 +0800 Subject: [PATCH 02/16] [A5][Sync] strengthen identity tmov elision criteria --- include/PTO/Transforms/Passes.td | 9 +- .../Transforms/PTORemoveIdentityTMovPass.cpp | 154 +++++++++++++++++- .../basic/identity_tmov_diff_addr_keep_a5.pto | 20 +++ .../identity_tmov_dynamic_addr_keep_a5.pto | 14 ++ ...entity_tmov_same_range_distinct_ssa_a5.pto | 24 +++ 5 files changed, 210 insertions(+), 11 deletions(-) create mode 100644 test/basic/identity_tmov_diff_addr_keep_a5.pto create mode 100644 test/basic/identity_tmov_dynamic_addr_keep_a5.pto create mode 100644 test/basic/identity_tmov_same_range_distinct_ssa_a5.pto diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index 96521685b..7a0821717 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -41,10 +41,11 @@ def PTOInsertSync : Pass<"pto-insert-sync", "func::FuncOp"> { def PTORemoveIdentityTMov : Pass<"pto-remove-identity-tmov", "func::FuncOp"> { let summary = "Remove identity pto.tmov before auto-sync on A5"; let description = [{ - Erases `pto.tmov` operations where source and destination are the same SSA - value. The pass is gated by `pto.target_arch = "a5"` and is intended to run - before `pto-insert-sync` to avoid generating synchronization edges for a - no-op move. + Erases provably-identity `pto.tmov` operations on A5 before + `pto-insert-sync`. Besides `src == dst`, this also handles distinct SSA + values when alias analysis proves source and destination have the exact same + address range and type/layout. Dynamic/unknown address cases are kept + conservative and are not removed. }]; let constructor = "mlir::pto::createPTORemoveIdentityTMovPass()"; let dependentDialects = [ diff --git a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp index 88f359ba7..feff3efff 100644 --- a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp +++ b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp @@ -8,7 +8,11 @@ #include "PTO/IR/PTO.h" #include "PTO/Transforms/Passes.h" +#include "PTO/Transforms/InsertSync/MemoryDependentAnalyzer.h" +#include "PTO/Transforms/InsertSync/PTOIRTranslator.h" +#include "PTO/Transforms/InsertSync/SyncCommon.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/Pass.h" @@ -31,18 +35,139 @@ static bool isA5Target(func::FuncOp funcOp) { if (!module) return false; auto arch = module->getAttrOfType("pto.target_arch"); - return arch && arch.getValue() == "a5"; + return arch && arch.getValue().equals_insensitive("a5"); } -static bool canEraseIdentityTMov(TMovOp op) { - if (op.getSrc() != op.getDst()) +static const BaseMemInfo * +getSingleMemInfo(const Buffer2MemInfoMap &buffer2MemInfoMap, Value value) { + auto it = buffer2MemInfoMap.find(value); + if (it == buffer2MemInfoMap.end()) + return nullptr; + if (it->second.size() != 1) + return nullptr; + return it->second.front().get(); +} + +static bool hasConcreteAddress(const BaseMemInfo *info) { + if (!info || info->baseAddresses.empty()) + return false; + + Operation *defOp = info->rootBuffer.getDefiningOp(); + if (!defOp) + return false; + + if (isa(defOp)) + return true; + + if (auto alloc = dyn_cast(defOp)) + return static_cast(alloc.getAddr()); + + return false; +} + +static bool hasDynamicStaticList(ArrayRef values) { + return llvm::any_of(values, [](int64_t value) { + return value == ShapedType::kDynamic; + }); +} + +static bool isStaticallyAddressableValue(Value value) { + int depth = 0; + constexpr int kMaxDepth = 32; + while (value && depth++ < kMaxDepth) { + Operation *defOp = value.getDefiningOp(); + if (!defOp) + return false; + + if (auto subView = dyn_cast(defOp)) { + if (hasDynamicStaticList(subView.getStaticOffsets()) || + hasDynamicStaticList(subView.getStaticSizes()) || + hasDynamicStaticList(subView.getStaticStrides())) { + return false; + } + value = subView.getSource(); + continue; + } + + if (auto reCast = dyn_cast(defOp)) { + if (hasDynamicStaticList(reCast.getStaticOffsets()) || + hasDynamicStaticList(reCast.getStaticSizes()) || + hasDynamicStaticList(reCast.getStaticStrides())) { + return false; + } + value = reCast.getSource(); + continue; + } + + if (auto cast = dyn_cast(defOp)) { + value = cast.getSource(); + continue; + } + if (auto collapse = dyn_cast(defOp)) { + value = collapse.getSrc(); + continue; + } + if (auto expand = dyn_cast(defOp)) { + value = expand.getSrc(); + continue; + } + if (auto view = dyn_cast(defOp)) { + if (view.getByteShift()) + return false; + value = view.getSource(); + continue; + } + + return true; + } + + return false; +} + +static bool hasExactSameAddressRange(const BaseMemInfo *srcInfo, + const BaseMemInfo *dstInfo) { + if (!srcInfo || !dstInfo) return false; - Value result = op.getResult(); - if (!result || result.use_empty()) + if (srcInfo->scope != dstInfo->scope) + return false; + if (srcInfo->allocateSize == 0 || dstInfo->allocateSize == 0) + return false; + if (srcInfo->allocateSize != dstInfo->allocateSize) + return false; + if (srcInfo->baseAddresses.empty() || dstInfo->baseAddresses.empty()) + return false; + if (srcInfo->baseAddresses != dstInfo->baseAddresses) + return false; + + return true; +} + +static bool canEraseIdentityTMov( + TMovOp op, const Buffer2MemInfoMap &buffer2MemInfoMap) { + Value src = op.getSrc(); + Value dst = op.getDst(); + + if (src == dst) return true; - return result.getType() == op.getDst().getType(); + if (src.getType() != dst.getType()) + return false; + if (!isStaticallyAddressableValue(src) || !isStaticallyAddressableValue(dst)) + return false; + + const BaseMemInfo *srcInfo = getSingleMemInfo(buffer2MemInfoMap, src); + const BaseMemInfo *dstInfo = getSingleMemInfo(buffer2MemInfoMap, dst); + if (!srcInfo || !dstInfo) + return false; + + if (!hasExactSameAddressRange(srcInfo, dstInfo)) + return false; + + if (srcInfo->rootBuffer == dstInfo->rootBuffer) + return true; + + return hasConcreteAddress(srcInfo) && hasConcreteAddress(dstInfo); } struct PTORemoveIdentityTMovPass @@ -53,9 +178,24 @@ struct PTORemoveIdentityTMovPass if (!isA5Target(funcOp)) return; + bool hasTMov = false; + funcOp.walk([&](TMovOp) { + hasTMov = true; + return WalkResult::interrupt(); + }); + if (!hasTMov) + return; + + MemoryDependentAnalyzer memAnalyzer; + SyncIRs syncIR; + Buffer2MemInfoMap buffer2MemInfoMap; + PTOIRTranslator translator(syncIR, memAnalyzer, buffer2MemInfoMap, funcOp, + SyncAnalysisMode::NORMALSYNC); + translator.Build(); + SmallVector toErase; funcOp.walk([&](TMovOp op) { - if (canEraseIdentityTMov(op)) + if (canEraseIdentityTMov(op, buffer2MemInfoMap)) toErase.push_back(op); }); diff --git a/test/basic/identity_tmov_diff_addr_keep_a5.pto b/test/basic/identity_tmov_diff_addr_keep_a5.pto new file mode 100644 index 000000000..ea5734e94 --- /dev/null +++ b/test/basic/identity_tmov_diff_addr_keep_a5.pto @@ -0,0 +1,20 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_diff_addr_keep_a5( + %buf: memref<1024xi32, #pto.address_space>) { + %src = memref.subview %buf[0] [256] [1] + : memref<1024xi32, #pto.address_space> + to memref<256xi32, strided<[1]>, #pto.address_space> + %dst = memref.subview %buf[256] [256] [1] + : memref<1024xi32, #pto.address_space> + to memref<256xi32, strided<[1], offset: 256>, #pto.address_space> + + pto.tmov ins(%src : memref<256xi32, strided<[1]>, #pto.address_space>) + outs(%dst : memref<256xi32, strided<[1], offset: 256>, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_diff_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_dynamic_addr_keep_a5.pto b/test/basic/identity_tmov_dynamic_addr_keep_a5.pto new file mode 100644 index 000000000..088f14127 --- /dev/null +++ b/test/basic/identity_tmov_dynamic_addr_keep_a5.pto @@ -0,0 +1,14 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_dynamic_addr_keep_a5( + %src: memref>, + %dst: memref>) { + pto.tmov ins(%src : memref>) + outs(%dst : memref>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_dynamic_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto b/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto new file mode 100644 index 000000000..280770dd6 --- /dev/null +++ b/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto @@ -0,0 +1,24 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_same_range_distinct_ssa_a5() { + %buf = memref.alloc() : memref<1024xi32, #pto.address_space> + %src = memref.subview %buf[0] [1024] [1] + : memref<1024xi32, #pto.address_space> + to memref<1024xi32, strided<[1]>, #pto.address_space> + %dst = memref.subview %buf[0] [1024] [1] + : memref<1024xi32, #pto.address_space> + to memref<1024xi32, strided<[1]>, #pto.address_space> + + pto.tmov ins(%src : memref<1024xi32, strided<[1]>, #pto.address_space>) + outs(%dst : memref<1024xi32, strided<[1]>, #pto.address_space>) + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_same_range_distinct_ssa_a5( +// A5-NOT: TMOV( + +// A3-LABEL: __global__ AICORE void identity_tmov_same_range_distinct_ssa_a5( +// A3: TMOV( From e09af6d4be64cc1a101cdba68637f1f897ec4b9f Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 14:35:15 +0800 Subject: [PATCH 03/16] [A5][Sync][test] add if-else alias identity tmov guard --- test/basic/identity_tmov_if_else_alias_a5.pto | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 test/basic/identity_tmov_if_else_alias_a5.pto diff --git a/test/basic/identity_tmov_if_else_alias_a5.pto b/test/basic/identity_tmov_if_else_alias_a5.pto new file mode 100644 index 000000000..62fad4543 --- /dev/null +++ b/test/basic/identity_tmov_if_else_alias_a5.pto @@ -0,0 +1,51 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-level=level3 --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_if_else_alias_a5(%cond: i1) { + %c0_i64 = arith.constant 0 : i64 + %c64_i64 = arith.constant 64 : i64 + + // Phi-like destinations (same addresses are reused in both branches). + %phi0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %phi1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + + scf.if %cond { + // Then: distinct SSA, same address range as destinations. + %then_src0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + pto.tmov ins(%then_src0 : !pto.tile_buf) + outs(%phi0 : !pto.tile_buf) + + %then_src1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + pto.tmov ins(%then_src1 : !pto.tile_buf) + outs(%phi1 : !pto.tile_buf) + } else { + // Else: include a move chain with identical addresses to mirror the issue style. + %else_src0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + pto.tmov ins(%else_src0 : !pto.tile_buf) + outs(%phi0 : !pto.tile_buf) + + %else_src1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + %else_mid1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + pto.tmov ins(%else_src1 : !pto.tile_buf) + outs(%else_mid1 : !pto.tile_buf) + pto.tmov ins(%else_mid1 : !pto.tile_buf) + outs(%phi1 : !pto.tile_buf) + } + + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_if_else_alias_a5( +// A5-NOT: TMOV( + +// A3-LABEL: __global__ AICORE void identity_tmov_if_else_alias_a5( +// A3: TMOV( From 2c49361c203a4978813f758332dec5fa0adacbc5 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 15:04:14 +0800 Subject: [PATCH 04/16] [A5][Sync][test] add deadlock-signature guard for identity tmov --- ...ntity_tmov_alias_deadlock_signature_a5.pto | 40 +++++++++++++++++++ tools/ptoas/ptoas.cpp | 12 +++++- 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 test/basic/identity_tmov_alias_deadlock_signature_a5.pto diff --git a/test/basic/identity_tmov_alias_deadlock_signature_a5.pto b/test/basic/identity_tmov_alias_deadlock_signature_a5.pto new file mode 100644 index 000000000..4974c845b --- /dev/null +++ b/test/basic/identity_tmov_alias_deadlock_signature_a5.pto @@ -0,0 +1,40 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=SAFE +// RUN: ptoas --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s | FileCheck %s --check-prefix=UNSAFE + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_alias_deadlock_signature_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<1x64xf16, #pto.address_space> + %src_alias = memref.subview %ub[0, 0] [1, 64] [1, 1] + : memref<1x64xf16, #pto.address_space> + to memref<1x64xf16, strided<[64, 1]>, #pto.address_space> + %dst_alias = memref.subview %ub[0, 0] [1, 64] [1, 1] + : memref<1x64xf16, #pto.address_space> + to memref<1x64xf16, strided<[64, 1]>, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + pto.tmov ins(%src_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + outs(%dst_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + pto.tstore ins(%dst_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// SAFE-LABEL: __global__ AICORE void identity_tmov_alias_deadlock_signature_a5( +// SAFE-NOT: TMOV( +// SAFE: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// SAFE: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// SAFE-NOT: set_flag(PIPE_MTE2, PIPE_V +// SAFE-NOT: wait_flag(PIPE_MTE2, PIPE_V +// SAFE-NOT: set_flag(PIPE_V, PIPE_MTE3 +// SAFE-NOT: wait_flag(PIPE_V, PIPE_MTE3 + +// UNSAFE-LABEL: __global__ AICORE void identity_tmov_alias_deadlock_signature_a5( +// UNSAFE: TMOV( +// UNSAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// UNSAFE: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// UNSAFE: set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); +// UNSAFE: wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index 1d71acd87..d531d22fd 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -183,6 +183,12 @@ static llvm::cl::opt enableInsertSync("enable-insert-sync", llvm::cl::desc("Enable automatic synchronization insertion pass"), llvm::cl::init(false)); +static llvm::cl::opt disableIdentityTMovCleanup( + "disable-identity-tmov-cleanup", + llvm::cl::desc("Disable A5 identity tmov cleanup pass before auto sync (debug only)"), + llvm::cl::Hidden, + llvm::cl::init(false)); + static llvm::cl::opt disableInferLayout( "disable-infer-layout", llvm::cl::desc("Disable PTO layout inference pass (static-only)"), @@ -1112,8 +1118,10 @@ int main(int argc, char **argv) { // Conditionally add Sync pass based on flag. if (enableInsertSync) { - pm.addNestedPass( - pto::createPTORemoveIdentityTMovPass()); + if (!disableIdentityTMovCleanup) { + pm.addNestedPass( + pto::createPTORemoveIdentityTMovPass()); + } pm.addNestedPass(pto::createPTOInsertSyncPass()); } From 49515a638446f4b825ccba48b3cf96e1e06a4443 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 16:19:44 +0800 Subject: [PATCH 05/16] [A5][Sync] harden identity tmov proof and add guards --- include/PTO/Transforms/Passes.td | 6 +- .../Transforms/PTORemoveIdentityTMovPass.cpp | 68 +++++++++++++------ ...ty_tmov_pointer_cast_diff_addr_keep_a5.pto | 24 +++++++ ...mov_pointer_cast_same_addr_diff_ssa_a5.pto | 27 ++++++++ ...identity_tmov_reinterpret_cast_keep_a5.pto | 26 +++++++ 5 files changed, 130 insertions(+), 21 deletions(-) create mode 100644 test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto create mode 100644 test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto create mode 100644 test/basic/identity_tmov_reinterpret_cast_keep_a5.pto diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index 7a0821717..0766b599c 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -44,8 +44,10 @@ def PTORemoveIdentityTMov : Pass<"pto-remove-identity-tmov", "func::FuncOp"> { Erases provably-identity `pto.tmov` operations on A5 before `pto-insert-sync`. Besides `src == dst`, this also handles distinct SSA values when alias analysis proves source and destination have the exact same - address range and type/layout. Dynamic/unknown address cases are kept - conservative and are not removed. + address range and type/layout. `memref.reinterpret_cast` views are treated + conservatively (not elided), and when roots differ the pass requires + concrete-equal root addresses. Dynamic/unknown address cases are not + removed. }]; let constructor = "mlir::pto::createPTORemoveIdentityTMovPass()"; let dependentDialects = [ diff --git a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp index feff3efff..0d77ea100 100644 --- a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp +++ b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp @@ -11,11 +11,14 @@ #include "PTO/Transforms/InsertSync/MemoryDependentAnalyzer.h" #include "PTO/Transforms/InsertSync/PTOIRTranslator.h" #include "PTO/Transforms/InsertSync/SyncCommon.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Matchers.h" #include "mlir/Pass/Pass.h" +#include namespace mlir { namespace pto { @@ -48,21 +51,51 @@ getSingleMemInfo(const Buffer2MemInfoMap &buffer2MemInfoMap, Value value) { return it->second.front().get(); } -static bool hasConcreteAddress(const BaseMemInfo *info) { - if (!info || info->baseAddresses.empty()) - return false; +static std::optional tryEvalI64Constant(Value value) { + if (!value) + return std::nullopt; - Operation *defOp = info->rootBuffer.getDefiningOp(); + APInt apInt; + if (matchPattern(value, m_ConstantInt(&apInt))) + return apInt.getSExtValue(); + + Operation *defOp = value.getDefiningOp(); if (!defOp) - return false; + return std::nullopt; + + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + + return std::nullopt; +} - if (isa(defOp)) - return true; +static std::optional +tryGetConcreteRootAddress(const BaseMemInfo *info) { + if (!info) + return std::nullopt; + + if (auto direct = tryEvalI64Constant(info->rootBuffer)) + return direct; + + Operation *defOp = info->rootBuffer.getDefiningOp(); + if (!defOp) + return std::nullopt; if (auto alloc = dyn_cast(defOp)) - return static_cast(alloc.getAddr()); + return tryEvalI64Constant(alloc.getAddr()); - return false; + if (auto cast = dyn_cast(defOp)) { + if (!cast.getAddrs().empty()) + return tryEvalI64Constant(cast.getAddrs().front()); + } + + return std::nullopt; } static bool hasDynamicStaticList(ArrayRef values) { @@ -89,15 +122,8 @@ static bool isStaticallyAddressableValue(Value value) { continue; } - if (auto reCast = dyn_cast(defOp)) { - if (hasDynamicStaticList(reCast.getStaticOffsets()) || - hasDynamicStaticList(reCast.getStaticSizes()) || - hasDynamicStaticList(reCast.getStaticStrides())) { - return false; - } - value = reCast.getSource(); - continue; - } + if (isa(defOp)) + return false; if (auto cast = dyn_cast(defOp)) { value = cast.getSource(); @@ -167,7 +193,11 @@ static bool canEraseIdentityTMov( if (srcInfo->rootBuffer == dstInfo->rootBuffer) return true; - return hasConcreteAddress(srcInfo) && hasConcreteAddress(dstInfo); + auto srcRootAddr = tryGetConcreteRootAddress(srcInfo); + auto dstRootAddr = tryGetConcreteRootAddress(dstInfo); + if (!srcRootAddr || !dstRootAddr) + return false; + return *srcRootAddr == *dstRootAddr; } struct PTORemoveIdentityTMovPass diff --git a/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto b/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto new file mode 100644 index 000000000..e9f63ee8b --- /dev/null +++ b/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto @@ -0,0 +1,24 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_pointer_cast_diff_addr_keep_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %c0 = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + + %src_buf = pto.pointer_cast(%c0) : memref<1x64xf16, #pto.address_space> + %dst_buf = pto.pointer_cast(%c64) : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_buf : memref<1x64xf16, #pto.address_space>) + pto.tmov ins(%src_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst_buf : memref<1x64xf16, #pto.address_space>) + pto.tstore ins(%dst_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_pointer_cast_diff_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto b/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto new file mode 100644 index 000000000..841745ee6 --- /dev/null +++ b/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto @@ -0,0 +1,27 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_pointer_cast_same_addr_diff_ssa_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %c0 = arith.constant 0 : i64 + %c0_idx = arith.constant 0 : index + %c0_alt = arith.index_cast %c0_idx : index to i64 + + %src_buf = pto.pointer_cast(%c0) : memref<1x64xf16, #pto.address_space> + %dst_buf = pto.pointer_cast(%c0_alt) : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_buf : memref<1x64xf16, #pto.address_space>) + pto.tmov ins(%src_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst_buf : memref<1x64xf16, #pto.address_space>) + pto.tstore ins(%dst_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_pointer_cast_same_addr_diff_ssa_a5( +// CHECK-NOT: TMOV( +// CHECK: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// CHECK: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); diff --git a/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto b/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto new file mode 100644 index 000000000..9130de0d6 --- /dev/null +++ b/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto @@ -0,0 +1,26 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_reinterpret_cast_keep_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<64xf16, #pto.address_space> + %v0 = memref.reinterpret_cast %ub to offset: [0], sizes: [1, 64], strides: [64, 1] + : memref<64xf16, #pto.address_space> + to memref, #pto.address_space> + %v1 = memref.reinterpret_cast %ub to offset: [0], sizes: [1, 64], strides: [1, 64] + : memref<64xf16, #pto.address_space> + to memref, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%v0 : memref, #pto.address_space>) + pto.tmov ins(%v0 : memref, #pto.address_space>) + outs(%v1 : memref, #pto.address_space>) + pto.tstore ins(%v1 : memref, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_reinterpret_cast_keep_a5( +// CHECK: TMOV( From 50f5235d29f41eb4ba6047db53091f86465ee02d Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 16:25:15 +0800 Subject: [PATCH 06/16] [A5][Sync][test] add direct issue828 pto regression guard --- .../issue828_softmax_rescale_incore_1_a5.pto | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 test/basic/issue828_softmax_rescale_incore_1_a5.pto diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5.pto b/test/basic/issue828_softmax_rescale_incore_1_a5.pto new file mode 100644 index 000000000..2f4704e6c --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5.pto @@ -0,0 +1,130 @@ +// Regression from: https://github.com/hw-native-sys/pypto/issues/828 +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=SAFE +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s | FileCheck %s --check-prefix=UNSAFE + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %arg6, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} + +// SAFE-LABEL: __global__ AICORE void softmax_rescale_incore_1( +// SAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); +// SAFE-NOT: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); +// SAFE-NOT: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + +// UNSAFE-LABEL: __global__ AICORE void softmax_rescale_incore_1( +// UNSAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); +// UNSAFE: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); From ee59ce0f0f6eaf141e3e7c1a82141d60136ef703 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 16:47:58 +0800 Subject: [PATCH 07/16] [A5][Sync][test] force issue828 board case into arg6==0 path --- test/npu_validation/scripts/generate_testcase.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index ca802b567..4f67b70df 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -86,6 +86,17 @@ }) +def _get_case_scalar_overrides(testcase: str, params): + scalar_params = [p for p in params if p.get("kind") == "scalar"] + + # issue#828 guard: the target path is the `arg6 == 0` branch. + # Keep default behavior for all other cases. + if testcase == "issue828_softmax_rescale_incore_1_a5" and len(scalar_params) == 1: + return {scalar_params[0]["name"]: "0"} + + return {} + + def _parse_shape(text: str): match = re.search(r"Shape<(\d+)\s*,\s*(\d+)>", text) if match: @@ -1137,10 +1148,15 @@ def generate_testcase( f" size_t fileSize_{p['name']} = elemCount_{p['name']} * sizeof({p['host_type']});" ) + scalar_overrides = _get_case_scalar_overrides(testcase, params) for p in params: if p["kind"] != "scalar": continue t = p["host_type"] + override = scalar_overrides.get(p["name"]) + if override is not None: + param_decls_lines.append(f" {t} {p['name']} = {override};") + continue # Some PTO-ISA APIs use small POD structs as scalar parameters. # Example: pto::MrgSortExecutedNumList (used by TMRGSORT multi-list variants). if t.endswith("MrgSortExecutedNumList"): From 055b6440e543d5df4c5bdf88ed0adef1db72e03d Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 17:01:16 +0800 Subject: [PATCH 08/16] [A5][Sync] expose identity tmov cleanup switch for A/B validation --- README.md | 3 +++ tools/ptoas/ptoas.cpp | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9a3132c9f..337478b66 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,9 @@ ptoas tests/input.pto # 运行 AutoSyncInsert Pass ptoas tests/input.pto --enable-insert-sync -o outputfile.cpp +# 调试开关:关闭 A5 identity tmov cleanup(用于 A/B 验证) +ptoas tests/input.pto --enable-insert-sync --disable-identity-tmov-cleanup -o outputfile.cpp + # 指定目标硬件架构(A3 / A5) ptoas tests/input.pto --pto-arch=a3 -o outputfile.cpp diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index d531d22fd..6fbb83b2a 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -185,8 +185,7 @@ static llvm::cl::opt enableInsertSync("enable-insert-sync", static llvm::cl::opt disableIdentityTMovCleanup( "disable-identity-tmov-cleanup", - llvm::cl::desc("Disable A5 identity tmov cleanup pass before auto sync (debug only)"), - llvm::cl::Hidden, + llvm::cl::desc("Disable A5 identity tmov cleanup pass before auto sync"), llvm::cl::init(false)); static llvm::cl::opt disableInferLayout( From 73fac29691bfa5f8595f5a757b7e1fbe6e7fb443 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 19:16:08 +0800 Subject: [PATCH 09/16] [A5][Sync][test] run issue828 case through if/else branches --- .../scripts/generate_testcase.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 4f67b70df..431ca3ccd 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -97,6 +97,18 @@ def _get_case_scalar_overrides(testcase: str, params): return {} +def _get_case_scalar_sweep(testcase: str, params): + scalar_params = [p for p in params if p.get("kind") == "scalar"] + + # Run issue#828 twice in one executable: + # 1) arg6 == 0 => if branch + # 2) arg6 == 1 => else branch + if testcase == "issue828_softmax_rescale_incore_1_a5" and len(scalar_params) == 1: + return [(scalar_params[0]["name"], "0"), (scalar_params[0]["name"], "1")] + + return [] + + def _parse_shape(text: str): match = re.search(r"Shape<(\d+)\s*,\s*(\d+)>", text) if match: @@ -1252,6 +1264,18 @@ def generate_testcase( # header here instead of `runtime/rt.h` to avoid environment-specific # include path issues on some board images. runtime_rt_include = '#include \n#include ' + launch_call = f" {launch_name}({', '.join(launch_call_args + ['stream'])});" + scalar_sweep = _get_case_scalar_sweep(testcase, params) + if scalar_sweep: + launch_lines = [ + " // Branch coverage for issue828: run both if/else paths in one test.", + ] + for scalar_name, scalar_value in scalar_sweep: + launch_lines.append(f" {scalar_name} = {scalar_value};") + launch_lines.append(f" {launch_name}({', '.join(launch_call_args + ['stream'])});") + launch_lines.append(" ACL_CHECK(aclrtSynchronizeStream(stream));") + launch_call = "\n".join(launch_lines) + main_cpp = ( template .replace("@RUNTIME_RT_INCLUDE@", runtime_rt_include) @@ -1268,10 +1292,7 @@ def generate_testcase( .replace("@INIT_RUNTIME_PTRS@", "\n".join(init_runtime_ptrs)) .replace("@READ_INPUTS@", "\n".join(read_inputs)) .replace("@COPY_TO_DEVICE@", "\n".join(copy_inputs)) - .replace( - "@LAUNCH_CALL@", - f" {launch_name}({', '.join(launch_call_args + ['stream'])});", - ) + .replace("@LAUNCH_CALL@", launch_call) .replace("@COPY_BACK@", "\n".join(output_copy_back)) .replace("@WRITE_OUTPUT@", "\n".join(output_write)) .replace("@FREE_DEVICE@", "\n".join(free_device)) From 2deb942e2207750200b6e66ea345ec2398330f5d Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 19:42:01 +0800 Subject: [PATCH 10/16] [A5][Sync][test] split issue828 into if/else pto regressions --- .github/workflows/ci.yml | 13 ++ ...ue828_softmax_rescale_incore_1_a5_else.pto | 121 ++++++++++++++++++ ...ssue828_softmax_rescale_incore_1_a5_if.pto | 121 ++++++++++++++++++ 3 files changed, 255 insertions(+) create mode 100644 test/basic/issue828_softmax_rescale_incore_1_a5_else.pto create mode 100644 test/basic/issue828_softmax_rescale_incore_1_a5_if.pto diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb4749ec0..5884adf59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -215,6 +215,19 @@ jobs: export PTOAS_OUT_DIR="${PAYLOAD_DIR}/test/samples" bash test/samples/runop.sh --enablebc all + - name: Run issue828 branch-fixed basic pto regressions + shell: bash + env: + PTOAS_BIN: ${{ github.workspace }}/build/tools/ptoas/ptoas + run: | + set -euo pipefail + for case in \ + test/basic/issue828_softmax_rescale_incore_1_a5_if.pto \ + test/basic/issue828_softmax_rescale_incore_1_a5_else.pto; do + "${PTOAS_BIN}" --pto-level=level3 --pto-arch=a5 --enable-insert-sync "${case}" >/dev/null + "${PTOAS_BIN}" --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup "${case}" >/dev/null + done + - name: Build payload artifact if: >- ${{ diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto new file mode 100644 index 000000000..bbecfdd43 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto @@ -0,0 +1,121 @@ +// ELSE-only variant from issue #828 (force branch condition false). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto new file mode 100644 index 000000000..66fd836f0 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto @@ -0,0 +1,121 @@ +// IF-only variant from issue #828 (force branch condition true). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c0, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} From b13c8a42fb8c0dc23a9cc26ef448a2680f5ac239 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 20:26:36 +0800 Subject: [PATCH 11/16] [A5][Test] add issue828 diagnostic split cases --- .../issue828_diag_else_3tmov_only_a5.pto | 66 +++++++++++++++++++ test/basic/issue828_diag_else_no_tmov_a5.pto | 53 +++++++++++++++ .../issue828_diag_if_identity_only_a5.pto | 64 ++++++++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 test/basic/issue828_diag_else_3tmov_only_a5.pto create mode 100644 test/basic/issue828_diag_else_no_tmov_a5.pto create mode 100644 test/basic/issue828_diag_if_identity_only_a5.pto diff --git a/test/basic/issue828_diag_else_3tmov_only_a5.pto b/test/basic/issue828_diag_else_3tmov_only_a5.pto new file mode 100644 index 000000000..26e541855 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_only_a5.pto @@ -0,0 +1,66 @@ +// Diagnostic case B (ELSE residual 3 TMOV only): +// Goal: isolate the 3 non-identity TMOVs that remain after identity cleanup. +// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_only_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_only_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_only_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. + %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_no_tmov_a5.pto b/test/basic/issue828_diag_else_no_tmov_a5.pto new file mode 100644 index 000000000..d3cc1015f --- /dev/null +++ b/test/basic/issue828_diag_else_no_tmov_a5.pto @@ -0,0 +1,53 @@ +// Diagnostic case C (ELSE control: no TMOV): +// Goal: verify whether failure still happens without any TMOV on the path. +// Suggested board runs: +// /run A5 issue828_diag_else_no_tmov_a5 --pto-level=level3 +// /run A5 issue828_diag_else_no_tmov_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_no_tmov_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8448 = arith.constant 8448 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Direct path without TMOV to isolate whether TMOV is necessary for the failure. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%dst2 : !pto.tile_buf) + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_if_identity_only_a5.pto b/test/basic/issue828_diag_if_identity_only_a5.pto new file mode 100644 index 000000000..f4d12dd98 --- /dev/null +++ b/test/basic/issue828_diag_if_identity_only_a5.pto @@ -0,0 +1,64 @@ +// Diagnostic case A (IF identity TMOV only): +// Goal: isolate IF-path identity tmov behavior under auto-sync. +// Suggested board runs: +// /run A5 issue828_diag_if_identity_only_a5 --pto-level=level3 +// /run A5 issue828_diag_if_identity_only_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_if_identity_only_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8448 = arith.constant 8448 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%dst2 : !pto.tile_buf) + + %cond = arith.cmpi eq, %c0, %c0 : index + scf.if %cond { + // Distinct SSA, same address as destination (identity by address-range proof). + %src0_alias = pto.alloc_tile addr = %c0i : !pto.tile_buf + %src1_alias = pto.alloc_tile addr = %c64 : !pto.tile_buf + %src2_alias = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + pto.tmov ins(%src0_alias : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.tmov ins(%src1_alias : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.tmov ins(%src2_alias : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + } + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} From e8e7bc82ab223bafbc8e3e2ff7453e1fba4cf2fb Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 21:01:22 +0800 Subject: [PATCH 12/16] [A5][Test] add aligned issue828 if/else variants --- ...ftmax_rescale_incore_1_a5_else_aligned.pto | 121 ++++++++++++++++++ ...softmax_rescale_incore_1_a5_if_aligned.pto | 121 ++++++++++++++++++ 2 files changed, 242 insertions(+) create mode 100644 test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto create mode 100644 test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto new file mode 100644 index 000000000..b1a2b70e1 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto @@ -0,0 +1,121 @@ +// ELSE-only aligned-address variant from issue #828 (force branch condition false). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto new file mode 100644 index 000000000..590e4885b --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto @@ -0,0 +1,121 @@ +// IF-only aligned-address variant from issue #828 (force branch condition true). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c0, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} From 913e2be2940f48ae1491a95d8672e9e0ad7b3ab7 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Thu, 2 Apr 2026 21:53:01 +0800 Subject: [PATCH 13/16] [A5][Diag] add else trace-print case for issue828 --- .../issue828_diag_else_trace_print_a5.pto | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 test/basic/issue828_diag_else_trace_print_a5.pto diff --git a/test/basic/issue828_diag_else_trace_print_a5.pto b/test/basic/issue828_diag_else_trace_print_a5.pto new file mode 100644 index 000000000..7fb127206 --- /dev/null +++ b/test/basic/issue828_diag_else_trace_print_a5.pto @@ -0,0 +1,162 @@ +// ELSE-only diagnostic variant from issue #828 (force branch condition false). +// This case inserts print checkpoints to narrow down which op triggers runtime failure. +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %dbg0 = arith.constant 0 : i32 + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + pto.print ins("E00_enter_else", %dbg0 : i32) + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.print ins("E01_B_tmax", %dbg0 : i32) + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + pto.print ins("E01_A_tmax", %dbg0 : i32) + pto.tprint ins(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.print ins("E02_B_tsub0", %dbg0 : i32) + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + pto.print ins("E02_A_tsub0", %dbg0 : i32) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.print ins("E03_B_texp0", %dbg0 : i32) + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + pto.print ins("E03_A_texp0", %dbg0 : i32) + pto.tprint ins(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.print ins("E04_B_tsub1", %dbg0 : i32) + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + pto.print ins("E04_A_tsub1", %dbg0 : i32) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.print ins("E05_B_texp1", %dbg0 : i32) + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + pto.print ins("E05_A_texp1", %dbg0 : i32) + pto.tprint ins(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.print ins("E06_B_tmul0", %dbg0 : i32) + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + pto.print ins("E06_A_tmul0", %dbg0 : i32) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.print ins("E07_B_tmul1", %dbg0 : i32) + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + pto.print ins("E07_A_tmul1", %dbg0 : i32) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.print ins("E08_B_tadd16", %dbg0 : i32) + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + pto.print ins("E08_A_tadd16", %dbg0 : i32) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.print ins("E09_B_trowexpandmul0", %dbg0 : i32) + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + pto.print ins("E09_A_trowexpandmul0", %dbg0 : i32) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.print ins("E10_B_trowexpandmul1", %dbg0 : i32) + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + pto.print ins("E10_A_trowexpandmul1", %dbg0 : i32) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.print ins("E11_B_tadd128", %dbg0 : i32) + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + pto.print ins("E11_A_tadd128", %dbg0 : i32) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.print ins("E12_B_tmov_li_tmp", %dbg0 : i32) + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + pto.print ins("E12_A_tmov_li_tmp", %dbg0 : i32) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.print ins("E13_B_tmov_mi_tmp", %dbg0 : i32) + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + pto.print ins("E13_A_tmov_mi_tmp", %dbg0 : i32) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.print ins("E14_B_tmov_oi_tmp", %dbg0 : i32) + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.print ins("E14_A_tmov_oi_tmp", %dbg0 : i32) + pto.print ins("E15_B_tmov_li_phi", %dbg0 : i32) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.print ins("E15_A_tmov_li_phi", %dbg0 : i32) + pto.print ins("E16_B_tmov_mi_phi", %dbg0 : i32) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.print ins("E16_A_tmov_mi_phi", %dbg0 : i32) + pto.print ins("E17_B_tmov_oi_phi", %dbg0 : i32) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + pto.print ins("E17_A_tmov_oi_phi", %dbg0 : i32) + pto.print ins("E99_leave_else", %dbg0 : i32) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} From 884f6c5e27b6e3cf90fa3aea70eb262921cb13f7 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 3 Apr 2026 09:36:10 +0800 Subject: [PATCH 14/16] [A5][Diag] avoid tprint in else trace case for board compatibility --- test/basic/issue828_diag_else_trace_print_a5.pto | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/basic/issue828_diag_else_trace_print_a5.pto b/test/basic/issue828_diag_else_trace_print_a5.pto index 7fb127206..d5be3e3d2 100644 --- a/test/basic/issue828_diag_else_trace_print_a5.pto +++ b/test/basic/issue828_diag_else_trace_print_a5.pto @@ -1,5 +1,6 @@ // ELSE-only diagnostic variant from issue #828 (force branch condition false). -// This case inserts print checkpoints to narrow down which op triggers runtime failure. +// This case inserts scalar print checkpoints to narrow down which op triggers runtime failure. +// Note: avoid pto.tprint here because some board CANN versions do not provide TPRINT_IMPL. // RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null // RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null @@ -64,7 +65,6 @@ module attributes {pto.target_arch = "a5"} { pto.print ins("E01_B_tmax", %dbg0 : i32) pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) pto.print ins("E01_A_tmax", %dbg0 : i32) - pto.tprint ins(%mi_new__row_major_tmp_v2 : !pto.tile_buf) %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf @@ -78,7 +78,6 @@ module attributes {pto.target_arch = "a5"} { pto.print ins("E03_B_texp0", %dbg0 : i32) pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) pto.print ins("E03_A_texp0", %dbg0 : i32) - pto.tprint ins(%alpha__row_major_tmp_v7 : !pto.tile_buf) %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf @@ -92,7 +91,6 @@ module attributes {pto.target_arch = "a5"} { pto.print ins("E05_B_texp1", %dbg0 : i32) pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) pto.print ins("E05_A_texp1", %dbg0 : i32) - pto.tprint ins(%beta__row_major_tmp_v12 : !pto.tile_buf) %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf From 8ed1a05869efbdffbda41c847dfc009578f4ac0c Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 3 Apr 2026 11:04:47 +0800 Subject: [PATCH 15/16] [A5][Diag] force PIPE_ALL barriers around residual tmov triad --- test/basic/issue828_diag_else_3tmov_only_a5.pto | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/basic/issue828_diag_else_3tmov_only_a5.pto b/test/basic/issue828_diag_else_3tmov_only_a5.pto index 26e541855..72c7a171a 100644 --- a/test/basic/issue828_diag_else_3tmov_only_a5.pto +++ b/test/basic/issue828_diag_else_3tmov_only_a5.pto @@ -50,9 +50,16 @@ module attributes {pto.target_arch = "a5"} { pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. + pto.barrier pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> From 94917ed008e93b01013595f8604b36f673d9ac31 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 3 Apr 2026 16:00:41 +0800 Subject: [PATCH 16/16] [A5][Diag] add aligned TMOV A/B variants for issue828 --- .../issue828_diag_else_3tmov_rowmajor_a5.pto | 73 +++++++++++++++++++ .../issue828_diag_else_3tmov_vrow1_a5.pto | 73 +++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto create mode 100644 test/basic/issue828_diag_else_3tmov_vrow1_a5.pto diff --git a/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto b/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto new file mode 100644 index 000000000..06b66ba78 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B2 (ELSE residual 3 TMOV, first two become row_major 1x16): +// Goal: keep sync pattern unchanged while making first two TMOVs row-major 1x16 tiles. +// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_rowmajor_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_rowmajor_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_rowmajor_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. + %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<1x16xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<1x16xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. + pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<1x16xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<1x16xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto b/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto new file mode 100644 index 000000000..da2232401 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B1 (ELSE residual 3 TMOV, first two use v_row=1): +// Goal: keep sync pattern unchanged while forcing first two DN tiles to single-row valid shape. +// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_vrow1_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_vrow1_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_vrow1_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. + %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. + pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +}