diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb4749ec0..5884adf59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -215,6 +215,19 @@ jobs: export PTOAS_OUT_DIR="${PAYLOAD_DIR}/test/samples" bash test/samples/runop.sh --enablebc all + - name: Run issue828 branch-fixed basic pto regressions + shell: bash + env: + PTOAS_BIN: ${{ github.workspace }}/build/tools/ptoas/ptoas + run: | + set -euo pipefail + for case in \ + test/basic/issue828_softmax_rescale_incore_1_a5_if.pto \ + test/basic/issue828_softmax_rescale_incore_1_a5_else.pto; do + "${PTOAS_BIN}" --pto-level=level3 --pto-arch=a5 --enable-insert-sync "${case}" >/dev/null + "${PTOAS_BIN}" --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup "${case}" >/dev/null + done + - name: Build payload artifact if: >- ${{ diff --git a/README.md b/README.md index 9a3132c9f..337478b66 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,9 @@ ptoas tests/input.pto # 运行 AutoSyncInsert Pass ptoas tests/input.pto --enable-insert-sync -o outputfile.cpp +# 调试开关:关闭 A5 identity tmov cleanup(用于 A/B 验证) +ptoas tests/input.pto --enable-insert-sync --disable-identity-tmov-cleanup -o outputfile.cpp + # 指定目标硬件架构(A3 / A5) ptoas tests/input.pto --pto-arch=a3 -o outputfile.cpp diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index cafdb784c..4861b155d 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -38,6 +38,7 @@ std::unique_ptr createPTOLowerFrontendPipeOpsPass(); std::unique_ptr createPTOResolveReservedBuffersPass(); std::unique_ptr createPTOWrapFunctionsInSectionsPass(); std::unique_ptr createPTOVerifyTFreePass(); +std::unique_ptr createPTORemoveIdentityTMovPass(); // Creates a pass for ... 
std::unique_ptr createPTOInsertSyncPass(); diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index 37979bf21..0766b599c 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -38,6 +38,23 @@ def PTOInsertSync : Pass<"pto-insert-sync", "func::FuncOp"> { ]; } +def PTORemoveIdentityTMov : Pass<"pto-remove-identity-tmov", "func::FuncOp"> { + let summary = "Remove identity pto.tmov before auto-sync on A5"; + let description = [{ + Erases provably-identity `pto.tmov` operations on A5 before + `pto-insert-sync`. Besides `src == dst`, this also handles distinct SSA + values when alias analysis proves source and destination have the exact same + address range and type/layout. `memref.reinterpret_cast` views are treated + conservatively (not elided), and when roots differ the pass requires + concrete-equal root addresses. Dynamic/unknown address cases are not + removed. + }]; + let constructor = "mlir::pto::createPTORemoveIdentityTMovPass()"; + let dependentDialects = [ + "mlir::pto::PTODialect" + ]; +} + def ConvertToPTOOp : Pass<"convert-to-pto-op"> { let summary = "Convert Ops from other dialects to PTO Ops"; let constructor = "mlir::pto::createConvertToPTOOpPass()"; diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index b82d227fe..755c2a1bb 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -23,6 +23,7 @@ add_mlir_dialect_library(PTOTransforms PTOPlanMemory.cpp PTORemoveRedundantBarrier.cpp InferPTOLayout.cpp + PTORemoveIdentityTMovPass.cpp BufferizableOpInterfaceImpl.cpp ConvertToPTOOp.cpp PTOLowerFrontendPipeOpsPass.cpp diff --git a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp new file mode 100644 index 000000000..0d77ea100 --- /dev/null +++ b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp @@ -0,0 +1,245 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/Passes.h" +#include "PTO/Transforms/InsertSync/MemoryDependentAnalyzer.h" +#include "PTO/Transforms/InsertSync/PTOIRTranslator.h" +#include "PTO/Transforms/InsertSync/SyncCommon.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Matchers.h" +#include "mlir/Pass/Pass.h" +#include + +namespace mlir { +namespace pto { +namespace func = ::mlir::func; +#define GEN_PASS_DEF_PTOREMOVEIDENTITYTMOV +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +static bool isA5Target(func::FuncOp funcOp) { + ModuleOp module = funcOp->getParentOfType(); + if (!module) + return false; + auto arch = module->getAttrOfType("pto.target_arch"); + return arch && arch.getValue().equals_insensitive("a5"); +} + +static const BaseMemInfo * +getSingleMemInfo(const Buffer2MemInfoMap &buffer2MemInfoMap, Value value) { + auto it = buffer2MemInfoMap.find(value); + if (it == buffer2MemInfoMap.end()) + return nullptr; + if (it->second.size() != 1) + return nullptr; + return it->second.front().get(); +} + +static std::optional tryEvalI64Constant(Value value) { + if (!value) + return std::nullopt; + + 
APInt apInt; + if (matchPattern(value, m_ConstantInt(&apInt))) + return apInt.getSExtValue(); + + Operation *defOp = value.getDefiningOp(); + if (!defOp) + return std::nullopt; + + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + + return std::nullopt; +} + +static std::optional +tryGetConcreteRootAddress(const BaseMemInfo *info) { + if (!info) + return std::nullopt; + + if (auto direct = tryEvalI64Constant(info->rootBuffer)) + return direct; + + Operation *defOp = info->rootBuffer.getDefiningOp(); + if (!defOp) + return std::nullopt; + + if (auto alloc = dyn_cast(defOp)) + return tryEvalI64Constant(alloc.getAddr()); + + if (auto cast = dyn_cast(defOp)) { + if (!cast.getAddrs().empty()) + return tryEvalI64Constant(cast.getAddrs().front()); + } + + return std::nullopt; +} + +static bool hasDynamicStaticList(ArrayRef values) { + return llvm::any_of(values, [](int64_t value) { + return value == ShapedType::kDynamic; + }); +} + +static bool isStaticallyAddressableValue(Value value) { + int depth = 0; + constexpr int kMaxDepth = 32; + while (value && depth++ < kMaxDepth) { + Operation *defOp = value.getDefiningOp(); + if (!defOp) + return false; + + if (auto subView = dyn_cast(defOp)) { + if (hasDynamicStaticList(subView.getStaticOffsets()) || + hasDynamicStaticList(subView.getStaticSizes()) || + hasDynamicStaticList(subView.getStaticStrides())) { + return false; + } + value = subView.getSource(); + continue; + } + + if (isa(defOp)) + return false; + + if (auto cast = dyn_cast(defOp)) { + value = cast.getSource(); + continue; + } + if (auto collapse = dyn_cast(defOp)) { + value = collapse.getSrc(); + continue; + } + if (auto expand = dyn_cast(defOp)) { + value = expand.getSrc(); + continue; + } + 
if (auto view = dyn_cast(defOp)) { + if (view.getByteShift()) + return false; + value = view.getSource(); + continue; + } + + return true; + } + + return false; +} + +static bool hasExactSameAddressRange(const BaseMemInfo *srcInfo, + const BaseMemInfo *dstInfo) { + if (!srcInfo || !dstInfo) + return false; + + if (srcInfo->scope != dstInfo->scope) + return false; + if (srcInfo->allocateSize == 0 || dstInfo->allocateSize == 0) + return false; + if (srcInfo->allocateSize != dstInfo->allocateSize) + return false; + if (srcInfo->baseAddresses.empty() || dstInfo->baseAddresses.empty()) + return false; + if (srcInfo->baseAddresses != dstInfo->baseAddresses) + return false; + + return true; +} + +static bool canEraseIdentityTMov( + TMovOp op, const Buffer2MemInfoMap &buffer2MemInfoMap) { + Value src = op.getSrc(); + Value dst = op.getDst(); + + if (src == dst) + return true; + + if (src.getType() != dst.getType()) + return false; + if (!isStaticallyAddressableValue(src) || !isStaticallyAddressableValue(dst)) + return false; + + const BaseMemInfo *srcInfo = getSingleMemInfo(buffer2MemInfoMap, src); + const BaseMemInfo *dstInfo = getSingleMemInfo(buffer2MemInfoMap, dst); + if (!srcInfo || !dstInfo) + return false; + + if (!hasExactSameAddressRange(srcInfo, dstInfo)) + return false; + + if (srcInfo->rootBuffer == dstInfo->rootBuffer) + return true; + + auto srcRootAddr = tryGetConcreteRootAddress(srcInfo); + auto dstRootAddr = tryGetConcreteRootAddress(dstInfo); + if (!srcRootAddr || !dstRootAddr) + return false; + return *srcRootAddr == *dstRootAddr; +} + +struct PTORemoveIdentityTMovPass + : public mlir::pto::impl::PTORemoveIdentityTMovBase< + PTORemoveIdentityTMovPass> { + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + if (!isA5Target(funcOp)) + return; + + bool hasTMov = false; + funcOp.walk([&](TMovOp) { + hasTMov = true; + return WalkResult::interrupt(); + }); + if (!hasTMov) + return; + + MemoryDependentAnalyzer memAnalyzer; + SyncIRs 
syncIR; + Buffer2MemInfoMap buffer2MemInfoMap; + PTOIRTranslator translator(syncIR, memAnalyzer, buffer2MemInfoMap, funcOp, + SyncAnalysisMode::NORMALSYNC); + translator.Build(); + + SmallVector toErase; + funcOp.walk([&](TMovOp op) { + if (canEraseIdentityTMov(op, buffer2MemInfoMap)) + toErase.push_back(op); + }); + + for (TMovOp op : toErase) { + Value result = op.getResult(); + if (result && !result.use_empty()) + result.replaceAllUsesWith(op.getDst()); + op.erase(); + } + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTORemoveIdentityTMovPass() { + return std::make_unique(); +} diff --git a/test/basic/identity_tmov_alias_deadlock_signature_a5.pto b/test/basic/identity_tmov_alias_deadlock_signature_a5.pto new file mode 100644 index 000000000..4974c845b --- /dev/null +++ b/test/basic/identity_tmov_alias_deadlock_signature_a5.pto @@ -0,0 +1,40 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=SAFE +// RUN: ptoas --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s | FileCheck %s --check-prefix=UNSAFE + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_alias_deadlock_signature_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<1x64xf16, #pto.address_space> + %src_alias = memref.subview %ub[0, 0] [1, 64] [1, 1] + : memref<1x64xf16, #pto.address_space> + to memref<1x64xf16, strided<[64, 1]>, #pto.address_space> + %dst_alias = memref.subview %ub[0, 0] [1, 64] [1, 1] + : memref<1x64xf16, #pto.address_space> + to memref<1x64xf16, strided<[64, 1]>, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + pto.tmov ins(%src_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + outs(%dst_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + pto.tstore ins(%dst_alias : 
memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// SAFE-LABEL: __global__ AICORE void identity_tmov_alias_deadlock_signature_a5( +// SAFE-NOT: TMOV( +// SAFE: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// SAFE: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// SAFE-NOT: set_flag(PIPE_MTE2, PIPE_V +// SAFE-NOT: wait_flag(PIPE_MTE2, PIPE_V +// SAFE-NOT: set_flag(PIPE_V, PIPE_MTE3 +// SAFE-NOT: wait_flag(PIPE_V, PIPE_MTE3 + +// UNSAFE-LABEL: __global__ AICORE void identity_tmov_alias_deadlock_signature_a5( +// UNSAFE: TMOV( +// UNSAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// UNSAFE: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// UNSAFE: set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); +// UNSAFE: wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); diff --git a/test/basic/identity_tmov_autosync_a5_only.pto b/test/basic/identity_tmov_autosync_a5_only.pto new file mode 100644 index 000000000..e02a33014 --- /dev/null +++ b/test/basic/identity_tmov_autosync_a5_only.pto @@ -0,0 +1,33 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_autosync_a5_only( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%ub : memref<1x64xf16, #pto.address_space>) + + // Identity move: should be removed on A5 before sync insertion. 
+ pto.tmov ins(%ub : memref<1x64xf16, #pto.address_space>) + outs(%ub : memref<1x64xf16, #pto.address_space>) + + pto.tstore ins(%ub : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_autosync_a5_only( +// A5: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// A5-NOT: set_flag(PIPE_MTE2, PIPE_V +// A5-NOT: wait_flag(PIPE_MTE2, PIPE_V +// A5-NOT: set_flag(PIPE_V, PIPE_MTE3 +// A5-NOT: wait_flag(PIPE_V, PIPE_MTE3 +// A5-NOT: TMOV( +// A5: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + +// A3-LABEL: __global__ AICORE void identity_tmov_autosync_a5_only( +// A3: TMOV( diff --git a/test/basic/identity_tmov_diff_addr_keep_a5.pto b/test/basic/identity_tmov_diff_addr_keep_a5.pto new file mode 100644 index 000000000..ea5734e94 --- /dev/null +++ b/test/basic/identity_tmov_diff_addr_keep_a5.pto @@ -0,0 +1,20 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_diff_addr_keep_a5( + %buf: memref<1024xi32, #pto.address_space>) { + %src = memref.subview %buf[0] [256] [1] + : memref<1024xi32, #pto.address_space> + to memref<256xi32, strided<[1]>, #pto.address_space> + %dst = memref.subview %buf[256] [256] [1] + : memref<1024xi32, #pto.address_space> + to memref<256xi32, strided<[1], offset: 256>, #pto.address_space> + + pto.tmov ins(%src : memref<256xi32, strided<[1]>, #pto.address_space>) + outs(%dst : memref<256xi32, strided<[1], offset: 256>, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_diff_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_dynamic_addr_keep_a5.pto b/test/basic/identity_tmov_dynamic_addr_keep_a5.pto new file mode 100644 index 000000000..088f14127 --- /dev/null +++ b/test/basic/identity_tmov_dynamic_addr_keep_a5.pto @@ -0,0 +1,14 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | 
FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_dynamic_addr_keep_a5( + %src: memref>, + %dst: memref>) { + pto.tmov ins(%src : memref>) + outs(%dst : memref>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_dynamic_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_if_else_alias_a5.pto b/test/basic/identity_tmov_if_else_alias_a5.pto new file mode 100644 index 000000000..62fad4543 --- /dev/null +++ b/test/basic/identity_tmov_if_else_alias_a5.pto @@ -0,0 +1,51 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-level=level3 --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_if_else_alias_a5(%cond: i1) { + %c0_i64 = arith.constant 0 : i64 + %c64_i64 = arith.constant 64 : i64 + + // Phi-like destinations (same addresses are reused in both branches). + %phi0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %phi1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + + scf.if %cond { + // Then: distinct SSA, same address range as destinations. + %then_src0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + pto.tmov ins(%then_src0 : !pto.tile_buf) + outs(%phi0 : !pto.tile_buf) + + %then_src1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + pto.tmov ins(%then_src1 : !pto.tile_buf) + outs(%phi1 : !pto.tile_buf) + } else { + // Else: include a move chain with identical addresses to mirror the issue style. 
+ %else_src0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + pto.tmov ins(%else_src0 : !pto.tile_buf) + outs(%phi0 : !pto.tile_buf) + + %else_src1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + %else_mid1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + pto.tmov ins(%else_src1 : !pto.tile_buf) + outs(%else_mid1 : !pto.tile_buf) + pto.tmov ins(%else_mid1 : !pto.tile_buf) + outs(%phi1 : !pto.tile_buf) + } + + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_if_else_alias_a5( +// A5-NOT: TMOV( + +// A3-LABEL: __global__ AICORE void identity_tmov_if_else_alias_a5( +// A3: TMOV( diff --git a/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto b/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto new file mode 100644 index 000000000..e9f63ee8b --- /dev/null +++ b/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto @@ -0,0 +1,24 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_pointer_cast_diff_addr_keep_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %c0 = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + + %src_buf = pto.pointer_cast(%c0) : memref<1x64xf16, #pto.address_space> + %dst_buf = pto.pointer_cast(%c64) : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_buf : memref<1x64xf16, #pto.address_space>) + pto.tmov ins(%src_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst_buf : memref<1x64xf16, #pto.address_space>) + pto.tstore ins(%dst_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_pointer_cast_diff_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto 
b/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto new file mode 100644 index 000000000..841745ee6 --- /dev/null +++ b/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto @@ -0,0 +1,27 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_pointer_cast_same_addr_diff_ssa_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %c0 = arith.constant 0 : i64 + %c0_idx = arith.constant 0 : index + %c0_alt = arith.index_cast %c0_idx : index to i64 + + %src_buf = pto.pointer_cast(%c0) : memref<1x64xf16, #pto.address_space> + %dst_buf = pto.pointer_cast(%c0_alt) : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_buf : memref<1x64xf16, #pto.address_space>) + pto.tmov ins(%src_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst_buf : memref<1x64xf16, #pto.address_space>) + pto.tstore ins(%dst_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_pointer_cast_same_addr_diff_ssa_a5( +// CHECK-NOT: TMOV( +// CHECK: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// CHECK: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); diff --git a/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto b/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto new file mode 100644 index 000000000..9130de0d6 --- /dev/null +++ b/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto @@ -0,0 +1,26 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_reinterpret_cast_keep_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<64xf16, #pto.address_space> + %v0 = 
memref.reinterpret_cast %ub to offset: [0], sizes: [1, 64], strides: [64, 1] + : memref<64xf16, #pto.address_space> + to memref, #pto.address_space> + %v1 = memref.reinterpret_cast %ub to offset: [0], sizes: [1, 64], strides: [1, 64] + : memref<64xf16, #pto.address_space> + to memref, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%v0 : memref, #pto.address_space>) + pto.tmov ins(%v0 : memref, #pto.address_space>) + outs(%v1 : memref, #pto.address_space>) + pto.tstore ins(%v1 : memref, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_reinterpret_cast_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto b/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto new file mode 100644 index 000000000..280770dd6 --- /dev/null +++ b/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto @@ -0,0 +1,24 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_same_range_distinct_ssa_a5() { + %buf = memref.alloc() : memref<1024xi32, #pto.address_space> + %src = memref.subview %buf[0] [1024] [1] + : memref<1024xi32, #pto.address_space> + to memref<1024xi32, strided<[1]>, #pto.address_space> + %dst = memref.subview %buf[0] [1024] [1] + : memref<1024xi32, #pto.address_space> + to memref<1024xi32, strided<[1]>, #pto.address_space> + + pto.tmov ins(%src : memref<1024xi32, strided<[1]>, #pto.address_space>) + outs(%dst : memref<1024xi32, strided<[1]>, #pto.address_space>) + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_same_range_distinct_ssa_a5( +// A5-NOT: TMOV( + +// A3-LABEL: __global__ AICORE void identity_tmov_same_range_distinct_ssa_a5( +// A3: TMOV( diff --git 
a/test/basic/issue828_diag_else_3tmov_only_a5.pto b/test/basic/issue828_diag_else_3tmov_only_a5.pto new file mode 100644 index 000000000..72c7a171a --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_only_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B (ELSE residual 3 TMOV only): +// Goal: isolate the 3 non-identity TMOVs that remain after identity cleanup. +// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_only_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_only_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_only_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + 
%out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. + %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. 
+ pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto b/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto new file mode 100644 index 000000000..06b66ba78 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B2 (ELSE residual 3 TMOV, first two become row_major 1x16): +// Goal: keep sync pattern unchanged while making first two TMOVs row-major 1x16 tiles. 
+// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_rowmajor_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_rowmajor_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_rowmajor_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. 
+ %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<1x16xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<1x16xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. 
+ pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<1x16xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<1x16xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto b/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto new file mode 100644 index 000000000..da2232401 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B1 (ELSE residual 3 TMOV, first two use v_row=1): +// Goal: keep sync pattern unchanged while forcing first two DN tiles to single-row valid shape. 
+// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_vrow1_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_vrow1_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_vrow1_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. 
+ %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. 
+ pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_no_tmov_a5.pto b/test/basic/issue828_diag_else_no_tmov_a5.pto new file mode 100644 index 000000000..d3cc1015f --- /dev/null +++ b/test/basic/issue828_diag_else_no_tmov_a5.pto @@ -0,0 +1,53 @@ +// Diagnostic case C (ELSE control: no TMOV): +// Goal: verify whether failure still happens without any TMOV on the path. 
+// Suggested board runs: +// /run A5 issue828_diag_else_no_tmov_a5 --pto-level=level3 +// /run A5 issue828_diag_else_no_tmov_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_no_tmov_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8448 = arith.constant 8448 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Direct path without TMOV to isolate whether TMOV is necessary for the failure. 
+ %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%dst2 : !pto.tile_buf) + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_trace_print_a5.pto b/test/basic/issue828_diag_else_trace_print_a5.pto new file mode 100644 index 000000000..d5be3e3d2 --- /dev/null +++ b/test/basic/issue828_diag_else_trace_print_a5.pto @@ -0,0 +1,160 @@ +// ELSE-only diagnostic variant from issue #828 (force branch condition false). 
+// This case inserts scalar print checkpoints to narrow down which op triggers runtime failure. +// Note: avoid pto.tprint here because some board CANN versions do not provide TPRINT_IMPL. +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %dbg0 = arith.constant 0 : i32 + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = 
#pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view 
%oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + pto.print ins("E00_enter_else", %dbg0 : i32) + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.print ins("E01_B_tmax", %dbg0 : i32) + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + pto.print ins("E01_A_tmax", %dbg0 : i32) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.print ins("E02_B_tsub0", %dbg0 : i32) + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + pto.print ins("E02_A_tsub0", %dbg0 : i32) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.print ins("E03_B_texp0", %dbg0 : i32) + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) 
outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + pto.print ins("E03_A_texp0", %dbg0 : i32) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.print ins("E04_B_tsub1", %dbg0 : i32) + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + pto.print ins("E04_A_tsub1", %dbg0 : i32) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.print ins("E05_B_texp1", %dbg0 : i32) + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + pto.print ins("E05_A_texp1", %dbg0 : i32) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.print ins("E06_B_tmul0", %dbg0 : i32) + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + pto.print ins("E06_A_tmul0", %dbg0 : i32) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.print ins("E07_B_tmul1", %dbg0 : i32) + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + pto.print ins("E07_A_tmul1", %dbg0 : i32) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = 
pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.print ins("E08_B_tadd16", %dbg0 : i32) + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + pto.print ins("E08_A_tadd16", %dbg0 : i32) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.print ins("E09_B_trowexpandmul0", %dbg0 : i32) + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + pto.print ins("E09_A_trowexpandmul0", %dbg0 : i32) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.print ins("E10_B_trowexpandmul1", %dbg0 : i32) + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + pto.print ins("E10_A_trowexpandmul1", %dbg0 : i32) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.print ins("E11_B_tadd128", %dbg0 : i32) + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + pto.print ins("E11_A_tadd128", %dbg0 : i32) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.print ins("E12_B_tmov_li_tmp", %dbg0 : i32) + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + pto.print ins("E12_A_tmov_li_tmp", %dbg0 : i32) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.print ins("E13_B_tmov_mi_tmp", %dbg0 : i32) + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + pto.print ins("E13_A_tmov_mi_tmp", %dbg0 : i32) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.print ins("E14_B_tmov_oi_tmp", %dbg0 : i32) + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.print ins("E14_A_tmov_oi_tmp", %dbg0 : i32) + pto.print 
ins("E15_B_tmov_li_phi", %dbg0 : i32) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.print ins("E15_A_tmov_li_phi", %dbg0 : i32) + pto.print ins("E16_B_tmov_mi_phi", %dbg0 : i32) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.print ins("E16_A_tmov_mi_phi", %dbg0 : i32) + pto.print ins("E17_B_tmov_oi_phi", %dbg0 : i32) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + pto.print ins("E17_A_tmov_oi_phi", %dbg0 : i32) + pto.print ins("E99_leave_else", %dbg0 : i32) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_if_identity_only_a5.pto b/test/basic/issue828_diag_if_identity_only_a5.pto new file mode 100644 index 000000000..f4d12dd98 --- /dev/null +++ b/test/basic/issue828_diag_if_identity_only_a5.pto @@ -0,0 +1,64 @@ +// Diagnostic case A (IF identity TMOV only): +// Goal: isolate IF-path identity tmov behavior under auto-sync. 
+// Suggested board runs: +// /run A5 issue828_diag_if_identity_only_a5 --pto-level=level3 +// /run A5 issue828_diag_if_identity_only_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_if_identity_only_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8448 = arith.constant 8448 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view 
%in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%dst2 : !pto.tile_buf) + + %cond = arith.cmpi eq, %c0, %c0 : index + scf.if %cond { + // Distinct SSA, same address as destination (identity by address-range proof). + %src0_alias = pto.alloc_tile addr = %c0i : !pto.tile_buf + %src1_alias = pto.alloc_tile addr = %c64 : !pto.tile_buf + %src2_alias = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + pto.tmov ins(%src0_alias : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.tmov ins(%src1_alias : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.tmov ins(%src2_alias : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + } + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5.pto b/test/basic/issue828_softmax_rescale_incore_1_a5.pto new file mode 100644 index 
000000000..2f4704e6c --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5.pto @@ -0,0 +1,130 @@ +// Regression from: https://github.com/hw-native-sys/pypto/issues/828 +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=SAFE +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s | FileCheck %s --check-prefix=UNSAFE + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = 
#pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view 
%oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %arg6, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = 
pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 
: !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} + +// SAFE-LABEL: __global__ AICORE void softmax_rescale_incore_1( +// SAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); +// SAFE-NOT: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); +// SAFE-NOT: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + +// UNSAFE-LABEL: __global__ AICORE void softmax_rescale_incore_1( +// UNSAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); +// UNSAFE: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); 
diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto new file mode 100644 index 000000000..bbecfdd43 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto @@ -0,0 +1,121 @@ +// ELSE-only variant from issue #828 (force branch condition false). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: 
!pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) 
+ %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : 
!pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 
: !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto new file mode 100644 index 000000000..b1a2b70e1 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto @@ -0,0 +1,121 
@@ +// ELSE-only aligned-address variant from issue #828 (force branch condition false). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : 
!pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload 
ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) 
outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) 
outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto new file mode 100644 index 000000000..66fd836f0 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto @@ -0,0 +1,121 @@ +// IF-only variant from issue #828 (force branch condition true). 
+// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = 
pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) 
+ %8 = arith.cmpi eq, %c0, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = 
pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = 
%c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto new file mode 100644 index 000000000..590e4885b --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto @@ -0,0 +1,121 @@ +// IF-only aligned-address variant from issue #828 (force branch condition true). 
+// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = 
pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) 
+ %8 = arith.cmpi eq, %c0, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = 
pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = 
%c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index ca802b567..431ca3ccd 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -86,6 +86,29 @@ }) +def _get_case_scalar_overrides(testcase: str, params): + scalar_params = [p for p in params if p.get("kind") == "scalar"] + + # issue#828 guard: the target path is the `arg6 == 0` branch. + # Keep default behavior for all other cases. 
+ if testcase == "issue828_softmax_rescale_incore_1_a5" and len(scalar_params) == 1: + return {scalar_params[0]["name"]: "0"} + + return {} + + +def _get_case_scalar_sweep(testcase: str, params): + scalar_params = [p for p in params if p.get("kind") == "scalar"] + + # Run issue#828 twice in one executable: + # 1) arg6 == 0 => if branch + # 2) arg6 == 1 => else branch + if testcase == "issue828_softmax_rescale_incore_1_a5" and len(scalar_params) == 1: + return [(scalar_params[0]["name"], "0"), (scalar_params[0]["name"], "1")] + + return [] + + def _parse_shape(text: str): match = re.search(r"Shape<(\d+)\s*,\s*(\d+)>", text) if match: @@ -1137,10 +1160,15 @@ def generate_testcase( f" size_t fileSize_{p['name']} = elemCount_{p['name']} * sizeof({p['host_type']});" ) + scalar_overrides = _get_case_scalar_overrides(testcase, params) for p in params: if p["kind"] != "scalar": continue t = p["host_type"] + override = scalar_overrides.get(p["name"]) + if override is not None: + param_decls_lines.append(f" {t} {p['name']} = {override};") + continue # Some PTO-ISA APIs use small POD structs as scalar parameters. # Example: pto::MrgSortExecutedNumList (used by TMRGSORT multi-list variants). if t.endswith("MrgSortExecutedNumList"): @@ -1236,6 +1264,18 @@ def generate_testcase( # header here instead of `runtime/rt.h` to avoid environment-specific # include path issues on some board images. 
runtime_rt_include = '#include \n#include ' + launch_call = f" {launch_name}({', '.join(launch_call_args + ['stream'])});" + scalar_sweep = _get_case_scalar_sweep(testcase, params) + if scalar_sweep: + launch_lines = [ + " // Branch coverage for issue828: run both if/else paths in one test.", + ] + for scalar_name, scalar_value in scalar_sweep: + launch_lines.append(f" {scalar_name} = {scalar_value};") + launch_lines.append(f" {launch_name}({', '.join(launch_call_args + ['stream'])});") + launch_lines.append(" ACL_CHECK(aclrtSynchronizeStream(stream));") + launch_call = "\n".join(launch_lines) + main_cpp = ( template .replace("@RUNTIME_RT_INCLUDE@", runtime_rt_include) @@ -1252,10 +1292,7 @@ def generate_testcase( .replace("@INIT_RUNTIME_PTRS@", "\n".join(init_runtime_ptrs)) .replace("@READ_INPUTS@", "\n".join(read_inputs)) .replace("@COPY_TO_DEVICE@", "\n".join(copy_inputs)) - .replace( - "@LAUNCH_CALL@", - f" {launch_name}({', '.join(launch_call_args + ['stream'])});", - ) + .replace("@LAUNCH_CALL@", launch_call) .replace("@COPY_BACK@", "\n".join(output_copy_back)) .replace("@WRITE_OUTPUT@", "\n".join(output_write)) .replace("@FREE_DEVICE@", "\n".join(free_device)) diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index e7034ecab..6fbb83b2a 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -183,6 +183,11 @@ static llvm::cl::opt<bool> enableInsertSync("enable-insert-sync", llvm::cl::desc("Enable automatic synchronization insertion pass"), llvm::cl::init(false)); +static llvm::cl::opt<bool> disableIdentityTMovCleanup( + "disable-identity-tmov-cleanup", + llvm::cl::desc("Disable A5 identity tmov cleanup pass before auto sync"), + llvm::cl::init(false)); + static llvm::cl::opt<bool> disableInferLayout( "disable-infer-layout", llvm::cl::desc("Disable PTO layout inference pass (static-only)"), @@ -1111,8 +1116,13 @@ int main(int argc, char **argv) { pm.addPass(pto::createPTOResolveReservedBuffersPass()); // Conditionally add Sync pass based on flag.
- if (enableInsertSync) + if (enableInsertSync) { + if (!disableIdentityTMovCleanup) { + pm.addNestedPass<func::FuncOp>( + pto::createPTORemoveIdentityTMovPass()); + } pm.addNestedPass<func::FuncOp>(pto::createPTOInsertSyncPass()); + } pm.addPass(createCSEPass()); if (arch == "a3") {