diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb4749ec0..5884adf59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -215,6 +215,19 @@ jobs: export PTOAS_OUT_DIR="${PAYLOAD_DIR}/test/samples" bash test/samples/runop.sh --enablebc all + - name: Run issue828 branch-fixed basic pto regressions + shell: bash + env: + PTOAS_BIN: ${{ github.workspace }}/build/tools/ptoas/ptoas + run: | + set -euo pipefail + for case in \ + test/basic/issue828_softmax_rescale_incore_1_a5_if.pto \ + test/basic/issue828_softmax_rescale_incore_1_a5_else.pto; do + "${PTOAS_BIN}" --pto-level=level3 --pto-arch=a5 --enable-insert-sync "${case}" >/dev/null + "${PTOAS_BIN}" --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup "${case}" >/dev/null + done + - name: Build payload artifact if: >- ${{ diff --git a/README.md b/README.md index 9a3132c9f..337478b66 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,9 @@ ptoas tests/input.pto # 运行 AutoSyncInsert Pass ptoas tests/input.pto --enable-insert-sync -o outputfile.cpp +# 调试开关:关闭 A5 identity tmov cleanup(用于 A/B 验证) +ptoas tests/input.pto --enable-insert-sync --disable-identity-tmov-cleanup -o outputfile.cpp + # 指定目标硬件架构(A3 / A5) ptoas tests/input.pto --pto-arch=a3 -o outputfile.cpp diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index cafdb784c..4861b155d 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -38,6 +38,7 @@ std::unique_ptr createPTOLowerFrontendPipeOpsPass(); std::unique_ptr createPTOResolveReservedBuffersPass(); std::unique_ptr createPTOWrapFunctionsInSectionsPass(); std::unique_ptr createPTOVerifyTFreePass(); +std::unique_ptr createPTORemoveIdentityTMovPass(); // Creates a pass for ... 
std::unique_ptr createPTOInsertSyncPass(); diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index 37979bf21..0766b599c 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -38,6 +38,23 @@ def PTOInsertSync : Pass<"pto-insert-sync", "func::FuncOp"> { ]; } +def PTORemoveIdentityTMov : Pass<"pto-remove-identity-tmov", "func::FuncOp"> { + let summary = "Remove identity pto.tmov before auto-sync on A5"; + let description = [{ + Erases provably-identity `pto.tmov` operations on A5 before + `pto-insert-sync`. Besides `src == dst`, this also handles distinct SSA + values when alias analysis proves source and destination have the exact same + address range and type/layout. `memref.reinterpret_cast` views are treated + conservatively (not elided), and when roots differ the pass requires + concrete-equal root addresses. Dynamic/unknown address cases are not + removed. + }]; + let constructor = "mlir::pto::createPTORemoveIdentityTMovPass()"; + let dependentDialects = [ + "mlir::pto::PTODialect" + ]; +} + def ConvertToPTOOp : Pass<"convert-to-pto-op"> { let summary = "Convert Ops from other dialects to PTO Ops"; let constructor = "mlir::pto::createConvertToPTOOpPass()"; diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index b82d227fe..755c2a1bb 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -23,6 +23,7 @@ add_mlir_dialect_library(PTOTransforms PTOPlanMemory.cpp PTORemoveRedundantBarrier.cpp InferPTOLayout.cpp + PTORemoveIdentityTMovPass.cpp BufferizableOpInterfaceImpl.cpp ConvertToPTOOp.cpp PTOLowerFrontendPipeOpsPass.cpp diff --git a/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp new file mode 100644 index 000000000..0d77ea100 --- /dev/null +++ b/lib/PTO/Transforms/PTORemoveIdentityTMovPass.cpp @@ -0,0 +1,245 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/Passes.h" +#include "PTO/Transforms/InsertSync/MemoryDependentAnalyzer.h" +#include "PTO/Transforms/InsertSync/PTOIRTranslator.h" +#include "PTO/Transforms/InsertSync/SyncCommon.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Matchers.h" +#include "mlir/Pass/Pass.h" +#include + +namespace mlir { +namespace pto { +namespace func = ::mlir::func; +#define GEN_PASS_DEF_PTOREMOVEIDENTITYTMOV +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +static bool isA5Target(func::FuncOp funcOp) { + ModuleOp module = funcOp->getParentOfType(); + if (!module) + return false; + auto arch = module->getAttrOfType("pto.target_arch"); + return arch && arch.getValue().equals_insensitive("a5"); +} + +static const BaseMemInfo * +getSingleMemInfo(const Buffer2MemInfoMap &buffer2MemInfoMap, Value value) { + auto it = buffer2MemInfoMap.find(value); + if (it == buffer2MemInfoMap.end()) + return nullptr; + if (it->second.size() != 1) + return nullptr; + return it->second.front().get(); +} + +static std::optional tryEvalI64Constant(Value value) { + if (!value) + return std::nullopt; + + 
APInt apInt; + if (matchPattern(value, m_ConstantInt(&apInt))) + return apInt.getSExtValue(); + + Operation *defOp = value.getDefiningOp(); + if (!defOp) + return std::nullopt; + + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + if (auto castOp = dyn_cast(defOp)) + return tryEvalI64Constant(castOp.getIn()); + + return std::nullopt; +} + +static std::optional +tryGetConcreteRootAddress(const BaseMemInfo *info) { + if (!info) + return std::nullopt; + + if (auto direct = tryEvalI64Constant(info->rootBuffer)) + return direct; + + Operation *defOp = info->rootBuffer.getDefiningOp(); + if (!defOp) + return std::nullopt; + + if (auto alloc = dyn_cast(defOp)) + return tryEvalI64Constant(alloc.getAddr()); + + if (auto cast = dyn_cast(defOp)) { + if (!cast.getAddrs().empty()) + return tryEvalI64Constant(cast.getAddrs().front()); + } + + return std::nullopt; +} + +static bool hasDynamicStaticList(ArrayRef values) { + return llvm::any_of(values, [](int64_t value) { + return value == ShapedType::kDynamic; + }); +} + +static bool isStaticallyAddressableValue(Value value) { + int depth = 0; + constexpr int kMaxDepth = 32; + while (value && depth++ < kMaxDepth) { + Operation *defOp = value.getDefiningOp(); + if (!defOp) + return false; + + if (auto subView = dyn_cast(defOp)) { + if (hasDynamicStaticList(subView.getStaticOffsets()) || + hasDynamicStaticList(subView.getStaticSizes()) || + hasDynamicStaticList(subView.getStaticStrides())) { + return false; + } + value = subView.getSource(); + continue; + } + + if (isa(defOp)) + return false; + + if (auto cast = dyn_cast(defOp)) { + value = cast.getSource(); + continue; + } + if (auto collapse = dyn_cast(defOp)) { + value = collapse.getSrc(); + continue; + } + if (auto expand = dyn_cast(defOp)) { + value = expand.getSrc(); + continue; + } + 
if (auto view = dyn_cast(defOp)) { + if (view.getByteShift()) + return false; + value = view.getSource(); + continue; + } + + return true; + } + + return false; +} + +static bool hasExactSameAddressRange(const BaseMemInfo *srcInfo, + const BaseMemInfo *dstInfo) { + if (!srcInfo || !dstInfo) + return false; + + if (srcInfo->scope != dstInfo->scope) + return false; + if (srcInfo->allocateSize == 0 || dstInfo->allocateSize == 0) + return false; + if (srcInfo->allocateSize != dstInfo->allocateSize) + return false; + if (srcInfo->baseAddresses.empty() || dstInfo->baseAddresses.empty()) + return false; + if (srcInfo->baseAddresses != dstInfo->baseAddresses) + return false; + + return true; +} + +static bool canEraseIdentityTMov( + TMovOp op, const Buffer2MemInfoMap &buffer2MemInfoMap) { + Value src = op.getSrc(); + Value dst = op.getDst(); + + if (src == dst) + return true; + + if (src.getType() != dst.getType()) + return false; + if (!isStaticallyAddressableValue(src) || !isStaticallyAddressableValue(dst)) + return false; + + const BaseMemInfo *srcInfo = getSingleMemInfo(buffer2MemInfoMap, src); + const BaseMemInfo *dstInfo = getSingleMemInfo(buffer2MemInfoMap, dst); + if (!srcInfo || !dstInfo) + return false; + + if (!hasExactSameAddressRange(srcInfo, dstInfo)) + return false; + + if (srcInfo->rootBuffer == dstInfo->rootBuffer) + return true; + + auto srcRootAddr = tryGetConcreteRootAddress(srcInfo); + auto dstRootAddr = tryGetConcreteRootAddress(dstInfo); + if (!srcRootAddr || !dstRootAddr) + return false; + return *srcRootAddr == *dstRootAddr; +} + +struct PTORemoveIdentityTMovPass + : public mlir::pto::impl::PTORemoveIdentityTMovBase< + PTORemoveIdentityTMovPass> { + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + if (!isA5Target(funcOp)) + return; + + bool hasTMov = false; + funcOp.walk([&](TMovOp) { + hasTMov = true; + return WalkResult::interrupt(); + }); + if (!hasTMov) + return; + + MemoryDependentAnalyzer memAnalyzer; + SyncIRs 
syncIR; + Buffer2MemInfoMap buffer2MemInfoMap; + PTOIRTranslator translator(syncIR, memAnalyzer, buffer2MemInfoMap, funcOp, + SyncAnalysisMode::NORMALSYNC); + translator.Build(); + + SmallVector toErase; + funcOp.walk([&](TMovOp op) { + if (canEraseIdentityTMov(op, buffer2MemInfoMap)) + toErase.push_back(op); + }); + + for (TMovOp op : toErase) { + Value result = op.getResult(); + if (result && !result.use_empty()) + result.replaceAllUsesWith(op.getDst()); + op.erase(); + } + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTORemoveIdentityTMovPass() { + return std::make_unique(); +} diff --git a/test/basic/identity_tmov_alias_deadlock_signature_a5.pto b/test/basic/identity_tmov_alias_deadlock_signature_a5.pto new file mode 100644 index 000000000..4974c845b --- /dev/null +++ b/test/basic/identity_tmov_alias_deadlock_signature_a5.pto @@ -0,0 +1,40 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=SAFE +// RUN: ptoas --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s | FileCheck %s --check-prefix=UNSAFE + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_alias_deadlock_signature_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<1x64xf16, #pto.address_space> + %src_alias = memref.subview %ub[0, 0] [1, 64] [1, 1] + : memref<1x64xf16, #pto.address_space> + to memref<1x64xf16, strided<[64, 1]>, #pto.address_space> + %dst_alias = memref.subview %ub[0, 0] [1, 64] [1, 1] + : memref<1x64xf16, #pto.address_space> + to memref<1x64xf16, strided<[64, 1]>, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + pto.tmov ins(%src_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + outs(%dst_alias : memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + pto.tstore ins(%dst_alias : 
memref<1x64xf16, strided<[64, 1]>, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// SAFE-LABEL: __global__ AICORE void identity_tmov_alias_deadlock_signature_a5( +// SAFE-NOT: TMOV( +// SAFE: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// SAFE: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// SAFE-NOT: set_flag(PIPE_MTE2, PIPE_V +// SAFE-NOT: wait_flag(PIPE_MTE2, PIPE_V +// SAFE-NOT: set_flag(PIPE_V, PIPE_MTE3 +// SAFE-NOT: wait_flag(PIPE_V, PIPE_MTE3 + +// UNSAFE-LABEL: __global__ AICORE void identity_tmov_alias_deadlock_signature_a5( +// UNSAFE: TMOV( +// UNSAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// UNSAFE: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); +// UNSAFE: set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); +// UNSAFE: wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); diff --git a/test/basic/identity_tmov_autosync_a5_only.pto b/test/basic/identity_tmov_autosync_a5_only.pto new file mode 100644 index 000000000..e02a33014 --- /dev/null +++ b/test/basic/identity_tmov_autosync_a5_only.pto @@ -0,0 +1,33 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_autosync_a5_only( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%ub : memref<1x64xf16, #pto.address_space>) + + // Identity move: should be removed on A5 before sync insertion. 
+ pto.tmov ins(%ub : memref<1x64xf16, #pto.address_space>) + outs(%ub : memref<1x64xf16, #pto.address_space>) + + pto.tstore ins(%ub : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_autosync_a5_only( +// A5: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// A5-NOT: set_flag(PIPE_MTE2, PIPE_V +// A5-NOT: wait_flag(PIPE_MTE2, PIPE_V +// A5-NOT: set_flag(PIPE_V, PIPE_MTE3 +// A5-NOT: wait_flag(PIPE_V, PIPE_MTE3 +// A5-NOT: TMOV( +// A5: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + +// A3-LABEL: __global__ AICORE void identity_tmov_autosync_a5_only( +// A3: TMOV( diff --git a/test/basic/identity_tmov_diff_addr_keep_a5.pto b/test/basic/identity_tmov_diff_addr_keep_a5.pto new file mode 100644 index 000000000..ea5734e94 --- /dev/null +++ b/test/basic/identity_tmov_diff_addr_keep_a5.pto @@ -0,0 +1,20 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_diff_addr_keep_a5( + %buf: memref<1024xi32, #pto.address_space>) { + %src = memref.subview %buf[0] [256] [1] + : memref<1024xi32, #pto.address_space> + to memref<256xi32, strided<[1]>, #pto.address_space> + %dst = memref.subview %buf[256] [256] [1] + : memref<1024xi32, #pto.address_space> + to memref<256xi32, strided<[1], offset: 256>, #pto.address_space> + + pto.tmov ins(%src : memref<256xi32, strided<[1]>, #pto.address_space>) + outs(%dst : memref<256xi32, strided<[1], offset: 256>, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_diff_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_dynamic_addr_keep_a5.pto b/test/basic/identity_tmov_dynamic_addr_keep_a5.pto new file mode 100644 index 000000000..088f14127 --- /dev/null +++ b/test/basic/identity_tmov_dynamic_addr_keep_a5.pto @@ -0,0 +1,14 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | 
FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_dynamic_addr_keep_a5( + %src: memref>, + %dst: memref>) { + pto.tmov ins(%src : memref>) + outs(%dst : memref>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_dynamic_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_if_else_alias_a5.pto b/test/basic/identity_tmov_if_else_alias_a5.pto new file mode 100644 index 000000000..62fad4543 --- /dev/null +++ b/test/basic/identity_tmov_if_else_alias_a5.pto @@ -0,0 +1,51 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-level=level3 --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_if_else_alias_a5(%cond: i1) { + %c0_i64 = arith.constant 0 : i64 + %c64_i64 = arith.constant 64 : i64 + + // Phi-like destinations (same addresses are reused in both branches). + %phi0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %phi1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + + scf.if %cond { + // Then: distinct SSA, same address range as destinations. + %then_src0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + pto.tmov ins(%then_src0 : !pto.tile_buf) + outs(%phi0 : !pto.tile_buf) + + %then_src1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + pto.tmov ins(%then_src1 : !pto.tile_buf) + outs(%phi1 : !pto.tile_buf) + } else { + // Else: include a move chain with identical addresses to mirror the issue style. 
+ %else_src0 = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + pto.tmov ins(%else_src0 : !pto.tile_buf) + outs(%phi0 : !pto.tile_buf) + + %else_src1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + %else_mid1 = pto.alloc_tile addr = %c64_i64 + : !pto.tile_buf + pto.tmov ins(%else_src1 : !pto.tile_buf) + outs(%else_mid1 : !pto.tile_buf) + pto.tmov ins(%else_mid1 : !pto.tile_buf) + outs(%phi1 : !pto.tile_buf) + } + + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_if_else_alias_a5( +// A5-NOT: TMOV( + +// A3-LABEL: __global__ AICORE void identity_tmov_if_else_alias_a5( +// A3: TMOV( diff --git a/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto b/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto new file mode 100644 index 000000000..e9f63ee8b --- /dev/null +++ b/test/basic/identity_tmov_pointer_cast_diff_addr_keep_a5.pto @@ -0,0 +1,24 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_pointer_cast_diff_addr_keep_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %c0 = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + + %src_buf = pto.pointer_cast(%c0) : memref<1x64xf16, #pto.address_space> + %dst_buf = pto.pointer_cast(%c64) : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_buf : memref<1x64xf16, #pto.address_space>) + pto.tmov ins(%src_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst_buf : memref<1x64xf16, #pto.address_space>) + pto.tstore ins(%dst_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_pointer_cast_diff_addr_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto 
b/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto new file mode 100644 index 000000000..841745ee6 --- /dev/null +++ b/test/basic/identity_tmov_pointer_cast_same_addr_diff_ssa_a5.pto @@ -0,0 +1,27 @@ +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_pointer_cast_same_addr_diff_ssa_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %c0 = arith.constant 0 : i64 + %c0_idx = arith.constant 0 : index + %c0_alt = arith.index_cast %c0_idx : index to i64 + + %src_buf = pto.pointer_cast(%c0) : memref<1x64xf16, #pto.address_space> + %dst_buf = pto.pointer_cast(%c0_alt) : memref<1x64xf16, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%src_buf : memref<1x64xf16, #pto.address_space>) + pto.tmov ins(%src_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst_buf : memref<1x64xf16, #pto.address_space>) + pto.tstore ins(%dst_buf : memref<1x64xf16, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_pointer_cast_same_addr_diff_ssa_a5( +// CHECK-NOT: TMOV( +// CHECK: set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); +// CHECK: wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); diff --git a/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto b/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto new file mode 100644 index 000000000..9130de0d6 --- /dev/null +++ b/test/basic/identity_tmov_reinterpret_cast_keep_a5.pto @@ -0,0 +1,26 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_reinterpret_cast_keep_a5( + %src: memref<1x64xf16, #pto.address_space>, + %dst: memref<1x64xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<64xf16, #pto.address_space> + %v0 = 
memref.reinterpret_cast %ub to offset: [0], sizes: [1, 64], strides: [64, 1] + : memref<64xf16, #pto.address_space> + to memref, #pto.address_space> + %v1 = memref.reinterpret_cast %ub to offset: [0], sizes: [1, 64], strides: [1, 64] + : memref<64xf16, #pto.address_space> + to memref, #pto.address_space> + + pto.tload ins(%src : memref<1x64xf16, #pto.address_space>) + outs(%v0 : memref, #pto.address_space>) + pto.tmov ins(%v0 : memref, #pto.address_space>) + outs(%v1 : memref, #pto.address_space>) + pto.tstore ins(%v1 : memref, #pto.address_space>) + outs(%dst : memref<1x64xf16, #pto.address_space>) + return + } +} + +// CHECK-LABEL: __global__ AICORE void identity_tmov_reinterpret_cast_keep_a5( +// CHECK: TMOV( diff --git a/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto b/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto new file mode 100644 index 000000000..280770dd6 --- /dev/null +++ b/test/basic/identity_tmov_same_range_distinct_ssa_a5.pto @@ -0,0 +1,24 @@ +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=A5 +// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s | FileCheck %s --check-prefix=A3 + +module attributes {"pto.device-spec" = "Ascend950"} { + func.func @identity_tmov_same_range_distinct_ssa_a5() { + %buf = memref.alloc() : memref<1024xi32, #pto.address_space> + %src = memref.subview %buf[0] [1024] [1] + : memref<1024xi32, #pto.address_space> + to memref<1024xi32, strided<[1]>, #pto.address_space> + %dst = memref.subview %buf[0] [1024] [1] + : memref<1024xi32, #pto.address_space> + to memref<1024xi32, strided<[1]>, #pto.address_space> + + pto.tmov ins(%src : memref<1024xi32, strided<[1]>, #pto.address_space>) + outs(%dst : memref<1024xi32, strided<[1]>, #pto.address_space>) + return + } +} + +// A5-LABEL: __global__ AICORE void identity_tmov_same_range_distinct_ssa_a5( +// A5-NOT: TMOV( + +// A3-LABEL: __global__ AICORE void identity_tmov_same_range_distinct_ssa_a5( +// A3: TMOV( diff --git 
a/test/basic/issue828_diag_else_3tmov_only_a5.pto b/test/basic/issue828_diag_else_3tmov_only_a5.pto new file mode 100644 index 000000000..72c7a171a --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_only_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B (ELSE residual 3 TMOV only): +// Goal: isolate the 3 non-identity TMOVs that remain after identity cleanup. +// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_only_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_only_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_only_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + 
%out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. + %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. 
+ pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto b/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto new file mode 100644 index 000000000..06b66ba78 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_rowmajor_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B2 (ELSE residual 3 TMOV, first two become row_major 1x16): +// Goal: keep sync pattern unchanged while making first two TMOVs row-major 1x16 tiles. 
+// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_rowmajor_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_rowmajor_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_rowmajor_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c1, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. 
+ %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<1x16xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<1x16xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. 
+ pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c1, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<1x16xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<1x16xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<1x16xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto b/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto new file mode 100644 index 000000000..da2232401 --- /dev/null +++ b/test/basic/issue828_diag_else_3tmov_vrow1_a5.pto @@ -0,0 +1,73 @@ +// Diagnostic case B1 (ELSE residual 3 TMOV, first two use v_row=1): +// Goal: keep sync pattern unchanged while forcing first two DN tiles to single-row valid shape. 
+// Suggested board runs: +// /run A5 issue828_diag_else_3tmov_vrow1_a5 --pto-level=level3 +// /run A5 issue828_diag_else_3tmov_vrow1_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_3tmov_vrow1_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16832 = arith.constant 16832 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Source addresses mirror residual else-path producers. 
+ %src0 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %src1 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %src2 = pto.alloc_tile addr = %c256 : !pto.tile_buf + + // Destination addresses mirror phi/result tiles in issue828. + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%src1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%src2 : !pto.tile_buf) + + // Non-identity TMOV triplet under investigation. + // Force full serialization around each TMOV for board-side diagnosis. 
+ pto.barrier + pto.tmov ins(%src0 : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src1 : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.barrier + pto.barrier + pto.tmov ins(%src2 : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + pto.barrier + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_no_tmov_a5.pto b/test/basic/issue828_diag_else_no_tmov_a5.pto new file mode 100644 index 000000000..d3cc1015f --- /dev/null +++ b/test/basic/issue828_diag_else_no_tmov_a5.pto @@ -0,0 +1,53 @@ +// Diagnostic case C (ELSE control: no TMOV): +// Goal: verify whether failure still happens without any TMOV on the path. 
+// Suggested board runs: +// /run A5 issue828_diag_else_no_tmov_a5 --pto-level=level3 +// /run A5 issue828_diag_else_no_tmov_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_else_no_tmov_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8448 = arith.constant 8448 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + // Direct path without TMOV to isolate whether TMOV is necessary for the failure. 
+ %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view %in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%dst2 : !pto.tile_buf) + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_else_trace_print_a5.pto b/test/basic/issue828_diag_else_trace_print_a5.pto new file mode 100644 index 000000000..d5be3e3d2 --- /dev/null +++ b/test/basic/issue828_diag_else_trace_print_a5.pto @@ -0,0 +1,160 @@ +// ELSE-only diagnostic variant from issue #828 (force branch condition false). 
+// This case inserts scalar print checkpoints to narrow down which op triggers runtime failure. +// Note: avoid pto.tprint here because some board CANN versions do not provide TPRINT_IMPL. +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %dbg0 = arith.constant 0 : i32 + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = 
#pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view 
%oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + pto.print ins("E00_enter_else", %dbg0 : i32) + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.print ins("E01_B_tmax", %dbg0 : i32) + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + pto.print ins("E01_A_tmax", %dbg0 : i32) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.print ins("E02_B_tsub0", %dbg0 : i32) + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + pto.print ins("E02_A_tsub0", %dbg0 : i32) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.print ins("E03_B_texp0", %dbg0 : i32) + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) 
outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + pto.print ins("E03_A_texp0", %dbg0 : i32) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.print ins("E04_B_tsub1", %dbg0 : i32) + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + pto.print ins("E04_A_tsub1", %dbg0 : i32) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.print ins("E05_B_texp1", %dbg0 : i32) + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + pto.print ins("E05_A_texp1", %dbg0 : i32) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.print ins("E06_B_tmul0", %dbg0 : i32) + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + pto.print ins("E06_A_tmul0", %dbg0 : i32) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.print ins("E07_B_tmul1", %dbg0 : i32) + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + pto.print ins("E07_A_tmul1", %dbg0 : i32) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = 
pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.print ins("E08_B_tadd16", %dbg0 : i32) + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + pto.print ins("E08_A_tadd16", %dbg0 : i32) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.print ins("E09_B_trowexpandmul0", %dbg0 : i32) + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + pto.print ins("E09_A_trowexpandmul0", %dbg0 : i32) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.print ins("E10_B_trowexpandmul1", %dbg0 : i32) + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + pto.print ins("E10_A_trowexpandmul1", %dbg0 : i32) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.print ins("E11_B_tadd128", %dbg0 : i32) + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + pto.print ins("E11_A_tadd128", %dbg0 : i32) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.print ins("E12_B_tmov_li_tmp", %dbg0 : i32) + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + pto.print ins("E12_A_tmov_li_tmp", %dbg0 : i32) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.print ins("E13_B_tmov_mi_tmp", %dbg0 : i32) + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + pto.print ins("E13_A_tmov_mi_tmp", %dbg0 : i32) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.print ins("E14_B_tmov_oi_tmp", %dbg0 : i32) + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.print ins("E14_A_tmov_oi_tmp", %dbg0 : i32) + pto.print 
ins("E15_B_tmov_li_phi", %dbg0 : i32) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.print ins("E15_A_tmov_li_phi", %dbg0 : i32) + pto.print ins("E16_B_tmov_mi_phi", %dbg0 : i32) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.print ins("E16_A_tmov_mi_phi", %dbg0 : i32) + pto.print ins("E17_B_tmov_oi_phi", %dbg0 : i32) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + pto.print ins("E17_A_tmov_oi_phi", %dbg0 : i32) + pto.print ins("E99_leave_else", %dbg0 : i32) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_diag_if_identity_only_a5.pto b/test/basic/issue828_diag_if_identity_only_a5.pto new file mode 100644 index 000000000..f4d12dd98 --- /dev/null +++ b/test/basic/issue828_diag_if_identity_only_a5.pto @@ -0,0 +1,64 @@ +// Diagnostic case A (IF identity TMOV only): +// Goal: isolate IF-path identity tmov behavior under auto-sync. 
+// Suggested board runs: +// /run A5 issue828_diag_if_identity_only_a5 --pto-level=level3 +// /run A5 issue828_diag_if_identity_only_a5 --pto-level=level3 --disable-identity-tmov-cleanup +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @issue828_diag_if_identity_only_a5( + %in0: !pto.ptr, %in1: !pto.ptr, %in2: !pto.ptr, + %out0: !pto.ptr, %out1: !pto.ptr, %out2: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c8448 = arith.constant 8448 : i64 + + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + + %in0_view = pto.make_tensor_view %in0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in1_view = pto.make_tensor_view %in1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %in2_view = pto.make_tensor_view %in2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %out0_view = pto.make_tensor_view %out0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out1_view = pto.make_tensor_view %out1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout} : !pto.tensor_view + %out2_view = pto.make_tensor_view %out2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout} : !pto.tensor_view + + %dst0 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %dst1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %dst2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + %in0_part = pto.partition_view %in0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in1_part = pto.partition_view 
%in1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %in2_part = pto.partition_view %in2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tload ins(%in0_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst0 : !pto.tile_buf) + pto.tload ins(%in1_part : !pto.partition_tensor_view<16x1xf32>) outs(%dst1 : !pto.tile_buf) + pto.tload ins(%in2_part : !pto.partition_tensor_view<16x128xf32>) outs(%dst2 : !pto.tile_buf) + + %cond = arith.cmpi eq, %c0, %c0 : index + scf.if %cond { + // Distinct SSA, same address as destination (identity by address-range proof). + %src0_alias = pto.alloc_tile addr = %c0i : !pto.tile_buf + %src1_alias = pto.alloc_tile addr = %c64 : !pto.tile_buf + %src2_alias = pto.alloc_tile addr = %c8448 : !pto.tile_buf + + pto.tmov ins(%src0_alias : !pto.tile_buf) outs(%dst0 : !pto.tile_buf) + pto.tmov ins(%src1_alias : !pto.tile_buf) outs(%dst1 : !pto.tile_buf) + pto.tmov ins(%src2_alias : !pto.tile_buf) outs(%dst2 : !pto.tile_buf) + } + + %out0_part = pto.partition_view %out0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out1_part = pto.partition_view %out1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + %out2_part = pto.partition_view %out2_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + + pto.tstore ins(%dst0 : !pto.tile_buf) outs(%out0_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst1 : !pto.tile_buf) outs(%out1_part : !pto.partition_tensor_view<16x1xf32>) + pto.tstore ins(%dst2 : !pto.tile_buf) outs(%out2_part : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5.pto b/test/basic/issue828_softmax_rescale_incore_1_a5.pto new file mode 100644 index 
000000000..2f4704e6c --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5.pto @@ -0,0 +1,130 @@ +// Regression from: https://github.com/hw-native-sys/pypto/issues/828 +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s | FileCheck %s --check-prefix=SAFE +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s | FileCheck %s --check-prefix=UNSAFE + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = 
#pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view 
%oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %arg6, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = 
pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 
: !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} + +// SAFE-LABEL: __global__ AICORE void softmax_rescale_incore_1( +// SAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); +// SAFE-NOT: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); +// SAFE-NOT: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + +// UNSAFE-LABEL: __global__ AICORE void softmax_rescale_incore_1( +// UNSAFE: set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); +// UNSAFE: wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); 
diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto new file mode 100644 index 000000000..bbecfdd43 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_else.pto @@ -0,0 +1,121 @@ +// ELSE-only variant from issue #828 (force branch condition false). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: 
!pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) 
+ %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : 
!pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 
: !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto new file mode 100644 index 000000000..b1a2b70e1 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_else_aligned.pto @@ -0,0 +1,121 
@@ +// ELSE-only aligned-address variant from issue #828 (force branch condition false). +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : 
!pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload 
ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) + %8 = arith.cmpi eq, %c1, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) 
outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) 
outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = %c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto new file mode 100644 index 000000000..66fd836f0 --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_if.pto @@ -0,0 +1,121 @@ +// IF-only variant from issue #828 (force branch condition true). 
+// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c192 = arith.constant 192 : i64 + %c256 = arith.constant 256 : i64 + %c8448 = arith.constant 8448 : i64 + %c16640 = arith.constant 16640 : i64 + %c16704 = arith.constant 16704 : i64 + %c16768 = arith.constant 16768 : i64 + %c16832 = arith.constant 16832 : i64 + %c16896 = arith.constant 16896 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = 
pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) 
+ %8 = arith.cmpi eq, %c0, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = 
pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = 
%c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto b/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto new file mode 100644 index 000000000..590e4885b --- /dev/null +++ b/test/basic/issue828_softmax_rescale_incore_1_a5_if_aligned.pto @@ -0,0 +1,121 @@ +// IF-only aligned-address variant from issue #828 (force branch condition true). 
+// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync %s > /dev/null +// RUN: ptoas --pto-level=level3 --pto-arch=a5 --enable-insert-sync --disable-identity-tmov-cleanup %s > /dev/null + +module attributes {pto.target_arch = "a5"} { + func.func @softmax_rescale_incore_1(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0i = arith.constant 0 : i64 + %c64 = arith.constant 256 : i64 + %c128 = arith.constant 512 : i64 + %c192 = arith.constant 768 : i64 + %c256 = arith.constant 1024 : i64 + %c8448 = arith.constant 33792 : i64 + %c16640 = arith.constant 66560 : i64 + %c16704 = arith.constant 66816 : i64 + %c16768 = arith.constant 67072 : i64 + %c16832 = arith.constant 67328 : i64 + %c16896 = arith.constant 67584 : i64 + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %7 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c16, %c1], strides = [%c1, %c16] {layout = #pto.layout}: !pto.tensor_view + %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %oi_tmp__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout}: !pto.tensor_view + %li__phi_v5 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__phi_v5 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %oi__phi_v5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %cur_li__tile = 
pto.alloc_tile addr = %c0i : !pto.tile_buf + %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_li__tile : !pto.tile_buf) + %cur_mi__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf + %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<16x1xf32>) outs(%cur_mi__tile : !pto.tile_buf) + %li__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf + %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%li__tile : !pto.tile_buf) + %mi__tile = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<16x1xf32>) outs(%mi__tile : !pto.tile_buf) + %oi__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf + %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi__tile : !pto.tile_buf) + %oi_tmp__tile = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %oi_tmp__ssa_v0_pview = pto.partition_view %oi_tmp__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tload ins(%oi_tmp__ssa_v0_pview : !pto.partition_tensor_view<16x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf) 
+ %8 = arith.cmpi eq, %c0, %c0 : index + scf.if %8 { + %oi__ssa_v3 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %mi__ssa_v3 = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%li__ssa_v3 : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v3 : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__ssa_v3 : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } else { + %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf, !pto.tile_buf) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf) + %mi_new__tile = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c192 : !pto.tile_buf + %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v5 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v5 : !pto.tile_buf) + %t__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf) + %alpha__tile = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c64 : !pto.tile_buf + %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %t__row_major_tmp_v10 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v10 : !pto.tile_buf) + %0 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__rm_a0_tmp_v11 = 
pto.alloc_tile addr = %c16768 : !pto.tile_buf + %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf) outs(%beta__row_major_tmp_v12 : !pto.tile_buf) + %beta__tile = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c16704 : !pto.tile_buf + %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c128 : !pto.tile_buf + %t__row_major_tmp_v15 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v15 : !pto.tile_buf) + %1 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c16768 : !pto.tile_buf + %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf + %t__row_major_tmp_v18 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf, !pto.tile_buf) outs(%t__row_major_tmp_v18 : !pto.tile_buf) + %2 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c16896 : !pto.tile_buf + %li__row_major_tmp_v21 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf, !pto.tile_buf) outs(%li__row_major_tmp_v21 : !pto.tile_buf) + %3 = pto.alloc_tile addr = %c16832 : !pto.tile_buf + %4 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf, !pto.tile_buf) outs(%4 : !pto.tile_buf) + %5 = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf, !pto.tile_buf) outs(%5 : !pto.tile_buf) + %6 = pto.alloc_tile addr = %c256 : !pto.tile_buf + pto.tadd ins(%4, %5 : !pto.tile_buf, !pto.tile_buf) outs(%6 : !pto.tile_buf) + %mi__ssa_v4 = pto.alloc_tile addr = %c16640 : !pto.tile_buf + %li__tile_mv = pto.alloc_tile addr = 
%c0i : !pto.tile_buf + pto.tmov ins(%3 : !pto.tile_buf) outs(%li__tile_mv : !pto.tile_buf) + %mi__ssa_v4_mv = pto.alloc_tile addr = %c64 : !pto.tile_buf + pto.tmov ins(%mi__ssa_v4 : !pto.tile_buf) outs(%mi__ssa_v4_mv : !pto.tile_buf) + %oi__tile_mv = pto.alloc_tile addr = %c8448 : !pto.tile_buf + pto.tmov ins(%6 : !pto.tile_buf) outs(%oi__tile_mv : !pto.tile_buf) + pto.tmov ins(%li__tile_mv : !pto.tile_buf) outs(%li__phi_v5 : !pto.tile_buf) + pto.tmov ins(%mi__ssa_v4_mv : !pto.tile_buf) outs(%mi__phi_v5 : !pto.tile_buf) + pto.tmov ins(%oi__tile_mv : !pto.tile_buf) outs(%oi__phi_v5 : !pto.tile_buf) + } + %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%li__phi_v5 : !pto.tile_buf) outs(%9 : !pto.partition_tensor_view<16x1xf32>) + %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c1] : !pto.tensor_view -> !pto.partition_tensor_view<16x1xf32> + pto.tstore ins(%mi__phi_v5 : !pto.tile_buf) outs(%10 : !pto.partition_tensor_view<16x1xf32>) + %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c16, %7] : !pto.tensor_view -> !pto.partition_tensor_view<16x128xf32> + pto.tstore ins(%oi__phi_v5 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<16x128xf32>) + return + } +} diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index ca802b567..431ca3ccd 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -86,6 +86,29 @@ }) +def _get_case_scalar_overrides(testcase: str, params): + scalar_params = [p for p in params if p.get("kind") == "scalar"] + + # issue#828 guard: the target path is the `arg6 == 0` branch. + # Keep default behavior for all other cases. 
+ if testcase == "issue828_softmax_rescale_incore_1_a5" and len(scalar_params) == 1: + return {scalar_params[0]["name"]: "0"} + + return {} + + +def _get_case_scalar_sweep(testcase: str, params): + scalar_params = [p for p in params if p.get("kind") == "scalar"] + + # Run issue#828 twice in one executable: + # 1) arg6 == 0 => if branch + # 2) arg6 == 1 => else branch + if testcase == "issue828_softmax_rescale_incore_1_a5" and len(scalar_params) == 1: + return [(scalar_params[0]["name"], "0"), (scalar_params[0]["name"], "1")] + + return [] + + def _parse_shape(text: str): match = re.search(r"Shape<(\d+)\s*,\s*(\d+)>", text) if match: @@ -1137,10 +1160,15 @@ def generate_testcase( f" size_t fileSize_{p['name']} = elemCount_{p['name']} * sizeof({p['host_type']});" ) + scalar_overrides = _get_case_scalar_overrides(testcase, params) for p in params: if p["kind"] != "scalar": continue t = p["host_type"] + override = scalar_overrides.get(p["name"]) + if override is not None: + param_decls_lines.append(f" {t} {p['name']} = {override};") + continue # Some PTO-ISA APIs use small POD structs as scalar parameters. # Example: pto::MrgSortExecutedNumList (used by TMRGSORT multi-list variants). if t.endswith("MrgSortExecutedNumList"): @@ -1236,6 +1264,18 @@ def generate_testcase( # header here instead of `runtime/rt.h` to avoid environment-specific # include path issues on some board images. 
runtime_rt_include = '#include \n#include ' + launch_call = f" {launch_name}({', '.join(launch_call_args + ['stream'])});" + scalar_sweep = _get_case_scalar_sweep(testcase, params) + if scalar_sweep: + launch_lines = [ + " // Branch coverage for issue828: run both if/else paths in one test.", + ] + for scalar_name, scalar_value in scalar_sweep: + launch_lines.append(f" {scalar_name} = {scalar_value};") + launch_lines.append(f" {launch_name}({', '.join(launch_call_args + ['stream'])});") + launch_lines.append(" ACL_CHECK(aclrtSynchronizeStream(stream));") + launch_call = "\n".join(launch_lines) + main_cpp = ( template .replace("@RUNTIME_RT_INCLUDE@", runtime_rt_include) @@ -1252,10 +1292,7 @@ def generate_testcase( .replace("@INIT_RUNTIME_PTRS@", "\n".join(init_runtime_ptrs)) .replace("@READ_INPUTS@", "\n".join(read_inputs)) .replace("@COPY_TO_DEVICE@", "\n".join(copy_inputs)) - .replace( - "@LAUNCH_CALL@", - f" {launch_name}({', '.join(launch_call_args + ['stream'])});", - ) + .replace("@LAUNCH_CALL@", launch_call) .replace("@COPY_BACK@", "\n".join(output_copy_back)) .replace("@WRITE_OUTPUT@", "\n".join(output_write)) .replace("@FREE_DEVICE@", "\n".join(free_device)) diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index e7034ecab..6fbb83b2a 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -183,6 +183,11 @@ static llvm::cl::opt<bool> enableInsertSync("enable-insert-sync", llvm::cl::desc("Enable automatic synchronization insertion pass"), llvm::cl::init(false)); +static llvm::cl::opt<bool> disableIdentityTMovCleanup( + "disable-identity-tmov-cleanup", + llvm::cl::desc("Disable A5 identity tmov cleanup pass before auto sync"), + llvm::cl::init(false)); + static llvm::cl::opt<bool> disableInferLayout( "disable-infer-layout", llvm::cl::desc("Disable PTO layout inference pass (static-only)"), @@ -1111,8 +1116,13 @@ int main(int argc, char **argv) { pm.addPass(pto::createPTOResolveReservedBuffersPass()); // Conditionally add Sync pass based on flag.
- if (enableInsertSync) + if (enableInsertSync) { + if (!disableIdentityTMovCleanup) { + pm.addNestedPass<func::FuncOp>( + pto::createPTORemoveIdentityTMovPass()); + } pm.addNestedPass<func::FuncOp>(pto::createPTOInsertSyncPass()); + } pm.addPass(createCSEPass()); if (arch == "a3") {