diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fb4749ec..01ab24d3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,7 +33,7 @@ on:
       skip_cases:
         description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)"
         type: string
-        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp"
+        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12"  # same case list is repeated in DEFAULT_SKIP_CASES later in this workflow; update both together
       run_only_cases:
         description: "Comma/space separated testcase names to run (empty = run all)"
         type: string
@@ -261,7 +261,14 @@ jobs:
       # Temporary CI gate: skip cases that still error/flap on the remote NPU.
       # Update this list as we fix the underlying issues.
       DEFAULT_SKIP_CASES: >-
-        mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp
+        mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,
+        decode_attention_incore_0,decode_attention_incore_1,
+        decode_attention_incore_2,decode_attention_incore_3,
+        decode_attention_incore_4,decode_attention_incore_5,
+        decode_attention_incore_6,decode_attention_incore_7,
+        decode_attention_incore_8,decode_attention_incore_9,
+        decode_attention_incore_10,decode_attention_incore_11,
+        decode_attention_incore_12
     steps:
       - name: Resolve validation parameters
         shell: bash
diff --git a/test/samples/Qwen3Scope2/README.md b/test/samples/Qwen3Scope2/README.md
new file mode 100644
index 00000000..978e54ad
--- /dev/null
+++ b/test/samples/Qwen3Scope2/README.md
@@ -0,0 +1,11 @@
+Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS`
+
+Notes:
+- The source PyPTO program lowers to 13 kernel-level `.pto` files (`decode_attention_incore_0` through `decode_attention_incore_12`) plus an orchestration C++ file.
+- This sample directory vendors only the kernel `.pto` inputs.
+- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files.
+- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed.
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_0.pto b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto
new file mode 100644
index 00000000..d9df6b9e
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_0.pto
@@ -0,0 +1,25 @@
+module attributes {pto.target_arch = "a5"} {  // module is tagged with target arch "a5"
+  func.func @decode_attention_incore_0(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector-kind kernel: converts eight 1x128 bf16 slices of row %arg2 of %arg1 to f32 and stores them as rows 0..7 of %arg0
+  %c0i = arith.constant 0 : i64  // address operand of the bf16 staging tile
+  %c256 = arith.constant 256 : i64  // address operand of the f32 result tile (presumably a byte offset just past the 1x128 bf16 tile; confirm)
+  %c8 = arith.constant 8 : index  // destination row count and loop upper bound
+  %c128 = arith.constant 128 : index  // slice width and destination row stride
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg0 viewed as 8x128 f32 with row stride 128
+  %k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg1 viewed as 16x1024 bf16 with row stride 1024
+  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {  // one iteration per 128-column slice, ki = 0..7
+    %1 = arith.muli %ki__idx_v0, %c128 : index  // column offset of slice ki in the source row
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>  // 1x128 window at [%arg2, ki*128] of the bf16 source
+    pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load the window into the bf16 tile
+    %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // bf16 -> f32 conversion with round mode ROUND
+    %k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>  // 1x128 window at row ki of the f32 destination
+    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)  // store the converted row
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_1.pto b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto
new file mode 100644
index 00000000..d79076b4
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_1.pto
@@ -0,0 +1,58 @@
+module attributes {pto.target_arch = "a5"} {  // module is tagged with target arch "a5"
+  func.func @decode_attention_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector-kind kernel: mixes the two 64-column halves of the 8x128 input %arg2 with the cos/sin rows (%arg0, %arg1, %arg4, %arg5) and writes the 8x128 result to %arg3; looks like a rotary-style rotation (confirm against the PyPTO source)
+  %c0i = arith.constant 0 : i64  // address operand of the cos_hi tile
+  %c256 = arith.constant 256 : i64  // address operand of the cos_lo tile
+  %c512 = arith.constant 512 : i64  // address operand of the sin_hi tile
+  %c768 = arith.constant 768 : i64  // address operand of the sin_lo tile
+  %c1024 = arith.constant 1024 : i64  // address shared by k_lo, %2 and rot_hi
+  %c3072 = arith.constant 3072 : i64  // address shared by k_hi and %1
+  %c5120 = arith.constant 5120 : i64  // address shared by t and rot_lo
+  %c7168 = arith.constant 7168 : i64  // address operand of temporary %0
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg0 viewed as 1x64 f32
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg1 viewed as 1x64 f32
+  %k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2 viewed as 8x128 f32 (input)
+  %k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg3 viewed as 8x128 f32 (output)
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg4 viewed as 1x64 f32
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg5 viewed as 1x64 f32
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load cos_hi
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load cos_lo
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load sin_hi
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load sin_lo
+  %k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // input columns 0..63
+  pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // k_lo = low half of the input
+  %k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // input columns 64..127
+  pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // k_hi = high half of the input
+  %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // t = k_lo * cos_lo (presumably the 1x64 row is applied to each of the 8 rows; confirm tcolexpandmul semantics)
+  %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %0 = k_hi * sin_lo
+  %rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as t
+  pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // rot_lo = t - %0
+  %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as k_hi
+  pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %1 = k_hi * cos_hi; last read of k_hi, so the shared address is reused for the product
+  %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as k_lo
+  pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %2 = k_lo * sin_hi; last read of k_lo
+  %rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address as %2
+  pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // rot_hi = %1 + %2
+  %k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // output columns 0..63
+  pto.tstore ins(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>)  // store rot_lo into the low half of the output
+  %k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // output columns 64..127
+  pto.tstore ins(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>)  // store rot_hi into the high half of the output
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_10.pto b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto
new file mode 100644
index 00000000..142c570b
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_10.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {  // module is tagged with target arch "a5"
+  func.func @decode_attention_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // cube-kind kernel: multiplies the 16x64 bf16 matrix in %arg0 by a 64x128 bf16 window of %arg1 starting at row %arg3 and stores the 16x128 f32 product to %arg2
+  %c0i = arith.constant 0 : i64  // address operand used for the first mat tile and for the left, right and acc tiles
+  %c16384 = arith.constant 16384 : i64  // address operand of the second mat tile (presumably a byte offset past the 64x128 bf16 tile; confirm)
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c524288 = arith.constant 524288 : index  // row count of the %arg1 view
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 16x64 bf16
+  %v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg1 viewed as 524288x128 bf16
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // %arg2 viewed as 16x128 f32
+  %v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>  // 64x128 window starting at row %arg3
+  pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // load the window into a loc=mat tile
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  %exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>  // the whole 16x64 view of %arg0
+  pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // load the left operand into a loc=mat tile
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // move the 16x64 tile from loc=mat to loc=left
+  %v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+  pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // move the 64x128 tile from loc=mat to loc=right; blayout/slayout differ between source and destination types
+  %oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+  pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // (16x64 bf16) x (64x128 bf16) -> 16x128 f32 in the loc=acc tile
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>  // the whole 16x128 output view
+  pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)  // store the accumulator tile to %arg2
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_11.pto b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto
new file mode 100644
index 00000000..17eae5c2
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_11.pto
@@ -0,0 +1,111 @@
+module attributes {pto.target_arch = "a5"} {
+  func.func @decode_attention_incore_11(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>, %arg6: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  %c0i = arith.constant 0 : i64
+  %c32 = arith.constant 32 : i64
+  %c64 = arith.constant 64 : i64
+  %c96 = arith.constant 96 : i64
+  %c128 = arith.constant 128 : i64
+  %c4224 = arith.constant 4224 : i64
+  %c8320 = arith.constant 8320 : i64
+  %c8352 = arith.constant 8352 : i64
+  %c8384 = arith.constant 8384 : i64
+  %c8416 = arith.constant 8416 : i64
+  %c8448 = arith.constant 8448 : i64
+  %c8 = arith.constant 8 : index
+  %c1 = arith.constant 1 : index
+  %7 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %c0 = arith.constant 0 : index
+  %cur_li__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %cur_mi__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %li__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %mi__iter_v1_view = pto.make_tensor_view %arg3, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>
+  %oi__iter_v1_view = pto.make_tensor_view %arg4, shape = [%c8, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %oi_tmp_pad__ssa_v1_view = pto.make_tensor_view %arg5, shape = [%c16, %7], strides = [%7, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
+  %cur_li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_li__ssa_v0_pview = pto.partition_view %cur_li__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_li__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %cur_mi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %cur_mi__ssa_v0_pview = pto.partition_view %cur_mi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%cur_mi__ssa_v0_pview : !pto.partition_tensor_view<8x1xf32>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %li__iter_v1_pview = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+  %mi__iter_v1_pview = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%mi__iter_v1_pview : !pto.partition_tensor_view<8x1xf32>) outs(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+  %oi__tile = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi__iter_v1_pview = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__iter_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %oi_tmp__tile = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+  %oi_tmp_pad__ssa_v1_pview = pto.partition_view %oi_tmp_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi_tmp_pad__ssa_v1_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %8 = arith.cmpi eq, %arg6, %c0 : index
+  %li__phi_v5, %mi__phi_v5, %oi__phi_v5 = scf.if %8 -> (!pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>) {
+    %oi__ssa_v3 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__ssa_v3 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %mi__ssa_v3 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %9 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%9 : !pto.partition_tensor_view<8x1xf32>)
+    %10 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%10 : !pto.partition_tensor_view<8x1xf32>)
+    %11 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+    pto.tstore ins(%oi_tmp__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%11 : !pto.partition_tensor_view<8x128xf32>)
+    scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>
+  } else {
+    %mi_new__rm_a0_tmp_v0 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__rm_a1_tmp_v1 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %mi_new__row_major_tmp_v2 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmax ins(%mi_new__rm_a0_tmp_v0, %mi_new__rm_a1_tmp_v1 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%mi_new__row_major_tmp_v2 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi_new__tile = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v3 = pto.alloc_tile addr = %c96 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v5 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v3, %t__rm_a1_tmp_v4 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v5 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %t__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__rm_a0_tmp_v6 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %alpha__row_major_tmp_v7 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%alpha__rm_a0_tmp_v6 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%alpha__row_major_tmp_v7 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %alpha__tile = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v8 = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v9 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v10 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tsub ins(%t__rm_a0_tmp_v8, %t__rm_a1_tmp_v9 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v10 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %beta__rm_a0_tmp_v11 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %beta__row_major_tmp_v12 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.texp ins(%beta__rm_a0_tmp_v11 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%beta__row_major_tmp_v12 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %beta__tile = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v13 = pto.alloc_tile addr = %c8352 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v14 = pto.alloc_tile addr = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v15 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v13, %t__rm_a1_tmp_v14 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v15 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %1 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a0_tmp_v16 = pto.alloc_tile addr = %c8384 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__rm_a1_tmp_v17 = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %t__row_major_tmp_v18 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tmul ins(%t__rm_a0_tmp_v16, %t__rm_a1_tmp_v17 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__row_major_tmp_v18 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %2 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a0_tmp_v19 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__rm_a1_tmp_v20 = pto.alloc_tile addr = %c8448 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %li__row_major_tmp_v21 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%li__rm_a0_tmp_v19, %li__rm_a1_tmp_v20 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%li__row_major_tmp_v21 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %3 = pto.alloc_tile addr = %c8416 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %4 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi__tile, %alpha__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %5 = pto.alloc_tile addr = %c4224 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.trowexpandmul ins(%oi_tmp__tile, %beta__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %6 = pto.alloc_tile addr = %c128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%4, %5 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %mi__ssa_v4 = pto.alloc_tile addr = %c8320 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    %13 = pto.partition_view %li__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%3 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%13 : !pto.partition_tensor_view<8x1xf32>)
+    %15 = pto.partition_view %mi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+    pto.tstore ins(%mi_new__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%15 : !pto.partition_tensor_view<8x1xf32>)
+    %17 = pto.partition_view %oi__iter_v1_view, offsets = [%c0, %c0], sizes = [%c8, %7] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+    pto.tstore ins(%6 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%17 : !pto.partition_tensor_view<8x128xf32>)
+    scf.yield %li__iter_v1_view, %mi__iter_v1_view, %oi__iter_v1_view : !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>, !pto.tensor_view<?x?xf32>
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_12.pto b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto
new file mode 100644
index 00000000..12407852
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_12.pto
@@ -0,0 +1,28 @@
+module attributes {pto.target_arch = "a5"} {  // module tagged for target arch "a5"
+  func.func @decode_attention_incore_12(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector kernel: combines the 8x1 tile from %arg1 and the 8x128 tile from %arg2 with trowexpanddiv and stores the result as a 1x1024 slice of the 1x8192 row behind %arg0, at column %arg3 * 128
+  %c0i = arith.constant 0 : i64  // i64 constants are used as alloc_tile addresses
+  %c32 = arith.constant 32 : i64
+  %c1 = arith.constant 1 : index  // index constants are used for shapes, strides, offsets and sizes
+  %c8192 = arith.constant 8192 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %attn_row__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c1, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 1x8192 f32 view over %arg0 (layout nd)
+  %li__rv_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>  // 8x1 f32 view over %arg1, strides [1, 8] (layout dn)
+  %oi__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 8x128 f32 view over %arg2 (layout nd)
+  %li__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 col_major tile at address 0
+  %li__rv_v2_pview = pto.partition_view %li__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tload ins(%li__rv_v2_pview : !pto.partition_tensor_view<8x1xf32>) outs(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)  // load the whole 8x1 view into %li__tile
+  %oi__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x128 row_major tile at address 32
+  %oi__rv_v2_pview = pto.partition_view %oi__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tload ins(%oi__rv_v2_pview : !pto.partition_tensor_view<8x128xf32>) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // load the whole 8x128 view into %oi__tile
+  %ctx__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address (%c32) and tile type as %oi__tile
+  pto.trowexpanddiv ins(%oi__tile, %li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ctx__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // NOTE(review): presumably divides each row of %oi__tile by the matching %li__tile entry; confirm against the PTO op definition
+  %ctx_flat__tile = pto.alloc_tile addr = %c32 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x1024 tile at the same address %c32 as %ctx__tile (8 * 128 = 1024 elements)
+  %0 = arith.muli %arg3, %c128 : index  // destination column offset = %arg3 * 128
+  %attn_row__iter_v1_pview = pto.partition_view %attn_row__iter_v1_view, offsets = [%c0, %0], sizes = [%c1, %c1024] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x1024xf32>  // 1x1024 window of the output row starting at column %0
+  pto.tstore ins(%ctx_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=1024, v_row=1, v_col=1024, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%attn_row__iter_v1_pview : !pto.partition_tensor_view<1x1024xf32>)  // write the flat tile to that window
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_2.pto b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto
new file mode 100644
index 00000000..5419f419
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_2.pto
@@ -0,0 +1,39 @@
+module attributes {pto.target_arch = "a5"} {  // module tagged for target arch "a5"
+  func.func @decode_attention_incore_2(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<bf16>, %arg3: !pto.ptr<bf16>, %arg4: index, %arg5: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector kernel: for each of 8 iterations, converts one f32 row from %arg1 to bf16 and stores it into the %arg0 view, then copies one 1x128 bf16 slice from %arg3 into the %arg2 view at the same row index
+  %c0i = arith.constant 0 : i64  // i64 constants are used as alloc_tile addresses
+  %c512 = arith.constant 512 : i64
+  %c524288 = arith.constant 524288 : index  // index constants are used for shapes, strides, offsets, sizes and loop bounds
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c16 = arith.constant 16 : index
+  %c1024 = arith.constant 1024 : index
+  %c0 = arith.constant 0 : index
+  %c4096 = arith.constant 4096 : index
+  %k_cache__iter_v1_view = pto.make_tensor_view %arg0, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 524288x128 bf16 view over %arg0
+  %k_rot_tensor__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 8x128 f32 view over %arg1
+  %v_cache__iter_v1_view = pto.make_tensor_view %arg2, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 524288x128 bf16 view over %arg2
+  %v_proj__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16x1024 bf16 view over %arg3
+  scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {  // %ki__idx_v0 runs over 0..7
+    %2 = arith.muli %arg4, %c8 : index  // %arg4 * 8 (does not depend on the loop index)
+    %3 = arith.muli %2, %c4096 : index  // %arg4 * 8 * 4096 (does not depend on the loop index)
+    %4 = arith.muli %ki__idx_v0, %c4096 : index  // %ki__idx_v0 * 4096
+    %5 = arith.addi %3, %4 : index
+    %6 = arith.addi %5, %arg5 : index  // destination row = %arg4 * 8 * 4096 + %ki__idx_v0 * 4096 + %arg5, used for both stores below
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x128 f32 tile at address 0
+    %k_rot_tensor__ssa_v2_pview = pto.partition_view %k_rot_tensor__ssa_v2_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>  // row %ki__idx_v0 of the %arg1 view
+    pto.tload ins(%k_rot_tensor__ssa_v2_pview : !pto.partition_tensor_view<1x128xf32>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x128 bf16 tile at address 512
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+    %k_cache__iter_v3_pview = pto.partition_view %k_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>  // row %6 of the %arg0 view
+    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)
+    %1 = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // second bf16 tile at the same address %c512 as %0
+    %7 = arith.muli %ki__idx_v0, %c128 : index  // source column offset = %ki__idx_v0 * 128
+    %v_proj__ssa_v0_pview = pto.partition_view %v_proj__ssa_v0_view, offsets = [%arg4, %7], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>  // 1x128 slice of row %arg4 of the %arg3 view, starting at column %7
+    pto.tload ins(%v_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %v_cache__iter_v3_pview = pto.partition_view %v_cache__iter_v1_view, offsets = [%6, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>  // row %6 of the %arg2 view
+    pto.tstore ins(%1 : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%v_cache__iter_v3_pview : !pto.partition_tensor_view<1x128xbf16>)  // bf16 data is copied without conversion
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_3.pto b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto
new file mode 100644
index 00000000..143c98a4
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_3.pto
@@ -0,0 +1,26 @@
+module attributes {pto.target_arch = "a5"} {  // module tagged for target arch "a5"
+  func.func @decode_attention_incore_3(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector kernel: reads eight 1x128 bf16 slices from row %arg2 of the %arg1 view, converts each to f32 and stores it as one row of the 8x128 view over %arg0
+  %c0i = arith.constant 0 : i64  // i64 constants are used as alloc_tile addresses
+  %c256 = arith.constant 256 : i64
+  %c8 = arith.constant 8 : index  // index constants are used for shapes, strides, offsets, sizes and loop bounds
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c8192 = arith.constant 8192 : index
+  %c0 = arith.constant 0 : index
+  %q_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 8x128 f32 destination view over %arg0
+  %q_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c8192], strides = [%c8192, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 16x8192 bf16 source view over %arg1
+  scf.for %qi__idx_v0 = %c0 to %c8 step %c1 {  // %qi__idx_v0 runs over 0..7
+    %1 = arith.addi %arg3, %qi__idx_v0 : index
+    %2 = arith.muli %1, %c128 : index  // source column offset = (%arg3 + %qi__idx_v0) * 128
+    %t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x128 bf16 tile at address 0
+    %q_proj__ssa_v0_pview = pto.partition_view %q_proj__ssa_v0_view, offsets = [%arg2, %2], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>  // 1x128 slice of row %arg2 starting at column %2
+    pto.tload ins(%q_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x128 f32 tile at address 256
+    pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // bf16 -> f32 conversion with round mode ROUND
+    %q_group__iter_v1_pview = pto.partition_view %q_group__ssa_v0_view, offsets = [%qi__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>  // row %qi__idx_v0 of the destination view
+    pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
+  }
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_4.pto b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto
new file mode 100644
index 00000000..9de52a73
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_4.pto
@@ -0,0 +1,64 @@
+module attributes {pto.target_arch = "a5"} {  // module tagged for target arch "a5"
+  func.func @decode_attention_incore_4(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector kernel: splits the 8x128 f32 input (%arg2) into two 8x64 halves, combines them with the 1x64 cos/sin rows (%arg0, %arg1, %arg4, %arg5) as lo*cos_lo - hi*sin_lo and hi*cos_hi + lo*sin_hi, and stores both results as bf16 into the 8x128 view over %arg3
+  %c0i = arith.constant 0 : i64  // i64 constants are used as alloc_tile addresses
+  %c256 = arith.constant 256 : i64
+  %c512 = arith.constant 512 : i64
+  %c768 = arith.constant 768 : i64
+  %c1024 = arith.constant 1024 : i64
+  %c3072 = arith.constant 3072 : i64
+  %c5120 = arith.constant 5120 : i64
+  %c7168 = arith.constant 7168 : i64
+  %c9216 = arith.constant 9216 : i64
+  %c10240 = arith.constant 10240 : i64
+  %c1 = arith.constant 1 : index  // index constants are used for shapes, strides, offsets and sizes
+  %c64 = arith.constant 64 : index
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 1x64 f32 view over %arg0
+  %cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 1x64 f32 view over %arg1
+  %q_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 8x128 f32 input view over %arg2
+  %q_rot_bf16__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // 8x128 bf16 output view over %arg3
+  %sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 1x64 f32 view over %arg4
+  %sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 1x64 f32 view over %arg5
+  %cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 tile at address 0
+  %cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 tile at address 256
+  %cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 tile at address 512
+  %sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x64 tile at address 768
+  %sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
+  pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 tile at address 1024
+  %q_group__rv_v2_pview = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // columns 0..63 of the input view
+  pto.tload ins(%q_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%q_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %q_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 tile at address 3072
+  %3 = pto.partition_view %q_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // columns 64..127 of the input view
+  pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%q_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 scratch tile at address 5120
+  pto.tcolexpandmul ins(%q_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %t__tile = q_lo (x) cos_lo; NOTE(review): presumably the 1x64 row is multiplied into each of the 8 rows, confirm against the PTO op definition
+  %0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 scratch tile at address 7168
+  pto.tcolexpandmul ins(%q_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %0 = q_hi (x) sin_lo
+  %q_rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address %c5120 as %t__tile
+  pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // q_rot_lo = %t__tile - %0
+  %1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address %c3072 as %q_hi__tile
+  pto.tcolexpandmul ins(%q_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %1 = q_hi (x) cos_hi; output shares the address of the %q_hi__tile input, which is not read again after this op
+  %2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address %c1024 as %q_lo__tile
+  pto.tcolexpandmul ins(%q_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %2 = q_lo (x) sin_hi; output shares the address of the %q_lo__tile input, which is not read again after this op
+  %q_rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // same address %c1024 as %2
+  pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // q_rot_hi = %1 + %2
+  %q_rot_lo_bf16__tile = pto.alloc_tile addr = %c9216 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 bf16 tile at address 9216
+  pto.tcvt ins(%q_rot_lo__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+  %q_rot_hi_bf16__tile = pto.alloc_tile addr = %c10240 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x64 bf16 tile at address 10240
+  pto.tcvt ins(%q_rot_hi__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // f32 -> bf16 conversion with round mode ROUND
+  %q_rot_bf16__ssa_v0_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>  // columns 0..63 of the output view
+  pto.tstore ins(%q_rot_lo_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)
+  %q_rot_bf16__tile_pview = pto.partition_view %q_rot_bf16__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>  // columns 64..127 of the output view
+  pto.tstore ins(%q_rot_hi_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_rot_bf16__tile_pview : !pto.partition_tensor_view<8x64xbf16>)
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_5.pto b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto
new file mode 100644
index 00000000..28ad1932
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_5.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {  // module tagged for target arch "a5"
+  func.func @decode_attention_incore_5(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // vector kernel: fills three tiles using the scalar 0.0 and stores them to the 8x1 views over %arg0 and %arg1 and the 8x128 view over %arg2
+  %c0i = arith.constant 0 : i64  // i64 constants are used as alloc_tile addresses
+  %c4096 = arith.constant 4096 : i64
+  %c4128 = arith.constant 4128 : i64
+  %c8 = arith.constant 8 : index  // index constants are used for shapes, strides, offsets and sizes
+  %c1 = arith.constant 1 : index
+  %c128 = arith.constant 128 : index
+  %cst = arith.constant 0.000000e+00 : f32  // scalar passed to every texpands below
+  %c0 = arith.constant 0 : index
+  %ret0__out_view = pto.make_tensor_view %arg0, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>  // 8x1 f32 view over %arg0, strides [1, 8] (layout dn)
+  %ret1__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>  // 8x1 f32 view over %arg1, strides [1, 8] (layout dn)
+  %ret2__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // 8x128 f32 view over %arg2 (layout nd)
+  %oi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x128 tile at address 0
+  pto.texpands ins(%cst : f32) outs(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // NOTE(review): presumably broadcasts %cst (0.0) to every element of the tile; confirm against the PTO op definition
+  %li_flat__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x8 row_major tile at address 4096
+  pto.texpands ins(%cst : f32) outs(%li_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %li__tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 col_major tile at the same address %c4096 as %li_flat__tile
+  %mi_flat__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 1x8 row_major tile at address 4128
+  pto.texpands ins(%cst : f32) outs(%mi_flat__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+  %mi__tile = pto.alloc_tile addr = %c4128 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 col_major tile at the same address %c4128 as %mi_flat__tile
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)  // %li__tile -> view over %arg0
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)  // %mi__tile -> view over %arg1
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x128xf32>
+  pto.tstore ins(%oi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x128xf32>)  // %oi__tile -> view over %arg2
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_6.pto b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto
new file mode 100644
index 00000000..26e9555c
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_6.pto
@@ -0,0 +1,18 @@
+module attributes {pto.target_arch = "a5"} {  // Module-level target arch attribute is "a5".
+  func.func @decode_attention_incore_6(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // Vector kernel: copies the 8x128 bf16 tensor at %arg1 into rows 0..7 of the 16x128 bf16 tensor at %arg0, staged through a single vec tile.
+  %c0i = arith.constant 0 : i64  // i64 address operand for alloc_tile.
+  %c16 = arith.constant 16 : index
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %q_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // Destination: %arg0 viewed as 16x128 with strides [128, 1].
+  %q_rot_bf16__ssa_v2_view = pto.make_tensor_view %arg1, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // Source: %arg1 viewed as 8x128 with strides [128, 1].
+  %q_bf16_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // The only tile in this kernel: 8x128 bf16, loc=vec, address 0.
+  %q_rot_bf16__ssa_v2_pview = pto.partition_view %q_rot_bf16__ssa_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x128xbf16>  // Selects the whole 8x128 source.
+  pto.tload ins(%q_rot_bf16__ssa_v2_pview : !pto.partition_tensor_view<8x128xbf16>) outs(%q_bf16_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // Source partition -> staging tile.
+  %q_padded__ssa_v0_pview = pto.partition_view %q_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x128xbf16>  // Selects only the 8x128 window at offset [0, 0] of the 16x128 destination.
+  pto.tstore ins(%q_bf16_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%q_padded__ssa_v0_pview : !pto.partition_tensor_view<8x128xbf16>)  // Staging tile -> destination window; no op in this function writes rows 8..15 of %arg0.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_7.pto b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto
new file mode 100644
index 00000000..282f797e
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_7.pto
@@ -0,0 +1,30 @@
+module attributes {pto.target_arch = "a5"} {  // Module-level target arch attribute is "a5".
+  func.func @decode_attention_incore_7(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {  // Cube kernel: multiplies the 16x128 bf16 tensor at %arg1 by a 128x64 bf16 window of %arg0 (columns starting at %arg3) and stores the 16x64 f32 result to %arg2.
+  %c0i = arith.constant 0 : i64  // Address 0; reused below for tiles in loc=mat, left, right and acc.
+  %c16384 = arith.constant 16384 : i64  // Address of %lhs_mat in loc=mat. 128*64 bf16 = 16384 bytes, so presumably this sits right after %k_tile__tile if addresses are byte offsets - TODO confirm units.
+  %c524288 = arith.constant 524288 : index
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %k_cache__rv_v4_view = pto.make_tensor_view %arg0, shape = [%c128, %c524288], strides = [%c1, %c128] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xbf16>  // %arg0 viewed as 128 x 524288 with strides [1, 128]: the 128 elements of one column are contiguous.
+  %q_padded__ssa_v1_view = pto.make_tensor_view %arg1, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // %arg1 viewed as 16x128 with strides [128, 1].
+  %ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // Output: %arg2 viewed as 16x64 f32 with strides [64, 1].
+  %k_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>  // 128x64 bf16 tile in loc=mat at address 0.
+  %k_cache__rv_v4_pview = pto.partition_view %k_cache__rv_v4_view, offsets = [%c0, %arg3], sizes = [%c128, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<128x64xbf16>  // 128x64 window; %arg3 is the runtime starting column.
+  pto.tload ins(%k_cache__rv_v4_pview : !pto.partition_tensor_view<128x64xbf16>) outs(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // Window of %arg0 -> loc=mat tile.
+  %lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // 16x128 bf16 tile in loc=mat at address 16384.
+  %q_padded__ssa_v1_pview = pto.partition_view %q_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x128xbf16>  // Selects the whole 16x128 input.
+  pto.tload ins(%q_padded__ssa_v1_pview : !pto.partition_tensor_view<16x128xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // %arg1 -> loc=mat tile.
+  %lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>  // loc=left tile at address 0; presumably a separate address space from loc=mat - confirm.
+  pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)  // Move the 16x128 operand from loc=mat to loc=left.
+  %k_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>  // loc=right tile at address 0.
+  pto.tmov ins(%k_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%k_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // Move the 128x64 operand from loc=mat to loc=right.
+  %raw_scores_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>  // 16x64 f32 result tile in loc=acc at address 0.
+  pto.tmatmul ins(%lhs_mat_Left, %k_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=128, cols=64, v_row=128, v_col=64, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // 16x128 (left, bf16) times 128x64 (right, bf16) -> 16x64 f32 in the acc tile.
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x64xf32>  // Selects the whole 16x64 output.
+  pto.tstore ins(%raw_scores_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x64xf32>)  // acc tile -> %arg2.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_8.pto b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto
new file mode 100644
index 00000000..f968b162
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_8.pto
@@ -0,0 +1,49 @@
+module attributes {pto.target_arch = "a5"} {  // Module-level target arch attribute is "a5".
+  func.func @decode_attention_incore_8(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<bf16>, %arg4: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // Vector kernel. Going by the op names: takes rows 0..7 of the 16x64 f32 scores at %arg0 with only the first %arg4 columns valid, scales them, and emits the per-row max (-> %arg2), exp(score - row max) rounded to bf16 (-> %arg3), and the per-row sum of those rounded values (-> %arg1).
+  %c0i = arith.constant 0 : i64  // Address of %scores_valid__tile.
+  %c2048 = arith.constant 2048 : i64  // Address shared by the padded/scaled/shifted/exp f32 score tiles.
+  %c4096 = arith.constant 4096 : i64  // Address of the 8x128 f32 scratch tiles (%tmp_tile and %0).
+  %c8192 = arith.constant 8192 : i64  // Address of %cur_mi__tile.
+  %c8224 = arith.constant 8224 : i64  // Address of %exp_scores_bf16__tile.
+  %c9248 = arith.constant 9248 : i64  // Address of %cur_li__tile.
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 8.838835e-02 : f32  // ~= 1/sqrt(128); presumably the attention score scale for head dim 128 - confirm against the generator.
+  %raw_scores_pad__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>  // Input: %arg0 viewed as 16x64 f32 with strides [64, 1].
+  %ret0__out_view = pto.make_tensor_view %arg1, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>  // Output 0: %arg1 viewed as 8x1 f32 with strides [1, 8].
+  %ret1__out_view = pto.make_tensor_view %arg2, shape = [%c8, %c1], strides = [%c1, %c8] {layout = #pto.layout<dn>}: !pto.tensor_view<?x?xf32>  // Output 1: %arg2 viewed as 8x1 f32 with strides [1, 8].
+  %ret2__out_view = pto.make_tensor_view %arg3, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // Output 2: %arg3 viewed as 8x64 bf16 with strides [64, 1].
+  %scores_valid__tile = pto.alloc_tile addr = %c0i valid_row = %c8 valid_col = %c64 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // Tile with a dynamic valid shape (v_row=?, v_col=?), initially 8x64.
+  %raw_scores_pad__ssa_v1_pview = pto.partition_view %raw_scores_pad__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>  // Only the top 8 of the 16 input rows are selected.
+  pto.tload ins(%raw_scores_pad__ssa_v1_pview : !pto.partition_tensor_view<8x64xf32>) outs(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // Input window -> tile.
+  pto.set_validshape %scores_valid__tile, %c8, %arg4 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // Valid shape becomes 8 x %arg4 (runtime column count).
+  %scores_padded__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // Tile type carries pad=3; the fill value this encodes is not visible here - presumably a very negative value so padded columns cannot win the row max; confirm.
+  pto.tfillpad ins(%scores_valid__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%scores_padded__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)  // Partially valid tile -> fully valid 8x64 tile.
+  %scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // Same address and type as %scores_padded__tile, so the scaling below writes over its own input.
+  pto.tmuls ins(%scores_padded__tile, %cst : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, f32) outs(%scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)  // Tile-by-scalar op with %cst (presumably elementwise multiply).
+  %tmp_tile = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // 8x128 f32 tile passed as the second input of trowmax (presumably scratch space).
+  %cur_mi__tile = pto.alloc_tile addr = %c8192 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 f32 column tile.
+  pto.trowmax ins(%scores__tile, %tmp_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)  // 8x64 scores -> 8x1 (per-row max, by op name).
+  %t__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // Address 2048 again.
+  pto.trowexpandsub ins(%scores__tile, %cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)  // Combines the 8x64 scores with the 8x1 column tile (presumably scores[r][c] - mi[r]).
+  %exp_scores__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // Address 2048 again.
+  pto.texp ins(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)  // Unary op on the shifted scores (elementwise exp, by op name).
+  %exp_scores_bf16__tile = pto.alloc_tile addr = %c8224 : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // 8x64 bf16 tile at address 8224.
+  pto.tcvt ins(%exp_scores__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)  // f32 -> bf16 with round mode ROUND.
+  %exp_scores_fp32__tile = pto.alloc_tile addr = %c2048 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>  // Address 2048 again.
+  pto.tcvt ins(%exp_scores_bf16__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%exp_scores_fp32__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>)  // bf16 -> f32, so the reduction below reads the bf16-rounded values rather than the original f32 ones.
+  %0 = pto.alloc_tile addr = %c4096 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // Same address and type as %tmp_tile; second input of trowsum.
+  %cur_li__tile = pto.alloc_tile addr = %c9248 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>  // 8x1 f32 column tile.
+  pto.trowsum ins(%exp_scores_fp32__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=128, v_row=8, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>)  // 8x64 -> 8x1 (per-row sum, by op name).
+  %ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_li__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<8x1xf32>)  // %arg1 <- %cur_li__tile (trowsum result).
+  %ret1__out_pview = pto.partition_view %ret1__out_view, offsets = [%c0, %c0], sizes = [%c8, %c1] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x1xf32>
+  pto.tstore ins(%cur_mi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=1, v_row=8, v_col=1, blayout=col_major, slayout=none_box, fractal=512, pad=0>) outs(%ret1__out_pview : !pto.partition_tensor_view<8x1xf32>)  // %arg2 <- %cur_mi__tile (trowmax result).
+  %ret2__out_pview = pto.partition_view %ret2__out_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>
+  pto.tstore ins(%exp_scores_bf16__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=3>) outs(%ret2__out_pview : !pto.partition_tensor_view<8x64xbf16>)  // %arg3 <- bf16 tile produced by the first tcvt.
+  return
+  }
+}
diff --git a/test/samples/Qwen3Scope2/decode_attention_incore_9.pto b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto
new file mode 100644
index 00000000..0c16cfc6
--- /dev/null
+++ b/test/samples/Qwen3Scope2/decode_attention_incore_9.pto
@@ -0,0 +1,18 @@
+module attributes {pto.target_arch = "a5"} {  // Module-level target arch attribute is "a5".
+  func.func @decode_attention_incore_9(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {  // Vector kernel: copies the 8x64 bf16 tensor at %arg1 into rows 0..7 of the 16x64 bf16 tensor at %arg0, staged through a single vec tile.
+  %c0i = arith.constant 0 : i64  // i64 address operand for alloc_tile.
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %exp_padded__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // Destination: %arg0 viewed as 16x64 with strides [64, 1].
+  %exp_scores_bf16__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c8, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>  // Source: %arg1 viewed as 8x64 with strides [64, 1].
+  %exp_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // The only tile in this kernel: 8x64 bf16, loc=vec, address 0.
+  %exp_scores_bf16__ssa_v0_pview = pto.partition_view %exp_scores_bf16__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>  // Selects the whole 8x64 source.
+  pto.tload ins(%exp_scores_bf16__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>) outs(%exp_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // Source partition -> staging tile.
+  %exp_padded__ssa_v0_pview = pto.partition_view %exp_padded__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<8x64xbf16>  // Selects only the 8x64 window at offset [0, 0] of the 16x64 destination.
+  pto.tstore ins(%exp_tile__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%exp_padded__ssa_v0_pview : !pto.partition_tensor_view<8x64xbf16>)  // Staging tile -> destination window; no op in this function writes rows 8..15 of %arg0.
+  return
+  }
+}
diff --git a/test/samples/runop.sh b/test/samples/runop.sh
index a57efc8a..b8c02ff0 100755
--- a/test/samples/runop.sh
+++ b/test/samples/runop.sh
@@ -19,7 +19,7 @@ PYTHON_BIN="${PYTHON_BIN:-}"
 PTOAS_OUT_DIR="${PTOAS_OUT_DIR:-}"
 PTOAS_ENABLE_INSERT_SYNC="${PTOAS_ENABLE_INSERT_SYNC:-1}"
 PTOAS_FLAGS="${PTOAS_FLAGS:-}"
-PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync}"
+PTO_PTO_DIRS="${PTO_PTO_DIRS:-Sync Qwen3Scope2}"
 ENABLE_BC=0
 
 usage() {
@@ -36,7 +36,7 @@ Env:
   PTOAS_OUT_DIR  # where generated *.mlir/*.cpp go (optional; defaults to a temp dir)
   PTOAS_FLAGS  # extra flags passed to ptoas (e.g. --enable-insert-sync)
   PTOAS_ENABLE_INSERT_SYNC  # 1 to append --enable-insert-sync to PTOAS_FLAGS (default: 1)
-  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync)
+  PTO_PTO_DIRS  # space-separated dirs to run .pto directly (default: Sync Qwen3Scope2)
 
 Flags:
   --enablebc  # enable: python -> .pto -> ptobc -> .pto -> ptoas
@@ -153,6 +153,12 @@ process_one_dir() {
   if [[ "${ENABLE_BC}" == "1" ]]; then
     use_ptobc_roundtrip=1
   fi
+  # Qwen3 scope2 kernels currently serve as direct ptoas compile-regression
+  # coverage. They require A5/level3 lowering, but are not expected to
+  # roundtrip through ptobc yet.
+  if [[ "$A" == "Qwen3Scope2" ]]; then
+    use_ptobc_roundtrip=0
+  fi
   local -a ptoas_flags=()
   if [[ -n "${PTOAS_FLAGS}" ]]; then
     # shellcheck disable=SC2206
@@ -172,15 +178,22 @@ process_one_dir() {
   fi
 
   local target_arch="a3"
+  local has_pto_arch_override=0
   if ((${#ptoas_flags[@]})); then
     for ((idx=0; idx<${#ptoas_flags[@]}; ++idx)); do
       if [[ "${ptoas_flags[idx]}" == "--pto-arch" && $((idx + 1)) -lt ${#ptoas_flags[@]} ]]; then
         target_arch="${ptoas_flags[idx + 1]}"
+        has_pto_arch_override=1
       elif [[ "${ptoas_flags[idx]}" == --pto-arch=* ]]; then
         target_arch="${ptoas_flags[idx]#--pto-arch=}"
+        has_pto_arch_override=1
       fi
     done
   fi
+  if [[ "$A" == "Qwen3Scope2" && $has_pto_arch_override -eq 0 ]]; then
+    ptoas_flags+=(--pto-arch a5 --pto-level=level3)
+    target_arch="a5"
+  fi
   local expected_vec_barrier="pipe_barrier(PIPE_V)"
   local skip_vec_barrier=0
   if [[ "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" == "a5" ]]; then
@@ -903,7 +916,6 @@ PY
       if [[ "$base" == "test_if_else_tile_result" ]]; then
         sample_use_ptobc_roundtrip=0
       fi
-
       if [[ $sample_use_ptobc_roundtrip -eq 1 ]]; then
         # Allow generic escape for ops that are not yet in the compact v0 opcode table.
         if ! PTOBC_ALLOW_GENERIC=1 "$ptobc" encode "$f" -o "$ptobc_file" >/dev/null 2>&1; then