Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ on:
skip_cases:
description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)"
type: string
default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp"
default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12"
run_only_cases:
description: "Comma/space separated testcase names to run (empty = run all)"
type: string
Expand Down Expand Up @@ -261,7 +261,14 @@ jobs:
# Temporary CI gate: skip cases that still error/flap on the remote NPU.
# Update this list as we fix the underlying issues.
DEFAULT_SKIP_CASES: >-
mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp
mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,
decode_attention_incore_0,decode_attention_incore_1,
decode_attention_incore_2,decode_attention_incore_3,
decode_attention_incore_4,decode_attention_incore_5,
decode_attention_incore_6,decode_attention_incore_7,
decode_attention_incore_8,decode_attention_incore_9,
decode_attention_incore_10,decode_attention_incore_11,
decode_attention_incore_12
steps:
- name: Resolve validation parameters
shell: bash
Expand Down
11 changes: 11 additions & 0 deletions test/samples/Qwen3Scope2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`.

Scope:
- compile-regression inputs for `ptoas`
- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS`

Notes:
- The source PyPTO program lowers to 13 kernel-level `.pto` files plus an orchestration C++ file.
- This sample directory vendors only the kernel `.pto` inputs.
- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files.
- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed.
25 changes: 25 additions & 0 deletions test/samples/Qwen3Scope2/decode_attention_incore_0.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Vector kernel for Ascend A5 ("a5" target). For each of 8 head groups it
// copies a 1x128 bf16 slice out of the projected-K tensor, widens it to f32,
// and writes it as one row of the 8x128 f32 k_group output.
//   %arg0: f32 output base (k_group, viewed as 8x128)
//   %arg1: bf16 input base (k_proj, viewed as 16x1024)
//   %arg2: row index into k_proj selecting which token/row to gather from
//          (assumed 0 <= %arg2 < 16 — TODO confirm against the caller)
module attributes {pto.target_arch = "a5"} {
func.func @decode_attention_incore_0(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// Vector-memory byte offsets for the two tile buffers below.
%c0i = arith.constant 0 : i64
%c256 = arith.constant 256 : i64
%c8 = arith.constant 8 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
// Row-major ND views over the raw pointers: output is 8x128 f32,
// input is 16x1024 bf16 (8 contiguous 128-wide groups per row).
%k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
// One iteration per output row ki in [0, 8).
scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
// Column offset of group ki within the 1024-wide k_proj row: ki * 128.
%1 = arith.muli %ki__idx_v0, %c128 : index
%t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// Load k_proj[%arg2, ki*128 : ki*128+128] into a 1x128 bf16 vector tile.
%k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// Widen bf16 -> f32 with ROUND rounding mode.
pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Store the converted row as k_group[ki, 0:128].
%k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
}
return
}
}
58 changes: 58 additions & 0 deletions test/samples/Qwen3Scope2/decode_attention_incore_1.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Vector kernel for Ascend A5. Applies a rotate-half style rotation to the
// 8x128 f32 K tensor using four 1x64 cos/sin row vectors, broadcast down the
// 8 rows via column-expand multiplies:
//   rot_lo = k_lo * cos_lo - k_hi * sin_lo   (written to columns [0, 64))
//   rot_hi = k_hi * cos_hi + k_lo * sin_hi   (written to columns [64, 128))
// where k_lo / k_hi are the left / right 8x64 halves of the input.
// NOTE(review): this looks like a rotary positional embedding — verify
// against the generating PyPTO program before relying on that reading.
//   %arg0..%arg1: cos_hi, cos_lo (1x64 f32)
//   %arg2: input k_group (8x128 f32)   %arg3: output k_rot_tensor (8x128 f32)
//   %arg4..%arg5: sin_hi, sin_lo (1x64 f32)
module attributes {pto.target_arch = "a5"} {
func.func @decode_attention_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// Hand-assigned vector-memory byte offsets. Several are deliberately
// reused once the earlier tile at that address is dead (see below).
%c0i = arith.constant 0 : i64
%c256 = arith.constant 256 : i64
%c512 = arith.constant 512 : i64
%c768 = arith.constant 768 : i64
%c1024 = arith.constant 1024 : i64
%c3072 = arith.constant 3072 : i64
%c5120 = arith.constant 5120 : i64
%c7168 = arith.constant 7168 : i64
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c8 = arith.constant 8 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
// ND views: four 1x64 trig rows, plus 8x128 input and output tensors.
%cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
// Load the four 1x64 cos/sin rows into consecutive vector tiles.
%cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// k_lo = input columns [0, 64); k_hi = input columns [64, 128).
%k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// t = k_lo * cos_lo (tcolexpandmul broadcasts the 1x64 row over 8 rows).
%t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %0 = k_hi * sin_lo.
%0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// rot_lo = t - %0; reuses address 5120 (t's buffer), which tsub permits
// here since the destination aliases the first input in-place.
%rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %1 = k_hi * cos_hi — overwrites k_hi's buffer (address 3072); k_hi is
// read one last time as an input of this very op.
%1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %2 = k_lo * sin_hi — similarly reuses k_lo's buffer (address 1024).
%2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// rot_hi = %1 + %2, in-place over %2's buffer (address 1024).
%rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Write rot_lo to output columns [0, 64) and rot_hi to [64, 128).
%k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tstore ins(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>)
%k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tstore ins(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>)
return
}
}
30 changes: 30 additions & 0 deletions test/samples/Qwen3Scope2/decode_attention_incore_10.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Cube (matrix) kernel for Ascend A5. Computes a 16x128 f32 partial
// attention output:  ret0 = exp_padded (16x64 bf16) @ V-slice (64x128 bf16),
// where the V slice is 64 consecutive rows of the 524288x128 v_cache
// starting at row %arg3.
//   %arg0: bf16 probabilities/weights base (exp_padded, 16x64)
//   %arg1: bf16 value-cache base (524288x128)
//   %arg2: f32 output base (ret0, 16x128)
//   %arg3: starting row into v_cache (assumed %arg3 + 64 <= 524288 —
//          TODO confirm against the caller)
module attributes {pto.target_arch = "a5"} {
func.func @decode_attention_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
// Byte offsets for the staging tiles in matrix memory; left/right/acc
// locations each get their own address space starting at 0.
%c0i = arith.constant 0 : i64
%c16384 = arith.constant 16384 : i64
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c524288 = arith.constant 524288 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
// ND views over the raw pointers (shapes documented in the header).
%exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
%v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
%ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
// Stage v_cache[%arg3 : %arg3+64, 0:128] into matrix memory.
%v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
// Stage the full 16x64 exp_padded operand into matrix memory.
%lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
// Move operands into the dedicated left/right matmul input locations
// (note the layout change on the right operand: row_major/col_major).
%lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
%v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
// 16x64 @ 64x128 -> 16x128 into the f32 accumulator, then write ret0.
%oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
%ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)
return
}
}
Loading
Loading