Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ on:
skip_cases:
description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)"
type: string
default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp"
default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,decode_attention_incore_0,decode_attention_incore_1,decode_attention_incore_2,decode_attention_incore_3,decode_attention_incore_4,decode_attention_incore_5,decode_attention_incore_6,decode_attention_incore_7,decode_attention_incore_8,decode_attention_incore_9,decode_attention_incore_10,decode_attention_incore_11,decode_attention_incore_12"
run_only_cases:
description: "Comma/space separated testcase names to run (empty = run all)"
type: string
Expand Down Expand Up @@ -261,7 +261,14 @@ jobs:
# Temporary CI gate: skip cases that still error/flap on the remote NPU.
# Update this list as we fix the underlying issues.
DEFAULT_SKIP_CASES: >-
mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp
mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,
decode_attention_incore_0,decode_attention_incore_1,
decode_attention_incore_2,decode_attention_incore_3,
decode_attention_incore_4,decode_attention_incore_5,
decode_attention_incore_6,decode_attention_incore_7,
decode_attention_incore_8,decode_attention_incore_9,
decode_attention_incore_10,decode_attention_incore_11,
decode_attention_incore_12
steps:
- name: Resolve validation parameters
shell: bash
Expand Down
11 changes: 11 additions & 0 deletions test/samples/Qwen3Scope2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Qwen3 scope2 PTO kernels generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode_scope2.py`.

Scope:
- compile-regression inputs for `ptoas`
- A5-only kernels; `runop.sh` injects `--pto-arch a5` for this directory unless the caller already overrides `PTOAS_FLAGS`

Notes:
- The source PyPTO program lowers to 13 kernel-level `.pto` files plus an orchestration C++ file.
- This sample directory vendors only the kernel `.pto` inputs.
- No custom `golden.py` or `compare.py` is included in this draft because those are tied to the full orchestration flow, not to individual kernel-only `.pto` files.
- The existing `test/npu_validation/scripts/generate_testcase.py` flow can still auto-generate generic validation assets for these kernels when needed.
25 changes: 25 additions & 0 deletions test/samples/Qwen3Scope2/decode_attention_incore_0.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Vector kernel for Ascend A5 ("a5" target). For each of 8 head groups it
// copies a 1x128 bf16 slice out of the projected-K tensor, widens it to f32,
// and writes it as one row of the 8x128 f32 k_group output.
//   %arg0: f32 output base (k_group, viewed as 8x128)
//   %arg1: bf16 input base (k_proj, viewed as 16x1024)
//   %arg2: row index into k_proj selecting which token/row to gather from
//          (assumed 0 <= %arg2 < 16 — TODO confirm against the caller)
module attributes {pto.target_arch = "a5"} {
func.func @decode_attention_incore_0(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<bf16>, %arg2: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// Vector-memory byte offsets for the two tile buffers below.
%c0i = arith.constant 0 : i64
%c256 = arith.constant 256 : i64
%c8 = arith.constant 8 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
// Row-major ND views over the raw pointers: output is 8x128 f32,
// input is 16x1024 bf16 (8 contiguous 128-wide groups per row).
%k_group__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%k_proj__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c16, %c1024], strides = [%c1024, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
// One iteration per output row ki in [0, 8).
scf.for %ki__idx_v0 = %c0 to %c8 step %c1 {
// Column offset of group ki within the 1024-wide k_proj row: ki * 128.
%1 = arith.muli %ki__idx_v0, %c128 : index
%t__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// Load k_proj[%arg2, ki*128 : ki*128+128] into a 1x128 bf16 vector tile.
%k_proj__ssa_v0_pview = pto.partition_view %k_proj__ssa_v0_view, offsets = [%arg2, %1], sizes = [%c1, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<1x128xbf16>
pto.tload ins(%k_proj__ssa_v0_pview : !pto.partition_tensor_view<1x128xbf16>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%0 = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// Widen bf16 -> f32 with ROUND rounding mode.
pto.tcvt ins(%t__tile{rmode = #pto<round_mode ROUND>} : !pto.tile_buf<loc=vec, dtype=bf16, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Store the converted row as k_group[ki, 0:128].
%k_group__iter_v1_pview = pto.partition_view %k_group__ssa_v0_view, offsets = [%ki__idx_v0, %c0], sizes = [%c1, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x128xf32>
pto.tstore ins(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=128, v_row=1, v_col=128, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_group__iter_v1_pview : !pto.partition_tensor_view<1x128xf32>)
}
return
}
}
58 changes: 58 additions & 0 deletions test/samples/Qwen3Scope2/decode_attention_incore_1.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Vector kernel for Ascend A5. Applies a rotate-half style rotation to the
// 8x128 f32 K tensor using four 1x64 cos/sin row vectors, broadcast down the
// 8 rows via column-expand multiplies:
//   rot_lo = k_lo * cos_lo - k_hi * sin_lo   (written to columns [0, 64))
//   rot_hi = k_hi * cos_hi + k_lo * sin_hi   (written to columns [64, 128))
// where k_lo / k_hi are the left / right 8x64 halves of the input.
// NOTE(review): this looks like a rotary positional embedding — verify
// against the generating PyPTO program before relying on that reading.
//   %arg0..%arg1: cos_hi, cos_lo (1x64 f32)
//   %arg2: input k_group (8x128 f32)   %arg3: output k_rot_tensor (8x128 f32)
//   %arg4..%arg5: sin_hi, sin_lo (1x64 f32)
module attributes {pto.target_arch = "a5"} {
func.func @decode_attention_incore_1(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: !pto.ptr<f32>, %arg5: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// Hand-assigned vector-memory byte offsets. Several are deliberately
// reused once the earlier tile at that address is dead (see below).
%c0i = arith.constant 0 : i64
%c256 = arith.constant 256 : i64
%c512 = arith.constant 512 : i64
%c768 = arith.constant 768 : i64
%c1024 = arith.constant 1024 : i64
%c3072 = arith.constant 3072 : i64
%c5120 = arith.constant 5120 : i64
%c7168 = arith.constant 7168 : i64
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c8 = arith.constant 8 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
// ND views: four 1x64 trig rows, plus 8x128 input and output tensors.
%cos_hi__ssa_v0_view = pto.make_tensor_view %arg0, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%cos_lo__ssa_v0_view = pto.make_tensor_view %arg1, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%k_group__rv_v2_view = pto.make_tensor_view %arg2, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%k_rot_tensor__ssa_v0_view = pto.make_tensor_view %arg3, shape = [%c8, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%sin_hi__ssa_v0_view = pto.make_tensor_view %arg4, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
%sin_lo__ssa_v0_view = pto.make_tensor_view %arg5, shape = [%c1, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
// Load the four 1x64 cos/sin rows into consecutive vector tiles.
%cos_hi__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%cos_hi__ssa_v0_pview = pto.partition_view %cos_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%cos_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%cos_lo__tile = pto.alloc_tile addr = %c256 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%cos_lo__ssa_v0_pview = pto.partition_view %cos_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%cos_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%sin_hi__tile = pto.alloc_tile addr = %c512 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%sin_hi__ssa_v0_pview = pto.partition_view %sin_hi__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%sin_hi__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%sin_lo__tile = pto.alloc_tile addr = %c768 : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%sin_lo__ssa_v0_pview = pto.partition_view %sin_lo__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c1, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x64xf32>
pto.tload ins(%sin_lo__ssa_v0_pview : !pto.partition_tensor_view<1x64xf32>) outs(%sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// k_lo = input columns [0, 64); k_hi = input columns [64, 128).
%k_lo__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%k_group__rv_v2_pview = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%k_group__rv_v2_pview : !pto.partition_tensor_view<8x64xf32>) outs(%k_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
%k_hi__tile = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%3 = pto.partition_view %k_group__rv_v2_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tload ins(%3 : !pto.partition_tensor_view<8x64xf32>) outs(%k_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// t = k_lo * cos_lo (tcolexpandmul broadcasts the 1x64 row over 8 rows).
%t__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_lo__tile, %cos_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%t__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %0 = k_hi * sin_lo.
%0 = pto.alloc_tile addr = %c7168 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_hi__tile, %sin_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// rot_lo = t - %0; reuses address 5120 (t's buffer), which tsub permits
// here since the destination aliases the first input in-place.
%rot_lo__tile = pto.alloc_tile addr = %c5120 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tsub ins(%t__tile, %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %1 = k_hi * cos_hi — overwrites k_hi's buffer (address 3072); k_hi is
// read one last time as an input of this very op.
%1 = pto.alloc_tile addr = %c3072 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_hi__tile, %cos_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%1 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// %2 = k_lo * sin_hi — similarly reuses k_lo's buffer (address 1024).
%2 = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tcolexpandmul ins(%k_lo__tile, %sin_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// rot_hi = %1 + %2, in-place over %2's buffer (address 1024).
%rot_hi__tile = pto.alloc_tile addr = %c1024 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tadd ins(%1, %2 : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Write rot_lo to output columns [0, 64) and rot_hi to [64, 128).
%k_rot_tensor__ssa_v0_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c0], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tstore ins(%rot_lo__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__ssa_v0_pview : !pto.partition_tensor_view<8x64xf32>)
%k_rot_tensor__tile_pview = pto.partition_view %k_rot_tensor__ssa_v0_view, offsets = [%c0, %c64], sizes = [%c8, %c64] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
pto.tstore ins(%rot_hi__tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=64, v_row=8, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%k_rot_tensor__tile_pview : !pto.partition_tensor_view<8x64xf32>)
return
}
}
30 changes: 30 additions & 0 deletions test/samples/Qwen3Scope2/decode_attention_incore_10.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Cube (matrix) kernel for Ascend A5. Computes a 16x128 f32 partial
// attention output:  ret0 = exp_padded (16x64 bf16) @ V-slice (64x128 bf16),
// where the V slice is 64 consecutive rows of the 524288x128 v_cache
// starting at row %arg3.
//   %arg0: bf16 probabilities/weights base (exp_padded, 16x64)
//   %arg1: bf16 value-cache base (524288x128)
//   %arg2: f32 output base (ret0, 16x128)
//   %arg3: starting row into v_cache (assumed %arg3 + 64 <= 524288 —
//          TODO confirm against the caller)
module attributes {pto.target_arch = "a5"} {
func.func @decode_attention_incore_10(%arg0: !pto.ptr<bf16>, %arg1: !pto.ptr<bf16>, %arg2: !pto.ptr<f32>, %arg3: index) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
// Byte offsets for the staging tiles in matrix memory; left/right/acc
// locations each get their own address space starting at 0.
%c0i = arith.constant 0 : i64
%c16384 = arith.constant 16384 : i64
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c524288 = arith.constant 524288 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
// ND views over the raw pointers (shapes documented in the header).
%exp_padded__ssa_v1_view = pto.make_tensor_view %arg0, shape = [%c16, %c64], strides = [%c64, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
%v_cache__rv_v4_view = pto.make_tensor_view %arg1, shape = [%c524288, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xbf16>
%ret0__out_view = pto.make_tensor_view %arg2, shape = [%c16, %c128], strides = [%c128, %c1] {layout = #pto.layout<nd>}: !pto.tensor_view<?x?xf32>
// Stage v_cache[%arg3 : %arg3+64, 0:128] into matrix memory.
%v_tile__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%v_cache__rv_v4_pview = pto.partition_view %v_cache__rv_v4_view, offsets = [%arg3, %c0], sizes = [%c64, %c128] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<64x128xbf16>
pto.tload ins(%v_cache__rv_v4_pview : !pto.partition_tensor_view<64x128xbf16>) outs(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
// Stage the full 16x64 exp_padded operand into matrix memory.
%lhs_mat = pto.alloc_tile addr = %c16384 : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%exp_padded__ssa_v1_pview = pto.partition_view %exp_padded__ssa_v1_view, offsets = [%c0, %c0], sizes = [%c16, %c64] : !pto.tensor_view<?x?xbf16> -> !pto.partition_tensor_view<16x64xbf16>
pto.tload ins(%exp_padded__ssa_v1_pview : !pto.partition_tensor_view<16x64xbf16>) outs(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
// Move operands into the dedicated left/right matmul input locations
// (note the layout change on the right operand: row_major/col_major).
%lhs_mat_Left = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
pto.tmov ins(%lhs_mat : !pto.tile_buf<loc=mat, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%lhs_mat_Left : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
%v_tile__tile_Right = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>
pto.tmov ins(%v_tile__tile : !pto.tile_buf<loc=mat, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%v_tile__tile_Right : !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
// 16x64 @ 64x128 -> 16x128 into the f32 accumulator, then write ret0.
%oi_tmp_pad__tile = pto.alloc_tile addr = %c0i : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
pto.tmatmul ins(%lhs_mat_Left, %v_tile__tile_Right : !pto.tile_buf<loc=left, dtype=bf16, rows=16, cols=64, v_row=16, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=bf16, rows=64, cols=128, v_row=64, v_col=128, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
%ret0__out_pview = pto.partition_view %ret0__out_view, offsets = [%c0, %c0], sizes = [%c16, %c128] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x128xf32>
pto.tstore ins(%oi_tmp_pad__tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=128, v_row=16, v_col=128, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) outs(%ret0__out_pview : !pto.partition_tensor_view<16x128xf32>)
return
}
}
Loading
Loading