Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .claude/rules/coding-style.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ PyPTO supports three function types:
|------|---------|-------------|
| `pl.FunctionType.InCore` | Runs on AICore. Manually load/store between GM and UB. | Explicit control over data movement and memory placement |
| `pl.FunctionType.Orchestration` | Host/AICPU scheduling. Calls InCore kernels, manages tensor allocation. | Composing InCore kernels into a computation graph |
| `pl.FunctionType.Opaque` | Compiler decides InCore/Orchestration boundary. Use with `pl.auto_incore()`. | When you don't need manual placement control |
| `pl.FunctionType.Opaque` | Compiler decides InCore/Orchestration boundary. Use with `pl.at()`. | When you don't need manual placement control |

### Explicit InCore + Orchestration (pypto standard style)

Expand Down Expand Up @@ -70,7 +70,7 @@ class HelloWorldProgram:
class SoftmaxProgram:
@pl.function(type=pl.FunctionType.Opaque)
def softmax(self, input_tensor: pl.Tensor[[B, S, H], pl.FP32], ...):
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for b in pl.parallel(0, B, 1, chunk=4):
...
```
Expand Down
6 changes: 3 additions & 3 deletions docs/para_for.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ When a **`with pl.incore`** scope **covers** nested chunked loops, each such chu
**Example:** two chunked loops inside one incore scope.

```python
with pl.incore():
with pl.at(level=pl.Level.CORE_GROUP):
for i in pl.range(0, 4096, chunk=1024):
for j in pl.range(0, 2048, chunk=512):
<body using i, j>
Expand Down Expand Up @@ -179,7 +179,7 @@ The compiler should **try to interchange** loop nesting so that:
for c_1 in ...: # chunk loop 1
for c_2 in ...: # chunk loop 2
...
with pl.incore(): # incore placed here (see §5.4)
with pl.at(level=pl.Level.CORE_GROUP): # incore placed here (see §5.4)
for i in ...: # in_chunk loop 1
for j in ...: # in_chunk loop 2
<body>
Expand Down Expand Up @@ -214,7 +214,7 @@ for c in pl.range(0, 4):
t_end = min(t_start + 1024, 4096)
r_start = t_start * TILE_M # or derived from t_start
x_tile = pl.slice(x, [TILE_M, N], [r_start, 0])
with pl.incore(): # placed to encompass only the in_chunk loop + body
with pl.at(level=pl.Level.CORE_GROUP): # placed to encompass only the in_chunk loop + body
for t in pl.range(t_start, t_end): # in_chunk loop inside incore
# body: e.g. load(x_tile), softmax, store
...
Expand Down
2 changes: 1 addition & 1 deletion docs/pto2_rt.md
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ The `docs/pypto-frontend-coding-style.md` describes the Python-to-C++ code gener

| Type | Description |
|------|-------------|
| **Opaque** | Default function type; may contain `pl.incore()` calls |
| **Opaque** | Default function type; may contain `pl.at()` scopes |
| **Orchestration** | Host/AICPU orchestration function; calls InCore functions |
| **InCore** | AICore kernel subgraph (load/compute/store) |

Expand Down
8 changes: 4 additions & 4 deletions docs/pypto-frontend-coding-style.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,19 +131,19 @@ def BuildExampleGraph(

---

## 5. InCore 作用域与匿名 InCore(with pl.incore())
## 5. InCore 作用域与匿名 InCore(with pl.at())

### 5.1 语法

在 **Opaque** 函数内用 `with pl.incore():` 标记一段“匿名” InCore 区域;解析后生成 `ScopeStmt(scope_type=InCore)`。
在 **Opaque** 函数内用 `with pl.at(level=pl.Level.CORE_GROUP):` 标记一段“匿名” InCore 区域;解析后生成 `ScopeStmt(scope_type=InCore)`。

```python
@pl.program
class Before:
@pl.function # 默认 Opaque
def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]:
y = x + 1
with pl.incore():
with pl.at(level=pl.Level.CORE_GROUP):
tile = pl.load(y, [0], [64])
tile_sq = pl.mul(tile, tile)
result = pl.store(tile_sq, [0], [64], x)
Expand All @@ -163,7 +163,7 @@ class Before:

| 类型 | 写法 | 用途 |
|------|------|------|
| Opaque | 默认 / `pl.FunctionType.Opaque` | 未指定,可含 `pl.incore()` 待 outline |
| Opaque | 默认 / `pl.FunctionType.Opaque` | 未指定,可含 `pl.at()` 待 outline |
| Orchestration | `pl.FunctionType.Orchestration` | Host/AICPU 编排,调用 InCore |
| InCore | `pl.FunctionType.InCore` | AICore 上的子图(load/compute/store) |

Expand Down
2 changes: 1 addition & 1 deletion examples/beginner/hello_world.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def add_one(
x: pl.Tensor[[rows, cols], pl.FP32],
y: pl.Out[pl.Tensor[[rows, cols], pl.FP32]],
) -> pl.Tensor[[rows, cols], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for r in pl.parallel(0, rows, 1, chunk=row_chunk):
tile_x = pl.slice(x, [1, cols], [r, 0])
tile_y = pl.add(tile_x, 1.0)
Expand Down
2 changes: 1 addition & 1 deletion examples/beginner/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def matmul(
b: pl.Tensor[[k, n], pl.FP32],
c: pl.Out[pl.Tensor[[m, n], pl.FP32]],
) -> pl.Tensor[[m, n], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for mb in pl.parallel(0, m, m_tile, chunk=m_chunk):
for nb in pl.parallel(0, n, n_tile, chunk=n_chunk):
tile_a = pl.slice(a, [m_tile, k], [mb, 0])
Expand Down
2 changes: 1 addition & 1 deletion examples/intermediate/gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def gemm(
b: pl.Tensor[[k, n], pl.FP32],
c: pl.Out[pl.Tensor[[m, n], pl.FP32]],
) -> pl.Tensor[[m, n], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for mb in pl.parallel(0, m, m_tile, chunk=m_chunk):
for nb in pl.parallel(0, n, n_tile, chunk=n_chunk):
# First K-tile: initialize accumulator via matmul
Expand Down
2 changes: 1 addition & 1 deletion examples/intermediate/layer_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def layer_norm(
beta: pl.Tensor[[1, hidden], pl.FP32],
y: pl.Out[pl.Tensor[[rows, hidden], pl.FP32]],
) -> pl.Tensor[[rows, hidden], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for r in pl.parallel(0, rows, row_chunk, chunk=1):
tile_x = pl.slice(x, [row_chunk, hidden], [r, 0])
gamma_tile = pl.slice(gamma, [1, hidden], [0, 0])
Expand Down
2 changes: 1 addition & 1 deletion examples/intermediate/rms_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def rms_norm(
gamma: pl.Tensor[[1, hidden], pl.FP32],
y: pl.Out[pl.Tensor[[rows, hidden], pl.FP32]],
) -> pl.Tensor[[rows, hidden], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for r in pl.parallel(0, rows, row_chunk, chunk=1):
# Pass 1: accumulate sum(x^2) across hidden chunks
# row_sum produces [row_chunk, 1] col_major; scalar ops
Expand Down
2 changes: 1 addition & 1 deletion examples/intermediate/rope.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def rope(
sin: pl.Tensor[[1, head_dim], pl.FP32],
y: pl.Out[pl.Tensor[[total_rows, head_dim], pl.FP32]],
) -> pl.Tensor[[total_rows, head_dim], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for b in pl.parallel(0, batch, 1, chunk=batch_chunk):
# Slice cos/sin lo/hi halves directly from tensor
# so each becomes a separate tile.load (no textract).
Expand Down
2 changes: 1 addition & 1 deletion examples/intermediate/softmax.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def softmax(
x: pl.Tensor[[rows, cols], pl.FP32],
y: pl.Out[pl.Tensor[[rows, cols], pl.FP32]],
) -> pl.Tensor[[rows, cols], pl.FP32]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for r in pl.parallel(0, rows, row_chunk, chunk=1):
tile_x = pl.slice(x, [row_chunk, cols], [r, 0])

Expand Down
2 changes: 1 addition & 1 deletion examples/models/deepseek_v3_2/deepseek_v3_2_decode_back.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def deepseek_v3_2_decode_back_layer(
w_down: pl.Tensor[[INTER_CFG, HIDDEN_CFG], pl.BF16],
out: pl.Tensor[[BATCH_CFG, HIDDEN_CFG], pl.BF16],
) -> pl.Tensor[[BATCH_CFG, HIDDEN_CFG], pl.BF16]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
node_id = pl.tensor.read(node_id_t, [0])
combined = pl.create_tensor([BATCH_CFG, ATTN_OUT_CFG], dtype=pl.FP32)
# Read combine results from this node view.
Expand Down
4 changes: 2 additions & 2 deletions examples/models/deepseek_v3_2/deepseek_v3_2_decode_front.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def deepseek_v3_2_decode_front_layer(
qr = pl.create_tensor([BATCH_CFG, Q_LORA_RANK_CFG], dtype=pl.BF16)
q_proj = pl.create_tensor([BATCH_CFG, NUM_HEADS_CFG * QK_HEAD_DIM_CFG], dtype=pl.BF16)
kv_a = pl.create_tensor([BATCH_CFG, KV_A_OUT], dtype=pl.BF16)
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
sq_sum = pl.mul(sq_sum, 0)
# Keep an explicit local Vec pad tensor alive in this scope so
Expand Down Expand Up @@ -233,7 +233,7 @@ def deepseek_v3_2_decode_front_layer(
# - C: sparse attention consumes merged topk immediately
# This avoids materializing topk intermediates across kernel boundaries.
attn_front = pl.create_tensor([BATCH_CFG, ATTN_OUT_CFG], dtype=pl.FP32)
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
layer_id = pl.tensor.read(layer_id_t, [0])
for b in pl.parallel(0, BATCH_CFG, 1, chunk=4):
ctx_len = pl.tensor.read(seq_lens, [b])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def deepseek_v3_2_prefill_back_layer(
w_down: pl.Tensor[[INTER_CFG, HIDDEN_CFG], pl.BF16],
out: pl.Tensor[[BATCH_CFG, MAX_SEQ_CFG, HIDDEN_CFG], pl.BF16],
) -> pl.Tensor[[BATCH_CFG, MAX_SEQ_CFG, HIDDEN_CFG], pl.BF16]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
node_id = pl.tensor.read(node_id_t, [0])
for b in pl.parallel(0, BATCH_CFG, 1, chunk=4):
seq_len_b = pl.tensor.read(seq_lens, [b])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def deepseek_v3_2_prefill_front_layer(
w_latent_to_v: pl.Tensor[[NUM_HEADS_CFG, KV_LORA_RANK_CFG, V_HEAD_DIM_CFG], pl.BF16],
dispatch_buf: pl.Tensor[[EP_NODES_CFG, BATCH_CFG, MAX_SEQ_CFG, ATTN_OUT_CFG], pl.BF16],
) -> pl.Tensor[[EP_NODES_CFG, BATCH_CFG, MAX_SEQ_CFG, ATTN_OUT_CFG], pl.BF16]:
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
layer_id = pl.tensor.read(layer_id_t, [0])

for b in pl.parallel(0, BATCH_CFG, 1, chunk=4):
Expand Down
6 changes: 3 additions & 3 deletions examples/models/kimi/kimi_k2_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def kimi_k2_decode_layer(
# =========================================================================
# Scope 1: Input RMSNorm + QKV Projection
# =========================================================================
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
sq_sum = pl.mul(sq_sum, 0.0)

Expand Down Expand Up @@ -241,7 +241,7 @@ def kimi_k2_decode_layer(
)

# Flash Decoding Attention per head
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
attn_row = pl.create_tensor([1, HIDDEN_CFG], dtype=pl.FP32)
attn_row = pl.mul(attn_row, 0.0)

Expand Down Expand Up @@ -325,7 +325,7 @@ def kimi_k2_decode_layer(
# =========================================================================
# Scope 3: Output Projection + Residual + Post RMSNorm + MoE
# =========================================================================
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for b0 in pl.range(0, BATCH_CFG, BATCH_TILE):
# Output projection + residual
resid1_tile = pl.create_tensor([BATCH_TILE, HIDDEN_CFG], dtype=pl.FP32)
Expand Down
6 changes: 3 additions & 3 deletions examples/models/milm/milm_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def milm_decode_layer(
# Scope 1: Input RMSNorm + QKV Projection
# Optimized with chunked computation to reduce InCore pressure
# =========================================================================
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
# Compute sum of squares for RMSNorm
sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
sq_sum = pl.mul(sq_sum, 0.0)
Expand Down Expand Up @@ -234,7 +234,7 @@ def milm_decode_layer(
)

# Flash Decoding Attention (per head with GQA)
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
attn_row = pl.create_tensor([1, HIDDEN_CFG], dtype=pl.FP32)
attn_row = pl.mul(attn_row, 0.0)

Expand Down Expand Up @@ -318,7 +318,7 @@ def milm_decode_layer(
# =========================================================================
# Scope 3: Output Projection + Residual + Post RMSNorm + SwiGLU MLP
# =========================================================================
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for b0 in pl.range(0, BATCH_CFG, BATCH_TILE):
# Output projection + residual (first residual connection)
resid1_tile = pl.create_tensor([BATCH_TILE, HIDDEN_CFG], dtype=pl.FP32)
Expand Down
6 changes: 3 additions & 3 deletions examples/models/qwen3/qwen3-32b.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def qwen3_decode_layer(
attn_out = pl.create_tensor([BATCH_CFG, HIDDEN_CFG], dtype=pl.FP32)

# Scope 1: input RMSNorm + Q/K/V projection.
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
sq_sum = pl.mul(sq_sum, 0.0)

Expand Down Expand Up @@ -197,7 +197,7 @@ def qwen3_decode_layer(
[cache_row, 0],
)

with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
attn_row = pl.create_tensor([1, HIDDEN_CFG], dtype=pl.FP32)
attn_row = pl.mul(attn_row, 0.0)

Expand Down Expand Up @@ -269,7 +269,7 @@ def qwen3_decode_layer(
attn_out = pl.assemble(attn_out, attn_row, [b, 0])

# Scope 3: output projection + residual + post RMSNorm + MLP + residual.
with pl.auto_incore():
with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
for b0 in pl.range(0, BATCH_CFG, BATCH_TILE):
resid1_tile = pl.create_tensor([BATCH_TILE, HIDDEN_CFG], dtype=pl.FP32)

Expand Down
Loading
Loading