hw-native-sys · zhangqi-chen · Apr 10, 2026 · Apr 10, 2026
diff --git a/.claude/rules/coding-style.md b/.claude/rules/coding-style.md
@@ -32,7 +32,7 @@ PyPTO supports three function types:
 |------|---------|-------------|
 | `pl.FunctionType.InCore` | Runs on AICore. Manually load/store between GM and UB. | Explicit control over data movement and memory placement |
 | `pl.FunctionType.Orchestration` | Host/AICPU scheduling. Calls InCore kernels, manages tensor allocation. | Composing InCore kernels into a computation graph |
-| `pl.FunctionType.Opaque` | Compiler decides InCore/Orchestration boundary. Use with `pl.auto_incore()`. | When you don't need manual placement control |
+| `pl.FunctionType.Opaque` | Compiler decides InCore/Orchestration boundary. Use with `pl.at()`. | When you don't need manual placement control |
 
 ### Explicit InCore + Orchestration (pypto standard style)
 
@@ -70,7 +70,7 @@ class HelloWorldProgram:
 class SoftmaxProgram:
     @pl.function(type=pl.FunctionType.Opaque)
     def softmax(self, input_tensor: pl.Tensor[[B, S, H], pl.FP32], ...):
-        with pl.auto_incore():
+        with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
             for b in pl.parallel(0, B, 1, chunk=4):
                 ...
 ```

diff --git a/docs/para_for.md b/docs/para_for.md
@@ -151,7 +151,7 @@ When a **`with pl.incore`** scope **covers** nested chunked loops, each such chu
 **Example:** two chunked loops inside one incore scope.
 
 ```python
-with pl.incore():
+with pl.at(level=pl.Level.CORE_GROUP):
     for i in pl.range(0, 4096, chunk=1024):
         for j in pl.range(0, 2048, chunk=512):
             <body using i, j>
@@ -179,7 +179,7 @@ The compiler should **try to interchange** loop nesting so that:
 for c_1 in ...:           # chunk loop 1
   for c_2 in ...:         # chunk loop 2
     ...
-      with pl.incore():   # incore placed here (see §5.4)
+      with pl.at(level=pl.Level.CORE_GROUP):   # incore placed here (see §5.4)
         for i in ...:     # in_chunk loop 1
           for j in ...:   # in_chunk loop 2
             <body>
@@ -214,7 +214,7 @@ for c in pl.range(0, 4):
     t_end   = min(t_start + 1024, 4096)
     r_start = t_start * TILE_M   # or derived from t_start
     x_tile = pl.slice(x, [TILE_M, N], [r_start, 0])
-    with pl.incore():    # placed to encompass only the in_chunk loop + body
+    with pl.at(level=pl.Level.CORE_GROUP):    # placed to encompass only the in_chunk loop + body
         for t in pl.range(t_start, t_end):   # in_chunk loop inside incore
             # body: e.g. load(x_tile), softmax, store
             ...

diff --git a/docs/pto2_rt.md b/docs/pto2_rt.md
@@ -765,7 +765,7 @@ The `docs/pypto-frontend-coding-style.md` describes the Python-to-C++ code gener
 
 | Type | Description |
 |------|-------------|
-| **Opaque** | Default function type; may contain `pl.incore()` calls |
+| **Opaque** | Default function type; may contain `pl.at()` scopes |
 | **Orchestration** | Host/AICPU orchestration function; calls InCore functions |
 | **InCore** | AICore kernel subgraph (load/compute/store) |
 

diff --git a/docs/pypto-frontend-coding-style.md b/docs/pypto-frontend-coding-style.md
@@ -131,19 +131,19 @@ def BuildExampleGraph(
 
 ---
 
-## 5. InCore 作用域与匿名 InCore（with pl.incore()）
+## 5. InCore 作用域与匿名 InCore（with pl.at()）
 
 ### 5.1 语法
 
-在 **Opaque** 函数内用 `with pl.incore():` 标记一段“匿名” InCore 区域；解析后生成 `ScopeStmt(scope_type=InCore)`。
+在 **Opaque** 函数内用 `with pl.at(level=pl.Level.CORE_GROUP):` 标记一段“匿名” InCore 区域；解析后生成 `ScopeStmt(scope_type=InCore)`。
 
 ```python
 @pl.program
 class Before:
     @pl.function   # 默认 Opaque
     def main(self, x: pl.Tensor[[64], pl.FP32]) -> pl.Tensor[[64], pl.FP32]:
         y = x + 1
-        with pl.incore():
+        with pl.at(level=pl.Level.CORE_GROUP):
             tile = pl.load(y, [0], [64])
             tile_sq = pl.mul(tile, tile)
             result = pl.store(tile_sq, [0], [64], x)
@@ -163,7 +163,7 @@ class Before:
 
 | 类型 | 写法 | 用途 |
 |------|------|------|
-| Opaque | 默认 / `pl.FunctionType.Opaque` | 未指定，可含 `pl.incore()` 待 outline |
+| Opaque | 默认 / `pl.FunctionType.Opaque` | 未指定，可含 `pl.at()` 待 outline |
 | Orchestration | `pl.FunctionType.Orchestration` | Host/AICPU 编排，调用 InCore |
 | InCore | `pl.FunctionType.InCore` | AICore 上的子图（load/compute/store） |
 

diff --git a/examples/beginner/hello_world.py b/examples/beginner/hello_world.py
@@ -38,7 +38,7 @@ def add_one(
             x: pl.Tensor[[rows, cols], pl.FP32],
             y: pl.Out[pl.Tensor[[rows, cols], pl.FP32]],
         ) -> pl.Tensor[[rows, cols], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for r in pl.parallel(0, rows, 1, chunk=row_chunk):
                     tile_x = pl.slice(x, [1, cols], [r, 0])
                     tile_y = pl.add(tile_x, 1.0)

diff --git a/examples/beginner/matmul.py b/examples/beginner/matmul.py
@@ -48,7 +48,7 @@ def matmul(
             b: pl.Tensor[[k, n], pl.FP32],
             c: pl.Out[pl.Tensor[[m, n], pl.FP32]],
         ) -> pl.Tensor[[m, n], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for mb in pl.parallel(0, m, m_tile, chunk=m_chunk):
                     for nb in pl.parallel(0, n, n_tile, chunk=n_chunk):
                         tile_a = pl.slice(a, [m_tile, k], [mb, 0])

diff --git a/examples/intermediate/gemm.py b/examples/intermediate/gemm.py
@@ -53,7 +53,7 @@ def gemm(
             b: pl.Tensor[[k, n], pl.FP32],
             c: pl.Out[pl.Tensor[[m, n], pl.FP32]],
         ) -> pl.Tensor[[m, n], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for mb in pl.parallel(0, m, m_tile, chunk=m_chunk):
                     for nb in pl.parallel(0, n, n_tile, chunk=n_chunk):
                         # First K-tile: initialize accumulator via matmul

diff --git a/examples/intermediate/layer_norm.py b/examples/intermediate/layer_norm.py
@@ -44,7 +44,7 @@ def layer_norm(
             beta: pl.Tensor[[1, hidden], pl.FP32],
             y: pl.Out[pl.Tensor[[rows, hidden], pl.FP32]],
         ) -> pl.Tensor[[rows, hidden], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for r in pl.parallel(0, rows, row_chunk, chunk=1):
                     tile_x = pl.slice(x, [row_chunk, hidden], [r, 0])
                     gamma_tile = pl.slice(gamma, [1, hidden], [0, 0])

diff --git a/examples/intermediate/rms_norm.py b/examples/intermediate/rms_norm.py
@@ -50,7 +50,7 @@ def rms_norm(
             gamma: pl.Tensor[[1, hidden], pl.FP32],
             y: pl.Out[pl.Tensor[[rows, hidden], pl.FP32]],
         ) -> pl.Tensor[[rows, hidden], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for r in pl.parallel(0, rows, row_chunk, chunk=1):
                     # Pass 1: accumulate sum(x^2) across hidden chunks
                     # row_sum produces [row_chunk, 1] col_major; scalar ops

diff --git a/examples/intermediate/rope.py b/examples/intermediate/rope.py
@@ -59,7 +59,7 @@ def rope(
             sin: pl.Tensor[[1, head_dim], pl.FP32],
             y: pl.Out[pl.Tensor[[total_rows, head_dim], pl.FP32]],
         ) -> pl.Tensor[[total_rows, head_dim], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for b in pl.parallel(0, batch, 1, chunk=batch_chunk):
                     # Slice cos/sin lo/hi halves directly from tensor
                     # so each becomes a separate tile.load (no textract).

diff --git a/examples/intermediate/softmax.py b/examples/intermediate/softmax.py
@@ -38,7 +38,7 @@ def softmax(
             x: pl.Tensor[[rows, cols], pl.FP32],
             y: pl.Out[pl.Tensor[[rows, cols], pl.FP32]],
         ) -> pl.Tensor[[rows, cols], pl.FP32]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for r in pl.parallel(0, rows, row_chunk, chunk=1):
                     tile_x = pl.slice(x, [row_chunk, cols], [r, 0])
 

diff --git a/examples/models/deepseek_v3_2/deepseek_v3_2_decode_back.py b/examples/models/deepseek_v3_2/deepseek_v3_2_decode_back.py
@@ -76,7 +76,7 @@ def deepseek_v3_2_decode_back_layer(
             w_down: pl.Tensor[[INTER_CFG, HIDDEN_CFG], pl.BF16],
             out: pl.Tensor[[BATCH_CFG, HIDDEN_CFG], pl.BF16],
         ) -> pl.Tensor[[BATCH_CFG, HIDDEN_CFG], pl.BF16]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 node_id = pl.tensor.read(node_id_t, [0])
                 combined = pl.create_tensor([BATCH_CFG, ATTN_OUT_CFG], dtype=pl.FP32)
                 # Read combine results from this node view.

diff --git a/examples/models/deepseek_v3_2/deepseek_v3_2_decode_front.py b/examples/models/deepseek_v3_2/deepseek_v3_2_decode_front.py
@@ -156,7 +156,7 @@ def deepseek_v3_2_decode_front_layer(
             qr = pl.create_tensor([BATCH_CFG, Q_LORA_RANK_CFG], dtype=pl.BF16)
             q_proj = pl.create_tensor([BATCH_CFG, NUM_HEADS_CFG * QK_HEAD_DIM_CFG], dtype=pl.BF16)
             kv_a = pl.create_tensor([BATCH_CFG, KV_A_OUT], dtype=pl.BF16)
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
                 sq_sum = pl.mul(sq_sum, 0)
                 # Keep an explicit local Vec pad tensor alive in this scope so
@@ -233,7 +233,7 @@ def deepseek_v3_2_decode_front_layer(
             # - C: sparse attention consumes merged topk immediately
             # This avoids materializing topk intermediates across kernel boundaries.
             attn_front = pl.create_tensor([BATCH_CFG, ATTN_OUT_CFG], dtype=pl.FP32)
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 layer_id = pl.tensor.read(layer_id_t, [0])
                 for b in pl.parallel(0, BATCH_CFG, 1, chunk=4):
                     ctx_len = pl.tensor.read(seq_lens, [b])

diff --git a/examples/models/deepseek_v3_2/deepseek_v3_2_prefill_back.py b/examples/models/deepseek_v3_2/deepseek_v3_2_prefill_back.py
@@ -75,7 +75,7 @@ def deepseek_v3_2_prefill_back_layer(
             w_down: pl.Tensor[[INTER_CFG, HIDDEN_CFG], pl.BF16],
             out: pl.Tensor[[BATCH_CFG, MAX_SEQ_CFG, HIDDEN_CFG], pl.BF16],
         ) -> pl.Tensor[[BATCH_CFG, MAX_SEQ_CFG, HIDDEN_CFG], pl.BF16]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 node_id = pl.tensor.read(node_id_t, [0])
                 for b in pl.parallel(0, BATCH_CFG, 1, chunk=4):
                     seq_len_b = pl.tensor.read(seq_lens, [b])

diff --git a/examples/models/deepseek_v3_2/deepseek_v3_2_prefill_front.py b/examples/models/deepseek_v3_2/deepseek_v3_2_prefill_front.py
@@ -141,7 +141,7 @@ def deepseek_v3_2_prefill_front_layer(
             w_latent_to_v: pl.Tensor[[NUM_HEADS_CFG, KV_LORA_RANK_CFG, V_HEAD_DIM_CFG], pl.BF16],
             dispatch_buf: pl.Tensor[[EP_NODES_CFG, BATCH_CFG, MAX_SEQ_CFG, ATTN_OUT_CFG], pl.BF16],
         ) -> pl.Tensor[[EP_NODES_CFG, BATCH_CFG, MAX_SEQ_CFG, ATTN_OUT_CFG], pl.BF16]:
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 layer_id = pl.tensor.read(layer_id_t, [0])
 
                 for b in pl.parallel(0, BATCH_CFG, 1, chunk=4):

diff --git a/examples/models/kimi/kimi_k2_decode.py b/examples/models/kimi/kimi_k2_decode.py
@@ -137,7 +137,7 @@ def kimi_k2_decode_layer(
             # =========================================================================
             # Scope 1: Input RMSNorm + QKV Projection
             # =========================================================================
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
                 sq_sum = pl.mul(sq_sum, 0.0)
 
@@ -241,7 +241,7 @@ def kimi_k2_decode_layer(
                     )
 
                 # Flash Decoding Attention per head
-                with pl.auto_incore():
+                with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                     attn_row = pl.create_tensor([1, HIDDEN_CFG], dtype=pl.FP32)
                     attn_row = pl.mul(attn_row, 0.0)
 
@@ -325,7 +325,7 @@ def kimi_k2_decode_layer(
             # =========================================================================
             # Scope 3: Output Projection + Residual + Post RMSNorm + MoE
             # =========================================================================
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for b0 in pl.range(0, BATCH_CFG, BATCH_TILE):
                     # Output projection + residual
                     resid1_tile = pl.create_tensor([BATCH_TILE, HIDDEN_CFG], dtype=pl.FP32)

diff --git a/examples/models/milm/milm_decode.py b/examples/models/milm/milm_decode.py
@@ -128,7 +128,7 @@ def milm_decode_layer(
             # Scope 1: Input RMSNorm + QKV Projection
             # Optimized with chunked computation to reduce InCore pressure
             # =========================================================================
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 # Compute sum of squares for RMSNorm
                 sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
                 sq_sum = pl.mul(sq_sum, 0.0)
@@ -234,7 +234,7 @@ def milm_decode_layer(
                     )
 
                 # Flash Decoding Attention (per head with GQA)
-                with pl.auto_incore():
+                with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                     attn_row = pl.create_tensor([1, HIDDEN_CFG], dtype=pl.FP32)
                     attn_row = pl.mul(attn_row, 0.0)
 
@@ -318,7 +318,7 @@ def milm_decode_layer(
             # =========================================================================
             # Scope 3: Output Projection + Residual + Post RMSNorm + SwiGLU MLP
             # =========================================================================
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for b0 in pl.range(0, BATCH_CFG, BATCH_TILE):
                     # Output projection + residual (first residual connection)
                     resid1_tile = pl.create_tensor([BATCH_TILE, HIDDEN_CFG], dtype=pl.FP32)

diff --git a/examples/models/qwen3/qwen3-32b.py b/examples/models/qwen3/qwen3-32b.py
@@ -106,7 +106,7 @@ def qwen3_decode_layer(
             attn_out = pl.create_tensor([BATCH_CFG, HIDDEN_CFG], dtype=pl.FP32)
 
             # Scope 1: input RMSNorm + Q/K/V projection.
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 sq_sum = pl.create_tensor([BATCH_CFG, 1], dtype=pl.FP32)
                 sq_sum = pl.mul(sq_sum, 0.0)
 
@@ -197,7 +197,7 @@ def qwen3_decode_layer(
                         [cache_row, 0],
                     )
 
-                with pl.auto_incore():
+                with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                     attn_row = pl.create_tensor([1, HIDDEN_CFG], dtype=pl.FP32)
                     attn_row = pl.mul(attn_row, 0.0)
 
@@ -269,7 +269,7 @@ def qwen3_decode_layer(
                     attn_out = pl.assemble(attn_out, attn_row, [b, 0])
 
             # Scope 3: output projection + residual + post RMSNorm + MLP + residual.
-            with pl.auto_incore():
+            with pl.at(level=pl.Level.CORE_GROUP, optimization=pl.chunked_loop_optimizer):
                 for b0 in pl.range(0, BATCH_CFG, BATCH_TILE):
                     resid1_tile = pl.create_tensor([BATCH_TILE, HIDDEN_CFG], dtype=pl.FP32)