hw-native-sys · ChaoWao · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py
@@ -0,0 +1,84 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""
+Golden test for SPMD starvation prevention.
+
+Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and
+verifies all 20 tasks complete with correct output.  The test validates that
+the drain mechanism prevents sync_start tasks from being starved.
+
+Layout:
+  Wave 1: 6 × normal(block_num=4)  -> CL 0..71
+  Sync 0: 1 × sync_start(block_num=6) -> CL 72..89
+  Wave 2: 6 × normal(block_num=4)  -> CL 90..161
+  Sync 1: 1 × sync_start(block_num=6) -> CL 162..179
+  Wave 3: 6 × normal(block_num=4)  -> CL 180..251
+
+Total: 252 CL = 4032 float32.
+
+Args layout: [output]
+"""
+
+import torch
+
+__outputs__ = ["output"]  # name of the tensor that compute_golden fills in
+RTOL = 0  # zero tolerances: presumably the harness compares results exactly -- TODO confirm
+ATOL = 0
+
+ALL_CASES = {
+    "Case1": {},  # the only case; it carries no parameters
+}
+
+DEFAULT_CASE = "Case1"
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (CL)
+SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
+NORMAL_BLOCK_NUM = 4  # block_num of each normal task
+SYNC_BLOCK_NUM = 6  # block_num of each sync_start task
+NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK  # 12 cache lines per normal task
+SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK  # 18 cache lines per sync_start task
+
+
+# Build flat task list as (block_num, base_cl)
+def _build_tasks():
+    """Return every task as (block_num, base_cl), in submission order.
+
+    Layout: three waves of six normal tasks with one sync_start task between
+    consecutive waves.  base_cl is the running sum of the cache lines taken
+    by all earlier tasks.
+    """
+    normal_wave = [(NORMAL_BLOCK_NUM, NORMAL_CL)] * 6
+    sync_task = [(SYNC_BLOCK_NUM, SYNC_CL)]
+    layout = normal_wave + sync_task + normal_wave + sync_task + normal_wave
+
+    tasks = []
+    next_cl = 0
+    for block_num, cl_count in layout:
+        tasks.append((block_num, next_cl))
+        next_cl += cl_count
+    return tasks
+
+
+TASKS = _build_tasks()  # (block_num, base_cl) per task, in submission order
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)  # 252 cache lines over all 20 tasks
+
+
+def generate_inputs(params: dict) -> list:
+    num_floats = TOTAL_CL * FLOATS_PER_CACHE_LINE  # flat buffer: every float of every cache line
+    return [("output", torch.zeros(num_floats, dtype=torch.float32))]
+
+
+def compute_golden(tensors: dict, params: dict) -> None:
+    """Write float(block_idx) into the first float of every cache line owned by a task."""
+    expected = torch.as_tensor(tensors["output"])
+    for block_num, base_cl in TASKS:
+        for line in range(block_num * SLOTS_PER_BLOCK):
+            # Each run of SLOTS_PER_BLOCK consecutive lines belongs to one block index.
+            expected[(base_cl + line) * FLOATS_PER_CACHE_LINE] = float(line // SLOTS_PER_BLOCK)
+    tensors["output"][:] = expected
diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py
@@ -0,0 +1,53 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""
+Kernel configuration for SPMD starvation-prevention test.
+
+Submits many normal MIX tasks interleaved with sync_start tasks to verify
+the drain mechanism prevents starvation under sustained load.
+Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
+"""
+
+from pathlib import Path
+
+_KERNELS_ROOT = Path(__file__).parent  # directory that contains this config file
+_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels"  # kernels of the sibling example
+
+ORCHESTRATION = {
+    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"),
+    "function_name": "aicpu_orchestration_entry",  # extern "C" entry point defined in that source
+}
+
+KERNELS = [
+    {
+        "func_id": 0,  # FUNC_SPMD_MIX_AIC in the orchestration source
+        "name": "SPMD_MIX_AIC",
+        "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"),
+        "core_type": "aic",
+    },
+    {
+        "func_id": 1,  # FUNC_SPMD_MIX_AIV0 in the orchestration source
+        "name": "SPMD_MIX_AIV0",
+        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
+        "core_type": "aiv",
+    },
+    {
+        "func_id": 2,  # FUNC_SPMD_MIX_AIV1 in the orchestration source
+        "name": "SPMD_MIX_AIV1",
+        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),  # same source file as SPMD_MIX_AIV0
+        "core_type": "aiv",
+    },
+]
+
+RUNTIME_CONFIG = {
+    "runtime": "tensormap_and_ringbuffer",
+    "aicpu_thread_num": 4,
+    "orch_thread_num": 1,  # the orchestration entry submits only from thread index 0
+    "block_dim": 24,
+}
diff --git a/...3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/...3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SPMD Starvation Prevention Orchestration
+ *
+ * Submits three waves of normal MIX tasks with one sync_start task between
+ * consecutive waves.  The drain mechanism must ensure the sync_start tasks
+ * are not indefinitely delayed by the surrounding load.
+ *
+ * Layout: 3 waves × 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6)
+ *
+ * Normal task: block_num=4, require_sync_start=false  → 4 blocks × 3 slots = 12 CL each
+ * Sync task:   block_num=6, require_sync_start=true   → 6 blocks × 3 slots = 18 CL each
+ *
+ * Total CL: 3×6×12 + 2×18 = 216 + 36 = 252
+ *
+ * Args layout: [output]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SPMD_MIX_AIC 0  // func_id of SPMD_MIX_AIC in kernel_config.py
+#define FUNC_SPMD_MIX_AIV0 1  // func_id of SPMD_MIX_AIV0 in kernel_config.py
+#define FUNC_SPMD_MIX_AIV1 2  // func_id of SPMD_MIX_AIV1 in kernel_config.py
+
+static constexpr int32_t SLOTS_PER_BLOCK = 3;  // AIC, AIV0, AIV1
+static constexpr int32_t NORMAL_BLOCK_NUM = 4;  // block_num of each normal task
+static constexpr int32_t SYNC_BLOCK_NUM = 6;  // block_num of each sync_start task
+static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK;  // 12 cache lines per normal task
+static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK;      // 18 cache lines per sync_start task
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 1,  // one argument: the output tensor (args layout is [output])
+    };
+}
+
+static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
+    MixedKernels kernels;  // AIC + AIV0 + AIV1 kernel ids that make up one MIX task
+    kernels.aic_kernel_id = FUNC_SPMD_MIX_AIC;
+    kernels.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0;
+    kernels.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1;
+    // Arguments, in order: the in/out tensor, then the task's base cache line as a scalar.
+    Arg task_args;
+    task_args.add_inout(out);
+    task_args.add_scalar(base_cl);
+    task_args.launch_spec.set_block_num(block_num);
+    task_args.launch_spec.set_require_sync_start(sync_start);
+    pto2_rt_submit_task(kernels, task_args);
+}
+
+__attribute__((visibility("default"))) void
+aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) {
+    (void)orch_thread_num;  // NOLINT(readability/casting)
+    if (orch_thread_index != 0) return;  // only thread index 0 submits tasks
+
+    Tensor ext_output = from_tensor_arg(orch_args.tensor(0));
+
+    // Base cache line of the next task; advanced by each task's CL count after submission.
+    int64_t next_cl = 0;
+
+    // Submit one wave of 6 normal (non-sync_start) MIX tasks.
+    auto submit_normal_wave = [&ext_output, &next_cl]() {
+        for (int task = 0; task < 6; ++task) {
+            submit_mix(ext_output, NORMAL_BLOCK_NUM, next_cl, false);
+            next_cl += NORMAL_CL;
+        }
+    };
+    // Submit a single sync_start MIX task.
+    auto submit_sync_task = [&ext_output, &next_cl]() {
+        submit_mix(ext_output, SYNC_BLOCK_NUM, next_cl, true);
+        next_cl += SYNC_CL;
+    };
+
+    submit_normal_wave();  // Wave 1
+    submit_sync_task();    // Sync-start task 0: must not be starved by wave 1 or wave 2
+    submit_normal_wave();  // Wave 2
+    submit_sync_task();    // Sync-start task 1: must not be starved by wave 2 or wave 3
+    submit_normal_wave();  // Wave 3
+
+    LOG_ALWAYS("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)");
+}
+
+}  // extern "C"
diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py
@@ -0,0 +1,66 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""
+Golden test for SPMD sync_start.
+
+Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies
+all blocks of every task write the correct float(block_idx) to their cache line.
+
+Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2):
+  T0: block_num=2,  sync_start=True  -> CL 0..5
+  T1: block_num=8,  sync_start=True  -> CL 6..29
+  T2: block_num=2,  sync_start=False -> CL 30..35  (baseline)
+  T3: block_num=12, sync_start=True  -> CL 36..71
+
+Output tensor: 72 cache lines = 1152 float32.
+
+Args layout: [output]
+"""
+
+import torch
+
+__outputs__ = ["output"]  # name of the tensor that compute_golden fills in
+RTOL = 0  # zero tolerances: presumably the harness compares results exactly -- TODO confirm
+ATOL = 0
+
+ALL_CASES = {
+    "Case1": {},  # the only case; it carries no parameters
+}
+
+DEFAULT_CASE = "Case1"
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (CL)
+SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
+
+# (block_num, base_cl) for each submitted task
+TASKS = [
+    (2, 0),  # T0: sync_start=True, CL 0..5
+    (8, 6),  # T1: sync_start=True, CL 6..29
+    (2, 30),  # T2: sync_start=False (baseline), CL 30..35
+    (12, 36),  # T3: sync_start=True, CL 36..71
+]
+
+TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 72 cache lines over all 4 tasks
+
+
+def generate_inputs(params: dict) -> list:
+    """Return the zero-filled float32 "output" tensor covering every float of every cache line."""
+    num_floats = TOTAL_CL * FLOATS_PER_CACHE_LINE
+    zeroed = torch.zeros(num_floats, dtype=torch.float32)
+    return [("output", zeroed)]
+
+
+def compute_golden(tensors: dict, params: dict) -> None:
+    """Write float(block_idx) into the first float of every cache line owned by a task."""
+    expected = torch.as_tensor(tensors["output"])
+    for block_num, base_cl in TASKS:
+        for line in range(block_num * SLOTS_PER_BLOCK):
+            # Each run of SLOTS_PER_BLOCK consecutive lines belongs to one block index.
+            expected[(base_cl + line) * FLOATS_PER_CACHE_LINE] = float(line // SLOTS_PER_BLOCK)
+    tensors["output"][:] = expected
diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py
@@ -0,0 +1,52 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""
+Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime).
+
+Submits MIX tasks with require_sync_start=true to verify atomic batch launch.
+Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
+"""
+
+from pathlib import Path
+
+_KERNELS_ROOT = Path(__file__).parent  # directory that contains this config file
+_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels"  # kernels of the sibling example
+
+ORCHESTRATION = {
+    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"),
+    "function_name": "aicpu_orchestration_entry",
+}
+
+KERNELS = [
+    {
+        "func_id": 0,  # NOTE(review): presumably the id the orchestration source uses for this kernel -- confirm
+        "name": "SPMD_MIX_AIC",
+        "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"),
+        "core_type": "aic",
+    },
+    {
+        "func_id": 1,
+        "name": "SPMD_MIX_AIV0",
+        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
+        "core_type": "aiv",
+    },
+    {
+        "func_id": 2,
+        "name": "SPMD_MIX_AIV1",
+        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),  # same source file as SPMD_MIX_AIV0
+        "core_type": "aiv",
+    },
+]
+
+RUNTIME_CONFIG = {
+    "runtime": "tensormap_and_ringbuffer",
+    "aicpu_thread_num": 4,
+    "orch_thread_num": 1,
+    "block_dim": 24,
+}