From ed801e31dbfa46dbcaacc50068ea84d0174ee134 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 3 Apr 2026 16:31:27 +0800 Subject: [PATCH 1/2] Add: require_sync_start for atomic SPMD block launch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a sync_start mechanism that forces all blocks of an SPMD task to be dispatched atomically before any can begin execution. Submission layer (pto_submit_types.h, pto_orchestrator.cpp/h): - Add LaunchSpec::require_sync_start and active_mask bit-3 flag - Add pto2_core_mask() / pto2_requires_sync_start() helpers - Validate block_num < total resources at submit time to prevent deadlock - Fix total_required_subtasks to use pto2_core_mask (strip flag bits) Scheduler drain protocol (aicpu_executor.cpp): - Three-phase drain: ack barrier → global resource check → exclusive dispatch - Elected thread verifies global idle resources before dispatching; if insufficient, all threads return to completion polling and retry - Non-elected threads spin-wait during dispatch, giving the elected thread exclusive CoreTracker access (no data race on core_states_) - Track active_sched_threads_ separately from thread_num_ so orchestrator threads that have not transitioned to scheduling do not block the ack barrier SPMD dispatch refactor: - Extract dispatch_block_to_cluster / dispatch_mix_block_to_cluster - AIV path uses count_idle_aiv_cores for accurate resource counting Test examples: spmd_sync_start, spmd_sync_start_aiv, spmd_sync_start_edge, spmd_sync_start_stress, spmd_starvation --- .../spmd_starvation/golden.py | 84 ++++ .../spmd_starvation/kernels/kernel_config.py | 53 +++ .../orchestration/spmd_starvation_orch.cpp | 100 +++++ .../spmd_sync_start/golden.py | 66 +++ .../spmd_sync_start/kernels/kernel_config.py | 52 +++ .../orchestration/spmd_sync_start_orch.cpp | 82 ++++ .../spmd_sync_start_aiv/golden.py | 62 +++ .../kernels/kernel_config.py | 41 ++ .../spmd_sync_start_aiv_orch.cpp | 76 ++++ 
.../spmd_sync_start_edge/golden.py | 66 +++ .../kernels/kernel_config.py | 53 +++ .../spmd_sync_start_edge_orch.cpp | 82 ++++ .../spmd_sync_start_stress/golden.py | 110 +++++ .../kernels/kernel_config.py | 62 +++ .../spmd_sync_start_stress_orch.cpp | 109 +++++ .../aicpu/aicpu_executor.cpp | 401 ++++++++++++++---- .../runtime/pto_orchestrator.cpp | 18 +- .../runtime/pto_orchestrator.h | 4 + .../runtime/pto_submit_types.h | 28 +- .../runtime/runtime.h | 2 +- 20 files changed, 1467 insertions(+), 84 deletions(-) create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py create mode 100644 
examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py new file mode 100644 index 000000000..f38002181 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py @@ -0,0 +1,84 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD starvation prevention. + +Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and +verifies all 20 tasks complete with correct output. The test validates that +the drain mechanism prevents sync_start tasks from being starved. + +Layout: + Wave 1: 6 × normal(block_num=4) -> CL 0..71 + Sync 0: 1 × sync_start(block_num=6) -> CL 72..89 + Wave 2: 6 × normal(block_num=4) -> CL 90..161 + Sync 1: 1 × sync_start(block_num=6) -> CL 162..179 + Wave 3: 6 × normal(block_num=4) -> CL 180..251 + +Total: 252 CL = 4032 float32. 
+ +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 +NORMAL_BLOCK_NUM = 4 +SYNC_BLOCK_NUM = 6 +NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK # 12 +SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK # 18 + + +# Build flat task list as (block_num, base_cl) +def _build_tasks(): + tasks = [] + cl = 0 + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS) # 252 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py new file mode 100644 index 000000000..b02e66d8e --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD starvation-prevention test. + +Submits many normal MIX tasks interleaved with sync_start tasks to verify +the drain mechanism prevents starvation under sustained load. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp new file mode 100644 index 000000000..dd4f0cc7d --- /dev/null +++ 
b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Starvation Prevention Orchestration + * + * Submits a large wave of normal MIX tasks followed by sync_start tasks, + * then another wave of normal tasks. The drain mechanism must ensure the + * sync_start tasks are not indefinitely delayed by the surrounding load. 
+ * + * Layout: 3 waves × 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6) + * + * Normal task: block_num=4, require_sync_start=false → 4 blocks × 3 slots = 12 CL each + * Sync task: block_num=6, require_sync_start=true → 6 blocks × 3 slots = 18 CL each + * + * Total CL: 3×6×12 + 2×18 = 216 + 36 = 252 + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +static constexpr int32_t SLOTS_PER_BLOCK = 3; // AIC, AIV0, AIV1 +static constexpr int32_t NORMAL_BLOCK_NUM = 4; +static constexpr int32_t SYNC_BLOCK_NUM = 6; +static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK; // 12 +static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK; // 18 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + // Wave 1: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); 
+ + // Sync-start task 0: must not be starved by wave 1 or wave 2 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 2: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + // Sync-start task 1: must not be starved by wave 2 or wave 3 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 3: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + LOG_ALWAYS("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py new file mode 100644 index 000000000..33acd1c1a --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start. + +Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies +all blocks of every task write the correct float(block_idx) to their cache line. 
+ +Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2): + T0: block_num=2, sync_start=True -> CL 0..5 + T1: block_num=8, sync_start=True -> CL 6..29 + T2: block_num=2, sync_start=False -> CL 30..35 (baseline) + T3: block_num=12, sync_start=True -> CL 36..71 + +Output tensor: 72 cache lines = 1152 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (2, 0), # T0: sync_start=True + (8, 6), # T1: sync_start=True + (2, 30), # T2: sync_start=False (baseline) + (12, 36), # T3: sync_start=True +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 72 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [ + ("output", output), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py new file mode 100644 index 000000000..95f706c9d --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime). + +Submits MIX tasks with require_sync_start=true to verify atomic batch launch. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp new file mode 100644 index 000000000..207d26139 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Orchestration + * + * Submits MIX tasks with require_sync_start=true to verify that the scheduler + * atomically launches all blocks before any can run. + * + * Tasks: + * T0: block_num=2, require_sync_start=true (basic sync launch) + * T1: block_num=8, require_sync_start=true (larger batch) + * T2: block_num=2, require_sync_start=false (normal, as baseline) + * T3: block_num=12, require_sync_start=true (cross-thread batch) + * + * Each block writes float(block_idx) to its allocated cache-line slot, + * identical to spmd_multiblock_mix so the same kernel binaries can be reused. 
+ * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 2 blocks, sync_start=true (6 CL) + submit_mix(ext_output, 2, 0, true); + // T1: 8 blocks, sync_start=true (24 CL) + submit_mix(ext_output, 8, 6, true); + // T2: 2 blocks, sync_start=false (6 CL, baseline) + submit_mix(ext_output, 2, 30, false); + // T3: 12 blocks, sync_start=true (36 CL) + submit_mix(ext_output, 12, 36, true); + + LOG_ALWAYS("[spmd_sync_start] Submitted 4 tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py new file mode 100644 index 000000000..3c60f1ac8 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start with AIV-only tasks. + +Submits 4 AIV tasks (3 with require_sync_start=true, 1 baseline) to exercise +the AIV-specific fast path (count_idle_aiv_cores) and drain slow path. + +Tasks: + T0: block_num=4, sync_start=True -> CL 0..3 (fast path) + T1: block_num=16, sync_start=True -> CL 4..19 (saturate one thread) + T2: block_num=4, sync_start=False -> CL 20..23 (baseline) + T3: block_num=24, sync_start=True -> CL 24..47 (cross-thread drain) + +Output tensor: 48 cache lines = 768 float32. 
+ +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (4, 0), # T0: sync_start=True, fast path + (16, 4), # T1: sync_start=True, saturate single thread + (4, 20), # T2: sync_start=False, baseline + (24, 24), # T3: sync_start=True, cross-thread drain +] + +TOTAL_CL = sum(block_num for block_num, _ in TASKS) # 48 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py new file mode 100644 index 000000000..77102a658 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py @@ -0,0 +1,41 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start AIV test (tensormap_and_ringbuffer Runtime). + +Submits AIV tasks with require_sync_start=true to verify atomic batch launch +and the AIV-specific fast path (count_idle_aiv_cores). +Reuses the same AIV kernel from spmd_multiblock_aiv. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_aiv_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp new file mode 100644 index 000000000..859329d99 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start AIV Orchestration + * + * Submits AIV-only tasks with require_sync_start=true to exercise: + * - AIV fast path: count_idle_aiv_cores() >= block_num (small block_num) + * - AIV drain path: block_num exceeds local AIV cores (cross-thread drain) + * + * Tasks: + * T0: block_num=4, require_sync_start=true (fast path) + * T1: block_num=16, require_sync_start=true (saturate one thread: 8 clusters × 2 AIV) + * T2: block_num=4, require_sync_start=false (baseline) + * T3: block_num=24, require_sync_start=true (cross-thread drain) + * + * Each block writes float(block_idx) at (base_cl + block_idx) × FLOATS_PER_CACHE_LINE, + * reusing the kernel from spmd_multiblock_aiv. + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_WRITE_AIV 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 4 blocks, sync_start=true (fast path: 4 <= idle AIV cores on 
one thread) + submit_aiv(ext_output, 4, 0, true); + // T1: 16 blocks, sync_start=true (saturate: 8 clusters × 2 AIV = 16 cores) + submit_aiv(ext_output, 16, 4, true); + // T2: 4 blocks, sync_start=false (baseline) + submit_aiv(ext_output, 4, 20, false); + // T3: 24 blocks, sync_start=true (cross-thread drain) + submit_aiv(ext_output, 24, 24, true); + + LOG_ALWAYS("[spmd_sync_start_aiv] Submitted 4 AIV tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py new file mode 100644 index 000000000..7d9b0b6ae --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start boundary conditions. + +Tests edge-case block_num values relative to per-thread cluster capacity (8 clusters +with 3 sched threads = 24 total clusters, 48 total AIV cores). 
+ +MIX tasks (SLOTS_PER_BLOCK=3): + T0: block_num=1, sync_start=True -> CL 0..2 (degenerate: always fast path) + T1: block_num=8, sync_start=True -> CL 3..26 (exactly one thread's capacity) + T2: block_num=9, sync_start=True -> CL 27..53 (one over: must enter drain) + T3: block_num=23, sync_start=True -> CL 54..122 (max valid: total_clusters - 1) + T4: block_num=1, sync_start=False -> CL 123..125 (baseline) + +Output tensor: 126 cache lines = 2016 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (1, 0), # T0: sync=True, degenerate + (8, 3), # T1: sync=True, exactly one thread's clusters + (9, 27), # T2: sync=True, one over → drain + (23, 54), # T3: sync=True, max valid (total_clusters - 1) + (1, 123), # T4: sync=False, baseline +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 126 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py new file mode 100644 index 000000000..29f119ea9 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start boundary test. + +Tests edge-case block_num values: 1 (degenerate), 8 (one thread capacity), +9 (just over), 23 (max valid = total_clusters - 1). +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_edge_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp 
b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp new file mode 100644 index 000000000..122d838b0 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Boundary Orchestration + * + * Tests edge-case block_num values relative to per-thread cluster capacity + * (8 clusters per sched thread, 24 total clusters). 
+ * + * Tasks: + * T0: block_num=1, sync_start=true (degenerate: always fast path) + * T1: block_num=8, sync_start=true (exactly one thread's capacity) + * T2: block_num=9, sync_start=true (one over: must enter drain) + * T3: block_num=23, sync_start=true (max valid: total_clusters - 1) + * T4: block_num=1, sync_start=false (baseline) + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: block_num=1, sync_start=true (degenerate: always fast path, 3 CL) + submit_mix(ext_output, 1, 0, true); + // T1: block_num=8, sync_start=true (exactly one thread's cluster capacity, 24 CL) + submit_mix(ext_output, 8, 3, true); + // T2: block_num=9, sync_start=true (one over single thread → must drain, 27 CL) + submit_mix(ext_output, 9, 27, true); + // T3: block_num=23, sync_start=true (max valid = total_clusters - 1, 69 CL) + 
submit_mix(ext_output, 23, 54, true); + // T4: block_num=1, sync_start=false (baseline, 3 CL) + submit_mix(ext_output, 1, 123, false); + + LOG_ALWAYS("[spmd_sync_start_edge] Submitted 5 tasks: block_num=1,8,9,23 (sync) + 1 (baseline)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py new file mode 100644 index 000000000..d84f3c270 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py @@ -0,0 +1,110 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start stress / CAS contention with mixed shapes. + +Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, ack +barrier, and state cleanup across drain cycles. All three resource shapes +(MIX, AIV, AIC) are exercised with both sync and non-sync modes. 
+ +Each round (9 tasks): + 4 × normal MIX (block_num=4, sync=false) → 4 × 4 × 3 = 48 CL + 2 × sync MIX (block_num=12, sync=true) → 2 × 12 × 3 = 72 CL + 2 × sync AIV (block_num=8, sync=true) → 2 × 8 × 1 = 16 CL + 1 × normal AIV (block_num=4, sync=false) → 1 × 4 × 1 = 4 CL + Round total: 140 CL + +6 rounds → 54 tasks (24 normal MIX + 12 sync MIX + 12 sync AIV + 6 normal AIV) +Grand total: 840 CL = 13440 float32 + +Stress coverage: + - 24 drain cycles (12 MIX + 12 AIV) → validates state cleanup + - 2 sync MIX + 2 sync AIV per round → CAS contention across shapes + - Normal tasks occupy clusters → forces drain slow path + - 54 tasks total → no task loss under sustained load + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +ROUNDS = 6 + +# shape constants: (slots_per_block, written_slots) +# MIX: kernel writes at base_cl + block_idx * 3 + {0,1,2}, 3 CL per block, all written +# AIV: kernel writes at base_cl + block_idx, 1 CL per block +SHAPE_MIX = "MIX" +SHAPE_AIV = "AIV" + +MIX_SLOTS = 3 +AIV_SLOTS = 1 + +NORMAL_MIX_BN = 4 +SYNC_MIX_BN = 12 +SYNC_AIV_BN = 8 +NORMAL_AIV_BN = 4 + + +def _build_tasks(): + """Returns list of (block_num, base_cl, shape_str).""" + tasks = [] + cl = 0 + for _ in range(ROUNDS): + # 4 × normal MIX + for _ in range(4): + tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX)) + cl += NORMAL_MIX_BN * MIX_SLOTS + # 2 × sync MIX + for _ in range(2): + tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX)) + cl += SYNC_MIX_BN * MIX_SLOTS + # 2 × sync AIV + for _ in range(2): + tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV)) + cl += SYNC_AIV_BN * AIV_SLOTS + # 1 × normal AIV + tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV)) + cl += NORMAL_AIV_BN * AIV_SLOTS + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS) # 840 + + +def generate_inputs(params: dict) -> list: 
+ output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl, shape in TASKS: + for block_idx in range(block_num): + if shape == SHAPE_MIX: + # MIX kernel writes float(block_idx) at all 3 slots + for slot in range(MIX_SLOTS): + cl = base_cl + block_idx * MIX_SLOTS + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + else: + # AIV kernel writes float(block_idx) at 1 slot + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py new file mode 100644 index 000000000..09c507863 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start stress test with mixed shapes. + +Submits 54 tasks (MIX + AIV) over 6 rounds to stress-test drain CAS contention, +ack barrier, and state cleanup between drain cycles. 
+Reuses AIC/AIV kernels from spmd_multiblock_mix and spmd_multiblock_aiv. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_stress_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1) + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + # func_id 3: standalone AIV kernel + { + "func_id": 3, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp new file mode 100644 index 000000000..04692f4fc --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Stress Orchestration (mixed shapes) + * + * Submits 6 rounds of mixed MIX + AIV tasks to stress-test: + * - Drain CAS contention (multiple sync_start tasks per round) + * - Ack barrier correctness (normal tasks occupy clusters during drain entry) + * - State cleanup between consecutive drain cycles + * + * Each round (9 tasks): + * 4 × normal MIX (block_num=4, sync=false) -> 4 × 4 × 3 = 48 CL + * 2 × sync MIX (block_num=12, sync=true) -> 2 × 12 × 3 = 72 CL + * 2 × sync AIV (block_num=8, sync=true) -> 2 × 8 × 1 = 16 CL + * 1 × normal AIV (block_num=4, sync=false) -> 1 × 4 × 1 = 4 CL + * Round total: 140 CL + * + * 6 rounds → 54 tasks total, 840 CL grand total. 
+ * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 +#define FUNC_SPMD_WRITE_AIV 3 + +static constexpr int32_t MIX_SLOTS = 3; +static constexpr int32_t NORMAL_MIX_BN = 4; +static constexpr int32_t SYNC_MIX_BN = 12; +static constexpr int32_t SYNC_AIV_BN = 8; +static constexpr int32_t NORMAL_AIV_BN = 4; +static constexpr int32_t ROUNDS = 6; + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 1}; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + for (int32_t r = 0; r < ROUNDS; r++) { + // 4 × normal MIX + for (int i = 0; i < 4; i++, cl += NORMAL_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, NORMAL_MIX_BN, cl, false); + + // 2 × sync MIX — CAS contention: second sync task may 
arrive while first is draining + for (int i = 0; i < 2; i++, cl += SYNC_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, SYNC_MIX_BN, cl, true); + + // 2 × sync AIV — cross-shape drain contention with the MIX drain above + for (int i = 0; i < 2; i++, cl += SYNC_AIV_BN) + submit_aiv(ext_output, SYNC_AIV_BN, cl, true); + + // 1 × normal AIV + submit_aiv(ext_output, NORMAL_AIV_BN, cl, false); + cl += NORMAL_AIV_BN; + } + + LOG_ALWAYS("[spmd_sync_start_stress] Submitted %d tasks over %d rounds", 9 * ROUNDS, ROUNDS); +} + +} // extern "C" diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 024846722..ec6e88488 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -248,6 +248,14 @@ class alignas(64) CoreTracker { return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value(); } + // Count total idle AIV cores (AIV0 + AIV1) across all clusters. + // Unlike get_valid_cluster_offset_states(AIV).count() which counts clusters with + // at least one idle AIV, this counts individual idle cores — a cluster with both + // AIV0 and AIV1 idle contributes 2, not 1. 
+ int32_t count_idle_aiv_cores() const { + return ((core_states_ >> 1) & aic_mask_).count() + ((core_states_ >> 2) & aic_mask_).count(); + } + // --- State mutation --- // Toggle bit at the given bit offset (running <-> idle) @@ -268,6 +276,8 @@ class alignas(64) CoreTracker { struct AicpuExecutor { int32_t orch_thread_num_; int32_t sched_thread_num_; + int32_t active_sched_threads_{0}; // Threads currently in dispatch loop (initially sched_thread_num_, becomes + // thread_num_ after orch→sched transition) bool orch_to_sched_{false}; // ===== Thread management state ===== @@ -297,6 +307,20 @@ struct AicpuExecutor { CoreTracker core_trackers_[MAX_AICPU_THREADS]; + // ===== sync_start drain coordination ===== + + // When sync_start_pending != 0, all scheduler threads skip Phase 2 dispatch + // (only process completions) until the drain worker finishes launching all blocks. + struct alignas(64) SyncStartDrainState { + std::atomic sync_start_pending{0}; // 0=normal; -1=initializing; >0=active (value=block_num) + std::atomic drain_worker_elected{0}; // 0=none; >0: elected thread's (thread_idx+1) + std::atomic drain_ack_mask{0}; // bit per thread; all-set = all threads finished dispatch + PTO2TaskSlotState *pending_task{nullptr}; // held task (not re-queued) + int32_t _pad[10]; + }; + static_assert(sizeof(SyncStartDrainState) == 64); + SyncStartDrainState drain_state_; + // ===== Task queue state (managed by scheduler ready queues) ===== // Task execution tracking @@ -540,17 +564,6 @@ struct AicpuExecutor { return count; } - /** - * Build per-core dispatch payload: copy tensor pointers and scalars into - * the per-core args[] array, then populate SPMD local context at the tail. - * - * Reads next_block_idx and block_num directly from the task descriptor - * to populate LocalContext. The caller is responsible for incrementing - * next_block_idx AFTER dispatch. 
- * - * GlobalContext (sub_block_id) is NOT written here — it is initialized once - * at runtime startup by init_global_context(). - */ void build_payload(PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot) { int32_t slot_idx = static_cast(subslot); uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]); @@ -564,7 +577,7 @@ struct AicpuExecutor { for (int32_t i = 0; i < payload.scalar_count; i++) { dispatch_payload.args[n++] = payload.scalars[i]; } - // Per-dispatch local context (read from slot state) + // Per-dispatch local context: read block_idx/block_num directly from slot_state. dispatch_payload.local_context.block_idx = slot_state.next_block_idx; dispatch_payload.local_context.block_num = slot_state.block_num; // Store context pointers at fixed suffix positions in args[] @@ -621,6 +634,242 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); core_exec_state.executing_reg_task_id = reg_task_id; } + + // Dispatch one SPMD block of a MIX task to the cluster at cluster_offset. + // Reads slot_state.next_block_idx as block_idx; caller increments it afterwards. 
+ void dispatch_mix_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state +#if PTO2_PROFILING + , + bool profiling_enabled +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + uint8_t core_mask = pto2_core_mask(slot_state.active_mask); + if (core_mask & PTO2_SUBTASK_MASK_AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV0) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv0_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV1) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv1_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + } + + // ===== sync_start drain helpers ===== + + // Take ownership of slot_state and signal all threads to enter drain mode. + // Returns true if this thread won the CAS and owns the drain slot. + // Returns false if another thread already holds drain; caller must re-push slot_state. + // + // Two-phase protocol: CAS 0 → -1 (sentinel) to claim ownership, store task and + // reset election flag, then release-store block_num. Other threads acquire-load + // sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible. + bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) { + int32_t expected = 0; + if (!drain_state_.sync_start_pending.compare_exchange_strong( + expected, -1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + return false; // Another thread already holds the drain slot. + } + // We own the drain slot. Store the task and reset election flag before making it visible. 
+ drain_state_.pending_task = slot_state; + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed); + // Release store: all stores above are now visible to any thread that + // acquire-loads sync_start_pending and sees block_num > 0. + drain_state_.sync_start_pending.store(block_num, std::memory_order_release); + return true; + } + + // Dispatch one SPMD block to the cluster at cluster_offset, routing to the correct core(s) + // based on shape. For AIV, picks whichever AIV core in the cluster is currently idle. + // Caller is responsible for incrementing slot_state.next_block_idx after this returns. + void dispatch_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, + PTO2ResourceShape shape +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + if (shape == PTO2ResourceShape::MIX) { + dispatch_mix_block_to_cluster( + runtime, thread_idx, cluster_offset, slot_state +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else if (shape == PTO2ResourceShape::AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else { // AIV + auto core_offset = tracker.is_aiv0_core_idle(cluster_offset) ? + tracker.get_aiv0_core_offset(cluster_offset) : + tracker.get_aiv1_core_offset(cluster_offset); + dispatch_subtask_to_core( + runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } +#if PTO2_PROFILING + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); +#endif + } + + // Count total available resources across all scheduler threads for a given shape. 
+ int32_t count_global_available(PTO2ResourceShape shape) { + int32_t total = 0; + for (int32_t t = 0; t < active_sched_threads_; t++) { + if (shape == PTO2ResourceShape::AIV) { + total += core_trackers_[t].count_idle_aiv_cores(); + } else { + total += core_trackers_[t].get_valid_cluster_offset_states(shape).count(); + } + } + return total; + } + + // Drain worker: dispatch all blocks in one pass across all threads' trackers. + // Called only when global resources >= block_num, so one pass always suffices. + // All other threads are spinning — the drain worker has exclusive tracker access. + void drain_worker_dispatch( + Runtime *runtime, int32_t block_num +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + PTO2TaskSlotState *slot_state = drain_state_.pending_task; + if (!slot_state) { + drain_state_.sync_start_pending.store(0, std::memory_order_release); + return; + } + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); + + for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) { + auto valid = core_trackers_[t].get_valid_cluster_offset_states(shape); + while (valid.has_value() && slot_state->next_block_idx < block_num) { + dispatch_block_to_cluster( + runtime, t, valid.pop_first(), *slot_state, shape +#if PTO2_PROFILING + , + profiling_enabled, phase_dispatch_count +#endif + ); + slot_state->next_block_idx++; + if (slot_state->next_block_idx < block_num) + valid = core_trackers_[t].get_valid_cluster_offset_states(shape); + } + } + + // All blocks dispatched — clear drain state. + // Release fence ensures tracker mutations are visible to threads that + // acquire-load sync_start_pending == 0 and resume normal operation. 
+ std::atomic_thread_fence(std::memory_order_release); + drain_state_.pending_task = nullptr; + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed); + drain_state_.sync_start_pending.store(0, std::memory_order_release); + } + + // Called by each scheduler thread when drain_state_.sync_start_pending != 0. + // + // Three-phase protocol: + // 1. Ack barrier: all threads signal they've stopped Phase 2 dispatch. + // If not all acked yet, return to Phase 1 (completion polling). + // 2. Resource check: elected thread verifies global idle resources >= block_num. + // If insufficient, reset election state and return — all threads resume + // Phase 1 to free running cores, then retry next iteration. + // 3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed). + // Non-elected threads spin-wait until sync_start_pending == 0. + // During dispatch the elected thread has exclusive tracker access. + void handle_drain_mode( + Runtime *runtime, int32_t thread_idx +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + // Spin until drain is fully initialized (sentinel -1 → block_num > 0). + int32_t block_num; + do { + block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire); + } while (block_num < 0); + if (block_num == 0) return; + + // Phase 1: Ack barrier — signal this thread has stopped Phase 2 dispatch. + uint32_t all_acked = (1u << active_sched_threads_) - 1; + drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release); + + // If not all threads have acked, return to do Phase 1 (completion polling). + if ((drain_state_.drain_ack_mask.load(std::memory_order_acquire) & all_acked) != all_acked) return; + + // Phase 2: Election — exactly one thread wins the CAS. 
+ int32_t expected = 0; + drain_state_.drain_worker_elected.compare_exchange_strong( + expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed + ); + + if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) { + // Non-elected: spin-wait for drain completion or resource-insufficient reset. + while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return; + SPIN_WAIT_HINT(); + } + return; + } + + // Elected: check if global resources are sufficient. + PTO2TaskSlotState *slot_state = drain_state_.pending_task; + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); + int32_t available = count_global_available(shape); + + if (available < block_num) { + // Insufficient resources — reset election, let all threads do Phase 1. + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + return; + } + + // Phase 3: Dispatch — all other threads are spinning, exclusive tracker access. + drain_worker_dispatch( + runtime, block_num +#if PTO2_PROFILING + , + profiling_enabled, phase_dispatch_count +#endif + ); + } }; static AicpuExecutor g_aicpu_executor; @@ -752,6 +1001,7 @@ bool AicpuExecutor::assign_cores_to_threads() { // Mark orchestrator threads explicitly (no cores). 
for (int32_t t = divisor; t < thread_num_; t++) { + core_trackers_[t].init(0); DEV_INFO("Thread %d: orchestrator (0 cores)", t); } @@ -776,11 +1026,12 @@ bool AicpuExecutor::assign_cores_to_threads() { DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid); } - for (int32_t t = 0; t < divisor; t++) { + for (int32_t t = 0; t < thread_num_; t++) { core_count_per_thread_[t] = core_idx[t]; DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], core_trackers_[t].get_cluster_count()); } + active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : thread_num_; return true; } @@ -853,6 +1104,7 @@ void AicpuExecutor::reassign_cores_for_all_threads() { core_trackers_[t].get_cluster_count(), aic_running, aiv_running ); } + active_sched_threads_ = thread_num_; } int32_t AicpuExecutor::init(Runtime *runtime) { @@ -1215,8 +1467,25 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif bool try_pushed = false; + + // Phase 2 drain check: if a sync_start task is waiting for resources, + // pause normal dispatch and let the drain protocol run. + // The acquire load pairs with the release store in enter_drain_mode, so the + // pending task and the ack/election resets are visible before handle_drain_mode runs.
+ if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + handle_drain_mode( + runtime, thread_idx +#if PTO2_PROFILING + , + profiling_enabled, phase_dispatch_count +#endif + ); + continue; + } + const PTO2ResourceShape *dispatch_order = get_dispatch_order(thread_idx); - for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { + bool entered_drain = false; + for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES && !entered_drain; si++) { PTO2ResourceShape shape = dispatch_order[si]; auto valid_cluster_states = tracker.get_valid_cluster_offset_states(shape); if (!valid_cluster_states.has_value()) { @@ -1224,7 +1493,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } auto &local_buf = local_bufs[static_cast(shape)]; - while (valid_cluster_states.has_value()) { + while (valid_cluster_states.has_value() && !entered_drain) { int want = valid_cluster_states.count(); PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS]; int got = pop_ready_tasks_batch( @@ -1242,75 +1511,47 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #if PTO2_SCHED_PROFILING uint64_t t_setup_start = get_sys_cnt_aicpu(); #endif + // sync_start: all blocks must dispatch atomically. + // Fast path — enough local slots: fall through to normal dispatch loop below. + // Slow path — not enough: enter drain mode, then re-push all remaining + // tasks in the batch so nothing is lost. + // For AIV, one cluster can serve 2 blocks (AIV0 + AIV1), so compare against + // idle AIV core count rather than cluster count. + if (pto2_requires_sync_start(slot_state->active_mask)) { + int32_t available = (shape == PTO2ResourceShape::AIV) ? tracker.count_idle_aiv_cores() : + valid_cluster_states.count(); + if (available < slot_state->block_num) { + if (!enter_drain_mode(slot_state, slot_state->block_num)) { + // CAS lost: drain already active for another task; re-push and wait. 
+ rt->scheduler.ready_queues[static_cast(shape)].push(slot_state); + } + // Re-push all unprocessed tasks remaining in this batch. + for (int rem = bi + 1; rem < got; rem++) { + rt->scheduler.ready_queues[static_cast(shape)].push(batch[rem]); + } + entered_drain = true; + break; + } + // Fast path: enough local resources, fall through to normal dispatch. + } + // Dispatch as many blocks as possible for this task using available clusters. // For block_num=1 the inner body executes exactly once (no overhead). do { auto current_valid_cluster_offset = valid_cluster_states.pop_first(); - if (shape == PTO2ResourceShape::MIX) { - // Full-cluster: all active subtasks share the same block_idx. - uint8_t mask = slot_state->active_mask; - if (mask & PTO2_SUBTASK_MASK_AIC) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aic_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - } - if (mask & PTO2_SUBTASK_MASK_AIV0) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aiv0_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIV0 + dispatch_block_to_cluster( + runtime, thread_idx, current_valid_cluster_offset, *slot_state, shape #if PTO2_PROFILING - , - profiling_enabled + , + profiling_enabled, phase_dispatch_count #endif - ); - } - if (mask & PTO2_SUBTASK_MASK_AIV1) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aiv1_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIV1 -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - } - slot_state->next_block_idx++; - } else if (shape == PTO2ResourceShape::AIC) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aic_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - slot_state->next_block_idx++; - } else { // shape == PTO2ResourceShape::AIV - auto 
core_offset = tracker.is_aiv0_core_idle(current_valid_cluster_offset) ? - tracker.get_aiv0_core_offset(current_valid_cluster_offset) : - tracker.get_aiv1_core_offset(current_valid_cluster_offset); - dispatch_subtask_to_core( - runtime, thread_idx, core_offset, *slot_state, PTO2SubtaskSlot::AIV0 -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - slot_state->next_block_idx++; - // Refresh idle state so the do-while naturally picks up - // the other AIV in the same cluster on the next iteration. - if (slot_state->next_block_idx < slot_state->block_num) { - valid_cluster_states = tracker.get_valid_cluster_offset_states(shape); - } + ); + slot_state->next_block_idx++; + // For AIV, refresh cluster states so the do-while can pick up the + // other AIV core in the same cluster on the next iteration. + if (shape == PTO2ResourceShape::AIV && slot_state->next_block_idx < slot_state->block_num) { + valid_cluster_states = tracker.get_valid_cluster_offset_states(shape); } -#if PTO2_PROFILING - phase_dispatch_count += __builtin_popcount(slot_state->active_mask); -#endif DEV_DEBUG( "Thread %d: Dispatched %s task %" PRId64 " block %d/%d to cluster_offset %d", thread_idx, shape_name(shape), static_cast(slot_state->task->task_id.raw), @@ -1859,6 +2100,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #endif + // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). + for (int i = 0; i < orch_thread_num_; i++) { + rt->orchestrators[i].total_cluster_count = aic_count_; + rt->orchestrators[i].total_aiv_count = aiv_count_; + } + // With multi-ring, slot_states are per-ring inside the scheduler. 
runtime->set_pto2_slot_states_ptr(nullptr); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index c467fb667..ccbb6b40b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -346,6 +346,21 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke active_mask = pto2_mixed_kernels_to_active_mask(normalized); } + // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1) + if (block_num > 1 && args.launch_spec.require_sync_start()) { + // Deadlock check: block_num >= total available slots of the required type. + // For MIX/AIC: limit is total_cluster_count (one AIC per cluster). + // For AIV: limit is total_aiv_count. + PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); + int32_t limit = (shape == PTO2ResourceShape::AIV) ? 
orch->total_aiv_count : orch->total_cluster_count; + if (limit > 0 && block_num > limit) { + LOG_ERROR("FATAL: require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit); + orch->fatal = true; + return TaskOutputTensors{}; + } + active_mask |= PTO2_SUBTASK_FLAG_SYNC_START; + } + // Submission without an open scope is illegal always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); @@ -583,7 +598,8 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke cur_slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); cur_slot_state.fanout_refcount.store(0, std::memory_order_relaxed); cur_slot_state.completed_subtasks.store(0, std::memory_order_relaxed); - cur_slot_state.total_required_subtasks = static_cast(block_num * __builtin_popcount(active_mask)); + cur_slot_state.total_required_subtasks = + static_cast(block_num * __builtin_popcount(pto2_core_mask(active_mask))); cur_slot_state.block_num = block_num; cur_slot_state.next_block_idx = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index a40a6f7ce..0d9d94276 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -70,6 +70,10 @@ struct PTO2OrchestratorState { // Note: In simulated mode, orchestrator and scheduler share address space // In real mode, they communicate via shared memory only PTO2SchedulerState *scheduler; // For simulated mode only + + // Total core counts set once at executor init; used for submit-time deadlock detection. + int32_t total_cluster_count{0}; // AIC cores = MIX clusters + int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING // Runtime profiling switch copied from Runtime::enable_profiling. 
bool enable_profiling; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h index 90e0397ad..9901f12ba 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h @@ -39,9 +39,10 @@ enum class PTO2SubtaskSlot : uint8_t { /** * Subtask mask bits (for active_mask / subtask_done_mask) */ -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 +inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3); // 0x8: all blocks must launch atomically /** * Test whether a subtask slot is active in a given mask @@ -50,6 +51,18 @@ static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { return (mask & (1u << static_cast(slot))) != 0; } +/** + * Extract only the core bits from active_mask (strips flag bits). + */ +static inline uint8_t pto2_core_mask(uint8_t active_mask) { return active_mask & 0x07u; } + +/** + * Check whether a task requires all blocks to be launched atomically. + */ +static inline bool pto2_requires_sync_start(uint8_t active_mask) { + return (active_mask & PTO2_SUBTASK_FLAG_SYNC_START) != 0; +} + /** * Mixed-task submit contract. * @@ -83,9 +96,10 @@ inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3; * Caller must ensure active_mask is valid (at least one bit set). 
*/ static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { - int bit_count = __builtin_popcount(active_mask); + uint8_t core_mask = pto2_core_mask(active_mask); + int bit_count = __builtin_popcount(core_mask); if (bit_count >= 2) return PTO2ResourceShape::MIX; - if (active_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC; + if (core_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC; return PTO2ResourceShape::AIV; } @@ -114,6 +128,10 @@ class PTO2LaunchSpec { int16_t block_num() const { return block_num_; } void set_block_num(int16_t n) { block_num_ = n; } + bool require_sync_start() const { return require_sync_start_; } + void set_require_sync_start(bool v) { require_sync_start_ = v; } + private: int16_t block_num_{1}; + bool require_sync_start_{false}; }; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 411cef710..6af392522 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -48,7 +48,7 @@ #define RUNTIME_MAX_WORKER 72 // 24 AIC + 48 AIV cores #define RUNTIME_MAX_TENSOR_PAIRS 64 #define RUNTIME_MAX_FUNC_ID 32 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 1MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO // Default ready queue shards: one shard per worker thread (total minus orchestrator) constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; From e60616d9a7bd5c4e5fe6cbe25fba66e499e22ed6 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 3 Apr 2026 21:51:34 +0800 Subject: [PATCH 2/2] Add: require_sync_start for a5 SPMD (port from a2a3) Port the complete require_sync_start / drain mode implementation from a2a3 to a5 tensormap_and_ringbuffer runtime: - pto_submit_types.h: add PTO2_SUBTASK_FLAG_SYNC_START, pto2_core_mask, 
pto2_requires_sync_start; fix pto2_active_mask_to_shape to strip flag bits; extend PTO2LaunchSpec with require_sync_start - pto_orchestrator: add total_cluster_count/total_aiv_count for deadlock detection; encode sync_start flag in active_mask at submit time; fix total_required_subtasks popcount to use pto2_core_mask - aicpu_executor: add SyncStartDrainState, active_sched_threads, count_idle_aiv_cores, three-phase drain protocol (ack barrier, global resource check, exclusive dispatch); modify scheduler main loop with drain check and sync_start fast/slow path branching - Add 5 test examples: spmd_sync_start, spmd_sync_start_aiv, spmd_sync_start_edge, spmd_sync_start_stress, spmd_starvation --- .../spmd_starvation/golden.py | 84 +++++ .../spmd_starvation/kernels/kernel_config.py | 53 +++ .../orchestration/spmd_starvation_orch.cpp | 100 ++++++ .../spmd_sync_start/golden.py | 66 ++++ .../spmd_sync_start/kernels/kernel_config.py | 52 +++ .../orchestration/spmd_sync_start_orch.cpp | 82 +++++ .../spmd_sync_start_aiv/golden.py | 62 ++++ .../kernels/kernel_config.py | 41 +++ .../spmd_sync_start_aiv_orch.cpp | 76 +++++ .../spmd_sync_start_edge/golden.py | 66 ++++ .../kernels/kernel_config.py | 52 +++ .../spmd_sync_start_edge_orch.cpp | 82 +++++ .../spmd_sync_start_stress/golden.py | 104 ++++++ .../kernels/kernel_config.py | 62 ++++ .../spmd_sync_start_stress_orch.cpp | 109 ++++++ .../aicpu/aicpu_executor.cpp | 314 +++++++++++++++++- .../runtime/pto_orchestrator.cpp | 18 +- .../runtime/pto_orchestrator.h | 4 + .../runtime/pto_submit_types.h | 28 +- 19 files changed, 1445 insertions(+), 10 deletions(-) create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py 
create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py new file mode 100644 index 000000000..2e85b0fb6 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py @@ -0,0 +1,84 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD starvation prevention. + +Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and +verifies all 20 tasks complete with correct output. The test validates that +the drain mechanism prevents sync_start tasks from being starved. + +Layout: + Wave 1: 6 x normal(block_num=4) -> CL 0..71 + Sync 0: 1 x sync_start(block_num=6) -> CL 72..89 + Wave 2: 6 x normal(block_num=4) -> CL 90..161 + Sync 1: 1 x sync_start(block_num=6) -> CL 162..179 + Wave 3: 6 x normal(block_num=4) -> CL 180..251 + +Total: 252 CL = 4032 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 +NORMAL_BLOCK_NUM = 4 +SYNC_BLOCK_NUM = 6 +NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK # 12 +SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK # 18 + + +# Build flat task list as (block_num, base_cl) +def _build_tasks(): + tasks = [] + cl = 0 + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS) # 252 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in 
range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py new file mode 100644 index 000000000..a613c65ca --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD starvation prevention test (tensormap_and_ringbuffer Runtime). + +Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks to verify +the drain mechanism prevents sync_start tasks from being starved. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp new file mode 100644 index 000000000..2381c5a38 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Starvation Prevention Orchestration + * + * Submits a large wave of normal MIX tasks followed by sync_start tasks, + * then another wave of normal tasks. The drain mechanism must ensure the + * sync_start tasks are not indefinitely delayed by the surrounding load. + * + * Layout: 3 waves x 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6) + * + * Normal task: block_num=4, require_sync_start=false -> 4 blocks x 3 slots = 12 CL each + * Sync task: block_num=6, require_sync_start=true -> 6 blocks x 3 slots = 18 CL each + * + * Total CL: 3x6x12 + 2x18 = 216 + 36 = 252 + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +static constexpr int32_t SLOTS_PER_BLOCK = 3; // AIC, AIV0, AIV1 +static constexpr int32_t NORMAL_BLOCK_NUM = 4; +static constexpr int32_t SYNC_BLOCK_NUM = 6; +static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK; // 12 +static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK; // 18 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void 
+aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + // Wave 1: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + // Sync-start task 0: must not be starved by wave 1 or wave 2 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 2: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + // Sync-start task 1: must not be starved by wave 2 or wave 3 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 3: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + LOG_ALWAYS("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py new file mode 100644 index 000000000..33acd1c1a --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start. + +Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies +all blocks of every task write the correct float(block_idx) to their cache line. + +Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2): + T0: block_num=2, sync_start=True -> CL 0..5 + T1: block_num=8, sync_start=True -> CL 6..29 + T2: block_num=2, sync_start=False -> CL 30..35 (baseline) + T3: block_num=12, sync_start=True -> CL 36..71 + +Output tensor: 72 cache lines = 1152 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (2, 0), # T0: sync_start=True + (8, 6), # T1: sync_start=True + (2, 30), # T2: sync_start=False (baseline) + (12, 36), # T3: sync_start=True +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 72 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [ + ("output", output), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py new file mode 100644 index 000000000..95f706c9d --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO 
Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime). + +Submits MIX tasks with require_sync_start=true to verify atomic batch launch. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp 
b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp new file mode 100644 index 000000000..edeef95d6 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Orchestration + * + * Submits MIX tasks with require_sync_start=true to verify that the scheduler + * atomically launches all blocks before any can run. + * + * Tasks: + * T0: block_num=2, require_sync_start=true (basic sync launch) + * T1: block_num=8, require_sync_start=true (larger batch) + * T2: block_num=2, require_sync_start=false (normal, as baseline) + * T3: block_num=12, require_sync_start=true (cross-thread batch) + * + * Each block writes float(block_idx) to its allocated cache-line slot, + * identical to spmd_multiblock_mix so the same kernel binaries can be reused. 
+ * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 2 blocks, sync_start=true (6 CL) + submit_mix(ext_output, 2, 0, true); + // T1: 8 blocks, sync_start=true (24 CL) + submit_mix(ext_output, 8, 6, true); + // T2: 2 blocks, sync_start=false (6 CL, baseline) + submit_mix(ext_output, 2, 30, false); + // T3: 12 blocks, sync_start=true (36 CL) + submit_mix(ext_output, 12, 36, true); + + LOG_ALWAYS("[spmd_sync_start] Submitted 4 tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py new file mode 100644 index 000000000..3c60f1ac8 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start with AIV-only tasks. + +Submits 4 AIV tasks (3 with require_sync_start=true, 1 baseline) to exercise +the AIV-specific fast path (count_idle_aiv_cores) and drain slow path. + +Tasks: + T0: block_num=4, sync_start=True -> CL 0..3 (fast path) + T1: block_num=16, sync_start=True -> CL 4..19 (saturate one thread) + T2: block_num=4, sync_start=False -> CL 20..23 (baseline) + T3: block_num=24, sync_start=True -> CL 24..47 (cross-thread drain) + +Output tensor: 48 cache lines = 768 float32. 
+ +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (4, 0), # T0: sync_start=True, fast path + (16, 4), # T1: sync_start=True, saturate single thread + (4, 20), # T2: sync_start=False, baseline + (24, 24), # T3: sync_start=True, cross-thread drain +] + +TOTAL_CL = sum(block_num for block_num, _ in TASKS) # 48 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py new file mode 100644 index 000000000..77102a658 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py @@ -0,0 +1,41 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start AIV test (tensormap_and_ringbuffer Runtime). + +Submits AIV tasks with require_sync_start=true to verify atomic batch launch +and the AIV-specific fast path (count_idle_aiv_cores). +Reuses the same AIV kernel from spmd_multiblock_aiv. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_aiv_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp new file mode 100644 index 000000000..fa55cee3c --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start AIV Orchestration + * + * Submits AIV-only tasks with require_sync_start=true to exercise: + * - AIV fast path: count_idle_aiv_cores() >= block_num (small block_num) + * - AIV drain path: block_num exceeds local AIV cores (cross-thread drain) + * + * Tasks: + * T0: block_num=4, require_sync_start=true (fast path) + * T1: block_num=16, require_sync_start=true (saturate one thread: 8 clusters x 2 AIV) + * T2: block_num=4, require_sync_start=false (baseline) + * T3: block_num=24, require_sync_start=true (cross-thread drain) + * + * Each block writes float(block_idx) at (base_cl + block_idx) x FLOATS_PER_CACHE_LINE, + * reusing the kernel from spmd_multiblock_aiv. + * + * Args layout: [output] + */ + +#include <cstdint> +#include <cstddef> + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_WRITE_AIV 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 4 blocks, sync_start=true (fast path: 4 <= idle AIV cores on
one thread) + submit_aiv(ext_output, 4, 0, true); + // T1: 16 blocks, sync_start=true (saturate: 8 clusters x 2 AIV = 16 cores) + submit_aiv(ext_output, 16, 4, true); + // T2: 4 blocks, sync_start=false (baseline) + submit_aiv(ext_output, 4, 20, false); + // T3: 24 blocks, sync_start=true (cross-thread drain) + submit_aiv(ext_output, 24, 24, true); + + LOG_ALWAYS("[spmd_sync_start_aiv] Submitted 4 AIV tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py new file mode 100644 index 000000000..2bfcaea4a --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start boundary conditions. + +Tests edge-case block_num values relative to per-thread cluster capacity (8 clusters +with 3 sched threads = 24 total clusters, 48 total AIV cores). 
+ +MIX tasks (SLOTS_PER_BLOCK=3): + T0: block_num=1, sync_start=True -> CL 0..2 (degenerate: always fast path) + T1: block_num=8, sync_start=True -> CL 3..26 (exactly one thread's capacity) + T2: block_num=9, sync_start=True -> CL 27..53 (one over: must enter drain) + T3: block_num=23, sync_start=True -> CL 54..122 (max valid: total_clusters - 1) + T4: block_num=1, sync_start=False -> CL 123..125 (baseline) + +Output tensor: 126 cache lines = 2016 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (1, 0), # T0: sync=True, degenerate + (8, 3), # T1: sync=True, exactly one thread's clusters + (9, 27), # T2: sync=True, one over -> drain + (23, 54), # T3: sync=True, max valid (total_clusters - 1) + (1, 123), # T4: sync=False, baseline +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 126 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py new file mode 100644 index 000000000..84488dd71 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start boundary test (tensormap_and_ringbuffer Runtime). + +Tests edge-case block_num values relative to per-thread cluster capacity. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_edge_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp 
b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp new file mode 100644 index 000000000..ad502c130 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Boundary Orchestration + * + * Tests edge-case block_num values relative to per-thread cluster capacity + * (8 clusters per sched thread, 24 total clusters). 
+ * + * Tasks: + * T0: block_num=1, sync_start=true (degenerate: always fast path) + * T1: block_num=8, sync_start=true (exactly one thread's capacity) + * T2: block_num=9, sync_start=true (one over: must enter drain) + * T3: block_num=23, sync_start=true (max valid: total_clusters - 1) + * T4: block_num=1, sync_start=false (baseline) + * + * Args layout: [output] + */ + +#include <cstdint> +#include <cstddef> + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: block_num=1, sync_start=true (degenerate: always fast path, 3 CL) + submit_mix(ext_output, 1, 0, true); + // T1: block_num=8, sync_start=true (exactly one thread's cluster capacity, 24 CL) + submit_mix(ext_output, 8, 3, true); + // T2: block_num=9, sync_start=true (one over single thread -> must drain, 27 CL) + submit_mix(ext_output, 9, 27, true); + // T3: block_num=23, sync_start=true (max valid = total_clusters - 1, 69 CL) +
submit_mix(ext_output, 23, 54, true); + // T4: block_num=1, sync_start=false (baseline, 3 CL) + submit_mix(ext_output, 1, 123, false); + + LOG_ALWAYS("[spmd_sync_start_edge] Submitted 5 tasks: block_num=1,8,9,23 (sync) + 1 (baseline)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py new file mode 100644 index 000000000..3315360df --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py @@ -0,0 +1,104 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start stress / CAS contention with mixed shapes. + +Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, ack +barrier, and state cleanup across drain cycles. All three resource shapes +(MIX, AIV, AIC) are exercised with both sync and non-sync modes. 
+ +Each round (9 tasks): + 4 x normal MIX (block_num=4, sync=false) -> 4 x 4 x 3 = 48 CL + 2 x sync MIX (block_num=12, sync=true) -> 2 x 12 x 3 = 72 CL + 2 x sync AIV (block_num=8, sync=true) -> 2 x 8 x 1 = 16 CL + 1 x normal AIV (block_num=4, sync=false) -> 1 x 4 x 1 = 4 CL + Round total: 140 CL + +6 rounds -> 54 tasks (24 normal MIX + 12 sync MIX + 12 sync AIV + 6 normal AIV) +Grand total: 840 CL = 13440 float32 + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +ROUNDS = 6 + +# shape constants: (slots_per_block, written_slots) +# MIX: kernel writes at base_cl + block_idx * 3 + {0,1,2}, 3 CL per block, all written +# AIV: kernel writes at base_cl + block_idx, 1 CL per block +SHAPE_MIX = "MIX" +SHAPE_AIV = "AIV" + +MIX_SLOTS = 3 +AIV_SLOTS = 1 + +NORMAL_MIX_BN = 4 +SYNC_MIX_BN = 12 +SYNC_AIV_BN = 8 +NORMAL_AIV_BN = 4 + + +def _build_tasks(): + """Returns list of (block_num, base_cl, shape_str).""" + tasks = [] + cl = 0 + for _ in range(ROUNDS): + # 4 x normal MIX + for _ in range(4): + tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX)) + cl += NORMAL_MIX_BN * MIX_SLOTS + # 2 x sync MIX + for _ in range(2): + tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX)) + cl += SYNC_MIX_BN * MIX_SLOTS + # 2 x sync AIV + for _ in range(2): + tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV)) + cl += SYNC_AIV_BN * AIV_SLOTS + # 1 x normal AIV + tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV)) + cl += NORMAL_AIV_BN * AIV_SLOTS + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS) # 840 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl, shape in TASKS: + for 
block_idx in range(block_num): + if shape == SHAPE_MIX: + # MIX kernel writes float(block_idx) at all 3 slots + for slot in range(MIX_SLOTS): + cl = base_cl + block_idx * MIX_SLOTS + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + else: + # AIV kernel writes float(block_idx) at 1 slot + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py new file mode 100644 index 000000000..09c507863 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start stress test with mixed shapes. + +Submits 54 tasks (MIX + AIV) over 6 rounds to stress-test drain CAS contention, +ack barrier, and state cleanup between drain cycles. +Reuses AIC/AIV kernels from spmd_multiblock_mix and spmd_multiblock_aiv. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_stress_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1) + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + # func_id 3: standalone AIV kernel + { + "func_id": 3, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp new file mode 100644 index 000000000..ddbbea1de --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Stress Orchestration (mixed shapes) + * + * Submits 6 rounds of mixed MIX + AIV tasks to stress-test: + * - Drain CAS contention (multiple sync_start tasks per round) + * - Ack barrier correctness (normal tasks occupy clusters during drain entry) + * - State cleanup between consecutive drain cycles + * + * Each round (9 tasks): + * 4 x normal MIX (block_num=4, sync=false) -> 4 x 4 x 3 = 48 CL + * 2 x sync MIX (block_num=12, sync=true) -> 2 x 12 x 3 = 72 CL + * 2 x sync AIV (block_num=8, sync=true) -> 2 x 8 x 1 = 16 CL + * 1 x normal AIV (block_num=4, sync=false) -> 1 x 4 x 1 = 4 CL + * Round total: 140 CL + * + * 6 rounds -> 54 tasks total, 840 CL grand total. 
+ * + * Args layout: [output] + */ + +#include <cstdint> +#include <cstddef> + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 +#define FUNC_SPMD_WRITE_AIV 3 + +static constexpr int32_t MIX_SLOTS = 3; +static constexpr int32_t NORMAL_MIX_BN = 4; +static constexpr int32_t SYNC_MIX_BN = 12; +static constexpr int32_t SYNC_AIV_BN = 8; +static constexpr int32_t NORMAL_AIV_BN = 4; +static constexpr int32_t ROUNDS = 6; + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 1}; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + for (int32_t r = 0; r < ROUNDS; r++) { + // 4 x normal MIX + for (int i = 0; i < 4; i++, cl += NORMAL_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, NORMAL_MIX_BN, cl, false); + + // 2 x sync MIX — CAS contention: second sync task may
arrive while first is draining + for (int i = 0; i < 2; i++, cl += SYNC_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, SYNC_MIX_BN, cl, true); + + // 2 x sync AIV — cross-shape drain contention with the MIX drain above + for (int i = 0; i < 2; i++, cl += SYNC_AIV_BN) + submit_aiv(ext_output, SYNC_AIV_BN, cl, true); + + // 1 x normal AIV + submit_aiv(ext_output, NORMAL_AIV_BN, cl, false); + cl += NORMAL_AIV_BN; + } + + LOG_ALWAYS("[spmd_sync_start_stress] Submitted %d tasks over %d rounds", 9 * ROUNDS, ROUNDS); +} + +} // extern "C" diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 07acbbbf4..238cccfec 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -248,6 +248,14 @@ class alignas(64) CoreTracker { return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value(); } + // Count total idle AIV cores (AIV0 + AIV1) across all clusters. + // Unlike get_valid_cluster_offset_states(AIV).count() which counts clusters with + // at least one idle AIV, this counts individual idle cores — a cluster with both + // AIV0 and AIV1 idle contributes 2, not 1. 
+ int32_t count_idle_aiv_cores() const { + return ((core_states_ >> 1) & aic_mask_).count() + ((core_states_ >> 2) & aic_mask_).count(); + } + // --- State mutation --- // Toggle bit at the given bit offset (running <-> idle) @@ -268,6 +276,8 @@ class alignas(64) CoreTracker { struct AicpuExecutor { int32_t orch_thread_num_; int32_t sched_thread_num_; + int32_t active_sched_threads_{0}; // Threads currently in dispatch loop (initially sched_thread_num_, becomes + // thread_num_ after orch→sched transition) bool orch_to_sched_{false}; // ===== Thread management state ===== @@ -297,6 +307,20 @@ struct AicpuExecutor { CoreTracker core_trackers_[MAX_AICPU_THREADS]; + // ===== sync_start drain coordination ===== + + // When sync_start_pending != 0, all scheduler threads skip Phase 2 dispatch + // (only process completions) until the drain worker finishes launching all blocks. + struct alignas(64) SyncStartDrainState { + std::atomic sync_start_pending{0}; // 0=normal; -1=initializing; >0=active (value=block_num) + std::atomic drain_worker_elected{0}; // 0=none; >0: elected thread's (thread_idx+1) + std::atomic drain_ack_mask{0}; // bit per thread; all-set = all threads finished dispatch + PTO2TaskSlotState *pending_task{nullptr}; // held task (not re-queued) + int32_t _pad[10]; + }; + static_assert(sizeof(SyncStartDrainState) == 64); + SyncStartDrainState drain_state_; + // ===== Task queue state (managed by scheduler ready queues) ===== // Task execution tracking @@ -621,6 +645,242 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); core_exec_state.executing_reg_task_id = reg_task_id; } + + // Dispatch one SPMD block of a MIX task to the cluster at cluster_offset. + // Reads slot_state.next_block_idx as block_idx; caller increments it afterwards. 
+ void dispatch_mix_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state +#if PTO2_PROFILING + , + bool profiling_enabled +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + uint8_t core_mask = pto2_core_mask(slot_state.active_mask); + if (core_mask & PTO2_SUBTASK_MASK_AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV0) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv0_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV1) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv1_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + } + + // ===== sync_start drain helpers ===== + + // Take ownership of slot_state and signal all threads to enter drain mode. + // Returns true if this thread won the CAS and owns the drain slot. + // Returns false if another thread already holds drain; caller must re-push slot_state. + // + // Two-phase protocol: CAS 0 → -1 (sentinel) to claim ownership, store task and + // reset election flag, then release-store block_num. Other threads acquire-load + // sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible. + bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) { + int32_t expected = 0; + if (!drain_state_.sync_start_pending.compare_exchange_strong( + expected, -1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + return false; // Another thread already holds the drain slot. + } + // We own the drain slot. Store the task and reset election flag before making it visible. 
+ drain_state_.pending_task = slot_state; + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed); + // Release store: all stores above are now visible to any thread that + // acquire-loads sync_start_pending and sees block_num > 0. + drain_state_.sync_start_pending.store(block_num, std::memory_order_release); + return true; + } + + // Dispatch one SPMD block to the cluster at cluster_offset, routing to the correct core(s) + // based on shape. For AIV, picks whichever AIV core in the cluster is currently idle. + // Caller is responsible for incrementing slot_state.next_block_idx after this returns. + void dispatch_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, + PTO2ResourceShape shape +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + if (shape == PTO2ResourceShape::MIX) { + dispatch_mix_block_to_cluster( + runtime, thread_idx, cluster_offset, slot_state +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else if (shape == PTO2ResourceShape::AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else { // AIV + auto core_offset = tracker.is_aiv0_core_idle(cluster_offset) ? + tracker.get_aiv0_core_offset(cluster_offset) : + tracker.get_aiv1_core_offset(cluster_offset); + dispatch_subtask_to_core( + runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } +#if PTO2_PROFILING + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); +#endif + } + + // Count total available resources across all scheduler threads for a given shape. 
+    int32_t count_global_available(PTO2ResourceShape shape) {
+        int32_t total = 0;
+        for (int32_t t = 0; t < active_sched_threads_; t++) {
+            if (shape == PTO2ResourceShape::AIV) {
+                total += core_trackers_[t].count_idle_aiv_cores(); // idle AIV cores (up to 2 per cluster)
+            } else {
+                total += core_trackers_[t].get_valid_cluster_offset_states(shape).count();
+            }
+        }
+        return total;
+    }
+
+    // Drain worker: dispatch all blocks in one pass across all threads' trackers.
+    // Called only when global resources >= block_num, so one pass always suffices.
+    // All other threads are spinning — the drain worker has exclusive tracker access.
+    void drain_worker_dispatch(
+        Runtime *runtime, int32_t block_num
+#if PTO2_PROFILING
+        ,
+        bool profiling_enabled, uint32_t &phase_dispatch_count
+#endif
+    ) {
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task;
+        if (!slot_state) { // defensive: nothing to drain — just clear the flag
+            drain_state_.sync_start_pending.store(0, std::memory_order_release);
+            return;
+        }
+        PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask);
+
+        for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) {
+            auto valid = core_trackers_[t].get_valid_cluster_offset_states(shape);
+            while (valid.has_value() && slot_state->next_block_idx < block_num) {
+                dispatch_block_to_cluster(
+                    runtime, t, valid.pop_first(), *slot_state, shape
+#if PTO2_PROFILING
+                    ,
+                    profiling_enabled, phase_dispatch_count
+#endif
+                );
+                slot_state->next_block_idx++;
+                // Re-read valid states after each dispatch: occupancy changed, and an
+                // AIV cluster may still be able to serve its second AIV core.
+                if (slot_state->next_block_idx < block_num)
+                    valid = core_trackers_[t].get_valid_cluster_offset_states(shape);
+            }
+        }
+
+        // All blocks dispatched — clear drain state.
+        // Release fence ensures tracker mutations are visible to threads that
+        // acquire-load sync_start_pending == 0 and resume normal operation.
+        std::atomic_thread_fence(std::memory_order_release);
+        drain_state_.pending_task = nullptr;
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    }
+
+    // Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+    //
+    // Three-phase protocol:
+    //   1. Ack barrier: all threads signal they've stopped Phase 2 dispatch.
+    //      If not all acked yet, return to Phase 1 (completion polling).
+    //   2. Resource check: elected thread verifies global idle resources >= block_num.
+    //      If insufficient, reset election state and return — all threads resume
+    //      Phase 1 to free running cores, then retry next iteration.
+    //   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
+    //      Non-elected threads spin-wait until sync_start_pending == 0.
+    //      During dispatch the elected thread has exclusive tracker access.
+    void handle_drain_mode(
+        Runtime *runtime, int32_t thread_idx
+#if PTO2_PROFILING
+        ,
+        bool profiling_enabled, uint32_t &phase_dispatch_count
+#endif
+    ) {
+        // Spin until drain is fully initialized (sentinel -1 → block_num > 0).
+        int32_t block_num;
+        do {
+            block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+        } while (block_num < 0);
+        if (block_num == 0) return; // drain already completed by another thread
+
+        // Phase 1: Ack barrier — signal this thread has stopped Phase 2 dispatch.
+        uint32_t all_acked = (1u << active_sched_threads_) - 1; // assumes active_sched_threads_ < 32
+        drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+        // If not all threads have acked, return to do Phase 1 (completion polling).
+        if ((drain_state_.drain_ack_mask.load(std::memory_order_acquire) & all_acked) != all_acked) return;
+
+        // Phase 2: Election — exactly one thread wins the CAS.
+        int32_t expected = 0;
+        drain_state_.drain_worker_elected.compare_exchange_strong(
+            expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
+        );
+
+        if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
+            // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+            while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+                if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+                SPIN_WAIT_HINT();
+            }
+            return;
+        }
+
+        // Elected: check if global resources are sufficient.
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task;
+        PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask);
+        int32_t available = count_global_available(shape);
+
+        if (available < block_num) {
+            // Insufficient resources — reset election, let all threads do Phase 1.
+            drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+
+        // Phase 3: Dispatch — all other threads are spinning, exclusive tracker access.
+        drain_worker_dispatch(
+            runtime, block_num
+#if PTO2_PROFILING
+            ,
+            profiling_enabled, phase_dispatch_count
+#endif
+        );
+    }
 };
 
 static AicpuExecutor g_aicpu_executor;
 
@@ -781,6 +1041,7 @@ bool AicpuExecutor::assign_cores_to_threads() {
         DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], core_trackers_[t].get_cluster_count());
     }
 
+    active_sched_threads_ = (sched_thread_num_ > 0) ?
+        sched_thread_num_ : thread_num_;
     return true;
 }
 
@@ -853,6 +1114,7 @@ void AicpuExecutor::reassign_cores_for_all_threads() {
             core_trackers_[t].get_cluster_count(), aic_running, aiv_running
         );
     }
+    active_sched_threads_ = thread_num_;
 }
 
 int32_t AicpuExecutor::init(Runtime *runtime) {
@@ -1215,8 +1477,23 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
 #endif
 
         bool try_pushed = false;
+
+        // Phase 2 drain check: if a sync_start task is waiting for resources,
+        // pause normal dispatch and let the drain protocol run.
+        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            handle_drain_mode(
+                runtime, thread_idx
+#if PTO2_PROFILING
+                ,
+                profiling_enabled, phase_dispatch_count
+#endif
+            );
+            continue;
+        }
+
         const PTO2ResourceShape *dispatch_order = get_dispatch_order(thread_idx);
-        for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) {
+        bool entered_drain = false;
+        for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES && !entered_drain; si++) {
             PTO2ResourceShape shape = dispatch_order[si];
             auto valid_cluster_states = tracker.get_valid_cluster_offset_states(shape);
             if (!valid_cluster_states.has_value()) {
@@ -1224,7 +1501,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
             }
 
             auto &local_buf = local_bufs[static_cast(shape)];
-            while (valid_cluster_states.has_value()) {
+            while (valid_cluster_states.has_value() && !entered_drain) {
                 int want = valid_cluster_states.count();
                 PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS];
                 int got = pop_ready_tasks_batch(
@@ -1242,13 +1519,37 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
 #if PTO2_SCHED_PROFILING
                 uint64_t t_setup_start = get_sys_cnt_aicpu();
 #endif
+                // sync_start: all blocks must dispatch atomically.
+                // Fast path — enough local slots: fall through to normal dispatch loop below.
+                // Slow path — not enough: enter drain mode, then re-push all remaining
+                // tasks in the batch so nothing is lost.
+                // For AIV, one cluster can serve 2 blocks (AIV0 + AIV1), so compare against
+                // idle AIV core count rather than cluster count.
+                if (pto2_requires_sync_start(slot_state->active_mask)) {
+                    int32_t available = (shape == PTO2ResourceShape::AIV) ? tracker.count_idle_aiv_cores() :
+                                                                            valid_cluster_states.count();
+                    if (available < slot_state->block_num) {
+                        if (!enter_drain_mode(slot_state, slot_state->block_num)) {
+                            // CAS lost: drain already active for another task; re-push and wait.
+                            rt->scheduler.ready_queues[static_cast(shape)].push(slot_state);
+                        }
+                        // Re-push all unprocessed tasks remaining in this batch.
+                        for (int rem = bi + 1; rem < got; rem++) {
+                            rt->scheduler.ready_queues[static_cast(shape)].push(batch[rem]);
+                        }
+                        entered_drain = true;
+                        break;
+                    }
+                    // Fast path: enough local resources, fall through to normal dispatch.
+                }
+
                 // Dispatch as many blocks as possible for this task using available clusters.
                 // For block_num=1 the inner body executes exactly once (no overhead).
                 do {
                     auto current_valid_cluster_offset = valid_cluster_states.pop_first();
 
                     if (shape == PTO2ResourceShape::MIX) {
                         // Full-cluster: all active subtasks share the same block_idx.
-                        uint8_t mask = slot_state->active_mask;
+                        uint8_t mask = pto2_core_mask(slot_state->active_mask);
                         if (mask & PTO2_SUBTASK_MASK_AIC) {
                             dispatch_subtask_to_core(
                                 runtime, thread_idx, tracker.get_aic_core_offset(current_valid_cluster_offset),
@@ -1309,7 +1610,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
                     }
                 }
 #if PTO2_PROFILING
-                phase_dispatch_count += __builtin_popcount(slot_state->active_mask);
+                phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state->active_mask));
 #endif
                 DEV_DEBUG(
                     "Thread %d: Dispatched %s task %" PRId64 " block %d/%d to cluster_offset %d", thread_idx,
@@ -1853,6 +2154,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         return -1;
     }
 
+    // Total core counts for submit-time deadlock detection.
+    for (int i = 0; i < orch_thread_num_; i++) {
+        rt->orchestrators[i].total_cluster_count = aic_count_;
+        rt->orchestrators[i].total_aiv_count = aiv_count_;
+    }
 #if PTO2_PROFILING
     for (int i = 0; i < orch_thread_num_; i++) {
         rt->orchestrators[i].enable_profiling = runtime->enable_profiling;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index e9e9ea183..e161d3ee7 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -346,6 +346,21 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
         active_mask = pto2_mixed_kernels_to_active_mask(normalized);
     }
 
+    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+    if (block_num > 1 && args.launch_spec.require_sync_start()) {
+        // Deadlock check: block_num > total slots of the required type can never be
+        // satisfied simultaneously, so an atomic launch would hang forever.
+        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
+        // For AIV: limit is total_aiv_count.
+        PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask);
+        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+        if (limit > 0 && block_num > limit) { // limit == 0 → counts not yet initialized; skip the check
+            LOG_ERROR("FATAL: require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit);
+            orch->fatal = true;
+            return TaskOutputTensors{};
+        }
+        active_mask |= PTO2_SUBTASK_FLAG_SYNC_START;
+    }
+
     // Submission without an open scope is illegal
     always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
 
@@ -583,7 +598,8 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
         cur_slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
         cur_slot_state.fanout_refcount.store(0, std::memory_order_relaxed);
         cur_slot_state.completed_subtasks.store(0, std::memory_order_relaxed);
-        cur_slot_state.total_required_subtasks = static_cast(block_num * __builtin_popcount(active_mask));
+        cur_slot_state.total_required_subtasks =
+            static_cast(block_num * __builtin_popcount(pto2_core_mask(active_mask)));
         cur_slot_state.block_num = block_num;
         cur_slot_state.next_block_idx = 0;
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index a40a6f7ce..0d9d94276 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -70,6 +70,10 @@ struct PTO2OrchestratorState {
     // Note: In simulated mode, orchestrator and scheduler share address space
     // In real mode, they communicate via shared memory only
     PTO2SchedulerState *scheduler; // For simulated mode only
+
+    // Total core counts set once at executor init; used for submit-time deadlock detection.
+    int32_t total_cluster_count{0}; // AIC cores = MIX clusters
+    int32_t total_aiv_count{0};     // AIV cores (= 2 × clusters on standard hardware)
 #if PTO2_PROFILING
     // Runtime profiling switch copied from Runtime::enable_profiling.
     bool enable_profiling;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
index 2a4ad827a..e89781a91 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
@@ -39,9 +39,10 @@ enum class PTO2SubtaskSlot : uint8_t {
 /**
  * Subtask mask bits (for active_mask / subtask_done_mask)
  */
-inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1
-inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2
-inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);        // 0x1
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);       // 0x2
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);       // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3); // 0x8: all blocks must launch atomically
 
 /**
  * Test whether a subtask slot is active in a given mask
@@ -50,6 +51,18 @@ static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) {
     return (mask & (1u << static_cast(slot))) != 0;
 }
 
+/**
+ * Extract only the core bits (AIC/AIV0/AIV1, low 3 bits) from active_mask,
+ * stripping flag bits such as PTO2_SUBTASK_FLAG_SYNC_START.
+ */
+static inline uint8_t pto2_core_mask(uint8_t active_mask) { return active_mask & 0x07u; }
+
+/**
+ * Check whether a task requires all blocks to be launched atomically.
+ */
+static inline bool pto2_requires_sync_start(uint8_t active_mask) {
+    return (active_mask & PTO2_SUBTASK_FLAG_SYNC_START) != 0;
+}
+
 /**
  * Mixed-task submit contract.
  *
@@ -83,9 +96,10 @@ inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
  * Caller must ensure active_mask is valid (at least one bit set).
  */
 static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) {
-    int bit_count = __builtin_popcount(active_mask);
+    uint8_t core_mask = pto2_core_mask(active_mask);
+    int bit_count = __builtin_popcount(core_mask);
     if (bit_count >= 2) return PTO2ResourceShape::MIX;
-    if (active_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
+    if (core_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
     return PTO2ResourceShape::AIV;
 }
 
@@ -114,6 +128,10 @@ class PTO2LaunchSpec {
     int16_t core_num() const { return core_num_; }
     void set_core_num(int16_t n) { core_num_ = n; }
 
+    bool require_sync_start() const { return require_sync_start_; } // atomic all-block launch requested?
+    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+
 private:
     int16_t core_num_{1};
+    bool require_sync_start_{false};
 };