From ed801e31dbfa46dbcaacc50068ea84d0174ee134 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 3 Apr 2026 16:31:27 +0800 Subject: [PATCH 1/2] Add: require_sync_start for atomic SPMD block launch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a sync_start mechanism that forces all blocks of an SPMD task to be dispatched atomically before any can begin execution. Submission layer (pto_submit_types.h, pto_orchestrator.cpp/h): - Add LaunchSpec::require_sync_start and active_mask bit-3 flag - Add pto2_core_mask() / pto2_requires_sync_start() helpers - Validate block_num < total resources at submit time to prevent deadlock - Fix total_required_subtasks to use pto2_core_mask (strip flag bits) Scheduler drain protocol (aicpu_executor.cpp): - Three-phase drain: ack barrier → global resource check → exclusive dispatch - Elected thread verifies global idle resources before dispatching; if insufficient, all threads return to completion polling and retry - Non-elected threads spin-wait during dispatch, giving the elected thread exclusive CoreTracker access (no data race on core_states_) - Track active_sched_threads_ separately from thread_num_ so orchestrator threads that have not transitioned to scheduling do not block the ack barrier SPMD dispatch refactor: - Extract dispatch_block_to_cluster / dispatch_mix_block_to_cluster - AIV path uses count_idle_aiv_cores for accurate resource counting Test examples: spmd_sync_start, spmd_sync_start_aiv, spmd_sync_start_edge, spmd_sync_start_stress, spmd_starvation --- .../spmd_starvation/golden.py | 84 ++++ .../spmd_starvation/kernels/kernel_config.py | 53 +++ .../orchestration/spmd_starvation_orch.cpp | 100 +++++ .../spmd_sync_start/golden.py | 66 +++ .../spmd_sync_start/kernels/kernel_config.py | 52 +++ .../orchestration/spmd_sync_start_orch.cpp | 82 ++++ .../spmd_sync_start_aiv/golden.py | 62 +++ .../kernels/kernel_config.py | 41 ++ .../spmd_sync_start_aiv_orch.cpp | 76 ++++ 
.../spmd_sync_start_edge/golden.py | 66 +++ .../kernels/kernel_config.py | 53 +++ .../spmd_sync_start_edge_orch.cpp | 82 ++++ .../spmd_sync_start_stress/golden.py | 110 +++++ .../kernels/kernel_config.py | 62 +++ .../spmd_sync_start_stress_orch.cpp | 109 +++++ .../aicpu/aicpu_executor.cpp | 401 ++++++++++++++---- .../runtime/pto_orchestrator.cpp | 18 +- .../runtime/pto_orchestrator.h | 4 + .../runtime/pto_submit_types.h | 28 +- .../runtime/runtime.h | 2 +- 20 files changed, 1467 insertions(+), 84 deletions(-) create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py create mode 100644 examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py create mode 100644 
examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py new file mode 100644 index 000000000..f38002181 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py @@ -0,0 +1,84 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD starvation prevention. + +Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and +verifies all 20 tasks complete with correct output. The test validates that +the drain mechanism prevents sync_start tasks from being starved. + +Layout: + Wave 1: 6 × normal(block_num=4) -> CL 0..71 + Sync 0: 1 × sync_start(block_num=6) -> CL 72..89 + Wave 2: 6 × normal(block_num=4) -> CL 90..161 + Sync 1: 1 × sync_start(block_num=6) -> CL 162..179 + Wave 3: 6 × normal(block_num=4) -> CL 180..251 + +Total: 252 CL = 4032 float32. 
+ +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 +NORMAL_BLOCK_NUM = 4 +SYNC_BLOCK_NUM = 6 +NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK # 12 +SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK # 18 + + +# Build flat task list as (block_num, base_cl) +def _build_tasks(): + tasks = [] + cl = 0 + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS) # 252 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py new file mode 100644 index 000000000..b02e66d8e --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD starvation-prevention test. + +Submits many normal MIX tasks interleaved with sync_start tasks to verify +the drain mechanism prevents starvation under sustained load. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp new file mode 100644 index 000000000..dd4f0cc7d --- /dev/null +++ 
b/examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Starvation Prevention Orchestration + * + * Submits a large wave of normal MIX tasks followed by sync_start tasks, + * then another wave of normal tasks. The drain mechanism must ensure the + * sync_start tasks are not indefinitely delayed by the surrounding load. 
+ * + * Layout: 3 waves × 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6) + * + * Normal task: block_num=4, require_sync_start=false → 4 blocks × 3 slots = 12 CL each + * Sync task: block_num=6, require_sync_start=true → 6 blocks × 3 slots = 18 CL each + * + * Total CL: 3×6×12 + 2×18 = 216 + 36 = 252 + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +static constexpr int32_t SLOTS_PER_BLOCK = 3; // AIC, AIV0, AIV1 +static constexpr int32_t NORMAL_BLOCK_NUM = 4; +static constexpr int32_t SYNC_BLOCK_NUM = 6; +static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK; // 12 +static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK; // 18 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + // Wave 1: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); 
+ + // Sync-start task 0: must not be starved by wave 1 or wave 2 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 2: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + // Sync-start task 1: must not be starved by wave 2 or wave 3 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 3: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + LOG_ALWAYS("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py new file mode 100644 index 000000000..33acd1c1a --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start. + +Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies +all blocks of every task write the correct float(block_idx) to their cache line. 
+ +Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2): + T0: block_num=2, sync_start=True -> CL 0..5 + T1: block_num=8, sync_start=True -> CL 6..29 + T2: block_num=2, sync_start=False -> CL 30..35 (baseline) + T3: block_num=12, sync_start=True -> CL 36..71 + +Output tensor: 72 cache lines = 1152 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (2, 0), # T0: sync_start=True + (8, 6), # T1: sync_start=True + (2, 30), # T2: sync_start=False (baseline) + (12, 36), # T3: sync_start=True +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 72 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [ + ("output", output), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py new file mode 100644 index 000000000..95f706c9d --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime). + +Submits MIX tasks with require_sync_start=true to verify atomic batch launch. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp new file mode 100644 index 000000000..207d26139 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Orchestration + * + * Submits MIX tasks with require_sync_start=true to verify that the scheduler + * atomically launches all blocks before any can run. + * + * Tasks: + * T0: block_num=2, require_sync_start=true (basic sync launch) + * T1: block_num=8, require_sync_start=true (larger batch) + * T2: block_num=2, require_sync_start=false (normal, as baseline) + * T3: block_num=12, require_sync_start=true (cross-thread batch) + * + * Each block writes float(block_idx) to its allocated cache-line slot, + * identical to spmd_multiblock_mix so the same kernel binaries can be reused. 
+ * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 2 blocks, sync_start=true (6 CL) + submit_mix(ext_output, 2, 0, true); + // T1: 8 blocks, sync_start=true (24 CL) + submit_mix(ext_output, 8, 6, true); + // T2: 2 blocks, sync_start=false (6 CL, baseline) + submit_mix(ext_output, 2, 30, false); + // T3: 12 blocks, sync_start=true (36 CL) + submit_mix(ext_output, 12, 36, true); + + LOG_ALWAYS("[spmd_sync_start] Submitted 4 tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py new file mode 100644 index 000000000..3c60f1ac8 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start with AIV-only tasks. + +Submits 4 AIV tasks (3 with require_sync_start=true, 1 baseline) to exercise +the AIV-specific fast path (count_idle_aiv_cores) and drain slow path. + +Tasks: + T0: block_num=4, sync_start=True -> CL 0..3 (fast path) + T1: block_num=16, sync_start=True -> CL 4..19 (saturate one thread) + T2: block_num=4, sync_start=False -> CL 20..23 (baseline) + T3: block_num=24, sync_start=True -> CL 24..47 (cross-thread drain) + +Output tensor: 48 cache lines = 768 float32. 
+ +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (4, 0), # T0: sync_start=True, fast path + (16, 4), # T1: sync_start=True, saturate single thread + (4, 20), # T2: sync_start=False, baseline + (24, 24), # T3: sync_start=True, cross-thread drain +] + +TOTAL_CL = sum(block_num for block_num, _ in TASKS) # 48 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py new file mode 100644 index 000000000..77102a658 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py @@ -0,0 +1,41 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start AIV test (tensormap_and_ringbuffer Runtime). + +Submits AIV tasks with require_sync_start=true to verify atomic batch launch +and the AIV-specific fast path (count_idle_aiv_cores). +Reuses the same AIV kernel from spmd_multiblock_aiv. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_aiv_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp new file mode 100644 index 000000000..859329d99 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start AIV Orchestration + * + * Submits AIV-only tasks with require_sync_start=true to exercise: + * - AIV fast path: count_idle_aiv_cores() >= block_num (small block_num) + * - AIV drain path: block_num exceeds local AIV cores (cross-thread drain) + * + * Tasks: + * T0: block_num=4, require_sync_start=true (fast path) + * T1: block_num=16, require_sync_start=true (saturate one thread: 8 clusters × 2 AIV) + * T2: block_num=4, require_sync_start=false (baseline) + * T3: block_num=24, require_sync_start=true (cross-thread drain) + * + * Each block writes float(block_idx) at (base_cl + block_idx) × FLOATS_PER_CACHE_LINE, + * reusing the kernel from spmd_multiblock_aiv. + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_WRITE_AIV 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 4 blocks, sync_start=true (fast path: 4 <= idle AIV cores on 
one thread) + submit_aiv(ext_output, 4, 0, true); + // T1: 16 blocks, sync_start=true (saturate: 8 clusters × 2 AIV = 16 cores) + submit_aiv(ext_output, 16, 4, true); + // T2: 4 blocks, sync_start=false (baseline) + submit_aiv(ext_output, 4, 20, false); + // T3: 24 blocks, sync_start=true (cross-thread drain) + submit_aiv(ext_output, 24, 24, true); + + LOG_ALWAYS("[spmd_sync_start_aiv] Submitted 4 AIV tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py new file mode 100644 index 000000000..7d9b0b6ae --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start boundary conditions. + +Tests edge-case block_num values relative to per-thread cluster capacity (8 clusters +with 3 sched threads = 24 total clusters, 48 total AIV cores). 
+ +MIX tasks (SLOTS_PER_BLOCK=3): + T0: block_num=1, sync_start=True -> CL 0..2 (degenerate: always fast path) + T1: block_num=8, sync_start=True -> CL 3..26 (exactly one thread's capacity) + T2: block_num=9, sync_start=True -> CL 27..53 (one over: must enter drain) + T3: block_num=23, sync_start=True -> CL 54..122 (max valid: total_clusters - 1) + T4: block_num=1, sync_start=False -> CL 123..125 (baseline) + +Output tensor: 126 cache lines = 2016 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (1, 0), # T0: sync=True, degenerate + (8, 3), # T1: sync=True, exactly one thread's clusters + (9, 27), # T2: sync=True, one over → drain + (23, 54), # T3: sync=True, max valid (total_clusters - 1) + (1, 123), # T4: sync=False, baseline +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 126 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py new file mode 100644 index 000000000..29f119ea9 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start boundary test. + +Tests edge-case block_num values: 1 (degenerate), 8 (one thread capacity), +9 (just over), 23 (max valid = total_clusters - 1). +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_edge_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp 
b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp new file mode 100644 index 000000000..122d838b0 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Boundary Orchestration + * + * Tests edge-case block_num values relative to per-thread cluster capacity + * (8 clusters per sched thread, 24 total clusters). 
+ * + * Tasks: + * T0: block_num=1, sync_start=true (degenerate: always fast path) + * T1: block_num=8, sync_start=true (exactly one thread's capacity) + * T2: block_num=9, sync_start=true (one over: must enter drain) + * T3: block_num=23, sync_start=true (max valid: total_clusters - 1) + * T4: block_num=1, sync_start=false (baseline) + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: block_num=1, sync_start=true (degenerate: always fast path, 3 CL) + submit_mix(ext_output, 1, 0, true); + // T1: block_num=8, sync_start=true (exactly one thread's cluster capacity, 24 CL) + submit_mix(ext_output, 8, 3, true); + // T2: block_num=9, sync_start=true (one over single thread → must drain, 27 CL) + submit_mix(ext_output, 9, 27, true); + // T3: block_num=23, sync_start=true (max valid = total_clusters - 1, 69 CL) + 
submit_mix(ext_output, 23, 54, true); + // T4: block_num=1, sync_start=false (baseline, 3 CL) + submit_mix(ext_output, 1, 123, false); + + LOG_ALWAYS("[spmd_sync_start_edge] Submitted 5 tasks: block_num=1,8,9,23 (sync) + 1 (baseline)"); +} + +} // extern "C" diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py new file mode 100644 index 000000000..d84f3c270 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py @@ -0,0 +1,110 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start stress / CAS contention with mixed shapes. + +Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, ack +barrier, and state cleanup across drain cycles. All three resource shapes +(MIX, AIV, AIC) are exercised with both sync and non-sync modes. 
+ +Each round (9 tasks): + 4 × normal MIX (block_num=4, sync=false) → 4 × 4 × 3 = 48 CL + 2 × sync MIX (block_num=12, sync=true) → 2 × 12 × 3 = 72 CL + 2 × sync AIV (block_num=8, sync=true) → 2 × 8 × 1 = 16 CL + 1 × normal AIV (block_num=4, sync=false) → 1 × 4 × 1 = 4 CL + Round total: 140 CL + +6 rounds → 54 tasks (24 normal MIX + 12 sync MIX + 12 sync AIV + 6 normal AIV) +Grand total: 840 CL = 13440 float32 + +Stress coverage: + - 24 drain cycles (12 MIX + 12 AIV) → validates state cleanup + - 2 sync MIX + 2 sync AIV per round → CAS contention across shapes + - Normal tasks occupy clusters → forces drain slow path + - 54 tasks total → no task loss under sustained load + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +ROUNDS = 6 + +# shape constants: (slots_per_block, written_slots) +# MIX: kernel writes at base_cl + block_idx * 3 + {0,1,2}, 3 CL per block, all written +# AIV: kernel writes at base_cl + block_idx, 1 CL per block +SHAPE_MIX = "MIX" +SHAPE_AIV = "AIV" + +MIX_SLOTS = 3 +AIV_SLOTS = 1 + +NORMAL_MIX_BN = 4 +SYNC_MIX_BN = 12 +SYNC_AIV_BN = 8 +NORMAL_AIV_BN = 4 + + +def _build_tasks(): + """Returns list of (block_num, base_cl, shape_str).""" + tasks = [] + cl = 0 + for _ in range(ROUNDS): + # 4 × normal MIX + for _ in range(4): + tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX)) + cl += NORMAL_MIX_BN * MIX_SLOTS + # 2 × sync MIX + for _ in range(2): + tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX)) + cl += SYNC_MIX_BN * MIX_SLOTS + # 2 × sync AIV + for _ in range(2): + tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV)) + cl += SYNC_AIV_BN * AIV_SLOTS + # 1 × normal AIV + tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV)) + cl += NORMAL_AIV_BN * AIV_SLOTS + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS) # 840 + + +def generate_inputs(params: dict) -> list: 
+ output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl, shape in TASKS: + for block_idx in range(block_num): + if shape == SHAPE_MIX: + # MIX kernel writes float(block_idx) at all 3 slots + for slot in range(MIX_SLOTS): + cl = base_cl + block_idx * MIX_SLOTS + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + else: + # AIV kernel writes float(block_idx) at 1 slot + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py new file mode 100644 index 000000000..09c507863 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start stress test with mixed shapes. + +Submits 54 tasks (MIX + AIV) over 6 rounds to stress-test drain CAS contention, +ack barrier, and state cleanup between drain cycles. 
+Reuses AIC/AIV kernels from spmd_multiblock_mix and spmd_multiblock_aiv. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_stress_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1) + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + # func_id 3: standalone AIV kernel + { + "func_id": 3, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp new file mode 100644 index 000000000..04692f4fc --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Stress Orchestration (mixed shapes) + * + * Submits 6 rounds of mixed MIX + AIV tasks to stress-test: + * - Drain CAS contention (multiple sync_start tasks per round) + * - Ack barrier correctness (normal tasks occupy clusters during drain entry) + * - State cleanup between consecutive drain cycles + * + * Each round (9 tasks): + * 4 × normal MIX (block_num=4, sync=false) -> 4 × 4 × 3 = 48 CL + * 2 × sync MIX (block_num=12, sync=true) -> 2 × 12 × 3 = 72 CL + * 2 × sync AIV (block_num=8, sync=true) -> 2 × 8 × 1 = 16 CL + * 1 × normal AIV (block_num=4, sync=false) -> 1 × 4 × 1 = 4 CL + * Round total: 140 CL + * + * 6 rounds → 54 tasks total, 840 CL grand total. 
+ * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 +#define FUNC_SPMD_WRITE_AIV 3 + +static constexpr int32_t MIX_SLOTS = 3; +static constexpr int32_t NORMAL_MIX_BN = 4; +static constexpr int32_t SYNC_MIX_BN = 12; +static constexpr int32_t SYNC_AIV_BN = 8; +static constexpr int32_t NORMAL_AIV_BN = 4; +static constexpr int32_t ROUNDS = 6; + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 1}; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_block_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + for (int32_t r = 0; r < ROUNDS; r++) { + // 4 × normal MIX + for (int i = 0; i < 4; i++, cl += NORMAL_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, NORMAL_MIX_BN, cl, false); + + // 2 × sync MIX — CAS contention: second sync task may 
arrive while first is draining + for (int i = 0; i < 2; i++, cl += SYNC_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, SYNC_MIX_BN, cl, true); + + // 2 × sync AIV — cross-shape drain contention with the MIX drain above + for (int i = 0; i < 2; i++, cl += SYNC_AIV_BN) + submit_aiv(ext_output, SYNC_AIV_BN, cl, true); + + // 1 × normal AIV + submit_aiv(ext_output, NORMAL_AIV_BN, cl, false); + cl += NORMAL_AIV_BN; + } + + LOG_ALWAYS("[spmd_sync_start_stress] Submitted %d tasks over %d rounds", 9 * ROUNDS, ROUNDS); +} + +} // extern "C" diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 024846722..ec6e88488 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -248,6 +248,14 @@ class alignas(64) CoreTracker { return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value(); } + // Count total idle AIV cores (AIV0 + AIV1) across all clusters. + // Unlike get_valid_cluster_offset_states(AIV).count() which counts clusters with + // at least one idle AIV, this counts individual idle cores — a cluster with both + // AIV0 and AIV1 idle contributes 2, not 1. 
+ int32_t count_idle_aiv_cores() const { + return ((core_states_ >> 1) & aic_mask_).count() + ((core_states_ >> 2) & aic_mask_).count(); + } + // --- State mutation --- // Toggle bit at the given bit offset (running <-> idle) @@ -268,6 +276,8 @@ class alignas(64) CoreTracker { struct AicpuExecutor { int32_t orch_thread_num_; int32_t sched_thread_num_; + int32_t active_sched_threads_{0}; // Threads currently in dispatch loop (initially sched_thread_num_, becomes + // thread_num_ after orch→sched transition) bool orch_to_sched_{false}; // ===== Thread management state ===== @@ -297,6 +307,20 @@ struct AicpuExecutor { CoreTracker core_trackers_[MAX_AICPU_THREADS]; + // ===== sync_start drain coordination ===== + + // When sync_start_pending != 0, all scheduler threads skip Phase 2 dispatch + // (only process completions) until the drain worker finishes launching all blocks. + struct alignas(64) SyncStartDrainState { + std::atomic sync_start_pending{0}; // 0=normal; -1=initializing; >0=active (value=block_num) + std::atomic drain_worker_elected{0}; // 0=none; >0: elected thread's (thread_idx+1) + std::atomic drain_ack_mask{0}; // bit per thread; all-set = all threads finished dispatch + PTO2TaskSlotState *pending_task{nullptr}; // held task (not re-queued) + int32_t _pad[10]; + }; + static_assert(sizeof(SyncStartDrainState) == 64); + SyncStartDrainState drain_state_; + // ===== Task queue state (managed by scheduler ready queues) ===== // Task execution tracking @@ -540,17 +564,6 @@ struct AicpuExecutor { return count; } - /** - * Build per-core dispatch payload: copy tensor pointers and scalars into - * the per-core args[] array, then populate SPMD local context at the tail. - * - * Reads next_block_idx and block_num directly from the task descriptor - * to populate LocalContext. The caller is responsible for incrementing - * next_block_idx AFTER dispatch. 
- * - * GlobalContext (sub_block_id) is NOT written here — it is initialized once - * at runtime startup by init_global_context(). - */ void build_payload(PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot) { int32_t slot_idx = static_cast(subslot); uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]); @@ -564,7 +577,7 @@ struct AicpuExecutor { for (int32_t i = 0; i < payload.scalar_count; i++) { dispatch_payload.args[n++] = payload.scalars[i]; } - // Per-dispatch local context (read from slot state) + // Per-dispatch local context: read block_idx/block_num directly from slot_state. dispatch_payload.local_context.block_idx = slot_state.next_block_idx; dispatch_payload.local_context.block_num = slot_state.block_num; // Store context pointers at fixed suffix positions in args[] @@ -621,6 +634,242 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); core_exec_state.executing_reg_task_id = reg_task_id; } + + // Dispatch one SPMD block of a MIX task to the cluster at cluster_offset. + // Reads slot_state.next_block_idx as block_idx; caller increments it afterwards. 
+ void dispatch_mix_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state +#if PTO2_PROFILING + , + bool profiling_enabled +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + uint8_t core_mask = pto2_core_mask(slot_state.active_mask); + if (core_mask & PTO2_SUBTASK_MASK_AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV0) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv0_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV1) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv1_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + } + + // ===== sync_start drain helpers ===== + + // Take ownership of slot_state and signal all threads to enter drain mode. + // Returns true if this thread won the CAS and owns the drain slot. + // Returns false if another thread already holds drain; caller must re-push slot_state. + // + // Two-phase protocol: CAS 0 → -1 (sentinel) to claim ownership, store task and + // reset election flag, then release-store block_num. Other threads acquire-load + // sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible. + bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) { + int32_t expected = 0; + if (!drain_state_.sync_start_pending.compare_exchange_strong( + expected, -1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + return false; // Another thread already holds the drain slot. + } + // We own the drain slot. Store the task and reset election flag before making it visible. 
+ drain_state_.pending_task = slot_state; + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed); + // Release store: all stores above are now visible to any thread that + // acquire-loads sync_start_pending and sees block_num > 0. + drain_state_.sync_start_pending.store(block_num, std::memory_order_release); + return true; + } + + // Dispatch one SPMD block to the cluster at cluster_offset, routing to the correct core(s) + // based on shape. For AIV, picks whichever AIV core in the cluster is currently idle. + // Caller is responsible for incrementing slot_state.next_block_idx after this returns. + void dispatch_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, + PTO2ResourceShape shape +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + if (shape == PTO2ResourceShape::MIX) { + dispatch_mix_block_to_cluster( + runtime, thread_idx, cluster_offset, slot_state +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else if (shape == PTO2ResourceShape::AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else { // AIV + auto core_offset = tracker.is_aiv0_core_idle(cluster_offset) ? + tracker.get_aiv0_core_offset(cluster_offset) : + tracker.get_aiv1_core_offset(cluster_offset); + dispatch_subtask_to_core( + runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } +#if PTO2_PROFILING + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); +#endif + } + + // Count total available resources across all scheduler threads for a given shape. 
+ int32_t count_global_available(PTO2ResourceShape shape) { + int32_t total = 0; + for (int32_t t = 0; t < active_sched_threads_; t++) { + if (shape == PTO2ResourceShape::AIV) { + total += core_trackers_[t].count_idle_aiv_cores(); + } else { + total += core_trackers_[t].get_valid_cluster_offset_states(shape).count(); + } + } + return total; + } + + // Drain worker: dispatch all blocks in one pass across all threads' trackers. + // Called only when global resources >= block_num, so one pass always suffices. + // All other threads are spinning — the drain worker has exclusive tracker access. + void drain_worker_dispatch( + Runtime *runtime, int32_t block_num +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + PTO2TaskSlotState *slot_state = drain_state_.pending_task; + if (!slot_state) { + drain_state_.sync_start_pending.store(0, std::memory_order_release); + return; + } + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); + + for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) { + auto valid = core_trackers_[t].get_valid_cluster_offset_states(shape); + while (valid.has_value() && slot_state->next_block_idx < block_num) { + dispatch_block_to_cluster( + runtime, t, valid.pop_first(), *slot_state, shape +#if PTO2_PROFILING + , + profiling_enabled, phase_dispatch_count +#endif + ); + slot_state->next_block_idx++; + if (slot_state->next_block_idx < block_num) + valid = core_trackers_[t].get_valid_cluster_offset_states(shape); + } + } + + // All blocks dispatched — clear drain state. + // Release fence ensures tracker mutations are visible to threads that + // acquire-load sync_start_pending == 0 and resume normal operation. 
+ std::atomic_thread_fence(std::memory_order_release); + drain_state_.pending_task = nullptr; + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed); + drain_state_.sync_start_pending.store(0, std::memory_order_release); + } + + // Called by each scheduler thread when drain_state_.sync_start_pending != 0. + // + // Three-phase protocol: + // 1. Ack barrier: all threads signal they've stopped Phase 2 dispatch. + // If not all acked yet, return to Phase 1 (completion polling). + // 2. Resource check: elected thread verifies global idle resources >= block_num. + // If insufficient, reset election state and return — all threads resume + // Phase 1 to free running cores, then retry next iteration. + // 3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed). + // Non-elected threads spin-wait until sync_start_pending == 0. + // During dispatch the elected thread has exclusive tracker access. + void handle_drain_mode( + Runtime *runtime, int32_t thread_idx +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + // Spin until drain is fully initialized (sentinel -1 → block_num > 0). + int32_t block_num; + do { + block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire); + } while (block_num < 0); + if (block_num == 0) return; + + // Phase 1: Ack barrier — signal this thread has stopped Phase 2 dispatch. + uint32_t all_acked = (1u << active_sched_threads_) - 1; + drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release); + + // If not all threads have acked, return to do Phase 1 (completion polling). + if ((drain_state_.drain_ack_mask.load(std::memory_order_acquire) & all_acked) != all_acked) return; + + // Phase 2: Election — exactly one thread wins the CAS. 
+ int32_t expected = 0; + drain_state_.drain_worker_elected.compare_exchange_strong( + expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed + ); + + if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) { + // Non-elected: spin-wait for drain completion or resource-insufficient reset. + while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return; + SPIN_WAIT_HINT(); + } + return; + } + + // Elected: check if global resources are sufficient. + PTO2TaskSlotState *slot_state = drain_state_.pending_task; + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); + int32_t available = count_global_available(shape); + + if (available < block_num) { + // Insufficient resources — reset election, let all threads do Phase 1. + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + return; + } + + // Phase 3: Dispatch — all other threads are spinning, exclusive tracker access. + drain_worker_dispatch( + runtime, block_num +#if PTO2_PROFILING + , + profiling_enabled, phase_dispatch_count +#endif + ); + } }; static AicpuExecutor g_aicpu_executor; @@ -752,6 +1001,7 @@ bool AicpuExecutor::assign_cores_to_threads() { // Mark orchestrator threads explicitly (no cores). 
for (int32_t t = divisor; t < thread_num_; t++) { + core_trackers_[t].init(0); DEV_INFO("Thread %d: orchestrator (0 cores)", t); } @@ -776,11 +1026,12 @@ bool AicpuExecutor::assign_cores_to_threads() { DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid); } - for (int32_t t = 0; t < divisor; t++) { + for (int32_t t = 0; t < thread_num_; t++) { core_count_per_thread_[t] = core_idx[t]; DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], core_trackers_[t].get_cluster_count()); } + active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : thread_num_; return true; } @@ -853,6 +1104,7 @@ void AicpuExecutor::reassign_cores_for_all_threads() { core_trackers_[t].get_cluster_count(), aic_running, aiv_running ); } + active_sched_threads_ = thread_num_; } int32_t AicpuExecutor::init(Runtime *runtime) { @@ -1215,8 +1467,25 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif bool try_pushed = false; + + // Phase 2 drain check: if a sync_start task is waiting for resources, + // pause normal dispatch and let the drain protocol run. + // The acquire load pairs with the release store in enter_drain_mode, so the + // pending task and the ack/election resets are visible before handle_drain_mode runs.
+ if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + handle_drain_mode( + runtime, thread_idx +#if PTO2_PROFILING + , + profiling_enabled, phase_dispatch_count +#endif + ); + continue; + } + const PTO2ResourceShape *dispatch_order = get_dispatch_order(thread_idx); - for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { + bool entered_drain = false; + for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES && !entered_drain; si++) { PTO2ResourceShape shape = dispatch_order[si]; auto valid_cluster_states = tracker.get_valid_cluster_offset_states(shape); if (!valid_cluster_states.has_value()) { @@ -1224,7 +1493,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } auto &local_buf = local_bufs[static_cast(shape)]; - while (valid_cluster_states.has_value()) { + while (valid_cluster_states.has_value() && !entered_drain) { int want = valid_cluster_states.count(); PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS]; int got = pop_ready_tasks_batch( @@ -1242,75 +1511,47 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #if PTO2_SCHED_PROFILING uint64_t t_setup_start = get_sys_cnt_aicpu(); #endif + // sync_start: all blocks must dispatch atomically. + // Fast path — enough local slots: fall through to normal dispatch loop below. + // Slow path — not enough: enter drain mode, then re-push all remaining + // tasks in the batch so nothing is lost. + // For AIV, one cluster can serve 2 blocks (AIV0 + AIV1), so compare against + // idle AIV core count rather than cluster count. + if (pto2_requires_sync_start(slot_state->active_mask)) { + int32_t available = (shape == PTO2ResourceShape::AIV) ? tracker.count_idle_aiv_cores() : + valid_cluster_states.count(); + if (available < slot_state->block_num) { + if (!enter_drain_mode(slot_state, slot_state->block_num)) { + // CAS lost: drain already active for another task; re-push and wait. 
+ rt->scheduler.ready_queues[static_cast(shape)].push(slot_state); + } + // Re-push all unprocessed tasks remaining in this batch. + for (int rem = bi + 1; rem < got; rem++) { + rt->scheduler.ready_queues[static_cast(shape)].push(batch[rem]); + } + entered_drain = true; + break; + } + // Fast path: enough local resources, fall through to normal dispatch. + } + // Dispatch as many blocks as possible for this task using available clusters. // For block_num=1 the inner body executes exactly once (no overhead). do { auto current_valid_cluster_offset = valid_cluster_states.pop_first(); - if (shape == PTO2ResourceShape::MIX) { - // Full-cluster: all active subtasks share the same block_idx. - uint8_t mask = slot_state->active_mask; - if (mask & PTO2_SUBTASK_MASK_AIC) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aic_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - } - if (mask & PTO2_SUBTASK_MASK_AIV0) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aiv0_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIV0 + dispatch_block_to_cluster( + runtime, thread_idx, current_valid_cluster_offset, *slot_state, shape #if PTO2_PROFILING - , - profiling_enabled + , + profiling_enabled, phase_dispatch_count #endif - ); - } - if (mask & PTO2_SUBTASK_MASK_AIV1) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aiv1_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIV1 -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - } - slot_state->next_block_idx++; - } else if (shape == PTO2ResourceShape::AIC) { - dispatch_subtask_to_core( - runtime, thread_idx, tracker.get_aic_core_offset(current_valid_cluster_offset), - *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - slot_state->next_block_idx++; - } else { // shape == PTO2ResourceShape::AIV - auto 
core_offset = tracker.is_aiv0_core_idle(current_valid_cluster_offset) ? - tracker.get_aiv0_core_offset(current_valid_cluster_offset) : - tracker.get_aiv1_core_offset(current_valid_cluster_offset); - dispatch_subtask_to_core( - runtime, thread_idx, core_offset, *slot_state, PTO2SubtaskSlot::AIV0 -#if PTO2_PROFILING - , - profiling_enabled -#endif - ); - slot_state->next_block_idx++; - // Refresh idle state so the do-while naturally picks up - // the other AIV in the same cluster on the next iteration. - if (slot_state->next_block_idx < slot_state->block_num) { - valid_cluster_states = tracker.get_valid_cluster_offset_states(shape); - } + ); + slot_state->next_block_idx++; + // For AIV, refresh cluster states so the do-while can pick up the + // other AIV core in the same cluster on the next iteration. + if (shape == PTO2ResourceShape::AIV && slot_state->next_block_idx < slot_state->block_num) { + valid_cluster_states = tracker.get_valid_cluster_offset_states(shape); } -#if PTO2_PROFILING - phase_dispatch_count += __builtin_popcount(slot_state->active_mask); -#endif DEV_DEBUG( "Thread %d: Dispatched %s task %" PRId64 " block %d/%d to cluster_offset %d", thread_idx, shape_name(shape), static_cast(slot_state->task->task_id.raw), @@ -1859,6 +2100,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #endif + // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). + for (int i = 0; i < orch_thread_num_; i++) { + rt->orchestrators[i].total_cluster_count = aic_count_; + rt->orchestrators[i].total_aiv_count = aiv_count_; + } + // With multi-ring, slot_states are per-ring inside the scheduler. 
runtime->set_pto2_slot_states_ptr(nullptr); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index c467fb667..ccbb6b40b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -346,6 +346,21 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke active_mask = pto2_mixed_kernels_to_active_mask(normalized); } + // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1) + if (block_num > 1 && args.launch_spec.require_sync_start()) { + // Deadlock check: block_num >= total available slots of the required type. + // For MIX/AIC: limit is total_cluster_count (one AIC per cluster). + // For AIV: limit is total_aiv_count. + PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); + int32_t limit = (shape == PTO2ResourceShape::AIV) ? 
orch->total_aiv_count : orch->total_cluster_count; + if (limit > 0 && block_num > limit) { + LOG_ERROR("FATAL: require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit); + orch->fatal = true; + return TaskOutputTensors{}; + } + active_mask |= PTO2_SUBTASK_FLAG_SYNC_START; + } + // Submission without an open scope is illegal always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); @@ -583,7 +598,8 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke cur_slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); cur_slot_state.fanout_refcount.store(0, std::memory_order_relaxed); cur_slot_state.completed_subtasks.store(0, std::memory_order_relaxed); - cur_slot_state.total_required_subtasks = static_cast(block_num * __builtin_popcount(active_mask)); + cur_slot_state.total_required_subtasks = + static_cast(block_num * __builtin_popcount(pto2_core_mask(active_mask))); cur_slot_state.block_num = block_num; cur_slot_state.next_block_idx = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index a40a6f7ce..0d9d94276 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -70,6 +70,10 @@ struct PTO2OrchestratorState { // Note: In simulated mode, orchestrator and scheduler share address space // In real mode, they communicate via shared memory only PTO2SchedulerState *scheduler; // For simulated mode only + + // Total core counts set once at executor init; used for submit-time deadlock detection. + int32_t total_cluster_count{0}; // AIC cores = MIX clusters + int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING // Runtime profiling switch copied from Runtime::enable_profiling. 
bool enable_profiling; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h index 90e0397ad..9901f12ba 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h @@ -39,9 +39,10 @@ enum class PTO2SubtaskSlot : uint8_t { /** * Subtask mask bits (for active_mask / subtask_done_mask) */ -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 +inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3); // 0x8: all blocks must launch atomically /** * Test whether a subtask slot is active in a given mask @@ -50,6 +51,18 @@ static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { return (mask & (1u << static_cast(slot))) != 0; } +/** + * Extract only the core bits from active_mask (strips flag bits). + */ +static inline uint8_t pto2_core_mask(uint8_t active_mask) { return active_mask & 0x07u; } + +/** + * Check whether a task requires all blocks to be launched atomically. + */ +static inline bool pto2_requires_sync_start(uint8_t active_mask) { + return (active_mask & PTO2_SUBTASK_FLAG_SYNC_START) != 0; +} + /** * Mixed-task submit contract. * @@ -83,9 +96,10 @@ inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3; * Caller must ensure active_mask is valid (at least one bit set). 
*/ static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { - int bit_count = __builtin_popcount(active_mask); + uint8_t core_mask = pto2_core_mask(active_mask); + int bit_count = __builtin_popcount(core_mask); if (bit_count >= 2) return PTO2ResourceShape::MIX; - if (active_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC; + if (core_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC; return PTO2ResourceShape::AIV; } @@ -114,6 +128,10 @@ class PTO2LaunchSpec { int16_t block_num() const { return block_num_; } void set_block_num(int16_t n) { block_num_ = n; } + bool require_sync_start() const { return require_sync_start_; } + void set_require_sync_start(bool v) { require_sync_start_ = v; } + private: int16_t block_num_{1}; + bool require_sync_start_{false}; }; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 411cef710..6af392522 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -48,7 +48,7 @@ #define RUNTIME_MAX_WORKER 72 // 24 AIC + 48 AIV cores #define RUNTIME_MAX_TENSOR_PAIRS 64 #define RUNTIME_MAX_FUNC_ID 32 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 1MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO // Default ready queue shards: one shard per worker thread (total minus orchestrator) constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; From e60616d9a7bd5c4e5fe6cbe25fba66e499e22ed6 Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 3 Apr 2026 21:51:34 +0800 Subject: [PATCH 2/2] Add: require_sync_start for a5 SPMD (port from a2a3) Port the complete require_sync_start / drain mode implementation from a2a3 to a5 tensormap_and_ringbuffer runtime: - pto_submit_types.h: add PTO2_SUBTASK_FLAG_SYNC_START, pto2_core_mask, 
pto2_requires_sync_start; fix pto2_active_mask_to_shape to strip flag bits; extend PTO2LaunchSpec with require_sync_start - pto_orchestrator: add total_cluster_count/total_aiv_count for deadlock detection; encode sync_start flag in active_mask at submit time; fix total_required_subtasks popcount to use pto2_core_mask - aicpu_executor: add SyncStartDrainState, active_sched_threads, count_idle_aiv_cores, three-phase drain protocol (ack barrier, global resource check, exclusive dispatch); modify scheduler main loop with drain check and sync_start fast/slow path branching - Add 5 test examples: spmd_sync_start, spmd_sync_start_aiv, spmd_sync_start_edge, spmd_sync_start_stress, spmd_starvation --- .../spmd_starvation/golden.py | 84 +++++ .../spmd_starvation/kernels/kernel_config.py | 53 +++ .../orchestration/spmd_starvation_orch.cpp | 100 ++++++ .../spmd_sync_start/golden.py | 66 ++++ .../spmd_sync_start/kernels/kernel_config.py | 52 +++ .../orchestration/spmd_sync_start_orch.cpp | 82 +++++ .../spmd_sync_start_aiv/golden.py | 62 ++++ .../kernels/kernel_config.py | 41 +++ .../spmd_sync_start_aiv_orch.cpp | 76 +++++ .../spmd_sync_start_edge/golden.py | 66 ++++ .../kernels/kernel_config.py | 52 +++ .../spmd_sync_start_edge_orch.cpp | 82 +++++ .../spmd_sync_start_stress/golden.py | 104 ++++++ .../kernels/kernel_config.py | 62 ++++ .../spmd_sync_start_stress_orch.cpp | 109 ++++++ .../aicpu/aicpu_executor.cpp | 314 +++++++++++++++++- .../runtime/pto_orchestrator.cpp | 18 +- .../runtime/pto_orchestrator.h | 4 + .../runtime/pto_submit_types.h | 28 +- 19 files changed, 1445 insertions(+), 10 deletions(-) create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py 
create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py new file mode 100644 index 000000000..2e85b0fb6 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py @@ -0,0 +1,84 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD starvation prevention. + +Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and +verifies all 20 tasks complete with correct output. The test validates that +the drain mechanism prevents sync_start tasks from being starved. + +Layout: + Wave 1: 6 x normal(block_num=4) -> CL 0..71 + Sync 0: 1 x sync_start(block_num=6) -> CL 72..89 + Wave 2: 6 x normal(block_num=4) -> CL 90..161 + Sync 1: 1 x sync_start(block_num=6) -> CL 162..179 + Wave 3: 6 x normal(block_num=4) -> CL 180..251 + +Total: 252 CL = 4032 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 +NORMAL_BLOCK_NUM = 4 +SYNC_BLOCK_NUM = 6 +NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK # 12 +SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK # 18 + + +# Build flat task list as (block_num, base_cl) +def _build_tasks(): + tasks = [] + cl = 0 + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + tasks.append((SYNC_BLOCK_NUM, cl)) + cl += SYNC_CL + for _ in range(6): + tasks.append((NORMAL_BLOCK_NUM, cl)) + cl += NORMAL_CL + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS) # 252 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in 
range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py new file mode 100644 index 000000000..a613c65ca --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py @@ -0,0 +1,53 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD starvation prevention test (tensormap_and_ringbuffer Runtime). + +Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks to verify +the drain mechanism prevents sync_start tasks from being starved. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp new file mode 100644 index 000000000..2381c5a38 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Starvation Prevention Orchestration + * + * Submits a large wave of normal MIX tasks followed by sync_start tasks, + * then another wave of normal tasks. The drain mechanism must ensure the + * sync_start tasks are not indefinitely delayed by the surrounding load. + * + * Layout: 3 waves x 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6) + * + * Normal task: block_num=4, require_sync_start=false -> 4 blocks x 3 slots = 12 CL each + * Sync task: block_num=6, require_sync_start=true -> 6 blocks x 3 slots = 18 CL each + * + * Total CL: 3x6x12 + 2x18 = 216 + 36 = 252 + * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +static constexpr int32_t SLOTS_PER_BLOCK = 3; // AIC, AIV0, AIV1 +static constexpr int32_t NORMAL_BLOCK_NUM = 4; +static constexpr int32_t SYNC_BLOCK_NUM = 6; +static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK; // 12 +static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK; // 18 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void 
+aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + // Wave 1: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + // Sync-start task 0: must not be starved by wave 1 or wave 2 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 2: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + // Sync-start task 1: must not be starved by wave 2 or wave 3 + submit_mix(ext_output, SYNC_BLOCK_NUM, cl, true); + cl += SYNC_CL; + + // Wave 3: 6 normal MIX tasks + for (int i = 0; i < 6; i++, cl += NORMAL_CL) + submit_mix(ext_output, NORMAL_BLOCK_NUM, cl, false); + + LOG_ALWAYS("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py new file mode 100644 index 000000000..33acd1c1a --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start. + +Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies +all blocks of every task write the correct float(block_idx) to their cache line. + +Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2): + T0: block_num=2, sync_start=True -> CL 0..5 + T1: block_num=8, sync_start=True -> CL 6..29 + T2: block_num=2, sync_start=False -> CL 30..35 (baseline) + T3: block_num=12, sync_start=True -> CL 36..71 + +Output tensor: 72 cache lines = 1152 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (2, 0), # T0: sync_start=True + (8, 6), # T1: sync_start=True + (2, 30), # T2: sync_start=False (baseline) + (12, 36), # T3: sync_start=True +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 72 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [ + ("output", output), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py new file mode 100644 index 000000000..95f706c9d --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO 
Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime). + +Submits MIX tasks with require_sync_start=true to verify atomic batch launch. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp 
b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp new file mode 100644 index 000000000..edeef95d6 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Orchestration + * + * Submits MIX tasks with require_sync_start=true to verify that the scheduler + * atomically launches all blocks before any can run. + * + * Tasks: + * T0: block_num=2, require_sync_start=true (basic sync launch) + * T1: block_num=8, require_sync_start=true (larger batch) + * T2: block_num=2, require_sync_start=false (normal, as baseline) + * T3: block_num=12, require_sync_start=true (cross-thread batch) + * + * Each block writes float(block_idx) to its allocated cache-line slot, + * identical to spmd_multiblock_mix so the same kernel binaries can be reused. 
+ * + * Args layout: [output] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 2 blocks, sync_start=true (6 CL) + submit_mix(ext_output, 2, 0, true); + // T1: 8 blocks, sync_start=true (24 CL) + submit_mix(ext_output, 8, 6, true); + // T2: 2 blocks, sync_start=false (6 CL, baseline) + submit_mix(ext_output, 2, 30, false); + // T3: 12 blocks, sync_start=true (36 CL) + submit_mix(ext_output, 12, 36, true); + + LOG_ALWAYS("[spmd_sync_start] Submitted 4 tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py new file mode 100644 index 000000000..3c60f1ac8 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start with AIV-only tasks. + +Submits 4 AIV tasks (3 with require_sync_start=true, 1 baseline) to exercise +the AIV-specific fast path (count_idle_aiv_cores) and drain slow path. + +Tasks: + T0: block_num=4, sync_start=True -> CL 0..3 (fast path) + T1: block_num=16, sync_start=True -> CL 4..19 (saturate one thread) + T2: block_num=4, sync_start=False -> CL 20..23 (baseline) + T3: block_num=24, sync_start=True -> CL 24..47 (cross-thread drain) + +Output tensor: 48 cache lines = 768 float32. 
+ +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (4, 0), # T0: sync_start=True, fast path + (16, 4), # T1: sync_start=True, saturate single thread + (4, 20), # T2: sync_start=False, baseline + (24, 24), # T3: sync_start=True, cross-thread drain +] + +TOTAL_CL = sum(block_num for block_num, _ in TASKS) # 48 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py new file mode 100644 index 000000000..77102a658 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py @@ -0,0 +1,41 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start AIV test (tensormap_and_ringbuffer Runtime). + +Submits AIV tasks with require_sync_start=true to verify atomic batch launch +and the AIV-specific fast path (count_idle_aiv_cores). +Reuses the same AIV kernel from spmd_multiblock_aiv. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_aiv_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp new file mode 100644 index 000000000..fa55cee3c --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start AIV Orchestration + * + * Submits AIV-only tasks with require_sync_start=true to exercise: + * - AIV fast path: count_idle_aiv_cores() >= block_num (small block_num) + * - AIV drain path: block_num exceeds local AIV cores (cross-thread drain) + * + * Tasks: + * T0: block_num=4, require_sync_start=true (fast path) + * T1: block_num=16, require_sync_start=true (saturate one thread: 8 clusters x 2 AIV) + * T2: block_num=4, require_sync_start=false (baseline) + * T3: block_num=24, require_sync_start=true (cross-thread drain) + * + * Each block writes float(block_idx) at (base_cl + block_idx) x FLOATS_PER_CACHE_LINE, + * reusing the kernel from spmd_multiblock_aiv. + * + * Args layout: [output] + */ + +#include <cstdint> +#include <cstddef> + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_WRITE_AIV 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: 4 blocks, sync_start=true (fast path: 4 <= idle AIV cores on
one thread) + submit_aiv(ext_output, 4, 0, true); + // T1: 16 blocks, sync_start=true (saturate: 8 clusters x 2 AIV = 16 cores) + submit_aiv(ext_output, 16, 4, true); + // T2: 4 blocks, sync_start=false (baseline) + submit_aiv(ext_output, 4, 20, false); + // T3: 24 blocks, sync_start=true (cross-thread drain) + submit_aiv(ext_output, 24, 24, true); + + LOG_ALWAYS("[spmd_sync_start_aiv] Submitted 4 AIV tasks (3 sync_start + 1 baseline)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py new file mode 100644 index 000000000..2bfcaea4a --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py @@ -0,0 +1,66 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start boundary conditions. + +Tests edge-case block_num values relative to per-thread cluster capacity (8 clusters +with 3 sched threads = 24 total clusters, 48 total AIV cores). 
+ +MIX tasks (SLOTS_PER_BLOCK=3): + T0: block_num=1, sync_start=True -> CL 0..2 (degenerate: always fast path) + T1: block_num=8, sync_start=True -> CL 3..26 (exactly one thread's capacity) + T2: block_num=9, sync_start=True -> CL 27..53 (one over: must enter drain) + T3: block_num=23, sync_start=True -> CL 54..122 (max valid: total_clusters - 1) + T4: block_num=1, sync_start=False -> CL 123..125 (baseline) + +Output tensor: 126 cache lines = 2016 float32. + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +SLOTS_PER_BLOCK = 3 # AIC, AIV0, AIV1 + +# (block_num, base_cl) for each submitted task +TASKS = [ + (1, 0), # T0: sync=True, degenerate + (8, 3), # T1: sync=True, exactly one thread's clusters + (9, 27), # T2: sync=True, one over -> drain + (23, 54), # T3: sync=True, max valid (total_clusters - 1) + (1, 123), # T4: sync=False, baseline +] + +TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS) # 126 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl in TASKS: + for block_idx in range(block_num): + for slot in range(SLOTS_PER_BLOCK): + cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py new file mode 100644 index 000000000..84488dd71 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py @@ -0,0 +1,52 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start boundary test (tensormap_and_ringbuffer Runtime). + +Tests edge-case block_num values relative to per-thread cluster capacity. +Reuses the same AIC/AIV kernels from spmd_multiblock_mix. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_edge_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp 
b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp new file mode 100644 index 000000000..ad502c130 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Boundary Orchestration + * + * Tests edge-case block_num values relative to per-thread cluster capacity + * (8 clusters per sched thread, 24 total clusters). 
+ * + * Tasks: + * T0: block_num=1, sync_start=true (degenerate: always fast path) + * T1: block_num=8, sync_start=true (exactly one thread's capacity) + * T2: block_num=9, sync_start=true (one over: must enter drain) + * T3: block_num=23, sync_start=true (max valid: total_clusters - 1) + * T4: block_num=1, sync_start=false (baseline) + * + * Args layout: [output] + */ + +#include <cstdint> +#include <cstddef> + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 1, + }; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; // NOLINT(readability/casting) + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + // T0: block_num=1, sync_start=true (degenerate: always fast path, 3 CL) + submit_mix(ext_output, 1, 0, true); + // T1: block_num=8, sync_start=true (exactly one thread's cluster capacity, 24 CL) + submit_mix(ext_output, 8, 3, true); + // T2: block_num=9, sync_start=true (one over single thread -> must drain, 27 CL) + submit_mix(ext_output, 9, 27, true); + // T3: block_num=23, sync_start=true (max valid = total_clusters - 1, 69 CL) +
submit_mix(ext_output, 23, 54, true); + // T4: block_num=1, sync_start=false (baseline, 3 CL) + submit_mix(ext_output, 1, 123, false); + + LOG_ALWAYS("[spmd_sync_start_edge] Submitted 5 tasks: block_num=1,8,9,23 (sync) + 1 (baseline)"); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py new file mode 100644 index 000000000..3315360df --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py @@ -0,0 +1,104 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Golden test for SPMD sync_start stress / CAS contention with mixed shapes. + +Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, ack +barrier, and state cleanup across drain cycles. All three resource shapes +(MIX, AIV, AIC) are exercised with both sync and non-sync modes. 
+ +Each round (9 tasks): + 4 x normal MIX (block_num=4, sync=false) -> 4 x 4 x 3 = 48 CL + 2 x sync MIX (block_num=12, sync=true) -> 2 x 12 x 3 = 72 CL + 2 x sync AIV (block_num=8, sync=true) -> 2 x 8 x 1 = 16 CL + 1 x normal AIV (block_num=4, sync=false) -> 1 x 4 x 1 = 4 CL + Round total: 140 CL + +6 rounds -> 54 tasks (24 normal MIX + 12 sync MIX + 12 sync AIV + 6 normal AIV) +Grand total: 840 CL = 13440 float32 + +Args layout: [output] +""" + +import torch + +__outputs__ = ["output"] +RTOL = 0 +ATOL = 0 + +ALL_CASES = { + "Case1": {}, +} + +DEFAULT_CASE = "Case1" + +FLOATS_PER_CACHE_LINE = 16 +ROUNDS = 6 + +# shape constants: (slots_per_block, written_slots) +# MIX: kernel writes at base_cl + block_idx * 3 + {0,1,2}, 3 CL per block, all written +# AIV: kernel writes at base_cl + block_idx, 1 CL per block +SHAPE_MIX = "MIX" +SHAPE_AIV = "AIV" + +MIX_SLOTS = 3 +AIV_SLOTS = 1 + +NORMAL_MIX_BN = 4 +SYNC_MIX_BN = 12 +SYNC_AIV_BN = 8 +NORMAL_AIV_BN = 4 + + +def _build_tasks(): + """Returns list of (block_num, base_cl, shape_str).""" + tasks = [] + cl = 0 + for _ in range(ROUNDS): + # 4 x normal MIX + for _ in range(4): + tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX)) + cl += NORMAL_MIX_BN * MIX_SLOTS + # 2 x sync MIX + for _ in range(2): + tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX)) + cl += SYNC_MIX_BN * MIX_SLOTS + # 2 x sync AIV + for _ in range(2): + tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV)) + cl += SYNC_AIV_BN * AIV_SLOTS + # 1 x normal AIV + tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV)) + cl += NORMAL_AIV_BN * AIV_SLOTS + return tasks + + +TASKS = _build_tasks() +TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS) # 840 + + +def generate_inputs(params: dict) -> list: + output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32) + return [("output", output)] + + +def compute_golden(tensors: dict, params: dict) -> None: + out = torch.as_tensor(tensors["output"]) + for block_num, base_cl, shape in TASKS: + for 
block_idx in range(block_num): + if shape == SHAPE_MIX: + # MIX kernel writes float(block_idx) at all 3 slots + for slot in range(MIX_SLOTS): + cl = base_cl + block_idx * MIX_SLOTS + slot + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + else: + # AIV kernel writes float(block_idx) at 1 slot + cl = base_cl + block_idx + out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx) + tensors["output"][:] = out diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py new file mode 100644 index 000000000..09c507863 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py @@ -0,0 +1,62 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +""" +Kernel configuration for SPMD sync_start stress test with mixed shapes. + +Submits 54 tasks (MIX + AIV) over 6 rounds to stress-test drain CAS contention, +ack barrier, and state cleanup between drain cycles. +Reuses AIC/AIV kernels from spmd_multiblock_mix and spmd_multiblock_aiv. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent +_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels" +_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels" + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_stress_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1) + { + "func_id": 0, + "name": "SPMD_MIX_AIC", + "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "SPMD_MIX_AIV0", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "SPMD_MIX_AIV1", + "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"), + "core_type": "aiv", + }, + # func_id 3: standalone AIV kernel + { + "func_id": 3, + "name": "SPMD_WRITE_AIV", + "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 1, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp new file mode 100644 index 000000000..ddbbea1de --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SPMD Sync-Start Stress Orchestration (mixed shapes) + * + * Submits 6 rounds of mixed MIX + AIV tasks to stress-test: + * - Drain CAS contention (multiple sync_start tasks per round) + * - Ack barrier correctness (normal tasks occupy clusters during drain entry) + * - State cleanup between consecutive drain cycles + * + * Each round (9 tasks): + * 4 x normal MIX (block_num=4, sync=false) -> 4 x 4 x 3 = 48 CL + * 2 x sync MIX (block_num=12, sync=true) -> 2 x 12 x 3 = 72 CL + * 2 x sync AIV (block_num=8, sync=true) -> 2 x 8 x 1 = 16 CL + * 1 x normal AIV (block_num=4, sync=false) -> 1 x 4 x 1 = 4 CL + * Round total: 140 CL + * + * 6 rounds -> 54 tasks total, 840 CL grand total. 
+ * + * Args layout: [output] + */ + +#include <cstdint> +#include <cstddef> + +#include "pto_orchestration_api.h" + +#define FUNC_SPMD_MIX_AIC 0 +#define FUNC_SPMD_MIX_AIV0 1 +#define FUNC_SPMD_MIX_AIV1 2 +#define FUNC_SPMD_WRITE_AIV 3 + +static constexpr int32_t MIX_SLOTS = 3; +static constexpr int32_t NORMAL_MIX_BN = 4; +static constexpr int32_t SYNC_MIX_BN = 12; +static constexpr int32_t SYNC_AIV_BN = 8; +static constexpr int32_t NORMAL_AIV_BN = 4; +static constexpr int32_t ROUNDS = 6; + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 1}; +} + +static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + MixedKernels mk; + mk.aic_kernel_id = FUNC_SPMD_MIX_AIC; + mk.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0; + mk.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1; + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_task(mk, args); +} + +static void submit_aiv(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) { + Arg args; + args.add_inout(out); + args.add_scalar(base_cl); + args.launch_spec.set_core_num(block_num); + args.launch_spec.set_require_sync_start(sync_start); + pto2_rt_submit_aiv_task(FUNC_SPMD_WRITE_AIV, args); +} + +__attribute__((visibility("default"))) void +aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; + if (orch_thread_index != 0) return; + + Tensor ext_output = from_tensor_arg(orch_args.tensor(0)); + + int64_t cl = 0; + + for (int32_t r = 0; r < ROUNDS; r++) { + // 4 x normal MIX + for (int i = 0; i < 4; i++, cl += NORMAL_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, NORMAL_MIX_BN, cl, false); + + // 2 x sync MIX — CAS contention: second sync task may
arrive while first is draining + for (int i = 0; i < 2; i++, cl += SYNC_MIX_BN * MIX_SLOTS) + submit_mix(ext_output, SYNC_MIX_BN, cl, true); + + // 2 x sync AIV — cross-shape drain contention with the MIX drain above + for (int i = 0; i < 2; i++, cl += SYNC_AIV_BN) + submit_aiv(ext_output, SYNC_AIV_BN, cl, true); + + // 1 x normal AIV + submit_aiv(ext_output, NORMAL_AIV_BN, cl, false); + cl += NORMAL_AIV_BN; + } + + LOG_ALWAYS("[spmd_sync_start_stress] Submitted %d tasks over %d rounds", 9 * ROUNDS, ROUNDS); +} + +} // extern "C" diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 07acbbbf4..238cccfec 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -248,6 +248,14 @@ class alignas(64) CoreTracker { return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value(); } + // Count total idle AIV cores (AIV0 + AIV1) across all clusters. + // Unlike get_valid_cluster_offset_states(AIV).count() which counts clusters with + // at least one idle AIV, this counts individual idle cores — a cluster with both + // AIV0 and AIV1 idle contributes 2, not 1. 
+ int32_t count_idle_aiv_cores() const { + return ((core_states_ >> 1) & aic_mask_).count() + ((core_states_ >> 2) & aic_mask_).count(); + } + // --- State mutation --- // Toggle bit at the given bit offset (running <-> idle) @@ -268,6 +276,8 @@ class alignas(64) CoreTracker { struct AicpuExecutor { int32_t orch_thread_num_; int32_t sched_thread_num_; + int32_t active_sched_threads_{0}; // Threads currently in dispatch loop (initially sched_thread_num_, becomes + // thread_num_ after orch→sched transition) bool orch_to_sched_{false}; // ===== Thread management state ===== @@ -297,6 +307,20 @@ struct AicpuExecutor { CoreTracker core_trackers_[MAX_AICPU_THREADS]; + // ===== sync_start drain coordination ===== + + // When sync_start_pending != 0, all scheduler threads skip Phase 2 dispatch + // (only process completions) until the drain worker finishes launching all blocks. + struct alignas(64) SyncStartDrainState { + std::atomic sync_start_pending{0}; // 0=normal; -1=initializing; >0=active (value=block_num) + std::atomic drain_worker_elected{0}; // 0=none; >0: elected thread's (thread_idx+1) + std::atomic drain_ack_mask{0}; // bit per thread; all-set = all threads finished dispatch + PTO2TaskSlotState *pending_task{nullptr}; // held task (not re-queued) + int32_t _pad[10]; + }; + static_assert(sizeof(SyncStartDrainState) == 64); + SyncStartDrainState drain_state_; + // ===== Task queue state (managed by scheduler ready queues) ===== // Task execution tracking @@ -621,6 +645,242 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); core_exec_state.executing_reg_task_id = reg_task_id; } + + // Dispatch one SPMD block of a MIX task to the cluster at cluster_offset. + // Reads slot_state.next_block_idx as block_idx; caller increments it afterwards. 
+ void dispatch_mix_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state +#if PTO2_PROFILING + , + bool profiling_enabled +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + uint8_t core_mask = pto2_core_mask(slot_state.active_mask); + if (core_mask & PTO2_SUBTASK_MASK_AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV0) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv0_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV1) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv1_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } + } + + // ===== sync_start drain helpers ===== + + // Take ownership of slot_state and signal all threads to enter drain mode. + // Returns true if this thread won the CAS and owns the drain slot. + // Returns false if another thread already holds drain; caller must re-push slot_state. + // + // Two-phase protocol: CAS 0 → -1 (sentinel) to claim ownership, store task and + // reset election flag, then release-store block_num. Other threads acquire-load + // sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible. + bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) { + int32_t expected = 0; + if (!drain_state_.sync_start_pending.compare_exchange_strong( + expected, -1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + return false; // Another thread already holds the drain slot. + } + // We own the drain slot. Store the task and reset election flag before making it visible. 
+ drain_state_.pending_task = slot_state; + drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed); + drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed); + // Release store: all stores above are now visible to any thread that + // acquire-loads sync_start_pending and sees block_num > 0. + drain_state_.sync_start_pending.store(block_num, std::memory_order_release); + return true; + } + + // Dispatch one SPMD block to the cluster at cluster_offset, routing to the correct core(s) + // based on shape. For AIV, picks whichever AIV core in the cluster is currently idle. + // Caller is responsible for incrementing slot_state.next_block_idx after this returns. + void dispatch_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, + PTO2ResourceShape shape +#if PTO2_PROFILING + , + bool profiling_enabled, uint32_t &phase_dispatch_count +#endif + ) { + CoreTracker &tracker = core_trackers_[thread_idx]; + if (shape == PTO2ResourceShape::MIX) { + dispatch_mix_block_to_cluster( + runtime, thread_idx, cluster_offset, slot_state +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else if (shape == PTO2ResourceShape::AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } else { // AIV + auto core_offset = tracker.is_aiv0_core_idle(cluster_offset) ? + tracker.get_aiv0_core_offset(cluster_offset) : + tracker.get_aiv1_core_offset(cluster_offset); + dispatch_subtask_to_core( + runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , + profiling_enabled +#endif + ); + } +#if PTO2_PROFILING + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); +#endif + } + + // Count total available resources across all scheduler threads for a given shape. 
+    int32_t count_global_available(PTO2ResourceShape shape) {
+        int32_t total = 0;
+        for (int32_t t = 0; t < active_sched_threads_; t++) {
+            if (shape == PTO2ResourceShape::AIV) {
+                total += core_trackers_[t].count_idle_aiv_cores(); // idle AIV cores (up to 2 per cluster)
+            } else {
+                total += core_trackers_[t].get_valid_cluster_offset_states(shape).count();
+            }
+        }
+        return total;
+    }
+
+    // Drain worker: dispatch all blocks in one pass across all threads' trackers.
+    // Called only when global resources >= block_num, so one pass always suffices.
+    // All other threads are spinning — the drain worker has exclusive tracker access.
+    void drain_worker_dispatch(
+        Runtime *runtime, int32_t block_num
+#if PTO2_PROFILING
+        ,
+        bool profiling_enabled, uint32_t &phase_dispatch_count
+#endif
+    ) {
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task;
+        if (!slot_state) { // defensive: nothing to drain — just clear the flag
+            drain_state_.sync_start_pending.store(0, std::memory_order_release);
+            return;
+        }
+        PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask);
+
+        for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) {
+            auto valid = core_trackers_[t].get_valid_cluster_offset_states(shape);
+            while (valid.has_value() && slot_state->next_block_idx < block_num) {
+                dispatch_block_to_cluster(
+                    runtime, t, valid.pop_first(), *slot_state, shape
+#if PTO2_PROFILING
+                    ,
+                    profiling_enabled, phase_dispatch_count
+#endif
+                );
+                slot_state->next_block_idx++;
+                // Re-read valid states after each dispatch: occupancy changed, and an
+                // AIV cluster may still be able to serve its second AIV core.
+                if (slot_state->next_block_idx < block_num)
+                    valid = core_trackers_[t].get_valid_cluster_offset_states(shape);
+            }
+        }
+
+        // All blocks dispatched — clear drain state.
+        // Release fence ensures tracker mutations are visible to threads that
+        // acquire-load sync_start_pending == 0 and resume normal operation.
+        std::atomic_thread_fence(std::memory_order_release);
+        drain_state_.pending_task = nullptr;
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    }
+
+    // Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+    //
+    // Three-phase protocol:
+    //   1. Ack barrier: all threads signal they've stopped Phase 2 dispatch.
+    //      If not all acked yet, return to Phase 1 (completion polling).
+    //   2. Resource check: elected thread verifies global idle resources >= block_num.
+    //      If insufficient, reset election state and return — all threads resume
+    //      Phase 1 to free running cores, then retry next iteration.
+    //   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
+    //      Non-elected threads spin-wait until sync_start_pending == 0.
+    //      During dispatch the elected thread has exclusive tracker access.
+    void handle_drain_mode(
+        Runtime *runtime, int32_t thread_idx
+#if PTO2_PROFILING
+        ,
+        bool profiling_enabled, uint32_t &phase_dispatch_count
+#endif
+    ) {
+        // Spin until drain is fully initialized (sentinel -1 → block_num > 0).
+        int32_t block_num;
+        do {
+            block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+        } while (block_num < 0);
+        if (block_num == 0) return; // drain already completed by another thread
+
+        // Phase 1: Ack barrier — signal this thread has stopped Phase 2 dispatch.
+        uint32_t all_acked = (1u << active_sched_threads_) - 1; // assumes active_sched_threads_ < 32
+        drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+        // If not all threads have acked, return to do Phase 1 (completion polling).
+        if ((drain_state_.drain_ack_mask.load(std::memory_order_acquire) & all_acked) != all_acked) return;
+
+        // Phase 2: Election — exactly one thread wins the CAS.
+        int32_t expected = 0;
+        drain_state_.drain_worker_elected.compare_exchange_strong(
+            expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
+        );
+
+        if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
+            // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+            while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+                if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+                SPIN_WAIT_HINT();
+            }
+            return;
+        }
+
+        // Elected: check if global resources are sufficient.
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task;
+        PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask);
+        int32_t available = count_global_available(shape);
+
+        if (available < block_num) {
+            // Insufficient resources — reset election, let all threads do Phase 1.
+            drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+
+        // Phase 3: Dispatch — all other threads are spinning, exclusive tracker access.
+        drain_worker_dispatch(
+            runtime, block_num
+#if PTO2_PROFILING
+            ,
+            profiling_enabled, phase_dispatch_count
+#endif
+        );
+    }
 };
 
 static AicpuExecutor g_aicpu_executor;
 
@@ -781,6 +1041,7 @@ bool AicpuExecutor::assign_cores_to_threads() {
         DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], core_trackers_[t].get_cluster_count());
     }
 
+    active_sched_threads_ = (sched_thread_num_ > 0) ?
+        sched_thread_num_ : thread_num_;
     return true;
 }
 
@@ -853,6 +1114,7 @@ void AicpuExecutor::reassign_cores_for_all_threads() {
             core_trackers_[t].get_cluster_count(), aic_running, aiv_running
         );
     }
+    active_sched_threads_ = thread_num_;
 }
 
 int32_t AicpuExecutor::init(Runtime *runtime) {
@@ -1215,8 +1477,23 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
 #endif
 
         bool try_pushed = false;
+
+        // Phase 2 drain check: if a sync_start task is waiting for resources,
+        // pause normal dispatch and let the drain protocol run.
+        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            handle_drain_mode(
+                runtime, thread_idx
+#if PTO2_PROFILING
+                ,
+                profiling_enabled, phase_dispatch_count
+#endif
+            );
+            continue;
+        }
+
         const PTO2ResourceShape *dispatch_order = get_dispatch_order(thread_idx);
-        for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) {
+        bool entered_drain = false;
+        for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES && !entered_drain; si++) {
             PTO2ResourceShape shape = dispatch_order[si];
             auto valid_cluster_states = tracker.get_valid_cluster_offset_states(shape);
             if (!valid_cluster_states.has_value()) {
@@ -1224,7 +1501,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
             }
 
             auto &local_buf = local_bufs[static_cast(shape)];
-            while (valid_cluster_states.has_value()) {
+            while (valid_cluster_states.has_value() && !entered_drain) {
                 int want = valid_cluster_states.count();
                 PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS];
                 int got = pop_ready_tasks_batch(
@@ -1242,13 +1519,37 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
 #if PTO2_SCHED_PROFILING
                 uint64_t t_setup_start = get_sys_cnt_aicpu();
 #endif
+                // sync_start: all blocks must dispatch atomically.
+                // Fast path — enough local slots: fall through to normal dispatch loop below.
+                // Slow path — not enough: enter drain mode, then re-push all remaining
+                // tasks in the batch so nothing is lost.
+                // For AIV, one cluster can serve 2 blocks (AIV0 + AIV1), so compare against
+                // idle AIV core count rather than cluster count.
+                if (pto2_requires_sync_start(slot_state->active_mask)) {
+                    int32_t available = (shape == PTO2ResourceShape::AIV) ? tracker.count_idle_aiv_cores() :
+                                                                            valid_cluster_states.count();
+                    if (available < slot_state->block_num) {
+                        if (!enter_drain_mode(slot_state, slot_state->block_num)) {
+                            // CAS lost: drain already active for another task; re-push and wait.
+                            rt->scheduler.ready_queues[static_cast(shape)].push(slot_state);
+                        }
+                        // Re-push all unprocessed tasks remaining in this batch.
+                        for (int rem = bi + 1; rem < got; rem++) {
+                            rt->scheduler.ready_queues[static_cast(shape)].push(batch[rem]);
+                        }
+                        entered_drain = true;
+                        break;
+                    }
+                    // Fast path: enough local resources, fall through to normal dispatch.
+                }
+
                 // Dispatch as many blocks as possible for this task using available clusters.
                 // For block_num=1 the inner body executes exactly once (no overhead).
                 do {
                     auto current_valid_cluster_offset = valid_cluster_states.pop_first();
 
                     if (shape == PTO2ResourceShape::MIX) {
                         // Full-cluster: all active subtasks share the same block_idx.
-                        uint8_t mask = slot_state->active_mask;
+                        uint8_t mask = pto2_core_mask(slot_state->active_mask);
                         if (mask & PTO2_SUBTASK_MASK_AIC) {
                             dispatch_subtask_to_core(
                                 runtime, thread_idx, tracker.get_aic_core_offset(current_valid_cluster_offset),
@@ -1309,7 +1610,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
                     }
                 }
 #if PTO2_PROFILING
-                phase_dispatch_count += __builtin_popcount(slot_state->active_mask);
+                phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state->active_mask));
 #endif
                 DEV_DEBUG(
                     "Thread %d: Dispatched %s task %" PRId64 " block %d/%d to cluster_offset %d", thread_idx,
@@ -1853,6 +2154,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         return -1;
     }
 
+    // Total core counts for submit-time deadlock detection.
+    for (int i = 0; i < orch_thread_num_; i++) {
+        rt->orchestrators[i].total_cluster_count = aic_count_;
+        rt->orchestrators[i].total_aiv_count = aiv_count_;
+    }
 #if PTO2_PROFILING
     for (int i = 0; i < orch_thread_num_; i++) {
         rt->orchestrators[i].enable_profiling = runtime->enable_profiling;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index e9e9ea183..e161d3ee7 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -346,6 +346,21 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
         active_mask = pto2_mixed_kernels_to_active_mask(normalized);
     }
 
+    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+    if (block_num > 1 && args.launch_spec.require_sync_start()) {
+        // Deadlock check: block_num > total slots of the required type can never be
+        // satisfied simultaneously, so an atomic launch would hang forever.
+        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
+        // For AIV: limit is total_aiv_count.
+        PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask);
+        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+        if (limit > 0 && block_num > limit) { // limit == 0 → counts not yet initialized; skip the check
+            LOG_ERROR("FATAL: require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit);
+            orch->fatal = true;
+            return TaskOutputTensors{};
+        }
+        active_mask |= PTO2_SUBTASK_FLAG_SYNC_START;
+    }
+
     // Submission without an open scope is illegal
     always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
 
@@ -583,7 +598,8 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
         cur_slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
         cur_slot_state.fanout_refcount.store(0, std::memory_order_relaxed);
         cur_slot_state.completed_subtasks.store(0, std::memory_order_relaxed);
-        cur_slot_state.total_required_subtasks = static_cast(block_num * __builtin_popcount(active_mask));
+        cur_slot_state.total_required_subtasks =
+            static_cast(block_num * __builtin_popcount(pto2_core_mask(active_mask)));
         cur_slot_state.block_num = block_num;
         cur_slot_state.next_block_idx = 0;
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index a40a6f7ce..0d9d94276 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -70,6 +70,10 @@ struct PTO2OrchestratorState {
     // Note: In simulated mode, orchestrator and scheduler share address space
     // In real mode, they communicate via shared memory only
     PTO2SchedulerState *scheduler; // For simulated mode only
+
+    // Total core counts set once at executor init; used for submit-time deadlock detection.
+    int32_t total_cluster_count{0}; // AIC cores = MIX clusters
+    int32_t total_aiv_count{0};     // AIV cores (= 2 × clusters on standard hardware)
 #if PTO2_PROFILING
     // Runtime profiling switch copied from Runtime::enable_profiling.
     bool enable_profiling;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
index 2a4ad827a..e89781a91 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
@@ -39,9 +39,10 @@ enum class PTO2SubtaskSlot : uint8_t {
 /**
  * Subtask mask bits (for active_mask / subtask_done_mask)
  */
-inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1
-inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2
-inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);        // 0x1
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);       // 0x2
+inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);       // 0x4
+inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3); // 0x8: all blocks must launch atomically
 
 /**
  * Test whether a subtask slot is active in a given mask
@@ -50,6 +51,18 @@ static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) {
     return (mask & (1u << static_cast(slot))) != 0;
 }
 
+/**
+ * Extract only the core bits (AIC/AIV0/AIV1, low 3 bits) from active_mask,
+ * stripping flag bits such as PTO2_SUBTASK_FLAG_SYNC_START.
+ */
+static inline uint8_t pto2_core_mask(uint8_t active_mask) { return active_mask & 0x07u; }
+
+/**
+ * Check whether a task requires all blocks to be launched atomically.
+ */
+static inline bool pto2_requires_sync_start(uint8_t active_mask) {
+    return (active_mask & PTO2_SUBTASK_FLAG_SYNC_START) != 0;
+}
+
 /**
  * Mixed-task submit contract.
  *
@@ -83,9 +96,10 @@ inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
  * Caller must ensure active_mask is valid (at least one bit set).
  */
 static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) {
-    int bit_count = __builtin_popcount(active_mask);
+    uint8_t core_mask = pto2_core_mask(active_mask);
+    int bit_count = __builtin_popcount(core_mask);
     if (bit_count >= 2) return PTO2ResourceShape::MIX;
-    if (active_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
+    if (core_mask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC;
     return PTO2ResourceShape::AIV;
 }
 
@@ -114,6 +128,10 @@ class PTO2LaunchSpec {
     int16_t core_num() const { return core_num_; }
     void set_core_num(int16_t n) { core_num_ = n; }
 
+    bool require_sync_start() const { return require_sync_start_; } // atomic all-block launch requested?
+    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+
 private:
     int16_t core_num_{1};
+    bool require_sync_start_{false};
 };