Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions examples/a2a3/tensormap_and_ringbuffer/spmd_starvation/golden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""
Golden test for SPMD starvation prevention.

Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and
verifies all 20 tasks complete with correct output. The test validates that
the drain mechanism prevents sync_start tasks from being starved.

Layout:
Wave 1: 6 × normal(block_num=4) -> CL 0..71
Sync 0: 1 × sync_start(block_num=6) -> CL 72..89
Wave 2: 6 × normal(block_num=4) -> CL 90..161
Sync 1: 1 × sync_start(block_num=6) -> CL 162..179
Wave 3: 6 × normal(block_num=4) -> CL 180..251

Total: 252 CL = 4032 float32.

Args layout: [output]
"""

import torch

# Tensor names declared as outputs of this example.
__outputs__ = ["output"]

# Relative and absolute tolerances are both zero.
RTOL, ATOL = 0, 0

# A single case with no parameters, which is also the default one.
DEFAULT_CASE = "Case1"
ALL_CASES = {DEFAULT_CASE: {}}

# Layout constants shared by the task builder and the golden computation.
FLOATS_PER_CACHE_LINE = 16
SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
NORMAL_BLOCK_NUM, SYNC_BLOCK_NUM = 4, 6

# Cache lines covered by one task: one cache line per (block, slot) pair.
NORMAL_CL = SLOTS_PER_BLOCK * NORMAL_BLOCK_NUM  # 12
SYNC_CL = SLOTS_PER_BLOCK * SYNC_BLOCK_NUM  # 18


def _build_tasks():
    """Return the flat task list as ``(block_num, base_cl)`` tuples.

    Three waves of six tasks with ``NORMAL_BLOCK_NUM`` blocks are laid out
    back to back. One task with ``SYNC_BLOCK_NUM`` blocks is placed after the
    first wave and another after the second. ``base_cl`` is the running
    cache-line offset at which each task starts.
    """
    wave_count = 3
    normal_per_wave = 6

    layout = []
    next_cl = 0
    for wave in range(wave_count):
        for _ in range(normal_per_wave):
            layout.append((NORMAL_BLOCK_NUM, next_cl))
            next_cl += NORMAL_CL
        # A sync-sized task separates consecutive waves; none follows the last.
        if wave < wave_count - 1:
            layout.append((SYNC_BLOCK_NUM, next_cl))
            next_cl += SYNC_CL
    return layout


# Task layout built once at import time, plus the cache lines it spans in total.
TASKS = _build_tasks()
TOTAL_CL = SLOTS_PER_BLOCK * sum(block_num for block_num, _ in TASKS)  # 252


def generate_inputs(params: dict) -> list:
    """Create the zero-filled output buffer for the run.

    Args:
        params: Case parameters; not read by this function.

    Returns:
        A one-element list ``[("output", tensor)]`` where ``tensor`` is a 1-D
        float32 tensor of ``TOTAL_CL * FLOATS_PER_CACHE_LINE`` zeros.
    """
    element_count = FLOATS_PER_CACHE_LINE * TOTAL_CL
    zeros = torch.zeros(element_count, dtype=torch.float32)
    return [("output", zeros)]


def compute_golden(tensors: dict, params: dict) -> None:
    """Fill ``tensors["output"]`` in place with the expected values.

    For every ``(block_num, base_cl)`` entry in ``TASKS``, each block owns
    ``SLOTS_PER_BLOCK`` consecutive cache lines starting at ``base_cl``. The
    first float of each of those cache lines is set to ``float(block_idx)``;
    every other element is left untouched.

    Args:
        tensors: Mapping that holds the ``"output"`` buffer to update.
        params: Case parameters; not read by this function.
    """
    expected = torch.as_tensor(tensors["output"])
    for num_blocks, first_cl in TASKS:
        # Walk the task's cache lines linearly; the owning block index is
        # the linear offset divided by the number of slots per block.
        for offset in range(num_blocks * SLOTS_PER_BLOCK):
            owner_block = offset // SLOTS_PER_BLOCK
            line = first_cl + offset
            expected[line * FLOATS_PER_CACHE_LINE] = float(owner_block)
    tensors["output"][:] = expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""
Kernel configuration for SPMD starvation-prevention test.

Submits many normal MIX tasks interleaved with sync_start tasks to verify
the drain mechanism prevents starvation under sustained load.
Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
"""

from pathlib import Path

# Directory containing this config file, and the kernels directory of the
# sibling spmd_multiblock_mix example whose sources are reused here.
_KERNELS_ROOT = Path(__file__).parent
_MIX_KERNELS = _KERNELS_ROOT.parent.parent.joinpath("spmd_multiblock_mix", "kernels")

# Orchestration source file and the entry function to look up in it.
ORCHESTRATION = {
    "source": str(_KERNELS_ROOT.joinpath("orchestration", "spmd_starvation_orch.cpp")),
    "function_name": "aicpu_orchestration_entry",
}

# One entry per kernel; func_id follows the position in the table and the
# source subdirectory is named after the core type.
KERNELS = [
    {
        "func_id": func_id,
        "name": name,
        "source": str(_MIX_KERNELS / core_type / "kernel_spmd_mix.cpp"),
        "core_type": core_type,
    }
    for func_id, (name, core_type) in enumerate(
        [
            ("SPMD_MIX_AIC", "aic"),
            ("SPMD_MIX_AIV0", "aiv"),
            ("SPMD_MIX_AIV1", "aiv"),
        ]
    )
]

RUNTIME_CONFIG = {
    "runtime": "tensormap_and_ringbuffer",
    "aicpu_thread_num": 4,
    "orch_thread_num": 1,
    "block_dim": 24,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

/**
* SPMD Starvation Prevention Orchestration
*
* Submits three waves of normal MIX tasks with one sync_start task between
* each pair of consecutive waves. The drain mechanism must ensure the
* sync_start tasks are not indefinitely delayed by the surrounding load.
*
* Layout: 3 waves × 6 normal tasks (block_num=4) + 2 sync_start tasks (block_num=6)
*
* Normal task: block_num=4, require_sync_start=false → 4 blocks × 3 slots = 12 CL each
* Sync task: block_num=6, require_sync_start=true → 6 blocks × 3 slots = 18 CL each
*
* Total CL: 3×6×12 + 2×18 = 216 + 36 = 252
*
* Args layout: [output]
*/

#include <stddef.h>
#include <stdint.h>

#include "pto_orchestration_api.h"

// Kernel function ids assigned to the AIC / AIV0 / AIV1 members of
// MixedKernels in submit_mix().
#define FUNC_SPMD_MIX_AIC 0
#define FUNC_SPMD_MIX_AIV0 1
#define FUNC_SPMD_MIX_AIV1 2

// Task geometry: each block occupies SLOTS_PER_BLOCK cache lines, so a task
// spans block_num * SLOTS_PER_BLOCK cache lines of the output.
static constexpr int32_t SLOTS_PER_BLOCK = 3; // AIC, AIV0, AIV1
static constexpr int32_t NORMAL_BLOCK_NUM = 4;
static constexpr int32_t SYNC_BLOCK_NUM = 6;
static constexpr int32_t NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK; // 12
static constexpr int32_t SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK; // 18

extern "C" {

/**
 * Report the orchestration configuration for this example.
 *
 * The argument pack is not inspected; the returned configuration only sets
 * expected_arg_count to 1.
 */
__attribute__((visibility("default"))) PTO2OrchestrationConfig
aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
    (void)orch_args;  // NOLINT(readability/casting)
    PTO2OrchestrationConfig config{
        .expected_arg_count = 1,
    };
    return config;
}

/**
 * Submit one MIX task that uses the three SPMD_MIX kernels.
 *
 * @param out        Tensor added to the task as an in/out argument.
 * @param block_num  Value passed to launch_spec.set_block_num().
 * @param base_cl    Scalar argument appended after the tensor.
 * @param sync_start Value passed to launch_spec.set_require_sync_start().
 */
static void submit_mix(Tensor &out, int16_t block_num, int64_t base_cl, bool sync_start) {
    // Same kernel triple for every task: AIC plus the two AIV kernels.
    MixedKernels kernels;
    kernels.aic_kernel_id = FUNC_SPMD_MIX_AIC;
    kernels.aiv0_kernel_id = FUNC_SPMD_MIX_AIV0;
    kernels.aiv1_kernel_id = FUNC_SPMD_MIX_AIV1;

    // Argument order is fixed: the output tensor first, then the base offset.
    Arg task_args;
    task_args.add_inout(out);
    task_args.add_scalar(base_cl);

    task_args.launch_spec.set_block_num(block_num);
    task_args.launch_spec.set_require_sync_start(sync_start);

    pto2_rt_submit_task(kernels, task_args);
}

/**
 * Orchestration entry point: submits three waves of six normal MIX tasks,
 * with one sync_start MIX task after the first wave and one after the second.
 *
 * Only the thread with orch_thread_index == 0 submits work; every other
 * thread returns immediately. Tensor argument 0 is used as the output of
 * every task, and each task receives the running cache-line offset as its
 * base.
 */
__attribute__((visibility("default"))) void
aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args, int orch_thread_num, int orch_thread_index) {
    (void)orch_thread_num;  // NOLINT(readability/casting)
    if (orch_thread_index != 0) {
        return;
    }

    Tensor ext_output = from_tensor_arg(orch_args.tensor(0));

    constexpr int kWaveCount = 3;
    constexpr int kNormalTasksPerWave = 6;

    int64_t next_cl = 0;
    for (int wave = 0; wave < kWaveCount; ++wave) {
        // One wave of normal MIX tasks.
        for (int task = 0; task < kNormalTasksPerWave; ++task) {
            submit_mix(ext_output, NORMAL_BLOCK_NUM, next_cl, false);
            next_cl += NORMAL_CL;
        }

        // A sync_start task sits between consecutive waves and must not be
        // starved by the wave before or after it. None follows the last wave.
        if (wave + 1 < kWaveCount) {
            submit_mix(ext_output, SYNC_BLOCK_NUM, next_cl, true);
            next_cl += SYNC_CL;
        }
    }

    LOG_ALWAYS("[spmd_starvation] Submitted 20 tasks (18 normal + 2 sync_start)");
}

} // extern "C"
66 changes: 66 additions & 0 deletions examples/a2a3/tensormap_and_ringbuffer/spmd_sync_start/golden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""
Golden test for SPMD sync_start.

Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies
all blocks of every task write the correct float(block_idx) to their cache line.

Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2):
T0: block_num=2, sync_start=True -> CL 0..5
T1: block_num=8, sync_start=True -> CL 6..29
T2: block_num=2, sync_start=False -> CL 30..35 (baseline)
T3: block_num=12, sync_start=True -> CL 36..71

Output tensor: 72 cache lines = 1152 float32.

Args layout: [output]
"""

import torch

# Tensor names declared as outputs of this example.
__outputs__ = ["output"]

# Relative and absolute tolerances are both zero.
RTOL, ATOL = 0, 0

# A single case with no parameters, which is also the default one.
DEFAULT_CASE = "Case1"
ALL_CASES = {DEFAULT_CASE: {}}

FLOATS_PER_CACHE_LINE = 16
SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1

# One (block_num, base_cl) pair per submitted task, in submission order.
TASKS = [
    (2, 0),  # T0: sync_start=True
    (8, 6),  # T1: sync_start=True
    (2, 30),  # T2: sync_start=False (baseline)
    (12, 36),  # T3: sync_start=True
]

# Every block contributes SLOTS_PER_BLOCK cache lines.
TOTAL_CL = SLOTS_PER_BLOCK * sum(task[0] for task in TASKS)  # 72


def generate_inputs(params: dict) -> list:
    """Build the zero-initialised output tensor for the test.

    Args:
        params: Case parameters; not read by this function.

    Returns:
        ``[("output", tensor)]`` where ``tensor`` is a 1-D float32 tensor
        holding ``TOTAL_CL * FLOATS_PER_CACHE_LINE`` zeros.
    """
    num_floats = FLOATS_PER_CACHE_LINE * TOTAL_CL
    buffer = torch.zeros(num_floats, dtype=torch.float32)
    return [("output", buffer)]


def compute_golden(tensors: dict, params: dict) -> None:
    """Write the expected result into ``tensors["output"]`` in place.

    Each ``(block_num, base_cl)`` entry of ``TASKS`` covers
    ``block_num * SLOTS_PER_BLOCK`` consecutive cache lines starting at
    ``base_cl``. The first float of every such cache line receives the index
    of the block that owns it, as a float; all other elements keep their
    current value.

    Args:
        tensors: Mapping that holds the ``"output"`` buffer to update.
        params: Case parameters; not read by this function.
    """
    golden = torch.as_tensor(tensors["output"])
    for num_blocks, start_cl in TASKS:
        lines_in_task = num_blocks * SLOTS_PER_BLOCK
        for rel_line in range(lines_in_task):
            # All SLOTS_PER_BLOCK lines of one block carry the same value.
            value = float(rel_line // SLOTS_PER_BLOCK)
            golden[(start_cl + rel_line) * FLOATS_PER_CACHE_LINE] = value
    tensors["output"][:] = golden
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""
Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime).

Submits MIX tasks with require_sync_start=true to verify atomic batch launch.
Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
"""

from pathlib import Path

# Directory containing this config file, and the kernels directory of the
# sibling spmd_multiblock_mix example whose sources are reused here.
_KERNELS_ROOT = Path(__file__).parent
_MIX_KERNELS = _KERNELS_ROOT.parent.parent.joinpath("spmd_multiblock_mix", "kernels")

# Orchestration source file and the entry function to look up in it.
ORCHESTRATION = {
    "source": str(_KERNELS_ROOT.joinpath("orchestration", "spmd_sync_start_orch.cpp")),
    "function_name": "aicpu_orchestration_entry",
}

# One entry per kernel; func_id follows the position in the table and the
# source subdirectory is named after the core type.
KERNELS = [
    {
        "func_id": func_id,
        "name": name,
        "source": str(_MIX_KERNELS / core_type / "kernel_spmd_mix.cpp"),
        "core_type": core_type,
    }
    for func_id, (name, core_type) in enumerate(
        [
            ("SPMD_MIX_AIC", "aic"),
            ("SPMD_MIX_AIV0", "aiv"),
            ("SPMD_MIX_AIV1", "aiv"),
        ]
    )
]

RUNTIME_CONFIG = {
    "runtime": "tensormap_and_ringbuffer",
    "aicpu_thread_num": 4,
    "orch_thread_num": 1,
    "block_dim": 24,
}
Loading
Loading