diff --git a/ci.sh b/ci.sh index 589289ec6..522d715c8 100755 --- a/ci.sh +++ b/ci.sh @@ -181,11 +181,14 @@ trap 'kill $WATCHDOG_PID 2>/dev/null; pkill -TERM -P $$ 2>/dev/null; rm -rf "$LO ) >/dev/null 2>&1 & WATCHDOG_PID=$! -# commit_flag starts empty (try latest PTO-ISA first). -# If -c is given AND a test fails, pin_pto_isa_on_failure sets commit_flag. commit_flag=() +if [[ -n "$PTO_ISA_COMMIT" ]]; then + echo "[CI] Using pinned PTO-ISA commit from start: $PTO_ISA_COMMIT" + rm -rf examples/scripts/_deps/pto-isa + commit_flag=(-c "$PTO_ISA_COMMIT") +fi -# Pin PTO-ISA to the specified commit on first failure. +# Legacy fallback path for callers that start on the latest PTO-ISA checkout. # On first failure: cleans cached clone, sets commit_flag, returns 0 (caller retries). # On subsequent failures (already pinned): returns 1 (real failure). pin_pto_isa_on_failure() { diff --git a/src/a5/platform/sim/aicore/kernel.cpp b/src/a5/platform/sim/aicore/kernel.cpp index 1191011f1..ec1ab8c0d 100644 --- a/src/a5/platform/sim/aicore/kernel.cpp +++ b/src/a5/platform/sim/aicore/kernel.cpp @@ -1,3 +1,13 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ /** * AICore Kernel Wrapper for Simulation * @@ -5,7 +15,10 @@ * Sets up per-thread simulated register base before calling the executor. */ +#include + #include + #include "aicore/aicore.h" #include "common/core_type.h" #include "common/platform_config.h" @@ -20,9 +33,20 @@ thread_local uint32_t g_sim_physical_core_id = 0; // Declare the original function (defined in aicore_executor.cpp with weak linkage) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type); +namespace { +using CpuSimSetExecutionContextHook = void (*)(uint32_t, uint32_t, uint32_t); + +CpuSimSetExecutionContextHook resolve_cpu_sim_set_execution_context_hook() { + static auto hook = + reinterpret_cast(dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_execution_context")); + return hook; +} +} // namespace + // Wrapper with extern "C" for dlsym lookup // NOTE: physical_core_id stays in wrapper signature (DeviceRunner passes it for register indexing) -extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) { +extern "C" void aicore_execute_wrapper( + __gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) { // Set up simulated register base for this thread. // regs points to an array of uint64_t base addresses (one per core). // physical_core_id indexes into it to get this core's register block. @@ -32,6 +56,22 @@ extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, C } g_sim_physical_core_id = physical_core_id; + const uint32_t num_aic = static_cast(runtime->worker_count / PLATFORM_CORES_PER_BLOCKDIM); + uint32_t cpu_block_idx = static_cast(block_idx); + uint32_t subblock_id = 0; + uint32_t subblock_dim = 1; + + if (core_type == CoreType::AIV && physical_core_id >= num_aic) { + const uint32_t aiv_offset = physical_core_id - num_aic; + cpu_block_idx = aiv_offset / PLATFORM_AIV_CORES_PER_BLOCKDIM; + subblock_id = aiv_offset % PLATFORM_AIV_CORES_PER_BLOCKDIM; + subblock_dim = PLATFORM_AIV_CORES_PER_BLOCKDIM; + } else { + cpu_block_idx = physical_core_id; + } + if (auto hook = resolve_cpu_sim_set_execution_context_hook(); hook != nullptr) { + hook(cpu_block_idx, subblock_id, subblock_dim); + } aicore_execute(runtime, block_idx, core_type); } diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index e132e58ab..e877be2c2 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -27,6 +27,7 @@ #include "device_runner.h" #include +#include #include #include @@ -34,6 +35,62 @@ #include "callable.h" #include "host/raii_scope_guard.h" +namespace { +thread_local uint32_t g_cpu_sim_block_idx = 0; +thread_local uint32_t g_cpu_sim_subblock_id = 0; +thread_local uint32_t g_cpu_sim_subblock_dim = 1; +thread_local uint64_t g_cpu_sim_task_cookie = 0; +std::mutex g_cpu_sim_shared_storage_mutex; +std::map g_cpu_sim_shared_storage; + +void clear_cpu_sim_shared_storage() { + std::lock_guard lock(g_cpu_sim_shared_storage_mutex); + for (auto& [key, storage] : g_cpu_sim_shared_storage) { + (void)key; + std::free(storage); + } + g_cpu_sim_shared_storage.clear(); +} +} // namespace + +extern "C" void pto_cpu_sim_set_execution_context(uint32_t block_idx, uint32_t subblock_id, uint32_t subblock_dim) { + g_cpu_sim_block_idx = block_idx; + g_cpu_sim_subblock_id = subblock_id; + g_cpu_sim_subblock_dim = (subblock_dim == 0) ? 1u : subblock_dim; +} + +extern "C" void pto_cpu_sim_set_task_cookie(uint64_t task_cookie) { g_cpu_sim_task_cookie = task_cookie; } + +extern "C" void pto_cpu_sim_get_execution_context(uint32_t* block_idx, uint32_t* subblock_id, uint32_t* subblock_dim) { + if (block_idx != nullptr) { + *block_idx = g_cpu_sim_block_idx; + } + if (subblock_id != nullptr) { + *subblock_id = g_cpu_sim_subblock_id; + } + if (subblock_dim != nullptr) { + *subblock_dim = g_cpu_sim_subblock_dim; + } +} + +extern "C" uint64_t pto_cpu_sim_get_task_cookie() { return g_cpu_sim_task_cookie; } + +extern "C" void* pto_cpu_sim_get_shared_storage(const char* key, size_t size) { + if (key == nullptr || size == 0) { + return nullptr; + } + + std::lock_guard lock(g_cpu_sim_shared_storage_mutex); + auto it = g_cpu_sim_shared_storage.find(key); + if (it != g_cpu_sim_shared_storage.end()) { + return it->second; + } + + void* storage = std::calloc(1, size); + g_cpu_sim_shared_storage.emplace(key, storage); + return storage; +} + // Function pointer types for dynamically loaded executors typedef int (*aicpu_execute_func_t)(Runtime* runtime); typedef void (*aicore_execute_func_t)( @@ -151,6 +208,7 @@ int DeviceRunner::run(Runtime& runtime, const std::vector& aicpu_so_binary, const std::vector& aicore_kernel_binary, int launch_aicpu_num) { + clear_cpu_sim_shared_storage(); // Validate launch_aicpu_num if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) { LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]", launch_aicpu_num, PLATFORM_MAX_AICPU_THREADS); @@ -436,6 +494,7 @@ int DeviceRunner::finalize() { // Free all remaining allocations mem_alloc_.finalize(); + clear_cpu_sim_shared_storage(); device_id_ = -1; worker_count_ = 0; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 3511579a7..7d42a75b1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -16,6 +16,10 @@ #include "pto2_dispatch_payload.h" // NOLINT(build/include_subdir) #include "runtime.h" // NOLINT(build/include_subdir) +#ifdef __CPU_SIM +#include +#endif + /** * Unified function pointer type for kernel dispatch * @@ -24,6 +28,17 @@ */ typedef void (*UnifiedKernelFunc)(__gm__ int64_t*); +#ifdef __CPU_SIM +namespace { +using CpuSimSetTaskCookieHook = void (*)(uint64_t); + +CpuSimSetTaskCookieHook resolve_cpu_sim_set_task_cookie_hook() { + static auto hook = reinterpret_cast(dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_task_cookie")); + return hook; +} +} // namespace +#endif + /** * Execute task from PTO2DispatchPayload. * @@ -120,6 +135,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in uint64_t start_time = get_sys_cnt_aicore(); // Execute the task +#ifdef __CPU_SIM + if (auto hook = resolve_cpu_sim_set_task_cookie_hook(); hook != nullptr) { + hook(reinterpret_cast(payload->args)); + } +#endif execute_task(payload); // Performance profiling: record task execution diff --git a/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp b/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp index e9527e41a..94a2dca22 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp +++ b/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp @@ -1,301 +1,201 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ /** - * Tile-based BGEMM Kernel — Combined Cube + Vector (TPUSH/TPOP) + * Tile-based BGEMM kernel using the shared cube/vector TPUSH/TPOP path. * - * Computes one tile iteration: P = A[m,k] @ B[k,n], then C[m,n] += P + * Computes one tile iteration: P = A[m,k] @ B[k,n], then C[m,n] += P. * * Single source compiled twice: - * - AIC (Cube): __DAV_CUBE__ defined → TLOAD, TMATMUL, TPUSH - * - AIV (Vector): __DAV_VEC__ defined → TPOP, TADD, TSTORE - * - * Intermediate result P is transferred via VEC_FIFO (TPUSH/TPOP), - * bypassing GM. The accumulator C is still read/written via GM. + * - AIC (cube): __DAV_CUBE__ defined -> TLOAD, TMATMUL, TPUSH + * - AIV (vector): __DAV_VEC__ defined -> TPOP, TADD, TSTORE * - * Simulation fallback (__CPU_SIM): - * Uses separate AIC/AIV tasks with GM intermediary (no TPUSH/TPOP). - * AIC args: [A, B, P_output] AIV args: [C_inout, P_input] + * Intermediate result P is transferred via VEC_FIFO, bypassing GM. + * The accumulator C is still read and written via GM. * - * Hardware args (MixedKernels): - * args[0] = input_a (INPUT) - * args[1] = input_b (INPUT) - * args[2] = C_tile (INOUT: read + write accumulator) + * MixedKernels args: + * args[0] = input_a (input) + * args[1] = input_b (input) + * args[2] = C_tile (inout accumulator) */ - #include - #include - #ifndef __CPU_SIM - #include - #endif - - #include "tensor.h" - - using namespace pto; - - #ifndef __gm__ - #define __gm__ - #endif - - #ifndef __aicore__ - #define __aicore__ [aicore] - #endif - - #ifdef __DAV_CUBE__ - constexpr bool DAV_CUBE = true; - #else - constexpr bool DAV_CUBE = false; - #endif - - #ifdef __DAV_VEC__ - constexpr bool DAV_VEC = true; - #else - constexpr bool DAV_VEC = false; - #endif - - // Tile dimensions (must match golden.py) - constexpr int TILE = 64; - constexpr int M = TILE; - constexpr int K = TILE; - constexpr int N = TILE; - - // ============================================================================= - // Simulation: separate AIC/AIV tasks with GM intermediate (no TPUSH/TPOP) - // ============================================================================= - #ifdef __CPU_SIM - - extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { - // AIC path: args = [A (input), B (input), P (output)] - if constexpr (DAV_CUBE) { - __gm__ Tensor* input_a_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); - __gm__ Tensor* input_b_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); - __gm__ Tensor* output_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]); - - __gm__ float* input_a = reinterpret_cast<__gm__ float*>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset; - __gm__ float* input_b = reinterpret_cast<__gm__ float*>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset; - __gm__ float* output = reinterpret_cast<__gm__ float*>(output_tensor->buffer.addr) + output_tensor->start_offset; - - using GlobalDataA = GlobalTensor, - pto::Stride>; - using GlobalDataB = GlobalTensor, - pto::Stride>; - using GlobalDataC = GlobalTensor, - pto::Stride>; - - GlobalDataA src0Global(input_a); - GlobalDataB src1Global(input_b); - GlobalDataC dstGlobal(output); - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - TLOAD(aMatTile, src0Global); - TLOAD(bMatTile, src1Global); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(dstGlobal, cTile); - - set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); - } - - // AIV path: args = [C (inout), P (input)] - if constexpr (DAV_VEC) { - __gm__ Tensor* c_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); - __gm__ Tensor* p_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); - - __gm__ float* c_ptr = reinterpret_cast<__gm__ float*>(c_tensor->buffer.addr) + c_tensor->start_offset; - __gm__ float* p_ptr = reinterpret_cast<__gm__ float*>(p_tensor->buffer.addr) + p_tensor->start_offset; - - using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; - using DynStridDim5 = pto::Stride<1, 1, 1, TILE, 1>; - using GlobalData = GlobalTensor; - using TileData = Tile; - - TileData cTile(TILE, TILE); - TileData pTile(TILE, TILE); - TileData outTile(TILE, TILE); - TASSIGN(cTile, 0x0); - TASSIGN(pTile, 0x10000); - TASSIGN(outTile, 0x20000); - - GlobalData cGlobal(c_ptr); - GlobalData pGlobal(p_ptr); - GlobalData outGlobal(c_ptr); // write back to same C location - - TLOAD(cTile, cGlobal); - TLOAD(pTile, pGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - TADD(outTile, cTile, pTile); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(outGlobal, outTile); - - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); - } - } - - // ============================================================================= - // Hardware: MixedKernels with TPUSH/TPOP via VEC_FIFO - // ============================================================================= - #else // !__CPU_SIM - - #define VEC_CORES 2 - constexpr int VEC_M = M / VEC_CORES; // each vector sub-core handles half the rows - - // TPUSH/TPOP pipe configuration - constexpr uint16_t PP_FLAG_ID = 0; - constexpr uint8_t PP_FIFO_DEPTH = 2; - - // Cube accumulator (full M×N tile in L0C) - using AccTileT = TileAcc; - // Vector consumer tile (half tile: VEC_M×N in UB, split across 2 vector sub-cores) - using VecFifoTileT = Tile; - - // Cube→Vector pipe via on-chip VEC_FIFO (bypasses global memory) - using PipeT = TPipe; - - extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { - __gm__ Tensor* input_a_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); - __gm__ Tensor* input_b_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); - __gm__ Tensor* c_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]); - - // Pipe and FIFO tile are declared in common scope (both sides reference the type) - VecFifoTileT vecFifoTile; - PipeT mPipe((__gm__ void *)(uint64_t)0x0, (uint32_t)0x0, (uint32_t)0x0); - - // ========================================================================= - // Cube side: TLOAD A,B → TMATMUL → TPUSH result to vector via VEC_FIFO - // ========================================================================= - if constexpr (DAV_CUBE) { - __gm__ float* input_a = reinterpret_cast<__gm__ float*>(input_a_tensor->buffer.addr) - + input_a_tensor->start_offset; - __gm__ float* input_b = reinterpret_cast<__gm__ float*>(input_b_tensor->buffer.addr) - + input_b_tensor->start_offset; - - using GlobalDataA = GlobalTensor, - pto::Stride>; - using GlobalDataB = GlobalTensor, - pto::Stride>; - - GlobalDataA src0Global(input_a); - GlobalDataB src1Global(input_b); - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTileT accTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(accTile, 0x0); - - // Load A and B from GM to L1 - TLOAD(aMatTile, src0Global); - TLOAD(bMatTile, src1Global); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move from L1 to L0A/L0B - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Matrix multiply - TMATMUL(accTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - // Push result directly to vector core's UB (replaces TSTORE to GM) - TPUSH(mPipe, accTile); - } - - // ========================================================================= - // Vector side: TPOP result from cube → TLOAD C from GM → TADD → TSTORE - // ========================================================================= - if constexpr (DAV_VEC) { - uint32_t subBlockIdx = get_subblockid(); - - __gm__ float* c_ptr = reinterpret_cast<__gm__ float*>(c_tensor->buffer.addr) - + c_tensor->start_offset; - // Each vector sub-core handles its half: sub-core 0 → rows [0, VEC_M), - // sub-core 1 → rows [VEC_M, M) - __gm__ float* c_sub = c_ptr + static_cast(subBlockIdx) * VEC_M * N; - - using GlobalC = GlobalTensor, - pto::Stride>; - - GlobalC cGlobal(c_sub); - GlobalC outGlobal(c_sub); // write back to same location - - using VecTile = Tile; - - VecTile cTile; - VecTile outTile; - // Place after FIFO buffer: FIFO uses [0x0, FIFO_DEPTH * VEC_M * N * 4) - // = [0x0, 2 * 32 * 64 * 4) = [0x0, 0x4000) - TASSIGN(cTile, 0x4000); - TASSIGN(outTile, 0x6000); - - // Pop matmul result from cube via VEC_FIFO (replaces TLOAD from GM) - TPOP(mPipe, vecFifoTile); - - // Load current C tile from GM - TLOAD(cTile, cGlobal); - - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Accumulate: C += P - TADD(outTile, cTile, vecFifoTile); - TFREE(mPipe); - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - - // Store result back to GM - TSTORE(outGlobal, outTile); - } - } - - #endif // __CPU_SIM - \ No newline at end of file +#include +// clang-format off +#include +#include +// clang-format on + +#include "tensor.h" + +using pto::BLayout; +using pto::Direction; +using pto::GlobalTensor; +using pto::Shape; +using pto::SLayout; +using pto::Tile; +using pto::TileAcc; +using pto::TileLeft; +using pto::TileRight; +using pto::TileSplitAxis; +using pto::TileType; +using pto::TPipe; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ +#endif + +#ifdef __DAV_CUBE__ +constexpr bool DAV_CUBE = true; +#else +constexpr bool DAV_CUBE = false; +#endif + +#ifdef __DAV_VEC__ +constexpr bool DAV_VEC = true; +#else +constexpr bool DAV_VEC = false; +#endif + +// Tile dimensions (must match golden.py) +constexpr int TILE = 64; +constexpr int M = TILE; +constexpr int K = TILE; +constexpr int N = TILE; + +#define VEC_CORES 2 +constexpr int VEC_M = M / VEC_CORES; // each vector sub-core handles half the rows + +// TPUSH/TPOP pipe configuration +constexpr uint16_t PP_FLAG_ID = 0; +constexpr uint8_t PP_FIFO_DEPTH = 2; + +// Cube accumulator (full M×N tile in L0C) +using AccTileT = TileAcc; +// Vector consumer tile (half tile: VEC_M×N in UB, split across 2 vector sub-cores) +using VecFifoTileT = Tile; + +// Cube→Vector pipe via on-chip VEC_FIFO (bypasses global memory) +using PipeT = TPipe; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* input_a_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* input_b_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* c_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]); + + // Pipe and FIFO tile are declared in common scope (both sides reference the type) + VecFifoTileT vecFifoTile; + PipeT mPipe(nullptr, 0U, 0U); + + // ========================================================================= + // Cube side: TLOAD A,B → TMATMUL → TPUSH result to vector via VEC_FIFO + // ========================================================================= + if constexpr (DAV_CUBE) { + __gm__ float* input_a = + reinterpret_cast<__gm__ float*>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset; + __gm__ float* input_b = + reinterpret_cast<__gm__ float*>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset; + + using GlobalDataA = GlobalTensor, pto::Stride>; + using GlobalDataB = GlobalTensor, pto::Stride>; + + GlobalDataA src0Global(input_a); + GlobalDataB src1Global(input_b); + + using TileMatA = Tile; + using TileMatB = Tile; + using LeftTile = TileLeft; + using RightTile = TileRight; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTileT accTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(accTile, 0x0); + + // Load A and B from GM to L1 + TLOAD(aMatTile, src0Global); + TLOAD(bMatTile, src1Global); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move from L1 to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matrix multiply + TMATMUL(accTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + // Push result directly to vector core's UB (replaces TSTORE to GM) + TPUSH(mPipe, accTile); + } + + // ========================================================================= + // Vector side: TPOP result from cube → TLOAD C from GM → TADD → TSTORE + // ========================================================================= + if constexpr (DAV_VEC) { + uint32_t subBlockIdx = get_subblockid(); + + __gm__ float* c_ptr = reinterpret_cast<__gm__ float*>(c_tensor->buffer.addr) + c_tensor->start_offset; + // Each vector sub-core handles its half: sub-core 0 → rows [0, VEC_M), + // sub-core 1 → rows [VEC_M, M) + __gm__ float* c_sub = c_ptr + static_cast(subBlockIdx) * VEC_M * N; + + using GlobalC = + GlobalTensor, pto::Stride>; + + GlobalC cGlobal(c_sub); + GlobalC outGlobal(c_sub); // write back to same location + + using VecTile = Tile; + + VecTile cTile; + VecTile outTile; + // Place after FIFO buffer: FIFO uses [0x0, FIFO_DEPTH * VEC_M * N * 4) + // = [0x0, 2 * 32 * 64 * 4) = [0x0, 0x4000) + TASSIGN(cTile, 0x4000); + TASSIGN(outTile, 0x6000); + + // Pop matmul result from cube via VEC_FIFO (replaces TLOAD from GM) + TPOP(mPipe, vecFifoTile); + + // Load current C tile from GM + TLOAD(cTile, cGlobal); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Accumulate: C += P + TADD(outTile, cTile, vecFifoTile); + TFREE(mPipe); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + // Store result back to GM + TSTORE(outGlobal, outTile); + } +}