diff --git a/ci.sh b/ci.sh
index 589289ec6..522d715c8 100755
--- a/ci.sh
+++ b/ci.sh
@@ -181,11 +181,14 @@ trap 'kill $WATCHDOG_PID 2>/dev/null; pkill -TERM -P $$ 2>/dev/null; rm -rf "$LO
 ) >/dev/null 2>&1 &
 WATCHDOG_PID=$!
 
-# commit_flag starts empty (try latest PTO-ISA first).
-# If -c is given AND a test fails, pin_pto_isa_on_failure sets commit_flag.
 commit_flag=()
+if [[ -n "$PTO_ISA_COMMIT" ]]; then  # non-empty commit: pin it for every run instead of waiting for a failure
+    echo "[CI] Using pinned PTO-ISA commit from start: $PTO_ISA_COMMIT"
+    rm -rf examples/scripts/_deps/pto-isa  # remove any existing pto-isa checkout under _deps
+    commit_flag=(-c "$PTO_ISA_COMMIT")  # array form keeps the flag and its value as separate words
+fi
 
-# Pin PTO-ISA to the specified commit on first failure.
+# Legacy fallback path for callers that start on the latest PTO-ISA checkout.
 # On first failure: cleans cached clone, sets commit_flag, returns 0 (caller retries).
 # On subsequent failures (already pinned): returns 1 (real failure).
 pin_pto_isa_on_failure() {
diff --git a/src/a5/platform/sim/aicore/kernel.cpp b/src/a5/platform/sim/aicore/kernel.cpp
index 1191011f1..ec1ab8c0d 100644
--- a/src/a5/platform/sim/aicore/kernel.cpp
+++ b/src/a5/platform/sim/aicore/kernel.cpp
@@ -1,3 +1,13 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
 /**
  * AICore Kernel Wrapper for Simulation
  *
@@ -5,7 +15,10 @@
  * Sets up per-thread simulated register base before calling the executor.
  */
 
+#include <dlfcn.h>
+
 #include <cstdint>
+
 #include "aicore/aicore.h"
 #include "common/core_type.h"
 #include "common/platform_config.h"
@@ -20,9 +33,20 @@ thread_local uint32_t g_sim_physical_core_id = 0;
 // Declare the original function (defined in aicore_executor.cpp with weak linkage)
 void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);
 
+namespace {
+using CpuSimSetExecutionContextHook = void (*)(uint32_t, uint32_t, uint32_t);
+
+// Looks up pto_cpu_sim_set_execution_context once via dlsym; returns nullptr when no such symbol is visible.
+CpuSimSetExecutionContextHook resolve_cpu_sim_set_execution_context_hook() {
+    static void* const symbol = dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_execution_context");
+    return reinterpret_cast<CpuSimSetExecutionContextHook>(symbol);
+}
+}  // namespace
+
 // Wrapper with extern "C" for dlsym lookup
 // NOTE: physical_core_id stays in wrapper signature (DeviceRunner passes it for register indexing)
-extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) {
+extern "C" void aicore_execute_wrapper(
+    __gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) {
     // Set up simulated register base for this thread.
     // regs points to an array of uint64_t base addresses (one per core).
     // physical_core_id indexes into it to get this core's register block.
@@ -32,6 +56,22 @@ extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, C
     }
 
     g_sim_physical_core_id = physical_core_id;
+    const uint32_t num_aic = static_cast<uint32_t>(runtime->worker_count / PLATFORM_CORES_PER_BLOCKDIM);
+    uint32_t cpu_block_idx = static_cast<uint32_t>(block_idx);
+    uint32_t subblock_id = 0;
+    uint32_t subblock_dim = 1;
+
+    if (core_type == CoreType::AIV && physical_core_id >= num_aic) {
+        const uint32_t aiv_offset = physical_core_id - num_aic;
+        cpu_block_idx = aiv_offset / PLATFORM_AIV_CORES_PER_BLOCKDIM;
+        subblock_id = aiv_offset % PLATFORM_AIV_CORES_PER_BLOCKDIM;
+        subblock_dim = PLATFORM_AIV_CORES_PER_BLOCKDIM;
+    } else {
+        cpu_block_idx = physical_core_id;
+    }
 
+    if (auto hook = resolve_cpu_sim_set_execution_context_hook(); hook != nullptr) {
+        hook(cpu_block_idx, subblock_id, subblock_dim);
+    }
     aicore_execute(runtime, block_idx, core_type);
 }
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index e132e58ab..e877be2c2 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -27,6 +27,7 @@
 #include "device_runner.h"
 
 #include <cstdio>
+#include <map>
 #include <string>
 #include <vector>
 
@@ -34,6 +35,62 @@
 #include "callable.h"
 #include "host/raii_scope_guard.h"
 
+namespace {
+// Per-thread simulated execution context and task cookie (thread_local); the storage map is guarded by the mutex.
+thread_local uint32_t g_cpu_sim_block_idx{0};
+thread_local uint32_t g_cpu_sim_subblock_id{0};
+thread_local uint32_t g_cpu_sim_subblock_dim{1};
+thread_local uint64_t g_cpu_sim_task_cookie{0};
+std::mutex g_cpu_sim_shared_storage_mutex;
+std::map<std::string, void*> g_cpu_sim_shared_storage;
+// Frees every buffer held in the shared-storage map and empties it, holding the map mutex throughout.
+void clear_cpu_sim_shared_storage() {
+    const std::lock_guard<std::mutex> guard(g_cpu_sim_shared_storage_mutex);
+    for (const auto& entry : g_cpu_sim_shared_storage) {
+        std::free(entry.second);
+    }
+    g_cpu_sim_shared_storage.clear();
+}
+}  // namespace
+// Stores the calling thread's simulated block / sub-block context; a zero subblock_dim is recorded as 1.
+extern "C" void pto_cpu_sim_set_execution_context(uint32_t block_idx, uint32_t subblock_id, uint32_t subblock_dim) {
+    g_cpu_sim_subblock_dim = (subblock_dim != 0) ? subblock_dim : 1u;
+    g_cpu_sim_subblock_id = subblock_id;
+    g_cpu_sim_block_idx = block_idx;
+}
+// Stores task_cookie in the calling thread's thread_local cookie slot.
+extern "C" void pto_cpu_sim_set_task_cookie(uint64_t task_cookie) { g_cpu_sim_task_cookie = task_cookie; }
+
+extern "C" void pto_cpu_sim_get_execution_context(uint32_t* block_idx, uint32_t* subblock_id, uint32_t* subblock_dim) {
+    // Copies the calling thread's context into each non-null out-pointer; null pointers are skipped.
+    const auto write_if_requested = [](uint32_t* out, uint32_t value) {
+        if (out != nullptr) {
+            *out = value;
+        }
+    };
+    write_if_requested(block_idx, g_cpu_sim_block_idx);
+    write_if_requested(subblock_id, g_cpu_sim_subblock_id);
+    write_if_requested(subblock_dim, g_cpu_sim_subblock_dim);
+}
+// Returns the task cookie last stored on the calling thread (0 if none has been stored).
+extern "C" uint64_t pto_cpu_sim_get_task_cookie() { return g_cpu_sim_task_cookie; }
+
+extern "C" void* pto_cpu_sim_get_shared_storage(const char* key, size_t size) {
+    // Returns the zero-filled buffer for `key`, allocating `size` bytes on first use; nullptr on bad input or OOM.
+    if (key == nullptr || size == 0) {
+        return nullptr;
+    }
+    std::lock_guard<std::mutex> lock(g_cpu_sim_shared_storage_mutex);
+    if (auto it = g_cpu_sim_shared_storage.find(key); it != g_cpu_sim_shared_storage.end()) {
+        return it->second;  // existing key: `size` is not compared with the original allocation
+    }
+    void* storage = std::calloc(1, size);
+    if (storage != nullptr) {  // never cache a failed allocation, so a later call with this key can retry
+        g_cpu_sim_shared_storage.emplace(key, storage);
+    }
+    return storage;
+}
+
 // Function pointer types for dynamically loaded executors
 typedef int (*aicpu_execute_func_t)(Runtime* runtime);
 typedef void (*aicore_execute_func_t)(
@@ -151,6 +208,7 @@ int DeviceRunner::run(Runtime& runtime,
     const std::vector<uint8_t>& aicpu_so_binary,
     const std::vector<uint8_t>& aicore_kernel_binary,
     int launch_aicpu_num) {
+    clear_cpu_sim_shared_storage();
     // Validate launch_aicpu_num
     if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) {
         LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]", launch_aicpu_num, PLATFORM_MAX_AICPU_THREADS);
@@ -436,6 +494,7 @@ int DeviceRunner::finalize() {
 
     // Free all remaining allocations
     mem_alloc_.finalize();
+    clear_cpu_sim_shared_storage();
 
     device_id_ = -1;
     worker_count_ = 0;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
index 3511579a7..7d42a75b1 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
@@ -16,6 +16,10 @@
 #include "pto2_dispatch_payload.h"   // NOLINT(build/include_subdir)
 #include "runtime.h"                 // NOLINT(build/include_subdir)
 
+#ifdef __CPU_SIM
+#include <dlfcn.h>
+#endif
+
 /**
  * Unified function pointer type for kernel dispatch
  *
@@ -24,6 +28,17 @@
  */
 typedef void (*UnifiedKernelFunc)(__gm__ int64_t*);
 
+#ifdef __CPU_SIM
+namespace {
+using CpuSimSetTaskCookieHook = void (*)(uint64_t);
+// Looks up pto_cpu_sim_set_task_cookie once via dlsym; returns nullptr when no such symbol is visible.
+CpuSimSetTaskCookieHook resolve_cpu_sim_set_task_cookie_hook() {
+    static void* const symbol = dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_task_cookie");
+    return reinterpret_cast<CpuSimSetTaskCookieHook>(symbol);
+}
+}  // namespace
+#endif
+
 /**
  * Execute task from PTO2DispatchPayload.
  *
@@ -120,6 +135,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in
             uint64_t start_time = get_sys_cnt_aicore();
 
             // Execute the task
+#ifdef __CPU_SIM
+            if (auto hook = resolve_cpu_sim_set_task_cookie_hook(); hook != nullptr) {
+                hook(reinterpret_cast<uint64_t>(payload->args));
+            }
+#endif
             execute_task(payload);
 
             // Performance profiling: record task execution
diff --git a/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp b/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp
index e9527e41a..94a2dca22 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/bgemm/kernels/mix/kernel_bgemm.cpp
@@ -1,301 +1,201 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
 /**
- * Tile-based BGEMM Kernel — Combined Cube + Vector (TPUSH/TPOP)
+ * Tile-based BGEMM kernel using the shared cube/vector TPUSH/TPOP path.
  *
- * Computes one tile iteration: P = A[m,k] @ B[k,n], then C[m,n] += P
+ * Computes one tile iteration: P = A[m,k] @ B[k,n], then C[m,n] += P.
  *
  * Single source compiled twice:
- *   - AIC (Cube):   __DAV_CUBE__ defined → TLOAD, TMATMUL, TPUSH
- *   - AIV (Vector):  __DAV_VEC__ defined → TPOP, TADD, TSTORE
- *
- * Intermediate result P is transferred via VEC_FIFO (TPUSH/TPOP),
- * bypassing GM. The accumulator C is still read/written via GM.
+ *   - AIC (cube): __DAV_CUBE__ defined -> TLOAD, TMATMUL, TPUSH
+ *   - AIV (vector): __DAV_VEC__ defined -> TPOP, TADD, TSTORE
  *
- * Simulation fallback (__CPU_SIM):
- *   Uses separate AIC/AIV tasks with GM intermediary (no TPUSH/TPOP).
- *   AIC args: [A, B, P_output]    AIV args: [C_inout, P_input]
+ * Intermediate result P is transferred via VEC_FIFO, bypassing GM.
+ * The accumulator C is still read and written via GM.
  *
- * Hardware args (MixedKernels):
- *   args[0] = input_a  (INPUT)
- *   args[1] = input_b  (INPUT)
- *   args[2] = C_tile   (INOUT: read + write accumulator)
+ * MixedKernels args:
+ *   args[0] = input_a (input)
+ *   args[1] = input_b (input)
+ *   args[2] = C_tile (inout accumulator)
  */
 
- #include <cstdint>
- #include <pto/pto-inst.hpp>
- #ifndef __CPU_SIM
- #include <pto/common/fifo.hpp>
- #endif
- 
- #include "tensor.h"
- 
- using namespace pto;
- 
- #ifndef __gm__
- #define __gm__
- #endif
- 
- #ifndef __aicore__
- #define __aicore__ [aicore]
- #endif
- 
- #ifdef __DAV_CUBE__
- constexpr bool DAV_CUBE = true;
- #else
- constexpr bool DAV_CUBE = false;
- #endif
- 
- #ifdef __DAV_VEC__
- constexpr bool DAV_VEC = true;
- #else
- constexpr bool DAV_VEC = false;
- #endif
- 
- // Tile dimensions (must match golden.py)
- constexpr int TILE = 64;
- constexpr int M = TILE;
- constexpr int K = TILE;
- constexpr int N = TILE;
- 
- // =============================================================================
- // Simulation: separate AIC/AIV tasks with GM intermediate (no TPUSH/TPOP)
- // =============================================================================
- #ifdef __CPU_SIM
- 
- extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
-     // AIC path: args = [A (input), B (input), P (output)]
-     if constexpr (DAV_CUBE) {
-         __gm__ Tensor* input_a_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]);
-         __gm__ Tensor* input_b_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
-         __gm__ Tensor* output_tensor  = reinterpret_cast<__gm__ Tensor*>(args[2]);
- 
-         __gm__ float* input_a = reinterpret_cast<__gm__ float*>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset;
-         __gm__ float* input_b = reinterpret_cast<__gm__ float*>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset;
-         __gm__ float* output  = reinterpret_cast<__gm__ float*>(output_tensor->buffer.addr)  + output_tensor->start_offset;
- 
-         using GlobalDataA = GlobalTensor<float, Shape<1, 1, 1, M, K>,
-             pto::Stride<M * K, M * K, M * K, K, 1>>;
-         using GlobalDataB = GlobalTensor<float, Shape<1, 1, 1, K, N>,
-             pto::Stride<K * N, K * N, K * N, N, 1>>;
-         using GlobalDataC = GlobalTensor<float, Shape<1, 1, 1, M, N>,
-             pto::Stride<M * N, M * N, M * N, N, 1>>;
- 
-         GlobalDataA src0Global(input_a);
-         GlobalDataB src1Global(input_b);
-         GlobalDataC dstGlobal(output);
- 
-         using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-         using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
-         using LeftTile = TileLeft<float, M, K, M, K>;
-         using RightTile = TileRight<float, K, N, K, N>;
-         using AccTile = TileAcc<float, M, N, M, N>;
- 
-         TileMatA aMatTile;
-         TileMatB bMatTile;
-         TASSIGN(aMatTile, 0x0);
-         TASSIGN(bMatTile, 0x20000);
- 
-         LeftTile aTile;
-         RightTile bTile;
-         AccTile cTile;
-         TASSIGN(aTile, 0x0);
-         TASSIGN(bTile, 0x0);
-         TASSIGN(cTile, 0x0);
- 
-         TLOAD(aMatTile, src0Global);
-         TLOAD(bMatTile, src1Global);
- 
-         set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-         wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
- 
-         TMOV(aTile, aMatTile);
-         TMOV(bTile, bMatTile);
- 
-         set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-         wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
- 
-         TMATMUL(cTile, aTile, bTile);
- 
-         set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-         wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
- 
-         TSTORE(dstGlobal, cTile);
- 
-         set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-         wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-     }
- 
-     // AIV path: args = [C (inout), P (input)]
-     if constexpr (DAV_VEC) {
-         __gm__ Tensor* c_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]);
-         __gm__ Tensor* p_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
- 
-         __gm__ float* c_ptr = reinterpret_cast<__gm__ float*>(c_tensor->buffer.addr) + c_tensor->start_offset;
-         __gm__ float* p_ptr = reinterpret_cast<__gm__ float*>(p_tensor->buffer.addr) + p_tensor->start_offset;
- 
-         using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>;
-         using DynStridDim5 = pto::Stride<1, 1, 1, TILE, 1>;
-         using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-         using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
- 
-         TileData cTile(TILE, TILE);
-         TileData pTile(TILE, TILE);
-         TileData outTile(TILE, TILE);
-         TASSIGN(cTile, 0x0);
-         TASSIGN(pTile, 0x10000);
-         TASSIGN(outTile, 0x20000);
- 
-         GlobalData cGlobal(c_ptr);
-         GlobalData pGlobal(p_ptr);
-         GlobalData outGlobal(c_ptr);  // write back to same C location
- 
-         TLOAD(cTile, cGlobal);
-         TLOAD(pTile, pGlobal);
-         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-         TADD(outTile, cTile, pTile);
-         set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-         wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-         TSTORE(outGlobal, outTile);
- 
-         set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-         wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-     }
- }
- 
- // =============================================================================
- // Hardware: MixedKernels with TPUSH/TPOP via VEC_FIFO
- // =============================================================================
- #else  // !__CPU_SIM
- 
- #define VEC_CORES 2
- constexpr int VEC_M = M / VEC_CORES;  // each vector sub-core handles half the rows
- 
- // TPUSH/TPOP pipe configuration
- constexpr uint16_t PP_FLAG_ID = 0;
- constexpr uint8_t PP_FIFO_DEPTH = 2;
- 
- // Cube accumulator (full M×N tile in L0C)
- using AccTileT = TileAcc<float, M, N, M, N>;
- // Vector consumer tile (half tile: VEC_M×N in UB, split across 2 vector sub-cores)
- using VecFifoTileT = Tile<TileType::Vec, float, VEC_M, N, BLayout::RowMajor, VEC_M, N>;
- 
- // Cube→Vector pipe via on-chip VEC_FIFO (bypasses global memory)
- using PipeT = TPipe<PP_FLAG_ID, Direction::DIR_C2V, sizeof(float) * VEC_M * N, PP_FIFO_DEPTH>;
- 
- extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
-     __gm__ Tensor* input_a_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]);
-     __gm__ Tensor* input_b_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
-     __gm__ Tensor* c_tensor       = reinterpret_cast<__gm__ Tensor*>(args[2]);
- 
-     // Pipe and FIFO tile are declared in common scope (both sides reference the type)
-     VecFifoTileT vecFifoTile;
-     PipeT mPipe((__gm__ void *)(uint64_t)0x0, (uint32_t)0x0, (uint32_t)0x0);
- 
-     // =========================================================================
-     // Cube side: TLOAD A,B → TMATMUL → TPUSH result to vector via VEC_FIFO
-     // =========================================================================
-     if constexpr (DAV_CUBE) {
-         __gm__ float* input_a = reinterpret_cast<__gm__ float*>(input_a_tensor->buffer.addr)
-                                 + input_a_tensor->start_offset;
-         __gm__ float* input_b = reinterpret_cast<__gm__ float*>(input_b_tensor->buffer.addr)
-                                 + input_b_tensor->start_offset;
- 
-         using GlobalDataA = GlobalTensor<float, Shape<1, 1, 1, M, K>,
-             pto::Stride<M * K, M * K, M * K, K, 1>>;
-         using GlobalDataB = GlobalTensor<float, Shape<1, 1, 1, K, N>,
-             pto::Stride<K * N, K * N, K * N, N, 1>>;
- 
-         GlobalDataA src0Global(input_a);
-         GlobalDataB src1Global(input_b);
- 
-         using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-         using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
-         using LeftTile = TileLeft<float, M, K, M, K>;
-         using RightTile = TileRight<float, K, N, K, N>;
- 
-         TileMatA aMatTile;
-         TileMatB bMatTile;
-         TASSIGN(aMatTile, 0x0);
-         TASSIGN(bMatTile, 0x20000);
- 
-         LeftTile aTile;
-         RightTile bTile;
-         AccTileT accTile;
-         TASSIGN(aTile, 0x0);
-         TASSIGN(bTile, 0x0);
-         TASSIGN(accTile, 0x0);
- 
-         // Load A and B from GM to L1
-         TLOAD(aMatTile, src0Global);
-         TLOAD(bMatTile, src1Global);
- 
-         set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-         wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
- 
-         // Move from L1 to L0A/L0B
-         TMOV(aTile, aMatTile);
-         TMOV(bTile, bMatTile);
- 
-         set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-         wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
- 
-         // Matrix multiply
-         TMATMUL(accTile, aTile, bTile);
- 
-         set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-         wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
- 
-         // Push result directly to vector core's UB (replaces TSTORE to GM)
-         TPUSH<PipeT, AccTileT, TileSplitAxis::TILE_UP_DOWN>(mPipe, accTile);
-     }
- 
-     // =========================================================================
-     // Vector side: TPOP result from cube → TLOAD C from GM → TADD → TSTORE
-     // =========================================================================
-     if constexpr (DAV_VEC) {
-         uint32_t subBlockIdx = get_subblockid();
- 
-         __gm__ float* c_ptr = reinterpret_cast<__gm__ float*>(c_tensor->buffer.addr)
-                               + c_tensor->start_offset;
-         // Each vector sub-core handles its half: sub-core 0 → rows [0, VEC_M),
-         //                                       sub-core 1 → rows [VEC_M, M)
-         __gm__ float* c_sub = c_ptr + static_cast<size_t>(subBlockIdx) * VEC_M * N;
- 
-         using GlobalC = GlobalTensor<float, Shape<1, 1, 1, VEC_M, N>,
-             pto::Stride<VEC_M * N, VEC_M * N, VEC_M * N, N, 1>>;
- 
-         GlobalC cGlobal(c_sub);
-         GlobalC outGlobal(c_sub);  // write back to same location
- 
-         using VecTile = Tile<TileType::Vec, float, VEC_M, N, BLayout::RowMajor, VEC_M, N>;
- 
-         VecTile cTile;
-         VecTile outTile;
-         // Place after FIFO buffer: FIFO uses [0x0, FIFO_DEPTH * VEC_M * N * 4)
-         // = [0x0, 2 * 32 * 64 * 4) = [0x0, 0x4000)
-         TASSIGN(cTile, 0x4000);
-         TASSIGN(outTile, 0x6000);
- 
-         // Pop matmul result from cube via VEC_FIFO (replaces TLOAD from GM)
-         TPOP<PipeT, VecFifoTileT, TileSplitAxis::TILE_UP_DOWN>(mPipe, vecFifoTile);
- 
-         // Load current C tile from GM
-         TLOAD(cTile, cGlobal);
- 
-         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
- 
-         // Accumulate: C += P
-         TADD(outTile, cTile, vecFifoTile);
-         TFREE<PipeT, TileSplitAxis::TILE_UP_DOWN>(mPipe);
- 
-         set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-         wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
- 
-         // Store result back to GM
-         TSTORE(outGlobal, outTile);
-     }
- }
- 
- #endif  // __CPU_SIM
- 
\ No newline at end of file
+#include <cstdint>
+// clang-format off
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+// clang-format on
+
+#include "tensor.h"
+
+using pto::BLayout;
+using pto::Direction;
+using pto::GlobalTensor;
+using pto::Shape;
+using pto::SLayout;
+using pto::Tile;
+using pto::TileAcc;
+using pto::TileLeft;
+using pto::TileRight;
+using pto::TileSplitAxis;
+using pto::TileType;
+using pto::TPipe;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__
+#endif
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+// Tile dimensions (must match golden.py)
+constexpr int TILE = 64;
+constexpr int M = TILE;
+constexpr int K = TILE;
+constexpr int N = TILE;
+
+constexpr int VEC_CORES = 2;  // typed constant instead of a macro: vector sub-cores the M x N tile is split across
+constexpr int VEC_M = M / VEC_CORES;  // each vector sub-core handles half the rows
+
+// TPUSH/TPOP pipe configuration
+constexpr uint16_t PP_FLAG_ID = 0;
+constexpr uint8_t PP_FIFO_DEPTH = 2;
+
+// Cube accumulator (full M×N tile in L0C)
+using AccTileT = TileAcc<float, M, N, M, N>;
+// Vector consumer tile (half tile: VEC_M×N in UB, split across 2 vector sub-cores)
+using VecFifoTileT = Tile<TileType::Vec, float, VEC_M, N, BLayout::RowMajor, VEC_M, N>;
+
+// Cube→Vector pipe via on-chip VEC_FIFO (bypasses global memory)
+using PipeT = TPipe<PP_FLAG_ID, Direction::DIR_C2V, sizeof(float) * VEC_M * N, PP_FIFO_DEPTH>;
+// Entry point. Cube build: TLOAD A/B, TMATMUL, TPUSH P. Vector build: TPOP P, TLOAD C, TADD, TSTORE back to C.
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
+    __gm__ Tensor* input_a_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]);
+    __gm__ Tensor* input_b_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
+    __gm__ Tensor* c_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]);
+
+    // Declared ahead of both `if constexpr` branches: the cube branch uses mPipe, the vector branch uses both.
+    VecFifoTileT vecFifoTile;
+    PipeT mPipe(nullptr, 0U, 0U);  // NOTE(review): all-zero/null ctor args — confirm meaning against pto-isa TPipe
+
+    // =========================================================================
+    // Cube side: TLOAD A,B → TMATMUL → TPUSH result to vector via VEC_FIFO
+    // =========================================================================
+    if constexpr (DAV_CUBE) {
+        __gm__ float* input_a =
+            reinterpret_cast<__gm__ float*>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset;
+        __gm__ float* input_b =
+            reinterpret_cast<__gm__ float*>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset;
+
+        using GlobalDataA = GlobalTensor<float, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
+        using GlobalDataB = GlobalTensor<float, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, N, 1>>;
+
+        GlobalDataA src0Global(input_a);
+        GlobalDataB src1Global(input_b);
+
+        using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+        using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+        using LeftTile = TileLeft<float, M, K, M, K>;
+        using RightTile = TileRight<float, K, N, K, N>;
+
+        TileMatA aMatTile;
+        TileMatB bMatTile;
+        TASSIGN(aMatTile, 0x0);
+        TASSIGN(bMatTile, 0x20000);
+
+        LeftTile aTile;
+        RightTile bTile;
+        AccTileT accTile;
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(accTile, 0x0);
+
+        // Load A and B from GM to L1
+        TLOAD(aMatTile, src0Global);
+        TLOAD(bMatTile, src1Global);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        // Move from L1 to L0A/L0B
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        // Matrix multiply
+        TMATMUL(accTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        // Push result directly to vector core's UB (replaces TSTORE to GM)
+        TPUSH<PipeT, AccTileT, TileSplitAxis::TILE_UP_DOWN>(mPipe, accTile);
+    }
+
+    // =========================================================================
+    // Vector side: TPOP result from cube → TLOAD C from GM → TADD → TSTORE
+    // =========================================================================
+    if constexpr (DAV_VEC) {
+        uint32_t subBlockIdx = get_subblockid();
+
+        __gm__ float* c_ptr = reinterpret_cast<__gm__ float*>(c_tensor->buffer.addr) + c_tensor->start_offset;
+        // Each vector sub-core handles its half: sub-core 0 → rows [0, VEC_M),
+        //                                       sub-core 1 → rows [VEC_M, M)
+        __gm__ float* c_sub = c_ptr + static_cast<size_t>(subBlockIdx) * VEC_M * N;
+
+        using GlobalC =
+            GlobalTensor<float, Shape<1, 1, 1, VEC_M, N>, pto::Stride<VEC_M * N, VEC_M * N, VEC_M * N, N, 1>>;
+
+        GlobalC cGlobal(c_sub);
+        GlobalC outGlobal(c_sub);  // write back to same location
+
+        using VecTile = Tile<TileType::Vec, float, VEC_M, N, BLayout::RowMajor, VEC_M, N>;
+
+        VecTile cTile;
+        VecTile outTile;
+        // Place after FIFO buffer: FIFO uses [0x0, FIFO_DEPTH * VEC_M * N * 4)
+        // = [0x0, 2 * 32 * 64 * 4) = [0x0, 0x4000)
+        TASSIGN(cTile, 0x4000);
+        TASSIGN(outTile, 0x6000);
+
+        // Pop matmul result from cube via VEC_FIFO (replaces TLOAD from GM)
+        TPOP<PipeT, VecFifoTileT, TileSplitAxis::TILE_UP_DOWN>(mPipe, vecFifoTile);
+
+        // Load current C tile from GM
+        TLOAD(cTile, cGlobal);
+
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        // Accumulate: C += P
+        TADD(outTile, cTile, vecFifoTile);
+        TFREE<PipeT, TileSplitAxis::TILE_UP_DOWN>(mPipe);
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+
+        // Store result back to GM
+        TSTORE(outGlobal, outTile);
+    }
+}