Skip to content

Commit 61ee9e8

Browse files
author
RuoyuZhou
committed
fix: run a5sim bgemm through the shared pipe path
1 parent bb1459e commit 61ee9e8

4 files changed

Lines changed: 313 additions & 294 deletions

File tree

src/a5/platform/sim/aicore/kernel.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
1+
/*
2+
* Copyright (c) PyPTO Contributors.
3+
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
4+
* CANN Open Software License Agreement Version 2.0 (the "License").
5+
* Please refer to the License for details. You may not use this file except in compliance with the License.
6+
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
7+
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
8+
* See LICENSE in the root of the software repository for the full text of the License.
9+
* -----------------------------------------------------------------------------------------------------------
10+
*/
111
/**
212
* AICore Kernel Wrapper for Simulation
313
*
414
* Provides a wrapper around aicore_execute for dlsym lookup.
515
* Sets up per-thread simulated register base before calling the executor.
616
*/
717

18+
#include <dlfcn.h>
19+
820
#include <cstdint>
21+
922
#include "aicore/aicore.h"
1023
#include "common/core_type.h"
1124
#include "common/platform_config.h"
@@ -20,9 +33,20 @@ thread_local uint32_t g_sim_physical_core_id = 0;
2033
// Declare the original function (defined in aicore_executor.cpp with weak linkage)
2134
void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);
2235

36+
namespace {
// Signature of the optional execution-context hook: (block_idx, subblock_id, subblock_dim).
using CpuSimSetExecutionContextHook = void (*)(uint32_t, uint32_t, uint32_t);

/**
 * Look up "pto_cpu_sim_set_execution_context" among the globally visible symbols.
 *
 * The dlsym() lookup runs once, on the first call; its result is kept in a
 * function-local static and returned unchanged by every later call.
 *
 * @return The resolved hook, or nullptr when dlsym() did not find the symbol.
 */
CpuSimSetExecutionContextHook resolve_cpu_sim_set_execution_context_hook() {
    static const CpuSimSetExecutionContextHook cached_hook = [] {
        void* symbol = dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_execution_context");
        return reinterpret_cast<CpuSimSetExecutionContextHook>(symbol);
    }();
    return cached_hook;
}
}  // namespace
45+
2346
// Wrapper with extern "C" for dlsym lookup
2447
// NOTE: physical_core_id stays in wrapper signature (DeviceRunner passes it for register indexing)
25-
extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) {
48+
extern "C" void aicore_execute_wrapper(
49+
__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) {
2650
// Set up simulated register base for this thread.
2751
// regs points to an array of uint64_t base addresses (one per core).
2852
// physical_core_id indexes into it to get this core's register block.
@@ -32,6 +56,22 @@ extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, C
3256
}
3357

3458
g_sim_physical_core_id = physical_core_id;
59+
const uint32_t num_aic = static_cast<uint32_t>(runtime->worker_count / PLATFORM_CORES_PER_BLOCKDIM);
60+
uint32_t cpu_block_idx = static_cast<uint32_t>(block_idx);
61+
uint32_t subblock_id = 0;
62+
uint32_t subblock_dim = 1;
63+
64+
if (core_type == CoreType::AIV && physical_core_id >= num_aic) {
65+
const uint32_t aiv_offset = physical_core_id - num_aic;
66+
cpu_block_idx = aiv_offset / PLATFORM_AIV_CORES_PER_BLOCKDIM;
67+
subblock_id = aiv_offset % PLATFORM_AIV_CORES_PER_BLOCKDIM;
68+
subblock_dim = PLATFORM_AIV_CORES_PER_BLOCKDIM;
69+
} else {
70+
cpu_block_idx = physical_core_id;
71+
}
3572

73+
if (auto hook = resolve_cpu_sim_set_execution_context_hook(); hook != nullptr) {
74+
hook(cpu_block_idx, subblock_id, subblock_dim);
75+
}
3676
aicore_execute(runtime, block_idx, core_type);
3777
}

src/a5/platform/sim/host/device_runner.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,70 @@
2727
#include "device_runner.h"
2828

2929
#include <cstdio>
30+
#include <map>
3031
#include <string>
3132
#include <vector>
3233

3334
#include "aicpu/platform_aicpu_affinity.h"
3435
#include "callable.h"
3536
#include "host/raii_scope_guard.h"
3637

38+
namespace {
// Per-thread execution context, written by pto_cpu_sim_set_execution_context
// and read back by pto_cpu_sim_get_execution_context.
thread_local uint32_t g_cpu_sim_block_idx = 0;
thread_local uint32_t g_cpu_sim_subblock_id = 0;
thread_local uint32_t g_cpu_sim_subblock_dim = 1;

// Per-thread task cookie, written by pto_cpu_sim_set_task_cookie.
thread_local uint64_t g_cpu_sim_task_cookie = 0;

// Key -> heap buffer table handed out by pto_cpu_sim_get_shared_storage.
// Every access to the map must hold g_cpu_sim_shared_storage_mutex.
std::mutex g_cpu_sim_shared_storage_mutex;
std::map<std::string, void*> g_cpu_sim_shared_storage;

/**
 * Release every buffer recorded in the shared-storage table and empty the table.
 *
 * The table mutex is held for the whole operation.
 */
void clear_cpu_sim_shared_storage() {
    std::lock_guard<std::mutex> guard(g_cpu_sim_shared_storage_mutex);
    for (auto entry = g_cpu_sim_shared_storage.begin(); entry != g_cpu_sim_shared_storage.end(); ++entry) {
        std::free(entry->second);
    }
    g_cpu_sim_shared_storage.clear();
}
}  // namespace
55+
56+
/**
 * Record the calling thread's execution context.
 *
 * @param block_idx     Stored as-is.
 * @param subblock_id   Stored as-is.
 * @param subblock_dim  Stored as-is, except that 0 is replaced by 1.
 */
extern "C" void pto_cpu_sim_set_execution_context(uint32_t block_idx, uint32_t subblock_id, uint32_t subblock_dim) {
    uint32_t effective_dim = subblock_dim;
    if (effective_dim == 0) {
        effective_dim = 1u;
    }

    g_cpu_sim_subblock_dim = effective_dim;
    g_cpu_sim_subblock_id = subblock_id;
    g_cpu_sim_block_idx = block_idx;
}
61+
62+
/**
 * Store the task cookie for the calling thread.
 *
 * @param task_cookie  Value later returned by pto_cpu_sim_get_task_cookie on the same thread.
 */
extern "C" void pto_cpu_sim_set_task_cookie(uint64_t task_cookie) {
    g_cpu_sim_task_cookie = task_cookie;
}
63+
64+
extern "C" void pto_cpu_sim_get_execution_context(uint32_t* block_idx, uint32_t* subblock_id, uint32_t* subblock_dim) {
65+
if (block_idx != nullptr) {
66+
*block_idx = g_cpu_sim_block_idx;
67+
}
68+
if (subblock_id != nullptr) {
69+
*subblock_id = g_cpu_sim_subblock_id;
70+
}
71+
if (subblock_dim != nullptr) {
72+
*subblock_dim = g_cpu_sim_subblock_dim;
73+
}
74+
}
75+
76+
/**
 * Return the task cookie stored for the calling thread.
 *
 * @return The value last passed to pto_cpu_sim_set_task_cookie on this thread (0 initially).
 */
extern "C" uint64_t pto_cpu_sim_get_task_cookie() {
    const uint64_t current_cookie = g_cpu_sim_task_cookie;
    return current_cookie;
}
77+
78+
extern "C" void* pto_cpu_sim_get_shared_storage(const char* key, size_t size) {
79+
if (key == nullptr || size == 0) {
80+
return nullptr;
81+
}
82+
83+
std::lock_guard<std::mutex> lock(g_cpu_sim_shared_storage_mutex);
84+
auto it = g_cpu_sim_shared_storage.find(key);
85+
if (it != g_cpu_sim_shared_storage.end()) {
86+
return it->second;
87+
}
88+
89+
void* storage = std::calloc(1, size);
90+
g_cpu_sim_shared_storage.emplace(key, storage);
91+
return storage;
92+
}
93+
3794
// Function pointer types for dynamically loaded executors
3895
typedef int (*aicpu_execute_func_t)(Runtime* runtime);
3996
typedef void (*aicore_execute_func_t)(
@@ -151,6 +208,7 @@ int DeviceRunner::run(Runtime& runtime,
151208
const std::vector<uint8_t>& aicpu_so_binary,
152209
const std::vector<uint8_t>& aicore_kernel_binary,
153210
int launch_aicpu_num) {
211+
clear_cpu_sim_shared_storage();
154212
// Validate launch_aicpu_num
155213
if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) {
156214
LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]", launch_aicpu_num, PLATFORM_MAX_AICPU_THREADS);
@@ -436,6 +494,7 @@ int DeviceRunner::finalize() {
436494

437495
// Free all remaining allocations
438496
mem_alloc_.finalize();
497+
clear_cpu_sim_shared_storage();
439498

440499
device_id_ = -1;
441500
worker_count_ = 0;

src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
#include "pto2_dispatch_payload.h" // NOLINT(build/include_subdir)
1717
#include "runtime.h" // NOLINT(build/include_subdir)
1818

19+
#ifdef __CPU_SIM
20+
#include <dlfcn.h>
21+
#endif
22+
1923
/**
2024
* Unified function pointer type for kernel dispatch
2125
*
@@ -24,6 +28,17 @@
2428
*/
2529
typedef void (*UnifiedKernelFunc)(__gm__ int64_t*);
2630

31+
#ifdef __CPU_SIM
32+
namespace {
// Signature of the optional task-cookie hook: takes the 64-bit cookie value.
using CpuSimSetTaskCookieHook = void (*)(uint64_t);

/**
 * Look up "pto_cpu_sim_set_task_cookie" among the globally visible symbols.
 *
 * The dlsym() lookup runs once, on the first call; its result is kept in a
 * function-local static and returned unchanged by every later call.
 *
 * @return The resolved hook, or nullptr when dlsym() did not find the symbol.
 */
CpuSimSetTaskCookieHook resolve_cpu_sim_set_task_cookie_hook() {
    static const CpuSimSetTaskCookieHook cached_hook = [] {
        void* symbol = dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_task_cookie");
        return reinterpret_cast<CpuSimSetTaskCookieHook>(symbol);
    }();
    return cached_hook;
}
}  // namespace
40+
#endif
41+
2742
/**
2843
* Execute task from PTO2DispatchPayload.
2944
*
@@ -120,6 +135,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in
120135
uint64_t start_time = get_sys_cnt_aicore();
121136

122137
// Execute the task
138+
#ifdef __CPU_SIM
139+
if (auto hook = resolve_cpu_sim_set_task_cookie_hook(); hook != nullptr) {
140+
hook(reinterpret_cast<uint64_t>(payload->args));
141+
}
142+
#endif
123143
execute_task(payload);
124144

125145
// Performance profiling: record task execution

0 commit comments

Comments
 (0)