Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,14 @@ trap 'kill $WATCHDOG_PID 2>/dev/null; pkill -TERM -P $$ 2>/dev/null; rm -rf "$LO
) >/dev/null 2>&1 &
WATCHDOG_PID=$!

# commit_flag starts empty. When PTO_ISA_COMMIT is non-empty, the pin is applied up front:
# the cached clone at examples/scripts/_deps/pto-isa is removed and commit_flag becomes
# (-c "$PTO_ISA_COMMIT"). Otherwise it stays empty here; pin_pto_isa_on_failure (below)
# is the path that sets it after a first failure.
commit_flag=()
if [[ -n "$PTO_ISA_COMMIT" ]]; then
echo "[CI] Using pinned PTO-ISA commit from start: $PTO_ISA_COMMIT"
rm -rf examples/scripts/_deps/pto-isa
commit_flag=(-c "$PTO_ISA_COMMIT")
fi

# Pin PTO-ISA to the specified commit on first failure.
# Legacy fallback path for callers that start on the latest PTO-ISA checkout.
# On first failure: cleans cached clone, sets commit_flag, returns 0 (caller retries).
# On subsequent failures (already pinned): returns 1 (real failure).
pin_pto_isa_on_failure() {
Expand Down
42 changes: 41 additions & 1 deletion src/a5/platform/sim/aicore/kernel.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/
/**
* AICore Kernel Wrapper for Simulation
*
* Provides a wrapper around aicore_execute for dlsym lookup.
* Sets up per-thread simulated register base before calling the executor.
*/

#include <dlfcn.h>

#include <cstdint>

#include "aicore/aicore.h"
#include "common/core_type.h"
#include "common/platform_config.h"
Expand All @@ -20,9 +33,20 @@ thread_local uint32_t g_sim_physical_core_id = 0;
// Declare the original function (defined in aicore_executor.cpp with weak linkage)
void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);

namespace {
// Signature of the optional hook: (block index, sub-block id, sub-block dimension).
using CpuSimSetExecutionContextHook = void (*)(uint32_t, uint32_t, uint32_t);

/**
 * Look up "pto_cpu_sim_set_execution_context" among the symbols visible via RTLD_DEFAULT.
 *
 * The lookup runs once (function-local static) and the result is reused on every call.
 *
 * @return The hook's address, or nullptr when dlsym does not find the symbol.
 */
CpuSimSetExecutionContextHook resolve_cpu_sim_set_execution_context_hook() {
    static const CpuSimSetExecutionContextHook cached_hook = [] {
        void* symbol = dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_execution_context");
        return reinterpret_cast<CpuSimSetExecutionContextHook>(symbol);
    }();
    return cached_hook;
}
}  // namespace

// Wrapper with extern "C" for dlsym lookup
// NOTE: physical_core_id stays in wrapper signature (DeviceRunner passes it for register indexing)
extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) {
extern "C" void aicore_execute_wrapper(
__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs) {
// Set up simulated register base for this thread.
// regs points to an array of uint64_t base addresses (one per core).
// physical_core_id indexes into it to get this core's register block.
Expand All @@ -32,6 +56,22 @@ extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, C
}

g_sim_physical_core_id = physical_core_id;
const uint32_t num_aic = static_cast<uint32_t>(runtime->worker_count / PLATFORM_CORES_PER_BLOCKDIM);
uint32_t cpu_block_idx = static_cast<uint32_t>(block_idx);
uint32_t subblock_id = 0;
uint32_t subblock_dim = 1;

if (core_type == CoreType::AIV && physical_core_id >= num_aic) {
const uint32_t aiv_offset = physical_core_id - num_aic;
cpu_block_idx = aiv_offset / PLATFORM_AIV_CORES_PER_BLOCKDIM;
subblock_id = aiv_offset % PLATFORM_AIV_CORES_PER_BLOCKDIM;
subblock_dim = PLATFORM_AIV_CORES_PER_BLOCKDIM;
} else {
cpu_block_idx = physical_core_id;
}

if (auto hook = resolve_cpu_sim_set_execution_context_hook(); hook != nullptr) {
hook(cpu_block_idx, subblock_id, subblock_dim);
}
aicore_execute(runtime, block_idx, core_type);
}
59 changes: 59 additions & 0 deletions src/a5/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,70 @@
#include "device_runner.h"

#include <cstdio>
#include <map>
#include <string>
#include <vector>

#include "aicpu/platform_aicpu_affinity.h"
#include "callable.h"
#include "host/raii_scope_guard.h"

namespace {
// Per-thread execution context, written by pto_cpu_sim_set_execution_context()
// and read back by pto_cpu_sim_get_execution_context().
thread_local uint32_t g_cpu_sim_block_idx = 0;
thread_local uint32_t g_cpu_sim_subblock_id = 0;
thread_local uint32_t g_cpu_sim_subblock_dim = 1;
// Per-thread task cookie, written/read by the pto_cpu_sim_*_task_cookie() pair.
thread_local uint64_t g_cpu_sim_task_cookie = 0;
// Keyed buffers handed out by pto_cpu_sim_get_shared_storage(); every access
// to the map takes g_cpu_sim_shared_storage_mutex.
std::mutex g_cpu_sim_shared_storage_mutex;
std::map<std::string, void*> g_cpu_sim_shared_storage;

/**
 * Release every buffer held in the shared-storage map and empty the map.
 * Holds the storage mutex for the whole operation.
 */
void clear_cpu_sim_shared_storage() {
    std::lock_guard<std::mutex> guard(g_cpu_sim_shared_storage_mutex);
    for (auto it = g_cpu_sim_shared_storage.begin(); it != g_cpu_sim_shared_storage.end(); ++it) {
        std::free(it->second);
    }
    g_cpu_sim_shared_storage.clear();
}
}  // namespace

/**
 * Record the calling thread's execution context in thread-local state.
 *
 * @param block_idx     Block index to store.
 * @param subblock_id   Sub-block id to store.
 * @param subblock_dim  Sub-block dimension to store; a value of 0 is stored as 1.
 */
extern "C" void pto_cpu_sim_set_execution_context(uint32_t block_idx, uint32_t subblock_id, uint32_t subblock_dim) {
    if (subblock_dim == 0) {
        subblock_dim = 1u;
    }
    g_cpu_sim_subblock_dim = subblock_dim;
    g_cpu_sim_subblock_id = subblock_id;
    g_cpu_sim_block_idx = block_idx;
}

/**
 * Store the given task cookie in the calling thread's thread-local slot.
 *
 * @param task_cookie  Value later returned by pto_cpu_sim_get_task_cookie() on the same thread.
 */
extern "C" void pto_cpu_sim_set_task_cookie(uint64_t task_cookie) {
    g_cpu_sim_task_cookie = task_cookie;
}

/**
 * Copy the calling thread's stored execution context into the supplied outputs.
 *
 * Each output pointer is optional: a nullptr argument is skipped.
 *
 * @param block_idx     Receives the stored block index, if non-null.
 * @param subblock_id   Receives the stored sub-block id, if non-null.
 * @param subblock_dim  Receives the stored sub-block dimension, if non-null.
 */
extern "C" void pto_cpu_sim_get_execution_context(uint32_t* block_idx, uint32_t* subblock_id, uint32_t* subblock_dim) {
    const auto write_if_requested = [](uint32_t* out, uint32_t value) {
        if (out != nullptr) {
            *out = value;
        }
    };
    write_if_requested(block_idx, g_cpu_sim_block_idx);
    write_if_requested(subblock_id, g_cpu_sim_subblock_id);
    write_if_requested(subblock_dim, g_cpu_sim_subblock_dim);
}

/**
 * Read the task cookie stored for the calling thread.
 *
 * @return The thread-local cookie value (0 until pto_cpu_sim_set_task_cookie() is called on this thread).
 */
extern "C" uint64_t pto_cpu_sim_get_task_cookie() {
    const uint64_t cookie = g_cpu_sim_task_cookie;
    return cookie;
}

/**
 * Return the zero-initialized buffer registered under `key`, allocating it on first request.
 *
 * The lookup and the allocation both happen under g_cpu_sim_shared_storage_mutex, so
 * callers asking for the same key receive the same pointer.
 *
 * @param key   NUL-terminated name of the buffer; nullptr yields nullptr.
 * @param size  Number of bytes to allocate when the key is first seen; 0 yields nullptr.
 * @return Pointer to the buffer, or nullptr on invalid arguments or allocation failure.
 *
 * NOTE(review): when the key already exists, `size` is not compared with the size used
 * for the original allocation — callers presumably pass a consistent size per key; confirm.
 */
extern "C" void* pto_cpu_sim_get_shared_storage(const char* key, size_t size) {
    if (key == nullptr || size == 0) {
        return nullptr;
    }

    std::lock_guard<std::mutex> lock(g_cpu_sim_shared_storage_mutex);

    // Claim the map slot before allocating: if the insertion throws, no buffer has been
    // allocated yet, so nothing can leak.
    auto [slot, inserted] = g_cpu_sim_shared_storage.try_emplace(key, nullptr);
    if (!inserted) {
        return slot->second;
    }

    void* storage = std::calloc(1, size);
    if (storage == nullptr) {
        // Do not cache the failure: drop the placeholder so a later call can retry.
        g_cpu_sim_shared_storage.erase(slot);
        return nullptr;
    }

    slot->second = storage;
    return storage;
}

// Function pointer types for dynamically loaded executors
typedef int (*aicpu_execute_func_t)(Runtime* runtime);
typedef void (*aicore_execute_func_t)(
Expand Down Expand Up @@ -151,6 +208,7 @@ int DeviceRunner::run(Runtime& runtime,
const std::vector<uint8_t>& aicpu_so_binary,
const std::vector<uint8_t>& aicore_kernel_binary,
int launch_aicpu_num) {
clear_cpu_sim_shared_storage();
// Validate launch_aicpu_num
if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) {
LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]", launch_aicpu_num, PLATFORM_MAX_AICPU_THREADS);
Expand Down Expand Up @@ -436,6 +494,7 @@ int DeviceRunner::finalize() {

// Free all remaining allocations
mem_alloc_.finalize();
clear_cpu_sim_shared_storage();

device_id_ = -1;
worker_count_ = 0;
Expand Down
20 changes: 20 additions & 0 deletions src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include "pto2_dispatch_payload.h" // NOLINT(build/include_subdir)
#include "runtime.h" // NOLINT(build/include_subdir)

#ifdef __CPU_SIM
#include <dlfcn.h>
#endif

/**
* Unified function pointer type for kernel dispatch
*
Expand All @@ -24,6 +28,17 @@
*/
typedef void (*UnifiedKernelFunc)(__gm__ int64_t*);

#ifdef __CPU_SIM
namespace {
// Signature of the optional hook: takes the 64-bit task cookie.
using CpuSimSetTaskCookieHook = void (*)(uint64_t);

/**
 * Look up "pto_cpu_sim_set_task_cookie" among the symbols visible via RTLD_DEFAULT.
 *
 * The lookup runs once (function-local static) and the result is reused on every call.
 *
 * @return The hook's address, or nullptr when dlsym does not find the symbol.
 */
CpuSimSetTaskCookieHook resolve_cpu_sim_set_task_cookie_hook() {
    static const CpuSimSetTaskCookieHook cached_hook = [] {
        void* symbol = dlsym(RTLD_DEFAULT, "pto_cpu_sim_set_task_cookie");
        return reinterpret_cast<CpuSimSetTaskCookieHook>(symbol);
    }();
    return cached_hook;
}
}  // namespace
#endif

/**
* Execute task from PTO2DispatchPayload.
*
Expand Down Expand Up @@ -120,6 +135,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in
uint64_t start_time = get_sys_cnt_aicore();

// Execute the task
#ifdef __CPU_SIM
if (auto hook = resolve_cpu_sim_set_task_cookie_hook(); hook != nullptr) {
hook(reinterpret_cast<uint64_t>(payload->args));
}
#endif
execute_task(payload);

// Performance profiling: record task execution
Expand Down
Loading
Loading