Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions src/a2a3/platform/include/aicpu/platform_regs.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,30 @@ uint64_t get_platform_regs();
*/
uint64_t read_reg(uint64_t reg_base_addr, RegId reg);

/**
* Poll a register value without memory barriers (for hot polling loops)
*
* Unlike read_reg(), this function performs a bare volatile read with no
* memory barriers. This is safe for polling loops where the "not-yet-done"
* fast path has no Normal-memory data dependency on the register value.
*
* Callers MUST insert an explicit memory barrier (e.g. poll_acquire_barrier())
* after detecting the awaited condition, before accessing Normal memory that
* depends on the polled result.
*
* On real hardware: MMIO registers are mapped as Device (non-cacheable)
* memory, and volatile prevents the compiler from caching or reordering the
* access. No hardware barrier is needed for visibility.
*
* On simulation: registers are Normal memory; volatile prevents compiler
* reordering. Cache coherence ensures cross-thread visibility within
* a bounded number of iterations.
*
* @param reg_base_addr Base address of the AICore's register block
* @param reg Register identifier (C++ enum class)
* @return Register value (zero-extended to uint64_t)
*/
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg);

/**
* Write a value to an AICore's register
*
Expand Down
29 changes: 29 additions & 0 deletions src/a2a3/platform/include/common/memory_barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,33 @@
#define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("" ::: "memory")
#endif

// =============================================================================
// Polling Acquire Barrier
// =============================================================================

/**
* Polling acquire barrier
*
* Use after poll_reg() detects the awaited condition (e.g., task completion),
* before accessing Normal memory whose correctness depends on the polled value.
*
* ARM64: dmb ish (data memory barrier, inner shareable, full)
* Ensures the Device-memory register read is ordered before all subsequent
* Normal-memory loads and stores in the completion path.
* Chosen over dmb ishld (load-only) for safety margin: negligible cost
* (executed once per completion, not per poll iteration) and protects
* against future stores that may be added to the completion path.
*
* x86_64: compiler barrier only (TSO provides implicit acquire on all loads)
*
* Other: full barrier fallback (__sync_synchronize)
*/
#if defined(__aarch64__)
// ARM64: full dmb (inner shareable) orders the prior register read against
// all subsequent loads AND stores on the completion path.
#define poll_acquire_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
#elif defined(__x86_64__)
// x86_64: TSO already gives loads acquire semantics; only the compiler must
// be prevented from reordering across this point.
#define poll_acquire_barrier() __asm__ __volatile__("" ::: "memory")
#else
// Unknown ISA: conservative full hardware barrier.
#define poll_acquire_barrier() __sync_synchronize()
#endif

#endif // SRC_A2A3_PLATFORM_INCLUDE_COMMON_MEMORY_BARRIER_H_
11 changes: 8 additions & 3 deletions src/a2a3/platform/src/aicpu/platform_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,25 @@ void set_platform_regs(uint64_t regs) { g_platform_regs = regs; }

uint64_t get_platform_regs() { return g_platform_regs; }

uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg));

__sync_synchronize();

// Read the register value
uint64_t value = static_cast<uint64_t>(*ptr);
uint64_t value = static_cast<uint64_t>(*ptr); // NOLINT(modernize-use-auto)

__sync_synchronize();

return value;
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
    // Bare volatile read with no memory barriers — intended for hot polling
    // loops. Once the awaited condition is observed, the caller must issue
    // poll_acquire_barrier() before touching dependent Normal memory.
    const uint64_t reg_addr = reg_base_addr + reg_offset(reg);
    return static_cast<uint64_t>(*reinterpret_cast<volatile uint32_t *>(reg_addr));
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters)
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg));

__sync_synchronize();
Expand Down
7 changes: 5 additions & 2 deletions src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* -----------------------------------------------------------------------------------------------------------
*/
#include <dlfcn.h>
// NOLINTBEGIN
#include <fcntl.h>
#include <unistd.h>

Expand All @@ -23,7 +24,7 @@
#include <sys/mman.h>
#endif

#include "aicpu/device_log.h"
#include "aicpu/device_log.h" // NOLINT(clang-diagnostic-error)
#include "aicpu/device_time.h"
#include "pto2_dispatch_payload.h"
#include "runtime.h"
Expand Down Expand Up @@ -310,7 +311,7 @@ struct AicpuExecutor {
uint64_t reg_addr = core_id_to_reg_addr_[core_id];

int32_t expected_reg_task_id = executing_reg_task_ids_[core_id];
uint64_t reg_val = read_reg(reg_addr, RegId::COND);
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE;
Expand All @@ -324,6 +325,7 @@ struct AicpuExecutor {
#endif

if (done) {
poll_acquire_barrier();
executing_reg_task_ids_[core_id] = AICPU_TASK_INVALID;
PTO2SubtaskSlot subslot = executing_subslot_by_core_[core_id];
PTO2TaskSlotState &slot_state = *executing_slot_state_by_core_[core_id];
Expand Down Expand Up @@ -2252,3 +2254,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
return 0;
}
// NOLINTEND
12 changes: 10 additions & 2 deletions src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
* -----------------------------------------------------------------------------------------------------------
*/

// NOLINTBEGIN
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <mutex>

#include "aicpu/device_log.h"
#include "aicpu/device_log.h" // NOLINT(clang-diagnostic-error)
#include "aicpu/device_time.h"
#include "aicpu/performance_collector_aicpu.h"
#include "aicpu/platform_regs.h"
Expand Down Expand Up @@ -608,12 +609,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
uint64_t reg_addr = core_id_to_reg_addr_[core_id];
Handshake *h = &hank[core_id];

uint64_t reg_val = read_reg(reg_addr, RegId::COND);
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
int reg_task_id = EXTRACT_TASK_ID(reg_val);
int reg_state = EXTRACT_TASK_STATE(reg_val);

// Case 1: Pending task finished directly
if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
poll_acquire_barrier();

LOG_INFO(
"Thread %d: Core %d completed task %d (running_id=%d)", thread_idx, core_id,
pending_task_ids_[core_id], running_task_ids_[core_id]
Expand Down Expand Up @@ -716,6 +719,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
}
} else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) {
// Case 2: Pending task received ACK
poll_acquire_barrier();

LOG_INFO(
"Thread %d: Core %d ACKed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id],
running_task_ids_[core_id]
Expand Down Expand Up @@ -770,6 +775,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
// Continue to Case 4 to dispatch next task
} else if (reg_task_id == running_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
// Case 3: Running task finished
poll_acquire_barrier();

LOG_INFO(
"Thread %d: Core %d completed task %d (pending_id=%d)", thread_idx, core_id,
running_task_ids_[core_id], pending_task_ids_[core_id]
Expand Down Expand Up @@ -1200,3 +1207,4 @@ extern "C" int aicpu_execute(Runtime *runtime) {
LOG_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
return 0;
}
// NOLINTEND
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* -----------------------------------------------------------------------------------------------------------
*/
#include <dlfcn.h>
// NOLINTBEGIN
#include <fcntl.h>
#include <unistd.h>

Expand Down Expand Up @@ -374,7 +375,7 @@ struct AicpuExecutor {
uint64_t reg_addr = core_exec_state.reg_addr;

int32_t expected_reg_task_id = core_exec_state.executing_reg_task_id;
uint64_t reg_val = read_reg(reg_addr, RegId::COND);
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE;
Expand All @@ -388,6 +389,7 @@ struct AicpuExecutor {
#endif

if (done) {
poll_acquire_barrier();
core_exec_state.executing_reg_task_id = AICPU_TASK_INVALID;
PTO2TaskSlotState &slot_state = *core_exec_state.executing_slot_state;

Expand Down Expand Up @@ -2372,3 +2374,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
return 0;
}
// NOLINTEND
24 changes: 24 additions & 0 deletions src/a5/platform/include/aicpu/platform_regs.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,30 @@ uint64_t get_platform_regs();
*/
uint64_t read_reg(uint64_t reg_base_addr, RegId reg);

/**
* Poll a register value without memory barriers (for hot polling loops)
*
* Unlike read_reg(), this function performs a bare volatile read with no
* memory barriers. This is safe for polling loops where the "not-yet-done"
* fast path has no Normal-memory data dependency on the register value.
*
* Callers MUST insert an explicit memory barrier (e.g. poll_acquire_barrier())
* after detecting the awaited condition, before accessing Normal memory that
* depends on the polled result.
*
* On real hardware: MMIO registers are mapped as Device (non-cacheable)
* memory, and volatile prevents the compiler from caching or reordering the
* access. No hardware barrier is needed for visibility.
*
* On simulation: registers are Normal memory; volatile prevents compiler
* reordering. Cache coherence ensures cross-thread visibility within
* a bounded number of iterations.
*
* @param reg_base_addr Base address of the AICore's register block
* @param reg Register identifier (C++ enum class)
* @return Register value (zero-extended to uint64_t)
*/
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg);

/**
* Write a value to an AICore's register
*
Expand Down
29 changes: 29 additions & 0 deletions src/a5/platform/include/common/memory_barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,33 @@
#define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("" ::: "memory")
#endif

// =============================================================================
// Polling Acquire Barrier
// =============================================================================

/**
* Polling acquire barrier
*
* Use after poll_reg() detects the awaited condition (e.g., task completion),
* before accessing Normal memory whose correctness depends on the polled value.
*
* ARM64: dmb ish (data memory barrier, inner shareable, full)
* Ensures the Device-memory register read is ordered before all subsequent
* Normal-memory loads and stores in the completion path.
* Chosen over dmb ishld (load-only) for safety margin: negligible cost
* (executed once per completion, not per poll iteration) and protects
* against future stores that may be added to the completion path.
*
* x86_64: compiler barrier only (TSO provides implicit acquire on all loads)
*
* Other: full barrier fallback (__sync_synchronize)
*/
#if defined(__aarch64__)
// ARM64: full dmb (inner shareable) orders the prior register read against
// all subsequent loads AND stores on the completion path.
#define poll_acquire_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
#elif defined(__x86_64__)
// x86_64: TSO already gives loads acquire semantics; only the compiler must
// be prevented from reordering across this point.
#define poll_acquire_barrier() __asm__ __volatile__("" ::: "memory")
#else
// Unknown ISA: conservative full hardware barrier.
#define poll_acquire_barrier() __sync_synchronize()
#endif

#endif // SRC_A5_PLATFORM_INCLUDE_COMMON_MEMORY_BARRIER_H_
11 changes: 9 additions & 2 deletions src/a5/platform/onboard/aicpu/inner_platform_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
* virtual address with no remapping.
*/

// NOLINTBEGIN(clang-diagnostic-error)
#include <cstdint>
#include "aicpu/platform_regs.h"
#include "common/platform_config.h"
// NOLINTEND(clang-diagnostic-error)

uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
uint32_t offset = reg_offset(reg);
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + offset);

Expand All @@ -32,7 +34,12 @@ uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
return value;
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg));
return static_cast<uint64_t>(*ptr);
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters)
uint32_t offset = reg_offset(reg);
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + offset);

Expand Down
13 changes: 10 additions & 3 deletions src/a5/platform/sim/aicpu/inner_platform_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,26 @@
#include "aicpu/platform_regs.h"
#include "common/platform_config.h"

uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
uint32_t offset = reg_offset(reg);
volatile uint8_t *reg_base = reinterpret_cast<volatile uint8_t *>(reg_base_addr);
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(sparse_reg_ptr(reg_base, offset));

__sync_synchronize();
uint64_t value = static_cast<uint64_t>(*ptr);
uint64_t value = static_cast<uint64_t>(*ptr); // NOLINT(modernize-use-auto)
__sync_synchronize();

return value;
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
uint32_t offset = reg_offset(reg);
volatile uint8_t *reg_base = reinterpret_cast<volatile uint8_t *>(reg_base_addr);
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(sparse_reg_ptr(reg_base, offset));
return static_cast<uint64_t>(*ptr);
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters)
uint32_t offset = reg_offset(reg);
volatile uint8_t *reg_base = reinterpret_cast<volatile uint8_t *>(reg_base_addr);
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(sparse_reg_ptr(reg_base, offset));
Expand Down
10 changes: 9 additions & 1 deletion src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* -----------------------------------------------------------------------------------------------------------
*/
#include <atomic>
// NOLINTBEGIN
#include <cstdint>
#include <cstdio>
#include <mutex>
Expand Down Expand Up @@ -607,12 +608,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
uint64_t reg_addr = core_id_to_reg_addr_[core_id];
Handshake *h = &hank[core_id];

uint64_t reg_val = read_reg(reg_addr, RegId::COND);
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
int reg_task_id = EXTRACT_TASK_ID(reg_val);
int reg_state = EXTRACT_TASK_STATE(reg_val);

// Case 1: Pending task finished directly
if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
poll_acquire_barrier();

LOG_INFO(
"Thread %d: Core %d completed task %d (running_id=%d)", thread_idx, core_id,
pending_task_ids_[core_id], running_task_ids_[core_id]
Expand Down Expand Up @@ -691,6 +694,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
}
} else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) { // Case 2: ACK
poll_acquire_barrier();

LOG_INFO(
"Thread %d: Core %d ACKed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id],
running_task_ids_[core_id]
Expand Down Expand Up @@ -722,6 +727,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
// Core can accept new task now (pipeline!)
// Continue to Case 4 to dispatch next task
} else if (reg_task_id == running_task_ids_[core_id] && reg_state == TASK_FIN_STATE) { // Case 3: FIN
poll_acquire_barrier();

LOG_INFO(
"Thread %d: Core %d completed task %d (pending_id=%d)", thread_idx, core_id,
running_task_ids_[core_id], pending_task_ids_[core_id]
Expand Down Expand Up @@ -1152,3 +1159,4 @@ extern "C" int aicpu_execute(Runtime *runtime) {
LOG_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
return 0;
}
// NOLINTEND
Loading
Loading