diff --git a/src/a2a3/platform/include/aicpu/platform_regs.h b/src/a2a3/platform/include/aicpu/platform_regs.h index c0fefe0e0..b2209dee7 100644 --- a/src/a2a3/platform/include/aicpu/platform_regs.h +++ b/src/a2a3/platform/include/aicpu/platform_regs.h @@ -63,6 +63,30 @@ uint64_t get_platform_regs(); */ uint64_t read_reg(uint64_t reg_base_addr, RegId reg); +/** + * Poll a register value without memory barriers (for hot polling loops) + * + * Unlike read_reg(), this function performs a bare volatile read with no + * memory barriers. This is safe for polling loops where the "not-yet-done" + * fast path has no Normal-memory data dependency on the register value. + * + * Callers MUST insert an explicit memory barrier (e.g. poll_acquire_barrier()) + * after detecting the awaited condition, before accessing Normal memory that + * depends on the polled result. + * + * On real hardware: MMIO is Device memory; volatile alone prevents caching + * and compiler reordering. No hardware barrier needed for visibility. + * + * On simulation: registers are Normal memory; volatile prevents compiler + * reordering. Cache coherence ensures cross-thread visibility within + * a bounded number of iterations. 
+ * + * @param reg_base_addr Base address of the AICore's register block + * @param reg Register identifier (C++ enum class) + * @return Register value (zero-extended to uint64_t) + */ +uint64_t poll_reg(uint64_t reg_base_addr, RegId reg); + /** * Write a value to an AICore's register * diff --git a/src/a2a3/platform/include/common/memory_barrier.h b/src/a2a3/platform/include/common/memory_barrier.h index 404dff12c..8edf152f4 100644 --- a/src/a2a3/platform/include/common/memory_barrier.h +++ b/src/a2a3/platform/include/common/memory_barrier.h @@ -60,4 +60,33 @@ #define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("" ::: "memory") #endif +// ============================================================================= +// Polling Acquire Barrier +// ============================================================================= + +/** + * Polling acquire barrier + * + * Use after poll_reg() detects the awaited condition (e.g., task completion), + * before accessing Normal memory whose correctness depends on the polled value. + * + * ARM64: dmb ish (data memory barrier, inner shareable, full) + * Ensures the Device-memory register read is ordered before all subsequent + * Normal-memory loads and stores in the completion path. + * Chosen over dmb ishld (load-only) for safety margin: negligible cost + * (executed once per completion, not per poll iteration) and protects + * against future stores that may be added to the completion path. 
+ * + * x86_64: compiler barrier only (TSO provides implicit acquire on all loads) + * + * Other: full barrier fallback (__sync_synchronize) + */ +#if defined(__aarch64__) +#define poll_acquire_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif defined(__x86_64__) +#define poll_acquire_barrier() __asm__ __volatile__("" ::: "memory") +#else +#define poll_acquire_barrier() __sync_synchronize() +#endif + #endif // SRC_A2A3_PLATFORM_INCLUDE_COMMON_MEMORY_BARRIER_H_ diff --git a/src/a2a3/platform/src/aicpu/platform_regs.cpp b/src/a2a3/platform/src/aicpu/platform_regs.cpp index 8ee4d11a7..a9d7b7ac8 100644 --- a/src/a2a3/platform/src/aicpu/platform_regs.cpp +++ b/src/a2a3/platform/src/aicpu/platform_regs.cpp @@ -36,20 +36,25 @@ void set_platform_regs(uint64_t regs) { g_platform_regs = regs; } uint64_t get_platform_regs() { return g_platform_regs; } -uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { +uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters) volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg)); __sync_synchronize(); // Read the register value - uint64_t value = static_cast<uint64_t>(*ptr); + uint64_t value = static_cast<uint64_t>(*ptr); // NOLINT(modernize-use-auto) __sync_synchronize(); return value; } -void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { +uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters) + volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg)); + return static_cast<uint64_t>(*ptr); +} + +void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters) volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg)); __sync_synchronize(); diff --git a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp index b5335740d..5fdaaa3a8 100644 --- 
a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp @@ -9,6 +9,7 @@ * ----------------------------------------------------------------------------------------------------------- */ #include +// NOLINTBEGIN #include #include @@ -23,7 +24,7 @@ #include #endif -#include "aicpu/device_log.h" +#include "aicpu/device_log.h" // NOLINT(clang-diagnostic-error) #include "aicpu/device_time.h" #include "pto2_dispatch_payload.h" #include "runtime.h" @@ -310,7 +311,7 @@ struct AicpuExecutor { uint64_t reg_addr = core_id_to_reg_addr_[core_id]; int32_t expected_reg_task_id = executing_reg_task_ids_[core_id]; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); + uint64_t reg_val = poll_reg(reg_addr, RegId::COND); int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); int32_t reg_state = EXTRACT_TASK_STATE(reg_val); bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE; @@ -324,6 +325,7 @@ struct AicpuExecutor { #endif if (done) { + poll_acquire_barrier(); executing_reg_task_ids_[core_id] = AICPU_TASK_INVALID; PTO2SubtaskSlot subslot = executing_subslot_by_core_[core_id]; PTO2TaskSlotState &slot_state = *executing_slot_state_by_core_[core_id]; @@ -2252,3 +2254,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; } +// NOLINTEND diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 36ac282ec..e0eccea59 100644 --- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -9,12 +9,13 @@ * ----------------------------------------------------------------------------------------------------------- */ +// NOLINTBEGIN #include #include #include #include -#include "aicpu/device_log.h" +#include "aicpu/device_log.h" // NOLINT(clang-diagnostic-error) #include 
"aicpu/device_time.h" #include "aicpu/performance_collector_aicpu.h" #include "aicpu/platform_regs.h" @@ -608,12 +609,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const uint64_t reg_addr = core_id_to_reg_addr_[core_id]; Handshake *h = &hank[core_id]; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); + uint64_t reg_val = poll_reg(reg_addr, RegId::COND); int reg_task_id = EXTRACT_TASK_ID(reg_val); int reg_state = EXTRACT_TASK_STATE(reg_val); // Case 1: Pending task finished directly if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_FIN_STATE) { + poll_acquire_barrier(); + LOG_INFO( "Thread %d: Core %d completed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id], running_task_ids_[core_id] @@ -716,6 +719,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const } } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) { // Case 2: Pending task received ACK + poll_acquire_barrier(); + LOG_INFO( "Thread %d: Core %d ACKed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id], running_task_ids_[core_id] @@ -770,6 +775,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const // Continue to Case 4 to dispatch next task } else if (reg_task_id == running_task_ids_[core_id] && reg_state == TASK_FIN_STATE) { // Case 3: Running task finished + poll_acquire_barrier(); + LOG_INFO( "Thread %d: Core %d completed task %d (pending_id=%d)", thread_idx, core_id, running_task_ids_[core_id], pending_task_ids_[core_id] @@ -1200,3 +1207,4 @@ extern "C" int aicpu_execute(Runtime *runtime) { LOG_INFO("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; } +// NOLINTEND diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index a1a00b856..fcb149325 100644 --- 
a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -9,6 +9,7 @@ * ----------------------------------------------------------------------------------------------------------- */ #include +// NOLINTBEGIN #include #include @@ -374,7 +375,7 @@ struct AicpuExecutor { uint64_t reg_addr = core_exec_state.reg_addr; int32_t expected_reg_task_id = core_exec_state.executing_reg_task_id; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); + uint64_t reg_val = poll_reg(reg_addr, RegId::COND); int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); int32_t reg_state = EXTRACT_TASK_STATE(reg_val); bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE; @@ -388,6 +389,7 @@ struct AicpuExecutor { #endif if (done) { + poll_acquire_barrier(); core_exec_state.executing_reg_task_id = AICPU_TASK_INVALID; PTO2TaskSlotState &slot_state = *core_exec_state.executing_slot_state; @@ -2372,3 +2374,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; } +// NOLINTEND diff --git a/src/a5/platform/include/aicpu/platform_regs.h b/src/a5/platform/include/aicpu/platform_regs.h index 52a9e3c8a..ca771a1ec 100644 --- a/src/a5/platform/include/aicpu/platform_regs.h +++ b/src/a5/platform/include/aicpu/platform_regs.h @@ -66,6 +66,30 @@ uint64_t get_platform_regs(); */ uint64_t read_reg(uint64_t reg_base_addr, RegId reg); +/** + * Poll a register value without memory barriers (for hot polling loops) + * + * Unlike read_reg(), this function performs a bare volatile read with no + * memory barriers. This is safe for polling loops where the "not-yet-done" + * fast path has no Normal-memory data dependency on the register value. + * + * Callers MUST insert an explicit memory barrier (e.g. 
poll_acquire_barrier()) + * after detecting the awaited condition, before accessing Normal memory that + * depends on the polled result. + * + * On real hardware: MMIO is Device memory; volatile alone prevents caching + * and compiler reordering. No hardware barrier needed for visibility. + * + * On simulation: registers are Normal memory; volatile prevents compiler + * reordering. Cache coherence ensures cross-thread visibility within + * a bounded number of iterations. + * + * @param reg_base_addr Base address of the AICore's register block + * @param reg Register identifier (C++ enum class) + * @return Register value (zero-extended to uint64_t) + */ +uint64_t poll_reg(uint64_t reg_base_addr, RegId reg); + /** * Write a value to an AICore's register * diff --git a/src/a5/platform/include/common/memory_barrier.h b/src/a5/platform/include/common/memory_barrier.h index c0771b058..5ce53bb6c 100644 --- a/src/a5/platform/include/common/memory_barrier.h +++ b/src/a5/platform/include/common/memory_barrier.h @@ -60,4 +60,33 @@ #define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("" ::: "memory") #endif +// ============================================================================= +// Polling Acquire Barrier +// ============================================================================= + +/** + * Polling acquire barrier + * + * Use after poll_reg() detects the awaited condition (e.g., task completion), + * before accessing Normal memory whose correctness depends on the polled value. + * + * ARM64: dmb ish (data memory barrier, inner shareable, full) + * Ensures the Device-memory register read is ordered before all subsequent + * Normal-memory loads and stores in the completion path. + * Chosen over dmb ishld (load-only) for safety margin: negligible cost + * (executed once per completion, not per poll iteration) and protects + * against future stores that may be added to the completion path. 
+ * + * x86_64: compiler barrier only (TSO provides implicit acquire on all loads) + * + * Other: full barrier fallback (__sync_synchronize) + */ +#if defined(__aarch64__) +#define poll_acquire_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif defined(__x86_64__) +#define poll_acquire_barrier() __asm__ __volatile__("" ::: "memory") +#else +#define poll_acquire_barrier() __sync_synchronize() +#endif + #endif // SRC_A5_PLATFORM_INCLUDE_COMMON_MEMORY_BARRIER_H_ diff --git a/src/a5/platform/onboard/aicpu/inner_platform_regs.cpp b/src/a5/platform/onboard/aicpu/inner_platform_regs.cpp index 5aef5d99f..501998132 100644 --- a/src/a5/platform/onboard/aicpu/inner_platform_regs.cpp +++ b/src/a5/platform/onboard/aicpu/inner_platform_regs.cpp @@ -17,11 +17,13 @@ * virtual address with no remapping. */ +// NOLINTBEGIN(clang-diagnostic-error) #include #include "aicpu/platform_regs.h" #include "common/platform_config.h" +// NOLINTEND(clang-diagnostic-error) -uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { +uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters) uint32_t offset = reg_offset(reg); volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + offset); @@ -32,7 +34,12 @@ uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { return value; } -void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { +uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters) + volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg)); + return static_cast<uint64_t>(*ptr); +} + +void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters) uint32_t offset = reg_offset(reg); volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + offset); diff --git a/src/a5/platform/sim/aicpu/inner_platform_regs.cpp b/src/a5/platform/sim/aicpu/inner_platform_regs.cpp index 7bb3b310d..b583459e6 100644 --- 
a/src/a5/platform/sim/aicpu/inner_platform_regs.cpp +++ b/src/a5/platform/sim/aicpu/inner_platform_regs.cpp @@ -22,19 +22,26 @@ #include "aicpu/platform_regs.h" #include "common/platform_config.h" -uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { +uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters) uint32_t offset = reg_offset(reg); volatile uint8_t *reg_base = reinterpret_cast<volatile uint8_t *>(reg_base_addr); volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(sparse_reg_ptr(reg_base, offset)); __sync_synchronize(); - uint64_t value = static_cast<uint64_t>(*ptr); + uint64_t value = static_cast<uint64_t>(*ptr); // NOLINT(modernize-use-auto) __sync_synchronize(); return value; } -void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { +uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters) + uint32_t offset = reg_offset(reg); + volatile uint8_t *reg_base = reinterpret_cast<volatile uint8_t *>(reg_base_addr); + volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(sparse_reg_ptr(reg_base, offset)); + return static_cast<uint64_t>(*ptr); +} + +void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters) uint32_t offset = reg_offset(reg); volatile uint8_t *reg_base = reinterpret_cast<volatile uint8_t *>(reg_base_addr); volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(sparse_reg_ptr(reg_base, offset)); diff --git a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 015f76a46..ee83beba8 100644 --- a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -9,6 +9,7 @@ * ----------------------------------------------------------------------------------------------------------- */ #include +// NOLINTBEGIN #include #include #include @@ -607,12 +608,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const uint64_t reg_addr = core_id_to_reg_addr_[core_id]; 
Handshake *h = &hank[core_id]; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); + uint64_t reg_val = poll_reg(reg_addr, RegId::COND); int reg_task_id = EXTRACT_TASK_ID(reg_val); int reg_state = EXTRACT_TASK_STATE(reg_val); // Case 1: Pending task finished directly if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_FIN_STATE) { + poll_acquire_barrier(); + LOG_INFO( "Thread %d: Core %d completed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id], running_task_ids_[core_id] @@ -691,6 +694,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } } else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) { // Case 2: ACK + poll_acquire_barrier(); + LOG_INFO( "Thread %d: Core %d ACKed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id], running_task_ids_[core_id] @@ -722,6 +727,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const // Core can accept new task now (pipeline!) 
// Continue to Case 4 to dispatch next task } else if (reg_task_id == running_task_ids_[core_id] && reg_state == TASK_FIN_STATE) { // Case 3: FIN + poll_acquire_barrier(); + LOG_INFO( "Thread %d: Core %d completed task %d (pending_id=%d)", thread_idx, core_id, running_task_ids_[core_id], pending_task_ids_[core_id] @@ -1152,3 +1159,4 @@ extern "C" int aicpu_execute(Runtime *runtime) { LOG_INFO("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; } +// NOLINTEND diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 9f8ea2d3e..6bb510e62 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -9,6 +9,7 @@ * ----------------------------------------------------------------------------------------------------------- */ #include +// NOLINTBEGIN #include #include @@ -374,7 +375,7 @@ struct AicpuExecutor { uint64_t reg_addr = core_exec_state.reg_addr; int32_t expected_reg_task_id = core_exec_state.executing_reg_task_id; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); + uint64_t reg_val = poll_reg(reg_addr, RegId::COND); int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); int32_t reg_state = EXTRACT_TASK_STATE(reg_val); bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE; @@ -388,6 +389,7 @@ struct AicpuExecutor { #endif if (done) { + poll_acquire_barrier(); core_exec_state.executing_reg_task_id = AICPU_TASK_INVALID; PTO2SubtaskSlot subslot = core_exec_state.executing_subslot; PTO2TaskSlotState &slot_state = *core_exec_state.executing_slot_state; @@ -2314,3 +2316,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; } +// NOLINTEND