Skip to content

Commit c79b5a4

Browse files
Perf: replace read_reg with poll_reg in COND polling loops
Add poll_reg() — a barrier-free volatile read — for use in hot spin-wait loops that poll the AICore COND register. Add poll_acquire_barrier() (dmb ish on ARM64, compiler barrier on x86_64) inserted once on the cold path when the awaited condition is detected. - platform (a2a3, a5): add poll_reg() declaration and implementation; add poll_acquire_barrier() macro to memory_barrier.h - runtimes (host_build_graph, aicpu_build_graph, tensormap_and_ringbuffer on both a2a3 and a5): replace read_reg() → poll_reg() for the COND register reads inside the polling loop; insert poll_acquire_barrier() at each completion branch before accessing Normal memory The barrier cost is now O(1) per task completion instead of O(iterations), eliminating dmb overhead on every iteration of the "not-yet-done" hot path.
1 parent 13bf816 commit c79b5a4

12 files changed

Lines changed: 206 additions & 64 deletions

File tree

src/a2a3/platform/include/aicpu/platform_regs.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,30 @@ uint64_t get_platform_regs();
6363
*/
6464
uint64_t read_reg(uint64_t reg_base_addr, RegId reg);
6565

66+
/**
67+
* Poll a register value without memory barriers (for hot polling loops)
68+
*
69+
* Unlike read_reg(), this function performs a bare volatile read with no
70+
* memory barriers. This is safe for polling loops where the "not-yet-done"
71+
* fast path has no Normal-memory data dependency on the register value.
72+
*
73+
* Callers MUST insert an explicit memory barrier (e.g. poll_acquire_barrier())
74+
* after detecting the awaited condition, before accessing Normal memory that
75+
* depends on the polled result.
76+
*
77+
* On real hardware: MMIO is Device memory; volatile alone prevents caching
78+
* and compiler reordering. No hardware barrier needed for visibility.
79+
*
80+
* On simulation: registers are Normal memory; volatile prevents compiler
81+
* reordering. Cache coherence ensures cross-thread visibility within
82+
* a bounded number of iterations.
83+
*
84+
* @param reg_base_addr Base address of the AICore's register block
85+
* @param reg Register identifier (C++ enum class)
86+
* @return Register value (zero-extended to uint64_t)
87+
*/
88+
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg);
89+
6690
/**
6791
* Write a value to an AICore's register
6892
*

src/a2a3/platform/include/common/memory_barrier.h

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,28 +36,57 @@
3636
* Read memory barrier (ARM64)
3737
* Ensures all loads before this point complete before any loads after.
3838
*/
39-
#define rmb() __asm__ __volatile__("dsb ld" ::: "memory")
39+
# define rmb() __asm__ __volatile__("dsb ld" ::: "memory")
4040

4141
/**
4242
* Write memory barrier (ARM64)
4343
* Ensures all stores before this point complete before any stores after.
4444
*/
45-
#define wmb() __asm__ __volatile__("dsb st" ::: "memory")
45+
# define wmb() __asm__ __volatile__("dsb st" ::: "memory")
4646

4747
/**
4848
* Store-store barrier (ARM64, inner shareable domain)
4949
* Ensures all stores before this barrier are globally visible before any
5050
* stores after.
5151
*/
52-
#define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("dmb ishst" ::: "memory")
52+
# define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("dmb ishst" ::: "memory")
5353
#else
5454
/**
5555
* Compiler barrier (fallback for non-ARM64 platforms)
5656
* Prevents compiler reordering but does not emit hardware barriers.
5757
*/
58-
#define rmb() __asm__ __volatile__("" ::: "memory")
59-
#define wmb() __asm__ __volatile__("" ::: "memory")
60-
#define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("" ::: "memory")
58+
# define rmb() __asm__ __volatile__("" ::: "memory")
59+
# define wmb() __asm__ __volatile__("" ::: "memory")
60+
# define OUT_OF_ORDER_STORE_BARRIER() __asm__ __volatile__("" ::: "memory")
61+
#endif
62+
63+
// =============================================================================
64+
// Polling Acquire Barrier
65+
// =============================================================================
66+
67+
/**
68+
* Polling acquire barrier
69+
*
70+
* Use after poll_reg() detects the awaited condition (e.g., task completion),
71+
* before accessing Normal memory whose correctness depends on the polled value.
72+
*
73+
* ARM64: dmb ish (data memory barrier, inner shareable, full)
74+
* Ensures the Device-memory register read is ordered before all subsequent
75+
* Normal-memory loads and stores in the completion path.
76+
* Chosen over dmb ishld (load-only) for safety margin: negligible cost
77+
* (executed once per completion, not per poll iteration) and protects
78+
* against future stores that may be added to the completion path.
79+
*
80+
* x86_64: compiler barrier only (TSO provides implicit acquire on all loads)
81+
*
82+
* Other: full barrier fallback (__sync_synchronize)
83+
*/
84+
#if defined(__aarch64__)
85+
# define poll_acquire_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
86+
#elif defined(__x86_64__)
87+
# define poll_acquire_barrier() __asm__ __volatile__("" ::: "memory")
88+
#else
89+
# define poll_acquire_barrier() __sync_synchronize()
6190
#endif
6291

6392
#endif // SRC_A2A3_PLATFORM_INCLUDE_COMMON_MEMORY_BARRIER_H_

src/a2a3/platform/src/aicpu/platform_regs.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,25 @@ void set_platform_regs(uint64_t regs) { g_platform_regs = regs; }
3636

3737
uint64_t get_platform_regs() { return g_platform_regs; }
3838

39-
uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
39+
uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
4040
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg));
4141

4242
__sync_synchronize();
4343

4444
// Read the register value
45-
uint64_t value = static_cast<uint64_t>(*ptr);
45+
uint64_t value = static_cast<uint64_t>(*ptr); // NOLINT(modernize-use-auto)
4646

4747
__sync_synchronize();
4848

4949
return value;
5050
}
5151

52-
void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
52+
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg) { // NOLINT(bugprone-easily-swappable-parameters)
53+
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg));
54+
return static_cast<uint64_t>(*ptr);
55+
}
56+
57+
void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { // NOLINT(bugprone-easily-swappable-parameters)
5358
volatile uint32_t *ptr = reinterpret_cast<volatile uint32_t *>(reg_base_addr + reg_offset(reg));
5459

5560
__sync_synchronize();

src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* -----------------------------------------------------------------------------------------------------------
1010
*/
1111
#include <dlfcn.h>
12+
// NOLINTBEGIN
1213
#include <fcntl.h>
1314
#include <unistd.h>
1415

@@ -20,10 +21,10 @@
2021
#include <cstdlib>
2122
#include <cstring>
2223
#ifdef __linux__
23-
#include <sys/mman.h>
24+
# include <sys/mman.h>
2425
#endif
2526

26-
#include "aicpu/device_log.h"
27+
#include "aicpu/device_log.h" // NOLINT(clang-diagnostic-error)
2728
#include "aicpu/device_time.h"
2829
#include "pto2_dispatch_payload.h"
2930
#include "runtime.h"
@@ -52,16 +53,16 @@
5253

5354
#if PTO2_PROFILING
5455
// Accumulated nanoseconds per sub-step
55-
#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
56-
#define CYCLE_COUNT_LAP(acc) \
57-
do { \
58-
_t1 = get_sys_cnt_aicpu(); \
59-
acc += (_t1 - _t0); \
60-
_t0 = _t1; \
61-
} while (0)
56+
# define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
57+
# define CYCLE_COUNT_LAP(acc) \
58+
do { \
59+
_t1 = get_sys_cnt_aicpu(); \
60+
acc += (_t1 - _t0); \
61+
_t0 = _t1; \
62+
} while (0)
6263
#else
63-
#define CYCLE_COUNT_START()
64-
#define CYCLE_COUNT_LAP(acc)
64+
# define CYCLE_COUNT_START()
65+
# define CYCLE_COUNT_LAP(acc)
6566
#endif
6667

6768
// Device orchestration function signature (loaded via dlopen).
@@ -310,7 +311,7 @@ struct AicpuExecutor {
310311
uint64_t reg_addr = core_id_to_reg_addr_[core_id];
311312

312313
int32_t expected_reg_task_id = executing_reg_task_ids_[core_id];
313-
uint64_t reg_val = read_reg(reg_addr, RegId::COND);
314+
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
314315
int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
315316
int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
316317
bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE;
@@ -324,6 +325,7 @@ struct AicpuExecutor {
324325
#endif
325326

326327
if (done) {
328+
poll_acquire_barrier();
327329
executing_reg_task_ids_[core_id] = AICPU_TASK_INVALID;
328330
PTO2SubtaskSlot subslot = executing_subslot_by_core_[core_id];
329331
PTO2TaskSlotState &slot_state = *executing_slot_state_by_core_[core_id];
@@ -2252,3 +2254,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
22522254
DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
22532255
return 0;
22542256
}
2257+
// NOLINTEND

src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@
99
* -----------------------------------------------------------------------------------------------------------
1010
*/
1111

12+
// NOLINTBEGIN
1213
#include <atomic>
1314
#include <cstdint>
1415
#include <cstdio>
1516
#include <mutex>
1617

17-
#include "aicpu/device_log.h"
18+
#include "aicpu/device_log.h" // NOLINT(clang-diagnostic-error)
1819
#include "aicpu/device_time.h"
1920
#include "aicpu/performance_collector_aicpu.h"
2021
#include "aicpu/platform_regs.h"
@@ -608,12 +609,14 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
608609
uint64_t reg_addr = core_id_to_reg_addr_[core_id];
609610
Handshake *h = &hank[core_id];
610611

611-
uint64_t reg_val = read_reg(reg_addr, RegId::COND);
612+
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
612613
int reg_task_id = EXTRACT_TASK_ID(reg_val);
613614
int reg_state = EXTRACT_TASK_STATE(reg_val);
614615

615616
// Case 1: Pending task finished directly
616617
if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
618+
poll_acquire_barrier();
619+
617620
LOG_INFO(
618621
"Thread %d: Core %d completed task %d (running_id=%d)", thread_idx, core_id,
619622
pending_task_ids_[core_id], running_task_ids_[core_id]
@@ -716,6 +719,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
716719
}
717720
} else if (reg_task_id == pending_task_ids_[core_id] && reg_state == TASK_ACK_STATE) {
718721
// Case 2: Pending task received ACK
722+
poll_acquire_barrier();
723+
719724
LOG_INFO(
720725
"Thread %d: Core %d ACKed task %d (running_id=%d)", thread_idx, core_id, pending_task_ids_[core_id],
721726
running_task_ids_[core_id]
@@ -770,6 +775,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const
770775
// Continue to Case 4 to dispatch next task
771776
} else if (reg_task_id == running_task_ids_[core_id] && reg_state == TASK_FIN_STATE) {
772777
// Case 3: Running task finished
778+
poll_acquire_barrier();
779+
773780
LOG_INFO(
774781
"Thread %d: Core %d completed task %d (pending_id=%d)", thread_idx, core_id,
775782
running_task_ids_[core_id], pending_task_ids_[core_id]
@@ -1200,3 +1207,4 @@ extern "C" int aicpu_execute(Runtime *runtime) {
12001207
LOG_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
12011208
return 0;
12021209
}
1210+
// NOLINTEND

src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* -----------------------------------------------------------------------------------------------------------
1010
*/
1111
#include <dlfcn.h>
12+
// NOLINTBEGIN
1213
#include <fcntl.h>
1314
#include <unistd.h>
1415

@@ -20,7 +21,7 @@
2021
#include <cstdlib>
2122
#include <cstring>
2223
#ifdef __linux__
23-
#include <sys/mman.h>
24+
# include <sys/mman.h>
2425
#endif
2526

2627
#include "aicpu/device_log.h"
@@ -52,16 +53,16 @@
5253

5354
#if PTO2_PROFILING
5455
// Accumulated nanoseconds per sub-step
55-
#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
56-
#define CYCLE_COUNT_LAP(acc) \
57-
do { \
58-
_t1 = get_sys_cnt_aicpu(); \
59-
acc += (_t1 - _t0); \
60-
_t0 = _t1; \
61-
} while (0)
56+
# define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
57+
# define CYCLE_COUNT_LAP(acc) \
58+
do { \
59+
_t1 = get_sys_cnt_aicpu(); \
60+
acc += (_t1 - _t0); \
61+
_t0 = _t1; \
62+
} while (0)
6263
#else
63-
#define CYCLE_COUNT_START()
64-
#define CYCLE_COUNT_LAP(acc)
64+
# define CYCLE_COUNT_START()
65+
# define CYCLE_COUNT_LAP(acc)
6566
#endif
6667

6768
// Device orchestration function signature (loaded via dlopen).
@@ -374,7 +375,7 @@ struct AicpuExecutor {
374375
uint64_t reg_addr = core_exec_state.reg_addr;
375376

376377
int32_t expected_reg_task_id = core_exec_state.executing_reg_task_id;
377-
uint64_t reg_val = read_reg(reg_addr, RegId::COND);
378+
uint64_t reg_val = poll_reg(reg_addr, RegId::COND);
378379
int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
379380
int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
380381
bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE;
@@ -388,6 +389,7 @@ struct AicpuExecutor {
388389
#endif
389390

390391
if (done) {
392+
poll_acquire_barrier();
391393
core_exec_state.executing_reg_task_id = AICPU_TASK_INVALID;
392394
PTO2TaskSlotState &slot_state = *core_exec_state.executing_slot_state;
393395

@@ -2372,3 +2374,4 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
23722374
DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
23732375
return 0;
23742376
}
2377+
// NOLINTEND

src/a5/platform/include/aicpu/platform_regs.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,30 @@ uint64_t get_platform_regs();
6666
*/
6767
uint64_t read_reg(uint64_t reg_base_addr, RegId reg);
6868

69+
/**
70+
* Poll a register value without memory barriers (for hot polling loops)
71+
*
72+
* Unlike read_reg(), this function performs a bare volatile read with no
73+
* memory barriers. This is safe for polling loops where the "not-yet-done"
74+
* fast path has no Normal-memory data dependency on the register value.
75+
*
76+
* Callers MUST insert an explicit memory barrier (e.g. poll_acquire_barrier())
77+
* after detecting the awaited condition, before accessing Normal memory that
78+
* depends on the polled result.
79+
*
80+
* On real hardware: MMIO is Device memory; volatile alone prevents caching
81+
* and compiler reordering. No hardware barrier needed for visibility.
82+
*
83+
* On simulation: registers are Normal memory; volatile prevents compiler
84+
* reordering. Cache coherence ensures cross-thread visibility within
85+
* a bounded number of iterations.
86+
*
87+
* @param reg_base_addr Base address of the AICore's register block
88+
* @param reg Register identifier (C++ enum class)
89+
* @return Register value (zero-extended to uint64_t)
90+
*/
91+
uint64_t poll_reg(uint64_t reg_base_addr, RegId reg);
92+
6993
/**
7094
* Write a value to an AICore's register
7195
*

0 commit comments

Comments (0)