diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 229c4c58e..6bfc60684 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -215,7 +215,7 @@ jobs: - name: Run on-device examples (a2a3) run: | export PATH="$HOME/.local/bin:$PATH" - source ${ASCEND_HOME_PATH}/bin/setenv.bash && python ci.py -p a2a3 -d ${DEVICE_RANGE} --parallel -c 882c4db -t 600 --clone-protocol https + source ${ASCEND_HOME_PATH}/bin/setenv.bash && python ci.py -p a2a3 -d ${DEVICE_RANGE} -c 882c4db -t 600 --clone-protocol https # ---------- Detect A5 changes (runs on GitHub server, not A5 machine) ---------- @@ -290,4 +290,4 @@ jobs: export PATH="$HOME/.local/bin:$PATH" source ${ASCEND_HOME_PATH}/bin/setenv.bash DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") - task-submit --device "$DEVICE_LIST" --run "python ci.py -p a5 -d ${DEVICE_RANGE} --parallel -c 882c4db -t 600 --clone-protocol https" + task-submit --device "$DEVICE_LIST" --run "python ci.py -p a5 -d ${DEVICE_RANGE} -c 882c4db -t 600 --clone-protocol https" diff --git a/.gitignore b/.gitignore index 6ee298f0f..b9818739f 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ venv/ .claude/settings.json .claude/settings.local.json .claude/worktrees +.claude/plans # Git cloned dependencies (not tracked in repo) examples/scripts/_deps/ diff --git a/ci.py b/ci.py index 9dea2f71a..054701315 100644 --- a/ci.py +++ b/ci.py @@ -14,7 +14,7 @@ per device, reusing ChipWorker across tasks that share the same runtime. 
Usage: - python tools/ci.py -p a2a3 -d 5-8 --parallel -c 6622890 -t 600 + python tools/ci.py -p a2a3 -d 5-8 -c 6622890 -t 600 python tools/ci.py -p a2a3sim -r tensormap_and_ringbuffer -c 6622890 -t 600 """ @@ -601,79 +601,80 @@ def device_worker( # --------------------------------------------------------------------------- -def run_sim_tasks(compiled: list[CompiledTask], parallel: bool = False) -> list[TaskResult]: - """Run simulation tasks with ChipWorker reuse per runtime group.""" - groups = group_by_runtime(compiled) +def _run_sim_task_with_worker(ct: CompiledTask, worker: ChipWorker, max_attempts: int) -> list[TaskResult]: + """Run a single sim task using *worker*, returning per-attempt results.""" results: list[TaskResult] = [] - lock = Lock() + rt_bins = cast(RuntimeBinariesLike, ct.runtime_bins) - def _run_group(runtime_name: str, group_tasks: list[CompiledTask]): - worker = ChipWorker() - rt_bins = cast(RuntimeBinariesLike, group_tasks[0].runtime_bins) + for attempt in range(max_attempts): + start = time.monotonic() try: worker.init(0, str(rt_bins.host_path), rt_bins.aicpu_path.read_bytes(), rt_bins.aicore_path.read_bytes()) + run_single_task(ct, worker, 0) + elapsed = time.monotonic() - start + logger.info(f"[sim] PASS: {ct.spec.name} ({elapsed:.1f}s)") + results.append( + TaskResult( + name=ct.spec.name, + platform=ct.spec.platform, + passed=True, + device="sim", + attempt=attempt, + elapsed_s=elapsed, + ) + ) + break except Exception as e: - logger.error(f"[sim] Failed to init ChipWorker for {runtime_name}: {e}") - with lock: - results.extend( - TaskResult( - name=ct.spec.name, - platform=ct.spec.platform, - passed=False, - device="sim", - attempt=0, - elapsed_s=0, - error=str(e), - ) - for ct in group_tasks + elapsed = time.monotonic() - start + logger.error(f"[sim] FAIL: {ct.spec.name} ({elapsed:.1f}s): {e}") + results.append( + TaskResult( + name=ct.spec.name, + platform=ct.spec.platform, + passed=False, + device="sim", + attempt=attempt, + 
elapsed_s=elapsed, + error=str(e), ) - return - - try: - for ct in group_tasks: - start = time.monotonic() - try: - run_single_task(ct, worker, 0) - elapsed = time.monotonic() - start - logger.info(f"[sim] PASS: {ct.spec.name} ({elapsed:.1f}s)") - r = TaskResult( - name=ct.spec.name, - platform=ct.spec.platform, - passed=True, - device="sim", - attempt=0, - elapsed_s=elapsed, - ) - except Exception as e: - elapsed = time.monotonic() - start - logger.error(f"[sim] FAIL: {ct.spec.name} ({elapsed:.1f}s): {e}") - r = TaskResult( - name=ct.spec.name, - platform=ct.spec.platform, - passed=False, - device="sim", - attempt=0, - elapsed_s=elapsed, - error=str(e), - ) - with lock: - results.append(r) + ) + if attempt + 1 >= max_attempts: + logger.warning(f"[sim] Exhausted retries on {ct.spec.name}") finally: - worker.reset() + if worker.initialized: + worker.reset() + + return results - if parallel: - threads = [Thread(target=_run_group, args=(rt_name, tasks)) for rt_name, tasks in groups.items()] - for t in threads: - t.start() - for t in threads: - t.join() - else: - for rt_name, tasks in groups.items(): - _run_group(rt_name, tasks) +def run_sim_tasks( + compiled: list[CompiledTask], + max_attempts: int = MAX_RETRIES, +) -> list[TaskResult]: + """Run simulation tasks in-process with per-task init/reset isolation.""" + results: list[TaskResult] = [] + worker = ChipWorker() + for ct in compiled: + results.extend(_run_sim_task_with_worker(ct, worker, max_attempts)) return results +def compile_and_run_sim_tasks( + tasks: list[TaskSpec], + args: argparse.Namespace, + pto_isa_root: str, +) -> list[TaskResult]: + """Compile simulation tasks in-process and run them with ChipWorker.reset().""" + logger.info(f"Compiling {len(tasks)} sim task(s)...") + compiled = compile_all_tasks( + tasks, + pto_isa_root, + build_runtime=args.build_runtime, + run_all_cases=args.run_all_cases, + ) + return run_sim_tasks(compiled, max_attempts=args.max_attempts) + + def run_hw_tasks( compiled: 
list[CompiledTask], devices: list[int], @@ -1094,7 +1095,6 @@ def parse_args() -> argparse.Namespace: parser.add_argument("-c", "--pto-isa-commit", default=None) parser.add_argument("-t", "--timeout", type=int, default=600) parser.add_argument("--clone-protocol", choices=["ssh", "https"], default="ssh") - parser.add_argument("--parallel", action="store_true") parser.add_argument("--all", dest="run_all_cases", action="store_true", help="Run all cases, not just DEFAULT_CASE") parser.add_argument("--device-worker", action="store_true", help=argparse.SUPPRESS) parser.add_argument("--max-attempts", type=int, default=MAX_RETRIES, help=argparse.SUPPRESS) @@ -1150,10 +1150,12 @@ def _watchdog_handler(signum, frame): return 0 logger.info(f"Discovered {len(tasks)} tasks") - # Step 2 & 3: Compile and run via subprocess-per-runtime-group - # Each subprocess loads exactly one host .so, avoiding RTLD_GLOBAL symbol conflicts. + # Step 2 & 3: Compile and run. + # Sim: in-process, isolated by per-task ChipWorker init/reset (host SO is RTLD_GLOBAL). + # HW: subprocess per device (-d). 
if is_sim: - all_results = run_hw_tasks_subprocess(tasks, [0], args) + pto_isa_root = ensure_pto_isa(args.pto_isa_commit, args.clone_protocol) + all_results = compile_and_run_sim_tasks(tasks, args, pto_isa_root) else: all_results = run_hw_tasks_subprocess(tasks, args.devices, args) @@ -1169,11 +1171,12 @@ def _watchdog_handler(signum, frame): if failures and args.pto_isa_commit: failed_names = {r.name for r in failures} logger.info(f"[CI] {len(failures)} failure(s), retrying with pinned PTO-ISA {args.pto_isa_commit}") - reset_pto_isa(args.pto_isa_commit, args.clone_protocol) retry_tasks = [task for task in tasks if task.name in failed_names] if is_sim: - retry_results = run_hw_tasks_subprocess(retry_tasks, [0], args) + retry_pto_isa_root = reset_pto_isa(args.pto_isa_commit, args.clone_protocol) + retry_results = compile_and_run_sim_tasks(retry_tasks, args, retry_pto_isa_root) else: + reset_pto_isa(args.pto_isa_commit, args.clone_protocol) retry_results = run_hw_tasks_subprocess(retry_tasks, args.devices, args) all_results.extend(retry_results) diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md new file mode 100644 index 000000000..2305272b1 --- /dev/null +++ b/docs/dynamic-linking.md @@ -0,0 +1,292 @@ +# Dynamic Linking and Thread-Local Storage + +This document describes how shared libraries are loaded, symbols are resolved, +and per-thread state is managed across simulation and onboard platforms. 
+ +## SO Loading Hierarchy + +### Simulation + +```text +Python process (ChipWorker) + | + dlopen(host_runtime.so, RTLD_GLOBAL) ← host SO + | + +-- DeviceRunner::ensure_binaries_loaded() + | | + | +-- dlopen(aicpu_sim_XXXXXX, RTLD_NOW | RTLD_LOCAL) ← AICPU SO (temp file) + | | | + | | +-- dlopen(libdevice_orch_<pid>.so, RTLD_LAZY | RTLD_LOCAL) ← orch SO (temp file) + | | + | +-- dlopen(aicore_sim_XXXXXX, RTLD_NOW | RTLD_LOCAL) ← AICore SO (temp file) + | + +-- DeviceRunner::upload_kernel_binary() + | + +-- dlopen(kernel_<func_id>_XXXXXX, RTLD_NOW | RTLD_LOCAL) ← kernel SOs (temp file, per func_id) +``` + +### Onboard + +```text +Python process (ChipWorker) + | + dlopen(host_runtime.so, RTLD_GLOBAL) ← host SO + | + +-- DeviceRunner (singleton) + | | + | +-- rtMemcpy(aicpu_binary → device HBM) ← NOT dlopen, binary blob upload + | +-- rtRegisterAllKernel(aicore_binary) ← CANN kernel registration + | +-- rtAicpuKernelLaunchExWithArgs(...) ← device-side execution + | + +-- dlopen("libascend_hal.so", RTLD_NOW | RTLD_LOCAL) ← CANN HAL (profiling only) +``` + +Key difference: onboard does **not** dlopen AICPU/AICore as host-side SOs. +They are binary blobs uploaded to device memory and executed by CANN runtime. + +## RTLD Flags and Rationale + +### Host Runtime SO: `RTLD_NOW | RTLD_GLOBAL` + +`RTLD_GLOBAL` is **required**. PTO ISA's TPUSH/TPOP instructions (AIC-AIV +data transfer for mix-type kernels) use `dlsym(RTLD_DEFAULT, ...)` internally +to locate shared storage hooks defined in the host SO: + +```cpp +// PTO ISA: pto/common/cpu_stub.hpp +inline GetSharedStorageHookFn ResolveSharedStorageHook() { + static auto hook = reinterpret_cast<...>( + dlsym(RTLD_DEFAULT, "pto_cpu_sim_get_shared_storage")); + return hook; +} +``` + +With `RTLD_LOCAL`, this symbol is not in the global scope. The hook returns +`nullptr`, and TPUSH/TPOP fall back to a `static` local variable per SO. 
+Since AIC and AIV kernel threads run in different contexts, they get separate +storage instances and deadlock — the producer (TPUSH) writes to one storage, +the consumer (TPOP) waits on another. + +**Cross-runtime isolation** (running different runtime SOs sequentially) relies +on `-fno-gnu-unique` to ensure `dlclose` actually unloads the SO. The next +`dlopen` with `RTLD_GLOBAL` then replaces the global symbol scope with the +new runtime's symbols. + +### Inner SOs: `RTLD_LOCAL` + +All SOs loaded by DeviceRunner (AICPU, AICore, kernel, orchestration) use +`RTLD_LOCAL` to prevent symbol pollution between them. Functions that inner +SOs need from the host SO are passed via explicit function pointer injection +(see "Function Pointer Injection" below). + +### Orchestration SO: `RTLD_LAZY | RTLD_LOCAL` + +Loaded by the AICPU executor at runtime from a temp file. Uses `RTLD_LAZY` +because not all symbols may be referenced. Communicates with the runtime +through a function pointer table (`PTO2RuntimeOps`), not direct symbol +linkage. + +**File path collision**: all runtimes write the orch SO to +`/var/tmp/libdevice_orch_<pid>.so`. Safe in serial execution (each task +dlcloses before the next writes), but would conflict in parallel in-process +execution. + +### CANN HAL: `RTLD_NOW | RTLD_LOCAL` + +`libascend_hal.so` is loaded only for performance profiling (SVM memory +mapping via `halHostRegister`/`halHostUnregister`). The handle is cached +in a file-scope `g_hal_handle` and never explicitly dlclosed. + +## All dlsym(RTLD_DEFAULT) Calls + +| Symbol | File | Used by | How it works | +| ------ | ---- | ------- | ------------ | +| `pto_cpu_sim_set_execution_context` | PTO ISA `cpu_stub.hpp` | Kernel `set_execution_context()` | Sim: injected via `set_sim_context_helpers` (bypasses dlsym) | +| `pto_cpu_sim_get_execution_context` | PTO ISA `cpu_stub.hpp` | Kernel `get_block_idx()` etc. 
| Sim: same injection mechanism | +| `pto_cpu_sim_get_shared_storage` | PTO ISA `cpu_stub.hpp` | TPUSH/TPOP shared state | Requires `RTLD_GLOBAL` on host SO | +| `pto_cpu_sim_get_task_cookie` | PTO ISA `cpu_stub.hpp` | Kernel `get_task_cookie()` | Requires `RTLD_GLOBAL` on host SO | +| `halMemAlloc` / `halMemFree` | Onboard `device_malloc.cpp` | AICPU device memory | Resolved once, cached in statics | +| `halGetDeviceInfoByBuff` | Onboard `host_regs.cpp` | Core validity query | a2a3 only | +| `halMemCtl` | Onboard `host_regs.cpp` | Register address mapping | a2a3 only | +| `halResMap` | Onboard `host_regs.cpp` | Per-core register mapping | a5 only | + +The first two are called from AICore SO code (via `inner_kernel.h` macros). +They were converted from `dlsym(RTLD_DEFAULT)` to function pointer injection +through `set_sim_context_helpers()`, so they work under both `RTLD_GLOBAL` +and `RTLD_LOCAL`. + +The next two (`get_shared_storage`, `get_task_cookie`) are called from PTO ISA +template code instantiated **inside kernel SOs** — not the AICore SO. Function +pointer injection into the AICore SO cannot reach them. They require the host +SO to be loaded with `RTLD_GLOBAL`. + +The HAL symbols are onboard-only. CANN's scheduler process pre-loads +`libascend_hal.so` into the global scope before launching AICPU kernels. 
+ +## Function Pointer Injection + +To avoid `dlsym(RTLD_DEFAULT)` in inner SOs loaded with `RTLD_LOCAL`, +DeviceRunner passes function pointers after dlopen: + +**AICore SO** (`set_sim_context_helpers`): + +```text +DeviceRunner → dlsym(aicore_handle, "set_sim_context_helpers") + → set_helpers(pto_cpu_sim_set_execution_context, + pto_cpu_sim_set_task_cookie, + platform_get_cpu_sim_task_cookie) +``` + +**AICPU SO** (`set_aicpu_sim_context_helpers`): + +```text +DeviceRunner → dlsym(aicpu_handle, "set_aicpu_sim_context_helpers") + → set_helpers(platform_set_cpu_sim_task_cookie) +``` + +These injected function pointers are stored as globals in the respective SOs +and called instead of `dlsym(RTLD_DEFAULT)`. + +## Thread-Local Storage + +### Design Principle + +**No C++ `thread_local` in any SO that gets dlclosed and re-dlopen'd.** +C++ `thread_local` uses ELF TLSDESC on aarch64, which has known issues +with dlclose/re-dlopen cycles in older glibc versions. The sim platform +uses `pthread_key_t` (POSIX TLS) for per-thread state in framework SOs. + +### All TLS Variables + +| Variable | Storage | SO | Purpose | +| -------- | ------- | -- | ------- | +| `g_reg_base_key` | `pthread_key_t` | AICore SO | Per-core simulated register base address | +| `g_core_id_key` | `pthread_key_t` | AICore SO | Per-core physical core ID | +| `g_cpu_sim_context_key` | `pthread_key_t` | Host SO | Per-thread execution context (block_idx, subblock_id, etc.) | +| `s_orch_thread_idx` | `__thread int` | AICPU SO | Profiling thread index (profiling off by default) | +| `execution_context` | `thread_local` | Kernel SO (PTO ISA) | Per-thread execution context (fallback, cached values only) | +| `NPUMemoryModel::instance` | `thread_local` | Kernel SO (PTO ISA) | Per-core memory model simulation | + +### Known Risks + +1. **`s_orch_thread_idx`** uses `__thread` (ELF TLS) in the AICPU SO. 
Could + cause issues on aarch64 glibc <2.39 if the AICPU SO is dlclosed and + re-dlopen'd while profiling is enabled. Currently safe because profiling + is off by default and the variable is only accessed during profiling. + +2. **PTO ISA `thread_local`** variables (`execution_context`, + `NPUMemoryModel::instance`) are in kernel SOs. Kernel SOs are short-lived + (loaded per task, dlclosed after validation), and each kernel thread is + freshly created, so stale TLS is not a concern in practice. + +## `-fno-gnu-unique` + +GCC emits `STB_GNU_UNIQUE` binding for `static` locals in inline/template +functions. glibc marks such SOs as `NODELETE`, making `dlclose` a no-op. +When multiple runtime SOs are loaded sequentially with `RTLD_GLOBAL`, the +first SO's symbols persist and pollute the second. + +Applied to all sim compilation paths: + +- 6 CMakeLists (host/aicpu/aicore for a2a3 and a5): `$<$<CXX_COMPILER_ID:GNU>:-fno-gnu-unique>` +- `toolchain.py` (GxxToolchain, Aarch64GxxToolchain): appended to compile flags + +Additionally, `data_type.h::get_element_size()` uses `constexpr static` +instead of `static` to avoid generating UNIQUE symbols at the source level. + +## AicpuExecutor::deinit() + +The AICPU SO contains a file-scope static `AicpuExecutor g_aicpu_executor`. +When the AICPU SO is dlclosed and re-dlopen'd between tasks, the static is +reconstructed. But when the AICPU SO is **reused** (same runtime, consecutive +tasks), `deinit()` must reset all fields. Previously missing resets: + +- `cores_total_num_`, `thread_num_`, `orch_thread_num_`, `sched_thread_num_` +- `trackers_` / `core_trackers_`, `core_assignments_`, `core_count_per_thread_` +- `orch_func_`, `orch_args_cached_`, `orch_so_handle_`, `orch_so_path_` + +Applies to all 5 runtime executors: a2a3 (abg, hbg, tmr), a5 (hbg, tmr). 
+ +## SO Handle Caching and Reuse + +### Simulation + +| SO | Caching | Lifecycle | +| -- | ------- | --------- | +| Host runtime | `ChipWorker::lib_handle_` | Per-task: dlopen in `init()`, dlclose in `reset()` | +| AICPU | `DeviceRunner::aicpu_so_handle_` | Per-run: loaded in `ensure_binaries_loaded()`, closed in `unload_executor_binaries()` at end of `run()` | +| AICore | `DeviceRunner::aicore_so_handle_` | Same as AICPU | +| Kernel | `DeviceRunner::func_id_to_addr_` (map by func_id) | Per-task: uploaded in `init_runtime_impl()`, removed in `validate_runtime_impl()` | +| Orchestration | `AicpuExecutor::orch_so_handle_` | Per-run: loaded by orchestrator thread, closed by last thread in `deinit()` | + +### Onboard + +| Resource | Caching | Lifecycle | +| -------- | ------- | --------- | +| Host runtime | `ChipWorker::lib_handle_` | Per-runtime-group: shared across tasks in same group | +| AICPU binary | `AicpuSoInfo` in DeviceRunner | Per-runtime-group: uploaded to device HBM once, reused | +| AICore binary | `rtRegisterAllKernel` handle | Per-run: registered each `launch_aicore_kernel()` call | +| Kernel binaries | `func_id_to_addr_` (device GM addresses) | Per-task: uploaded to device GM, cached by func_id | +| CANN HAL | `g_hal_handle` (file-scope static) | Process lifetime: loaded once for profiling, never closed | + +### Key difference + +Onboard caches more aggressively — the DeviceRunner singleton persists +across tasks and the AICPU binary stays in device memory. Simulation +re-loads AICPU/AICore SOs every `run()` call because the SO's internal +static state (`g_aicpu_executor`) must be fresh for each task when +different tasks have different configurations. 
+ +## Execution Lifecycle + +### Simulation (in-process, per-task init/reset) + +```text +ChipWorker.init(host_path, aicpu_bytes, aicore_bytes) + dlopen(host_runtime.so, RTLD_GLOBAL) + dlsym: set_device, get_runtime_size, run_runtime, finalize_device + set_device(device_id) + +ChipWorker.run(callable, args, config) + run_runtime(buf, callable, args, ...) + new (buf) Runtime() + init_runtime_impl(r, callable, args) build graph, upload kernels + DeviceRunner::run(r, ...) + clear_cpu_sim_shared_storage() + ensure_binaries_loaded() dlopen aicpu/aicore SOs + launch AICPU + AICore threads + join all threads + unload_executor_binaries() dlclose aicpu/aicore SOs + validate_runtime_impl(r) copy results, remove kernels + r->~Runtime() + +ChipWorker.reset() + finalize_device() + dlclose(host_runtime.so) -fno-gnu-unique ensures real unload +``` + +### Onboard (subprocess per device, ChipWorker reused per runtime group) + +```text +device_worker_main(device_id) + for each runtime_group: + ChipWorker.init(host_path, aicpu_bytes, aicore_bytes) + dlopen(host_runtime.so, RTLD_GLOBAL) + set_device(device_id) rtSetDevice() + + for each task in group: + ChipWorker.run(callable, args, config) + run_runtime(buf, callable, args, ...) 
+ new (buf) Runtime() + init_runtime_impl() rtMalloc, rtMemcpy to device + DeviceRunner::run() + ensure_binaries_loaded() rtMemcpy AICPU SO to HBM (once) + rtAicpuKernelLaunchExWithArgs() launch on device + rtStreamSynchronize() wait for completion + launch_aicore_kernel() rtRegisterAllKernel + rtKernelLaunch + validate_runtime_impl() rtMemcpy results back to host + + ChipWorker.reset() + finalize_device() rtDeviceReset() + dlclose(host_runtime.so) +``` diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index e40586622..33a6f0ee6 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -45,7 +45,12 @@ void ChipWorker::init( throw std::runtime_error("ChipWorker already initialized; call reset() first"); } - // Load the host runtime shared library + // RTLD_GLOBAL is required: PTO ISA's TPUSH/TPOP (AIC-AIV sync) use + // dlsym(RTLD_DEFAULT, "pto_cpu_sim_get_shared_storage") to find the + // host SO's shared storage hook. Cross-runtime isolation relies on + // -fno-gnu-unique (#453) allowing dlclose to actually unload the + // previous runtime's SO before loading the next one. + dlerror(); void *handle = dlopen(host_lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (!handle) { std::string err = "dlopen failed: ";