diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 4b8daa3d5..faddfe3cd 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -836,10 +836,6 @@ def _compile_one_kernel(kernel): logger.debug(f"Tensor order: {list(tensors.keys())}") logger.debug(f"orch_args count: {len(orch_args)}") - # Create and initialize runtime (including kernel registration) - logger.info("=== Initializing Runtime ===") - runtime = Runtime() - # Build environment for runtime initialization run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir) if run_env: @@ -856,27 +852,32 @@ def _compile_one_kernel(kernel): initial_outputs = {k: v.clone() for k, v in outputs.items()} + runtime = Runtime() + + with _temporary_env(run_env): + runtime.initialize( + orch_so_binary, + self.orchestration["function_name"], + orch_args, + kernel_binaries=kernel_binaries, + ) + for round_idx in range(self.repeat_rounds): if self.repeat_rounds > 1: logger.info(f"--- Round {round_idx + 1}/{self.repeat_rounds} ---") - for k, v in initial_outputs.items(): - outputs[k].copy_(v) - - runtime = Runtime() + t_round_start = time.perf_counter() - # Enable profiling if requested (only first round) if self.enable_profiling and round_idx == 0: runtime.enable_profiling(True) logger.info("Profiling enabled") - with _temporary_env(run_env): - runtime.initialize( - orch_so_binary, - self.orchestration["function_name"], - orch_args, - kernel_binaries=kernel_binaries, - ) + for k, v in initial_outputs.items(): + outputs[k].copy_(v) + + runtime.initialize_round( + orch_args, + ) launch_runtime( runtime, @@ -888,10 +889,14 @@ def _compile_one_kernel(kernel): orch_thread_num=self.orch_thread_num, ) - runtime.finalize() + runtime.finalize_round() if not self.skip_golden: self._compare_with_golden(outputs, golden) + t_round_end = time.perf_counter() + logger.info(f"HOST_TIMING round={round_idx} total_us={(t_round_end - t_round_start) * 1e6:.1f}") + + 
runtime.finalize() logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===") logger.info("=" * 60) diff --git a/python/bindings.py b/python/bindings.py index 049fad887..7f8fd860b 100644 --- a/python/bindings.py +++ b/python/bindings.py @@ -164,6 +164,19 @@ def _setup_functions(self): self.lib.finalize_runtime.argtypes = [c_void_p] self.lib.finalize_runtime.restype = c_int + # init_runtime_round - per-round data copy (INPUT+INOUT) to device + self.lib.init_runtime_round.argtypes = [ + c_void_p, # runtime + POINTER(TaskArgC), # orch_args + c_int, # orch_args_count + POINTER(c_int), # arg_types + ] + self.lib.init_runtime_round.restype = c_int + + # finalize_runtime_round - copy results back without freeing resources + self.lib.finalize_runtime_round.argtypes = [c_void_p] + self.lib.finalize_runtime_round.restype = c_int + # Note: register_kernel has been internalized into init_runtime # Kernel binaries are now passed directly to init_runtime() @@ -232,6 +245,32 @@ def __init__(self, lib: CDLL): size = lib.get_runtime_size() self._buffer = ctypes.create_string_buffer(size) self._handle = ctypes.cast(self._buffer, c_void_p) + self._initialized = False + + def _convert_orch_params(self, orch_args, arg_types): + """Convert orch_args and arg_types to ctypes arrays.""" + orch_args = orch_args or [] + orch_args_count = len(orch_args) + + # Accept either a nanobind TaskArgArray (from task_interface) or a + # plain list of TaskArgC structs. 
+ from _task_interface import TaskArgArray as _NbTaskArgArray + + if isinstance(orch_args, _NbTaskArgArray): + orch_args_array = cast(orch_args.ctypes_ptr(), POINTER(TaskArgC)) if orch_args_count > 0 else None + # Prevent GC of the nanobind array while the ctypes pointer is live + self._nb_args_ref = orch_args + elif orch_args_count > 0: + orch_args_array = (TaskArgC * orch_args_count)(*orch_args) + else: + orch_args_array = None + + if arg_types is not None and len(arg_types) > 0: + arg_types_array = (c_int * len(arg_types))(*arg_types) + else: + arg_types_array = None + + return orch_args_array, orch_args_count, arg_types_array def initialize( self, @@ -323,6 +362,7 @@ def initialize( ) if rc != 0: raise RuntimeError(f"init_runtime failed: {rc}") + self._initialized = True def finalize(self) -> None: """ @@ -335,10 +375,58 @@ def finalize(self) -> None: Raises: RuntimeError: If finalization fails """ + if not self._initialized: + return rc = self.lib.finalize_runtime(self._handle) if rc != 0: raise RuntimeError(f"finalize_runtime failed: {rc}") + self._initialized = False + + def initialize_round( + self, + orch_args: Optional[list] = None, + arg_types: Optional[List[int]] = None, + ) -> None: + """ + Per-round initialization: copy INPUT and INOUT tensor data to device. + + Uses existing device memory allocations from initialize(). + Called every round (including the first) before launch_runtime(). 
+ + Args: + orch_args: List of TaskArgC structs for orchestration + arg_types: Array describing each argument's type + + Raises: + RuntimeError: If round initialization fails + """ + orch_args_array, orch_args_count, arg_types_array = \ + self._convert_orch_params(orch_args, arg_types) + + rc = self.lib.init_runtime_round( + self._handle, + orch_args_array, + orch_args_count, + arg_types_array, + ) + if rc != 0: + raise RuntimeError(f"init_runtime_round failed: {rc}") + + def finalize_round(self) -> None: + """ + Round-level finalize: copy results back but keep device resources alive. + + Copies output/inout tensors from device to host without freeing + device memory or kernel binaries. Use between rounds in the same case. + + Raises: + RuntimeError: If round finalization fails + """ + rc = self.lib.finalize_runtime_round(self._handle) + if rc != 0: + # Not supported by this runtime, fallback to full finalize + self.finalize() def enable_profiling(self, enabled: bool = True) -> None: """ diff --git a/src/a2a3/platform/include/host/pto_runtime_c_api.h b/src/a2a3/platform/include/host/pto_runtime_c_api.h index 075f81899..6ff15681e 100644 --- a/src/a2a3/platform/include/host/pto_runtime_c_api.h +++ b/src/a2a3/platform/include/host/pto_runtime_c_api.h @@ -160,6 +160,38 @@ int launch_runtime(RuntimeHandle runtime, size_t aicore_size, int orch_thread_num); +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime(). + * Called every round (including the first) before launch_runtime(). + * + * Must be called after a successful init_runtime(). The Runtime handle + * must not have been fully finalized. 
+ * + * @param runtime Runtime handle (previously initialized) + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +int init_runtime_round(RuntimeHandle runtime, + const struct TaskArg* orch_args, + int orch_args_count, + int* arg_types); + +/** + * Round-level finalize: copy results back but keep device resources alive. + * + * Copies output/inout tensors from device to host, but does NOT free + * device memory, kernel binaries, or call the Runtime destructor. + * Use this between rounds within the same case. + * + * @param runtime Runtime handle to finalize for this round + * @return 0 on success, -1 on failure + */ +int finalize_runtime_round(RuntimeHandle runtime); + /** * Finalize and cleanup a runtime instance. * diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 72101898a..5e4d21bac 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -30,6 +30,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations for device memory functions used in init_runtime */ void* device_malloc(size_t size); @@ -200,6 +205,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) 
{ + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) { + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 417d96a74..aa57a4829 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -33,6 +33,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations */ void* device_malloc(size_t size); @@ -203,6 +208,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) { + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) 
{ + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp index c8ac410af..1813c2128 100644 --- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp @@ -254,7 +254,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, LOG_INFO("Device orchestration ready: %d args", orch_args_count); long long t_total_end = _now_ms(); - LOG_INFO("TIMING: args_malloc_copy = %lldms", t_args_end - t_args_start); + LOG_INFO("TIMING: args_malloc = %lldms", t_args_end - t_args_start); LOG_INFO("TIMING: orch_so_copy = %lldms", t_so_end - t_so_start); LOG_INFO("TIMING: gm_heap_alloc(1GB) = %lldms", t_heap_end - t_heap_start); LOG_INFO("TIMING: shared_mem_alloc = %lldms", t_sm_end - t_sm_start); @@ -267,9 +267,10 @@ extern "C" int init_runtime_impl(Runtime *runtime, * Validate runtime results and cleanup. * * This function: - * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 1. Frees device memory for recorded tensors + * 2. Clears tensor pair state + * + * Copy-back is handled by validate_runtime_round_impl (called per round). 
* * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -280,97 +281,151 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { return -1; } - int rc = 0; - - LOG_INFO("=== Copying Results Back to Host ==="); - - // Copy all recorded tensors from device back to host TensorPair* tensor_pairs = runtime->get_tensor_pairs(); int tensor_pair_count = runtime->get_tensor_pair_count(); - LOG_INFO("Tensor pairs to process: %d", tensor_pair_count); + // Cleanup device tensors + LOG_INFO("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO("Freed %d device allocations", tensor_pair_count); + + // Cleanup kernel binaries + int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO("Freed %d kernel binaries", kernel_count); + } + runtime->clear_registered_kernels(); + + // Clear tensor pairs + runtime->clear_tensor_pairs(); + + LOG_INFO("=== Finalize Complete ==="); + + return 0; +} + +/** + * Round-level validate: copy results back but keep device resources alive. + * + * Handles PTO2 packed graph output: if the shared-memory header contains a + * graph_output_ptr, the first output tensor is read from that packed buffer + * instead of the individually-recorded device pointer. 
+ * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_round_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + LOG_INFO("=== Round Finalize: Copying Results Back ==="); + + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int tensor_pair_count = runtime->get_tensor_pair_count(); - // PTO2 (device orchestration): graph output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); - uint64_t graph_out_ptr = 0; - uint64_t graph_out_size = 0; + void* graph_out_src = nullptr; + size_t graph_out_copy_size = 0; if (pto2_sm != nullptr) { - // Copy header from device to host to read graph_output_ptr/size PTO2SharedMemoryHeader host_header; - int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)); - if (hdr_rc == 0) { - graph_out_ptr = host_header.graph_output_ptr; - graph_out_size = host_header.graph_output_size; - if (graph_out_ptr != 0) { - LOG_INFO("Graph output buffer: ptr=0x%lx, size=%lu", (unsigned long)graph_out_ptr, (unsigned long)graph_out_size); + if (runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)) == 0) { + if (host_header.graph_output_ptr != 0 && host_header.graph_output_size > 0) { + graph_out_src = reinterpret_cast(static_cast(host_header.graph_output_ptr)); + graph_out_copy_size = static_cast(host_header.graph_output_size); } - } else { - LOG_WARN("Failed to copy PTO2 header from device"); } } - bool first_output_tensor = true; - for (int i = 0; i < tensor_pair_count; i++) { - const TensorPair& pair = tensor_pairs[i]; + int rc = 0; + bool first_output = true; - // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { - LOG_WARN("Tensor %d has null device pointer, skipping", i); - continue; - } + for (int i = 0; i < tensor_pair_count; ++i) { + const TensorPair& pair = tensor_pairs[i]; - // If host pointer 
is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { - LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i); + if (pair.dev_ptr == nullptr || pair.host_ptr == nullptr || pair.size == 0) { continue; } void* src_ptr = pair.dev_ptr; size_t copy_size = pair.size; - // Use graph_output_ptr for the first output tensor if available - if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { - src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); - copy_size = static_cast(graph_out_size); - LOG_INFO("Using packed output buffer for tensor %d", i); - first_output_tensor = false; + if (first_output && graph_out_src != nullptr) { + src_ptr = graph_out_src; + copy_size = graph_out_copy_size; + first_output = false; } int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size); if (copy_rc != 0) { - LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + LOG_ERROR("Failed to copy tensor %d from device", i); rc = copy_rc; - } else { - LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size); } } - // Cleanup device tensors - LOG_INFO("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - runtime->host_api.device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO("Freed %d device allocations", tensor_pair_count); + LOG_INFO("=== Round Finalize Complete ==="); + return rc; +} - // Cleanup kernel binaries - int kernel_count = runtime->get_registered_kernel_count(); - for (int i = 0; i < kernel_count; i++) { - int func_id = runtime->get_registered_kernel_func_id(i); - runtime->host_api.remove_kernel_binary(func_id); - runtime->set_function_bin_addr(func_id, 0); - } - if (kernel_count > 0) { - LOG_INFO("Freed %d kernel binaries", kernel_count); +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime_impl. 
+ * Called every round (including the first) before launch_runtime. + * + * @param runtime Pointer to previously initialized Runtime + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +extern "C" int init_runtime_round_impl(Runtime *runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)arg_types; + + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; } - runtime->clear_registered_kernels(); - // Clear tensor pairs - runtime->clear_tensor_pairs(); + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int total_pairs = runtime->get_tensor_pair_count(); + int pair_idx = 0; - LOG_INFO("=== Finalize Complete ==="); + for (int i = 0; i < orch_args_count; ++i) { + if (orch_args[i].kind != TaskArgKind::TENSOR) continue; - return rc; + if (pair_idx >= total_pairs) { + LOG_ERROR("init_round: tensor_pair index out of range at arg %d", i); + return -1; + } + + const TensorPair& pair = tensor_pairs[pair_idx]; + + size_t size = static_cast(orch_args[i].nbytes()); + void* host_ptr = reinterpret_cast(static_cast(orch_args[i].tensor.data)); + + if (size > 0 && host_ptr != nullptr && pair.dev_ptr != nullptr) { + int rc = runtime->host_api.copy_to_device(pair.dev_ptr, host_ptr, size); + if (rc != 0) { + LOG_ERROR("init_round: failed to copy arg %d to device", i); + return -1; + } + } + pair_idx++; + } + return 0; } diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index 992cced65..d8b91cbba 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -221,6 +221,44 @@ int validate_runtime_impl(Runtime *runtime) { return rc; } +int 
init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)runtime; + (void)orch_args; + (void)orch_args_count; + (void)arg_types; + // No-op: host orchestration manages device memory directly, + // so there is no per-round data copy to perform. + return 0; +} + +int validate_runtime_round_impl(Runtime* runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + int rc = 0; + + // Copy recorded tensors from device back to host (same as validate_runtime_impl + // but without freeing device memory or kernel binaries). + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + for (int i = 0; i < tensor_pair_count; i++) { + const TensorPair& pair = tensor_pairs[i]; + int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, pair.dev_ptr, pair.size); + if (copy_rc != 0) { + LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + rc = copy_rc; + } + } + + return rc; +} + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 134aff055..6b0faa898 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -254,7 +254,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, LOG_INFO("Device orchestration ready: %d args", orch_args_count); long long t_total_end = _now_ms(); - LOG_INFO("TIMING: args_malloc_copy = %lldms", t_args_end - t_args_start); + LOG_INFO("TIMING: args_malloc = %lldms", t_args_end - t_args_start); LOG_INFO("TIMING: orch_so_copy = %lldms", t_so_end - t_so_start); LOG_INFO("TIMING: gm_heap_alloc(1GB) = %lldms", t_heap_end - t_heap_start); LOG_INFO("TIMING: shared_mem_alloc = %lldms", t_sm_end - t_sm_start); @@ -267,9 +267,10 @@ extern "C" int 
init_runtime_impl(Runtime *runtime, * Validate runtime results and cleanup. * * This function: - * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 1. Frees device memory for recorded tensors + * 2. Clears tensor pair state + * + * Copy-back is handled by validate_runtime_round_impl (called per round). * * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -280,15 +281,110 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { return -1; } - int rc = 0; + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + // Cleanup device tensors + LOG_INFO("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO("Freed %d device allocations", tensor_pair_count); - LOG_INFO("=== Copying Results Back to Host ==="); + // Cleanup kernel binaries + int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO("Freed %d kernel binaries", kernel_count); + } + runtime->clear_registered_kernels(); + + // Clear tensor pairs + runtime->clear_tensor_pairs(); + + LOG_INFO("=== Finalize Complete ==="); + + return 0; +} + +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime_impl. + * Called every round (including the first) before launch_runtime. 
+ * + * @param runtime Pointer to previously initialized Runtime + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +extern "C" int init_runtime_round_impl(Runtime *runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)arg_types; + + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } - // Copy all recorded tensors from device back to host TensorPair* tensor_pairs = runtime->get_tensor_pairs(); - int tensor_pair_count = runtime->get_tensor_pair_count(); + const int total_pairs = runtime->get_tensor_pair_count(); + int pair_idx = 0; + + for (int i = 0; i < orch_args_count; ++i) { + if (orch_args[i].kind != TaskArgKind::TENSOR) continue; + + if (pair_idx >= total_pairs) { + LOG_ERROR("init_round: tensor_pair index out of range at arg %d", i); + return -1; + } + + const TensorPair& pair = tensor_pairs[pair_idx]; + + size_t size = static_cast(orch_args[i].nbytes()); + void* host_ptr = reinterpret_cast(static_cast(orch_args[i].tensor.data)); + + if (size > 0 && host_ptr != nullptr && pair.dev_ptr != nullptr) { + int rc = runtime->host_api.copy_to_device(pair.dev_ptr, host_ptr, size); + if (rc != 0) { + LOG_ERROR("init_round: failed to copy arg %d to device", i); + return -1; + } + } + pair_idx++; + } + return 0; +} + +/** + * Round-level validate: copy results back but keep device resources alive. + * + * Handles PTO2 packed graph output: if the shared-memory header contains a + * graph_output_ptr, the first output tensor is read from that packed buffer + * instead of the individually-recorded device pointer. 
+ * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_round_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } - LOG_INFO("Tensor pairs to process: %d", tensor_pair_count); + LOG_INFO("=== Round Finalize: Copying Results Back ==="); + + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int tensor_pair_count = runtime->get_tensor_pair_count(); // PTO2 (device orchestration): graph output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); @@ -296,7 +392,6 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { uint64_t graph_out_size = 0; if (pto2_sm != nullptr) { - // Copy header from device to host to read graph_output_ptr/size PTO2SharedMemoryHeader host_header; int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)); if (hdr_rc == 0) { @@ -310,19 +405,13 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { } } + int rc = 0; bool first_output_tensor = true; + for (int i = 0; i < tensor_pair_count; i++) { const TensorPair& pair = tensor_pairs[i]; - // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { - LOG_WARN("Tensor %d has null device pointer, skipping", i); - continue; - } - - // If host pointer is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { - LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i); + if (pair.dev_ptr == nullptr || pair.host_ptr == nullptr || pair.size == 0) { continue; } @@ -333,44 +422,16 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); copy_size = static_cast(graph_out_size); - LOG_INFO("Using packed output buffer for tensor %d", i); first_output_tensor = false; } int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, 
src_ptr, copy_size); if (copy_rc != 0) { - LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + LOG_ERROR("Failed to copy tensor %d from device", i); rc = copy_rc; - } else { - LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size); } } - // Cleanup device tensors - LOG_INFO("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - runtime->host_api.device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO("Freed %d device allocations", tensor_pair_count); - - // Cleanup kernel binaries - int kernel_count = runtime->get_registered_kernel_count(); - for (int i = 0; i < kernel_count; i++) { - int func_id = runtime->get_registered_kernel_func_id(i); - runtime->host_api.remove_kernel_binary(func_id); - runtime->set_function_bin_addr(func_id, 0); - } - if (kernel_count > 0) { - LOG_INFO("Freed %d kernel binaries", kernel_count); - } - runtime->clear_registered_kernels(); - - // Clear tensor pairs - runtime->clear_tensor_pairs(); - - LOG_INFO("=== Finalize Complete ==="); - + LOG_INFO("=== Round Finalize Complete ==="); return rc; } diff --git a/src/a5/platform/include/host/pto_runtime_c_api.h b/src/a5/platform/include/host/pto_runtime_c_api.h index 1c220486e..c9072d105 100644 --- a/src/a5/platform/include/host/pto_runtime_c_api.h +++ b/src/a5/platform/include/host/pto_runtime_c_api.h @@ -160,6 +160,38 @@ int launch_runtime(RuntimeHandle runtime, size_t aicore_size, int orch_thread_num); +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime(). + * Called every round (including the first) before launch_runtime(). + * + * Must be called after a successful init_runtime(). The Runtime handle + * must not have been fully finalized. 
+ * + * @param runtime Runtime handle (previously initialized) + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +int init_runtime_round(RuntimeHandle runtime, + const struct TaskArg* orch_args, + int orch_args_count, + int* arg_types); + +/** + * Round-level finalize: copy results back but keep device resources alive. + * + * Copies output/inout tensors from device to host, but does NOT free + * device memory, kernel binaries, or call the Runtime destructor. + * Use this between rounds within the same case. + * + * @param runtime Runtime handle to finalize for this round + * @return 0 on success, -1 on failure + */ +int finalize_runtime_round(RuntimeHandle runtime); + /** * Finalize and cleanup a runtime instance. * diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 209d6563e..b2369942b 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -30,6 +30,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations for device memory functions used in init_runtime */ void* device_malloc(size_t size); @@ -195,6 +200,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) 
{ + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) { + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 513ec698d..4e20b9ae1 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -33,6 +33,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations */ void* device_malloc(size_t size); @@ -198,6 +203,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) { + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) 
{ + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index 992cced65..52e1df5da 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -221,6 +221,44 @@ int validate_runtime_impl(Runtime *runtime) { return rc; } +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)runtime; + (void)orch_args; + (void)orch_args_count; + (void)arg_types; + // No-op: host orchestration manages device memory directly, + // so there is no per-round data copy to perform. + return 0; +} + +int validate_runtime_round_impl(Runtime* runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + int rc = 0; + + // Copy recorded tensors from device back to host (same as validate_runtime_impl + // but without freeing device memory or kernel binaries). 
+ TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + for (int i = 0; i < tensor_pair_count; i++) { + const TensorPair& pair = tensor_pairs[i]; + int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, pair.dev_ptr, pair.size); + if (copy_rc != 0) { + LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + rc = copy_rc; + } + } + + return rc; +} + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 134aff055..d89a97411 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -254,7 +254,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, LOG_INFO("Device orchestration ready: %d args", orch_args_count); long long t_total_end = _now_ms(); - LOG_INFO("TIMING: args_malloc_copy = %lldms", t_args_end - t_args_start); + LOG_INFO("TIMING: args_malloc = %lldms", t_args_end - t_args_start); LOG_INFO("TIMING: orch_so_copy = %lldms", t_so_end - t_so_start); LOG_INFO("TIMING: gm_heap_alloc(1GB) = %lldms", t_heap_end - t_heap_start); LOG_INFO("TIMING: shared_mem_alloc = %lldms", t_sm_end - t_sm_start); @@ -263,13 +263,15 @@ extern "C" int init_runtime_impl(Runtime *runtime, return 0; } + /** * Validate runtime results and cleanup. * * This function: - * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 1. Frees device memory for recorded tensors + * 2. Clears tensor pair state + * + * Copy-back is handled by validate_runtime_round_impl (called per round). 
* * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -280,15 +282,111 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { return -1; } - int rc = 0; + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + // Cleanup device tensors + LOG_INFO("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO("Freed %d device allocations", tensor_pair_count); + + // Cleanup kernel binaries + int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO("Freed %d kernel binaries", kernel_count); + } + runtime->clear_registered_kernels(); - LOG_INFO("=== Copying Results Back to Host ==="); + // Clear tensor pairs + runtime->clear_tensor_pairs(); + + LOG_INFO("=== Finalize Complete ==="); + + return 0; +} + + +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime_impl. + * Called every round (including the first) before launch_runtime. 
+ * + * @param runtime Pointer to previously initialized Runtime + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +extern "C" int init_runtime_round_impl(Runtime *runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)arg_types; + + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } - // Copy all recorded tensors from device back to host TensorPair* tensor_pairs = runtime->get_tensor_pairs(); - int tensor_pair_count = runtime->get_tensor_pair_count(); + const int total_pairs = runtime->get_tensor_pair_count(); + int pair_idx = 0; + + for (int i = 0; i < orch_args_count; ++i) { + if (orch_args[i].kind != TaskArgKind::TENSOR) continue; + + if (pair_idx >= total_pairs) { + LOG_ERROR("init_round: tensor_pair index out of range at arg %d", i); + return -1; + } + + const TensorPair& pair = tensor_pairs[pair_idx]; + + size_t size = static_cast<size_t>(orch_args[i].nbytes()); + void* host_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(orch_args[i].tensor.data)); + + if (size > 0 && host_ptr != nullptr && pair.dev_ptr != nullptr) { + int rc = runtime->host_api.copy_to_device(pair.dev_ptr, host_ptr, size); + if (rc != 0) { + LOG_ERROR("init_round: failed to copy arg %d to device", i); + return -1; + } + } + pair_idx++; + } + return 0; +} + +/** + * Round-level validate: copy results back but keep device resources alive. + * + * Handles PTO2 packed graph output: if the shared-memory header contains a + * graph_output_ptr, the first output tensor is read from that packed buffer + * instead of the individually-recorded device pointer. 
+ * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_round_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + LOG_INFO("=== Round Finalize: Copying Results Back ==="); - LOG_INFO("Tensor pairs to process: %d", tensor_pair_count); + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int tensor_pair_count = runtime->get_tensor_pair_count(); // PTO2 (device orchestration): graph output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); @@ -296,7 +394,6 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { uint64_t graph_out_size = 0; if (pto2_sm != nullptr) { - // Copy header from device to host to read graph_output_ptr/size PTO2SharedMemoryHeader host_header; int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)); if (hdr_rc == 0) { @@ -310,19 +407,13 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { } } + int rc = 0; bool first_output_tensor = true; + for (int i = 0; i < tensor_pair_count; i++) { const TensorPair& pair = tensor_pairs[i]; - // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { - LOG_WARN("Tensor %d has null device pointer, skipping", i); - continue; - } - - // If host pointer is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { - LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i); + if (pair.dev_ptr == nullptr || pair.host_ptr == nullptr || pair.size == 0) { continue; } @@ -333,44 +424,16 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { src_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(graph_out_ptr)); copy_size = static_cast<size_t>(graph_out_size); - LOG_INFO("Using packed output buffer for tensor %d", i); first_output_tensor = false; } int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, 
src_ptr, copy_size); if (copy_rc != 0) { - LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + LOG_ERROR("Failed to copy tensor %d from device", i); rc = copy_rc; - } else { - LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size); } } - // Cleanup device tensors - LOG_INFO("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - runtime->host_api.device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO("Freed %d device allocations", tensor_pair_count); - - // Cleanup kernel binaries - int kernel_count = runtime->get_registered_kernel_count(); - for (int i = 0; i < kernel_count; i++) { - int func_id = runtime->get_registered_kernel_func_id(i); - runtime->host_api.remove_kernel_binary(func_id); - runtime->set_function_bin_addr(func_id, 0); - } - if (kernel_count > 0) { - LOG_INFO("Freed %d kernel binaries", kernel_count); - } - runtime->clear_registered_kernels(); - - // Clear tensor pairs - runtime->clear_tensor_pairs(); - - LOG_INFO("=== Finalize Complete ==="); - + LOG_INFO("=== Round Finalize Complete ==="); return rc; } diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 64b283e81..28e6fccf2 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -49,6 +49,7 @@ ROUNDS=100 PLATFORM=a2a3 RUNTIME=tensormap_and_ringbuffer VERBOSE=0 +SHOW_HOST=0 EXTRA_ARGS=() while [[ $# -gt 0 ]]; do @@ -73,12 +74,16 @@ while [[ $# -gt 0 ]]; do VERBOSE=1 shift ;; + --show-host) + SHOW_HOST=1 + shift + ;; --help|-h) cat <<'USAGE' benchmark_rounds.sh — run all examples and report per-round timing from device logs Usage: - ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-v] + ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-v] [--show-host] Options: -p, --platform Platform to run on (default: a2a3) @@ -86,6 +91,7 @@ Options: -n, --rounds Override number of rounds for each example (default: 100) -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer 
(default), aicpu_build_graph -v, --verbose Save detailed run_example.py output to a timestamped log file + --show-host Show Host (us) timing column (default: hidden) -h, --help Show this help All other options are passed through to run_example.py (e.g. --case). @@ -168,6 +174,7 @@ DEVICE_LOG_DIR="$LOG_ROOT/device-${DEVICE_ID}" # --------------------------------------------------------------------------- parse_timing() { local log_file="$1" + local host_timing_file="${2:-}" local timing timing=$(grep -E 'Thread [0-9]+: (sched_start|orch_start|orch_end|sched_end|orch_stage_end)' "$log_file" || true) @@ -177,7 +184,7 @@ parse_timing() { return 1 fi - echo "$timing" | awk -v freq="$FREQ" ' + echo "$timing" | awk -v freq="$FREQ" -v host_file="$host_timing_file" -v show_host="$SHOW_HOST" ' function new_round() { flush_round() round++ @@ -203,6 +210,19 @@ parse_timing() { min_sched_start = 0; max_sched_end = 0 min_orch_start = 0; max_orch_end = 0 has_sched = 0; has_orch_end = 0 + has_host = 0; host_count = 0 + if (show_host == "1" && host_file != "") { + while ((getline hline < host_file) > 0) { + match(hline, /round=([0-9]+)/, hr) + match(hline, /total_us=([0-9.]+)/, hv) + if (hr[1] != "" && hv[1] != "") { + host_results[hr[1] + 0] = hv[1] + 0.0 + host_count++ + has_host = 1 + } + } + close(host_file) + } } /sched_start=/ { match($0, /Thread ([0-9]+):/, tm) @@ -255,11 +275,13 @@ parse_timing() { sep = sprintf(" %-8s %12s", "-----", "------------") if (show_sched) { hdr = hdr sprintf(" %12s", "Sched (us)"); sep = sep sprintf(" %12s", "----------") } if (show_orch) { hdr = hdr sprintf(" %12s", "Orch (us)"); sep = sep sprintf(" %12s", "---------") } + if (has_host) { hdr = hdr sprintf(" %12s", "Host (us)"); sep = sep sprintf(" %12s", "---------") } print hdr; print sep sum_v = 0; min_v = results[0]; max_v = results[0] sum_s = 0; min_s = sched_results[0]; max_s = sched_results[0] sum_o = 0; min_o = orch_results[0]; max_o = orch_results[0] + sum_h = 0; min_h = 
host_results[0]; max_h = host_results[0] for (i = 0; i < count; i++) { line = sprintf(" %-8d %12.1f", i, results[i]) @@ -278,12 +300,19 @@ parse_timing() { if (orch_results[i] < min_o) min_o = orch_results[i] if (orch_results[i] > max_o) max_o = orch_results[i] } + if (has_host) { + line = line sprintf(" %12.1f", host_results[i]) + sum_h += host_results[i] + if (host_results[i] < min_h) min_h = host_results[i] + if (host_results[i] > max_h) max_h = host_results[i] + } print line } printf "\n Avg: %.1f us", sum_v / count if (show_sched) printf " | Sched Avg: %.1f us", sum_s / count if (show_orch) printf " | Orch Avg: %.1f us", sum_o / count + if (has_host) printf " | Host Avg: %.1f us", sum_h / count printf " (%d rounds)\n", count TRIM = 10 @@ -321,6 +350,10 @@ parse_timing() { for (i = TRIM; i < count - TRIM; i++) ts3 += so[i] printf " Orch Trimmed Avg: %.1f us (dropped %d low + %d high)\n", ts3 / tc, TRIM, TRIM } + if (has_host) { + trimmed_h = (sum_h - min_h - max_h) / (count - 2) + printf " Host Trimmed Avg: %.1f us (excluding min=%.1f, max=%.1f)\n", trimmed_h, min_h, max_h + } } }' } @@ -386,15 +419,13 @@ run_bench() { fi run_cmd+=("${EXTRA_ARGS[@]}") - # Run example + # Run example (always capture output for HOST_TIMING extraction) vlog "Running: ${run_cmd[*]}" local rc=0 - if [[ -n "$VERBOSE_LOG" ]]; then - local run_output - run_output=$("${run_cmd[@]}" 2>&1) || rc=$? - if [[ -n "$run_output" ]]; then echo "$run_output" >> "$VERBOSE_LOG"; fi - else - "${run_cmd[@]}" > /dev/null 2>&1 || rc=$? + local run_output + run_output=$("${run_cmd[@]}" 2>&1) || rc=$? 
+ if [[ -n "$VERBOSE_LOG" && -n "$run_output" ]]; then + echo "$run_output" >> "$VERBOSE_LOG" fi if [[ $rc -ne 0 ]]; then echo " FAILED: run_example.py returned non-zero" @@ -414,9 +445,16 @@ run_bench() { fi echo " Log: $new_log" + + # Extract HOST_TIMING lines to temp file for parse_timing + local host_timing_file + host_timing_file=$(mktemp) + echo "$run_output" | grep 'HOST_TIMING' > "$host_timing_file" 2>/dev/null || true + local timing_output local parse_rc=0 - timing_output=$(parse_timing "$new_log") || parse_rc=$? + timing_output=$(parse_timing "$new_log" "$host_timing_file") || parse_rc=$? + rm -f "$host_timing_file" echo "$timing_output" if [[ $parse_rc -ne 0 ]]; then @@ -431,17 +469,19 @@ run_bench() { local avg_line avg_line=$(echo "$timing_output" | grep "^ Avg:" || true) - local avg_elapsed="-" avg_sched="-" avg_orch="-" + local avg_elapsed="-" avg_sched="-" avg_orch="-" avg_host="-" if [[ -n "$avg_line" ]]; then avg_elapsed=$(echo "$avg_line" | awk '{print $2}') avg_sched=$(echo "$avg_line" | grep -o 'Sched Avg: [0-9.]*' | awk '{print $3}') || avg_sched="-" avg_orch=$(echo "$avg_line" | grep -o 'Orch Avg: [0-9.]*' | awk '{print $3}') || avg_orch="-" + avg_host=$(echo "$avg_line" | grep -o 'Host Avg: [0-9.]*' | awk '{print $3}') || avg_host="-" fi SUMMARY_NAMES+=("$label") SUMMARY_ELAPSED+=("$avg_elapsed") SUMMARY_SCHED+=("$avg_sched") SUMMARY_ORCH+=("$avg_orch") + SUMMARY_HOST+=("$avg_host") } # --------------------------------------------------------------------------- @@ -455,6 +495,7 @@ SUMMARY_NAMES=() SUMMARY_ELAPSED=() SUMMARY_SCHED=() SUMMARY_ORCH=() +SUMMARY_HOST=() echo "" echo "Runtime: $RUNTIME" @@ -495,9 +536,11 @@ if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then # Check if any sched/orch data exists across all runs _has_sched=0 _has_orch=0 + _has_host=0 for _i in "${!SUMMARY_NAMES[@]}"; do [[ "${SUMMARY_SCHED[$_i]}" != "-" ]] && _has_sched=1 [[ "${SUMMARY_ORCH[$_i]}" != "-" ]] && _has_orch=1 + [[ $SHOW_HOST -eq 1 && "${SUMMARY_HOST[$_i]}" != 
"-" ]] && _has_host=1 done echo "" @@ -517,6 +560,10 @@ if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then _hdr=$(printf "%s %12s" "$_hdr" "Orch (us)") _sep=$(printf "%s %12s" "$_sep" "------------") fi + if [[ $_has_host -eq 1 ]]; then + _hdr=$(printf "%s %12s" "$_hdr" "Host (us)") + _sep=$(printf "%s %12s" "$_sep" "------------") + fi echo "$_hdr" echo "$_sep" @@ -529,6 +576,9 @@ if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then if [[ $_has_orch -eq 1 ]]; then _row=$(printf "%s %12s" "$_row" "${SUMMARY_ORCH[$_i]}") fi + if [[ $_has_host -eq 1 ]]; then + _row=$(printf "%s %12s" "$_row" "${SUMMARY_HOST[$_i]}") + fi echo "$_row" done fi