diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 4b8daa3d5..faddfe3cd 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -836,10 +836,6 @@ def _compile_one_kernel(kernel): logger.debug(f"Tensor order: {list(tensors.keys())}") logger.debug(f"orch_args count: {len(orch_args)}") - # Create and initialize runtime (including kernel registration) - logger.info("=== Initializing Runtime ===") - runtime = Runtime() - # Build environment for runtime initialization run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir) if run_env: @@ -856,27 +852,32 @@ def _compile_one_kernel(kernel): initial_outputs = {k: v.clone() for k, v in outputs.items()} + runtime = Runtime() + + with _temporary_env(run_env): + runtime.initialize( + orch_so_binary, + self.orchestration["function_name"], + orch_args, + kernel_binaries=kernel_binaries, + ) + for round_idx in range(self.repeat_rounds): if self.repeat_rounds > 1: logger.info(f"--- Round {round_idx + 1}/{self.repeat_rounds} ---") - for k, v in initial_outputs.items(): - outputs[k].copy_(v) - - runtime = Runtime() + t_round_start = time.perf_counter() - # Enable profiling if requested (only first round) if self.enable_profiling and round_idx == 0: runtime.enable_profiling(True) logger.info("Profiling enabled") - with _temporary_env(run_env): - runtime.initialize( - orch_so_binary, - self.orchestration["function_name"], - orch_args, - kernel_binaries=kernel_binaries, - ) + for k, v in initial_outputs.items(): + outputs[k].copy_(v) + + runtime.initialize_round( + orch_args, + ) launch_runtime( runtime, @@ -888,10 +889,14 @@ def _compile_one_kernel(kernel): orch_thread_num=self.orch_thread_num, ) - runtime.finalize() + runtime.finalize_round() if not self.skip_golden: self._compare_with_golden(outputs, golden) + t_round_end = time.perf_counter() + logger.info(f"HOST_TIMING round={round_idx} total_us={(t_round_end - t_round_start) * 1e6:.1f}") + + 
runtime.finalize() logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===") logger.info("=" * 60) diff --git a/python/bindings.py b/python/bindings.py index 049fad887..7f8fd860b 100644 --- a/python/bindings.py +++ b/python/bindings.py @@ -164,6 +164,19 @@ def _setup_functions(self): self.lib.finalize_runtime.argtypes = [c_void_p] self.lib.finalize_runtime.restype = c_int + # init_runtime_round - per-round data copy (INPUT+INOUT) to device + self.lib.init_runtime_round.argtypes = [ + c_void_p, # runtime + POINTER(TaskArgC), # orch_args + c_int, # orch_args_count + POINTER(c_int), # arg_types + ] + self.lib.init_runtime_round.restype = c_int + + # finalize_runtime_round - copy results back without freeing resources + self.lib.finalize_runtime_round.argtypes = [c_void_p] + self.lib.finalize_runtime_round.restype = c_int + # Note: register_kernel has been internalized into init_runtime # Kernel binaries are now passed directly to init_runtime() @@ -232,6 +245,32 @@ def __init__(self, lib: CDLL): size = lib.get_runtime_size() self._buffer = ctypes.create_string_buffer(size) self._handle = ctypes.cast(self._buffer, c_void_p) + self._initialized = False + + def _convert_orch_params(self, orch_args, arg_types): + """Convert orch_args and arg_types to ctypes arrays.""" + orch_args = orch_args or [] + orch_args_count = len(orch_args) + + # Accept either a nanobind TaskArgArray (from task_interface) or a + # plain list of TaskArgC structs. 
+ from _task_interface import TaskArgArray as _NbTaskArgArray + + if isinstance(orch_args, _NbTaskArgArray): + orch_args_array = cast(orch_args.ctypes_ptr(), POINTER(TaskArgC)) if orch_args_count > 0 else None + # Prevent GC of the nanobind array while the ctypes pointer is live + self._nb_args_ref = orch_args + elif orch_args_count > 0: + orch_args_array = (TaskArgC * orch_args_count)(*orch_args) + else: + orch_args_array = None + + if arg_types is not None and len(arg_types) > 0: + arg_types_array = (c_int * len(arg_types))(*arg_types) + else: + arg_types_array = None + + return orch_args_array, orch_args_count, arg_types_array def initialize( self, @@ -323,6 +362,7 @@ def initialize( ) if rc != 0: raise RuntimeError(f"init_runtime failed: {rc}") + self._initialized = True def finalize(self) -> None: """ @@ -335,10 +375,58 @@ def finalize(self) -> None: Raises: RuntimeError: If finalization fails """ + if not self._initialized: + return rc = self.lib.finalize_runtime(self._handle) if rc != 0: raise RuntimeError(f"finalize_runtime failed: {rc}") + self._initialized = False + + def initialize_round( + self, + orch_args: Optional[list] = None, + arg_types: Optional[List[int]] = None, + ) -> None: + """ + Per-round initialization: copy INPUT and INOUT tensor data to device. + + Uses existing device memory allocations from initialize(). + Called every round (including the first) before launch_runtime(). 
+ + Args: + orch_args: List of TaskArgC structs for orchestration + arg_types: Array describing each argument's type + + Raises: + RuntimeError: If round initialization fails + """ + orch_args_array, orch_args_count, arg_types_array = \ + self._convert_orch_params(orch_args, arg_types) + + rc = self.lib.init_runtime_round( + self._handle, + orch_args_array, + orch_args_count, + arg_types_array, + ) + if rc != 0: + raise RuntimeError(f"init_runtime_round failed: {rc}") + + def finalize_round(self) -> None: + """ + Round-level finalize: copy results back but keep device resources alive. + + Copies output/inout tensors from device to host without freeing + device memory or kernel binaries. Use between rounds in the same case. + + Raises: + RuntimeError: If round finalization fails + """ + rc = self.lib.finalize_runtime_round(self._handle) + if rc != 0: + # Not supported by this runtime, fallback to full finalize + self.finalize() def enable_profiling(self, enabled: bool = True) -> None: """ diff --git a/src/a2a3/platform/include/host/pto_runtime_c_api.h b/src/a2a3/platform/include/host/pto_runtime_c_api.h index 075f81899..6ff15681e 100644 --- a/src/a2a3/platform/include/host/pto_runtime_c_api.h +++ b/src/a2a3/platform/include/host/pto_runtime_c_api.h @@ -160,6 +160,38 @@ int launch_runtime(RuntimeHandle runtime, size_t aicore_size, int orch_thread_num); +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime(). + * Called every round (including the first) before launch_runtime(). + * + * Must be called after a successful init_runtime(). The Runtime handle + * must not have been fully finalized. 
+ * + * @param runtime Runtime handle (previously initialized) + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +int init_runtime_round(RuntimeHandle runtime, + const struct TaskArg* orch_args, + int orch_args_count, + int* arg_types); + +/** + * Round-level finalize: copy results back but keep device resources alive. + * + * Copies output/inout tensors from device to host, but does NOT free + * device memory, kernel binaries, or call the Runtime destructor. + * Use this between rounds within the same case. + * + * @param runtime Runtime handle to finalize for this round + * @return 0 on success, -1 on failure + */ +int finalize_runtime_round(RuntimeHandle runtime); + /** * Finalize and cleanup a runtime instance. * diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 72101898a..5e4d21bac 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -30,6 +30,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations for device memory functions used in init_runtime */ void* device_malloc(size_t size); @@ -200,6 +205,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) 
{ + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) { + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 417d96a74..aa57a4829 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -33,6 +33,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations */ void* device_malloc(size_t size); @@ -203,6 +208,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) { + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) 
{ + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp index c8ac410af..1813c2128 100644 --- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp @@ -254,7 +254,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, LOG_INFO("Device orchestration ready: %d args", orch_args_count); long long t_total_end = _now_ms(); - LOG_INFO("TIMING: args_malloc_copy = %lldms", t_args_end - t_args_start); + LOG_INFO("TIMING: args_malloc = %lldms", t_args_end - t_args_start); LOG_INFO("TIMING: orch_so_copy = %lldms", t_so_end - t_so_start); LOG_INFO("TIMING: gm_heap_alloc(1GB) = %lldms", t_heap_end - t_heap_start); LOG_INFO("TIMING: shared_mem_alloc = %lldms", t_sm_end - t_sm_start); @@ -267,9 +267,10 @@ extern "C" int init_runtime_impl(Runtime *runtime, * Validate runtime results and cleanup. * * This function: - * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 1. Frees device memory for recorded tensors + * 2. Clears tensor pair state + * + * Copy-back is handled by validate_runtime_round_impl (called per round). 
* * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -280,97 +281,151 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { return -1; } - int rc = 0; - - LOG_INFO("=== Copying Results Back to Host ==="); - - // Copy all recorded tensors from device back to host TensorPair* tensor_pairs = runtime->get_tensor_pairs(); int tensor_pair_count = runtime->get_tensor_pair_count(); - LOG_INFO("Tensor pairs to process: %d", tensor_pair_count); + // Cleanup device tensors + LOG_INFO("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO("Freed %d device allocations", tensor_pair_count); + + // Cleanup kernel binaries + int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO("Freed %d kernel binaries", kernel_count); + } + runtime->clear_registered_kernels(); + + // Clear tensor pairs + runtime->clear_tensor_pairs(); + + LOG_INFO("=== Finalize Complete ==="); + + return 0; +} + +/** + * Round-level validate: copy results back but keep device resources alive. + * + * Handles PTO2 packed graph output: if the shared-memory header contains a + * graph_output_ptr, the first output tensor is read from that packed buffer + * instead of the individually-recorded device pointer. 
+ * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_round_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + LOG_INFO("=== Round Finalize: Copying Results Back ==="); + + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int tensor_pair_count = runtime->get_tensor_pair_count(); - // PTO2 (device orchestration): graph output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); - uint64_t graph_out_ptr = 0; - uint64_t graph_out_size = 0; + void* graph_out_src = nullptr; + size_t graph_out_copy_size = 0; if (pto2_sm != nullptr) { - // Copy header from device to host to read graph_output_ptr/size PTO2SharedMemoryHeader host_header; - int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)); - if (hdr_rc == 0) { - graph_out_ptr = host_header.graph_output_ptr; - graph_out_size = host_header.graph_output_size; - if (graph_out_ptr != 0) { - LOG_INFO("Graph output buffer: ptr=0x%lx, size=%lu", (unsigned long)graph_out_ptr, (unsigned long)graph_out_size); + if (runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)) == 0) { + if (host_header.graph_output_ptr != 0 && host_header.graph_output_size > 0) { + graph_out_src = reinterpret_cast(static_cast(host_header.graph_output_ptr)); + graph_out_copy_size = static_cast(host_header.graph_output_size); } - } else { - LOG_WARN("Failed to copy PTO2 header from device"); } } - bool first_output_tensor = true; - for (int i = 0; i < tensor_pair_count; i++) { - const TensorPair& pair = tensor_pairs[i]; + int rc = 0; + bool first_output = true; - // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { - LOG_WARN("Tensor %d has null device pointer, skipping", i); - continue; - } + for (int i = 0; i < tensor_pair_count; ++i) { + const TensorPair& pair = tensor_pairs[i]; - // If host pointer 
is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { - LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i); + if (pair.dev_ptr == nullptr || pair.host_ptr == nullptr || pair.size == 0) { continue; } void* src_ptr = pair.dev_ptr; size_t copy_size = pair.size; - // Use graph_output_ptr for the first output tensor if available - if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { - src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); - copy_size = static_cast(graph_out_size); - LOG_INFO("Using packed output buffer for tensor %d", i); - first_output_tensor = false; + if (first_output && graph_out_src != nullptr) { + src_ptr = graph_out_src; + copy_size = graph_out_copy_size; + first_output = false; } int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size); if (copy_rc != 0) { - LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + LOG_ERROR("Failed to copy tensor %d from device", i); rc = copy_rc; - } else { - LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size); } } - // Cleanup device tensors - LOG_INFO("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - runtime->host_api.device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO("Freed %d device allocations", tensor_pair_count); + LOG_INFO("=== Round Finalize Complete ==="); + return rc; +} - // Cleanup kernel binaries - int kernel_count = runtime->get_registered_kernel_count(); - for (int i = 0; i < kernel_count; i++) { - int func_id = runtime->get_registered_kernel_func_id(i); - runtime->host_api.remove_kernel_binary(func_id); - runtime->set_function_bin_addr(func_id, 0); - } - if (kernel_count > 0) { - LOG_INFO("Freed %d kernel binaries", kernel_count); +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime_impl. 
+ * Called every round (including the first) before launch_runtime. + * + * @param runtime Pointer to previously initialized Runtime + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +extern "C" int init_runtime_round_impl(Runtime *runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)arg_types; + + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; } - runtime->clear_registered_kernels(); - // Clear tensor pairs - runtime->clear_tensor_pairs(); + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int total_pairs = runtime->get_tensor_pair_count(); + int pair_idx = 0; - LOG_INFO("=== Finalize Complete ==="); + for (int i = 0; i < orch_args_count; ++i) { + if (orch_args[i].kind != TaskArgKind::TENSOR) continue; - return rc; + if (pair_idx >= total_pairs) { + LOG_ERROR("init_round: tensor_pair index out of range at arg %d", i); + return -1; + } + + const TensorPair& pair = tensor_pairs[pair_idx]; + + size_t size = static_cast(orch_args[i].nbytes()); + void* host_ptr = reinterpret_cast(static_cast(orch_args[i].tensor.data)); + + if (size > 0 && host_ptr != nullptr && pair.dev_ptr != nullptr) { + int rc = runtime->host_api.copy_to_device(pair.dev_ptr, host_ptr, size); + if (rc != 0) { + LOG_ERROR("init_round: failed to copy arg %d to device", i); + return -1; + } + } + pair_idx++; + } + return 0; } diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index 992cced65..d8b91cbba 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -221,6 +221,44 @@ int validate_runtime_impl(Runtime *runtime) { return rc; } +int 
init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)runtime; + (void)orch_args; + (void)orch_args_count; + (void)arg_types; + // No-op: host orchestration manages device memory directly, + // so there is no per-round data copy to perform. + return 0; +} + +int validate_runtime_round_impl(Runtime* runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + int rc = 0; + + // Copy recorded tensors from device back to host (same as validate_runtime_impl + // but without freeing device memory or kernel binaries). + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + for (int i = 0; i < tensor_pair_count; i++) { + const TensorPair& pair = tensor_pairs[i]; + int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, pair.dev_ptr, pair.size); + if (copy_rc != 0) { + LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + rc = copy_rc; + } + } + + return rc; +} + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 134aff055..6b0faa898 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -254,7 +254,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, LOG_INFO("Device orchestration ready: %d args", orch_args_count); long long t_total_end = _now_ms(); - LOG_INFO("TIMING: args_malloc_copy = %lldms", t_args_end - t_args_start); + LOG_INFO("TIMING: args_malloc = %lldms", t_args_end - t_args_start); LOG_INFO("TIMING: orch_so_copy = %lldms", t_so_end - t_so_start); LOG_INFO("TIMING: gm_heap_alloc(1GB) = %lldms", t_heap_end - t_heap_start); LOG_INFO("TIMING: shared_mem_alloc = %lldms", t_sm_end - t_sm_start); @@ -267,9 +267,10 @@ extern "C" int 
init_runtime_impl(Runtime *runtime, * Validate runtime results and cleanup. * * This function: - * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 1. Frees device memory for recorded tensors + * 2. Clears tensor pair state + * + * Copy-back is handled by validate_runtime_round_impl (called per round). * * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -280,15 +281,110 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { return -1; } - int rc = 0; + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + // Cleanup device tensors + LOG_INFO("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO("Freed %d device allocations", tensor_pair_count); - LOG_INFO("=== Copying Results Back to Host ==="); + // Cleanup kernel binaries + int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO("Freed %d kernel binaries", kernel_count); + } + runtime->clear_registered_kernels(); + + // Clear tensor pairs + runtime->clear_tensor_pairs(); + + LOG_INFO("=== Finalize Complete ==="); + + return 0; +} + +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime_impl. + * Called every round (including the first) before launch_runtime. 
+ * + * @param runtime Pointer to previously initialized Runtime + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +extern "C" int init_runtime_round_impl(Runtime *runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)arg_types; + + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } - // Copy all recorded tensors from device back to host TensorPair* tensor_pairs = runtime->get_tensor_pairs(); - int tensor_pair_count = runtime->get_tensor_pair_count(); + const int total_pairs = runtime->get_tensor_pair_count(); + int pair_idx = 0; + + for (int i = 0; i < orch_args_count; ++i) { + if (orch_args[i].kind != TaskArgKind::TENSOR) continue; + + if (pair_idx >= total_pairs) { + LOG_ERROR("init_round: tensor_pair index out of range at arg %d", i); + return -1; + } + + const TensorPair& pair = tensor_pairs[pair_idx]; + + size_t size = static_cast(orch_args[i].nbytes()); + void* host_ptr = reinterpret_cast(static_cast(orch_args[i].tensor.data)); + + if (size > 0 && host_ptr != nullptr && pair.dev_ptr != nullptr) { + int rc = runtime->host_api.copy_to_device(pair.dev_ptr, host_ptr, size); + if (rc != 0) { + LOG_ERROR("init_round: failed to copy arg %d to device", i); + return -1; + } + } + pair_idx++; + } + return 0; +} + +/** + * Round-level validate: copy results back but keep device resources alive. + * + * Handles PTO2 packed graph output: if the shared-memory header contains a + * graph_output_ptr, the first output tensor is read from that packed buffer + * instead of the individually-recorded device pointer. 
+ * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_round_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } - LOG_INFO("Tensor pairs to process: %d", tensor_pair_count); + LOG_INFO("=== Round Finalize: Copying Results Back ==="); + + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int tensor_pair_count = runtime->get_tensor_pair_count(); // PTO2 (device orchestration): graph output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); @@ -296,7 +392,6 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { uint64_t graph_out_size = 0; if (pto2_sm != nullptr) { - // Copy header from device to host to read graph_output_ptr/size PTO2SharedMemoryHeader host_header; int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)); if (hdr_rc == 0) { @@ -310,19 +405,13 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { } } + int rc = 0; bool first_output_tensor = true; + for (int i = 0; i < tensor_pair_count; i++) { const TensorPair& pair = tensor_pairs[i]; - // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { - LOG_WARN("Tensor %d has null device pointer, skipping", i); - continue; - } - - // If host pointer is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { - LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i); + if (pair.dev_ptr == nullptr || pair.host_ptr == nullptr || pair.size == 0) { continue; } @@ -333,44 +422,16 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); copy_size = static_cast(graph_out_size); - LOG_INFO("Using packed output buffer for tensor %d", i); first_output_tensor = false; } int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, 
src_ptr, copy_size); if (copy_rc != 0) { - LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + LOG_ERROR("Failed to copy tensor %d from device", i); rc = copy_rc; - } else { - LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size); } } - // Cleanup device tensors - LOG_INFO("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - runtime->host_api.device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO("Freed %d device allocations", tensor_pair_count); - - // Cleanup kernel binaries - int kernel_count = runtime->get_registered_kernel_count(); - for (int i = 0; i < kernel_count; i++) { - int func_id = runtime->get_registered_kernel_func_id(i); - runtime->host_api.remove_kernel_binary(func_id); - runtime->set_function_bin_addr(func_id, 0); - } - if (kernel_count > 0) { - LOG_INFO("Freed %d kernel binaries", kernel_count); - } - runtime->clear_registered_kernels(); - - // Clear tensor pairs - runtime->clear_tensor_pairs(); - - LOG_INFO("=== Finalize Complete ==="); - + LOG_INFO("=== Round Finalize Complete ==="); return rc; } diff --git a/src/a5/platform/include/host/pto_runtime_c_api.h b/src/a5/platform/include/host/pto_runtime_c_api.h index 1c220486e..c9072d105 100644 --- a/src/a5/platform/include/host/pto_runtime_c_api.h +++ b/src/a5/platform/include/host/pto_runtime_c_api.h @@ -160,6 +160,38 @@ int launch_runtime(RuntimeHandle runtime, size_t aicore_size, int orch_thread_num); +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime(). + * Called every round (including the first) before launch_runtime(). + * + * Must be called after a successful init_runtime(). The Runtime handle + * must not have been fully finalized. 
+ * + * @param runtime Runtime handle (previously initialized) + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +int init_runtime_round(RuntimeHandle runtime, + const struct TaskArg* orch_args, + int orch_args_count, + int* arg_types); + +/** + * Round-level finalize: copy results back but keep device resources alive. + * + * Copies output/inout tensors from device to host, but does NOT free + * device memory, kernel binaries, or call the Runtime destructor. + * Use this between rounds within the same case. + * + * @param runtime Runtime handle to finalize for this round + * @return 0 on success, -1 on failure + */ +int finalize_runtime_round(RuntimeHandle runtime); + /** * Finalize and cleanup a runtime instance. * diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 209d6563e..b2369942b 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -30,6 +30,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations for device memory functions used in init_runtime */ void* device_malloc(size_t size); @@ -195,6 +200,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) 
{ + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) { + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 513ec698d..4e20b9ae1 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -33,6 +33,11 @@ int init_runtime_impl(Runtime* runtime, const size_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types); +int validate_runtime_round_impl(Runtime* runtime); /* Forward declarations */ void* device_malloc(size_t size); @@ -198,6 +203,33 @@ int launch_runtime(RuntimeHandle runtime, } } +int init_runtime_round(RuntimeHandle runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return init_runtime_round_impl(r, orch_args, orch_args_count, arg_types); + } catch (...) { + return -1; + } +} + +int finalize_runtime_round(RuntimeHandle runtime) { + if (runtime == NULL) { + return -1; + } + try { + Runtime* r = static_cast(runtime); + return validate_runtime_round_impl(r); + } catch (...) 
{ + return -1; + } +} + int finalize_runtime(RuntimeHandle runtime) { if (runtime == NULL) { return -1; diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index 992cced65..52e1df5da 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -221,6 +221,44 @@ int validate_runtime_impl(Runtime *runtime) { return rc; } +int init_runtime_round_impl(Runtime* runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)runtime; + (void)orch_args; + (void)orch_args_count; + (void)arg_types; + // No-op: host orchestration manages device memory directly, + // so there is no per-round data copy to perform. + return 0; +} + +int validate_runtime_round_impl(Runtime* runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + int rc = 0; + + // Copy recorded tensors from device back to host (same as validate_runtime_impl + // but without freeing device memory or kernel binaries). 
+ TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + for (int i = 0; i < tensor_pair_count; i++) { + const TensorPair& pair = tensor_pairs[i]; + int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, pair.dev_ptr, pair.size); + if (copy_rc != 0) { + LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + rc = copy_rc; + } + } + + return rc; +} + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 134aff055..d89a97411 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -254,7 +254,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, LOG_INFO("Device orchestration ready: %d args", orch_args_count); long long t_total_end = _now_ms(); - LOG_INFO("TIMING: args_malloc_copy = %lldms", t_args_end - t_args_start); + LOG_INFO("TIMING: args_malloc = %lldms", t_args_end - t_args_start); LOG_INFO("TIMING: orch_so_copy = %lldms", t_so_end - t_so_start); LOG_INFO("TIMING: gm_heap_alloc(1GB) = %lldms", t_heap_end - t_heap_start); LOG_INFO("TIMING: shared_mem_alloc = %lldms", t_sm_end - t_sm_start); @@ -263,13 +263,15 @@ extern "C" int init_runtime_impl(Runtime *runtime, return 0; } + /** * Validate runtime results and cleanup. * * This function: - * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 1. Frees device memory for recorded tensors + * 2. Clears tensor pair state + * + * Copy-back is handled by validate_runtime_round_impl (called per round). 
* * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -280,15 +282,111 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { return -1; } - int rc = 0; + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + int tensor_pair_count = runtime->get_tensor_pair_count(); + + // Cleanup device tensors + LOG_INFO("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO("Freed %d device allocations", tensor_pair_count); + + // Cleanup kernel binaries + int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->host_api.remove_kernel_binary(func_id); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO("Freed %d kernel binaries", kernel_count); + } + runtime->clear_registered_kernels(); - LOG_INFO("=== Copying Results Back to Host ==="); + // Clear tensor pairs + runtime->clear_tensor_pairs(); + + LOG_INFO("=== Finalize Complete ==="); + + return 0; +} + + +/** + * Per-round initialization: copy INPUT and INOUT tensor data to device. + * + * Uses existing device memory allocations from init_runtime_impl. + * Called every round (including the first) before launch_runtime. 
+ * + * @param runtime Pointer to previously initialized Runtime + * @param orch_args Array of TaskArg describing orchestration arguments + * @param orch_args_count Number of orchestration arguments + * @param arg_types Array describing each argument's type (ArgType enum) + * @return 0 on success, -1 on failure + */ +extern "C" int init_runtime_round_impl(Runtime *runtime, + const TaskArg* orch_args, + int orch_args_count, + int* arg_types) { + (void)arg_types; + + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } - // Copy all recorded tensors from device back to host TensorPair* tensor_pairs = runtime->get_tensor_pairs(); - int tensor_pair_count = runtime->get_tensor_pair_count(); + const int total_pairs = runtime->get_tensor_pair_count(); + int pair_idx = 0; + + for (int i = 0; i < orch_args_count; ++i) { + if (orch_args[i].kind != TaskArgKind::TENSOR) continue; + + if (pair_idx >= total_pairs) { + LOG_ERROR("init_round: tensor_pair index out of range at arg %d", i); + return -1; + } + + const TensorPair& pair = tensor_pairs[pair_idx]; + + size_t size = static_cast<size_t>(orch_args[i].nbytes()); + void* host_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(orch_args[i].tensor.data)); + + if (size > 0 && host_ptr != nullptr && pair.dev_ptr != nullptr) { + int rc = runtime->host_api.copy_to_device(pair.dev_ptr, host_ptr, size); + if (rc != 0) { + LOG_ERROR("init_round: failed to copy arg %d to device", i); + return -1; + } + } + pair_idx++; + } + return 0; +} + +/** + * Round-level validate: copy results back but keep device resources alive. + * + * Handles PTO2 packed graph output: if the shared-memory header contains a + * graph_output_ptr, the first output tensor is read from that packed buffer + * instead of the individually-recorded device pointer. 
+ * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_round_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + LOG_INFO("=== Round Finalize: Copying Results Back ==="); - LOG_INFO("Tensor pairs to process: %d", tensor_pair_count); + TensorPair* tensor_pairs = runtime->get_tensor_pairs(); + const int tensor_pair_count = runtime->get_tensor_pair_count(); // PTO2 (device orchestration): graph output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); @@ -296,7 +394,6 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { uint64_t graph_out_size = 0; if (pto2_sm != nullptr) { - // Copy header from device to host to read graph_output_ptr/size PTO2SharedMemoryHeader host_header; int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader)); if (hdr_rc == 0) { @@ -310,19 +407,13 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { } } + int rc = 0; bool first_output_tensor = true; + for (int i = 0; i < tensor_pair_count; i++) { const TensorPair& pair = tensor_pairs[i]; - // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { - LOG_WARN("Tensor %d has null device pointer, skipping", i); - continue; - } - - // If host pointer is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { - LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i); + if (pair.dev_ptr == nullptr || pair.host_ptr == nullptr || pair.size == 0) { continue; } @@ -333,44 +424,16 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { src_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(graph_out_ptr)); copy_size = static_cast<size_t>(graph_out_size); - LOG_INFO("Using packed output buffer for tensor %d", i); first_output_tensor = false; } int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, 
src_ptr, copy_size); if (copy_rc != 0) { - LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + LOG_ERROR("Failed to copy tensor %d from device", i); rc = copy_rc; - } else { - LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size); } } - // Cleanup device tensors - LOG_INFO("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - runtime->host_api.device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO("Freed %d device allocations", tensor_pair_count); - - // Cleanup kernel binaries - int kernel_count = runtime->get_registered_kernel_count(); - for (int i = 0; i < kernel_count; i++) { - int func_id = runtime->get_registered_kernel_func_id(i); - runtime->host_api.remove_kernel_binary(func_id); - runtime->set_function_bin_addr(func_id, 0); - } - if (kernel_count > 0) { - LOG_INFO("Freed %d kernel binaries", kernel_count); - } - runtime->clear_registered_kernels(); - - // Clear tensor pairs - runtime->clear_tensor_pairs(); - - LOG_INFO("=== Finalize Complete ==="); - + LOG_INFO("=== Round Finalize Complete ==="); return rc; } diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 64b283e81..28e6fccf2 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -49,6 +49,7 @@ ROUNDS=100 PLATFORM=a2a3 RUNTIME=tensormap_and_ringbuffer VERBOSE=0 +SHOW_HOST=0 EXTRA_ARGS=() while [[ $# -gt 0 ]]; do @@ -73,12 +74,16 @@ while [[ $# -gt 0 ]]; do VERBOSE=1 shift ;; + --show-host) + SHOW_HOST=1 + shift + ;; --help|-h) cat <<'USAGE' benchmark_rounds.sh — run all examples and report per-round timing from device logs Usage: - ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-v] + ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-v] [--show-host] Options: -p, --platform Platform to run on (default: a2a3) @@ -86,6 +91,7 @@ Options: -n, --rounds Override number of rounds for each example (default: 100) -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer 
(default), aicpu_build_graph -v, --verbose Save detailed run_example.py output to a timestamped log file + --show-host Show Host (us) timing column (default: hidden) -h, --help Show this help All other options are passed through to run_example.py (e.g. --case). @@ -168,6 +174,7 @@ DEVICE_LOG_DIR="$LOG_ROOT/device-${DEVICE_ID}" # --------------------------------------------------------------------------- parse_timing() { local log_file="$1" + local host_timing_file="${2:-}" local timing timing=$(grep -E 'Thread [0-9]+: (sched_start|orch_start|orch_end|sched_end|orch_stage_end)' "$log_file" || true) @@ -177,7 +184,7 @@ parse_timing() { return 1 fi - echo "$timing" | awk -v freq="$FREQ" ' + echo "$timing" | awk -v freq="$FREQ" -v host_file="$host_timing_file" -v show_host="$SHOW_HOST" ' function new_round() { flush_round() round++ @@ -203,6 +210,19 @@ parse_timing() { min_sched_start = 0; max_sched_end = 0 min_orch_start = 0; max_orch_end = 0 has_sched = 0; has_orch_end = 0 + has_host = 0; host_count = 0 + if (show_host == "1" && host_file != "") { + while ((getline hline < host_file) > 0) { + match(hline, /round=([0-9]+)/, hr) + match(hline, /total_us=([0-9.]+)/, hv) + if (hr[1] != "" && hv[1] != "") { + host_results[hr[1] + 0] = hv[1] + 0.0 + host_count++ + has_host = 1 + } + } + close(host_file) + } } /sched_start=/ { match($0, /Thread ([0-9]+):/, tm) @@ -255,11 +275,13 @@ parse_timing() { sep = sprintf(" %-8s %12s", "-----", "------------") if (show_sched) { hdr = hdr sprintf(" %12s", "Sched (us)"); sep = sep sprintf(" %12s", "----------") } if (show_orch) { hdr = hdr sprintf(" %12s", "Orch (us)"); sep = sep sprintf(" %12s", "---------") } + if (has_host) { hdr = hdr sprintf(" %12s", "Host (us)"); sep = sep sprintf(" %12s", "---------") } print hdr; print sep sum_v = 0; min_v = results[0]; max_v = results[0] sum_s = 0; min_s = sched_results[0]; max_s = sched_results[0] sum_o = 0; min_o = orch_results[0]; max_o = orch_results[0] + sum_h = 0; min_h = 
host_results[0]; max_h = host_results[0] for (i = 0; i < count; i++) { line = sprintf(" %-8d %12.1f", i, results[i]) @@ -278,12 +300,19 @@ parse_timing() { if (orch_results[i] < min_o) min_o = orch_results[i] if (orch_results[i] > max_o) max_o = orch_results[i] } + if (has_host) { + line = line sprintf(" %12.1f", host_results[i]) + sum_h += host_results[i] + if (host_results[i] < min_h) min_h = host_results[i] + if (host_results[i] > max_h) max_h = host_results[i] + } print line } printf "\n Avg: %.1f us", sum_v / count if (show_sched) printf " | Sched Avg: %.1f us", sum_s / count if (show_orch) printf " | Orch Avg: %.1f us", sum_o / count + if (has_host) printf " | Host Avg: %.1f us", sum_h / count printf " (%d rounds)\n", count TRIM = 10 @@ -321,6 +350,10 @@ parse_timing() { for (i = TRIM; i < count - TRIM; i++) ts3 += so[i] printf " Orch Trimmed Avg: %.1f us (dropped %d low + %d high)\n", ts3 / tc, TRIM, TRIM } + if (has_host) { + trimmed_h = (sum_h - min_h - max_h) / (count - 2) + printf " Host Trimmed Avg: %.1f us (excluding min=%.1f, max=%.1f)\n", trimmed_h, min_h, max_h + } } }' } @@ -386,15 +419,13 @@ run_bench() { fi run_cmd+=("${EXTRA_ARGS[@]}") - # Run example + # Run example (always capture output for HOST_TIMING extraction) vlog "Running: ${run_cmd[*]}" local rc=0 - if [[ -n "$VERBOSE_LOG" ]]; then - local run_output - run_output=$("${run_cmd[@]}" 2>&1) || rc=$? - if [[ -n "$run_output" ]]; then echo "$run_output" >> "$VERBOSE_LOG"; fi - else - "${run_cmd[@]}" > /dev/null 2>&1 || rc=$? + local run_output + run_output=$("${run_cmd[@]}" 2>&1) || rc=$? 
+ if [[ -n "$VERBOSE_LOG" && -n "$run_output" ]]; then + echo "$run_output" >> "$VERBOSE_LOG" fi if [[ $rc -ne 0 ]]; then echo " FAILED: run_example.py returned non-zero" @@ -414,9 +445,16 @@ run_bench() { fi echo " Log: $new_log" + + # Extract HOST_TIMING lines to temp file for parse_timing + local host_timing_file + host_timing_file=$(mktemp) + echo "$run_output" | grep 'HOST_TIMING' > "$host_timing_file" 2>/dev/null || true + local timing_output local parse_rc=0 - timing_output=$(parse_timing "$new_log") || parse_rc=$? + timing_output=$(parse_timing "$new_log" "$host_timing_file") || parse_rc=$? + rm -f "$host_timing_file" echo "$timing_output" if [[ $parse_rc -ne 0 ]]; then @@ -431,17 +469,19 @@ run_bench() { local avg_line avg_line=$(echo "$timing_output" | grep "^ Avg:" || true) - local avg_elapsed="-" avg_sched="-" avg_orch="-" + local avg_elapsed="-" avg_sched="-" avg_orch="-" avg_host="-" if [[ -n "$avg_line" ]]; then avg_elapsed=$(echo "$avg_line" | awk '{print $2}') avg_sched=$(echo "$avg_line" | grep -o 'Sched Avg: [0-9.]*' | awk '{print $3}') || avg_sched="-" avg_orch=$(echo "$avg_line" | grep -o 'Orch Avg: [0-9.]*' | awk '{print $3}') || avg_orch="-" + avg_host=$(echo "$avg_line" | grep -o 'Host Avg: [0-9.]*' | awk '{print $3}') || avg_host="-" fi SUMMARY_NAMES+=("$label") SUMMARY_ELAPSED+=("$avg_elapsed") SUMMARY_SCHED+=("$avg_sched") SUMMARY_ORCH+=("$avg_orch") + SUMMARY_HOST+=("$avg_host") } # --------------------------------------------------------------------------- @@ -455,6 +495,7 @@ SUMMARY_NAMES=() SUMMARY_ELAPSED=() SUMMARY_SCHED=() SUMMARY_ORCH=() +SUMMARY_HOST=() echo "" echo "Runtime: $RUNTIME" @@ -495,9 +536,11 @@ if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then # Check if any sched/orch data exists across all runs _has_sched=0 _has_orch=0 + _has_host=0 for _i in "${!SUMMARY_NAMES[@]}"; do [[ "${SUMMARY_SCHED[$_i]}" != "-" ]] && _has_sched=1 [[ "${SUMMARY_ORCH[$_i]}" != "-" ]] && _has_orch=1 + [[ $SHOW_HOST -eq 1 && "${SUMMARY_HOST[$_i]}" != 
"-" ]] && _has_host=1 done echo "" @@ -517,6 +560,10 @@ if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then _hdr=$(printf "%s %12s" "$_hdr" "Orch (us)") _sep=$(printf "%s %12s" "$_sep" "------------") fi + if [[ $_has_host -eq 1 ]]; then + _hdr=$(printf "%s %12s" "$_hdr" "Host (us)") + _sep=$(printf "%s %12s" "$_sep" "------------") + fi echo "$_hdr" echo "$_sep" @@ -529,6 +576,9 @@ if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then if [[ $_has_orch -eq 1 ]]; then _row=$(printf "%s %12s" "$_row" "${SUMMARY_ORCH[$_i]}") fi + if [[ $_has_host -eq 1 ]]; then + _row=$(printf "%s %12s" "$_row" "${SUMMARY_HOST[$_i]}") + fi echo "$_row" done fi