From ae1544f30a02152d0cd4dc0e0bd8ce5f8c739fb8 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Fri, 3 Apr 2026 19:30:22 +0800 Subject: [PATCH] Add: OrchestrationRuntime vtable for host_build_graph, decouple orch from runtime.h Introduce an opaque OrchestrationRuntime + function-pointer table (orchestration_api.h) so host_build_graph orchestration SOs no longer include runtime.h. Orchestration sources include only orchestration_api.h and interact with the runtime through free-function wrappers. - orchestration_api.h (a2a3 + a5): C-compatible vtable with inline helpers add_task, add_successor, record_tensor_pair, device_malloc, device_free, copy_to_device, get_task_count, print_runtime - runtime_maker.cpp: wraps Runtime* in OrchestrationRuntimeImpl, calls orch function as OrchestrationFunc(OrchestrationRuntime*, orch_args); SO loading delegated to platform via extern-C load_orch_so (sim: mkstemp, onboard: getpid path) implemented in pto_runtime_c_api.cpp - build_config.py (a2a3 + a5): add orchestration compile target with include_dirs so kernel_compiler resolves orchestration_api.h - All host_build_graph example orchestration sources (bgemm, matmul, paged_attention, vector_example for a2a3 + a5) migrated to new API - tests/st host_build_graph paged_attention orchestration migrated - Docs updated (INCORE_ORCHESTRATION_GUIDE.md, RUNTIME_LOGIC.md, scripts/README.md, code_runner.py) Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../kernels/orchestration/bgemm_orch.cpp | 45 +++++------ .../docs/INCORE_ORCHESTRATION_GUIDE.md | 37 ++++----- .../kernels/orchestration/matmul_orch.cpp | 81 +++++++++---------- .../orchestration/paged_attention_orch.cpp | 55 +++++++------ .../kernels/orchestration/example_orch.cpp | 65 ++++++++------- .../orchestration/paged_attention_orch.cpp | 55 +++++++------ examples/scripts/README.md | 42 ++++++---- examples/scripts/code_runner.py | 7 +- 
.../runtime/host_build_graph/build_config.py | 24 +++--- .../host_build_graph/docs/RUNTIME_LOGIC.md | 8 +- .../host_build_graph/host/runtime_maker.cpp | 68 ++++++++++++---- .../orchestration/orchestration_api.h | 74 +++++++++++++++++ .../runtime/host_build_graph/build_config.py | 24 +++--- .../host_build_graph/docs/RUNTIME_LOGIC.md | 8 +- .../host_build_graph/host/runtime_maker.cpp | 68 ++++++++++++---- .../orchestration/orchestration_api.h | 74 +++++++++++++++++ .../orchestration/paged_attention_orch.cpp | 55 +++++++------ .../orchestration/paged_attention_orch.cpp | 55 +++++++------ 18 files changed, 540 insertions(+), 305 deletions(-) create mode 100644 src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h create mode 100644 src/a5/runtime/host_build_graph/orchestration/orchestration_api.h diff --git a/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp index fea12a74c..927a69e8a 100644 --- a/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -31,8 +31,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) extern "C" { @@ -44,7 +43,7 @@ constexpr int BATCH = 1; constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float); -int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_bgemm_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { // Expected orch_args: [A, B, C] — 3 tensors if (orch_args.tensor_count() < 3) { std::cerr << "build_bgemm_graph: Expected at least 3 tensors, got " << orch_args.tensor_count() << '\n'; @@ -62,38 +61,38 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { std::cout << "Grid: " << GRID_M << " x 
" << GRID_K << " x " << GRID_N << '\n'; // Allocate device memory and copy inputs - void *dev_A = runtime->host_api.device_malloc(size_A); + void *dev_A = device_malloc(runtime, size_A); if (!dev_A) return -1; - runtime->host_api.copy_to_device(dev_A, host_A, size_A); + copy_to_device(runtime, dev_A, host_A, size_A); - void *dev_B = runtime->host_api.device_malloc(size_B); + void *dev_B = device_malloc(runtime, size_B); if (!dev_B) { - runtime->host_api.device_free(dev_A); + device_free(runtime, dev_A); return -1; } - runtime->host_api.copy_to_device(dev_B, host_B, size_B); + copy_to_device(runtime, dev_B, host_B, size_B); - void *dev_C = runtime->host_api.device_malloc(size_C); + void *dev_C = device_malloc(runtime, size_C); if (!dev_C) { - runtime->host_api.device_free(dev_A); - runtime->host_api.device_free(dev_B); + device_free(runtime, dev_A); + device_free(runtime, dev_B); return -1; } - runtime->host_api.copy_to_device(dev_C, host_C, size_C); - runtime->record_tensor_pair(host_C, dev_C, size_C); + copy_to_device(runtime, dev_C, host_C, size_C); + record_tensor_pair(runtime, host_C, dev_C, size_C); // Allocate intermediate P buffers (one per C tile) constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N; std::vector dev_P(NUM_P_BUFFERS, nullptr); for (int i = 0; i < NUM_P_BUFFERS; i++) { - dev_P[i] = runtime->host_api.device_malloc(TILE_BYTES); + dev_P[i] = device_malloc(runtime, TILE_BYTES); if (!dev_P[i]) { for (int j = 0; j < i; j++) { - runtime->host_api.device_free(dev_P[j]); + device_free(runtime, dev_P[j]); } - runtime->host_api.device_free(dev_A); - runtime->host_api.device_free(dev_B); - runtime->host_api.device_free(dev_C); + device_free(runtime, dev_A); + device_free(runtime, dev_B); + device_free(runtime, dev_C); return -1; } } @@ -121,7 +120,7 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_gemm[3] = TILE; args_gemm[4] = TILE; args_gemm[5] = TILE; - int t_gemm = runtime->add_task(args_gemm, 6, 0, 
CoreType::AIC); + int t_gemm = add_task(runtime, args_gemm, 6, 0, CoreType::AIC); // Task 2: C[m,n] = C[m,n] + P (tile_add on Vector core) uint64_t args_add[5]; @@ -130,14 +129,14 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_add[2] = reinterpret_cast(static_cast(dev_C) + C_offset); args_add[3] = TILE; args_add[4] = TILE; - int t_add = runtime->add_task(args_add, 5, 1, CoreType::AIV); + int t_add = add_task(runtime, args_add, 5, 1, CoreType::AIV); // Dependency: gemm must complete before add - runtime->add_successor(t_gemm, t_add); + add_successor(runtime, t_gemm, t_add); // Dependency: previous add must complete before current gemm (K accumulation) if (last_add_task[c_tile_idx] >= 0) { - runtime->add_successor(last_add_task[c_tile_idx], t_gemm); + add_successor(runtime, last_add_task[c_tile_idx], t_gemm); } last_add_task[c_tile_idx] = t_add; } @@ -145,7 +144,7 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { } } - std::cout << "Created " << runtime->get_task_count() << " tasks\n"; + std::cout << "Created " << get_task_count(runtime) << " tasks\n"; return 0; } diff --git a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md index 733d31d20..fc632cc7b 100644 --- a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -1,49 +1,50 @@ # InCore Orchestration Guide: host_build_graph ## Goal -In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `Runtime::add_task`, and wires dependencies with `Runtime::add_successor`. + +In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `add_task(runtime, ...)`, and wires dependencies with `add_successor(runtime, ...)`. 
## Where To Put Orchestration Code + - Each example keeps orchestration sources under `examples/host_build_graph//kernels/orchestration/`. - `examples/host_build_graph//kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../example_orch.cpp", "function_name": "build_example_graph"}`. ## Function Signature + Your orchestration entry must be `extern "C"` and match: ```cpp -int build_graph(Runtime* runtime, uint64_t* args, int arg_count); +int build_graph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args); ``` -`Runtime` is defined in `src/runtime/host_build_graph/runtime/runtime.h`. +Include `orchestration_api.h`. Do not include `runtime.h` in orchestration sources. ## Argument Layout -When you use the default `golden.py` tensor argument order (`TENSOR_ORDER`), the argument layout built by `examples/scripts/code_runner.py` is: -``` -[ptr_0, ptr_1, ..., ptr_n, nbytes_0, nbytes_1, ..., nbytes_n, element_count] -``` +`orch_args` contains separated tensor and scalar arguments through `ChipStorageTaskArgs`. -- Pointers are host pointers to CPU tensors. -- Sizes are byte sizes for each tensor in `TENSOR_ORDER`. -- `element_count` is the element count of the first tensor. - -If `golden.py` returns an explicit argument list, that list becomes `args` directly. Validate `arg_count` defensively in your orchestration. +- Use `orch_args.tensor(i)` to read tensor metadata and host pointers +- Use `orch_args.scalar(i)` to read scalar values +- Validate `tensor_count()` / `scalar_count()` defensively in orchestration code ## Building The Graph + A typical host orchestration sequence is: -1. Allocate device buffers with `runtime->host_api.device_malloc`. -2. Copy inputs to device with `runtime->host_api.copy_to_device`. -3. Record output buffers with `runtime->record_tensor_pair(host_ptr, dev_ptr, size)` so finalize can copy them back. -4. Create tasks with `runtime->add_task(args, num_args, func_id, core_type)`. 
-5. Add dependency edges with `runtime->add_successor(producer, consumer)`. +1. Allocate device buffers with `device_malloc(runtime, size)`. +2. Copy inputs to device with `copy_to_device(runtime, dev_ptr, host_ptr, size)`. +3. Record output buffers with `record_tensor_pair(runtime, host_ptr, dev_ptr, size)` so finalize can copy them back. +4. Create tasks with `add_task(runtime, args, num_args, func_id, core_type)`. +5. Add dependency edges with `add_successor(runtime, producer, consumer)`. Example: see `examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp`. ## Kernel Mapping + - `func_id` and `core_type` are defined in `kernels/kernel_config.py` under `KERNELS`. - The host uploads kernel binaries via `upload_kernel_binary` and stores addresses in `Runtime::func_id_to_addr_[]`. The platform layer resolves per-task `Task::function_bin_addr` from this map before copying to device. ## Debugging Tips -- Use `runtime->print_runtime()` to dump the task graph. + +- Use `print_runtime(runtime)` to dump the task graph. - Fail fast on arg count or allocation errors to avoid undefined behavior. diff --git a/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp index 1d5a130fd..96e8e39be 100644 --- a/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp +++ b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp @@ -22,8 +22,8 @@ * * This orchestration function: * 1. Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes) - * 2. Allocates device memory via runtime->host_api - * 3. Copies input data to device via runtime->host_api + * 2. Allocates device memory via orchestration API helpers + * 3. Copies input data to device via orchestration API helpers * 4. Records output tensor for copy-back during finalize * 5. 
Builds the task graph with 4 tasks (2 AIV + 2 AIC) */ @@ -31,12 +31,11 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) extern "C" { -int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_matmul_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { // Validate argument count // Expected orch_args: [a, w1, w2, f] — 4 tensors if (orch_args.tensor_count() < 4) { @@ -62,63 +61,63 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { // Allocate device memory and copy inputs std::cout << "\n=== Allocating Device Memory ===" << '\n'; - void *dev_a = runtime->host_api.device_malloc(size_a); + void *dev_a = device_malloc(runtime, size_a); if (!dev_a) { std::cerr << "Error: Failed to allocate device memory for A\n"; return -1; } - runtime->host_api.copy_to_device(dev_a, host_a, size_a); + copy_to_device(runtime, dev_a, host_a, size_a); std::cout << "Tensor A: " << size_a << " bytes copied to device\n"; - void *dev_w1 = runtime->host_api.device_malloc(size_w1); + void *dev_w1 = device_malloc(runtime, size_w1); if (!dev_w1) { std::cerr << "Error: Failed to allocate device memory for W1\n"; - runtime->host_api.device_free(dev_a); + device_free(runtime, dev_a); return -1; } - runtime->host_api.copy_to_device(dev_w1, host_w1, size_w1); + copy_to_device(runtime, dev_w1, host_w1, size_w1); std::cout << "Tensor W1: " << size_w1 << " bytes copied to device\n"; - void *dev_w2 = runtime->host_api.device_malloc(size_w2); + void *dev_w2 = device_malloc(runtime, size_w2); if (!dev_w2) { std::cerr << "Error: Failed to allocate device memory for W2\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_w1); + device_free(runtime, dev_a); + device_free(runtime, dev_w1); return -1; } - runtime->host_api.copy_to_device(dev_w2, host_w2, 
size_w2); + copy_to_device(runtime, dev_w2, host_w2, size_w2); std::cout << "Tensor W2: " << size_w2 << " bytes copied to device\n"; - void *dev_f = runtime->host_api.device_malloc(size_f); + void *dev_f = device_malloc(runtime, size_f); if (!dev_f) { std::cerr << "Error: Failed to allocate device memory for F\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_w1); - runtime->host_api.device_free(dev_w2); + device_free(runtime, dev_a); + device_free(runtime, dev_w1); + device_free(runtime, dev_w2); return -1; } // Record output tensor for copy-back during finalize - runtime->record_tensor_pair(host_f, dev_f, size_f); + record_tensor_pair(runtime, host_f, dev_f, size_f); std::cout << "Tensor F (output): " << size_f << " bytes allocated\n"; // Allocate intermediate tensors (b, c, d) // dev_b is half precision (output of log_sqrt kernel, input to matmul) // dev_c, dev_d are float precision (output of matmul kernels) - size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes - size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes - void *dev_b = runtime->host_api.device_malloc(BYTES_HALF); // sqrt(log(A)) - half output - void *dev_c = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W1 - float output - void *dev_d = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W2 - float output + size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes + size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes + void *dev_b = device_malloc(runtime, BYTES_HALF); // sqrt(log(A)) - half output + void *dev_c = device_malloc(runtime, BYTES_FLOAT); // B @ W1 - float output + void *dev_d = device_malloc(runtime, BYTES_FLOAT); // B @ W2 - float output if (!dev_b || !dev_c || !dev_d) { std::cerr << "Error: Failed to allocate intermediate tensors\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_w1); - runtime->host_api.device_free(dev_w2); - runtime->host_api.device_free(dev_f); - if (dev_b) 
runtime->host_api.device_free(dev_b); - if (dev_c) runtime->host_api.device_free(dev_c); - if (dev_d) runtime->host_api.device_free(dev_d); + device_free(runtime, dev_a); + device_free(runtime, dev_w1); + device_free(runtime, dev_w2); + device_free(runtime, dev_f); + if (dev_b) device_free(runtime, dev_b); + if (dev_c) device_free(runtime, dev_c); + if (dev_d) device_free(runtime, dev_d); return -1; } @@ -130,7 +129,7 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t0[0] = reinterpret_cast(dev_a); // src args_t0[1] = reinterpret_cast(dev_b); // out args_t0[2] = SIZE; // size - int t0 = runtime->add_task(args_t0, 3, 0, CoreType::AIV); + int t0 = add_task(runtime, args_t0, 3, 0, CoreType::AIV); // Task 1: C = B @ W1 (func_id=1: kernel_matmul, AIC) uint64_t args_t1[4]; @@ -138,7 +137,7 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t1[1] = reinterpret_cast(dev_w1); // src1 (right matrix) args_t1[2] = reinterpret_cast(dev_c); // out args_t1[3] = SIZE; // size - int t1 = runtime->add_task(args_t1, 4, 1, CoreType::AIC); + int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIC); // Task 2: D = B @ W2 (func_id=1: kernel_matmul, AIC) uint64_t args_t2[4]; @@ -146,7 +145,7 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t2[1] = reinterpret_cast(dev_w2); // src1 (right matrix) args_t2[2] = reinterpret_cast(dev_d); // out args_t2[3] = SIZE; // size - int t2 = runtime->add_task(args_t2, 4, 1, CoreType::AIC); + int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIC); // Task 3: F = exp(C + D) (func_id=2: kernel_add_exp, AIV) uint64_t args_t3[4]; @@ -154,13 +153,13 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t3[1] = reinterpret_cast(dev_d); // src1 args_t3[2] = reinterpret_cast(dev_f); // out args_t3[3] = SIZE; // size - int t3 = runtime->add_task(args_t3, 4, 2, CoreType::AIV); + int t3 = add_task(runtime, 
args_t3, 4, 2, CoreType::AIV); // Add dependencies (diamond: t0→t1→t3, t0→t2→t3) - runtime->add_successor(t0, t1); // t0 → t1 - runtime->add_successor(t0, t2); // t0 → t2 - runtime->add_successor(t1, t3); // t1 → t3 - runtime->add_successor(t2, t3); // t2 → t3 + add_successor(runtime, t0, t1); // t0 → t1 + add_successor(runtime, t0, t2); // t0 → t2 + add_successor(runtime, t1, t3); // t1 → t3 + add_successor(runtime, t2, t3); // t2 → t3 std::cout << "\nTasks:\n"; std::cout << " task" << t0 << ": B = sqrt(log(A)) [AIV]\n"; @@ -169,8 +168,8 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { std::cout << " task" << t3 << ": F = exp(C + D) [AIV]\n"; std::cout << "Dependencies: t0→t1→t3, t0→t2→t3 (diamond)\n"; - std::cout << "Created runtime with " << runtime->get_task_count() << " tasks\n"; - runtime->print_runtime(); + std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n"; + print_runtime(runtime); return 0; } diff --git a/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 017a0dc5a..17dbd02ce 100644 --- a/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << 
orch_args.tensor_count() << '\n'; return -1; @@ -83,20 +82,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -114,11 +113,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = 
runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -132,9 +131,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -193,7 +192,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -201,18 +200,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), static_cast(q_tile_size), static_cast(block_size)}; - int t_sf = runtime->add_task(sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: 
pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 1 : 0; @@ -224,12 +223,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -246,7 +245,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; } diff --git a/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp index 84173028f..82555d9ef 100644 --- a/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp +++ b/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp @@ -15,20 +15,19 @@ * * This orchestration function: * 1. 
Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes) - * 2. Allocates device memory via runtime->host_api - * 3. Copies input data to device via runtime->host_api + * 2. Allocates device memory via orchestration API helpers + * 3. Copies input data to device via orchestration API helpers * 4. Records output tensor for copy-back during finalize * 5. Builds the task graph */ #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) extern "C" { -int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_example_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { // Validate argument count // Expected orch_args: [a, b, f] — 3 tensors if (orch_args.tensor_count() < 3) { @@ -52,48 +51,48 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) // Allocate device memory and copy inputs std::cout << "\n=== Allocating Device Memory ===" << '\n'; - void *dev_a = runtime->host_api.device_malloc(size_a); + void *dev_a = device_malloc(runtime, size_a); if (!dev_a) { std::cerr << "Error: Failed to allocate device memory for a\n"; return -1; } - runtime->host_api.copy_to_device(dev_a, host_a, size_a); + copy_to_device(runtime, dev_a, host_a, size_a); std::cout << "Tensor a: " << size_a << " bytes copied to device\n"; - void *dev_b = runtime->host_api.device_malloc(size_b); + void *dev_b = device_malloc(runtime, size_b); if (!dev_b) { std::cerr << "Error: Failed to allocate device memory for b\n"; - runtime->host_api.device_free(dev_a); + device_free(runtime, dev_a); return -1; } - runtime->host_api.copy_to_device(dev_b, host_b, size_b); + copy_to_device(runtime, dev_b, host_b, size_b); std::cout << "Tensor b: " << size_b << " bytes copied to device\n"; - void *dev_f = runtime->host_api.device_malloc(size_f); + void *dev_f = device_malloc(runtime, 
size_f); if (!dev_f) { std::cerr << "Error: Failed to allocate device memory for f\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_b); + device_free(runtime, dev_a); + device_free(runtime, dev_b); return -1; } // Record output tensor for copy-back during finalize - runtime->record_tensor_pair(host_f, dev_f, size_f); + record_tensor_pair(runtime, host_f, dev_f, size_f); std::cout << "Tensor f (output): " << size_f << " bytes allocated\n"; // Allocate intermediate tensors (c, d, e) size_t BYTES = SIZE * sizeof(float); - void *dev_c = runtime->host_api.device_malloc(BYTES); - void *dev_d = runtime->host_api.device_malloc(BYTES); - void *dev_e = runtime->host_api.device_malloc(BYTES); + void *dev_c = device_malloc(runtime, BYTES); + void *dev_d = device_malloc(runtime, BYTES); + void *dev_e = device_malloc(runtime, BYTES); if (!dev_c || !dev_d || !dev_e) { std::cerr << "Error: Failed to allocate intermediate tensors\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_b); - runtime->host_api.device_free(dev_f); - if (dev_c) runtime->host_api.device_free(dev_c); - if (dev_d) runtime->host_api.device_free(dev_d); - if (dev_e) runtime->host_api.device_free(dev_e); + device_free(runtime, dev_a); + device_free(runtime, dev_b); + device_free(runtime, dev_f); + if (dev_c) device_free(runtime, dev_c); + if (dev_d) device_free(runtime, dev_d); + if (dev_e) device_free(runtime, dev_e); return -1; } @@ -111,7 +110,7 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t0[1] = reinterpret_cast(dev_b); // src1 args_t0[2] = reinterpret_cast(dev_c); // out args_t0[3] = SIZE; // size - int t0 = runtime->add_task(args_t0, 4, 0, CoreType::AIV); + int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV); // Task 1: d = c + 1 (func_id=1: kernel_add_scalar, AIV) uint64_t args_t1[4]; @@ -120,7 +119,7 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t1[1] = 
scalar_converter.u64; // scalar=1.0 args_t1[2] = reinterpret_cast(dev_d); // out args_t1[3] = SIZE; // size - int t1 = runtime->add_task(args_t1, 4, 1, CoreType::AIV); + int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIV); // Task 2: e = c + 2 (func_id=1: kernel_add_scalar, AIV) uint64_t args_t2[4]; @@ -129,7 +128,7 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t2[1] = scalar_converter.u64; // scalar=2.0 args_t2[2] = reinterpret_cast(dev_e); // out args_t2[3] = SIZE; // size - int t2 = runtime->add_task(args_t2, 4, 1, CoreType::AIV); + int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIV); // Task 3: f = d * e (func_id=2: kernel_mul, AIV) uint64_t args_t3[4]; @@ -137,13 +136,13 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t3[1] = reinterpret_cast(dev_e); // src1 args_t3[2] = reinterpret_cast(dev_f); // out args_t3[3] = SIZE; // size - int t3 = runtime->add_task(args_t3, 4, 2, CoreType::AIV); + int t3 = add_task(runtime, args_t3, 4, 2, CoreType::AIV); // Add dependencies - runtime->add_successor(t0, t1); // t0 → t1 - runtime->add_successor(t0, t2); // t0 → t2 - runtime->add_successor(t1, t3); // t1 → t3 - runtime->add_successor(t2, t3); // t2 → t3 + add_successor(runtime, t0, t1); // t0 → t1 + add_successor(runtime, t0, t2); // t0 → t2 + add_successor(runtime, t1, t3); // t1 → t3 + add_successor(runtime, t2, t3); // t2 → t3 std::cout << "\nTasks:\n"; std::cout << " task" << t0 << ": c = a + b\n"; @@ -152,8 +151,8 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) std::cout << " task" << t3 << ": f = d * e\n"; std::cout << "Dependencies: t0→t1, t0→t2, t1→t3, t2→t3\n"; - std::cout << "Created runtime with " << runtime->get_task_count() << " tasks\n"; - runtime->print_runtime(); + std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n"; + print_runtime(runtime); return 0; } diff --git 
a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 017a0dc5a..17dbd02ce 100644 --- a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; return -1; @@ -83,20 +82,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, 
query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -114,11 +113,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -132,9 +131,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, 
oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -193,7 +192,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -201,18 +200,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), static_cast(q_tile_size), static_cast(block_size)}; - int t_sf = runtime->add_task(sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 
1 : 0; @@ -224,12 +223,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -246,7 +245,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; } diff --git a/examples/scripts/README.md b/examples/scripts/README.md index f8c29586c..ce5ac5c8c 100644 --- a/examples/scripts/README.md +++ b/examples/scripts/README.md @@ -47,7 +47,7 @@ python examples/scripts/run_example.py \ ### `run_example.py` Parameters | Argument | Short | Description | Default | -|----------|-------|-------------|---------| +| -------- | ----- | ----------- | ------- | | `--kernels` | `-k` | Kernels directory path (contains kernel_config.py) | **Required** | | `--golden` | `-g` | golden.py script path | **Required** | | `--platform` | `-p` | Platform name: `a2a3` or `a2a3sim` | `a2a3` | @@ -110,6 +110,7 @@ python examples/scripts/run_example.py -k ./kernels -g ./golden.py ### Priority Log level is determined by (highest to lowest priority): + 1. CLI arguments (`--log-level`, `--verbose`, `--silent`) 2. Environment variable (`PTO_LOG_LEVEL`) 3. 
Default value (`info` / INFO level) @@ -126,7 +127,7 @@ Log level is determined by (highest to lowest priority): The kernels directory must contain a `kernel_config.py` file: -``` +```text kernels/ ├── kernel_config.py # Required: kernel configuration ├── orchestration/ @@ -262,21 +263,30 @@ def generate_inputs(params: dict) -> dict: ## Orchestration Function Interface -The orchestration function's parameter order must match `TENSOR_ORDER`: +For `host_build_graph`, orchestration sources should include `orchestration_api.h` and use `ChipStorageTaskArgs`: ```cpp // Assume TENSOR_ORDER = ["a", "b", "f"] -int BuildExampleGraph(Runtime* runtime, uint64_t* args, int arg_count) { - // args layout: [ptr_a, ptr_b, ptr_f, size_a, size_b, size_f, count] - void* ptr_a = reinterpret_cast(args[0]); - void* ptr_b = reinterpret_cast(args[1]); - void* ptr_f = reinterpret_cast(args[2]); - uint64_t size_a = args[3]; - uint64_t size_b = args[4]; - uint64_t size_f = args[5]; - uint64_t count = args[6]; +#include "orchestration_api.h" + +int BuildExampleGraph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args) { + void* ptr_a = orch_args.tensor(0).data_as(); + void* ptr_b = orch_args.tensor(1).data_as(); + void* ptr_f = orch_args.tensor(2).data_as(); + + size_t size_a = orch_args.tensor(0).nbytes(); + size_t size_b = orch_args.tensor(1).nbytes(); + size_t size_f = orch_args.tensor(2).nbytes(); + + void* dev_a = device_malloc(runtime, size_a); + void* dev_b = device_malloc(runtime, size_b); + void* dev_f = device_malloc(runtime, size_f); + copy_to_device(runtime, dev_a, ptr_a, size_a); + copy_to_device(runtime, dev_b, ptr_b, size_b); + record_tensor_pair(runtime, ptr_f, dev_f, size_f); // Build task graph... + return 0; } ``` @@ -313,7 +323,7 @@ No special platform-specific environment variables required. 
### Directory Structure -``` +```text my_test/ ├── kernels/ │ ├── kernel_config.py @@ -342,7 +352,7 @@ python examples/scripts/run_example.py -k my_test/kernels -g my_test/golden.py - ### Success Example -``` +```text === Building Runtime: host_build_graph (platform: a2a3sim) === ... === Compiling and Registering Kernels === @@ -367,7 +377,7 @@ TEST PASSED ### Failure Example -``` +```text === Comparing Results === Comparing f: shape=(16384,), dtype=float32 First 10 actual: [40. 40. 40. 40. 40. 40. 40. 40. 40. 40.] @@ -394,10 +404,12 @@ python examples/scripts/run_example.py -k ... -g ... -p ... -v ### Q: Why "binary_data cannot be empty" error? This usually happens when: + - Using wrong platform (a2a3 vs a2a3sim) - Kernel compilation failed silently Solutions: + 1. Verify correct `-p` parameter is used 2. Check if kernel source files exist 3. Use `-v` to view detailed compilation logs diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 259e95f76..be87aa3a9 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -649,11 +649,8 @@ def _build_func_args(self, tensors: dict[str, torch.Tensor]) -> list: """ Build orch_args from tensors dict (legacy path). - Convention for orchestration function signature: - int BuildGraph(Runtime* runtime, uint64_t* args, int arg_count) - - Where args layout is: - [ptr_0, ptr_1, ..., ptr_n, nbytes_0, nbytes_1, ..., nbytes_n, count] + The resulting object is passed to orchestration entries with the shape: + int BuildGraph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args) Args: tensors: Dict of torch tensors (will be modified to ensure contiguous) diff --git a/src/a2a3/runtime/host_build_graph/build_config.py b/src/a2a3/runtime/host_build_graph/build_config.py index 3e29fd698..76e7face5 100644 --- a/src/a2a3/runtime/host_build_graph/build_config.py +++ b/src/a2a3/runtime/host_build_graph/build_config.py @@ -1,17 +1,17 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- # Runtime build configuration # All paths are relative to this file's directory (src/runtime/) BUILD_CONFIG = { - "aicore": { - "include_dirs": ["runtime"], - "source_dirs": ["aicore", "runtime"] - }, - "aicpu": { - "include_dirs": ["runtime"], - "source_dirs": ["aicpu", "runtime"] - }, - "host": { - "include_dirs": ["runtime"], - "source_dirs": ["host", "runtime"] - } + "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]}, + "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]}, + "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []}, } diff --git a/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md index 9db0f2657..75f3a4336 100644 --- a/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md +++ b/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md @@ -1,21 +1,25 @@ # Runtime Logic: host_build_graph ## Overview + The host_build_graph runtime builds a static task graph on the host, copies the Runtime object to device memory, and lets AICPU scheduler threads dispatch tasks to AICore via a per-core handshake. 
Dependencies are explicit edges created by orchestration code, so scheduling is a standard fanin/fanout ready-queue model. ## Core Data Structures + - `Runtime` owns the task table, handshake buffers, and host-side device APIs. See `src/runtime/host_build_graph/runtime/runtime.h`. - `Task` is a fixed-size record that stores `func_id`, argument array, `fanin`, `fanout`, `core_type`, and `function_bin_addr`. - `Handshake` is the shared per-core control block used by AICPU and AICore for dispatch and completion. - `HostApi` provides device memory ops used by host orchestration (`device_malloc`, `copy_to_device`, `upload_kernel_binary`, etc.). ## Build And Init Flow + 1. Python tooling compiles kernels and orchestration into shared objects. 2. `init_runtime_impl` loads the orchestration SO from bytes, resolves the entry symbol, and registers kernel binaries with the platform uploader. The resulting GM addresses are stored by `Runtime::set_function_bin_addr`. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. -3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `runtime->record_tensor_pair`, adds tasks via `runtime->add_task`, and adds dependency edges via `runtime->add_successor`. +3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `record_tensor_pair(runtime, ...)`, adds tasks via `add_task(runtime, ...)`, and adds dependency edges via `add_successor(runtime, ...)`. 4. The populated `Runtime` is copied to device memory by the platform layer. AICPU then runs the executor with this Runtime snapshot. ## Execution Flow (Device) + 1. `aicpu_executor.cpp` performs core discovery, handshake initialization, and ready-queue seeding using `Runtime::get_initial_ready_tasks`. 2. Scheduler threads maintain per-core and global ready queues. 
When a task is ready, the scheduler writes its pointer to the core's `Handshake` and sets `task_status=1`. 3. AICore reads the handshake, executes the kernel at `Task::function_bin_addr`, and writes `task_status=0` on completion. @@ -23,9 +27,11 @@ The host_build_graph runtime builds a static task graph on the host, copies the 5. The executor shuts down cores by setting `Handshake::control=1` after all tasks complete. ## Finalize And Cleanup + `validate_runtime_impl` copies all recorded output tensors back to the host and frees device allocations recorded in tensor pairs. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. ## Key Files + - `src/runtime/host_build_graph/runtime/runtime.h` - `src/runtime/host_build_graph/runtime/runtime.cpp` - `src/runtime/host_build_graph/host/runtime_maker.cpp` diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index d906215bb..3cd8ca839 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -36,21 +36,55 @@ #include #include -#include "callable.h" // NOLINT(build/include_subdir) -#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) - -/** - * Orchestration function signature. 
- * - * @param runtime Pointer to Runtime to populate with tasks - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, negative on error - */ -typedef int (*OrchestrationFunc)(Runtime *runtime, const ChipStorageTaskArgs &orch_args); +#include "callable.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) +#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) namespace { +struct OrchestrationRuntimeImpl { + const OrchestrationRuntimeOps *ops; + Runtime *runtime; +}; + +Runtime *unwrap_runtime(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->runtime; +} + +int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type); +} + +void runtime_add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { + unwrap_runtime(runtime)->add_successor(from_task, to_task); +} + +void runtime_record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + unwrap_runtime(runtime)->record_tensor_pair(host_ptr, dev_ptr, size); +} + +int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtime(runtime)->get_task_count(); } + +void runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); } + +void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) { + return unwrap_runtime(runtime)->host_api.device_malloc(size); +} + +void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) { + unwrap_runtime(runtime)->host_api.device_free(ptr); +} + +int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return unwrap_runtime(runtime)->host_api.copy_to_device(dev_ptr, host_ptr, size); +} 
+ +const OrchestrationRuntimeOps k_orchestration_runtime_ops = { + runtime_add_task, runtime_add_successor, runtime_record_tensor_pair, runtime_get_task_count, + runtime_print_runtime, runtime_device_malloc, runtime_device_free, runtime_copy_to_device, +}; + bool write_all_bytes(int fd, const uint8_t *data, size_t size) { size_t total_written = 0; while (total_written < size) { @@ -102,10 +136,10 @@ extern "C" { * This function loads the orchestration SO from binary data via a temp file, * resolves the orchestration function via dlsym, then calls it to build the * task graph. The orchestration function is responsible for: - * - Allocating device memory via runtime->host_api.device_malloc() - * - Copying data to device via runtime->host_api.copy_to_device() + * - Allocating device memory via device_malloc() + * - Copying data to device via copy_to_device() * - Building the task graph - * - Recording tensor pairs via runtime->record_tensor_pair() + * - Recording tensor pairs via record_tensor_pair() * * @param runtime Pointer to pre-constructed Runtime * @param callable ChipCallable containing orch binary, func_name, and child kernels @@ -181,9 +215,11 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip orch_args->tensor_count(), orch_args->scalar_count() ); + OrchestrationRuntimeImpl orchestration_runtime = {&k_orchestration_runtime_ops, runtime}; + // Call orchestration function to build task graph // The orchestration function handles device memory allocation and copy-to-device - int rc = orch_func(runtime, *orch_args); + int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); diff --git a/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h new file mode 100644 index 000000000..b5854d39a --- /dev/null +++ 
b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Orchestration API for host_build_graph. + * + * Orchestration sources include only this header and interact with the runtime + * through the function-pointer table embedded in OrchestrationRuntime. 
+ */ + +#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ +#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ + +#include +#include + +#include "common/core_type.h" // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) + +typedef struct OrchestrationRuntime OrchestrationRuntime; + +typedef struct OrchestrationRuntimeOps { + int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type); + void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task); + void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size); + int (*get_task_count)(OrchestrationRuntime *runtime); + void (*print_runtime)(OrchestrationRuntime *runtime); + + void *(*device_malloc)(OrchestrationRuntime *runtime, size_t size); + void (*device_free)(OrchestrationRuntime *runtime, void *ptr); + int (*copy_to_device)(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size); +} OrchestrationRuntimeOps; + +struct OrchestrationRuntime { + const OrchestrationRuntimeOps *ops; +}; + +static inline int +add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return runtime->ops->add_task(runtime, args, num_args, func_id, core_type); +} + +static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { + runtime->ops->add_successor(runtime, from_task, to_task); +} + +static inline void record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + runtime->ops->record_tensor_pair(runtime, host_ptr, dev_ptr, size); +} + +static inline int get_task_count(OrchestrationRuntime *runtime) { return runtime->ops->get_task_count(runtime); } + +static inline void print_runtime(OrchestrationRuntime *runtime) { runtime->ops->print_runtime(runtime); } + +static inline void 
*device_malloc(OrchestrationRuntime *runtime, size_t size) { + return runtime->ops->device_malloc(runtime, size); +} + +static inline void device_free(OrchestrationRuntime *runtime, void *ptr) { runtime->ops->device_free(runtime, ptr); } + +static inline int copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return runtime->ops->copy_to_device(runtime, dev_ptr, host_ptr, size); +} + +typedef int (*OrchestrationFunc)(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args); + +#endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ diff --git a/src/a5/runtime/host_build_graph/build_config.py b/src/a5/runtime/host_build_graph/build_config.py index 3e29fd698..76e7face5 100644 --- a/src/a5/runtime/host_build_graph/build_config.py +++ b/src/a5/runtime/host_build_graph/build_config.py @@ -1,17 +1,17 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- # Runtime build configuration # All paths are relative to this file's directory (src/runtime/) BUILD_CONFIG = { - "aicore": { - "include_dirs": ["runtime"], - "source_dirs": ["aicore", "runtime"] - }, - "aicpu": { - "include_dirs": ["runtime"], - "source_dirs": ["aicpu", "runtime"] - }, - "host": { - "include_dirs": ["runtime"], - "source_dirs": ["host", "runtime"] - } + "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]}, + "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]}, + "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []}, } diff --git a/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md b/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md index 9db0f2657..75f3a4336 100644 --- a/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md @@ -1,21 +1,25 @@ # Runtime Logic: host_build_graph ## Overview + The host_build_graph runtime builds a static task graph on the host, copies the Runtime object to device memory, and lets AICPU scheduler threads dispatch tasks to AICore via a per-core handshake. Dependencies are explicit edges created by orchestration code, so scheduling is a standard fanin/fanout ready-queue model. ## Core Data Structures + - `Runtime` owns the task table, handshake buffers, and host-side device APIs. See `src/runtime/host_build_graph/runtime/runtime.h`. - `Task` is a fixed-size record that stores `func_id`, argument array, `fanin`, `fanout`, `core_type`, and `function_bin_addr`. - `Handshake` is the shared per-core control block used by AICPU and AICore for dispatch and completion. 
- `HostApi` provides device memory ops used by host orchestration (`device_malloc`, `copy_to_device`, `upload_kernel_binary`, etc.). ## Build And Init Flow + 1. Python tooling compiles kernels and orchestration into shared objects. 2. `init_runtime_impl` loads the orchestration SO from bytes, resolves the entry symbol, and registers kernel binaries with the platform uploader. The resulting GM addresses are stored by `Runtime::set_function_bin_addr`. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. -3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `runtime->record_tensor_pair`, adds tasks via `runtime->add_task`, and adds dependency edges via `runtime->add_successor`. +3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `record_tensor_pair(runtime, ...)`, adds tasks via `add_task(runtime, ...)`, and adds dependency edges via `add_successor(runtime, ...)`. 4. The populated `Runtime` is copied to device memory by the platform layer. AICPU then runs the executor with this Runtime snapshot. ## Execution Flow (Device) + 1. `aicpu_executor.cpp` performs core discovery, handshake initialization, and ready-queue seeding using `Runtime::get_initial_ready_tasks`. 2. Scheduler threads maintain per-core and global ready queues. When a task is ready, the scheduler writes its pointer to the core's `Handshake` and sets `task_status=1`. 3. AICore reads the handshake, executes the kernel at `Task::function_bin_addr`, and writes `task_status=0` on completion. @@ -23,9 +27,11 @@ The host_build_graph runtime builds a static task graph on the host, copies the 5. The executor shuts down cores by setting `Handshake::control=1` after all tasks complete. 
## Finalize And Cleanup + `validate_runtime_impl` copies all recorded output tensors back to the host and frees device allocations recorded in tensor pairs. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. ## Key Files + - `src/runtime/host_build_graph/runtime/runtime.h` - `src/runtime/host_build_graph/runtime/runtime.cpp` - `src/runtime/host_build_graph/host/runtime_maker.cpp` diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index d906215bb..3cd8ca839 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -36,21 +36,55 @@ #include #include -#include "callable.h" // NOLINT(build/include_subdir) -#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) - -/** - * Orchestration function signature. - * - * @param runtime Pointer to Runtime to populate with tasks - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, negative on error - */ -typedef int (*OrchestrationFunc)(Runtime *runtime, const ChipStorageTaskArgs &orch_args); +#include "callable.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) +#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) namespace { +struct OrchestrationRuntimeImpl { + const OrchestrationRuntimeOps *ops; + Runtime *runtime; +}; + +Runtime *unwrap_runtime(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->runtime; +} + +int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type); +} + +void runtime_add_successor(OrchestrationRuntime *runtime, int 
from_task, int to_task) { + unwrap_runtime(runtime)->add_successor(from_task, to_task); +} + +void runtime_record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + unwrap_runtime(runtime)->record_tensor_pair(host_ptr, dev_ptr, size); +} + +int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtime(runtime)->get_task_count(); } + +void runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); } + +void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) { + return unwrap_runtime(runtime)->host_api.device_malloc(size); +} + +void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) { + unwrap_runtime(runtime)->host_api.device_free(ptr); +} + +int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return unwrap_runtime(runtime)->host_api.copy_to_device(dev_ptr, host_ptr, size); +} + +const OrchestrationRuntimeOps k_orchestration_runtime_ops = { + runtime_add_task, runtime_add_successor, runtime_record_tensor_pair, runtime_get_task_count, + runtime_print_runtime, runtime_device_malloc, runtime_device_free, runtime_copy_to_device, +}; + bool write_all_bytes(int fd, const uint8_t *data, size_t size) { size_t total_written = 0; while (total_written < size) { @@ -102,10 +136,10 @@ extern "C" { * This function loads the orchestration SO from binary data via a temp file, * resolves the orchestration function via dlsym, then calls it to build the * task graph. 
The orchestration function is responsible for: - * - Allocating device memory via runtime->host_api.device_malloc() - * - Copying data to device via runtime->host_api.copy_to_device() + * - Allocating device memory via device_malloc() + * - Copying data to device via copy_to_device() * - Building the task graph - * - Recording tensor pairs via runtime->record_tensor_pair() + * - Recording tensor pairs via record_tensor_pair() * * @param runtime Pointer to pre-constructed Runtime * @param callable ChipCallable containing orch binary, func_name, and child kernels @@ -181,9 +215,11 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip orch_args->tensor_count(), orch_args->scalar_count() ); + OrchestrationRuntimeImpl orchestration_runtime = {&k_orchestration_runtime_ops, runtime}; + // Call orchestration function to build task graph // The orchestration function handles device memory allocation and copy-to-device - int rc = orch_func(runtime, *orch_args); + int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); diff --git a/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h new file mode 100644 index 000000000..76c4066ba --- /dev/null +++ b/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Orchestration API for host_build_graph. + * + * Orchestration sources include only this header and interact with the runtime + * through the function-pointer table embedded in OrchestrationRuntime. + */ + +#ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ +#define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ + +#include +#include + +#include "common/core_type.h" // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) + +typedef struct OrchestrationRuntime OrchestrationRuntime; + +typedef struct OrchestrationRuntimeOps { + int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type); + void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task); + void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size); + int (*get_task_count)(OrchestrationRuntime *runtime); + void (*print_runtime)(OrchestrationRuntime *runtime); + + void *(*device_malloc)(OrchestrationRuntime *runtime, size_t size); + void (*device_free)(OrchestrationRuntime *runtime, void *ptr); + int (*copy_to_device)(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size); +} OrchestrationRuntimeOps; + +struct OrchestrationRuntime { + const OrchestrationRuntimeOps *ops; +}; + +static inline int +add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return runtime->ops->add_task(runtime, args, num_args, func_id, core_type); 
+} + +static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { + runtime->ops->add_successor(runtime, from_task, to_task); +} + +static inline void record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + runtime->ops->record_tensor_pair(runtime, host_ptr, dev_ptr, size); +} + +static inline int get_task_count(OrchestrationRuntime *runtime) { return runtime->ops->get_task_count(runtime); } + +static inline void print_runtime(OrchestrationRuntime *runtime) { runtime->ops->print_runtime(runtime); } + +static inline void *device_malloc(OrchestrationRuntime *runtime, size_t size) { + return runtime->ops->device_malloc(runtime, size); +} + +static inline void device_free(OrchestrationRuntime *runtime, void *ptr) { runtime->ops->device_free(runtime, ptr); } + +static inline int copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return runtime->ops->copy_to_device(runtime, dev_ptr, host_ptr, size); +} + +typedef int (*OrchestrationFunc)(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args); + +#endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 872af14c0..142921213 100644 --- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime 
*runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; return -1; @@ -85,20 +84,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -116,11 +115,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void 
**dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -134,9 +133,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -196,7 +195,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -204,18 +203,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), 
static_cast(q_tile_size), static_cast(block_size), static_cast(valid_len)}; - int t_sf = runtime->add_task(sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 1 : 0; @@ -227,12 +226,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -249,7 +248,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; } diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 872af14c0..142921213 100644 --- 
a/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; return -1; @@ -85,20 +84,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, 
dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -116,11 +115,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -134,9 +133,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -196,7 +195,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), 
reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -204,18 +203,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), static_cast(q_tile_size), static_cast(block_size), static_cast(valid_len)}; - int t_sf = runtime->add_task(sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 
1 : 0; @@ -227,12 +226,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -249,7 +248,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; }