hw-native-sys · ChaoWao · Apr 3, 2026 · Apr 3, 2026
diff --git a/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp
@@ -31,8 +31,7 @@
 #include <iostream>
 #include <vector>
 
-#include "runtime.h"    // NOLINT(build/include_subdir)
-#include "task_args.h"  // NOLINT(build/include_subdir)
+#include "orchestration_api.h"  // NOLINT(build/include_subdir)
 
 extern "C" {
 
@@ -44,7 +43,7 @@ constexpr int BATCH = 1;
 
 constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float);
 
-int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
+int build_bgemm_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
     // Expected orch_args: [A, B, C] — 3 tensors
     if (orch_args.tensor_count() < 3) {
         std::cerr << "build_bgemm_graph: Expected at least 3 tensors, got " << orch_args.tensor_count() << '\n';
@@ -62,38 +61,38 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
     std::cout << "Grid: " << GRID_M << " x " << GRID_K << " x " << GRID_N << '\n';
 
     // Allocate device memory and copy inputs
-    void *dev_A = runtime->host_api.device_malloc(size_A);
+    void *dev_A = device_malloc(runtime, size_A);
     if (!dev_A) return -1;
-    runtime->host_api.copy_to_device(dev_A, host_A, size_A);
+    copy_to_device(runtime, dev_A, host_A, size_A);
 
-    void *dev_B = runtime->host_api.device_malloc(size_B);
+    void *dev_B = device_malloc(runtime, size_B);
     if (!dev_B) {
-        runtime->host_api.device_free(dev_A);
+        device_free(runtime, dev_A);
         return -1;
     }
-    runtime->host_api.copy_to_device(dev_B, host_B, size_B);
+    copy_to_device(runtime, dev_B, host_B, size_B);
 
-    void *dev_C = runtime->host_api.device_malloc(size_C);
+    void *dev_C = device_malloc(runtime, size_C);
     if (!dev_C) {
-        runtime->host_api.device_free(dev_A);
-        runtime->host_api.device_free(dev_B);
+        device_free(runtime, dev_A);
+        device_free(runtime, dev_B);
         return -1;
     }
-    runtime->host_api.copy_to_device(dev_C, host_C, size_C);
-    runtime->record_tensor_pair(host_C, dev_C, size_C);
+    copy_to_device(runtime, dev_C, host_C, size_C);
+    record_tensor_pair(runtime, host_C, dev_C, size_C);
 
     // Allocate intermediate P buffers (one per C tile)
     constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N;
     std::vector<void *> dev_P(NUM_P_BUFFERS, nullptr);
     for (int i = 0; i < NUM_P_BUFFERS; i++) {
-        dev_P[i] = runtime->host_api.device_malloc(TILE_BYTES);
+        dev_P[i] = device_malloc(runtime, TILE_BYTES);
         if (!dev_P[i]) {
             for (int j = 0; j < i; j++) {
-                runtime->host_api.device_free(dev_P[j]);
+                device_free(runtime, dev_P[j]);
             }
-            runtime->host_api.device_free(dev_A);
-            runtime->host_api.device_free(dev_B);
-            runtime->host_api.device_free(dev_C);
+            device_free(runtime, dev_A);
+            device_free(runtime, dev_B);
+            device_free(runtime, dev_C);
             return -1;
         }
     }
@@ -121,7 +120,7 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
                     args_gemm[3] = TILE;
                     args_gemm[4] = TILE;
                     args_gemm[5] = TILE;
-                    int t_gemm = runtime->add_task(args_gemm, 6, 0, CoreType::AIC);
+                    int t_gemm = add_task(runtime, args_gemm, 6, 0, CoreType::AIC);
 
                     // Task 2: C[m,n] = C[m,n] + P (tile_add on Vector core)
                     uint64_t args_add[5];
@@ -130,22 +129,22 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
                     args_add[2] = reinterpret_cast<uint64_t>(static_cast<char *>(dev_C) + C_offset);
                     args_add[3] = TILE;
                     args_add[4] = TILE;
-                    int t_add = runtime->add_task(args_add, 5, 1, CoreType::AIV);
+                    int t_add = add_task(runtime, args_add, 5, 1, CoreType::AIV);
 
                     // Dependency: gemm must complete before add
-                    runtime->add_successor(t_gemm, t_add);
+                    add_successor(runtime, t_gemm, t_add);
 
                     // Dependency: previous add must complete before current gemm (K accumulation)
                     if (last_add_task[c_tile_idx] >= 0) {
-                        runtime->add_successor(last_add_task[c_tile_idx], t_gemm);
+                        add_successor(runtime, last_add_task[c_tile_idx], t_gemm);
                     }
                     last_add_task[c_tile_idx] = t_add;
                 }
             }
         }
     }
 
-    std::cout << "Created " << runtime->get_task_count() << " tasks\n";
+    std::cout << "Created " << get_task_count(runtime) << " tasks\n";
     return 0;
 }
 

diff --git a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md
@@ -1,49 +1,50 @@
 # InCore Orchestration Guide: host_build_graph
 
 ## Goal
-In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `Runtime::add_task`, and wires dependencies with `Runtime::add_successor`.
+
+In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `add_task(runtime, ...)`, and wires dependencies with `add_successor(runtime, ...)`.
 
 ## Where To Put Orchestration Code
+
 - Each example keeps orchestration sources under `examples/host_build_graph/<example>/kernels/orchestration/`.
 - `examples/host_build_graph/<example>/kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../example_orch.cpp", "function_name": "build_example_graph"}`.
 
 ## Function Signature
+
 Your orchestration entry must be `extern "C"` and match:
 
 ```cpp
-int build_graph(Runtime* runtime, uint64_t* args, int arg_count);
+int build_graph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args);
 ```
 
-`Runtime` is defined in `src/runtime/host_build_graph/runtime/runtime.h`.
+Include `orchestration_api.h`. Do not include `runtime.h` in orchestration sources.
 
 ## Argument Layout
-When you use the default `golden.py` tensor argument order (`TENSOR_ORDER`), the argument layout built by `examples/scripts/code_runner.py` is:
 
-```
-[ptr_0, ptr_1, ..., ptr_n, nbytes_0, nbytes_1, ..., nbytes_n, element_count]
-```
+`orch_args` is a `ChipStorageTaskArgs` that carries tensor arguments and scalar arguments separately.
 
-- Pointers are host pointers to CPU tensors.
-- Sizes are byte sizes for each tensor in `TENSOR_ORDER`.
-- `element_count` is the element count of the first tensor.
-
-If `golden.py` returns an explicit argument list, that list becomes `args` directly. Validate `arg_count` defensively in your orchestration.
+- Use `orch_args.tensor(i)` to read tensor metadata and host pointers.
+- Use `orch_args.scalar(i)` to read scalar values.
+- Validate `tensor_count()` / `scalar_count()` defensively in orchestration code.
 
 ## Building The Graph
+
 A typical host orchestration sequence is:
 
-1. Allocate device buffers with `runtime->host_api.device_malloc`.
-2. Copy inputs to device with `runtime->host_api.copy_to_device`.
-3. Record output buffers with `runtime->record_tensor_pair(host_ptr, dev_ptr, size)` so finalize can copy them back.
-4. Create tasks with `runtime->add_task(args, num_args, func_id, core_type)`.
-5. Add dependency edges with `runtime->add_successor(producer, consumer)`.
+1. Allocate device buffers with `device_malloc(runtime, size)`.
+2. Copy inputs to device with `copy_to_device(runtime, dev_ptr, host_ptr, size)`.
+3. Record output buffers with `record_tensor_pair(runtime, host_ptr, dev_ptr, size)` so finalize can copy them back.
+4. Create tasks with `add_task(runtime, args, num_args, func_id, core_type)`.
+5. Add dependency edges with `add_successor(runtime, producer, consumer)`.
 
 Example: see `examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp`.
 
 ## Kernel Mapping
+
 - `func_id` and `core_type` are defined in `kernels/kernel_config.py` under `KERNELS`.
 - The host uploads kernel binaries via `upload_kernel_binary` and stores addresses in `Runtime::func_id_to_addr_[]`. The platform layer resolves per-task `Task::function_bin_addr` from this map before copying to device.
 
 ## Debugging Tips
-- Use `runtime->print_runtime()` to dump the task graph.
+
+- Use `print_runtime(runtime)` to dump the task graph.
 - Fail fast on arg count or allocation errors to avoid undefined behavior.
diff --git a/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp
@@ -22,21 +22,20 @@
  *
  * This orchestration function:
  * 1. Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes)
- * 2. Allocates device memory via runtime->host_api
- * 3. Copies input data to device via runtime->host_api
+ * 2. Allocates device memory via orchestration API helpers
+ * 3. Copies input data to device via orchestration API helpers
  * 4. Records output tensor for copy-back during finalize
  * 5. Builds the task graph with 4 tasks (2 AIV + 2 AIC)
  */
 
 #include <cstdint>
 #include <iostream>
 
-#include "runtime.h"    // NOLINT(build/include_subdir)
-#include "task_args.h"  // NOLINT(build/include_subdir)
+#include "orchestration_api.h"  // NOLINT(build/include_subdir)
 
 extern "C" {
 
-int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
+int build_matmul_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
     // Validate argument count
     // Expected orch_args: [a, w1, w2, f] — 4 tensors
     if (orch_args.tensor_count() < 4) {
@@ -62,63 +61,63 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
     // Allocate device memory and copy inputs
     std::cout << "\n=== Allocating Device Memory ===" << '\n';
 
-    void *dev_a = runtime->host_api.device_malloc(size_a);
+    void *dev_a = device_malloc(runtime, size_a);
     if (!dev_a) {
         std::cerr << "Error: Failed to allocate device memory for A\n";
         return -1;
     }
-    runtime->host_api.copy_to_device(dev_a, host_a, size_a);
+    copy_to_device(runtime, dev_a, host_a, size_a);
     std::cout << "Tensor A: " << size_a << " bytes copied to device\n";
 
-    void *dev_w1 = runtime->host_api.device_malloc(size_w1);
+    void *dev_w1 = device_malloc(runtime, size_w1);
     if (!dev_w1) {
         std::cerr << "Error: Failed to allocate device memory for W1\n";
-        runtime->host_api.device_free(dev_a);
+        device_free(runtime, dev_a);
         return -1;
     }
-    runtime->host_api.copy_to_device(dev_w1, host_w1, size_w1);
+    copy_to_device(runtime, dev_w1, host_w1, size_w1);
     std::cout << "Tensor W1: " << size_w1 << " bytes copied to device\n";
 
-    void *dev_w2 = runtime->host_api.device_malloc(size_w2);
+    void *dev_w2 = device_malloc(runtime, size_w2);
     if (!dev_w2) {
         std::cerr << "Error: Failed to allocate device memory for W2\n";
-        runtime->host_api.device_free(dev_a);
-        runtime->host_api.device_free(dev_w1);
+        device_free(runtime, dev_a);
+        device_free(runtime, dev_w1);
         return -1;
     }
-    runtime->host_api.copy_to_device(dev_w2, host_w2, size_w2);
+    copy_to_device(runtime, dev_w2, host_w2, size_w2);
     std::cout << "Tensor W2: " << size_w2 << " bytes copied to device\n";
 
-    void *dev_f = runtime->host_api.device_malloc(size_f);
+    void *dev_f = device_malloc(runtime, size_f);
     if (!dev_f) {
         std::cerr << "Error: Failed to allocate device memory for F\n";
-        runtime->host_api.device_free(dev_a);
-        runtime->host_api.device_free(dev_w1);
-        runtime->host_api.device_free(dev_w2);
+        device_free(runtime, dev_a);
+        device_free(runtime, dev_w1);
+        device_free(runtime, dev_w2);
         return -1;
     }
     // Record output tensor for copy-back during finalize
-    runtime->record_tensor_pair(host_f, dev_f, size_f);
+    record_tensor_pair(runtime, host_f, dev_f, size_f);
     std::cout << "Tensor F (output): " << size_f << " bytes allocated\n";
 
     // Allocate intermediate tensors (b, c, d)
     // dev_b is half precision (output of log_sqrt kernel, input to matmul)
     // dev_c, dev_d are float precision (output of matmul kernels)
-    size_t BYTES_HALF = SIZE * sizeof(uint16_t);                 // half = 2 bytes
-    size_t BYTES_FLOAT = SIZE * sizeof(float);                   // float = 4 bytes
-    void *dev_b = runtime->host_api.device_malloc(BYTES_HALF);   // sqrt(log(A)) - half output
-    void *dev_c = runtime->host_api.device_malloc(BYTES_FLOAT);  // B @ W1 - float output
-    void *dev_d = runtime->host_api.device_malloc(BYTES_FLOAT);  // B @ W2 - float output
+    size_t BYTES_HALF = SIZE * sizeof(uint16_t);        // half = 2 bytes
+    size_t BYTES_FLOAT = SIZE * sizeof(float);          // float = 4 bytes
+    void *dev_b = device_malloc(runtime, BYTES_HALF);   // sqrt(log(A)) - half output
+    void *dev_c = device_malloc(runtime, BYTES_FLOAT);  // B @ W1 - float output
+    void *dev_d = device_malloc(runtime, BYTES_FLOAT);  // B @ W2 - float output
 
     if (!dev_b || !dev_c || !dev_d) {
         std::cerr << "Error: Failed to allocate intermediate tensors\n";
-        runtime->host_api.device_free(dev_a);
-        runtime->host_api.device_free(dev_w1);
-        runtime->host_api.device_free(dev_w2);
-        runtime->host_api.device_free(dev_f);
-        if (dev_b) runtime->host_api.device_free(dev_b);
-        if (dev_c) runtime->host_api.device_free(dev_c);
-        if (dev_d) runtime->host_api.device_free(dev_d);
+        device_free(runtime, dev_a);
+        device_free(runtime, dev_w1);
+        device_free(runtime, dev_w2);
+        device_free(runtime, dev_f);
+        if (dev_b) device_free(runtime, dev_b);
+        if (dev_c) device_free(runtime, dev_c);
+        if (dev_d) device_free(runtime, dev_d);
         return -1;
     }
 
@@ -130,37 +129,37 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
     args_t0[0] = reinterpret_cast<uint64_t>(dev_a);  // src
     args_t0[1] = reinterpret_cast<uint64_t>(dev_b);  // out
     args_t0[2] = SIZE;                               // size
-    int t0 = runtime->add_task(args_t0, 3, 0, CoreType::AIV);
+    int t0 = add_task(runtime, args_t0, 3, 0, CoreType::AIV);
 
     // Task 1: C = B @ W1 (func_id=1: kernel_matmul, AIC)
     uint64_t args_t1[4];
     args_t1[0] = reinterpret_cast<uint64_t>(dev_b);   // src0 (left matrix)
     args_t1[1] = reinterpret_cast<uint64_t>(dev_w1);  // src1 (right matrix)
     args_t1[2] = reinterpret_cast<uint64_t>(dev_c);   // out
     args_t1[3] = SIZE;                                // size
-    int t1 = runtime->add_task(args_t1, 4, 1, CoreType::AIC);
+    int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIC);
 
     // Task 2: D = B @ W2 (func_id=1: kernel_matmul, AIC)
     uint64_t args_t2[4];
     args_t2[0] = reinterpret_cast<uint64_t>(dev_b);   // src0 (left matrix)
     args_t2[1] = reinterpret_cast<uint64_t>(dev_w2);  // src1 (right matrix)
     args_t2[2] = reinterpret_cast<uint64_t>(dev_d);   // out
     args_t2[3] = SIZE;                                // size
-    int t2 = runtime->add_task(args_t2, 4, 1, CoreType::AIC);
+    int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIC);
 
     // Task 3: F = exp(C + D) (func_id=2: kernel_add_exp, AIV)
     uint64_t args_t3[4];
     args_t3[0] = reinterpret_cast<uint64_t>(dev_c);  // src0
     args_t3[1] = reinterpret_cast<uint64_t>(dev_d);  // src1
     args_t3[2] = reinterpret_cast<uint64_t>(dev_f);  // out
     args_t3[3] = SIZE;                               // size
-    int t3 = runtime->add_task(args_t3, 4, 2, CoreType::AIV);
+    int t3 = add_task(runtime, args_t3, 4, 2, CoreType::AIV);
 
     // Add dependencies (diamond: t0→t1→t3, t0→t2→t3)
-    runtime->add_successor(t0, t1);  // t0 → t1
-    runtime->add_successor(t0, t2);  // t0 → t2
-    runtime->add_successor(t1, t3);  // t1 → t3
-    runtime->add_successor(t2, t3);  // t2 → t3
+    add_successor(runtime, t0, t1);  // t0 → t1
+    add_successor(runtime, t0, t2);  // t0 → t2
+    add_successor(runtime, t1, t3);  // t1 → t3
+    add_successor(runtime, t2, t3);  // t2 → t3
 
     std::cout << "\nTasks:\n";
     std::cout << "  task" << t0 << ": B = sqrt(log(A))   [AIV]\n";
@@ -169,8 +168,8 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
     std::cout << "  task" << t3 << ": F = exp(C + D)     [AIV]\n";
     std::cout << "Dependencies: t0→t1→t3, t0→t2→t3 (diamond)\n";
 
-    std::cout << "Created runtime with " << runtime->get_task_count() << " tasks\n";
-    runtime->print_runtime();
+    std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n";
+    print_runtime(runtime);
 
     return 0;
 }