From ae1544f30a02152d0cd4dc0e0bd8ce5f8c739fb8 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Fri, 3 Apr 2026 19:30:22 +0800 Subject: [PATCH] Add: OrchestrationRuntime vtable for host_build_graph, decouple orch from runtime.h Introduce an opaque OrchestrationRuntime + function-pointer table (orchestration_api.h) so host_build_graph orchestration SOs no longer include runtime.h. Orchestration sources include only orchestration_api.h and interact with the runtime through free-function wrappers. - orchestration_api.h (a2a3 + a5): C-compatible vtable with inline helpers add_task, add_successor, record_tensor_pair, device_malloc, device_free, copy_to_device, get_task_count, print_runtime - runtime_maker.cpp: wraps Runtime* in OrchestrationRuntimeImpl, calls orch function as OrchestrationFunc(OrchestrationRuntime*, orch_args); SO loading delegated to platform via extern-C load_orch_so (sim: mkstemp, onboard: getpid path) implemented in pto_runtime_c_api.cpp - build_config.py (a2a3 + a5): add orchestration compile target with include_dirs so kernel_compiler resolves orchestration_api.h - All host_build_graph example orchestration sources (bgemm, matmul, paged_attention, vector_example for a2a3 + a5) migrated to new API - tests/st host_build_graph paged_attention orchestration migrated - Docs updated (INCORE_ORCHESTRATION_GUIDE.md, RUNTIME_LOGIC.md, scripts/README.md, code_runner.py) Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../kernels/orchestration/bgemm_orch.cpp | 45 +++++------ .../docs/INCORE_ORCHESTRATION_GUIDE.md | 37 ++++----- .../kernels/orchestration/matmul_orch.cpp | 81 +++++++++---------- .../orchestration/paged_attention_orch.cpp | 55 +++++++------ .../kernels/orchestration/example_orch.cpp | 65 ++++++++------- .../orchestration/paged_attention_orch.cpp | 55 +++++++------ examples/scripts/README.md | 42 ++++++---- examples/scripts/code_runner.py | 7 +- 
.../runtime/host_build_graph/build_config.py | 24 +++--- .../host_build_graph/docs/RUNTIME_LOGIC.md | 8 +- .../host_build_graph/host/runtime_maker.cpp | 68 ++++++++++++---- .../orchestration/orchestration_api.h | 74 +++++++++++++++++ .../runtime/host_build_graph/build_config.py | 24 +++--- .../host_build_graph/docs/RUNTIME_LOGIC.md | 8 +- .../host_build_graph/host/runtime_maker.cpp | 68 ++++++++++++---- .../orchestration/orchestration_api.h | 74 +++++++++++++++++ .../orchestration/paged_attention_orch.cpp | 55 +++++++------ .../orchestration/paged_attention_orch.cpp | 55 +++++++------ 18 files changed, 540 insertions(+), 305 deletions(-) create mode 100644 src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h create mode 100644 src/a5/runtime/host_build_graph/orchestration/orchestration_api.h diff --git a/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp index fea12a74c..927a69e8a 100644 --- a/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -31,8 +31,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) extern "C" { @@ -44,7 +43,7 @@ constexpr int BATCH = 1; constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float); -int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_bgemm_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { // Expected orch_args: [A, B, C] — 3 tensors if (orch_args.tensor_count() < 3) { std::cerr << "build_bgemm_graph: Expected at least 3 tensors, got " << orch_args.tensor_count() << '\n'; @@ -62,38 +61,38 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { std::cout << "Grid: " << GRID_M << " x 
" << GRID_K << " x " << GRID_N << '\n'; // Allocate device memory and copy inputs - void *dev_A = runtime->host_api.device_malloc(size_A); + void *dev_A = device_malloc(runtime, size_A); if (!dev_A) return -1; - runtime->host_api.copy_to_device(dev_A, host_A, size_A); + copy_to_device(runtime, dev_A, host_A, size_A); - void *dev_B = runtime->host_api.device_malloc(size_B); + void *dev_B = device_malloc(runtime, size_B); if (!dev_B) { - runtime->host_api.device_free(dev_A); + device_free(runtime, dev_A); return -1; } - runtime->host_api.copy_to_device(dev_B, host_B, size_B); + copy_to_device(runtime, dev_B, host_B, size_B); - void *dev_C = runtime->host_api.device_malloc(size_C); + void *dev_C = device_malloc(runtime, size_C); if (!dev_C) { - runtime->host_api.device_free(dev_A); - runtime->host_api.device_free(dev_B); + device_free(runtime, dev_A); + device_free(runtime, dev_B); return -1; } - runtime->host_api.copy_to_device(dev_C, host_C, size_C); - runtime->record_tensor_pair(host_C, dev_C, size_C); + copy_to_device(runtime, dev_C, host_C, size_C); + record_tensor_pair(runtime, host_C, dev_C, size_C); // Allocate intermediate P buffers (one per C tile) constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N; std::vector dev_P(NUM_P_BUFFERS, nullptr); for (int i = 0; i < NUM_P_BUFFERS; i++) { - dev_P[i] = runtime->host_api.device_malloc(TILE_BYTES); + dev_P[i] = device_malloc(runtime, TILE_BYTES); if (!dev_P[i]) { for (int j = 0; j < i; j++) { - runtime->host_api.device_free(dev_P[j]); + device_free(runtime, dev_P[j]); } - runtime->host_api.device_free(dev_A); - runtime->host_api.device_free(dev_B); - runtime->host_api.device_free(dev_C); + device_free(runtime, dev_A); + device_free(runtime, dev_B); + device_free(runtime, dev_C); return -1; } } @@ -121,7 +120,7 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_gemm[3] = TILE; args_gemm[4] = TILE; args_gemm[5] = TILE; - int t_gemm = runtime->add_task(args_gemm, 6, 0, 
CoreType::AIC); + int t_gemm = add_task(runtime, args_gemm, 6, 0, CoreType::AIC); // Task 2: C[m,n] = C[m,n] + P (tile_add on Vector core) uint64_t args_add[5]; @@ -130,14 +129,14 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_add[2] = reinterpret_cast(static_cast(dev_C) + C_offset); args_add[3] = TILE; args_add[4] = TILE; - int t_add = runtime->add_task(args_add, 5, 1, CoreType::AIV); + int t_add = add_task(runtime, args_add, 5, 1, CoreType::AIV); // Dependency: gemm must complete before add - runtime->add_successor(t_gemm, t_add); + add_successor(runtime, t_gemm, t_add); // Dependency: previous add must complete before current gemm (K accumulation) if (last_add_task[c_tile_idx] >= 0) { - runtime->add_successor(last_add_task[c_tile_idx], t_gemm); + add_successor(runtime, last_add_task[c_tile_idx], t_gemm); } last_add_task[c_tile_idx] = t_add; } @@ -145,7 +144,7 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { } } - std::cout << "Created " << runtime->get_task_count() << " tasks\n"; + std::cout << "Created " << get_task_count(runtime) << " tasks\n"; return 0; } diff --git a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md index 733d31d20..fc632cc7b 100644 --- a/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -1,49 +1,50 @@ # InCore Orchestration Guide: host_build_graph ## Goal -In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `Runtime::add_task`, and wires dependencies with `Runtime::add_successor`. + +In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `add_task(runtime, ...)`, and wires dependencies with `add_successor(runtime, ...)`. 
## Where To Put Orchestration Code + - Each example keeps orchestration sources under `examples/host_build_graph//kernels/orchestration/`. - `examples/host_build_graph//kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../example_orch.cpp", "function_name": "build_example_graph"}`. ## Function Signature + Your orchestration entry must be `extern "C"` and match: ```cpp -int build_graph(Runtime* runtime, uint64_t* args, int arg_count); +int build_graph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args); ``` -`Runtime` is defined in `src/runtime/host_build_graph/runtime/runtime.h`. +Include `orchestration_api.h`. Do not include `runtime.h` in orchestration sources. ## Argument Layout -When you use the default `golden.py` tensor argument order (`TENSOR_ORDER`), the argument layout built by `examples/scripts/code_runner.py` is: -``` -[ptr_0, ptr_1, ..., ptr_n, nbytes_0, nbytes_1, ..., nbytes_n, element_count] -``` +`orch_args` contains separated tensor and scalar arguments through `ChipStorageTaskArgs`. -- Pointers are host pointers to CPU tensors. -- Sizes are byte sizes for each tensor in `TENSOR_ORDER`. -- `element_count` is the element count of the first tensor. - -If `golden.py` returns an explicit argument list, that list becomes `args` directly. Validate `arg_count` defensively in your orchestration. +- Use `orch_args.tensor(i)` to read tensor metadata and host pointers +- Use `orch_args.scalar(i)` to read scalar values +- Validate `tensor_count()` / `scalar_count()` defensively in orchestration code ## Building The Graph + A typical host orchestration sequence is: -1. Allocate device buffers with `runtime->host_api.device_malloc`. -2. Copy inputs to device with `runtime->host_api.copy_to_device`. -3. Record output buffers with `runtime->record_tensor_pair(host_ptr, dev_ptr, size)` so finalize can copy them back. -4. Create tasks with `runtime->add_task(args, num_args, func_id, core_type)`. 
-5. Add dependency edges with `runtime->add_successor(producer, consumer)`. +1. Allocate device buffers with `device_malloc(runtime, size)`. +2. Copy inputs to device with `copy_to_device(runtime, dev_ptr, host_ptr, size)`. +3. Record output buffers with `record_tensor_pair(runtime, host_ptr, dev_ptr, size)` so finalize can copy them back. +4. Create tasks with `add_task(runtime, args, num_args, func_id, core_type)`. +5. Add dependency edges with `add_successor(runtime, producer, consumer)`. Example: see `examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp`. ## Kernel Mapping + - `func_id` and `core_type` are defined in `kernels/kernel_config.py` under `KERNELS`. - The host uploads kernel binaries via `upload_kernel_binary` and stores addresses in `Runtime::func_id_to_addr_[]`. The platform layer resolves per-task `Task::function_bin_addr` from this map before copying to device. ## Debugging Tips -- Use `runtime->print_runtime()` to dump the task graph. + +- Use `print_runtime(runtime)` to dump the task graph. - Fail fast on arg count or allocation errors to avoid undefined behavior. diff --git a/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp index 1d5a130fd..96e8e39be 100644 --- a/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp +++ b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp @@ -22,8 +22,8 @@ * * This orchestration function: * 1. Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes) - * 2. Allocates device memory via runtime->host_api - * 3. Copies input data to device via runtime->host_api + * 2. Allocates device memory via orchestration API helpers + * 3. Copies input data to device via orchestration API helpers * 4. Records output tensor for copy-back during finalize * 5. 
Builds the task graph with 4 tasks (2 AIV + 2 AIC) */ @@ -31,12 +31,11 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) extern "C" { -int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_matmul_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { // Validate argument count // Expected orch_args: [a, w1, w2, f] — 4 tensors if (orch_args.tensor_count() < 4) { @@ -62,63 +61,63 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { // Allocate device memory and copy inputs std::cout << "\n=== Allocating Device Memory ===" << '\n'; - void *dev_a = runtime->host_api.device_malloc(size_a); + void *dev_a = device_malloc(runtime, size_a); if (!dev_a) { std::cerr << "Error: Failed to allocate device memory for A\n"; return -1; } - runtime->host_api.copy_to_device(dev_a, host_a, size_a); + copy_to_device(runtime, dev_a, host_a, size_a); std::cout << "Tensor A: " << size_a << " bytes copied to device\n"; - void *dev_w1 = runtime->host_api.device_malloc(size_w1); + void *dev_w1 = device_malloc(runtime, size_w1); if (!dev_w1) { std::cerr << "Error: Failed to allocate device memory for W1\n"; - runtime->host_api.device_free(dev_a); + device_free(runtime, dev_a); return -1; } - runtime->host_api.copy_to_device(dev_w1, host_w1, size_w1); + copy_to_device(runtime, dev_w1, host_w1, size_w1); std::cout << "Tensor W1: " << size_w1 << " bytes copied to device\n"; - void *dev_w2 = runtime->host_api.device_malloc(size_w2); + void *dev_w2 = device_malloc(runtime, size_w2); if (!dev_w2) { std::cerr << "Error: Failed to allocate device memory for W2\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_w1); + device_free(runtime, dev_a); + device_free(runtime, dev_w1); return -1; } - runtime->host_api.copy_to_device(dev_w2, host_w2, 
size_w2); + copy_to_device(runtime, dev_w2, host_w2, size_w2); std::cout << "Tensor W2: " << size_w2 << " bytes copied to device\n"; - void *dev_f = runtime->host_api.device_malloc(size_f); + void *dev_f = device_malloc(runtime, size_f); if (!dev_f) { std::cerr << "Error: Failed to allocate device memory for F\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_w1); - runtime->host_api.device_free(dev_w2); + device_free(runtime, dev_a); + device_free(runtime, dev_w1); + device_free(runtime, dev_w2); return -1; } // Record output tensor for copy-back during finalize - runtime->record_tensor_pair(host_f, dev_f, size_f); + record_tensor_pair(runtime, host_f, dev_f, size_f); std::cout << "Tensor F (output): " << size_f << " bytes allocated\n"; // Allocate intermediate tensors (b, c, d) // dev_b is half precision (output of log_sqrt kernel, input to matmul) // dev_c, dev_d are float precision (output of matmul kernels) - size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes - size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes - void *dev_b = runtime->host_api.device_malloc(BYTES_HALF); // sqrt(log(A)) - half output - void *dev_c = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W1 - float output - void *dev_d = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W2 - float output + size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes + size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes + void *dev_b = device_malloc(runtime, BYTES_HALF); // sqrt(log(A)) - half output + void *dev_c = device_malloc(runtime, BYTES_FLOAT); // B @ W1 - float output + void *dev_d = device_malloc(runtime, BYTES_FLOAT); // B @ W2 - float output if (!dev_b || !dev_c || !dev_d) { std::cerr << "Error: Failed to allocate intermediate tensors\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_w1); - runtime->host_api.device_free(dev_w2); - runtime->host_api.device_free(dev_f); - if (dev_b) 
runtime->host_api.device_free(dev_b); - if (dev_c) runtime->host_api.device_free(dev_c); - if (dev_d) runtime->host_api.device_free(dev_d); + device_free(runtime, dev_a); + device_free(runtime, dev_w1); + device_free(runtime, dev_w2); + device_free(runtime, dev_f); + if (dev_b) device_free(runtime, dev_b); + if (dev_c) device_free(runtime, dev_c); + if (dev_d) device_free(runtime, dev_d); return -1; } @@ -130,7 +129,7 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t0[0] = reinterpret_cast(dev_a); // src args_t0[1] = reinterpret_cast(dev_b); // out args_t0[2] = SIZE; // size - int t0 = runtime->add_task(args_t0, 3, 0, CoreType::AIV); + int t0 = add_task(runtime, args_t0, 3, 0, CoreType::AIV); // Task 1: C = B @ W1 (func_id=1: kernel_matmul, AIC) uint64_t args_t1[4]; @@ -138,7 +137,7 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t1[1] = reinterpret_cast(dev_w1); // src1 (right matrix) args_t1[2] = reinterpret_cast(dev_c); // out args_t1[3] = SIZE; // size - int t1 = runtime->add_task(args_t1, 4, 1, CoreType::AIC); + int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIC); // Task 2: D = B @ W2 (func_id=1: kernel_matmul, AIC) uint64_t args_t2[4]; @@ -146,7 +145,7 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t2[1] = reinterpret_cast(dev_w2); // src1 (right matrix) args_t2[2] = reinterpret_cast(dev_d); // out args_t2[3] = SIZE; // size - int t2 = runtime->add_task(args_t2, 4, 1, CoreType::AIC); + int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIC); // Task 3: F = exp(C + D) (func_id=2: kernel_add_exp, AIV) uint64_t args_t3[4]; @@ -154,13 +153,13 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { args_t3[1] = reinterpret_cast(dev_d); // src1 args_t3[2] = reinterpret_cast(dev_f); // out args_t3[3] = SIZE; // size - int t3 = runtime->add_task(args_t3, 4, 2, CoreType::AIV); + int t3 = add_task(runtime, 
args_t3, 4, 2, CoreType::AIV); // Add dependencies (diamond: t0→t1→t3, t0→t2→t3) - runtime->add_successor(t0, t1); // t0 → t1 - runtime->add_successor(t0, t2); // t0 → t2 - runtime->add_successor(t1, t3); // t1 → t3 - runtime->add_successor(t2, t3); // t2 → t3 + add_successor(runtime, t0, t1); // t0 → t1 + add_successor(runtime, t0, t2); // t0 → t2 + add_successor(runtime, t1, t3); // t1 → t3 + add_successor(runtime, t2, t3); // t2 → t3 std::cout << "\nTasks:\n"; std::cout << " task" << t0 << ": B = sqrt(log(A)) [AIV]\n"; @@ -169,8 +168,8 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { std::cout << " task" << t3 << ": F = exp(C + D) [AIV]\n"; std::cout << "Dependencies: t0→t1→t3, t0→t2→t3 (diamond)\n"; - std::cout << "Created runtime with " << runtime->get_task_count() << " tasks\n"; - runtime->print_runtime(); + std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n"; + print_runtime(runtime); return 0; } diff --git a/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 017a0dc5a..17dbd02ce 100644 --- a/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << 
orch_args.tensor_count() << '\n'; return -1; @@ -83,20 +82,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -114,11 +113,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = 
runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -132,9 +131,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -193,7 +192,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -201,18 +200,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), static_cast(q_tile_size), static_cast(block_size)}; - int t_sf = runtime->add_task(sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: 
pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 1 : 0; @@ -224,12 +223,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -246,7 +245,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; } diff --git a/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp index 84173028f..82555d9ef 100644 --- a/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp +++ b/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp @@ -15,20 +15,19 @@ * * This orchestration function: * 1. 
Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes) - * 2. Allocates device memory via runtime->host_api - * 3. Copies input data to device via runtime->host_api + * 2. Allocates device memory via orchestration API helpers + * 3. Copies input data to device via orchestration API helpers * 4. Records output tensor for copy-back during finalize * 5. Builds the task graph */ #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) extern "C" { -int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_example_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { // Validate argument count // Expected orch_args: [a, b, f] — 3 tensors if (orch_args.tensor_count() < 3) { @@ -52,48 +51,48 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) // Allocate device memory and copy inputs std::cout << "\n=== Allocating Device Memory ===" << '\n'; - void *dev_a = runtime->host_api.device_malloc(size_a); + void *dev_a = device_malloc(runtime, size_a); if (!dev_a) { std::cerr << "Error: Failed to allocate device memory for a\n"; return -1; } - runtime->host_api.copy_to_device(dev_a, host_a, size_a); + copy_to_device(runtime, dev_a, host_a, size_a); std::cout << "Tensor a: " << size_a << " bytes copied to device\n"; - void *dev_b = runtime->host_api.device_malloc(size_b); + void *dev_b = device_malloc(runtime, size_b); if (!dev_b) { std::cerr << "Error: Failed to allocate device memory for b\n"; - runtime->host_api.device_free(dev_a); + device_free(runtime, dev_a); return -1; } - runtime->host_api.copy_to_device(dev_b, host_b, size_b); + copy_to_device(runtime, dev_b, host_b, size_b); std::cout << "Tensor b: " << size_b << " bytes copied to device\n"; - void *dev_f = runtime->host_api.device_malloc(size_f); + void *dev_f = device_malloc(runtime, 
size_f); if (!dev_f) { std::cerr << "Error: Failed to allocate device memory for f\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_b); + device_free(runtime, dev_a); + device_free(runtime, dev_b); return -1; } // Record output tensor for copy-back during finalize - runtime->record_tensor_pair(host_f, dev_f, size_f); + record_tensor_pair(runtime, host_f, dev_f, size_f); std::cout << "Tensor f (output): " << size_f << " bytes allocated\n"; // Allocate intermediate tensors (c, d, e) size_t BYTES = SIZE * sizeof(float); - void *dev_c = runtime->host_api.device_malloc(BYTES); - void *dev_d = runtime->host_api.device_malloc(BYTES); - void *dev_e = runtime->host_api.device_malloc(BYTES); + void *dev_c = device_malloc(runtime, BYTES); + void *dev_d = device_malloc(runtime, BYTES); + void *dev_e = device_malloc(runtime, BYTES); if (!dev_c || !dev_d || !dev_e) { std::cerr << "Error: Failed to allocate intermediate tensors\n"; - runtime->host_api.device_free(dev_a); - runtime->host_api.device_free(dev_b); - runtime->host_api.device_free(dev_f); - if (dev_c) runtime->host_api.device_free(dev_c); - if (dev_d) runtime->host_api.device_free(dev_d); - if (dev_e) runtime->host_api.device_free(dev_e); + device_free(runtime, dev_a); + device_free(runtime, dev_b); + device_free(runtime, dev_f); + if (dev_c) device_free(runtime, dev_c); + if (dev_d) device_free(runtime, dev_d); + if (dev_e) device_free(runtime, dev_e); return -1; } @@ -111,7 +110,7 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t0[1] = reinterpret_cast(dev_b); // src1 args_t0[2] = reinterpret_cast(dev_c); // out args_t0[3] = SIZE; // size - int t0 = runtime->add_task(args_t0, 4, 0, CoreType::AIV); + int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV); // Task 1: d = c + 1 (func_id=1: kernel_add_scalar, AIV) uint64_t args_t1[4]; @@ -120,7 +119,7 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t1[1] = 
scalar_converter.u64; // scalar=1.0 args_t1[2] = reinterpret_cast(dev_d); // out args_t1[3] = SIZE; // size - int t1 = runtime->add_task(args_t1, 4, 1, CoreType::AIV); + int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIV); // Task 2: e = c + 2 (func_id=1: kernel_add_scalar, AIV) uint64_t args_t2[4]; @@ -129,7 +128,7 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t2[1] = scalar_converter.u64; // scalar=2.0 args_t2[2] = reinterpret_cast(dev_e); // out args_t2[3] = SIZE; // size - int t2 = runtime->add_task(args_t2, 4, 1, CoreType::AIV); + int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIV); // Task 3: f = d * e (func_id=2: kernel_mul, AIV) uint64_t args_t3[4]; @@ -137,13 +136,13 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) args_t3[1] = reinterpret_cast(dev_e); // src1 args_t3[2] = reinterpret_cast(dev_f); // out args_t3[3] = SIZE; // size - int t3 = runtime->add_task(args_t3, 4, 2, CoreType::AIV); + int t3 = add_task(runtime, args_t3, 4, 2, CoreType::AIV); // Add dependencies - runtime->add_successor(t0, t1); // t0 → t1 - runtime->add_successor(t0, t2); // t0 → t2 - runtime->add_successor(t1, t3); // t1 → t3 - runtime->add_successor(t2, t3); // t2 → t3 + add_successor(runtime, t0, t1); // t0 → t1 + add_successor(runtime, t0, t2); // t0 → t2 + add_successor(runtime, t1, t3); // t1 → t3 + add_successor(runtime, t2, t3); // t2 → t3 std::cout << "\nTasks:\n"; std::cout << " task" << t0 << ": c = a + b\n"; @@ -152,8 +151,8 @@ int build_example_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) std::cout << " task" << t3 << ": f = d * e\n"; std::cout << "Dependencies: t0→t1, t0→t2, t1→t3, t2→t3\n"; - std::cout << "Created runtime with " << runtime->get_task_count() << " tasks\n"; - runtime->print_runtime(); + std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n"; + print_runtime(runtime); return 0; } diff --git 
a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 017a0dc5a..17dbd02ce 100644 --- a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; return -1; @@ -83,20 +82,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, 
query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -114,11 +113,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -132,9 +131,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, 
oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -193,7 +192,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -201,18 +200,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), static_cast(q_tile_size), static_cast(block_size)}; - int t_sf = runtime->add_task(sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 
1 : 0; @@ -224,12 +223,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -246,7 +245,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; } diff --git a/examples/scripts/README.md b/examples/scripts/README.md index f8c29586c..ce5ac5c8c 100644 --- a/examples/scripts/README.md +++ b/examples/scripts/README.md @@ -47,7 +47,7 @@ python examples/scripts/run_example.py \ ### `run_example.py` Parameters | Argument | Short | Description | Default | -|----------|-------|-------------|---------| +| -------- | ----- | ----------- | ------- | | `--kernels` | `-k` | Kernels directory path (contains kernel_config.py) | **Required** | | `--golden` | `-g` | golden.py script path | **Required** | | `--platform` | `-p` | Platform name: `a2a3` or `a2a3sim` | `a2a3` | @@ -110,6 +110,7 @@ python examples/scripts/run_example.py -k ./kernels -g ./golden.py ### Priority Log level is determined by (highest to lowest priority): + 1. CLI arguments (`--log-level`, `--verbose`, `--silent`) 2. Environment variable (`PTO_LOG_LEVEL`) 3. 
Default value (`info` / INFO level) @@ -126,7 +127,7 @@ Log level is determined by (highest to lowest priority): The kernels directory must contain a `kernel_config.py` file: -``` +```text kernels/ ├── kernel_config.py # Required: kernel configuration ├── orchestration/ @@ -262,21 +263,30 @@ def generate_inputs(params: dict) -> dict: ## Orchestration Function Interface -The orchestration function's parameter order must match `TENSOR_ORDER`: +For `host_build_graph`, orchestration sources should include `orchestration_api.h` and use `ChipStorageTaskArgs`: ```cpp // Assume TENSOR_ORDER = ["a", "b", "f"] -int BuildExampleGraph(Runtime* runtime, uint64_t* args, int arg_count) { - // args layout: [ptr_a, ptr_b, ptr_f, size_a, size_b, size_f, count] - void* ptr_a = reinterpret_cast(args[0]); - void* ptr_b = reinterpret_cast(args[1]); - void* ptr_f = reinterpret_cast(args[2]); - uint64_t size_a = args[3]; - uint64_t size_b = args[4]; - uint64_t size_f = args[5]; - uint64_t count = args[6]; +#include "orchestration_api.h" + +int BuildExampleGraph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args) { + void* ptr_a = orch_args.tensor(0).data_as(); + void* ptr_b = orch_args.tensor(1).data_as(); + void* ptr_f = orch_args.tensor(2).data_as(); + + size_t size_a = orch_args.tensor(0).nbytes(); + size_t size_b = orch_args.tensor(1).nbytes(); + size_t size_f = orch_args.tensor(2).nbytes(); + + void* dev_a = device_malloc(runtime, size_a); + void* dev_b = device_malloc(runtime, size_b); + void* dev_f = device_malloc(runtime, size_f); + copy_to_device(runtime, dev_a, ptr_a, size_a); + copy_to_device(runtime, dev_b, ptr_b, size_b); + record_tensor_pair(runtime, ptr_f, dev_f, size_f); // Build task graph... + return 0; } ``` @@ -313,7 +323,7 @@ No special platform-specific environment variables required. 
### Directory Structure -``` +```text my_test/ ├── kernels/ │ ├── kernel_config.py @@ -342,7 +352,7 @@ python examples/scripts/run_example.py -k my_test/kernels -g my_test/golden.py - ### Success Example -``` +```text === Building Runtime: host_build_graph (platform: a2a3sim) === ... === Compiling and Registering Kernels === @@ -367,7 +377,7 @@ TEST PASSED ### Failure Example -``` +```text === Comparing Results === Comparing f: shape=(16384,), dtype=float32 First 10 actual: [40. 40. 40. 40. 40. 40. 40. 40. 40. 40.] @@ -394,10 +404,12 @@ python examples/scripts/run_example.py -k ... -g ... -p ... -v ### Q: Why "binary_data cannot be empty" error? This usually happens when: + - Using wrong platform (a2a3 vs a2a3sim) - Kernel compilation failed silently Solutions: + 1. Verify correct `-p` parameter is used 2. Check if kernel source files exist 3. Use `-v` to view detailed compilation logs diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 259e95f76..be87aa3a9 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -649,11 +649,8 @@ def _build_func_args(self, tensors: dict[str, torch.Tensor]) -> list: """ Build orch_args from tensors dict (legacy path). - Convention for orchestration function signature: - int BuildGraph(Runtime* runtime, uint64_t* args, int arg_count) - - Where args layout is: - [ptr_0, ptr_1, ..., ptr_n, nbytes_0, nbytes_1, ..., nbytes_n, count] + The resulting object is passed to orchestration entries with the shape: + int BuildGraph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args) Args: tensors: Dict of torch tensors (will be modified to ensure contiguous) diff --git a/src/a2a3/runtime/host_build_graph/build_config.py b/src/a2a3/runtime/host_build_graph/build_config.py index 3e29fd698..76e7face5 100644 --- a/src/a2a3/runtime/host_build_graph/build_config.py +++ b/src/a2a3/runtime/host_build_graph/build_config.py @@ -1,17 +1,17 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- # Runtime build configuration # All paths are relative to this file's directory (src/runtime/) BUILD_CONFIG = { - "aicore": { - "include_dirs": ["runtime"], - "source_dirs": ["aicore", "runtime"] - }, - "aicpu": { - "include_dirs": ["runtime"], - "source_dirs": ["aicpu", "runtime"] - }, - "host": { - "include_dirs": ["runtime"], - "source_dirs": ["host", "runtime"] - } + "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]}, + "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]}, + "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []}, } diff --git a/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md index 9db0f2657..75f3a4336 100644 --- a/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md +++ b/src/a2a3/runtime/host_build_graph/docs/RUNTIME_LOGIC.md @@ -1,21 +1,25 @@ # Runtime Logic: host_build_graph ## Overview + The host_build_graph runtime builds a static task graph on the host, copies the Runtime object to device memory, and lets AICPU scheduler threads dispatch tasks to AICore via a per-core handshake. 
Dependencies are explicit edges created by orchestration code, so scheduling is a standard fanin/fanout ready-queue model. ## Core Data Structures + - `Runtime` owns the task table, handshake buffers, and host-side device APIs. See `src/runtime/host_build_graph/runtime/runtime.h`. - `Task` is a fixed-size record that stores `func_id`, argument array, `fanin`, `fanout`, `core_type`, and `function_bin_addr`. - `Handshake` is the shared per-core control block used by AICPU and AICore for dispatch and completion. - `HostApi` provides device memory ops used by host orchestration (`device_malloc`, `copy_to_device`, `upload_kernel_binary`, etc.). ## Build And Init Flow + 1. Python tooling compiles kernels and orchestration into shared objects. 2. `init_runtime_impl` loads the orchestration SO from bytes, resolves the entry symbol, and registers kernel binaries with the platform uploader. The resulting GM addresses are stored by `Runtime::set_function_bin_addr`. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. -3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `runtime->record_tensor_pair`, adds tasks via `runtime->add_task`, and adds dependency edges via `runtime->add_successor`. +3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `record_tensor_pair(runtime, ...)`, adds tasks via `add_task(runtime, ...)`, and adds dependency edges via `add_successor(runtime, ...)`. 4. The populated `Runtime` is copied to device memory by the platform layer. AICPU then runs the executor with this Runtime snapshot. ## Execution Flow (Device) + 1. `aicpu_executor.cpp` performs core discovery, handshake initialization, and ready-queue seeding using `Runtime::get_initial_ready_tasks`. 2. Scheduler threads maintain per-core and global ready queues. 
When a task is ready, the scheduler writes its pointer to the core's `Handshake` and sets `task_status=1`. 3. AICore reads the handshake, executes the kernel at `Task::function_bin_addr`, and writes `task_status=0` on completion. @@ -23,9 +27,11 @@ The host_build_graph runtime builds a static task graph on the host, copies the 5. The executor shuts down cores by setting `Handshake::control=1` after all tasks complete. ## Finalize And Cleanup + `validate_runtime_impl` copies all recorded output tensors back to the host and frees device allocations recorded in tensor pairs. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. ## Key Files + - `src/runtime/host_build_graph/runtime/runtime.h` - `src/runtime/host_build_graph/runtime/runtime.cpp` - `src/runtime/host_build_graph/host/runtime_maker.cpp` diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index d906215bb..3cd8ca839 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -36,21 +36,55 @@ #include #include -#include "callable.h" // NOLINT(build/include_subdir) -#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) - -/** - * Orchestration function signature. 
- * - * @param runtime Pointer to Runtime to populate with tasks - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, negative on error - */ -typedef int (*OrchestrationFunc)(Runtime *runtime, const ChipStorageTaskArgs &orch_args); +#include "callable.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) +#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) namespace { +struct OrchestrationRuntimeImpl { + const OrchestrationRuntimeOps *ops; + Runtime *runtime; +}; + +Runtime *unwrap_runtime(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->runtime; +} + +int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type); +} + +void runtime_add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { + unwrap_runtime(runtime)->add_successor(from_task, to_task); +} + +void runtime_record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + unwrap_runtime(runtime)->record_tensor_pair(host_ptr, dev_ptr, size); +} + +int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtime(runtime)->get_task_count(); } + +void runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); } + +void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) { + return unwrap_runtime(runtime)->host_api.device_malloc(size); +} + +void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) { + unwrap_runtime(runtime)->host_api.device_free(ptr); +} + +int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return unwrap_runtime(runtime)->host_api.copy_to_device(dev_ptr, host_ptr, size); +} 
+ +const OrchestrationRuntimeOps k_orchestration_runtime_ops = { + runtime_add_task, runtime_add_successor, runtime_record_tensor_pair, runtime_get_task_count, + runtime_print_runtime, runtime_device_malloc, runtime_device_free, runtime_copy_to_device, +}; + bool write_all_bytes(int fd, const uint8_t *data, size_t size) { size_t total_written = 0; while (total_written < size) { @@ -102,10 +136,10 @@ extern "C" { * This function loads the orchestration SO from binary data via a temp file, * resolves the orchestration function via dlsym, then calls it to build the * task graph. The orchestration function is responsible for: - * - Allocating device memory via runtime->host_api.device_malloc() - * - Copying data to device via runtime->host_api.copy_to_device() + * - Allocating device memory via device_malloc() + * - Copying data to device via copy_to_device() * - Building the task graph - * - Recording tensor pairs via runtime->record_tensor_pair() + * - Recording tensor pairs via record_tensor_pair() * * @param runtime Pointer to pre-constructed Runtime * @param callable ChipCallable containing orch binary, func_name, and child kernels @@ -181,9 +215,11 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip orch_args->tensor_count(), orch_args->scalar_count() ); + OrchestrationRuntimeImpl orchestration_runtime = {&k_orchestration_runtime_ops, runtime}; + // Call orchestration function to build task graph // The orchestration function handles device memory allocation and copy-to-device - int rc = orch_func(runtime, *orch_args); + int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); diff --git a/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h new file mode 100644 index 000000000..b5854d39a --- /dev/null +++ 
b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Orchestration API for host_build_graph. + * + * Orchestration sources include only this header and interact with the runtime + * through the function-pointer table embedded in OrchestrationRuntime. 
+ */ + +#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ +#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ + +#include +#include + +#include "common/core_type.h" // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) + +typedef struct OrchestrationRuntime OrchestrationRuntime; + +typedef struct OrchestrationRuntimeOps { + int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type); + void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task); + void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size); + int (*get_task_count)(OrchestrationRuntime *runtime); + void (*print_runtime)(OrchestrationRuntime *runtime); + + void *(*device_malloc)(OrchestrationRuntime *runtime, size_t size); + void (*device_free)(OrchestrationRuntime *runtime, void *ptr); + int (*copy_to_device)(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size); +} OrchestrationRuntimeOps; + +struct OrchestrationRuntime { + const OrchestrationRuntimeOps *ops; +}; + +static inline int +add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return runtime->ops->add_task(runtime, args, num_args, func_id, core_type); +} + +static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { + runtime->ops->add_successor(runtime, from_task, to_task); +} + +static inline void record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + runtime->ops->record_tensor_pair(runtime, host_ptr, dev_ptr, size); +} + +static inline int get_task_count(OrchestrationRuntime *runtime) { return runtime->ops->get_task_count(runtime); } + +static inline void print_runtime(OrchestrationRuntime *runtime) { runtime->ops->print_runtime(runtime); } + +static inline void 
*device_malloc(OrchestrationRuntime *runtime, size_t size) { + return runtime->ops->device_malloc(runtime, size); +} + +static inline void device_free(OrchestrationRuntime *runtime, void *ptr) { runtime->ops->device_free(runtime, ptr); } + +static inline int copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return runtime->ops->copy_to_device(runtime, dev_ptr, host_ptr, size); +} + +typedef int (*OrchestrationFunc)(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args); + +#endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ diff --git a/src/a5/runtime/host_build_graph/build_config.py b/src/a5/runtime/host_build_graph/build_config.py index 3e29fd698..76e7face5 100644 --- a/src/a5/runtime/host_build_graph/build_config.py +++ b/src/a5/runtime/host_build_graph/build_config.py @@ -1,17 +1,17 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- # Runtime build configuration # All paths are relative to this file's directory (src/runtime/) BUILD_CONFIG = { - "aicore": { - "include_dirs": ["runtime"], - "source_dirs": ["aicore", "runtime"] - }, - "aicpu": { - "include_dirs": ["runtime"], - "source_dirs": ["aicpu", "runtime"] - }, - "host": { - "include_dirs": ["runtime"], - "source_dirs": ["host", "runtime"] - } + "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]}, + "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]}, + "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []}, } diff --git a/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md b/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md index 9db0f2657..75f3a4336 100644 --- a/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/host_build_graph/docs/RUNTIME_LOGIC.md @@ -1,21 +1,25 @@ # Runtime Logic: host_build_graph ## Overview + The host_build_graph runtime builds a static task graph on the host, copies the Runtime object to device memory, and lets AICPU scheduler threads dispatch tasks to AICore via a per-core handshake. Dependencies are explicit edges created by orchestration code, so scheduling is a standard fanin/fanout ready-queue model. ## Core Data Structures + - `Runtime` owns the task table, handshake buffers, and host-side device APIs. See `src/runtime/host_build_graph/runtime/runtime.h`. - `Task` is a fixed-size record that stores `func_id`, argument array, `fanin`, `fanout`, `core_type`, and `function_bin_addr`. - `Handshake` is the shared per-core control block used by AICPU and AICore for dispatch and completion. 
- `HostApi` provides device memory ops used by host orchestration (`device_malloc`, `copy_to_device`, `upload_kernel_binary`, etc.). ## Build And Init Flow + 1. Python tooling compiles kernels and orchestration into shared objects. 2. `init_runtime_impl` loads the orchestration SO from bytes, resolves the entry symbol, and registers kernel binaries with the platform uploader. The resulting GM addresses are stored by `Runtime::set_function_bin_addr`. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. -3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `runtime->record_tensor_pair`, adds tasks via `runtime->add_task`, and adds dependency edges via `runtime->add_successor`. +3. The orchestration function runs on the host and builds the graph. It allocates device buffers, copies input data to device, records output buffers with `record_tensor_pair(runtime, ...)`, adds tasks via `add_task(runtime, ...)`, and adds dependency edges via `add_successor(runtime, ...)`. 4. The populated `Runtime` is copied to device memory by the platform layer. AICPU then runs the executor with this Runtime snapshot. ## Execution Flow (Device) + 1. `aicpu_executor.cpp` performs core discovery, handshake initialization, and ready-queue seeding using `Runtime::get_initial_ready_tasks`. 2. Scheduler threads maintain per-core and global ready queues. When a task is ready, the scheduler writes its pointer to the core's `Handshake` and sets `task_status=1`. 3. AICore reads the handshake, executes the kernel at `Task::function_bin_addr`, and writes `task_status=0` on completion. @@ -23,9 +27,11 @@ The host_build_graph runtime builds a static task graph on the host, copies the 5. The executor shuts down cores by setting `Handshake::control=1` after all tasks complete. 
## Finalize And Cleanup + `validate_runtime_impl` copies all recorded output tensors back to the host and frees device allocations recorded in tensor pairs. See `src/runtime/host_build_graph/host/runtime_maker.cpp`. ## Key Files + - `src/runtime/host_build_graph/runtime/runtime.h` - `src/runtime/host_build_graph/runtime/runtime.cpp` - `src/runtime/host_build_graph/host/runtime_maker.cpp` diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index d906215bb..3cd8ca839 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -36,21 +36,55 @@ #include #include -#include "callable.h" // NOLINT(build/include_subdir) -#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) - -/** - * Orchestration function signature. - * - * @param runtime Pointer to Runtime to populate with tasks - * @param orch_args Separated tensor/scalar arguments - * @return 0 on success, negative on error - */ -typedef int (*OrchestrationFunc)(Runtime *runtime, const ChipStorageTaskArgs &orch_args); +#include "callable.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) +#include "runtime.h" // Includes unified_log.h and provides LOG_* macros // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) namespace { +struct OrchestrationRuntimeImpl { + const OrchestrationRuntimeOps *ops; + Runtime *runtime; +}; + +Runtime *unwrap_runtime(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->runtime; +} + +int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type); +} + +void runtime_add_successor(OrchestrationRuntime *runtime, int 
from_task, int to_task) { + unwrap_runtime(runtime)->add_successor(from_task, to_task); +} + +void runtime_record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + unwrap_runtime(runtime)->record_tensor_pair(host_ptr, dev_ptr, size); +} + +int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtime(runtime)->get_task_count(); } + +void runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); } + +void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) { + return unwrap_runtime(runtime)->host_api.device_malloc(size); +} + +void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) { + unwrap_runtime(runtime)->host_api.device_free(ptr); +} + +int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return unwrap_runtime(runtime)->host_api.copy_to_device(dev_ptr, host_ptr, size); +} + +const OrchestrationRuntimeOps k_orchestration_runtime_ops = { + runtime_add_task, runtime_add_successor, runtime_record_tensor_pair, runtime_get_task_count, + runtime_print_runtime, runtime_device_malloc, runtime_device_free, runtime_copy_to_device, +}; + bool write_all_bytes(int fd, const uint8_t *data, size_t size) { size_t total_written = 0; while (total_written < size) { @@ -102,10 +136,10 @@ extern "C" { * This function loads the orchestration SO from binary data via a temp file, * resolves the orchestration function via dlsym, then calls it to build the * task graph. 
The orchestration function is responsible for: - * - Allocating device memory via runtime->host_api.device_malloc() - * - Copying data to device via runtime->host_api.copy_to_device() + * - Allocating device memory via device_malloc() + * - Copying data to device via copy_to_device() * - Building the task graph - * - Recording tensor pairs via runtime->record_tensor_pair() + * - Recording tensor pairs via record_tensor_pair() * * @param runtime Pointer to pre-constructed Runtime * @param callable ChipCallable containing orch binary, func_name, and child kernels @@ -181,9 +215,11 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip orch_args->tensor_count(), orch_args->scalar_count() ); + OrchestrationRuntimeImpl orchestration_runtime = {&k_orchestration_runtime_ops, runtime}; + // Call orchestration function to build task graph // The orchestration function handles device memory allocation and copy-to-device - int rc = orch_func(runtime, *orch_args); + int rc = orch_func(reinterpret_cast(&orchestration_runtime), *orch_args); if (rc != 0) { LOG_ERROR("Orchestration function failed with code %d", rc); runtime->clear_tensor_pairs(); diff --git a/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h new file mode 100644 index 000000000..76c4066ba --- /dev/null +++ b/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Orchestration API for host_build_graph. + * + * Orchestration sources include only this header and interact with the runtime + * through the function-pointer table embedded in OrchestrationRuntime. + */ + +#ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ +#define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ + +#include +#include + +#include "common/core_type.h" // NOLINT(build/include_subdir) +#include "task_args.h" // NOLINT(build/include_subdir) + +typedef struct OrchestrationRuntime OrchestrationRuntime; + +typedef struct OrchestrationRuntimeOps { + int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type); + void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task); + void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size); + int (*get_task_count)(OrchestrationRuntime *runtime); + void (*print_runtime)(OrchestrationRuntime *runtime); + + void *(*device_malloc)(OrchestrationRuntime *runtime, size_t size); + void (*device_free)(OrchestrationRuntime *runtime, void *ptr); + int (*copy_to_device)(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size); +} OrchestrationRuntimeOps; + +struct OrchestrationRuntime { + const OrchestrationRuntimeOps *ops; +}; + +static inline int +add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { + return runtime->ops->add_task(runtime, args, num_args, func_id, core_type); 
+} + +static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { + runtime->ops->add_successor(runtime, from_task, to_task); +} + +static inline void record_tensor_pair(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size) { + runtime->ops->record_tensor_pair(runtime, host_ptr, dev_ptr, size); +} + +static inline int get_task_count(OrchestrationRuntime *runtime) { return runtime->ops->get_task_count(runtime); } + +static inline void print_runtime(OrchestrationRuntime *runtime) { runtime->ops->print_runtime(runtime); } + +static inline void *device_malloc(OrchestrationRuntime *runtime, size_t size) { + return runtime->ops->device_malloc(runtime, size); +} + +static inline void device_free(OrchestrationRuntime *runtime, void *ptr) { runtime->ops->device_free(runtime, ptr); } + +static inline int copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const void *host_ptr, size_t size) { + return runtime->ops->copy_to_device(runtime, dev_ptr, host_ptr, size); +} + +typedef int (*OrchestrationFunc)(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args); + +#endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_ORCHESTRATION_ORCHESTRATION_API_H_ diff --git a/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 872af14c0..142921213 100644 --- a/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime 
*runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; return -1; @@ -85,20 +84,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -116,11 +115,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void 
**dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -134,9 +133,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -196,7 +195,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -204,18 +203,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), 
static_cast(q_tile_size), static_cast(block_size), static_cast(valid_len)}; - int t_sf = runtime->add_task(sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 1 : 0; @@ -227,12 +226,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -249,7 +248,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; } diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 872af14c0..142921213 100644 --- 
a/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -27,8 +27,7 @@ #include #include -#include "runtime.h" // NOLINT(build/include_subdir) -#include "task_args.h" // NOLINT(build/include_subdir) +#include "orchestration_api.h" // NOLINT(build/include_subdir) #define FUNC_QK_MATMUL 0 #define FUNC_SOFTMAX_PREPARE 1 @@ -37,7 +36,7 @@ extern "C" { -int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) { +int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { if (orch_args.tensor_count() < 6) { std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n'; return -1; @@ -85,20 +84,20 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; // Allocate device memory for inputs/outputs - void *dev_query = runtime->host_api.device_malloc(query_size); - void *dev_key_cache = runtime->host_api.device_malloc(key_cache_size); - void *dev_value_cache = runtime->host_api.device_malloc(value_cache_size); - void *dev_out = runtime->host_api.device_malloc(out_size); + void *dev_query = device_malloc(runtime, query_size); + void *dev_key_cache = device_malloc(runtime, key_cache_size); + void *dev_value_cache = device_malloc(runtime, value_cache_size); + void *dev_out = device_malloc(runtime, out_size); if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { std::cerr << "Error: Failed to allocate device memory\n"; return -1; } - runtime->host_api.copy_to_device(dev_query, host_query, query_size); - runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); - runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); - runtime->record_tensor_pair(host_out, 
dev_out, out_size); + copy_to_device(runtime, dev_query, host_query, query_size); + copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size); + copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size); + record_tensor_pair(runtime, host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); @@ -116,11 +115,11 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_new_arr = new void *[total_buffers]; for (uint32_t i = 0; i < total_buffers; i++) { - dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); - dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); - dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); - dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); - dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + dev_sij_arr[i] = device_malloc(runtime, sij_size); + dev_pij_arr[i] = device_malloc(runtime, pij_size); + dev_mij_arr[i] = device_malloc(runtime, mij_size); + dev_lij_arr[i] = device_malloc(runtime, lij_size); + dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size); } // Per-(batch, head_tile) accumulators @@ -134,9 +133,9 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc void **dev_oi_arr = new void *[total_accums]; for (uint32_t i = 0; i < total_accums; i++) { - dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); - dev_li_arr[i] = runtime->host_api.device_malloc(li_size); - dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + dev_mi_arr[i] = device_malloc(runtime, mi_size); + dev_li_arr[i] = device_malloc(runtime, li_size); + dev_oi_arr[i] = device_malloc(runtime, oi_size); } std::cout << "Allocated " << total_buffers << " per-block buffers\n"; @@ -196,7 +195,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc uint64_t qk_args[6] = {reinterpret_cast(qi_ptr), 
reinterpret_cast(kj_ptr), reinterpret_cast(dev_sij), static_cast(q_tile_size), static_cast(head_dim), static_cast(block_size)}; - int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); total_tasks++; // SF: scale, rowmax, exp, rowsum -> pij, mij, lij @@ -204,18 +203,18 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc reinterpret_cast(dev_pij), reinterpret_cast(dev_mij), reinterpret_cast(dev_lij), static_cast(q_tile_size), static_cast(block_size), static_cast(valid_len)}; - int t_sf = runtime->add_task(sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + int t_sf = add_task(runtime, sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); total_tasks++; // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') uint64_t pv_args[6] = {reinterpret_cast(dev_pij), reinterpret_cast(vj_ptr), reinterpret_cast(dev_oi_new), static_cast(q_tile_size), static_cast(block_size), static_cast(head_dim)}; - int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); total_tasks++; - runtime->add_successor(t_qk, t_sf); - runtime->add_successor(t_sf, t_pv); + add_successor(runtime, t_qk, t_sf); + add_successor(runtime, t_sf, t_pv); // Online Update: serialized across blocks (each depends on previous) int is_first = (bn == 0) ? 
1 : 0; @@ -227,12 +226,12 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc static_cast(is_first), static_cast(is_last), reinterpret_cast(out_ptr), static_cast(q_tile_size), static_cast(head_dim)}; - int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); total_tasks++; - runtime->add_successor(t_pv, t_up); + add_successor(runtime, t_pv, t_up); if (t_up_prev >= 0) { - runtime->add_successor(t_up_prev, t_up); + add_successor(runtime, t_up_prev, t_up); } t_up_prev = t_up; } @@ -249,7 +248,7 @@ int build_paged_attention_graph(Runtime *runtime, const ChipStorageTaskArgs &orc delete[] dev_oi_arr; std::cout << "Created " << total_tasks << " tasks\n"; - runtime->print_runtime(); + print_runtime(runtime); return 0; }