Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@
#include <iostream>
#include <vector>

#include "runtime.h" // NOLINT(build/include_subdir)
#include "task_args.h" // NOLINT(build/include_subdir)
#include "orchestration_api.h" // NOLINT(build/include_subdir)

extern "C" {

Expand All @@ -44,7 +43,7 @@ constexpr int BATCH = 1;

constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float);

int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
int build_bgemm_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
// Expected orch_args: [A, B, C] — 3 tensors
if (orch_args.tensor_count() < 3) {
std::cerr << "build_bgemm_graph: Expected at least 3 tensors, got " << orch_args.tensor_count() << '\n';
Expand All @@ -62,38 +61,38 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
std::cout << "Grid: " << GRID_M << " x " << GRID_K << " x " << GRID_N << '\n';

// Allocate device memory and copy inputs
void *dev_A = runtime->host_api.device_malloc(size_A);
void *dev_A = device_malloc(runtime, size_A);
if (!dev_A) return -1;
runtime->host_api.copy_to_device(dev_A, host_A, size_A);
copy_to_device(runtime, dev_A, host_A, size_A);

void *dev_B = runtime->host_api.device_malloc(size_B);
void *dev_B = device_malloc(runtime, size_B);
if (!dev_B) {
runtime->host_api.device_free(dev_A);
device_free(runtime, dev_A);
return -1;
}
runtime->host_api.copy_to_device(dev_B, host_B, size_B);
copy_to_device(runtime, dev_B, host_B, size_B);

void *dev_C = runtime->host_api.device_malloc(size_C);
void *dev_C = device_malloc(runtime, size_C);
if (!dev_C) {
runtime->host_api.device_free(dev_A);
runtime->host_api.device_free(dev_B);
device_free(runtime, dev_A);
device_free(runtime, dev_B);
return -1;
}
runtime->host_api.copy_to_device(dev_C, host_C, size_C);
runtime->record_tensor_pair(host_C, dev_C, size_C);
copy_to_device(runtime, dev_C, host_C, size_C);
record_tensor_pair(runtime, host_C, dev_C, size_C);

// Allocate intermediate P buffers (one per C tile)
constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N;
std::vector<void *> dev_P(NUM_P_BUFFERS, nullptr);
for (int i = 0; i < NUM_P_BUFFERS; i++) {
dev_P[i] = runtime->host_api.device_malloc(TILE_BYTES);
dev_P[i] = device_malloc(runtime, TILE_BYTES);
if (!dev_P[i]) {
for (int j = 0; j < i; j++) {
runtime->host_api.device_free(dev_P[j]);
device_free(runtime, dev_P[j]);
}
runtime->host_api.device_free(dev_A);
runtime->host_api.device_free(dev_B);
runtime->host_api.device_free(dev_C);
device_free(runtime, dev_A);
device_free(runtime, dev_B);
device_free(runtime, dev_C);
return -1;
}
}
Expand Down Expand Up @@ -121,7 +120,7 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
args_gemm[3] = TILE;
args_gemm[4] = TILE;
args_gemm[5] = TILE;
int t_gemm = runtime->add_task(args_gemm, 6, 0, CoreType::AIC);
int t_gemm = add_task(runtime, args_gemm, 6, 0, CoreType::AIC);

// Task 2: C[m,n] = C[m,n] + P (tile_add on Vector core)
uint64_t args_add[5];
Expand All @@ -130,22 +129,22 @@ int build_bgemm_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
args_add[2] = reinterpret_cast<uint64_t>(static_cast<char *>(dev_C) + C_offset);
args_add[3] = TILE;
args_add[4] = TILE;
int t_add = runtime->add_task(args_add, 5, 1, CoreType::AIV);
int t_add = add_task(runtime, args_add, 5, 1, CoreType::AIV);

// Dependency: gemm must complete before add
runtime->add_successor(t_gemm, t_add);
add_successor(runtime, t_gemm, t_add);

// Dependency: previous add must complete before current gemm (K accumulation)
if (last_add_task[c_tile_idx] >= 0) {
runtime->add_successor(last_add_task[c_tile_idx], t_gemm);
add_successor(runtime, last_add_task[c_tile_idx], t_gemm);
}
last_add_task[c_tile_idx] = t_add;
}
}
}
}

std::cout << "Created " << runtime->get_task_count() << " tasks\n";
std::cout << "Created " << get_task_count(runtime) << " tasks\n";
return 0;
}

Expand Down
37 changes: 19 additions & 18 deletions examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md
Original file line number Diff line number Diff line change
@@ -1,49 +1,50 @@
# InCore Orchestration Guide: host_build_graph

## Goal
In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `Runtime::add_task`, and wires dependencies with `Runtime::add_successor`.

In host_build_graph, the orchestration function runs on the host. It allocates device buffers, builds the task graph by calling `add_task(runtime, ...)`, and wires dependencies with `add_successor(runtime, ...)`.

## Where To Put Orchestration Code

- Each example keeps orchestration sources under `examples/host_build_graph/<example>/kernels/orchestration/`.
- `examples/host_build_graph/<example>/kernels/kernel_config.py` defines the orchestration entry point. Example: `ORCHESTRATION = {"source": ".../example_orch.cpp", "function_name": "build_example_graph"}`.

## Function Signature

Your orchestration entry must be `extern "C"` and match:

```cpp
int build_graph(Runtime* runtime, uint64_t* args, int arg_count);
int build_graph(OrchestrationRuntime* runtime, const ChipStorageTaskArgs &orch_args);
```

`Runtime` is defined in `src/runtime/host_build_graph/runtime/runtime.h`.
Include `orchestration_api.h`. Do not include `runtime.h` in orchestration sources.

## Argument Layout
When you use the default `golden.py` tensor argument order (`TENSOR_ORDER`), the argument layout built by `examples/scripts/code_runner.py` is:

```
[ptr_0, ptr_1, ..., ptr_n, nbytes_0, nbytes_1, ..., nbytes_n, element_count]
```
`orch_args` is a `ChipStorageTaskArgs` object that carries tensor arguments and scalar arguments separately.

- Pointers are host pointers to CPU tensors.
- Sizes are byte sizes for each tensor in `TENSOR_ORDER`.
- `element_count` is the element count of the first tensor.

If `golden.py` returns an explicit argument list, that list becomes `args` directly. Validate `arg_count` defensively in your orchestration.
- Use `orch_args.tensor(i)` to read tensor metadata and host pointers.
- Use `orch_args.scalar(i)` to read scalar values.
- Validate `tensor_count()` / `scalar_count()` defensively in orchestration code.

## Building The Graph

A typical host orchestration sequence is:

1. Allocate device buffers with `runtime->host_api.device_malloc`.
2. Copy inputs to device with `runtime->host_api.copy_to_device`.
3. Record output buffers with `runtime->record_tensor_pair(host_ptr, dev_ptr, size)` so finalize can copy them back.
4. Create tasks with `runtime->add_task(args, num_args, func_id, core_type)`.
5. Add dependency edges with `runtime->add_successor(producer, consumer)`.
1. Allocate device buffers with `device_malloc(runtime, size)`.
2. Copy inputs to device with `copy_to_device(runtime, dev_ptr, host_ptr, size)`.
3. Record output buffers with `record_tensor_pair(runtime, host_ptr, dev_ptr, size)` so finalize can copy them back.
4. Create tasks with `add_task(runtime, args, num_args, func_id, core_type)`.
5. Add dependency edges with `add_successor(runtime, producer, consumer)`.

Example: see `examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp`.

## Kernel Mapping

- `func_id` and `core_type` are defined in `kernels/kernel_config.py` under `KERNELS`.
- The host uploads kernel binaries via `upload_kernel_binary` and stores addresses in `Runtime::func_id_to_addr_[]`. The platform layer resolves per-task `Task::function_bin_addr` from this map before copying to device.

## Debugging Tips
- Use `runtime->print_runtime()` to dump the task graph.

- Use `print_runtime(runtime)` to dump the task graph.
- Fail fast when the tensor/scalar count is wrong or a device allocation fails, to avoid undefined behavior.
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,20 @@
*
* This orchestration function:
* 1. Receives ChipStorageTaskArgs with tensor metadata (pointers, shapes, dtypes)
* 2. Allocates device memory via runtime->host_api
* 3. Copies input data to device via runtime->host_api
* 2. Allocates device memory via orchestration API helpers
* 3. Copies input data to device via orchestration API helpers
* 4. Records output tensor for copy-back during finalize
* 5. Builds the task graph with 4 tasks (2 AIV + 2 AIC)
*/

#include <cstdint>
#include <iostream>

#include "runtime.h" // NOLINT(build/include_subdir)
#include "task_args.h" // NOLINT(build/include_subdir)
#include "orchestration_api.h" // NOLINT(build/include_subdir)

extern "C" {

int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
int build_matmul_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
// Validate argument count
// Expected orch_args: [a, w1, w2, f] — 4 tensors
if (orch_args.tensor_count() < 4) {
Expand All @@ -62,63 +61,63 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
// Allocate device memory and copy inputs
std::cout << "\n=== Allocating Device Memory ===" << '\n';

void *dev_a = runtime->host_api.device_malloc(size_a);
void *dev_a = device_malloc(runtime, size_a);
if (!dev_a) {
std::cerr << "Error: Failed to allocate device memory for A\n";
return -1;
}
runtime->host_api.copy_to_device(dev_a, host_a, size_a);
copy_to_device(runtime, dev_a, host_a, size_a);
std::cout << "Tensor A: " << size_a << " bytes copied to device\n";

void *dev_w1 = runtime->host_api.device_malloc(size_w1);
void *dev_w1 = device_malloc(runtime, size_w1);
if (!dev_w1) {
std::cerr << "Error: Failed to allocate device memory for W1\n";
runtime->host_api.device_free(dev_a);
device_free(runtime, dev_a);
return -1;
}
runtime->host_api.copy_to_device(dev_w1, host_w1, size_w1);
copy_to_device(runtime, dev_w1, host_w1, size_w1);
std::cout << "Tensor W1: " << size_w1 << " bytes copied to device\n";

void *dev_w2 = runtime->host_api.device_malloc(size_w2);
void *dev_w2 = device_malloc(runtime, size_w2);
if (!dev_w2) {
std::cerr << "Error: Failed to allocate device memory for W2\n";
runtime->host_api.device_free(dev_a);
runtime->host_api.device_free(dev_w1);
device_free(runtime, dev_a);
device_free(runtime, dev_w1);
return -1;
}
runtime->host_api.copy_to_device(dev_w2, host_w2, size_w2);
copy_to_device(runtime, dev_w2, host_w2, size_w2);
std::cout << "Tensor W2: " << size_w2 << " bytes copied to device\n";

void *dev_f = runtime->host_api.device_malloc(size_f);
void *dev_f = device_malloc(runtime, size_f);
if (!dev_f) {
std::cerr << "Error: Failed to allocate device memory for F\n";
runtime->host_api.device_free(dev_a);
runtime->host_api.device_free(dev_w1);
runtime->host_api.device_free(dev_w2);
device_free(runtime, dev_a);
device_free(runtime, dev_w1);
device_free(runtime, dev_w2);
return -1;
}
// Record output tensor for copy-back during finalize
runtime->record_tensor_pair(host_f, dev_f, size_f);
record_tensor_pair(runtime, host_f, dev_f, size_f);
std::cout << "Tensor F (output): " << size_f << " bytes allocated\n";

// Allocate intermediate tensors (b, c, d)
// dev_b is half precision (output of log_sqrt kernel, input to matmul)
// dev_c, dev_d are float precision (output of matmul kernels)
size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes
size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes
void *dev_b = runtime->host_api.device_malloc(BYTES_HALF); // sqrt(log(A)) - half output
void *dev_c = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W1 - float output
void *dev_d = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W2 - float output
size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes
size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes
void *dev_b = device_malloc(runtime, BYTES_HALF); // sqrt(log(A)) - half output
void *dev_c = device_malloc(runtime, BYTES_FLOAT); // B @ W1 - float output
void *dev_d = device_malloc(runtime, BYTES_FLOAT); // B @ W2 - float output

if (!dev_b || !dev_c || !dev_d) {
std::cerr << "Error: Failed to allocate intermediate tensors\n";
runtime->host_api.device_free(dev_a);
runtime->host_api.device_free(dev_w1);
runtime->host_api.device_free(dev_w2);
runtime->host_api.device_free(dev_f);
if (dev_b) runtime->host_api.device_free(dev_b);
if (dev_c) runtime->host_api.device_free(dev_c);
if (dev_d) runtime->host_api.device_free(dev_d);
device_free(runtime, dev_a);
device_free(runtime, dev_w1);
device_free(runtime, dev_w2);
device_free(runtime, dev_f);
if (dev_b) device_free(runtime, dev_b);
if (dev_c) device_free(runtime, dev_c);
if (dev_d) device_free(runtime, dev_d);
return -1;
}

Expand All @@ -130,37 +129,37 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
args_t0[0] = reinterpret_cast<uint64_t>(dev_a); // src
args_t0[1] = reinterpret_cast<uint64_t>(dev_b); // out
args_t0[2] = SIZE; // size
int t0 = runtime->add_task(args_t0, 3, 0, CoreType::AIV);
int t0 = add_task(runtime, args_t0, 3, 0, CoreType::AIV);

// Task 1: C = B @ W1 (func_id=1: kernel_matmul, AIC)
uint64_t args_t1[4];
args_t1[0] = reinterpret_cast<uint64_t>(dev_b); // src0 (left matrix)
args_t1[1] = reinterpret_cast<uint64_t>(dev_w1); // src1 (right matrix)
args_t1[2] = reinterpret_cast<uint64_t>(dev_c); // out
args_t1[3] = SIZE; // size
int t1 = runtime->add_task(args_t1, 4, 1, CoreType::AIC);
int t1 = add_task(runtime, args_t1, 4, 1, CoreType::AIC);

// Task 2: D = B @ W2 (func_id=1: kernel_matmul, AIC)
uint64_t args_t2[4];
args_t2[0] = reinterpret_cast<uint64_t>(dev_b); // src0 (left matrix)
args_t2[1] = reinterpret_cast<uint64_t>(dev_w2); // src1 (right matrix)
args_t2[2] = reinterpret_cast<uint64_t>(dev_d); // out
args_t2[3] = SIZE; // size
int t2 = runtime->add_task(args_t2, 4, 1, CoreType::AIC);
int t2 = add_task(runtime, args_t2, 4, 1, CoreType::AIC);

// Task 3: F = exp(C + D) (func_id=2: kernel_add_exp, AIV)
uint64_t args_t3[4];
args_t3[0] = reinterpret_cast<uint64_t>(dev_c); // src0
args_t3[1] = reinterpret_cast<uint64_t>(dev_d); // src1
args_t3[2] = reinterpret_cast<uint64_t>(dev_f); // out
args_t3[3] = SIZE; // size
int t3 = runtime->add_task(args_t3, 4, 2, CoreType::AIV);
int t3 = add_task(runtime, args_t3, 4, 2, CoreType::AIV);

// Add dependencies (diamond: t0→t1→t3, t0→t2→t3)
runtime->add_successor(t0, t1); // t0 → t1
runtime->add_successor(t0, t2); // t0 → t2
runtime->add_successor(t1, t3); // t1 → t3
runtime->add_successor(t2, t3); // t2 → t3
add_successor(runtime, t0, t1); // t0 → t1
add_successor(runtime, t0, t2); // t0 → t2
add_successor(runtime, t1, t3); // t1 → t3
add_successor(runtime, t2, t3); // t2 → t3

std::cout << "\nTasks:\n";
std::cout << " task" << t0 << ": B = sqrt(log(A)) [AIV]\n";
Expand All @@ -169,8 +168,8 @@ int build_matmul_graph(Runtime *runtime, const ChipStorageTaskArgs &orch_args) {
std::cout << " task" << t3 << ": F = exp(C + D) [AIV]\n";
std::cout << "Dependencies: t0→t1→t3, t0→t2→t3 (diamond)\n";

std::cout << "Created runtime with " << runtime->get_task_count() << " tasks\n";
runtime->print_runtime();
std::cout << "Created runtime with " << get_task_count(runtime) << " tasks\n";
print_runtime(runtime);

return 0;
}
Expand Down
Loading
Loading