From 678473455668e671dd7851852919eecf41b09630 Mon Sep 17 00:00:00 2001 From: Shaw Date: Tue, 23 Jun 2026 21:36:56 -0700 Subject: [PATCH] =?UTF-8?q?fix(fused-ffi):=20complete=20ABI=20v9=20context?= =?UTF-8?q?=5Fsize=20+=20bump=20stale=20sizeof=20assert=20(80=E2=86=9288)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The streaming-LLM config struct gained `context_size` (int32 @ offset 80) for ABI v9 — `_llm_stream_open` now honors `cfg->context_size` (>0) instead of only the ELIZA_LLM_N_CTX env default. The TS marshaller (ffi-bindings.ts) was already emitting 88 bytes with context_size at offset 80, but the C-side ABI-guard `static_assert(sizeof(eliza_llm_stream_config_t) == 80)` was never bumped, so the fused `elizainference` target failed to compile. Bump it to 88 to match the real layout (8×int32 + 5×ptr, pointer-aligned). Validated: a host Metal build of `libelizainference` (ABI v12) loads + generates on the real google/gemma-4-E2B with context_size=4096 honored (83 tok/s, M4 Max). Co-Authored-By: Claude Opus 4.8 --- tools/omnivoice/include/eliza-inference-ffi.h | 6 +++++- tools/omnivoice/src/eliza-inference-ffi.cpp | 17 +++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tools/omnivoice/include/eliza-inference-ffi.h b/tools/omnivoice/include/eliza-inference-ffi.h index 075660a60..492626a99 100644 --- a/tools/omnivoice/include/eliza-inference-ffi.h +++ b/tools/omnivoice/include/eliza-inference-ffi.h @@ -853,7 +853,10 @@ int eliza_inference_llm_kv_quant_supported(void); * `cache_type_k` / `cache_type_v` (ABI v8): KV-cache quantization type names * (e.g. "f16", "q8_0", "qjl1_256", "q4_polar"). NULL leaves the llama.cpp * default (f16). Mapped to ggml_type and applied to cparams.type_k/type_v. - * Mirrors desktop-llama-adapter.ts's GGML_KV_CACHE_TYPES pass-through. */ + * Mirrors desktop-llama-adapter.ts's GGML_KV_CACHE_TYPES pass-through. 
+ * + * `context_size` (ABI v9): runtime context window in tokens. <=0 falls back + * to ELIZA_LLM_N_CTX or the native default. */ typedef struct { int32_t max_tokens; float temperature; @@ -870,6 +873,7 @@ typedef struct { int32_t n_gpu_layers; /* -1 = default (all), 0 = CPU (ABI v8) */ const char * cache_type_k; /* KV K-cache quant name; NULL = f16 (ABI v8) */ const char * cache_type_v; /* KV V-cache quant name; NULL = f16 (ABI v8) */ + int32_t context_size; /* Runtime context tokens; <=0 = env/default (ABI v9) */ } eliza_llm_stream_config_t; /* Opaque streaming-LLM session. One per active generation. */ diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp index dc74dd8eb..2821dcd22 100644 --- a/tools/omnivoice/src/eliza-inference-ffi.cpp +++ b/tools/omnivoice/src/eliza-inference-ffi.cpp @@ -22,13 +22,16 @@ // ABI guard: the TS loader (ffi-llm-streaming-abi.ts) marshals // eliza_llm_stream_config_t by hand-written field offsets, so any reorder / // insert / type change on the C side silently corrupts every streaming-LLM -// call. Pin the on-the-wire layout (documented "sizeof config = 80" since v8): -// 6×int32 + 5×ptr + 4-byte fields packed to 80 bytes on a 64-bit ABI. Adding a -// field is an ABI bump — update this assert AND the TS marshaller together. +// call. Pin the on-the-wire layout. ABI v9 appended `context_size` (int32 at +// offset 80) to the 80-byte v8 layout: 84 bytes, padded to 88 by 8-byte pointer +// alignment on a 64-bit ABI. The TS marshaller (ffi-bindings.ts) already allocs +// 88 and writes context_size at offset 80; this assert had not been bumped to +// match. Adding a field is an ABI bump — update this assert AND the TS +// marshaller together. 
static_assert( - sizeof(eliza_llm_stream_config_t) == 80, + sizeof(eliza_llm_stream_config_t) == 88, "eliza_llm_stream_config_t layout changed — bump ABI + update the TS " - "marshaller in ffi-llm-streaming-abi.ts, then update this assert."); + "marshaller in ffi-bindings.ts, then update this assert."); /* common/ — the same-file MTP speculative-decode engine wired into the * streaming-LLM text path (ABI v8) reuses the DRAFT_MTP implementation in @@ -2922,7 +2925,9 @@ EliLlmStream * eliza_inference_llm_stream_open( * batch / threads / flash-attn / KV-quant). */ llama_context_params cparams = llama_context_default_params(); const int n_ctx_train = llama_model_n_ctx_train(model); - int n_ctx = eliza_int_env_or_default("ELIZA_LLM_N_CTX", 8192); + int n_ctx = cfg->context_size > 0 + ? cfg->context_size + : eliza_int_env_or_default("ELIZA_LLM_N_CTX", 8192); if (n_ctx_train > 0 && n_ctx > n_ctx_train) n_ctx = n_ctx_train; cparams.n_ctx = (uint32_t) n_ctx; cparams.n_batch = (uint32_t) eliza_int_env_or_default("ELIZA_LLM_N_BATCH", 512);