From 678473455668e671dd7851852919eecf41b09630 Mon Sep 17 00:00:00 2001
From: Shaw <shawmakesmagic@gmail.com>
Date: Tue, 23 Jun 2026 21:36:56 -0700
Subject: [PATCH] =?UTF-8?q?fix(fused-ffi):=20complete=20ABI=20v9=20context?=
 =?UTF-8?q?=5Fsize=20+=20bump=20stale=20sizeof=20assert=20(80=E2=86=9288)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The streaming-LLM config struct gained `context_size` (int32 @ offset 80) for
ABI v9 — `eliza_inference_llm_stream_open` now honors `cfg->context_size` (>0) instead of only
the ELIZA_LLM_N_CTX env default. The TS marshaller (ffi-bindings.ts) was already
emitting 88 bytes with context_size at offset 80, but the C-side ABI-guard
`static_assert(sizeof(eliza_llm_stream_config_t) == 80)` was never bumped, so the
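fused `elizainference` target failed to compile. Bump it to 88 to match the real
layout: `context_size` occupies bytes 80–83, and 4 bytes of tail padding round
sizeof up to the struct's 8-byte pointer alignment on a 64-bit ABI.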

Validated: a host Metal build of `libelizainference` (ABI v12) loads + generates
on the real google/gemma-4-E2B with context_size=4096 honored (83 tok/s, M4 Max).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/omnivoice/include/eliza-inference-ffi.h |  6 +++++-
 tools/omnivoice/src/eliza-inference-ffi.cpp   | 17 +++++++++++------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/tools/omnivoice/include/eliza-inference-ffi.h b/tools/omnivoice/include/eliza-inference-ffi.h
index 075660a60..492626a99 100644
--- a/tools/omnivoice/include/eliza-inference-ffi.h
+++ b/tools/omnivoice/include/eliza-inference-ffi.h
@@ -853,7 +853,10 @@ int eliza_inference_llm_kv_quant_supported(void);
  * `cache_type_k` / `cache_type_v` (ABI v8): KV-cache quantization type names
  * (e.g. "f16", "q8_0", "qjl1_256", "q4_polar"). NULL leaves the llama.cpp
  * default (f16). Mapped to ggml_type and applied to cparams.type_k/type_v.
- * Mirrors desktop-llama-adapter.ts's GGML_KV_CACHE_TYPES pass-through. */
+ * Mirrors desktop-llama-adapter.ts's GGML_KV_CACHE_TYPES pass-through.
+ *
+ * `context_size` (ABI v9): runtime context window in tokens. <=0 falls back
+ * to ELIZA_LLM_N_CTX or the native default. */
 typedef struct {
     int32_t      max_tokens;
     float        temperature;
@@ -870,6 +873,7 @@ typedef struct {
     int32_t      n_gpu_layers;       /* -1 = default (all), 0 = CPU (ABI v8) */
     const char * cache_type_k;       /* KV K-cache quant name; NULL = f16 (ABI v8) */
     const char * cache_type_v;       /* KV V-cache quant name; NULL = f16 (ABI v8) */
+    int32_t      context_size;       /* Runtime context tokens; <=0 = env/default (ABI v9) */
 } eliza_llm_stream_config_t;
 
 /* Opaque streaming-LLM session. One per active generation. */
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index dc74dd8eb..2821dcd22 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -22,13 +22,16 @@
 // ABI guard: the TS loader (ffi-llm-streaming-abi.ts) marshals
 // eliza_llm_stream_config_t by hand-written field offsets, so any reorder /
 // insert / type change on the C side silently corrupts every streaming-LLM
-// call. Pin the on-the-wire layout (documented "sizeof config = 80" since v8):
-// 6×int32 + 5×ptr + 4-byte fields packed to 80 bytes on a 64-bit ABI. Adding a
-// field is an ABI bump — update this assert AND the TS marshaller together.
+// call. Pin the on-the-wire layout. ABI v9 appended `context_size` (int32 at
+// offset 80, i.e. bytes 80–83); tail padding up to the struct's 8-byte pointer
+// alignment makes sizeof 88 on a 64-bit ABI. The TS marshaller
+// (ffi-bindings.ts) already allocs 88 and writes context_size at offset 80;
+// this assert had simply not been bumped to match. Adding a field is an ABI
+// bump — update this assert AND the TS marshaller together.
 static_assert(
-    sizeof(eliza_llm_stream_config_t) == 80,
+    sizeof(eliza_llm_stream_config_t) == 88,
     "eliza_llm_stream_config_t layout changed — bump ABI + update the TS "
-    "marshaller in ffi-llm-streaming-abi.ts, then update this assert.");
+    "marshaller in ffi-bindings.ts, then update this assert.");
 
 /* common/ — the same-file MTP speculative-decode engine wired into the
  * streaming-LLM text path (ABI v8) reuses the DRAFT_MTP implementation in
@@ -2922,7 +2925,9 @@ EliLlmStream * eliza_inference_llm_stream_open(
      * batch / threads / flash-attn / KV-quant). */
     llama_context_params cparams = llama_context_default_params();
     const int n_ctx_train = llama_model_n_ctx_train(model);
-    int n_ctx = eliza_int_env_or_default("ELIZA_LLM_N_CTX", 8192);
+    int n_ctx = cfg->context_size > 0
+        ? cfg->context_size
+        : eliza_int_env_or_default("ELIZA_LLM_N_CTX", 8192);
     if (n_ctx_train > 0 && n_ctx > n_ctx_train) n_ctx = n_ctx_train;
     cparams.n_ctx = (uint32_t) n_ctx;
     cparams.n_batch = (uint32_t) eliza_int_env_or_default("ELIZA_LLM_N_BATCH", 512);