From fa89d0fd56441342b2baf1e6d4fb17247e7dbe53 Mon Sep 17 00:00:00 2001
From: lalalune <elizamakesmagic@gmail.com>
Date: Mon, 22 Jun 2026 13:13:15 -0700
Subject: [PATCH 1/2] fix(kokoro): build the kokoro tool across the full CI
 matrix (shared/Apple/Android-DL)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kokoro subtree failed three distinct CI lanes, none backend-specific:

- PIC: kokoro_lib is a STATIC archive folded PRIVATE into the fused SHARED
  libelizainference.so, but it never set POSITION_INDEPENDENT_CODE, so ld
  rejected its objects on every BUILD_SHARED_LIBS=ON link ("recompile with
  -fPIC", R_X86_64_PC32 on x86-64 / R_AARCH64_ADR_PREL_PG_HI21 on arm64) —
  breaking the openvino, sycl, vulkan and virtgpu builds. Set PIC ON, mirroring
  eliza_voice_classifiers in the sibling omnivoice subtree.
- Apple: kokoro-tts is a CLI harness but CMake defaults Apple executables to
  MACOSX_BUNDLE, so `install(TARGETS kokoro-tts RUNTIME)` failed configure with
  "no BUNDLE DESTINATION for MACOSX_BUNDLE executable" on every ios/tvos/
  visionos/macos target. Force MACOSX_BUNDLE OFF.
- Android: kokoro.cpp called ggml_backend_cpu_init() directly, which is an
  undefined symbol under -DGGML_BACKEND_DL (the CPU backend is a loadable
  module). Switch to the registry API (ggml_backend_load_all() +
  ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr)), matching
  omnivoice; works in both DL and statically-linked builds.

Compile-validated on MSVC (kokoro_lib builds); the Linux/Apple effects are CMake
config + a portable registry call requiring no backend SDK to be correct.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/kokoro/CMakeLists.txt | 16 ++++++++++++++++
 tools/kokoro/src/kokoro.cpp | 11 +++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/tools/kokoro/CMakeLists.txt b/tools/kokoro/CMakeLists.txt
index 7c4be7e4d..f276e7ce9 100644
--- a/tools/kokoro/CMakeLists.txt
+++ b/tools/kokoro/CMakeLists.txt
@@ -33,6 +33,16 @@ add_library(kokoro_lib STATIC
 target_include_directories(kokoro_lib PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
+# kokoro_lib is folded into the fused libelizainference SHARED library
+# (tools/omnivoice/CMakeLists.txt links it PRIVATE). When the parent build is
+# configured with BUILD_SHARED_LIBS=ON (e.g. the OpenVINO Linux CI job), the
+# static archive's objects must be position-independent or ld refuses to fold
+# them into a -shared object ("relocation R_X86_64_PC32 ... can not be used
+# when making a shared object; recompile with -fPIC"). PIC is not transitive
+# from the SHARED consumer, so set it on the static target itself — mirroring
+# eliza_voice_classifiers / omnivoice_lib in the sibling omnivoice subtree.
+set_target_properties(kokoro_lib PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
 # ggml + llama are already configured by the parent build; pulling them via
 # target_link_libraries gives us the include paths and the link line.
 target_link_libraries(kokoro_lib PUBLIC ggml)
@@ -48,6 +58,12 @@ target_compile_features(kokoro_lib PUBLIC cxx_std_17)
 # Standalone CLI harness — required by J2 verification (tools/voice-kokoro/).
 add_executable(kokoro-tts tools/kokoro-tts.cpp)
 target_link_libraries(kokoro-tts PRIVATE kokoro_lib)
+# kokoro-tts is a CLI harness, not a GUI app. On Apple platforms CMake defaults
+# executables to MACOSX_BUNDLE, and install(TARGETS ... RUNTIME) on a bundle
+# target fails configure with "install TARGETS given no BUNDLE DESTINATION for
+# MACOSX_BUNDLE executable" on every ios/tvos/visionos/macos build. Force the
+# bundle flag off so the plain RUNTIME install is valid on all platforms.
+set_target_properties(kokoro-tts PROPERTIES MACOSX_BUNDLE OFF)
 install(TARGETS kokoro-tts RUNTIME)
 
 # Server-mount handler: compiled into kokoro_lib only when the server target
diff --git a/tools/kokoro/src/kokoro.cpp b/tools/kokoro/src/kokoro.cpp
index 829840231..63f1df10b 100644
--- a/tools/kokoro/src/kokoro.cpp
+++ b/tools/kokoro/src/kokoro.cpp
@@ -213,9 +213,16 @@ kokoro_model_ptr kokoro_load_model(
     h.sample_rate        = gguf_i32(model->gguf, "kokoro.audio.sample_rate",   h.sample_rate);
 
     // Bind backend (CPU only for now — GGML graph below is CPU-friendly).
-    model->backend = ggml_backend_cpu_init();
+    // Use the registry API rather than ggml_backend_cpu_init(): under
+    // -DGGML_BACKEND_DL (the Android build) the CPU backend is a dynamically
+    // loaded module and ggml_backend_cpu_init() is not linked, so a direct call
+    // is an undefined symbol at link time. ggml_backend_load_all() is idempotent
+    // and registers the CPU device in both the DL and statically-linked builds,
+    // matching how the sibling omnivoice tool initializes its CPU backend.
+    ggml_backend_load_all();
+    model->backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
     if (!model->backend) {
-        err_out = "ggml_backend_cpu_init failed";
+        err_out = "ggml_backend_init_by_type(CPU) failed";
         return {nullptr, kokoro_model_deleter{}};
     }
 

From ec3f91cc5938c74340e85dca6d053dfaaa0417eb Mon Sep 17 00:00:00 2001
From: lalalune <elizamakesmagic@gmail.com>
Date: Wed, 24 Jun 2026 00:37:27 -0700
Subject: [PATCH 2/2] feat(omnivoice): token-by-token streaming vision describe
 (ABI v13)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `eliza_inference_describe_image_stream_open` + `eliza_inference_vision_stream_supported`
(ABI 12 -> 13). The open call runs the SAME mmproj prefill as `eliza_inference_describe_image`
(mtmd_tokenize + mtmd_helper_eval_chunks) but, instead of decoding the whole
description into a buffer, returns an `EliLlmStream *` primed with the image+prompt
KV. The caller then PULLS tokens with the existing `eliza_inference_llm_stream_next`
loop and frees the handle with `eliza_inference_llm_stream_close` — reusing the entire
streaming-LLM machinery, so a vision description streams token-by-token through the
same path as chat text (a pull model, so the host event loop yields between steps;
a callback/push model would block the caller for the whole decode).

The returned stream carries a greedy sampler + ELIZA_VISION_MAX_TOKENS cap and no MTP
engine (vision uses the plain fixed-KV decode path). Additive + gated on the existing
-DELIZA_ENABLE_VISION flag: a v12 caller is unaffected and a v12 library reports
vision_stream_supported() == 0, so loaders fall back to the buffered _describe_image.

Validated on Windows CPU (SmolVLM-500M mtmd): streams 256 token chunks with real OCR.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/omnivoice/include/eliza-inference-ffi.h |  46 ++++-
 tools/omnivoice/src/eliza-inference-ffi.cpp   | 161 ++++++++++++++++++
 2 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/tools/omnivoice/include/eliza-inference-ffi.h b/tools/omnivoice/include/eliza-inference-ffi.h
index 075660a60..00c34db97 100644
--- a/tools/omnivoice/include/eliza-inference-ffi.h
+++ b/tools/omnivoice/include/eliza-inference-ffi.h
@@ -134,6 +134,16 @@ extern "C" {
  * load and refuses to bind if they disagree.
  *
  * Changelog:
+ *   v13: token-by-token vision describe. `eliza_inference_vision_stream_supported()`
+ *        + `_describe_image_stream_open` run the SAME mmproj prefill as
+ *        `_describe_image`, but instead of decoding into a buffer the open call
+ *        returns an `EliLlmStream *` primed with the image + prompt KV. The caller
+ *        PULLS tokens with the existing `_llm_stream_next` loop and frees the handle
+ *        with `_llm_stream_close`, so the IMAGE_DESCRIPTION handler streams a
+ *        description through the SAME per-token pipe as chat text. Additive symbols —
+ *        a v12 caller is unaffected; a v12 library reports
+ *        `vision_stream_supported() == 0` and the loader falls back to the buffered
+ *        `_describe_image`. Gated on the same `-DELIZA_ENABLE_VISION=1` build flag.
  *   v12: ASR word timestamps folded into the fused ASR.
  *        `eliza_inference_asr_timestamps_supported()` + `_asr_transcribe_timed`
  *        run the SAME audio-in/text-out decode as `_asr_transcribe` and
@@ -203,9 +213,9 @@ extern "C" {
  *   v7: real Silero VAD (same symbol surface as v6).
  *   v6: fused wake-word, speaker, diarizer.
  */
-#define ELIZA_INFERENCE_ABI_VERSION 12
+#define ELIZA_INFERENCE_ABI_VERSION 13
 
-/* Returns a static, NUL-terminated string of the form "12" matching
+/* Returns a static, NUL-terminated string of the form "13" matching
  * ELIZA_INFERENCE_ABI_VERSION at the time the library was built. The
  * pointer is owned by the library — do NOT free. */
 const char * eliza_inference_abi_version(void);
@@ -1086,6 +1096,38 @@ int eliza_inference_describe_image(
     size_t max_text_bytes,
     char ** out_error);
 
+/* ---- Streaming mmproj vision describe (ABI v13, additive) --------- *
+ *
+ * Token-by-token vision. `_describe_image_stream_open` runs the SAME
+ * mmproj-prefill as `_describe_image` (mtmd_tokenize + mtmd_helper_eval_chunks),
+ * but instead of decoding the whole description into a buffer it returns an
+ * `EliLlmStream *` whose KV is primed with the image + prompt and whose sampler
+ * (greedy) + `max_tokens` (ELIZA_VISION_MAX_TOKENS) match `_describe_image`.
+ * The caller then PULLS tokens with the existing `eliza_inference_llm_stream_next`
+ * loop and releases the handle with `eliza_inference_llm_stream_close` — the
+ * exact same machinery (and JS FfiStreamingRunner) that drives chat text, so a
+ * description streams into the dashboard through one pipe with no event-loop
+ * blocking (each `_next` step yields between tokens). The returned stream has no
+ * MTP engine (vision uses the plain fixed-KV decode path).
+ *
+ * Gated on `-DELIZA_ENABLE_VISION=1` (same flag as `_describe_image`). A build
+ * without it returns 0 from `_vision_stream_supported()` and NULL (+ *out_error)
+ * from `_describe_image_stream_open`; the IMAGE_DESCRIPTION handler then falls
+ * back to the buffered `_describe_image`. */
+
+/* Capability probe: 1 when this build wires the streaming vision-describe path
+ * (ELIZA_ENABLE_VISION compiled in), 0 otherwise. Callers pick the streaming
+ * open + `_llm_stream_next` loop vs the buffered `_describe_image` off this. */
+int eliza_inference_vision_stream_supported(void);
+
+EliLlmStream * eliza_inference_describe_image_stream_open( /* NULL on failure */
+    EliInferenceContext * ctx,         /* required (NULL is rejected) */
+    const unsigned char * image_bytes, /* required; image bytes to decode */
+    size_t n_bytes,                    /* must be > 0 */
+    const char * mmproj_path,          /* required and non-empty */
+    const char * prompt,               /* NULL or "" selects a default prompt */
+    char ** out_error);                /* receives the failure message */
+
 /* ---- Tokenizer (ABI v9, additive) --------------------------------- *
  *
  * Expose `llama_tokenize` / `llama_detokenize` over the loaded text model's
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index dc74dd8eb..cdb8a1e4d 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -3878,6 +3878,167 @@ int eliza_inference_describe_image(
 #endif // ELIZA_ENABLE_VISION
 }
 
+/* ---- Streaming mmproj vision describe (ABI v13) ------------------- *
+ *
+ * Token-by-token vision: open primes an EliLlmStream's KV with the image +
+ * prompt (the same mtmd prefill as _describe_image), and the caller drives the
+ * existing _llm_stream_next loop to pull tokens — so vision streams through the
+ * exact same path (and JS FfiStreamingRunner) as chat text. The returned stream
+ * carries a greedy sampler + ELIZA_VISION_MAX_TOKENS cap and no MTP engine. */
+
+int eliza_inference_vision_stream_supported(void) {
+#if defined(ELIZA_ENABLE_VISION)
+    return 1; /* _describe_image_stream_open below is the real implementation */
+#else
+    return 0; /* _describe_image_stream_open below only sets *out_error */
+#endif // ELIZA_ENABLE_VISION
+}
+
+EliLlmStream * eliza_inference_describe_image_stream_open(
+    EliInferenceContext * ctx,
+    const unsigned char * image_bytes, /* image bytes, decoded by mtmd below */
+    size_t n_bytes,
+    const char * mmproj_path,
+    const char * prompt, /* NULL or "" selects the default describe prompt */
+    char ** out_error) {
+#if !defined(ELIZA_ENABLE_VISION) // stub build: always fails with an explanatory error
+    (void) ctx; (void) image_bytes; (void) n_bytes; (void) mmproj_path;
+    (void) prompt;
+    eliza_set_error(out_error,
+        "[libelizainference] describe_image_stream_open: this build was compiled "
+        "without ELIZA_ENABLE_VISION (eliza_inference_vision_stream_supported() == "
+        "0); use the buffered _describe_image path");
+    return nullptr;
+#else // ELIZA_ENABLE_VISION: real implementation
+    if (!ctx || !image_bytes || n_bytes == 0 || !mmproj_path ||
+        mmproj_path[0] == '\0') {
+        eliza_set_error(out_error,
+            "[libelizainference] describe_image_stream_open: invalid arguments");
+        return nullptr;
+    }
+
+    std::lock_guard<std::mutex> lock(ctx->llm_mutex); // held until this function returns
+    int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
+    if (rc != ELIZA_OK) return nullptr; // NOTE(review): relies on the helper having set *out_error — confirm
+    rc = eliza_ensure_vision_mtmd_locked(ctx, std::string(mmproj_path), out_error);
+    if (rc != ELIZA_OK) return nullptr; // NOTE(review): same assumption about *out_error — confirm
+
+    /* A fresh generation context (causal, no embeddings), owned by the returned
+     * stream and freed by eliza_inference_llm_stream_close. Same params as the
+     * buffered _describe_image so streamed and buffered describes decode
+     * identically. */
+    llama_context_params cparams = llama_context_default_params();
+    const int n_ctx_train = llama_model_n_ctx_train(ctx->llm_model);
+    int n_ctx = eliza_int_env_or_default("ELIZA_VISION_N_CTX", 4096);
+    if (n_ctx_train > 0 && n_ctx > n_ctx_train) n_ctx = n_ctx_train; // clamp to the trained context length
+    cparams.n_ctx = (uint32_t) n_ctx;
+    cparams.n_batch = (uint32_t) eliza_int_env_or_default("ELIZA_VISION_N_BATCH", 512);
+    cparams.n_ubatch = cparams.n_batch;
+    cparams.n_threads = eliza_thread_count(false);
+    cparams.n_threads_batch = eliza_thread_count(true);
+    cparams.flash_attn_type = eliza_llm_flash_attn_type();
+    llama_context * lctx = llama_init_from_model(ctx->llm_model, cparams);
+    if (!lctx) {
+        eliza_set_error(out_error,
+            "[libelizainference] describe_image_stream_open: failed to init context");
+        return nullptr;
+    }
+
+    llama_sampler * sampler = nullptr;
+    mtmd_bitmap * bitmap = nullptr;
+    mtmd_input_chunks * chunks = nullptr;
+    bool ok = false;
+    llama_pos n_past = 0; // passed by pointer to mtmd_helper_eval_chunks, then stored on the stream
+
+    do { // single-pass block: every `break` falls through to the shared cleanup below
+        const char * marker = mtmd_default_marker();
+        std::string user_prompt = prompt && prompt[0] != '\0'
+            ? std::string(prompt)
+            : std::string("Describe what is in this image.");
+        std::string prompt_text = // prepend the media marker unless the prompt already contains it
+            (marker && user_prompt.find(marker) != std::string::npos)
+                ? user_prompt
+                : (std::string(marker ? marker : "<__media__>") + "\n" + user_prompt);
+
+        bitmap = mtmd_helper_bitmap_init_from_buf(
+            ctx->vision_mtmd, image_bytes, n_bytes);
+        if (!bitmap) {
+            eliza_set_error(out_error,
+                "[libelizainference] describe_image_stream_open: image decode failed");
+            break;
+        }
+        chunks = mtmd_input_chunks_init();
+        if (!chunks) {
+            eliza_set_error(out_error,
+                "[libelizainference] describe_image_stream_open: chunks allocation failed");
+            break;
+        }
+        mtmd_input_text text = { prompt_text.c_str(), true, true }; // flags: presumably add_special, parse_special — confirm in mtmd.h
+        const mtmd_bitmap * bitmaps[] = { bitmap };
+        int32_t tok_rc = mtmd_tokenize(ctx->vision_mtmd, chunks, &text, bitmaps, 1);
+        if (tok_rc != 0) {
+            eliza_set_error(out_error,
+                "[libelizainference] describe_image_stream_open: mtmd_tokenize rc=" +
+                std::to_string(tok_rc));
+            break;
+        }
+
+        llama_memory_clear(llama_get_memory(lctx), true);
+        int32_t eval_rc = mtmd_helper_eval_chunks(
+            ctx->vision_mtmd, lctx, chunks, n_past, 0,
+            (int32_t) cparams.n_batch, true, &n_past);
+        if (eval_rc != 0) {
+            eliza_set_error(out_error,
+                "[libelizainference] describe_image_stream_open: mtmd_helper_eval_chunks rc=" +
+                std::to_string(eval_rc));
+            break;
+        }
+
+        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
+        sampler = llama_sampler_chain_init(sparams);
+        if (!sampler) {
+            eliza_set_error(out_error,
+                "[libelizainference] describe_image_stream_open: failed to init sampler");
+            break;
+        }
+        llama_sampler_chain_add(sampler, llama_sampler_init_greedy());
+        ok = true; // prefill done and sampler built; both are handed to the stream below
+    } while (false);
+
+    /* The bitmap + chunks are only needed for the prefill eval; the KV now holds
+     * the image + prompt, so release them (the lctx + sampler live on in the
+     * returned stream). */
+    if (chunks) mtmd_input_chunks_free(chunks);
+    if (bitmap) mtmd_bitmap_free(bitmap);
+
+    if (!ok) {
+        if (sampler) llama_sampler_free(sampler);
+        llama_free(lctx);
+        return nullptr;
+    }
+
+    EliLlmStream * stream = new (std::nothrow) EliLlmStream(); // nothrow: report OOM via out_error instead of throwing
+    if (!stream) {
+        llama_sampler_free(sampler);
+        llama_free(lctx);
+        eliza_set_error(out_error,
+            "[libelizainference] describe_image_stream_open: out of memory");
+        return nullptr;
+    }
+    stream->ctx = ctx;
+    stream->lctx = lctx;
+    stream->sampler = sampler;
+    stream->n_past = (int) n_past;
+    stream->generated = 0;
+    stream->max_tokens = eliza_int_env_or_default("ELIZA_VISION_MAX_TOKENS", 256); // env override, default 256
+    stream->eos = false;
+    /* mtp stays null — vision uses the plain fixed-KV decode path in
+     * _llm_stream_next, which samples from lctx (logits primed at -1 by
+     * mtmd_helper_eval_chunks above). */
+    return stream;
+#endif // ELIZA_ENABLE_VISION
+}
+
 /* ---- Tokenizer (ABI v9) ------------------------------------------- *
  *
  * llama_tokenize / llama_detokenize over the loaded text model's vocab, so the