From fa89d0fd56441342b2baf1e6d4fb17247e7dbe53 Mon Sep 17 00:00:00 2001 From: lalalune Date: Mon, 22 Jun 2026 13:13:15 -0700 Subject: [PATCH 1/2] fix(kokoro): build the kokoro tool across the full CI matrix (shared/Apple/Android-DL) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kokoro subtree failed three distinct CI lanes, none backend-specific: - PIC: kokoro_lib is a STATIC archive folded PRIVATE into the fused SHARED libelizainference.so, but it never set POSITION_INDEPENDENT_CODE, so ld rejected its objects on every BUILD_SHARED_LIBS=ON link ("recompile with -fPIC", R_X86_64_PC32 on x86-64 / R_AARCH64_ADR_PREL_PG_HI21 on arm64) — breaking the openvino, sycl, vulkan and virtgpu builds. Set PIC ON, mirroring eliza_voice_classifiers in the sibling omnivoice subtree. - Apple: kokoro-tts is a CLI harness but CMake defaults Apple executables to MACOSX_BUNDLE, so `install(TARGETS kokoro-tts RUNTIME)` failed configure with "no BUNDLE DESTINATION for MACOSX_BUNDLE executable" on every ios/tvos/ visionos/macos target. Force MACOSX_BUNDLE OFF. - Android: kokoro.cpp called ggml_backend_cpu_init() directly, which is an undefined symbol under -DGGML_BACKEND_DL (the CPU backend is a loadable module). Switch to the registry API (ggml_backend_load_all() + ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr)), matching omnivoice; works in both DL and statically-linked builds. Compile-validated on MSVC (kokoro_lib builds); the Linux/Apple effects are CMake config + a portable registry call requiring no backend SDK to be correct. 
Co-Authored-By: Claude Opus 4.8 --- tools/kokoro/CMakeLists.txt | 16 ++++++++++++++++ tools/kokoro/src/kokoro.cpp | 11 +++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/kokoro/CMakeLists.txt b/tools/kokoro/CMakeLists.txt index 7c4be7e4d..f276e7ce9 100644 --- a/tools/kokoro/CMakeLists.txt +++ b/tools/kokoro/CMakeLists.txt @@ -33,6 +33,16 @@ add_library(kokoro_lib STATIC target_include_directories(kokoro_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) +# kokoro_lib is folded into the fused libelizainference SHARED library +# (tools/omnivoice/CMakeLists.txt links it PRIVATE). When the parent build is +# configured with BUILD_SHARED_LIBS=ON (e.g. the OpenVINO Linux CI job), the +# static archive's objects must be position-independent or ld refuses to fold +# them into a -shared object ("relocation R_X86_64_PC32 ... can not be used +# when making a shared object; recompile with -fPIC"). PIC is not transitive +# from the SHARED consumer, so set it on the static target itself — mirroring +# eliza_voice_classifiers / omnivoice_lib in the sibling omnivoice subtree. +set_target_properties(kokoro_lib PROPERTIES POSITION_INDEPENDENT_CODE ON) + # ggml + llama are already configured by the parent build; pulling them via # target_link_libraries gives us the include paths and the link line. target_link_libraries(kokoro_lib PUBLIC ggml) @@ -48,6 +58,12 @@ target_compile_features(kokoro_lib PUBLIC cxx_std_17) # Standalone CLI harness — required by J2 verification (tools/voice-kokoro/). add_executable(kokoro-tts tools/kokoro-tts.cpp) target_link_libraries(kokoro-tts PRIVATE kokoro_lib) +# kokoro-tts is a CLI harness, not a GUI app. On Apple platforms CMake defaults +# executables to MACOSX_BUNDLE, and install(TARGETS ... RUNTIME) on a bundle +# target fails configure with "install TARGETS given no BUNDLE DESTINATION for +# MACOSX_BUNDLE executable" on every ios/tvos/visionos/macos build. 
Force the +# bundle flag off so the plain RUNTIME install is valid on all platforms. +set_target_properties(kokoro-tts PROPERTIES MACOSX_BUNDLE OFF) install(TARGETS kokoro-tts RUNTIME) # Server-mount handler: compiled into kokoro_lib only when the server target diff --git a/tools/kokoro/src/kokoro.cpp b/tools/kokoro/src/kokoro.cpp index 829840231..63f1df10b 100644 --- a/tools/kokoro/src/kokoro.cpp +++ b/tools/kokoro/src/kokoro.cpp @@ -213,9 +213,16 @@ kokoro_model_ptr kokoro_load_model( h.sample_rate = gguf_i32(model->gguf, "kokoro.audio.sample_rate", h.sample_rate); // Bind backend (CPU only for now — GGML graph below is CPU-friendly). - model->backend = ggml_backend_cpu_init(); + // Use the registry API rather than ggml_backend_cpu_init(): under + // -DGGML_BACKEND_DL (the Android build) the CPU backend is a dynamically + // loaded module and ggml_backend_cpu_init() is not linked, so a direct call + // is an undefined symbol at link time. ggml_backend_load_all() is idempotent + // and registers the CPU device in both the DL and statically-linked builds, + // matching how the sibling omnivoice tool initializes its CPU backend. + ggml_backend_load_all(); + model->backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (!model->backend) { - err_out = "ggml_backend_cpu_init failed"; + err_out = "ggml_backend_init_by_type(CPU) failed"; return {nullptr, kokoro_model_deleter{}}; } From ec3f91cc5938c74340e85dca6d053dfaaa0417eb Mon Sep 17 00:00:00 2001 From: lalalune Date: Wed, 24 Jun 2026 00:37:27 -0700 Subject: [PATCH 2/2] feat(omnivoice): token-by-token streaming vision describe (ABI v13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `eliza_inference_describe_image_stream_open` + `eliza_inference_vision_stream_supported` (ABI 12 -> 13). 
The open call runs the SAME mmproj prefill as `eliza_inference_describe_image` (mtmd_tokenize + mtmd_helper_eval_chunks) but, instead of decoding the whole description into a buffer, returns an `EliLlmStream *` primed with the image+prompt KV. The caller then PULLS tokens with the existing `eliza_inference_llm_stream_next` loop and frees the handle with `eliza_inference_llm_stream_close` — reusing the entire streaming-LLM machinery, so a vision description streams token-by-token through the same path as chat text (a pull model, so the host event loop yields between steps; a callback/push model would block the caller for the whole decode). The returned stream carries a greedy sampler + ELIZA_VISION_MAX_TOKENS cap and no MTP engine (vision uses the plain fixed-KV decode path). Additive + gated on the existing -DELIZA_ENABLE_VISION flag: a v12 caller is unaffected and a v12 library reports vision_stream_supported() == 0, so loaders fall back to the buffered _describe_image. Validated on Windows CPU (SmolVLM-500M mtmd): streams 256 token chunks with real OCR. Co-Authored-By: Claude Opus 4.8 --- tools/omnivoice/include/eliza-inference-ffi.h | 46 ++++- tools/omnivoice/src/eliza-inference-ffi.cpp | 161 ++++++++++++++++++ 2 files changed, 205 insertions(+), 2 deletions(-) diff --git a/tools/omnivoice/include/eliza-inference-ffi.h b/tools/omnivoice/include/eliza-inference-ffi.h index 075660a60..00c34db97 100644 --- a/tools/omnivoice/include/eliza-inference-ffi.h +++ b/tools/omnivoice/include/eliza-inference-ffi.h @@ -134,6 +134,16 @@ extern "C" { * load and refuses to bind if they disagree. * * Changelog: + * v13: token-by-token vision describe. 
`eliza_inference_vision_stream_supported()` + * + `_describe_image_stream_open` run the SAME mmproj prefill as + * `_describe_image`, but return an `EliLlmStream *` primed with the image and + * prompt KV instead of decoding into a buffer; the caller pulls tokens with + * `eliza_inference_llm_stream_next` and frees the handle with + * `eliza_inference_llm_stream_close`, so the IMAGE_DESCRIPTION handler streams + * through the SAME per-token pipe as chat text. Additive symbols — a v12 + * caller is unaffected; a v12 library reports `vision_stream_supported() == 0` + * and the loader falls back to the buffered `_describe_image`. Gated on the + * same `-DELIZA_ENABLE_VISION=1` build flag. * v12: ASR word timestamps folded into the fused ASR. * `eliza_inference_asr_timestamps_supported()` + `_asr_transcribe_timed` * run the SAME audio-in/text-out decode as `_asr_transcribe` and @@ -203,9 +213,9 @@ extern "C" { * v7: real Silero VAD (same symbol surface as v6). * v6: fused wake-word, speaker, diarizer. */ -#define ELIZA_INFERENCE_ABI_VERSION 12 +#define ELIZA_INFERENCE_ABI_VERSION 13 -/* Returns a static, NUL-terminated string of the form "12" matching +/* Returns a static, NUL-terminated string of the form "13" matching * ELIZA_INFERENCE_ABI_VERSION at the time the library was built. The * pointer is owned by the library — do NOT free. */ const char * eliza_inference_abi_version(void); @@ -1086,6 +1096,38 @@ int eliza_inference_describe_image( size_t max_text_bytes, char ** out_error); +/* ---- Streaming mmproj vision describe (ABI v13, additive) --------- * + * + * Token-by-token vision. `_describe_image_stream_open` runs the SAME + * mmproj-prefill as `_describe_image` (mtmd_tokenize + mtmd_helper_eval_chunks), + * but instead of decoding the whole description into a buffer it returns an + * `EliLlmStream *` whose KV is primed with the image + prompt and whose sampler + * (greedy) + `max_tokens` (ELIZA_VISION_MAX_TOKENS) match `_describe_image`. 
+ * The caller then PULLS tokens with the existing `eliza_inference_llm_stream_next` + * loop and releases the handle with `eliza_inference_llm_stream_close` — the + * exact same machinery (and JS FfiStreamingRunner) that drives chat text, so a + * description streams into the dashboard through one pipe with no event-loop + * blocking (each `_next` step yields between tokens). The returned stream has no + * MTP engine (vision uses the plain fixed-KV decode path). + * + * Gated on `-DELIZA_ENABLE_VISION=1` (same flag as `_describe_image`). A build + * without it returns 0 from `_vision_stream_supported()` and NULL (+ *out_error) + * from `_describe_image_stream_open`; the IMAGE_DESCRIPTION handler then falls + * back to the buffered `_describe_image`. */ + +/* Capability probe: 1 when this build wires the streaming vision-describe path + * (ELIZA_ENABLE_VISION compiled in), 0 otherwise. Callers pick the streaming + * open + `_llm_stream_next` loop vs the buffered `_describe_image` off this. 
*/ +int eliza_inference_vision_stream_supported(void); + +EliLlmStream * eliza_inference_describe_image_stream_open( + EliInferenceContext * ctx, + const unsigned char * image_bytes, + size_t n_bytes, + const char * mmproj_path, + const char * prompt, + char ** out_error); + /* ---- Tokenizer (ABI v9, additive) --------------------------------- * * * Expose `llama_tokenize` / `llama_detokenize` over the loaded text model's diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp index dc74dd8eb..cdb8a1e4d 100644 --- a/tools/omnivoice/src/eliza-inference-ffi.cpp +++ b/tools/omnivoice/src/eliza-inference-ffi.cpp @@ -3878,6 +3878,167 @@ int eliza_inference_describe_image( #endif // ELIZA_ENABLE_VISION } +/* ---- Streaming mmproj vision describe (ABI v13) ------------------- * + * + * Token-by-token vision: open primes an EliLlmStream's KV with the image + + * prompt (the same mtmd prefill as _describe_image), and the caller drives the + * existing _llm_stream_next loop to pull tokens — so vision streams through the + * exact same path (and JS FfiStreamingRunner) as chat text. The returned stream + * carries a greedy sampler + ELIZA_VISION_MAX_TOKENS cap and no MTP engine. 
*/ + +int eliza_inference_vision_stream_supported(void) { +#if defined(ELIZA_ENABLE_VISION) + return 1; +#else + return 0; +#endif +} + +EliLlmStream * eliza_inference_describe_image_stream_open( + EliInferenceContext * ctx, + const unsigned char * image_bytes, + size_t n_bytes, + const char * mmproj_path, + const char * prompt, + char ** out_error) { +#if !defined(ELIZA_ENABLE_VISION) + (void) ctx; (void) image_bytes; (void) n_bytes; (void) mmproj_path; + (void) prompt; + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: this build was compiled " + "without ELIZA_ENABLE_VISION (eliza_inference_vision_stream_supported() == " + "0); use the buffered _describe_image path"); + return nullptr; +#else + if (!ctx || !image_bytes || n_bytes == 0 || !mmproj_path || + mmproj_path[0] == '\0') { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: invalid arguments"); + return nullptr; + } + + std::lock_guard lock(ctx->llm_mutex); + int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error); + if (rc != ELIZA_OK) return nullptr; + rc = eliza_ensure_vision_mtmd_locked(ctx, std::string(mmproj_path), out_error); + if (rc != ELIZA_OK) return nullptr; + + /* A fresh generation context (causal, no embeddings), owned by the returned + * stream and freed by eliza_inference_llm_stream_close. Same params as the + * buffered _describe_image so streamed and buffered describes decode + * identically. 
*/ + llama_context_params cparams = llama_context_default_params(); + const int n_ctx_train = llama_model_n_ctx_train(ctx->llm_model); + int n_ctx = eliza_int_env_or_default("ELIZA_VISION_N_CTX", 4096); + if (n_ctx_train > 0 && n_ctx > n_ctx_train) n_ctx = n_ctx_train; + cparams.n_ctx = (uint32_t) n_ctx; + cparams.n_batch = (uint32_t) eliza_int_env_or_default("ELIZA_VISION_N_BATCH", 512); + cparams.n_ubatch = cparams.n_batch; + cparams.n_threads = eliza_thread_count(false); + cparams.n_threads_batch = eliza_thread_count(true); + cparams.flash_attn_type = eliza_llm_flash_attn_type(); + llama_context * lctx = llama_init_from_model(ctx->llm_model, cparams); + if (!lctx) { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: failed to init context"); + return nullptr; + } + + llama_sampler * sampler = nullptr; + mtmd_bitmap * bitmap = nullptr; + mtmd_input_chunks * chunks = nullptr; + bool ok = false; + llama_pos n_past = 0; + + do { + const char * marker = mtmd_default_marker(); + std::string user_prompt = prompt && prompt[0] != '\0' + ? std::string(prompt) + : std::string("Describe what is in this image."); + std::string prompt_text = + (marker && user_prompt.find(marker) != std::string::npos) + ? user_prompt + : (std::string(marker ? 
marker : "<__media__>") + "\n" + user_prompt); + + bitmap = mtmd_helper_bitmap_init_from_buf( + ctx->vision_mtmd, image_bytes, n_bytes); + if (!bitmap) { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: image decode failed"); + break; + } + chunks = mtmd_input_chunks_init(); + if (!chunks) { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: chunks allocation failed"); + break; + } + mtmd_input_text text = { prompt_text.c_str(), true, true }; + const mtmd_bitmap * bitmaps[] = { bitmap }; + int32_t tok_rc = mtmd_tokenize(ctx->vision_mtmd, chunks, &text, bitmaps, 1); + if (tok_rc != 0) { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: mtmd_tokenize rc=" + + std::to_string(tok_rc)); + break; + } + + llama_memory_clear(llama_get_memory(lctx), true); + int32_t eval_rc = mtmd_helper_eval_chunks( + ctx->vision_mtmd, lctx, chunks, n_past, 0, + (int32_t) cparams.n_batch, true, &n_past); + if (eval_rc != 0) { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: mtmd_helper_eval_chunks rc=" + + std::to_string(eval_rc)); + break; + } + + llama_sampler_chain_params sparams = llama_sampler_chain_default_params(); + sampler = llama_sampler_chain_init(sparams); + if (!sampler) { + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: failed to init sampler"); + break; + } + llama_sampler_chain_add(sampler, llama_sampler_init_greedy()); + ok = true; + } while (false); + + /* The bitmap + chunks are only needed for the prefill eval; the KV now holds + * the image + prompt, so release them (the lctx + sampler live on in the + * returned stream). 
*/ + if (chunks) mtmd_input_chunks_free(chunks); + if (bitmap) mtmd_bitmap_free(bitmap); + + if (!ok) { + if (sampler) llama_sampler_free(sampler); + llama_free(lctx); + return nullptr; + } + + EliLlmStream * stream = new (std::nothrow) EliLlmStream(); + if (!stream) { + llama_sampler_free(sampler); + llama_free(lctx); + eliza_set_error(out_error, + "[libelizainference] describe_image_stream_open: out of memory"); + return nullptr; + } + stream->ctx = ctx; + stream->lctx = lctx; + stream->sampler = sampler; + stream->n_past = (int) n_past; + stream->generated = 0; + stream->max_tokens = eliza_int_env_or_default("ELIZA_VISION_MAX_TOKENS", 256); + stream->eos = false; + /* mtp stays null — vision uses the plain fixed-KV decode path in + * _llm_stream_next, which samples from lctx (logits primed at -1 by + * mtmd_helper_eval_chunks above). */ + return stream; +#endif // ELIZA_ENABLE_VISION +} + /* ---- Tokenizer (ABI v9) ------------------------------------------- * * * llama_tokenize / llama_detokenize over the loaded text model's vocab, so the