Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions tools/kokoro/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ add_library(kokoro_lib STATIC
target_include_directories(kokoro_lib PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/include)

# kokoro_lib is folded into the fused libelizainference SHARED library
# (tools/omnivoice/CMakeLists.txt links it PRIVATE). When the parent build is
# configured with BUILD_SHARED_LIBS=ON (e.g. the OpenVINO Linux CI job), the
# static archive's objects must be position-independent or ld refuses to fold
# them into a -shared object ("relocation R_X86_64_PC32 ... can not be used
# when making a shared object; recompile with -fPIC"). PIC is not transitive
# from the SHARED consumer, so set it on the static target itself — mirroring
# eliza_voice_classifiers / omnivoice_lib in the sibling omnivoice subtree.
set_target_properties(kokoro_lib PROPERTIES POSITION_INDEPENDENT_CODE ON)

# ggml + llama are already configured by the parent build; pulling them via
# target_link_libraries gives us the include paths and the link line.
target_link_libraries(kokoro_lib PUBLIC ggml)
Expand All @@ -48,6 +58,12 @@ target_compile_features(kokoro_lib PUBLIC cxx_std_17)
# Standalone CLI harness — required by J2 verification (tools/voice-kokoro/).
add_executable(kokoro-tts tools/kokoro-tts.cpp)
target_link_libraries(kokoro-tts PRIVATE kokoro_lib)
# kokoro-tts is a CLI harness, not a GUI app. On Apple platforms the
# MACOSX_BUNDLE target property is initialized from CMAKE_MACOSX_BUNDLE (which
# CMake turns ON by default for iOS/tvOS/visionOS/watchOS), and
# install(TARGETS ... RUNTIME) on a bundle target fails configure with
# "install TARGETS given no BUNDLE DESTINATION for MACOSX_BUNDLE executable" —
# seen on the ios/tvos/visionos/macos builds. Force the bundle flag off so the
# plain RUNTIME install is valid on all platforms.
set_target_properties(kokoro-tts PROPERTIES MACOSX_BUNDLE OFF)
install(TARGETS kokoro-tts RUNTIME)

# Server-mount handler: compiled into kokoro_lib only when the server target
Expand Down
11 changes: 9 additions & 2 deletions tools/kokoro/src/kokoro.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,16 @@ kokoro_model_ptr kokoro_load_model(
h.sample_rate = gguf_i32(model->gguf, "kokoro.audio.sample_rate", h.sample_rate);

// Bind backend (CPU only for now — GGML graph below is CPU-friendly).
model->backend = ggml_backend_cpu_init();
// Use the registry API rather than ggml_backend_cpu_init(): under
// -DGGML_BACKEND_DL (the Android build) the CPU backend is a dynamically
// loaded module and ggml_backend_cpu_init() is not linked, so a direct call
// is an undefined symbol at link time. ggml_backend_load_all() is idempotent
// and registers the CPU device in both the DL and statically-linked builds,
// matching how the sibling omnivoice tool initializes its CPU backend.
ggml_backend_load_all();
model->backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!model->backend) {
err_out = "ggml_backend_cpu_init failed";
err_out = "ggml_backend_init_by_type(CPU) failed";
return {nullptr, kokoro_model_deleter{}};
}

Expand Down
46 changes: 44 additions & 2 deletions tools/omnivoice/include/eliza-inference-ffi.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,16 @@ extern "C" {
* load and refuses to bind if they disagree.
*
* Changelog:
* v13: token-by-token vision describe.
`eliza_inference_vision_stream_supported()` probes for the path;
`eliza_inference_describe_image_stream_open` runs the SAME mmproj-prefill as
`_describe_image`, but instead of decoding the whole description into a
buffer it returns an `EliLlmStream *` (greedy sampler,
ELIZA_VISION_MAX_TOKENS cap) that the caller pulls with the existing
`eliza_inference_llm_stream_next` loop and releases with
`eliza_inference_llm_stream_close`, so the IMAGE_DESCRIPTION handler streams
a description into the dashboard through the SAME per-token pipe as chat
text. Additive symbols — a v12 caller is unaffected; a v12 library reports
`vision_stream_supported() == 0` and the loader falls back to the buffered
`_describe_image`. Gated on the same `-DELIZA_ENABLE_VISION=1` build flag.
* v12: ASR word timestamps folded into the fused ASR.
* `eliza_inference_asr_timestamps_supported()` + `_asr_transcribe_timed`
* run the SAME audio-in/text-out decode as `_asr_transcribe` and
Expand Down Expand Up @@ -203,9 +213,9 @@ extern "C" {
* v7: real Silero VAD (same symbol surface as v6).
* v6: fused wake-word, speaker, diarizer.
*/
#define ELIZA_INFERENCE_ABI_VERSION 12
#define ELIZA_INFERENCE_ABI_VERSION 13

/* Returns a static, NUL-terminated string of the form "12" matching
/* Returns a static, NUL-terminated string of the form "13" matching
* ELIZA_INFERENCE_ABI_VERSION at the time the library was built. The
* pointer is owned by the library — do NOT free. */
const char * eliza_inference_abi_version(void);
Expand Down Expand Up @@ -1086,6 +1096,38 @@ int eliza_inference_describe_image(
size_t max_text_bytes,
char ** out_error);

/* ---- Streaming mmproj vision describe (ABI v13, additive) --------- *
*
* Token-by-token vision. `_describe_image_stream_open` runs the SAME
* mmproj-prefill as `_describe_image` (mtmd_tokenize + mtmd_helper_eval_chunks),
* but instead of decoding the whole description into a buffer it returns an
* `EliLlmStream *` whose KV is primed with the image + prompt and whose sampler
* (greedy) + `max_tokens` (ELIZA_VISION_MAX_TOKENS) match `_describe_image`.
* The caller then PULLS tokens with the existing `eliza_inference_llm_stream_next`
* loop and releases the handle with `eliza_inference_llm_stream_close` — the
* exact same machinery (and JS FfiStreamingRunner) that drives chat text, so a
* description streams into the dashboard through one pipe with no event-loop
* blocking (each `_next` step yields between tokens). The returned stream has no
* MTP engine (vision uses the plain fixed-KV decode path).
*
* Gated on `-DELIZA_ENABLE_VISION=1` (same flag as `_describe_image`). A build
* without it returns 0 from `_vision_stream_supported()` and NULL (+ *out_error)
* from `_describe_image_stream_open`; the IMAGE_DESCRIPTION handler then falls
* back to the buffered `_describe_image`. */

/* Capability probe: 1 when this build wires the streaming vision-describe path
* (ELIZA_ENABLE_VISION compiled in), 0 otherwise. Callers pick the streaming
* open + `_llm_stream_next` loop vs the buffered `_describe_image` off this.
* The answer is fixed at compile time and the call takes no context, so it can
* be made before any EliInferenceContext exists. */
int eliza_inference_vision_stream_supported(void);

/* Opens a streaming vision describe and returns a stream primed with the
* image + prompt.
*
* ctx          inference context; must be non-NULL.
* image_bytes  encoded image buffer; must be non-NULL.
* n_bytes      size of image_bytes in bytes; must be > 0.
* mmproj_path  path to the mmproj file; must be non-NULL and non-empty.
* prompt       optional; NULL or "" selects the built-in
*              "Describe what is in this image." prompt. The mtmd media
*              marker is prepended when the prompt does not already contain it.
* out_error    set to an error message on failure.
*
* Returns the stream handle on success (drive it with
* `eliza_inference_llm_stream_next`, release it with
* `eliza_inference_llm_stream_close`), or NULL on failure — including builds
* compiled without ELIZA_ENABLE_VISION. */
EliLlmStream * eliza_inference_describe_image_stream_open(
EliInferenceContext * ctx,
const unsigned char * image_bytes,
size_t n_bytes,
const char * mmproj_path,
const char * prompt,
char ** out_error);

/* ---- Tokenizer (ABI v9, additive) --------------------------------- *
*
* Expose `llama_tokenize` / `llama_detokenize` over the loaded text model's
Expand Down
161 changes: 161 additions & 0 deletions tools/omnivoice/src/eliza-inference-ffi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3878,6 +3878,167 @@ int eliza_inference_describe_image(
#endif // ELIZA_ENABLE_VISION
}

/* ---- Streaming mmproj vision describe (ABI v13) ------------------- *
*
* Token-by-token vision: open primes an EliLlmStream's KV with the image +
* prompt (the same mtmd prefill as _describe_image), and the caller drives the
* existing _llm_stream_next loop to pull tokens — so vision streams through the
* exact same path (and JS FfiStreamingRunner) as chat text. The returned stream
* carries a greedy sampler + ELIZA_VISION_MAX_TOKENS cap and no MTP engine. */

/* Capability probe for the streaming vision-describe path: reports 1 when the
 * library was built with ELIZA_ENABLE_VISION and 0 otherwise. */
int eliza_inference_vision_stream_supported(void) {
#ifdef ELIZA_ENABLE_VISION
    const int supported = 1;
#else
    const int supported = 0;
#endif
    return supported;
}

/* Opens a streaming vision describe: prefills a fresh llama context with the
* image + prompt and hands it back wrapped in an EliLlmStream.
*
* Arguments: `ctx`, `image_bytes` and `mmproj_path` must be non-null, `n_bytes`
* non-zero and `mmproj_path` non-empty; `prompt` may be null or empty, in which
* case "Describe what is in this image." is used.
*
* Returns the new stream on success. On any failure returns nullptr after
* releasing everything allocated here; the failure paths in this function
* report through `out_error` (the two `_locked` helpers are passed `out_error`
* too — presumably they fill it on failure; confirm against their definitions).
*
* Tunables read from the environment: ELIZA_VISION_N_CTX (default 4096, capped
* at the model's training context), ELIZA_VISION_N_BATCH (default 512) and
* ELIZA_VISION_MAX_TOKENS (default 256).
*
* `ctx->llm_mutex` is held from argument validation until return. */
EliLlmStream * eliza_inference_describe_image_stream_open(
EliInferenceContext * ctx,
const unsigned char * image_bytes,
size_t n_bytes,
const char * mmproj_path,
const char * prompt,
char ** out_error) {
#if !defined(ELIZA_ENABLE_VISION)
/* Vision compiled out: silence unused-parameter warnings and fail loudly so
* the caller falls back to the buffered path. */
(void) ctx; (void) image_bytes; (void) n_bytes; (void) mmproj_path;
(void) prompt;
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: this build was compiled "
"without ELIZA_ENABLE_VISION (eliza_inference_vision_stream_supported() == "
"0); use the buffered _describe_image path");
return nullptr;
#else
/* `prompt` is deliberately not validated here — it is optional. */
if (!ctx || !image_bytes || n_bytes == 0 || !mmproj_path ||
mmproj_path[0] == '\0') {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: invalid arguments");
return nullptr;
}

/* Model load, mtmd setup and the prefill below all run under the LLM mutex. */
std::lock_guard<std::mutex> lock(ctx->llm_mutex);
int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
if (rc != ELIZA_OK) return nullptr;
rc = eliza_ensure_vision_mtmd_locked(ctx, std::string(mmproj_path), out_error);
if (rc != ELIZA_OK) return nullptr;

/* A fresh generation context (causal, no embeddings), owned by the returned
* stream and freed by eliza_inference_llm_stream_close. Same params as the
* buffered _describe_image so streamed and buffered describes decode
* identically. */
llama_context_params cparams = llama_context_default_params();
const int n_ctx_train = llama_model_n_ctx_train(ctx->llm_model);
int n_ctx = eliza_int_env_or_default("ELIZA_VISION_N_CTX", 4096);
/* Never request more context than the model was trained with. */
if (n_ctx_train > 0 && n_ctx > n_ctx_train) n_ctx = n_ctx_train;
cparams.n_ctx = (uint32_t) n_ctx;
cparams.n_batch = (uint32_t) eliza_int_env_or_default("ELIZA_VISION_N_BATCH", 512);
cparams.n_ubatch = cparams.n_batch;
cparams.n_threads = eliza_thread_count(false);
cparams.n_threads_batch = eliza_thread_count(true);
cparams.flash_attn_type = eliza_llm_flash_attn_type();
llama_context * lctx = llama_init_from_model(ctx->llm_model, cparams);
if (!lctx) {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: failed to init context");
return nullptr;
}

/* Resources acquired inside the do/while below; all start null so the shared
* cleanup after the loop can release whichever ones were actually created. */
llama_sampler * sampler = nullptr;
mtmd_bitmap * bitmap = nullptr;
mtmd_input_chunks * chunks = nullptr;
bool ok = false;
llama_pos n_past = 0;

/* Single-pass block: every failure sets out_error and `break`s to the common
* cleanup; only the fall-through at the end sets ok = true. */
do {
const char * marker = mtmd_default_marker();
std::string user_prompt = prompt && prompt[0] != '\0'
? std::string(prompt)
: std::string("Describe what is in this image.");
/* Prepend the media marker on its own line unless the caller's prompt
* already contains it. */
std::string prompt_text =
(marker && user_prompt.find(marker) != std::string::npos)
? user_prompt
: (std::string(marker ? marker : "<__media__>") + "\n" + user_prompt);

bitmap = mtmd_helper_bitmap_init_from_buf(
ctx->vision_mtmd, image_bytes, n_bytes);
if (!bitmap) {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: image decode failed");
break;
}
chunks = mtmd_input_chunks_init();
if (!chunks) {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: chunks allocation failed");
break;
}
mtmd_input_text text = { prompt_text.c_str(), true, true };
const mtmd_bitmap * bitmaps[] = { bitmap };
int32_t tok_rc = mtmd_tokenize(ctx->vision_mtmd, chunks, &text, bitmaps, 1);
if (tok_rc != 0) {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: mtmd_tokenize rc=" +
std::to_string(tok_rc));
break;
}

/* Prefill: start from an empty KV and evaluate the image + prompt chunks;
* n_past is updated in place through the last argument. */
llama_memory_clear(llama_get_memory(lctx), true);
int32_t eval_rc = mtmd_helper_eval_chunks(
ctx->vision_mtmd, lctx, chunks, n_past, 0,
(int32_t) cparams.n_batch, true, &n_past);
if (eval_rc != 0) {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: mtmd_helper_eval_chunks rc=" +
std::to_string(eval_rc));
break;
}

/* Sampler chain holding only a greedy sampler. */
llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
sampler = llama_sampler_chain_init(sparams);
if (!sampler) {
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: failed to init sampler");
break;
}
llama_sampler_chain_add(sampler, llama_sampler_init_greedy());
ok = true;
} while (false);

/* The bitmap + chunks are only needed for the prefill eval; the KV now holds
* the image + prompt, so release them (the lctx + sampler live on in the
* returned stream). */
if (chunks) mtmd_input_chunks_free(chunks);
if (bitmap) mtmd_bitmap_free(bitmap);

/* Failure path: out_error was already set at the `break` site. */
if (!ok) {
if (sampler) llama_sampler_free(sampler);
llama_free(lctx);
return nullptr;
}

/* nothrow new so an allocation failure is reported through out_error rather
* than as an exception. */
EliLlmStream * stream = new (std::nothrow) EliLlmStream();
if (!stream) {
llama_sampler_free(sampler);
llama_free(lctx);
eliza_set_error(out_error,
"[libelizainference] describe_image_stream_open: out of memory");
return nullptr;
}
/* Hand lctx + sampler over to the stream and record where the prefill ended. */
stream->ctx = ctx;
stream->lctx = lctx;
stream->sampler = sampler;
stream->n_past = (int) n_past;
stream->generated = 0;
stream->max_tokens = eliza_int_env_or_default("ELIZA_VISION_MAX_TOKENS", 256);
stream->eos = false;
/* mtp stays null — vision uses the plain fixed-KV decode path in
* _llm_stream_next, which samples from lctx (logits primed at -1 by
* mtmd_helper_eval_chunks above). */
return stream;
#endif // ELIZA_ENABLE_VISION
}

/* ---- Tokenizer (ABI v9) ------------------------------------------- *
*
* llama_tokenize / llama_detokenize over the loaded text model's vocab, so the
Expand Down
Loading