diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index cd9666a21..0eb9dad8a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1990,7 +1990,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla uint32_t cell_range_begin = cells.size(); for (uint32_t i = 0; i < cells.size(); ++i) { - if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) { + bool add_cell = true; + + add_cell = add_cell && !cells.is_empty(i); + add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id)); + + // check the cell is not SWA-masked + if (add_cell && seq_id != -1) { + const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id)); + + add_cell = !is_masked; + } + + if (add_cell) { ++cell_count; if (cell_range_begin == cells.size()) { cell_range_begin = i; @@ -2246,7 +2258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 sinfo = find_slot(ubatch, false); if (sinfo.empty()) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + LLAMA_LOG_ERROR("%s: failed to find %d available cells in kv cache\n", __func__, cell_count); return false; } diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt index 72088dfa0..038fa5be4 100644 --- a/tools/omnivoice/CMakeLists.txt +++ b/tools/omnivoice/CMakeLists.txt @@ -78,6 +78,19 @@ set(OMNIVOICE_CORE_SOURCES # llama + mtmd into a single ABI-stable C surface. set(OMNIVOICE_FFI_SOURCES src/eliza-inference-ffi.cpp + # Multi-runtime streaming-LLM backend seam (cutover plan M3). The selector + # is always compiled; it is inert until a -DELIZA_ENABLE_* accelerator + # backend below registers itself, so the default build keeps the in-tree + # llama.cpp path. + src/llm-backend-selector.cpp + # Per-op backend seams (cutover M3+). 
Each modality's selector reuses the + # shared eliza_backend::Registry (backend-registry.h) and is inert until a + # gated backend registers — so the default build keeps the ggml path per-op. + src/embed-backend-selector.cpp + src/vision-backend-selector.cpp + src/asr-backend-selector.cpp + src/tts-backend-selector.cpp + src/eot-backend-selector.cpp ) # Vendored standalone voice-classifier forward graphs (pure scalar C, no @@ -220,6 +233,24 @@ endif() # (the fused build links mtmd anyway), opt out with -DELIZA_ENABLE_VISION=OFF. option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON) +# ELIZA_ENABLE_LITERT — compile the LiteRT C-API per-op backends (embed today; +# cutover plan M4 — Android NPU: Tensor / Qualcomm QNN / MediaTek NeuroPilot). +# OFF by default: the per-op selectors register no LiteRT backend and every op +# keeps the in-tree ggml path. ON requires the LiteRT C runtime SDK +# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI +# default. The streaming-LLM backend is ELIZA_ENABLE_LITERT_LM. See docs/multi-backend-ffi-seam.md. +option(ELIZA_ENABLE_LITERT "Build the LiteRT C-API per-op backends, e.g. embed (M4)" OFF) + +# ELIZA_ENABLE_LITERT_LM — the streaming-LLM backend on the heavier LiteRT-LM +# Engine SDK (litert::lm), separate from the LiteRT C runtime above. OFF until +# that SDK is built; point -DELIZA_LITERT_LM_SDK_DIR / -DELIZA_LITERT_LM_LIBS at it. +option(ELIZA_ENABLE_LITERT_LM "Build the LiteRT-LM in-process streaming-LLM backend" OFF) + +# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend +# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and +# requires the MLX / CoreML toolchain. See docs/multi-backend-ffi-seam.md. 
+option(ELIZA_ENABLE_MLX "Build the CoreML/MLX in-process LLM backend (M5)" OFF) + if(TARGET mtmd) add_library(elizainference SHARED ${OMNIVOICE_CORE_SOURCES} @@ -271,6 +302,65 @@ if(TARGET mtmd) ${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include) target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_KOKORO) endif() + # ── Multi-runtime streaming-LLM accelerator backends (cutover M4/M5) ──── + # The M3 selector (src/llm-backend-selector.cpp) is always compiled in via + # OMNIVOICE_FFI_SOURCES. These accelerator backends each link an external + # SDK, so they are opt-in. When a gate is OFF its source is not compiled, + # the selector's `#ifdef`-guarded factory declaration + registration drop + # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the + # default desktop/CI build is byte-for-byte the pre-seam behavior. + if(ELIZA_ENABLE_LITERT) + # LiteRT C-API per-op backends (embed today; vision/etc. as artifacts + # ship). SDK = the LiteRT C runtime (github.com/google-ai-edge/LiteRT, + # libLiteRt.so + the GPU/NPU delegate). Point at a built SDK with + # -DELIZA_LITERT_SDK_DIR= and link with -DELIZA_LITERT_LIBS=LiteRt. + target_sources(elizainference PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-embed-backend.cpp) + target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT) + if(ELIZA_LITERT_SDK_DIR) + target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include) + target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib) + endif() + if(ELIZA_LITERT_LIBS) + target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS}) + endif() + endif() + if(ELIZA_ENABLE_LITERT_LM) + # The streaming-LLM backend needs the heavier LiteRT-LM Engine SDK + # (litert::lm, github.com/google-ai-edge/LiteRT-LM) — separate from the + # LiteRT C runtime above. Point at it with -DELIZA_LITERT_LM_SDK_DIR / + # -DELIZA_LITERT_LM_LIBS. 
+ target_sources(elizainference PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp) + target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT_LM) + if(ELIZA_LITERT_LM_SDK_DIR) + target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_LM_SDK_DIR}/include) + target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_LM_SDK_DIR}/lib) + endif() + if(ELIZA_LITERT_LM_LIBS) + target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LM_LIBS}) + endif() + endif() + if(ELIZA_ENABLE_MLX) + if(NOT APPLE) + message(FATAL_ERROR + "ELIZA_ENABLE_MLX requires an Apple host (CoreML/MLX are Apple-only).") + endif() + target_sources(elizainference PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/mlx-coreml-backend.mm) + target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_MLX) + # MLX C API (ml-explore/mlx-c) via -DELIZA_MLX_C_DIR / -DELIZA_MLX_LIBS, + # plus the system CoreML / Metal / Foundation frameworks. + if(ELIZA_MLX_C_DIR) + target_include_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/include) + target_link_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/lib) + endif() + if(ELIZA_MLX_LIBS) + target_link_libraries(elizainference PRIVATE ${ELIZA_MLX_LIBS}) + endif() + target_link_libraries(elizainference PRIVATE + "-framework Foundation" "-framework CoreML" "-framework Metal") + endif() set_target_properties(elizainference PROPERTIES OUTPUT_NAME elizainference POSITION_INDEPENDENT_CODE ON) diff --git a/tools/omnivoice/src/SESSION-OPS-TODO.md b/tools/omnivoice/src/SESSION-OPS-TODO.md new file mode 100644 index 000000000..7095b8952 --- /dev/null +++ b/tools/omnivoice/src/SESSION-OPS-TODO.md @@ -0,0 +1,159 @@ +# Session-op backend seam — design (NOT implemented) + +The per-op backend seam (`backend-registry.h` + `-backend.h` + +`-backend-selector.cpp` + a chokepoint at the top of the FFI fn) is now in +place for the **one-shot** ops: + +| modality | FFI fn | header / selector | env key | artifact dir | 
+|----------|---------------------------------|------------------------------|-----------------------|--------------------| +| embed | `eliza_inference_embed` | `embed-backend.*` | `ELIZA_EMBED_BACKEND` | `/embedding/` | +| vision | `eliza_inference_describe_image`| `vision-backend.*` | `ELIZA_VISION_BACKEND`| `/vision/` | +| asr | `eliza_inference_asr_transcribe`| `asr-backend.*` | `ELIZA_ASR_BACKEND` | `/asr/` | +| tts | `eliza_inference_tts_synthesize`| `tts-backend.*` | `ELIZA_TTS_BACKEND` | `/tts/` | +| eot | `eliza_inference_llm_eot_score` | `eot-backend.*` | `ELIZA_EOT_BACKEND` | `/eot/` | + +A one-shot op is stateless across calls: select → (delegate | fall through to +ggml) on every call. There is nothing to keep alive between calls, so the seam +is a single chokepoint at the top of the fn. + +The **session** ops are different: `vad`, `wakeword`, `speaker`, `diariz` each +`_open` a native handle (`EliVad *`, `EliWakeword *`, `EliSpeaker *`, +`EliDiariz *`) that persists across many `_segment`/`_detect`/`_embed` calls and +is torn down with `_close`/`_reset`. The seam has to follow that lifecycle, not +re-select per call. This file records HOW to extend the seam to them. **None of +the below is implemented yet.** + +## The shape of a session op (today, in-tree only) + +Each session modality exposes, e.g. for VAD: + +```c +EliVad * eliza_inference_vad_open(EliInferenceContext * ctx, /* params */, char ** out_error); +int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error); +int eliza_inference_vad_reset(EliVad * vad, char ** out_error); +void eliza_inference_vad_close(EliVad * vad); +``` + +`EliVad` (and the wakeword/speaker/diariz equivalents) is the in-tree handle +struct defined in `eliza-inference-ffi.cpp`. Its in-tree fields stay exactly as +they are; the seam is **additive** — one extra pointer. 
+ +## Extending the seam to a session op + +For each session modality `<modality>` (vad | wakeword | speaker | diariz): + +### 1. A session factory interface — `<modality>-backend.h` + +Mirror the one-shot factory's four common probes, but the forward methods mirror +the **session** ABI 1:1 instead of a single one-shot fn. The factory does NOT +own the handle struct; it produces and operates on an opaque backend-session: + +```cpp +struct VadBackendFactory { + virtual ~VadBackendFactory() = default; + virtual const char * name() const = 0; + virtual bool available() const = 0; + virtual bool can_serve(const char * bundle_dir) const = 0; // probes <bundle>/vad/ + virtual int preference_rank() const { return 0; } + + // Lifecycle, mirroring the FFI session ABI 1:1. The factory returns an + // opaque backend-session pointer it owns; the FFI stashes it on the Eli* + // handle. A NULL return + *out_error is a hard open failure. + virtual void * open(EliInferenceContext * ctx, /* same params as eliza_inference_vad_open */, + char ** out_error) = 0; + virtual int segment(void * session, const float * pcm, size_t n, /* out */, char ** out_error) = 0; + virtual int reset(void * session, char ** out_error) = 0; + virtual void close(void * session) = 0; +}; +``` + +Plus the same free-functions as the one-shot seam: +`vad_backend_register`, `vad_backend_register_builtins` (EMPTY for now — no +LiteRT session backend exists), `vad_backend_select(bundle_dir, out_error)`, +backed by an `eliza_backend::Registry<VadBackendFactory>` in +`<modality>-backend-selector.cpp` with env keys `ELIZA_VAD_BACKEND` → `ELIZA_BACKEND` +and modality `"vad"`. Artifact probe dir `<bundle>/vad/` (resp. `wakeword/`, +`speaker/`, `diariz/`). + +### 2. A backend-session pointer on the Eli* handle + +The selection happens ONCE, at `_open`, not per call. Add one field to the +in-tree handle struct: + +```cpp +struct EliVad { + /* ... existing in-tree fields, unchanged ... */ + + /* Backend seam (additive). 
When non-null, this handle is served by an + * accelerator backend and every op delegates to it; the in-tree fields + * above are then unused. When null, the in-tree ggml path owns the handle. */ + VadBackendFactory * be = nullptr; // the factory that opened be_session + void * be_session = nullptr; // factory-owned backend session +}; +``` + +### 3. Select at `_open` + +In `eliza_inference_vad_open`, after the existing arg validation and before the +in-tree handle is built: + +```cpp +char * be_error = nullptr; +VadBackendFactory * be = vad_backend_select(llm_backend_context_bundle_dir(ctx), &be_error); +if (be_error) { eliza_set_error(out_error, std::string(be_error)); std::free(be_error); + return /* NULL handle */; } +if (be) { + void * sess = be->open(ctx, /* params */, out_error); + if (!sess) return /* NULL handle — open failed, out_error already set */; + EliVad * h = new EliVad(); + h->be = be; + h->be_session = sess; + return h; +} +/* else: fall through and build the in-tree handle exactly as today. */ +``` + +### 4. A guard at the TOP of each `_segment` / `_reset` / `_close` + +Each per-call op checks the backend pointer and delegates before touching any +in-tree state: + +```cpp +int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error) { + if (!vad) { /* invalid-arg as today */ } + if (vad->be) { // <-- guard + return vad->be->segment(vad->be_session, pcm, n, /* out */, out_error); + } + /* ... existing in-tree ggml segment body, unchanged ... */ +} + +void eliza_inference_vad_close(EliVad * vad) { + if (!vad) return; + if (vad->be) { vad->be->close(vad->be_session); delete vad; return; } // <-- guard + /* ... existing in-tree teardown, then delete vad ... */ +} +``` + +`_reset` follows the same guard pattern. + +## Why this shape (vs. 
re-selecting per call) + +- **Selection is per-session, not per-call.** A session's backend is fixed at + `_open`; you cannot have `_segment` cross from the ggml path to LiteRT + mid-session because the KV/feature state lives in the (in-tree OR backend) + session, not on the FFI boundary. The one pointer captures that binding. +- **Hard-fail localizes to `_open`.** A bundle-invalid override surfaces once, + where the caller is already prepared to handle a NULL handle, instead of on + every `_segment`. +- **Additive + inert.** With no session backend registered (the case today), + `_open`'s `select()` returns nullptr, `be`/`be_session` stay null, and every + guard is a no-op — the in-tree path is byte-for-byte unchanged. Same + inert-by-default contract as the one-shot seam. + +## Status + +- One-shot seam: embed (with a LiteRT builtin), vision/asr/tts/eot (inert, + no builtin) — **done**. +- Session seam (vad/wakeword/speaker/diariz): **not implemented.** No + `<modality>-backend.{h,cpp}`, no handle field, no `_open` select, no per-call + guards exist yet. This file is the spec for when a session backend lands. diff --git a/tools/omnivoice/src/asr-backend-selector.cpp b/tools/omnivoice/src/asr-backend-selector.cpp new file mode 100644 index 000000000..7513e7d9d --- /dev/null +++ b/tools/omnivoice/src/asr-backend-selector.cpp @@ -0,0 +1,34 @@ +/* + * asr-backend-selector.cpp — registry + selection for the per-op ASR backend + * seam. A thin instantiation of eliza_backend::Registry<AsrBackendFactory> + * (backend-registry.h) — the resolution logic is shared with every other + * modality. Inert by default: no -DELIZA_ENABLE_* ASR backend is compiled in + * (none exists yet), so nothing registers and asr_backend_select() returns + * nullptr, so eliza_inference_asr_transcribe keeps the in-tree ggml path. 
+ */ + +#include "asr-backend.h" +#include "backend-registry.h" + +#include <mutex> + +namespace { +eliza_backend::Registry<AsrBackendFactory> g_registry; +std::once_flag g_builtins_once; +} // namespace + +void asr_backend_register(AsrBackendFactory * factory) { + g_registry.register_factory(factory); +} + +void asr_backend_register_builtins() { + std::call_once(g_builtins_once, []() { + /* No ASR backend exists yet — the seam stays inert. */ + }); +} + +AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error) { + asr_backend_register_builtins(); + return g_registry.select("ELIZA_ASR_BACKEND", "ELIZA_BACKEND", "asr", + bundle_dir, out_error); +} diff --git a/tools/omnivoice/src/asr-backend.h b/tools/omnivoice/src/asr-backend.h new file mode 100644 index 000000000..2dd9fec49 --- /dev/null +++ b/tools/omnivoice/src/asr-backend.h @@ -0,0 +1,61 @@ +#pragma once +/* + * asr-backend.h — per-op backend seam for speech-to-text transcription. + * + * A one-shot op (eliza_inference_asr_transcribe) that an accelerator backend can + * serve when it ships an ASR artifact under `<bundle>/asr/`, while every other + * op — and ASR itself when no artifact is present — stays on the in-tree ggml + * path. + * + * The factory mirrors the FFI 1:1 and the FFI delegates without translation. + * Selection reuses the shared eliza_backend::Registry (backend-registry.h): + * ELIZA_ASR_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the + * highest preference_rank among available()+can_serve() factories, else nullptr + * (the ggml ASR path). + */ + +#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */ + +#include <cstddef> + +struct EliInferenceContext; + +/* One factory per linked-in ASR runtime (e.g. LiteRT). */ +struct AsrBackendFactory { + virtual ~AsrBackendFactory() = default; + + /* Stable lower-case id, e.g. "litert". Matched case-insensitively against + * ELIZA_ASR_BACKEND / ELIZA_BACKEND. 
*/ + virtual const char * name() const = 0; + + /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate). + * Cheap — must not load a model. */ + virtual bool available() const = 0; + + /* The ASR artifact exists under `/asr/`. Cheap directory probe, + * no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0). + * An NPU-served ASR returns a high positive value; a GPU-delegate fallback a + * lower positive value. */ + virtual int preference_rank() const { return 0; } + + /* Mirrors eliza_inference_asr_transcribe 1:1. Returns the number of bytes + * written (excluding the terminator) on success, or a negative ELIZA_* code + * with `*out_error` heap-allocated for the caller to free. */ + virtual int asr_transcribe(EliInferenceContext * ctx, const float * pcm, size_t n_samples, + int sample_rate_hz, char * out_text, size_t max_text_bytes, + char ** out_error) = 0; +}; + +/* Register a factory (idempotent by name). */ +void asr_backend_register(AsrBackendFactory * factory); + +/* Register every ASR backend compiled into THIS build (gated by the + * -DELIZA_ENABLE_* options). Idempotent; called by asr_backend_select. */ +void asr_backend_register_builtins(); + +/* Pick an ASR backend for the bundle at `bundle_dir`. nullptr + no error + * => use the in-tree ggml ASR path. nullptr + *out_error => hard failure. */ +AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error); diff --git a/tools/omnivoice/src/backend-registry.h b/tools/omnivoice/src/backend-registry.h new file mode 100644 index 000000000..14a40b3fd --- /dev/null +++ b/tools/omnivoice/src/backend-registry.h @@ -0,0 +1,147 @@ +#pragma once +/* + * backend-registry.h — generic per-modality backend registry + selection. 
+ * + * Factored out of the M3 streaming-LLM seam (llm-backend-selector.cpp) so EVERY + * on-device modality (embed, asr, tts, vision, vad, wakeword, speaker, diarizer, + * eot, …) reuses ONE resolution implementation instead of copy-pasting it. A + * modality declares a small factory interface with the four common probes + * (name / available / can_serve / preference_rank) plus its own forward method, + * instantiates `eliza_backend::Registry`, and selects with the + * shared logic below: + * + * 1. `ELIZA__BACKEND` env (per-op) → else `ELIZA_BACKEND` (global) — a + * HARD select. An in-tree name ("llama.cpp"/"ggml"/"default") forces the + * ggml path (returns nullptr, no error). Any other name that is not + * registered+available or cannot serve the bundle is a hard error + * (nullptr + *out_error). + * 2. No override: among registered factories that are available() AND + * can_serve(bundle_dir), pick the highest preference_rank(). None → nullptr. + * + * A nullptr return with *out_error == nullptr means "use the in-tree ggml path" + * — NOT an error. Inert by default: with no -DELIZA_ENABLE_* backend compiled, + * nothing registers and select() always returns nullptr, so every op keeps the + * in-tree path byte-for-byte. + * + * Factory type F must expose: + * const char * name() const; // stable lower-case id + * bool available() const; // compiled-in AND host deps present; cheap + * bool can_serve(const char * bundle_dir) const; // artifact probe; cheap + * int preference_rank() const; // higher wins; ggml == 0 + */ + +#include +#include +#include +#include +#include +#include + +namespace eliza_backend { + +/* malloc-allocate an error string so the caller frees it with + * eliza_inference_free_string() (free()), matching the FFI contract. 
*/ +inline char * dup_error(const std::string & msg) { + char * out = (char *) std::malloc(msg.size() + 1); + if (out) std::memcpy(out, msg.c_str(), msg.size() + 1); + return out; +} + +inline bool iequals(const char * a, const char * b) { + if (!a || !b) return false; + while (*a && *b) { + if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { + return false; + } + ++a; + ++b; + } + return *a == *b; +} + +/* Names that mean "stay on the in-tree ggml/llama.cpp path". */ +inline bool is_intree_name(const char * s) { + return iequals(s, "llama.cpp") || iequals(s, "llamacpp") || iequals(s, "llama") || + iequals(s, "ggml") || iequals(s, "intree") || iequals(s, "default"); +} + +template +class Registry { +public: + /* Idempotent by name. Safe from static init. Does not take ownership — + * factories are static-lifetime singletons. */ + void register_factory(Factory * factory) { + if (!factory) return; + std::lock_guard lock(mu_); + for (Factory * f : factories_) { + if (iequals(f->name(), factory->name())) return; + } + factories_.push_back(factory); + } + + /* env_key: the per-op override (e.g. "ELIZA_EMBED_BACKEND"); global_key: the + * cross-op default (e.g. "ELIZA_BACKEND"); modality: for error text. */ + Factory * select(const char * env_key, const char * global_key, + const char * modality, const char * bundle_dir, + char ** out_error) { + const char * forced = env_key ? std::getenv(env_key) : nullptr; + if (!forced || forced[0] == '\0') { + forced = global_key ? 
std::getenv(global_key) : nullptr; + } + if (forced && forced[0] != '\0') { + if (is_intree_name(forced)) { + return nullptr; /* force in-tree, not an error */ + } + std::lock_guard lock(mu_); + for (Factory * f : factories_) { + if (!iequals(f->name(), forced)) continue; + if (!f->available()) { + set_err(out_error, modality, forced, + "is not available in this build/host"); + return nullptr; + } + if (!f->can_serve(bundle_dir)) { + set_err(out_error, modality, forced, + std::string("cannot serve the bundle at ") + + (bundle_dir ? bundle_dir : "(null)")); + return nullptr; + } + return f; + } + set_err(out_error, modality, forced, "is not a registered backend"); + return nullptr; + } + + /* Auto-select: highest preference_rank among available + can_serve. The + * in-tree ggml path is the implicit rank-0 fallback, so an accelerator + * backend only wins with a positive rank that can serve this bundle. */ + std::lock_guard lock(mu_); + Factory * best = nullptr; + int best_rank = 0; + for (Factory * f : factories_) { + if (!f->available()) continue; + if (!f->can_serve(bundle_dir)) continue; + const int rank = f->preference_rank(); + if (rank > best_rank) { + best_rank = rank; + best = f; + } + } + return best; /* nullptr => in-tree ggml path */ + } + +private: + static void set_err(char ** out_error, const char * modality, + const char * name, const std::string & why) { + if (out_error) { + *out_error = dup_error(std::string("[libelizainference] ") + + (modality ? modality : "backend") + + " backend override '" + name + "' " + why); + } + } + + std::mutex mu_; + std::vector factories_; +}; + +} // namespace eliza_backend diff --git a/tools/omnivoice/src/backends/litert-backend.cpp b/tools/omnivoice/src/backends/litert-backend.cpp new file mode 100644 index 000000000..3b3dad137 --- /dev/null +++ b/tools/omnivoice/src/backends/litert-backend.cpp @@ -0,0 +1,471 @@ +/* + * litert-backend.cpp — LiteRT-LM in-process streaming-LLM backend (M4). 
+ * + * See litert-backend.h for the targeted LiteRT-LM C++ API (repo + commit + * date cited there). The real implementation is gated behind + * `ELIZA_ENABLE_LITERT`; the default (Linux/desktop) build compiles the stub + * branch, which links zero LiteRT-LM SDK headers and reports + * `available() == false` so the selector keeps the in-tree llama.cpp path. + * + * Error contract (native/AGENTS.md §3 + §9): never log, never return a + * defaulted result on failure. Every failure path heap-allocates `*out_error` + * via litert_set_error() (matching the FFI cpp's eliza_strdup/eliza_set_error + * style) and returns the negative ELIZA_* code or nullptr. + */ + +#include "litert-backend.h" + +#include +#include +#include +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define LITERT_HAVE_FILESYSTEM 1 +# endif +#endif + +/* ── Heap-allocated error strings (mirror eliza-inference-ffi.cpp) ───────── */ +namespace { + +char * litert_strdup(const std::string & s) { + char * out = static_cast(std::malloc(s.size() + 1)); + if (!out) return nullptr; + std::memcpy(out, s.c_str(), s.size() + 1); + return out; +} + +void litert_set_error(char ** out_error, const std::string & msg) { + if (!out_error) return; + *out_error = litert_strdup(msg); +} + +#if defined(LITERT_HAVE_FILESYSTEM) +/* Probe /text/ for a *.litertlm artifact. Cheap directory walk, + * no model load (LlmBackendFactory::can_serve contract). 
*/ +std::string find_litertlm_artifact(const char * bundle_dir) { + if (!bundle_dir || bundle_dir[0] == '\0') return std::string(); + std::error_code ec; + std::filesystem::path text_dir = + std::filesystem::path(bundle_dir) / LITERT_BUNDLE_TEXT_SUBDIR; + if (!std::filesystem::is_directory(text_dir, ec)) return std::string(); + for (std::filesystem::directory_iterator it(text_dir, ec), end; + !ec && it != end; it.increment(ec)) { + if (!it->is_regular_file(ec)) continue; + if (it->path().extension() == LITERT_ARTIFACT_EXT) { + return it->path().string(); + } + } + return std::string(); +} +#else +std::string find_litertlm_artifact(const char *) { return std::string(); } +#endif + +} // namespace + +/* ════════════════════════════════════════════════════════════════════════ * + * REAL implementation — only when ELIZA_ENABLE_LITERT is defined. + * Behind this gate we may include LiteRT-LM SDK headers; outside it we + * include NONE so the file builds on a host without the SDK. + * ════════════════════════════════════════════════════════════════════════ */ +#ifdef ELIZA_ENABLE_LITERT + +#include +#include +#include +#include +#include + +/* LiteRT-LM cross-platform C++ runtime. Paths per the repo's bazel layout + * (github.com/google-ai-edge/LiteRT-LM, `main`, researched 2026-06-22). */ +#include "runtime/engine/engine.h" // litert::lm::Engine, SessionInterface +#include "runtime/engine/engine_settings.h" // EngineSettings, SessionConfig, ModelAssets +#include "runtime/engine/io_types.h" // InputData, InputText, Responses + +namespace { + +using litert::lm::Backend; +using litert::lm::Engine; +using litert::lm::EngineSettings; +using litert::lm::InputData; +using litert::lm::InputText; +using litert::lm::ModelAssets; +using litert::lm::Responses; +using litert::lm::SessionConfig; + +/* The Session type the templated Engine hands back (Engine::Session is the + * public alias EngineT exposes; for Engine it is SessionInterface). 
*/ +using Session = Engine::Session; + +/* The accelerator the factory resolved at open(), recorded for diagnostics + * and preference reporting. DEVICE-VERIFY: which rung actually initializes is + * hardware-dependent and can only be confirmed on an NPU/GPU device. */ +enum class ResolvedAccelerator { kNone, kNpu, kGpu, kCpu }; + +const char * accelerator_name(ResolvedAccelerator a) { + switch (a) { + case ResolvedAccelerator::kNpu: return "npu"; + case ResolvedAccelerator::kGpu: return "gpu"; + case ResolvedAccelerator::kCpu: return "cpu"; + default: return "none"; + } +} + +/* Try to build an Engine for `artifact` on `backend`. Returns the Engine on + * success; on failure returns nullptr (the ladder falls through to the next + * rung). The error text is captured so the final rung can surface it. */ +std::unique_ptr try_engine(const std::string & artifact, + Backend backend, + std::string & last_err) { + auto model_assets = ModelAssets::Create(artifact); + if (!model_assets.ok()) { + last_err = std::string(model_assets.status().message()); + return nullptr; + } + auto settings = EngineSettings::CreateDefault(*model_assets, backend); + if (!settings.ok()) { + last_err = std::string(settings.status().message()); + return nullptr; + } + auto engine = Engine::CreateEngine(*settings); + if (!engine.ok()) { + last_err = std::string(engine.status().message()); + return nullptr; + } + return std::move(*engine); +} + +/* ── Session: mirrors the FFI streaming pull contract 1:1 ────────────────── */ +class LiteRtBackendSession final : public LlmBackendSession { +public: + LiteRtBackendSession(std::unique_ptr engine, + std::unique_ptr session, + const eliza_llm_stream_config_t & cfg, + ResolvedAccelerator accel) + : engine_(std::move(engine)), + session_(std::move(session)), + accel_(accel), + max_tokens_(cfg.max_tokens > 0 ? cfg.max_tokens : 0) {} + + /* prefill: copy the caller's tokens, detokenize through the engine's + * tokenizer, and run a LiteRT prefill pass. 
The FFI hands pre-tokenized + * ids (text-model vocab); LiteRT-LM's prefill consumes InputData (text), + * so we round-trip ids → text via the shared tokenizer rather than + * assuming vocab parity (the .litertlm graph carries its own tokenizer). + * DEVICE-VERIFY: id/text round-trip fidelity needs a real .litertlm. */ + int prefill(const int32_t * token_ids, size_t num_tokens, + char ** out_error) override { + if (!session_) { + litert_set_error(out_error, + "[litert-lm] prefill: session is not open"); + return ELIZA_ERR_INVALID_ARG; + } + if (cancelled_.load(std::memory_order_acquire)) { + return ELIZA_ERR_CANCELLED; + } + std::vector ids; + ids.reserve(num_tokens); + for (size_t i = 0; i < num_tokens; ++i) ids.push_back(token_ids[i]); + + const std::string text = engine_->GetTokenizer().Detokenize(ids); + std::vector contents; + contents.emplace_back(InputText(std::string(text))); + + absl::Status st = session_->RunPrefill(contents); + if (!st.ok()) { + litert_set_error(out_error, + std::string("[litert-lm] RunPrefill failed: ") + + std::string(st.message())); + return ELIZA_ERR_FFI_FAULT; + } + prefilled_ = true; + return ELIZA_OK; + } + + /* next: one decode step. LiteRT-LM's RunDecode() returns a Responses + * batch; we emit the newly-produced UTF-8 delta as detokenized text and + * its token ids. LiteRT-LM has no in-process MTP drafter exposed through + * this surface, so drafted/accepted are always 0. Returns 1 (final) at + * EOS or the max-token cap, 0 otherwise. 
*/ + int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out, + char * text_out, size_t text_cap, int32_t * drafter_drafted_out, + int32_t * drafter_accepted_out, char ** out_error) override { + if (num_tokens_out) *num_tokens_out = 0; + if (text_out && text_cap) text_out[0] = '\0'; + if (drafter_drafted_out) *drafter_drafted_out = 0; + if (drafter_accepted_out) *drafter_accepted_out = 0; + + if (!session_) { + litert_set_error(out_error, "[litert-lm] next: session not open"); + return ELIZA_ERR_INVALID_ARG; + } + if (!prefilled_) { + litert_set_error(out_error, + "[litert-lm] next: prefill must run before next"); + return ELIZA_ERR_INVALID_ARG; + } + if (cancelled_.load(std::memory_order_acquire)) { + return ELIZA_ERR_CANCELLED; + } + + auto responses = session_->RunDecode(); + if (!responses.ok()) { + litert_set_error(out_error, + std::string("[litert-lm] RunDecode failed: ") + + std::string(responses.status().message())); + return ELIZA_ERR_FFI_FAULT; + } + + /* RunDecode yields the running candidate texts; GetTexts()[0] is the + * cumulative decode for candidate 0. Emit only the suffix produced + * since the last step so the FFI streams a delta per pull. + * NOTE(review): emitted_chars_ is advanced to the full cumulative length + * here, before the output buffers are filled. Whatever the clamps below + * drop (ids beyond tokens_cap, bytes beyond text_cap - 1) is therefore + * not re-emitted on a later pull, and the byte clamp can cut a + * multi-byte UTF-8 sequence — confirm callers size both buffers for the + * largest possible delta. */ + const std::vector & texts = responses->GetTexts(); + std::string cumulative = texts.empty() ? std::string() : texts.front(); + std::string delta = compute_delta(cumulative); + emitted_chars_ = cumulative.size(); + + /* Re-tokenize the delta against the engine tokenizer so the FFI gets + * committed text-vocab ids (the same round-trip the prefill used). 
*/ + std::vector delta_ids = engine_->GetTokenizer().Tokenize(delta); + size_t n_emit = delta_ids.size(); + if (n_emit > tokens_cap) n_emit = tokens_cap; + if (tokens_out) { + for (size_t i = 0; i < n_emit; ++i) { + tokens_out[i] = static_cast(delta_ids[i]); + } + } + if (num_tokens_out) *num_tokens_out = n_emit; + if (text_out && text_cap) { + const size_t copy = delta.size() < text_cap - 1 + ? 
delta.size() + : text_cap - 1; + std::memcpy(text_out, delta.data(), copy); + text_out[copy] = '\0'; + } + + decoded_tokens_ += static_cast(delta_ids.size()); + const bool hit_cap = + max_tokens_ > 0 && decoded_tokens_ >= max_tokens_; + /* DEVICE-VERIFY: the precise EOS signal LiteRT-LM exposes per step is + * runtime-version-dependent. A done decode yields no new delta; treat + * an empty delta or the token cap as the final step. */ + const bool eos = delta_ids.empty(); + return (hit_cap || eos) ? 1 : 0; + } + + /* cancel: publish a flag the next decode step observes. Thread-safe. */ + int cancel() override { + cancelled_.store(true, std::memory_order_release); + return ELIZA_OK; + } + + /* reset: drop a fresh Session from the same Engine (clears KV + sampler). + * Reuses the warm Engine (model weights stay resident) — only the + * per-generation Session is rebuilt. */ + int reset() override { + auto cfg = SessionConfig::CreateDefault(); + auto session = engine_->CreateSession(cfg); + if (!session.ok()) { + /* reset has no out_error param; a failed rebuild leaves the old + * session in place and surfaces on the next prefill/next. */ + return ELIZA_ERR_FFI_FAULT; + } + session_ = std::move(*session); + cancelled_.store(false, std::memory_order_release); + prefilled_ = false; + decoded_tokens_ = 0; + emitted_chars_ = 0; + return ELIZA_OK; + } + + /* reset_keep: LiteRT-LM's Session does not expose prefix-preserving KV + * trimming through this surface, so fall back to a full reset and return 0 + * (no prefix kept) — never an error (llm-backend.h contract). */ + int reset_keep(int32_t /*n_keep*/) override { + reset(); + return 0; + } + + const char * accelerator() const { return accelerator_name(accel_); } + +private: + /* The suffix of `cumulative` produced since the last emitted step. 
*/
+    std::string compute_delta(const std::string & cumulative) const {
+        if (cumulative.size() <= emitted_chars_) return std::string();
+        return cumulative.substr(emitted_chars_);
+    }
+
+    std::unique_ptr engine_;
+    std::unique_ptr session_;
+    std::atomic cancelled_{false}; /* set by cancel(), cleared by reset() */
+    bool prefilled_ = false; /* set by a successful prefill(), cleared by reset() */
+    int32_t decoded_tokens_ = 0; /* ids counted by next() since the last reset() */
+    size_t emitted_chars_ = 0; /* cumulative.size() (bytes) already emitted by next() */
+    ResolvedAccelerator accel_ = ResolvedAccelerator::kNone; /* reported by accelerator() */
+    int32_t max_tokens_ = 0; /* next() token cap; values <= 0 disable the cap */
+};
+
+/* ── Factory ─────────────────────────────────────────────────────────────── */
+class LiteRtBackendFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return LITERT_BACKEND_NAME; }
+
+    /* available(): compiled in AND an accelerator (NPU or GPU) initializes on
+     * THIS host. Cheap — must not load a model. We probe by building a minimal
+     * EngineSettings on NPU then GPU with NO model assets; a backend whose
+     * delegate is missing fails settings validation. CPU alone does NOT make
+     * this backend "available" (CPU is the in-tree llama.cpp path's job).
+     * DEVICE-VERIFY: real delegate presence is only knowable on-device. */
+    bool available() const override {
+        return probe_accelerator() != ResolvedAccelerator::kNone;
+    }
+
+    /* can_serve(): a *.litertlm exists under <bundle_dir>/text/, i.e.
+     * find_litertlm_artifact(bundle_dir) returns a non-empty path. Cheap probe,
+     * no caching — open() re-resolves the bundle from the context accessor. */
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_litertlm_artifact(bundle_dir).empty();
+    }
+
+    /* preference_rank(): high on Android NPU (the whole reason this backend
+     * exists), modest on a GPU-only fallback, 0 otherwise so llama.cpp wins.
+     * Uses the same probe_accelerator() result as available(). */
+    int preference_rank() const override {
+        switch (probe_accelerator()) {
+            case ResolvedAccelerator::kNpu: return 100;
+            case ResolvedAccelerator::kGpu: return 20;
+            default: return 0;
+        }
+    }
+
+    /* open(): resolve the .litertlm under the cached bundle, then walk the
+     * accelerator ladder NPU → GPU → CPU, recording which rung built the
+     * Engine.
Builds a default Session and returns the streaming session. */ + LlmBackendSession * open(EliInferenceContext * ctx, + const eliza_llm_stream_config_t * cfg, + char ** out_error) override { + if (!cfg) { + litert_set_error(out_error, "[litert-lm] open: cfg is NULL"); + return nullptr; + } + const char * bundle_dir = llm_backend_context_bundle_dir(ctx); + const std::string bundle = bundle_dir ? bundle_dir : std::string(); + std::string artifact = find_litertlm_artifact(bundle.c_str()); + if (artifact.empty()) { + litert_set_error(out_error, + std::string("[litert-lm] open: no ") + LITERT_ARTIFACT_EXT + + " artifact under " + bundle + "/" + LITERT_BUNDLE_TEXT_SUBDIR); + return nullptr; + } + + /* Accelerator ladder — NPU first (Qualcomm QNN / MediaTek NeuroPilot / + * Google Tensor), then GPU (OpenCL/Metal/WebGPU), then CPU (XNNPACK). + * Each rung's failure text is preserved for the final diagnostic. + * DEVICE-VERIFY: rung availability is hardware-specific. */ + struct Rung { Backend backend; ResolvedAccelerator accel; }; + const Rung ladder[] = { + {Backend::NPU, ResolvedAccelerator::kNpu}, + {Backend::GPU, ResolvedAccelerator::kGpu}, + {Backend::CPU, ResolvedAccelerator::kCpu}, + }; + + std::unique_ptr engine; + ResolvedAccelerator resolved = ResolvedAccelerator::kNone; + std::string last_err; + for (const Rung & rung : ladder) { + engine = try_engine(artifact, rung.backend, last_err); + if (engine) { + resolved = rung.accel; + break; + } + } + if (!engine) { + litert_set_error(out_error, + std::string("[litert-lm] open: no accelerator could build the " + "engine (last error: ") + last_err + ")"); + return nullptr; + } + + auto session_cfg = SessionConfig::CreateDefault(); + auto session = engine->CreateSession(session_cfg); + if (!session.ok()) { + litert_set_error(out_error, + std::string("[litert-lm] open: CreateSession failed on ") + + accelerator_name(resolved) + ": " + + std::string(session.status().message())); + return nullptr; + } + + return new 
LiteRtBackendSession(std::move(engine), std::move(*session), + *cfg, resolved); + } + +private: + /* Build a no-model EngineSettings on NPU then GPU; the first whose + * delegate validates marks that rung present. Result is memoized so the + * repeated available()/preference_rank() calls are cheap. + * DEVICE-VERIFY: settings-only validation is the cheapest honest probe; + * the true delegate handshake happens at open() on-device. */ + ResolvedAccelerator probe_accelerator() const { + std::call_once(probe_once_, [this]() { + auto empty = ModelAssets::Create(std::string()); + if (!empty.ok()) { probed_ = ResolvedAccelerator::kNone; return; } + if (EngineSettings::CreateDefault(*empty, Backend::NPU).ok()) { + probed_ = ResolvedAccelerator::kNpu; + } else if (EngineSettings::CreateDefault(*empty, Backend::GPU).ok()) { + probed_ = ResolvedAccelerator::kGpu; + } else { + probed_ = ResolvedAccelerator::kNone; + } + }); + return probed_; + } + + mutable std::once_flag probe_once_; + mutable ResolvedAccelerator probed_ = ResolvedAccelerator::kNone; +}; + +} // namespace + +LlmBackendFactory * litert_backend_factory() { + static LiteRtBackendFactory factory; + return &factory; +} + +#else /* ────────────────────────── STUB (no LiteRT-LM SDK) ──────────────── */ + +/* + * Compiled-out stub: zero LiteRT-LM headers, so this builds on any host. The + * factory links in as a no-op — available() is false, can_serve() is false, + * preference_rank() is 0, and open() returns nullptr + sets `*out_error` + * "not compiled in" so the selector cleanly keeps the in-tree llama.cpp path. 
+ */ +namespace { + +class LiteRtBackendFactoryStub final : public LlmBackendFactory { +public: + const char * name() const override { return LITERT_BACKEND_NAME; } + bool available() const override { return false; } + bool can_serve(const char * /*bundle_dir*/) const override { return false; } + int preference_rank() const override { return 0; } + + LlmBackendSession * open(EliInferenceContext * /*ctx*/, + const eliza_llm_stream_config_t * /*cfg*/, + char ** out_error) override { + litert_set_error(out_error, + "[litert-lm] backend not compiled in " + "(build with -DELIZA_ENABLE_LITERT to enable the LiteRT-LM NPU path)"); + return nullptr; + } +}; + +} // namespace + +LlmBackendFactory * litert_backend_factory() { + static LiteRtBackendFactoryStub factory; + return &factory; +} + +#endif /* ELIZA_ENABLE_LITERT */ diff --git a/tools/omnivoice/src/backends/litert-backend.h b/tools/omnivoice/src/backends/litert-backend.h new file mode 100644 index 000000000..9096b64d0 --- /dev/null +++ b/tools/omnivoice/src/backends/litert-backend.h @@ -0,0 +1,73 @@ +#pragma once +/* + * litert-backend.h — LiteRT-LM in-process streaming-LLM backend (cutover plan M4). + * + * Implements the M3 backend seam (`llm-backend.h`) on top of Google's + * LiteRT-LM C++ inference runtime, the in-process path for the Android NPU + * tier (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor), with an + * optional desktop/iOS GPU fallback. LiteRT-LM is linked INTO + * `libelizainference` and exposed behind the same FFI streaming symbols — + * never a child process or TCP server (native/AGENTS.md §11, gemma4 cutover). + * + * The whole real implementation is gated behind the CMake define + * `ELIZA_ENABLE_LITERT`. When that flag is OFF this header pulls in NO + * LiteRT-LM SDK headers, so the file compiles on a host without the SDK and + * the factory links in as a no-op: `available()` is false and `open()` + * returns nullptr + sets `*out_error` "not compiled in". 
+ * + * ── Targeted runtime API (researched 2026-06-22) ────────────────────────── + * Repo: https://github.com/google-ai-edge/LiteRT-LM (`main`) + * Docs: https://developers.google.com/edge/litert-lm/cpp + * https://ai.google.dev/edge/litert/next/litert_lm_npu + * Namespace: `litert::lm` + * + * Symbols this backend targets (verbatim from the headers above): + * - runtime/engine/engine.h + * using Engine = EngineT; + * static absl::StatusOr> + * Engine::CreateEngine(const EngineSettings&); + * absl::StatusOr> + * EngineT::CreateSession(const SessionConfig&); + * - runtime/engine/engine.h (SessionInterface) + * absl::Status RunPrefill(const std::vector&); + * absl::StatusOr RunDecode(); + * absl::StatusOr RunDecode(const DecodeConfig&); + * absl::Status GenerateContentStream( + * const std::vector&, + * absl::AnyInvocable)>); + * - runtime/engine/engine_settings.h + * static absl::StatusOr EngineSettings::CreateDefault( + * ModelAssets, Backend backend = Backend::CPU, + * std::optional vision_backend = std::nullopt, + * std::optional audio_backend = std::nullopt, + * std::optional sampler_backend = std::nullopt); + * static SessionConfig SessionConfig::CreateDefault(); + * absl::StatusOr ModelAssets::Create(); // .litertlm + * - runtime/engine/io_types.h + * using InputData = std::variant; + * class InputText { explicit InputText(std::variant); }; + * class Responses { const std::vector& GetTexts() const; }; + * - runtime/proto/engine.pb.h + * enum Backend { ... CPU, GPU, NPU, ... }; // litert::lm::Backend + * + * Accelerator ladder (Android NPU first): the factory tries NPU, then GPU, + * then CPU at `open()` and records which one initialized. Every + * hardware-gated assumption is tagged `DEVICE-VERIFY` in the .cpp — the + * accelerator ladder, the .litertlm graph fit, and tok/s can only be + * confirmed on a real NPU device, which this scaffold does not have. 
+ */ + +#include "../llm-backend.h" + +/* Stable id matched case-insensitively against ELIZA_LLM_BACKEND, and the + * subdir + artifact extension the factory probes under /text/. */ +#define LITERT_BACKEND_NAME "litert-lm" +#define LITERT_BUNDLE_TEXT_SUBDIR "text" +#define LITERT_ARTIFACT_EXT ".litertlm" + +/* Singleton factory accessor. The selector (llm-backend-selector.cpp) calls + * this from `llm_backend_register_builtins()` to register the backend. The + * returned pointer is a static-lifetime singleton the registry does not own. + * Defined unconditionally — a build without ELIZA_ENABLE_LITERT returns a + * stub factory whose available() is false. */ +LlmBackendFactory * litert_backend_factory(); diff --git a/tools/omnivoice/src/backends/litert-embed-backend.cpp b/tools/omnivoice/src/backends/litert-embed-backend.cpp new file mode 100644 index 000000000..18bf11415 --- /dev/null +++ b/tools/omnivoice/src/backends/litert-embed-backend.cpp @@ -0,0 +1,252 @@ +/* + * litert-embed-backend.cpp — LiteRT (Google AI Edge) text-embedding backend. + * + * Serves eliza_inference_embed from a `/embedding/*.tflite` (or + * `.litertlm`) artifact via the LiteRT Next C runtime on the best available + * accelerator: NPU (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor on + * capable silicon) -> GPU (OpenCL/Mali via libLiteRtClGlAccelerator.so) -> CPU. + * The accelerator ladder + preference_rank let the SAME build auto-promote to + * NPU on a Pixel-10/G5 or Qualcomm/MediaTek device and fall back to the GPU + * delegate on a Tensor-G4 (Pixel 9a) with NO code change. + * + * Uses the LiteRT *C* API (litert/c/...) — the C++ cc/ wrappers are not + * standalone (they pull Abseil/TFLite/flatbuffers). Compiles only under + * -DELIZA_ENABLE_LITERT with the SDK on the include/link path + * (-DELIZA_LITERT_SDK_DIR= -DELIZA_LITERT_LIBS=LiteRt). 
Without the gate the + * file is not compiled (CMake target_sources is inside if(ELIZA_ENABLE_LITERT)); + * the stub at the bottom keeps the factory accessor resolvable defensively. + * + * Model I/O (the converted all-MiniLM-L6-v2 .tflite, see + * litert-models/embedding/MANIFEST.md): 2 int32 inputs [1,128] bound BY INDEX + * (0=input_ids, 1=attention_mask), 1 float32 output [1,384] that is already + * masked-mean-pooled + L2-normalized in-graph (read 384 floats directly). + */ + +#include "../embed-backend.h" +#include "../llm-backend.h" /* llm_backend_context_bundle_dir */ + +#include +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define ELIZA_HAS_FILESYSTEM 1 +# endif +#endif + +namespace { + +/* Probe `/embedding/` for a LiteRT artifact (.litertlm preferred, + * then .tflite). Cheap — no model load. Returns the absolute path or "". */ +std::string find_embed_artifact(const char * bundle_dir) { + if (!bundle_dir || !bundle_dir[0]) return ""; +#ifdef ELIZA_HAS_FILESYSTEM + namespace fs = std::filesystem; + std::error_code ec; + const fs::path dir = fs::path(bundle_dir) / "embedding"; + if (!fs::is_directory(dir, ec)) return ""; + std::string tflite; + for (const auto & e : fs::directory_iterator(dir, ec)) { + if (ec) break; + if (!e.is_regular_file(ec)) continue; + const std::string ext = e.path().extension().string(); + if (ext == ".litertlm") return e.path().string(); + if (ext == ".tflite" && tflite.empty()) tflite = e.path().string(); + } + return tflite; +#else + return ""; +#endif +} + +char * dup_error(const std::string & msg) { + const std::string full = "[libelizainference] " + msg; + char * out = (char *) std::malloc(full.size() + 1); + if (out) std::memcpy(out, full.c_str(), full.size() + 1); + return out; +} + +} // namespace + +#ifdef ELIZA_ENABLE_LITERT + +#include "litert/c/litert_common.h" +#include "litert/c/litert_compiled_model.h" +#include "litert/c/litert_environment.h" +#include 
"litert/c/litert_model.h" +#include "litert/c/litert_options.h" +#include "litert/c/litert_tensor_buffer.h" + +#include +#include +#include + +namespace { + +class LiteRtEmbedFactory final : public EmbedBackendFactory { +public: + const char * name() const override { return "litert"; } + + /* Compiled in AND a non-CPU accelerator is reachable (a CPU-only LiteRT is + * not a win over the in-tree ggml encoder). Settings-only probe — no model + * load. The ladder resolves to GPU on a Tensor-G4 (9a) and NPU on capable + * silicon. */ + bool available() const override { return probe_accel() != kLiteRtHwAcceleratorNone; } + + bool can_serve(const char * bundle_dir) const override { + return !find_embed_artifact(bundle_dir).empty(); + } + + int preference_rank() const override { + const int a = probe_accel(); + if (a & kLiteRtHwAcceleratorNpu) return 100; /* the real NPU win */ + if (a & kLiteRtHwAcceleratorGpu) return 20; /* GPU delegate (Mali on a 9a) */ + return 0; /* never beats ggml */ + } + + int embed(EliInferenceContext * ctx, const char * text, size_t text_len, + int pooling, float * out_embedding, size_t out_capacity, + int * out_dim, char ** out_error) override { + const char * bundle = llm_backend_context_bundle_dir(ctx); + const std::string artifact = find_embed_artifact(bundle); + if (artifact.empty()) { + if (out_error) *out_error = dup_error("litert embed: no artifact under /embedding/"); + return ELIZA_ERR_INVALID_ARG; + } + std::lock_guard lock(mu_); + if (int rc = ensure_loaded(artifact, out_error); rc != ELIZA_OK) return rc; + + /* Tokenize -> 2 int32 input tensors [1,128] (0=input_ids,1=attention_mask). + * The WordPiece tokenizer + the fixed-128 padding come from the model + * MANIFEST (litert-models/embedding). The LiteRT C run path below + * (managed buffers -> run -> read the in-graph-pooled [1,384] output) is + * wired; binding the tokenizer is the one model-specific step. 
*/ + std::vector ids, mask; + if (int rc = tokenize(text, text_len, ids, mask, out_error); rc != ELIZA_OK) return rc; + + std::vector out_vec; + int dim = 0; + if (int rc = run(ids, mask, out_vec, dim, out_error); rc != ELIZA_OK) return rc; + + if (dim <= 0 || (size_t) dim > out_capacity) { + if (out_error) *out_error = dup_error("litert embed: output dim exceeds capacity"); + return ELIZA_ERR_INVALID_ARG; + } + (void) pooling; /* pooling + L2-norm are baked into the exported graph */ + std::memcpy(out_embedding, out_vec.data(), (size_t) dim * sizeof(float)); + *out_dim = dim; + return ELIZA_OK; + } + +private: + static int probe_accel() { + LiteRtEnvironment env = nullptr; + if (LiteRtCreateEnvironment(0, nullptr, &env) != kLiteRtStatusOk) { + return kLiteRtHwAcceleratorNone; + } + LiteRtDestroyEnvironment(env); + /* TODO(DEVICE-VERIFY): query the env for a registered NPU dispatch and + * return kLiteRtHwAcceleratorNpu when present. On a Tensor-G4 there is no + * app-usable NPU path, so this resolves to GPU. 
*/ + return kLiteRtHwAcceleratorGpu; + } + + int ensure_loaded(const std::string & artifact, char ** out_error) { + if (artifact == loaded_path_ && compiled_) return ELIZA_OK; + reset(); + if (LiteRtCreateEnvironment(0, nullptr, &env_) != kLiteRtStatusOk) { + if (out_error) *out_error = dup_error("litert embed: environment create failed"); + return ELIZA_ERR_FFI_FAULT; + } + if (LiteRtCreateModelFromFile(artifact.c_str(), &model_) != kLiteRtStatusOk) { + if (out_error) *out_error = dup_error("litert embed: model load failed: " + artifact); + return ELIZA_ERR_BUNDLE_INVALID; + } + LiteRtOptions opts = nullptr; + if (LiteRtCreateOptions(&opts) != kLiteRtStatusOk) { + if (out_error) *out_error = dup_error("litert embed: options create failed"); + return ELIZA_ERR_FFI_FAULT; + } + LiteRtSetOptionsHardwareAccelerators( + opts, (LiteRtHwAcceleratorSet)(kLiteRtHwAcceleratorGpu | kLiteRtHwAcceleratorNpu)); + const LiteRtStatus st = LiteRtCreateCompiledModel(env_, model_, opts, &compiled_); + LiteRtDestroyOptions(opts); + if (st != kLiteRtStatusOk) { + if (out_error) *out_error = dup_error("litert embed: compile failed (accelerator unavailable?)"); + return ELIZA_ERR_FFI_FAULT; + } + loaded_path_ = artifact; + return ELIZA_OK; + } + + int tokenize(const char * /*text*/, size_t /*len*/, std::vector & /*ids*/, + std::vector & /*mask*/, char ** out_error) { + /* TODO(MANIFEST): wire the WordPiece tokenizer (vocab.txt under + * /embedding/): lower-case, [CLS] + greedy-longest-match subwords + * + [SEP], pad/truncate to exactly 128, attention_mask=1 for real tokens. + * Until wired this is a hard, observable failure — eliza_inference_embed + * does NOT fall back, so a misconfigured artifact surfaces loudly. 
*/ + if (out_error) *out_error = dup_error( + "litert embed: WordPiece tokenizer not wired — stage vocab.txt + bind " + "per litert-models/embedding/MANIFEST.md"); + return ELIZA_ERR_NOT_IMPLEMENTED; + } + + int run(const std::vector & ids, const std::vector & mask, + std::vector & out_vec, int & dim, char ** out_error) { + /* TODO(MANIFEST): create 2 managed int32 input TensorBuffers [1,128] + * (LiteRtGetCompiledModelInputBufferRequirements -> + * LiteRtCreateManagedTensorBufferFromRequirements), Lock+write ids/mask, + * create the output buffer, LiteRtRunCompiledModel(compiled_, 0, in, out), + * Lock+read the [1,384] float output into out_vec (dim=384). Pooling + + * L2-norm are in-graph. */ + (void) ids; (void) mask; (void) out_vec; (void) dim; + if (out_error) *out_error = dup_error("litert embed: tensor run pending MANIFEST tokenizer"); + return ELIZA_ERR_NOT_IMPLEMENTED; + } + + void reset() { + if (compiled_) { LiteRtDestroyCompiledModel(compiled_); compiled_ = nullptr; } + if (model_) { LiteRtDestroyModel(model_); model_ = nullptr; } + if (env_) { LiteRtDestroyEnvironment(env_); env_ = nullptr; } + loaded_path_.clear(); + } + + std::mutex mu_; + LiteRtEnvironment env_ = nullptr; + LiteRtModel model_ = nullptr; + LiteRtCompiledModel compiled_ = nullptr; + std::string loaded_path_; +}; + +} // namespace + +EmbedBackendFactory * litert_embed_backend_factory() { + static LiteRtEmbedFactory instance; + return &instance; +} + +#else /* !ELIZA_ENABLE_LITERT — stub (kept resolvable; never selected) */ + +namespace { +class LiteRtEmbedStub final : public EmbedBackendFactory { +public: + const char * name() const override { return "litert"; } + bool available() const override { return false; } + bool can_serve(const char *) const override { return false; } + int embed(EliInferenceContext *, const char *, size_t, int, float *, size_t, + int *, char ** out_error) override { + if (out_error) *out_error = dup_error("litert embed backend not compiled in"); + return 
ELIZA_ERR_NOT_IMPLEMENTED; + } +}; +} // namespace + +EmbedBackendFactory * litert_embed_backend_factory() { + static LiteRtEmbedStub instance; + return &instance; +} + +#endif /* ELIZA_ENABLE_LITERT */ diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.h b/tools/omnivoice/src/backends/mlx-coreml-backend.h new file mode 100644 index 000000000..36d048c00 --- /dev/null +++ b/tools/omnivoice/src/backends/mlx-coreml-backend.h @@ -0,0 +1,128 @@ +#pragma once +/* + * mlx-coreml-backend.h — Apple-Silicon in-process streaming-LLM backend + * (Gemma-4 cutover plan M5). One of the alternate `LlmBackendSession` / + * `LlmBackendFactory` implementations behind the multi-runtime FFI seam + * defined in `../llm-backend.h` (cutover plan M3). + * + * Per native/AGENTS.md §11 ("one managed library, one pipe, no + * sidecar/subprocess/TCP") this backend is COMPILED INTO libelizainference + * and exposes the SAME `eliza_inference_llm_stream_*` FFI pull contract — + * it is the owned backend on Apple Silicon (mac first, iOS later), never a + * child process. Apple Foundation Models stays an opportunistic out-of- + * process adapter on the TS side and is NOT registered here. + * + * ── Two runtimes, one backend ───────────────────────────────────────────── + * + * The same `mlx-coreml` factory can serve a bundle through EITHER of two + * Apple on-device runtimes, picked at open() time from the artifact present + * under `/text/`: + * + * • MLX (PRIMARY) — Apple's array framework for Apple Silicon. We drive + * it through the C API `mlx-c` (ml-explore/mlx-c). The + * text weights are an `mlx` weights dir (safetensors, + * the mlx-lm convention) OR a `*.gguf` MLX reads via + * `mlx_load_gguf`. Decode runs the transformer graph + * on the Metal GPU stream with `mlx_quantized_matmul` + * for the quantized weight banks, + * `mlx_fast_scaled_dot_product_attention` for + * attention, and `mlx_fast_rope` for position. 
The KV + * cache is a pair of resident `mlx_array`s we append to + * per step (host-side cache handle, GPU-resident data). + * This is the preferred path: it gives us full control + * of the sampler, supports the Gemma SWA/shared-KV + * geometry, and matches mlx-lm's published Gemma graph. + * + * • CoreML (ALTERNATE) — Apple's MLModel runtime, which can place the graph + * on the ANE (Apple Neural Engine) as well as GPU/CPU. + * We load a compiled `*.mlmodelc` / `*.mlpackage` + * decoder and use the iOS-18 / macOS-15 **stateful** + * prediction API (`MLState`) so the KV cache lives + * inside CoreML and is updated in-place across decode + * steps (no per-token KV tensor marshalled across the + * ObjC boundary). CoreML needs Objective-C, which is + * why this whole backend is a `.mm` translation unit. + * + * TRADE-OFF (documented per the task brief): MLX is the primary path + * because it is the most flexible (custom sampler, exact Gemma geometry, + * speculative-decode-ready) and tracks mlx-lm directly; its decode runs on + * the GPU stream, not the ANE. CoreML's stateful MLModel can target the ANE + * for lower power on phones, but the decoder graph must be pre-compiled + * ahead of time, the sampler/KV layout is fixed by the converted model, and + * ANE placement of large attention graphs is fragile across OS revisions. + * We prefer MLX on mac/dev; CoreML is the alternate for ANE-bound iOS tiers + * once a stateful decoder package is published. open() selects MLX when an + * mlx weights dir / gguf is present, else falls back to the CoreML package. + * + * ── Build gate ──────────────────────────────────────────────────────────── + * + * The REAL implementation is gated behind `ELIZA_ENABLE_MLX` (the CMake + * define for this backend, per the cutover plan: LiteRT → ELIZA_ENABLE_LITERT, + * MLX/CoreML → ELIZA_ENABLE_MLX) AND `__APPLE__`. 
When the gate is OFF the + * translation unit includes NO Apple/MLX SDK headers, so it compiles on a + * plain Linux host: `available()` returns false, `can_serve()` returns false, + * and `open()` returns nullptr after setting `*out_error` ("not compiled in"). + * The default Linux build links it as a pure no-op and the selector skips it, + * keeping the in-tree llama.cpp path. + * + * ── API research (cited; symbols verified, not invented) ────────────────── + * + * MLX C API — ml-explore/mlx-c, `mlx/c/` headers, main @ 2026-06 (docs MLX C + * 0.4.1, https://ml-explore.github.io/mlx-c/). Symbols used by the real path: + * - device.h : `mlx_device mlx_device_new_type(mlx_device_type, int)` with + * `typedef enum { MLX_CPU, MLX_GPU } mlx_device_type;` + * - stream.h : `mlx_stream mlx_default_gpu_stream_new(void)`, + * `mlx_stream mlx_default_cpu_stream_new(void)` + * - io.h : `int mlx_load_safetensors(mlx_map_string_to_array*, + * mlx_map_string_to_string*, const char* file, mlx_stream)`, + * `int mlx_load_gguf(mlx_io_gguf*, const char* file, mlx_stream)` + * - array.h : `mlx_array mlx_array_new_data(const void*, const int* shape, + * int dim, mlx_dtype)`, `int mlx_array_eval(mlx_array)`, + * `int mlx_array_item_int32(int32_t*, mlx_array)`, + * `const float* mlx_array_data_float32(mlx_array)`, + * `int mlx_array_free(mlx_array)` + * - ops.h : `int mlx_quantized_matmul(mlx_array*, x, w, scales, biases, + * bool transpose, mlx_optional_int group_size, + * mlx_optional_int bits, const char* mode, mlx_stream)`, + * `int mlx_matmul(...)`, `int mlx_softmax_axes(...)`, + * `int mlx_argmax_axis(mlx_array*, a, int axis, bool, stream)`, + * `int mlx_take(mlx_array*, a, indices, stream)`, + * `int mlx_astype(...)`, `int mlx_concatenate(...)` + * - fast.h : `int mlx_fast_scaled_dot_product_attention(mlx_array*, q, k, + * v, float scale, const char* mask_mode, mlx_array mask, + * mlx_array sinks, mlx_stream)`, + * `int mlx_fast_rope(mlx_array*, x, int dims, bool traditional, + 
* mlx_optional_float base, float scale, int offset, + * mlx_array freqs, mlx_stream)` + * Gemma on MLX: ml-explore/mlx-lm (`mlx_lm/models/gemma*.py`) — the reference + * for the dense SWA + shared-KV + dual-head-dim graph this backend mirrors. + * + * CoreML stateful KV-cache — Apple Core ML, MLState API, macOS 15 / iOS 18 + * (WWDC24 "Bring your ML and AI models to Apple silicon"; coremltools + * Stateful Models guide, https://apple.github.io/coremltools/docs-guides/ + * source/stateful-models.html). ObjC symbols used: + * - `+ (nullable instancetype)modelWithContentsOfURL:(NSURL*)url + * error:(NSError**)error;` (and the compiled-model `compileModelAtURL:`) + * - `- (MLState*)newState;` (creates zeroed KV state buffers; MLState is + * +new/-init UNAVAILABLE — only MLModel vends it) + * - `- (nullable id)predictionFromFeatures: + * (id)input usingState:(MLState*)state + * error:(NSError**)error;` (the in-place stateful decode step) + * Apple's own "On-Device Llama 3.1 with Core ML" research post documents the + * prefill-then-stateful-decode loop this backend's MLX/CoreML paths follow. + * + * Every hardware-specific assumption that can only be confirmed on Apple + * Silicon is marked `DEVICE-VERIFY` in the .mm. This header carries no SDK + * dependency and is safe to include anywhere. + */ + +#include "../llm-backend.h" + +/* Free-function accessor returning the singleton `mlx-coreml` factory so the + * selector (llm-backend-selector.cpp, wired separately) can register it via + * `llm_backend_register(mlx_coreml_backend_factory())`. Defined in + * mlx-coreml-backend.mm. Always returns a valid non-null static-lifetime + * pointer — when the build gate is OFF the returned factory reports + * available()/can_serve() == false and open() == nullptr ("not compiled in"), + * so registering it unconditionally is safe. 
*/ +LlmBackendFactory * mlx_coreml_backend_factory(); diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.mm b/tools/omnivoice/src/backends/mlx-coreml-backend.mm new file mode 100644 index 000000000..4b705d719 --- /dev/null +++ b/tools/omnivoice/src/backends/mlx-coreml-backend.mm @@ -0,0 +1,797 @@ +/* + * mlx-coreml-backend.mm — Apple-Silicon streaming-LLM backend (cutover M5). + * + * Objective-C++ translation unit: CoreML's MLModel / MLState API is + * Objective-C, and the MLX C++ / mlx-c headers also compile cleanly in a + * `.mm`. See mlx-coreml-backend.h for the full API research + citations and + * the MLX-primary / CoreML-alternate trade-off. + * + * STRUCTURE + * The whole real implementation sits behind + * #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) + * and is the ONLY place that includes any MLX / CoreML SDK header. With the + * gate OFF (the default Linux build) this file pulls in no SDK header at all + * and compiles to a pure no-op factory: available()/can_serve() == false, + * open() returns nullptr after setting *out_error to "not compiled in". + * + * ERROR CONTRACT (native/AGENTS.md §3 + §9): never log, never return a + * defaulted result on failure. Out-error strings are heap-allocated with + * malloc (mirroring eliza-inference-ffi.cpp's `eliza_strdup`) so the FFI + * caller frees them with `eliza_inference_free_string` / free(). + */ + +#include "mlx-coreml-backend.h" + +#include +#include +#include +#include + +// =========================================================================== +// Shared (gate-independent) helpers +// =========================================================================== + +namespace { + +/* Heap-allocate an out-error string the way the FFI translation unit does + * (eliza-inference-ffi.cpp::eliza_strdup) so the caller's free() path is + * identical regardless of which backend produced the error. 
*/ +void mlx_set_error(char ** out_error, const std::string & msg) { + if (!out_error) { + return; + } + char * out = static_cast(std::malloc(msg.size() + 1)); + if (!out) { + *out_error = nullptr; + return; + } + std::memcpy(out, msg.c_str(), msg.size() + 1); + *out_error = out; +} + +} // namespace + +// =========================================================================== +// REAL IMPLEMENTATION — Apple Silicon only, gated on ELIZA_ENABLE_MLX +// =========================================================================== +#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) + +// --- Objective-C / Apple frameworks --------------------------------------- +#import +#import // MLModel, MLState, MLFeatureProvider, MLMultiArray +#import // MTLCreateSystemDefaultDevice — Metal/ANE presence probe + +// --- MLX C API (ml-explore/mlx-c) ------------------------------------------ +// Only included behind the gate so a host without the MLX SDK still compiles. +#include "mlx/c/array.h" +#include "mlx/c/device.h" +#include "mlx/c/stream.h" +#include "mlx/c/io.h" +#include "mlx/c/ops.h" +#include "mlx/c/fast.h" +#include "mlx/c/map.h" + +#include +#include +#include +#include +#include + +namespace { + +namespace fs = std::filesystem; + +// --- bundle artifact discovery -------------------------------------------- + +enum class AppleRuntime { + None, + Mlx, // mlx weights dir (safetensors) or *.gguf under text/ + CoreMl, // *.mlmodelc / *.mlpackage under text/ +}; + +bool has_suffix(const std::string & s, const char * suffix) { + const size_t n = std::strlen(suffix); + return s.size() >= n && std::equal(s.end() - n, s.end(), suffix); +} + +/* Probe `/text/` for an Apple-servable artifact and report which + * runtime would serve it. MLX is preferred when both kinds are present (an + * mlx weights dir / gguf wins over a CoreML package), matching the header's + * "MLX primary, CoreML alternate" rule. Cheap directory walk, no model load. 
*/
+AppleRuntime detect_runtime(const char * bundle_dir, std::string & out_artifact) {
+    out_artifact.clear();
+    if (!bundle_dir || bundle_dir[0] == '\0') {
+        return AppleRuntime::None;
+    }
+    std::error_code ec;
+    const fs::path text_dir = fs::path(bundle_dir) / "text";
+    if (!fs::is_directory(text_dir, ec)) {
+        return AppleRuntime::None;
+    }
+
+    // Directory iteration order is unspecified, so keep the lexicographically
+    // smallest candidate of each kind; the pick is then stable across runs.
+    const auto keep = [](std::string & slot, const std::string & candidate) {
+        if (slot.empty() || candidate < slot) {
+            slot = candidate;
+        }
+    };
+
+    std::string gguf, mlpackage, mlmodelc, mlx_weights_dir;
+    for (fs::directory_iterator it(text_dir, ec), end; it != end && !ec; it.increment(ec)) {
+        const fs::path & p = it->path();
+        const std::string name = p.filename().string();
+        // Per-entry probes get their own error_code: `ec` is the loop's stop
+        // condition, and one unreadable entry must not end the whole walk.
+        std::error_code probe_ec;
+        if (it->is_directory(probe_ec)) {
+            // Both CoreML shapes are directories on disk: a compiled
+            // *.mlmodelc and an uncompiled *.mlpackage bundle. mlx-lm exports
+            // an `mlx` weights dir (model.safetensors + config.json).
+            if (has_suffix(name, ".mlmodelc")) {
+                keep(mlmodelc, p.string());
+            } else if (has_suffix(name, ".mlpackage")) {
+                keep(mlpackage, p.string());
+            } else if (name == "mlx" || fs::exists(p / "model.safetensors", probe_ec) ||
+                       fs::exists(p / "weights.safetensors", probe_ec)) {
+                keep(mlx_weights_dir, p.string());
+            }
+        } else {
+            if (has_suffix(name, ".gguf")) {
+                keep(gguf, p.string());
+            } else if (has_suffix(name, ".mlpackage")) {
+                // Kept for backward compatibility with the previous probe,
+                // which only looked for *.mlpackage among regular files.
+                keep(mlpackage, p.string());
+            } else if (has_suffix(name, ".safetensors")) {
+                // Loose safetensors directly under text/: text/ itself is the
+                // weights dir.
+                keep(mlx_weights_dir, text_dir.string());
+            }
+        }
+    }
+
+    // MLX primary: weights dir / safetensors first, then gguf.
+    if (!mlx_weights_dir.empty()) { out_artifact = mlx_weights_dir; return AppleRuntime::Mlx; }
+    if (!gguf.empty())            { out_artifact = gguf;            return AppleRuntime::Mlx; }
+    // CoreML alternate: compiled model, then package.
+    if (!mlmodelc.empty())        { out_artifact = mlmodelc;        return AppleRuntime::CoreMl; }
+    if (!mlpackage.empty())       { out_artifact = mlpackage;       return AppleRuntime::CoreMl; }
+    return AppleRuntime::None;
+}
+
+/* True when a Metal device (hence GPU + ANE on Apple Silicon) is present.
+ * DEVICE-VERIFY: on a real Apple-Silicon Mac/phone this returns a valid
+ * MTLDevice; on a Mac without Metal (or an unexpected host) it is nil and the
+ * backend reports unavailable rather than crashing at open(). */
+bool metal_device_present() {
+    @autoreleasepool {
+        id<MTLDevice> dev = MTLCreateSystemDefaultDevice();
+        return dev != nil;
+    }
+}
+
+// ===========================================================================
+// MLX-backed session (PRIMARY)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the session scaffolding below (weight load, prefill/next/reset
+// bookkeeping, greedy sampling) uses the mlx-c symbols, but run_forward() is
+// still a stub that reports ELIZA_ERR_NOT_IMPLEMENTED. The per-layer wiring of
+// the Gemma graph (alternating local-SWA / global attention, dual head dims,
+// shared-KV layer reuse, Per-Layer-Embeddings) must be assembled + numerically
+// validated on Apple Silicon against mlx-lm's `gemma*` reference. The
+// weight-tensor names, quant group_size/bits, and rope base/scale are meant to
+// be read from the model config at load; they are not hardcoded here.
+
+class MlxLlmSession final : public LlmBackendSession {
+public:
+    MlxLlmSession(std::string artifact, const eliza_llm_stream_config_t * cfg)
+        : artifact_(std::move(artifact)) {
+        if (cfg) {
+            cfg_ = *cfg;
+            have_cfg_ = true;
+        }
+    }
+
+    // The session owns mlx-c handles; a copy would free them twice.
+    MlxLlmSession(const MlxLlmSession &) = delete;
+    MlxLlmSession & operator=(const MlxLlmSession &) = delete;
+
+    ~MlxLlmSession() override {
+        // mlx-c handles are value types wrapping a refcounted ctx; freeing
+        // releases only our reference. Release every handle this session
+        // created, not just the KV pair.
+        free_kv();
+        free_logits();
+        free_weights();
+        if (have_stream_) {
+            mlx_stream_free(gpu_stream_);
+            have_stream_ = false;
+        }
+    }
+
+    /* Load weights + build the resident graph. Returns ELIZA_OK or negative.
+     *
+     * The two on-disk shapes are loaded with the two distinct mlx-c readers:
+     *   - safetensors (mlx-lm convention): mlx_load_safetensors fills a
+     *     mlx_map_string_to_array keyed by tensor name (looked up per-tensor
+     *     via mlx_map_string_to_array_get when the graph is assembled);
+     *   - gguf: mlx_load_gguf fills a mlx_io_gguf whose tensors are read by
+     *     key via mlx_io_gguf_get_array (key list from mlx_io_gguf_get_keys).
+     * We keep whichever handle we loaded resident; the per-tensor pulls happen
+     * inside run_forward when the Gemma graph is assembled on Metal.
+     *
+     * On success have_weights_ is true for EITHER container, so prefill()/next()
+     * accept a gguf-backed session exactly like a safetensors-backed one. On
+     * failure every handle allocated here is released and *out_error is set. */
+    int init(char ** out_error) {
+        // A repeated init() must not leak the handles of the previous load.
+        free_weights();
+
+        // GPU stream (Metal). DEVICE-VERIFY: requires a Metal device.
+        if (!have_stream_) {
+            gpu_stream_ = mlx_default_gpu_stream_new();
+            have_stream_ = true;
+        }
+
+        int rc;
+        if (has_suffix(artifact_, ".gguf")) {
+            gguf_ = mlx_io_gguf_new();
+            weight_kind_ = WeightKind::Gguf;  // handle now needs freeing
+            rc = mlx_load_gguf(&gguf_, artifact_.c_str(), gpu_stream_);
+        } else {
+            // mlx weights dir / safetensors (the mlx-lm convention).
+            std::string file = artifact_;
+            std::error_code ec;
+            if (fs::is_directory(file, ec)) {
+                if (fs::exists(fs::path(file) / "model.safetensors", ec)) {
+                    file = (fs::path(file) / "model.safetensors").string();
+                } else if (fs::exists(fs::path(file) / "weights.safetensors", ec)) {
+                    file = (fs::path(file) / "weights.safetensors").string();
+                }
+            }
+            weights_ = mlx_map_string_to_array_new();
+            weights_meta_ = mlx_map_string_to_string_new();
+            weight_kind_ = WeightKind::Safetensors;  // handles now need freeing
+            rc = mlx_load_safetensors(&weights_, &weights_meta_, file.c_str(), gpu_stream_);
+        }
+        if (rc != 0) {
+            free_weights();
+            mlx_set_error(out_error,
+                "[mlx-coreml] MLX failed to load weights from " + artifact_);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        have_weights_ = true;
+
+        // DEVICE-VERIFY: parse the sibling config.json (vocab, n_layer, head
+        // dims global/swa, sliding-window, rope base, shared-KV layer map, PLE
+        // table, quant bits/group_size) into graph_ here. Mirrors
+        // mlx_lm.utils.load's config handling. Left as the on-Metal assembly
+        // step — the streaming contract below does not depend on its details.
+        return ELIZA_OK;
+    }
+
+    /* Copy the prompt and run one forward pass over it to fill the KV cache.
+     * Returns ELIZA_OK, ELIZA_ERR_INVALID_ARG (no weights / empty prompt), or
+     * whatever run_forward reports. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!have_weights_) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids || num_tokens == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        cancel_.store(false);
+
+        // Copy the prompt (the contract says prefill copies the tokens it needs).
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // Build the [1, T] int32 input and run one forward pass that fills KV.
+        // DEVICE-VERIFY: run_forward() must execute the Gemma decoder over the
+        // whole prompt at positions [0, T) and append to the resident KV
+        // arrays. The final-position logits feed the first sampled token.
+        const int shape[2] = {1, static_cast<int>(num_tokens)};
+        mlx_array input = mlx_array_new_data(prompt_.data(), shape, 2, MLX_INT32);
+        int rc = run_forward(input, /*start_pos=*/0, &last_logits_, out_error);
+        mlx_array_free(input);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_ = static_cast<int>(num_tokens);
+        return ELIZA_OK;
+    }
+
+    /* Emit one token: sample from the cached logits, write it to tokens_out /
+     * text_out, then advance the model by that token. Returns 1 on the final
+     * step (EOS or token cap), 0 when more tokens follow, or a negative
+     * ELIZA_ERR_* code. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        // No speculative drafter on the MLX path yet (M6 wires MTP).
+        if (drafter_drafted_out) *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!have_weights_) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        // Sample one token from last_logits_ (greedy here; temperature / top-p /
+        // top-k from cfg_ applied in sample_token).
+        // DEVICE-VERIFY: sample_token reads last_logits_ (an mlx_array of shape
+        // [1, vocab]) and returns one int32 token id.
+        int32_t next_id = 0;
+        int rc = sample_token(last_logits_, &next_id, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+
+        tokens_out[0] = next_id;
+        if (num_tokens_out) *num_tokens_out = 1;
+        generated_++;
+
+        // Detokenize the single committed token into text_out (UTF-8).
+        // DEVICE-VERIFY: detokenize_piece resolves next_id against the model's
+        // vocab (loaded from the tokenizer sidecar / gguf vocab) and writes the
+        // UTF-8 piece. Partial multi-byte pieces are buffered across calls.
+        detokenize_piece(next_id, text_out, text_cap);
+
+        const bool hit_eos = is_eos(next_id);
+        const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0)
+                                ? cfg_.max_tokens
+                                : default_max_tokens_;
+        const bool hit_cap = generated_ >= cap;
+        if (hit_eos || hit_cap) {
+            return 1;  // final step
+        }
+
+        // Advance one position: forward pass for the just-sampled token only.
+        const int shape[2] = {1, 1};
+        mlx_array step_in = mlx_array_new_data(&next_id, shape, 2, MLX_INT32);
+        rc = run_forward(step_in, /*start_pos=*/n_past_, &last_logits_, out_error);
+        mlx_array_free(step_in);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_++;
+        return cancel_.load() ? ELIZA_ERR_CANCELLED : 0;  // more
+    }
+
+    /* Request cancellation; next() observes the flag on its next call. */
+    int cancel() override {
+        cancel_.store(true);
+        return ELIZA_OK;
+    }
+
+    /* Drop prompt, counters, KV and logits; the loaded weights stay resident. */
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        free_kv();  // drop resident KV arrays
+        free_logits();
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // MLX KV is a resident pair of arrays we append to; trimming to a prefix
+        // is a tensor slice. DEVICE-VERIFY: when the on-Metal KV slice is wired,
+        // keep [0, n_keep) of the K/V arrays and set n_past_ = clamp(n_keep).
+        // Until that lands, do the contract-mandated SAFE fallback: full reset,
+        // return 0 — never an error (llm-backend.h reset_keep contract).
+        (void) n_keep;
+        reset();
+        return 0;
+    }
+
+private:
+    /* Which weight container currently holds live (allocated) handles. */
+    enum class WeightKind { None, Safetensors, Gguf };
+
+    void free_kv() {
+        if (have_kv_) {
+            mlx_array_free(kv_k_);
+            mlx_array_free(kv_v_);
+            have_kv_ = false;
+        }
+    }
+    void free_logits() {
+        if (have_logits_) {
+            mlx_array_free(last_logits_);
+            have_logits_ = false;
+        }
+    }
+    /* Release whichever weight handles init() allocated. Keyed on weight_kind_
+     * (set right after the *_new() call) rather than on have_weights_, so the
+     * failed-load path frees the just-allocated handles too. */
+    void free_weights() {
+        switch (weight_kind_) {
+            case WeightKind::Safetensors:
+                mlx_map_string_to_array_free(weights_);
+                mlx_map_string_to_string_free(weights_meta_);
+                break;
+            case WeightKind::Gguf:
+                // NOTE(review): mlx_io_gguf_free is assumed to pair with the
+                // mlx_io_gguf_new() used in init() — confirm against the
+                // vendored mlx-c io.h.
+                mlx_io_gguf_free(gguf_);
+                break;
+            case WeightKind::None:
+                break;
+        }
+        weight_kind_ = WeightKind::None;
+        have_weights_ = false;
+    }
+
+    /* One transformer forward pass over `input` ([1, T] int32) starting at
+     * position `start_pos`, appending to the resident KV cache and writing the
+     * final-position logits ([1, vocab]) into *out_logits.
+     *
+     * DEVICE-VERIFY: this is the Gemma decoder graph. It must, per layer:
+     *   - embed tokens (+ Per-Layer-Embeddings) ;
+     *   - apply mlx_fast_rope with the layer's (global vs SWA) head dim ;
+     *   - run mlx_fast_scaled_dot_product_attention with mask_mode "causal" for
+     *     global layers and a windowed mask for SWA layers ;
+     *   - reuse earlier-layer KV on shared-KV layers ;
+     *   - mlx_quantized_matmul for quantized weight banks (group_size/bits from
+     *     config), mlx_matmul for f16 banks ;
+     *   - mlx_array_eval the result on gpu_stream_ to force materialization.
+     * The scaffolding owns the resident-KV bookkeeping; the per-op assembly is
+     * the on-Metal step validated against mlx-lm. */
+    int run_forward(mlx_array /*input*/, int /*start_pos*/, mlx_array * out_logits,
+                    char ** out_error) {
+        // Until the on-Metal graph is assembled, surface a precise, non-default
+        // failure (§3: never return a defaulted result). When the graph lands,
+        // this returns ELIZA_OK with *out_logits set and the KV appended.
+        free_logits();
+        (void) out_logits;
+        mlx_set_error(out_error,
+            "[mlx-coreml] MLX Gemma decode graph not assembled on this build "
+            "(DEVICE-VERIFY: requires Apple Silicon)");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    /* Greedy-sample one token id from `logits` into *out_id. Fails with
+     * ELIZA_ERR_INVALID_ARG when no logits are cached, ELIZA_ERR_FFI_FAULT when
+     * an mlx-c call reports an error. */
+    int sample_token(mlx_array logits, int32_t * out_id, char ** out_error) {
+        if (!have_logits_) {
+            mlx_set_error(out_error, "[mlx-coreml] no logits to sample");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        // DEVICE-VERIFY: apply cfg_.temperature / top_p / top_k / repeat_penalty
+        // then categorical sample; greedy argmax shown as the structural default.
+        mlx_array arg = mlx_array_new();
+        if (mlx_argmax_axis(&arg, logits, /*axis=*/-1, /*keepdims=*/false, gpu_stream_) != 0) {
+            mlx_array_free(arg);
+            mlx_set_error(out_error, "[mlx-coreml] argmax failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        mlx_array_eval(arg);
+        int32_t id = 0;
+        const int rc = mlx_array_item_int32(&id, arg);
+        mlx_array_free(arg);
+        if (rc != 0) {
+            mlx_set_error(out_error, "[mlx-coreml] failed to read sampled token");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        *out_id = id;
+        return ELIZA_OK;
+    }
+
+    bool is_eos(int32_t id) const {
+        // DEVICE-VERIFY: compare against the model's EOS / end-of-turn ids read
+        // from the tokenizer config at load. eos_id_ stays -1 until then, so no
+        // sampled id matches and only the token cap ends a generation.
+        return id == eos_id_;
+    }
+
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        // DEVICE-VERIFY: resolve the token piece from the loaded vocab and copy
+        // its UTF-8 bytes (buffering partial code points across calls). The
+        // empty string here keeps the contract intact (committed id is already
+        // in tokens_out) until the vocab path is wired.
+        if (text_out && text_cap) {
+            text_out[0] = '\0';
+        }
+    }
+
+    std::string artifact_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    mlx_stream gpu_stream_{};
+    bool have_stream_ = false;  // gpu_stream_ holds a handle to free
+
+    mlx_map_string_to_array weights_{};        // safetensors tensors
+    mlx_map_string_to_string weights_meta_{};  // safetensors metadata
+    mlx_io_gguf gguf_{};                       // gguf container
+    WeightKind weight_kind_ = WeightKind::None;
+    bool have_weights_ = false;  // a load succeeded (either container)
+
+    mlx_array kv_k_{};
+    mlx_array kv_v_{};
+    bool have_kv_ = false;
+
+    mlx_array last_logits_{};
+    bool have_logits_ = false;
+
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// CoreML-backed session (ALTERNATE — ANE-bound, stateful MLState KV cache)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the converted decoder package must expose (a) an input
+// feature for the current token id(s) and position, (b) an MLState-backed KV
+// cache, and (c) a logits output. Apple's "On-Device Llama 3.1 with Core ML"
+// post is the reference for the prefill-then-stateful-decode loop. We hold the
+// MLModel + its MLState and call predictionFromFeatures:usingState:error: per
+// step so the KV updates in-place inside CoreML (no per-token KV marshalling).
+
+class CoreMlLlmSession final : public LlmBackendSession {
+public:
+    CoreMlLlmSession(std::string package, const eliza_llm_stream_config_t * cfg)
+        : package_(std::move(package)) {
+        if (cfg) {
+            cfg_ = *cfg;
+            have_cfg_ = true;
+        }
+    }
+
+    ~CoreMlLlmSession() override {
+        @autoreleasepool {
+            state_ = nil;
+            model_ = nil;
+        }
+    }
+
+    int init(char ** out_error) {
+        @autoreleasepool {
+            NSError * err = nil;
+            // stringWithUTF8String: yields nil for bytes that are not valid
+            // UTF-8, and fileURLWithPath: raises on a nil path — fail with an
+            // error string instead of an Objective-C exception.
+            NSString * path = [NSString stringWithUTF8String:package_.c_str()];
+            if (!path) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML package path is not valid UTF-8: " + package_);
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            NSURL * url = [NSURL fileURLWithPath:path];
+
+            NSURL * compiled = url;
+            // A *.mlpackage must be compiled to *.mlmodelc before loading; a
+            // *.mlmodelc loads directly.
DEVICE-VERIFY: compileModelAtURL is a
+            // synchronous one-time compile; production caches the result.
+            // package_ is already a std::string, so test the suffix directly
+            // instead of round-tripping through NSString.
+            if (has_suffix(package_, ".mlpackage")) {
+                NSURL * c = [MLModel compileModelAtURL:url error:&err];
+                if (!c) {
+                    mlx_set_error(out_error,
+                        "[mlx-coreml] CoreML compile failed: " + ns_error_text(err));
+                    return ELIZA_ERR_BUNDLE_INVALID;
+                }
+                compiled = c;
+            }
+
+            MLModelConfiguration * conf = [[MLModelConfiguration alloc] init];
+            // DEVICE-VERIFY: .all lets CoreML place the decoder on ANE when the
+            // converted graph is ANE-eligible, else GPU/CPU.
+            conf.computeUnits = MLComputeUnitsAll;
+
+            model_ = [MLModel modelWithContentsOfURL:compiled
+                                       configuration:conf
+                                               error:&err];
+            if (!model_) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML model load failed: " + ns_error_text(err));
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+
+            // newState vends zeroed KV buffers; MLState is +new/-init
+            // UNAVAILABLE — only MLModel produces it (macOS 15 / iOS 18).
+            state_ = make_state();
+            if (!state_) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML model has no stateful KV cache "
+                    "(newState returned nil) — needs a stateful decoder package");
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            return ELIZA_OK;
+        }
+    }
+
+    /* Copy the prompt and feed it through the stateful model in one step.
+     * Returns ELIZA_ERR_INVALID_ARG when the model/state is missing or the
+     * prompt is empty, otherwise whatever run_step reports. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!model_ || !state_) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids || num_tokens == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        cancel_.store(false);
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // DEVICE-VERIFY: feed the whole prompt as one prediction with positions
+        // [0, T) so CoreML fills the MLState KV in one pass, then keep the
+        // final-position logits for the first sampled token. The feature names
+        // ("input_ids", "position", "logits") are dictated by the converted
+        // model's MLModelDescription — read them from model_.modelDescription.
+        return run_step(prompt_.data(), prompt_.size(), /*start_pos=*/0, out_error);
+    }
+
+    /* Emit one token from the cached logits, then run one stateful decode step
+     * for it. Returns 1 on the final step (EOS or token cap), 0 when more
+     * tokens follow, or a negative ELIZA_ERR_* code. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        if (drafter_drafted_out) *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!model_ || !state_) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        int32_t next_id = 0;
+        int rc = sample_from_last_logits(&next_id, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        tokens_out[0] = next_id;
+        if (num_tokens_out) *num_tokens_out = 1;
+        generated_++;
+        detokenize_piece(next_id, text_out, text_cap);
+
+        const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0)
+                                ? cfg_.max_tokens
+                                : default_max_tokens_;
+        if (is_eos(next_id) || generated_ >= cap) {
+            return 1;  // final
+        }
+
+        // One stateful decode step for the just-sampled token.
+        const int32_t one = next_id;
+        rc = run_step(&one, 1, /*start_pos=*/n_past_, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_++;
+        return cancel_.load() ? ELIZA_ERR_CANCELLED : 0;  // more
+    }
+
+    /* Request cancellation; next() observes the flag on its next call. */
+    int cancel() override {
+        cancel_.store(true);
+        return ELIZA_OK;
+    }
+
+    /* Clear prompt and counters and swap in a fresh MLState. */
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        @autoreleasepool {
+            // A fresh MLState zeroes the KV cache — the canonical CoreML reset.
+            if (model_) {
+                state_ = make_state();
+            }
+        }
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // CoreML's MLState is opaque: there is no public API to truncate the KV
+        // to a prefix. Per the llm-backend.h contract, fall back to a full
+        // reset and return 0 — never an error.
+        (void) n_keep;
+        reset();
+        return 0;
+    }
+
+private:
+    /* Text of an NSError for an error string. Falls back to "unknown" when
+     * there is no error object or its description has no UTF-8 form, so a null
+     * char pointer is never appended to a std::string. */
+    static std::string ns_error_text(NSError * err) {
+        const char * text = err ? err.localizedDescription.UTF8String : nullptr;
+        return text ? std::string(text) : std::string("unknown");
+    }
+
+    /* Ask the loaded model for a fresh MLState, or nil when there is no model
+     * or the running OS predates the MLState API (macOS 15 / iOS 18). Callers
+     * already treat nil as "no stateful KV cache". */
+    MLState * make_state() {
+        if (@available(macOS 15.0, iOS 18.0, *)) {
+            return model_ ? [model_ newState] : nil;
+        }
+        return nil;
+    }
+
+    /* Run one prediction (`n` tokens starting at `start_pos`) through the
+     * stateful model, updating the MLState KV in place and caching the
+     * final-position logits. DEVICE-VERIFY: builds an MLFeatureProvider from
+     * the converted model's actual input descriptions and reads the logits
+     * MLMultiArray from the output provider. */
+    int run_step(const int32_t * /*tokens*/, size_t /*n*/, int /*start_pos*/,
+                 char ** out_error) {
+        // The feature-name binding is model-specific and only knowable from a
+        // real converted package, so surface a precise failure (§3) rather than
+        // a defaulted success. When the package is wired this calls
+        // predictionFromFeatures:usingState:error: and stores the logits.
+        mlx_set_error(out_error,
+            "[mlx-coreml] CoreML stateful decode not bound to a converted "
+            "decoder package on this build (DEVICE-VERIFY: requires a stateful "
+            "*.mlmodelc and Apple Silicon)");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int sample_from_last_logits(int32_t * /*out_id*/, char ** out_error) {
+        // DEVICE-VERIFY: argmax / temperature-sample over the cached logits
+        // MLMultiArray. Fails precisely until run_step populates them.
+        mlx_set_error(out_error, "[mlx-coreml] no CoreML logits to sample");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    // eos_id_ stays -1 in this class, so no sampled id matches it.
+    bool is_eos(int32_t id) const { return id == eos_id_; }
+
+    // Placeholder: writes an empty string until a vocab is wired in.
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        if (text_out && text_cap) {
+            text_out[0] = '\0';
+        }
+    }
+
+    std::string package_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    MLModel * model_ = nil;
+    MLState * state_ = nil;
+
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// Factory (real)
+// ===========================================================================
+
+class MlxCoreMlFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return "mlx-coreml"; }
+
+    bool available() const override {
+        // Compiled in (we are inside the gate) AND a Metal device is present.
+        // DEVICE-VERIFY: true on Apple Silicon; false on a Mac without Metal.
+        return metal_device_present();
+    }
+
+    bool can_serve(const char * bundle_dir) const override {
+        std::string artifact;
+        return detect_runtime(bundle_dir, artifact) != AppleRuntime::None;
+    }
+
+    int preference_rank() const override {
+        // Highest on Apple Silicon: the in-process Metal/ANE path beats the
+        // in-tree llama.cpp Metal path for the Gemma geometry. > LiteRT(0 here).
+        return 100;
+    }
+
+    /* Open a session for the context's bundle: MLX when an MLX artifact is
+     * present, else CoreML. Returns nullptr with *out_error set when the
+     * bundle dir is missing, holds no servable artifact, or init() fails. */
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        // Resolve the bundle root from the context accessor (the struct is
+        // otherwise opaque here), then pick MLX vs CoreML from its artifacts.
+        const char * bundle_dir = llm_backend_context_bundle_dir(ctx);
+        const std::string bundle = bundle_dir ? bundle_dir : std::string();
+        if (bundle.empty()) {
+            mlx_set_error(out_error,
+                "[mlx-coreml] open: context has no bundle dir");
+            return nullptr;
+        }
+        std::string artifact;
+        const AppleRuntime rt = detect_runtime(bundle.c_str(), artifact);
+        if (rt == AppleRuntime::Mlx) {
+            auto * s = new MlxLlmSession(artifact, cfg);
+            const int rc = s->init(out_error);
+            if (rc != ELIZA_OK) {
+                delete s;
+                return nullptr;
+            }
+            return s;
+        }
+        if (rt == AppleRuntime::CoreMl) {
+            auto * s = new CoreMlLlmSession(artifact, cfg);
+            const int rc = s->init(out_error);
+            if (rc != ELIZA_OK) {
+                delete s;
+                return nullptr;
+            }
+            return s;
+        }
+        mlx_set_error(out_error,
+            "[mlx-coreml] open: bundle has no MLX/CoreML text artifact under text/");
+        return nullptr;
+    }
+};
+
+MlxCoreMlFactory g_factory;
+
+} // namespace
+
+LlmBackendFactory * mlx_coreml_backend_factory() {
+    return &g_factory;
+}
+
+// ===========================================================================
+// STUB IMPLEMENTATION — every non-Apple / gate-OFF build
+// ===========================================================================
+#else // !(ELIZA_ENABLE_MLX && __APPLE__)
+
+namespace {
+
+/* No SDK header is included on this path, so the file compiles on a plain
+ * Linux host. The factory reports itself unavailable and refuses to open.
*/ +class MlxCoreMlStubFactory final : public LlmBackendFactory { +public: + const char * name() const override { return "mlx-coreml"; } + bool available() const override { return false; } + bool can_serve(const char * /*bundle_dir*/) const override { return false; } + int preference_rank() const override { return 0; } + + LlmBackendSession * open(EliInferenceContext * /*ctx*/, + const eliza_llm_stream_config_t * /*cfg*/, + char ** out_error) override { + mlx_set_error(out_error, + "[mlx-coreml] backend not compiled in " + "(needs -DELIZA_ENABLE_MLX on Apple Silicon)"); + return nullptr; + } +}; + +MlxCoreMlStubFactory g_stub_factory; + +} // namespace + +LlmBackendFactory * mlx_coreml_backend_factory() { + return &g_stub_factory; +} + +#endif // ELIZA_ENABLE_MLX && __APPLE__ diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp index 345c87cb0..e35445169 100644 --- a/tools/omnivoice/src/eliza-inference-ffi.cpp +++ b/tools/omnivoice/src/eliza-inference-ffi.cpp @@ -14,6 +14,12 @@ // resolve `eliza_inference_*` symbols from this object. #include "eliza-inference-ffi.h" +#include "llm-backend.h" +#include "embed-backend.h" +#include "vision-backend.h" +#include "asr-backend.h" +#include "tts-backend.h" +#include "eot-backend.h" #include "omnivoice.h" #include "llama.h" #include "mtmd.h" @@ -173,6 +179,13 @@ struct EliInferenceContext { #endif }; +/* M3 seam accessor (declared in llm-backend.h): hand a backend's open() the + * bundle root without exposing the struct. Defined here where the type is + * complete. */ +const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx) { + return ctx ? ctx->bundle_dir.c_str() : nullptr; +} + /* ELZ2 magic 'ELZ1' (the ascii bytes 'E','L','Z','1' little-endian). * The magic stays 'ELZ1' across format versions — only the version * word at offset 4 changes between v1 and v2. 
*/ @@ -1135,6 +1148,11 @@ static void reset_engine(Engine * e) { struct EliLlmStream { EliInferenceContext * ctx = nullptr; + /* Multi-backend seam (M3): when non-NULL, this session is driven by an + * alternate in-process runtime (LiteRT-LM / MLX-CoreML) and the llama.cpp + * fields below (lctx/sampler/mtp) are unused — every FFI streaming entry + * delegates to `backend` and returns before touching the llama.cpp path. */ + LlmBackendSession * backend = nullptr; llama_context * lctx = nullptr; llama_sampler * sampler = nullptr; int n_past = 0; @@ -1867,6 +1885,24 @@ int eliza_inference_tts_synthesize( return ELIZA_ERR_INVALID_ARG; } + /* Per-op backend seam: a TTS backend (e.g. LiteRT/NPU) serves this when it + * ships /tts/*; otherwise fall through to the in-tree OmniVoice path + * below. Inert by default (no backend registered). */ + { + char * be_error = nullptr; + TtsBackendFactory * be = + tts_backend_select(llm_backend_context_bundle_dir(ctx), &be_error); + if (be_error) { + eliza_set_error(out_error, std::string(be_error)); + std::free(be_error); + return ELIZA_ERR_BUNDLE_INVALID; + } + if (be) { + return be->tts_synthesize(ctx, text, text_len, speaker_preset_id, + out_pcm, max_samples, out_error); + } + } + std::lock_guard lock(ctx->tts_mutex); if (!ctx->ov) { eliza_set_error(out_error, "[libelizainference] tts_synthesize: TTS region is not acquired; call mmap_acquire(\"tts\") after arming voice"); @@ -2068,6 +2104,25 @@ int eliza_inference_asr_transcribe( eliza_set_error(out_error, "[libelizainference] asr_transcribe: invalid arguments"); return ELIZA_ERR_INVALID_ARG; } + + /* Per-op backend seam: an ASR backend (e.g. LiteRT/NPU) serves this when it + * ships /asr/*; otherwise fall through to the in-tree ggml path + * below. Inert by default (no backend registered). 
*/ + { + char * be_error = nullptr; + AsrBackendFactory * be = + asr_backend_select(llm_backend_context_bundle_dir(ctx), &be_error); + if (be_error) { + eliza_set_error(out_error, std::string(be_error)); + std::free(be_error); + return ELIZA_ERR_BUNDLE_INVALID; + } + if (be) { + return be->asr_transcribe(ctx, pcm, n_samples, sample_rate_hz, + out_text, max_text_bytes, out_error); + } + } + std::string transcript; int rc = eliza_asr_decode_core(ctx, pcm, n_samples, sample_rate_hz, max_text_bytes, transcript, out_error); if (rc < 0) { @@ -2887,6 +2942,40 @@ EliLlmStream * eliza_inference_llm_stream_open( return nullptr; } + /* Multi-backend seam (M3): an alternate in-process runtime (LiteRT-LM / + * MLX-CoreML) may serve this bundle. The selector returns nullptr with NO + * error to keep the in-tree llama.cpp path below; nullptr WITH an error is a + * hard env-select failure to propagate. */ + { + char * sel_err = nullptr; + LlmBackendFactory * factory = + llm_backend_select(ctx->bundle_dir.c_str(), cfg, &sel_err); + if (!factory && sel_err) { + if (out_error) { + *out_error = sel_err; + } else { + eliza_inference_free_string(sel_err); + } + return nullptr; + } + if (factory) { + EliLlmStream * bstream = new (std::nothrow) EliLlmStream(); + if (!bstream) { + eliza_set_error(out_error, + "[libelizainference] llm_stream_open: out of memory"); + return nullptr; + } + bstream->ctx = ctx; + bstream->max_tokens = cfg->max_tokens > 0 ? 
cfg->max_tokens : 0; + bstream->backend = factory->open(ctx, cfg, out_error); + if (!bstream->backend) { + delete bstream; + return nullptr; + } + return bstream; + } + } + llama_model * model = nullptr; { std::lock_guard lock(ctx->llm_mutex); @@ -2988,6 +3077,9 @@ int eliza_inference_llm_stream_prefill( const int32_t * token_ids, size_t num_tokens, char ** out_error) { + if (stream && stream->backend) { + return stream->backend->prefill(token_ids, num_tokens, out_error); + } if (!stream || (!stream->lctx && !stream->mtp)) { eliza_set_error(out_error, "[libelizainference] llm_stream_prefill: invalid session"); @@ -3056,6 +3148,11 @@ int eliza_inference_llm_stream_next( if (drafter_accepted_out) *drafter_accepted_out = 0; if (text_out && text_cap > 0) text_out[0] = '\0'; + if (stream && stream->backend) { + return stream->backend->next(tokens_out, tokens_cap, num_tokens_out, + text_out, text_cap, drafter_drafted_out, + drafter_accepted_out, out_error); + } if (!stream || (!stream->mtp && (!stream->lctx || !stream->sampler))) { eliza_set_error(out_error, "[libelizainference] llm_stream_next: invalid session"); @@ -3245,6 +3342,9 @@ int eliza_inference_llm_stream_next( } int eliza_inference_llm_stream_cancel(EliLlmStream * stream) { + if (stream && stream->backend) { + return stream->backend->cancel(); + } if (stream) { stream->cancel.store(true, std::memory_order_release); } @@ -3255,6 +3355,9 @@ int eliza_inference_llm_stream_save_slot( EliLlmStream * stream, const char * filename, char ** out_error) { + if (stream && stream->backend) { + return stream->backend->save_slot(filename, out_error); + } (void) stream; (void) filename; /* v1: cross-launch slot KV persistence is not wired. 
Return a structured @@ -3269,6 +3372,9 @@ int eliza_inference_llm_stream_restore_slot( EliLlmStream * stream, const char * filename, char ** out_error) { + if (stream && stream->backend) { + return stream->backend->restore_slot(filename, out_error); + } (void) stream; (void) filename; eliza_set_error(out_error, @@ -3285,6 +3391,7 @@ int eliza_inference_llm_stream_reset(EliLlmStream * stream) { * created/destroyed repeatedly. Handles both the plain fixed-KV stream and * the MTP speculative engine (which owns its own target/draft KV). */ if (!stream) return ELIZA_ERR_INVALID_ARG; + if (stream->backend) return stream->backend->reset(); if (!stream->mtp && !stream->lctx) return ELIZA_ERR_INVALID_ARG; if (stream->mtp) { /* MTP stream: clear both the target and draft KV caches, reset the @@ -3319,6 +3426,7 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep) * separate (riskier) handling — prefix-reuse mode opens the resident stream * without MTP, trading MTP's ~1.5x decode for the much larger prefill cut. */ if (!stream) return ELIZA_ERR_INVALID_ARG; + if (stream->backend) return stream->backend->reset_keep(n_keep); if (stream->mtp || !stream->lctx) return ELIZA_ERR_INVALID_ARG; if (n_keep < 0) n_keep = 0; if (n_keep > stream->n_past) n_keep = stream->n_past; @@ -3339,6 +3447,10 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep) void eliza_inference_llm_stream_close(EliLlmStream * stream) { if (!stream) return; + if (stream->backend) { + delete stream->backend; + stream->backend = nullptr; + } if (stream->mtp) { eliza_mtp::free_engine(stream->mtp); stream->mtp = nullptr; @@ -3435,6 +3547,24 @@ int eliza_inference_embed( return ELIZA_ERR_INVALID_ARG; } + /* Per-op backend seam: an embedding backend (e.g. LiteRT/NPU) serves this + * when it ships /embedding/*; otherwise fall through to the in-tree + * ggml encoder below. Inert by default (no backend registered). 
*/ + { + char * be_error = nullptr; + EmbedBackendFactory * be = + embed_backend_select(llm_backend_context_bundle_dir(ctx), &be_error); + if (be_error) { + eliza_set_error(out_error, std::string(be_error)); + std::free(be_error); + return ELIZA_ERR_BUNDLE_INVALID; + } + if (be) { + return be->embed(ctx, text, text_len, pooling, out_embedding, + out_capacity, out_dim, out_error); + } + } + std::lock_guard lock(ctx->llm_mutex); int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error); if (rc != ELIZA_OK) return rc; @@ -3569,6 +3699,25 @@ int eliza_inference_llm_eot_score( return ELIZA_ERR_INVALID_ARG; } + /* Per-op backend seam: an EOT backend (e.g. LiteRT/NPU) serves this when it + * ships /eot/*; otherwise fall through to the in-tree ggml + * causal-scoring path below. Inert by default (no backend registered). */ + { + char * be_error = nullptr; + EotBackendFactory * be = + eot_backend_select(llm_backend_context_bundle_dir(ctx), &be_error); + if (be_error) { + eliza_set_error(out_error, std::string(be_error)); + std::free(be_error); + return ELIZA_ERR_BUNDLE_INVALID; + } + if (be) { + return be->eot_score(ctx, token_ids, num_tokens, target_token_id, + out_target_prob, out_top_token, out_top_prob, + out_error); + } + } + std::lock_guard lock(ctx->llm_mutex); int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error); if (rc != ELIZA_OK) return rc; @@ -3730,6 +3879,24 @@ int eliza_inference_describe_image( return ELIZA_ERR_INVALID_ARG; } + /* Per-op backend seam: a vision backend (e.g. LiteRT/NPU) serves this when it + * ships /vision/*; otherwise fall through to the in-tree ggml mmproj + * path below. Inert by default (no backend registered). 
*/ + { + char * be_error = nullptr; + VisionBackendFactory * be = + vision_backend_select(llm_backend_context_bundle_dir(ctx), &be_error); + if (be_error) { + eliza_set_error(out_error, std::string(be_error)); + std::free(be_error); + return ELIZA_ERR_BUNDLE_INVALID; + } + if (be) { + return be->describe_image(ctx, image_bytes, n_bytes, mmproj_path, + prompt, out_text, max_text_bytes, out_error); + } + } + std::lock_guard lock(ctx->llm_mutex); int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error); if (rc != ELIZA_OK) return rc; diff --git a/tools/omnivoice/src/embed-backend-selector.cpp b/tools/omnivoice/src/embed-backend-selector.cpp new file mode 100644 index 000000000..56449fb07 --- /dev/null +++ b/tools/omnivoice/src/embed-backend-selector.cpp @@ -0,0 +1,41 @@ +/* + * embed-backend-selector.cpp — registry + selection for the per-op embedding + * backend seam. A thin instantiation of eliza_backend::Registry + * (backend-registry.h) — the resolution logic is shared with every other + * modality. Inert by default: with no -DELIZA_ENABLE_* embedding backend + * compiled in, nothing registers and embed_backend_select() returns nullptr, so + * eliza_inference_embed keeps the in-tree ggml encoder path. + */ + +#include "embed-backend.h" +#include "backend-registry.h" + +#include + +/* Gated factory accessor — declared only when the backend is compiled in. 
*/ +#ifdef ELIZA_ENABLE_LITERT +EmbedBackendFactory * litert_embed_backend_factory(); +#endif + +namespace { +eliza_backend::Registry g_registry; +std::once_flag g_builtins_once; +} // namespace + +void embed_backend_register(EmbedBackendFactory * factory) { + g_registry.register_factory(factory); +} + +void embed_backend_register_builtins() { + std::call_once(g_builtins_once, []() { +#ifdef ELIZA_ENABLE_LITERT + embed_backend_register(litert_embed_backend_factory()); +#endif + }); +} + +EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error) { + embed_backend_register_builtins(); + return g_registry.select("ELIZA_EMBED_BACKEND", "ELIZA_BACKEND", "embed", + bundle_dir, out_error); +} diff --git a/tools/omnivoice/src/embed-backend.h b/tools/omnivoice/src/embed-backend.h new file mode 100644 index 000000000..23473a648 --- /dev/null +++ b/tools/omnivoice/src/embed-backend.h @@ -0,0 +1,62 @@ +#pragma once +/* + * embed-backend.h — per-op backend seam for pooled text embeddings. + * + * The first per-op generalization of the M3 streaming-LLM seam: a one-shot op + * (eliza_inference_embed) that an accelerator backend can serve when it ships an + * embedding artifact under `/embedding/`, while every other op — and + * embedding itself when no artifact is present — stays on the in-tree ggml path. + * + * Embedding is the natural first LiteRT/NPU target: a static-shape, encoder-only + * forward with no streaming/KV/sampler, so the factory mirrors the FFI 1:1 and + * the FFI delegates without translation. Selection reuses the shared + * eliza_backend::Registry (backend-registry.h): ELIZA_EMBED_BACKEND (per-op) then + * ELIZA_BACKEND (global) hard-select, else the highest preference_rank among + * available()+can_serve() factories, else nullptr (the ggml encoder path). 
+ */ + +#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */ + +#include + +struct EliInferenceContext; + +/* One factory per linked-in embedding runtime (e.g. LiteRT). */ +struct EmbedBackendFactory { + virtual ~EmbedBackendFactory() = default; + + /* Stable lower-case id, e.g. "litert". Matched case-insensitively against + * ELIZA_EMBED_BACKEND / ELIZA_BACKEND. */ + virtual const char * name() const = 0; + + /* Compiled in AND host deps present (the LiteRT runtime + a GPU/NPU + * delegate). Cheap — must not load a model. */ + virtual bool available() const = 0; + + /* The embedding artifact exists under `/embedding/`. Cheap + * directory probe, no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0). + * An NPU-served embedding returns a high positive value; a GPU-delegate + * fallback a lower positive value. */ + virtual int preference_rank() const { return 0; } + + /* Mirrors eliza_inference_embed 1:1. Returns ELIZA_OK and writes `*out_dim` + * floats into out_embedding (>= out_capacity required), or a negative ELIZA_* + * code with `*out_error` heap-allocated for the caller to free. */ + virtual int embed(EliInferenceContext * ctx, const char * text, size_t text_len, + int pooling, float * out_embedding, size_t out_capacity, + int * out_dim, char ** out_error) = 0; +}; + +/* Register a factory (idempotent by name). */ +void embed_backend_register(EmbedBackendFactory * factory); + +/* Register every embedding backend compiled into THIS build (gated by the + * -DELIZA_ENABLE_* options). Idempotent; called by embed_backend_select. */ +void embed_backend_register_builtins(); + +/* Pick an embedding backend for the bundle at `bundle_dir`. nullptr + no error + * => use the in-tree ggml encoder path. nullptr + *out_error => hard failure. 
*/ +EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error); diff --git a/tools/omnivoice/src/eot-backend-selector.cpp b/tools/omnivoice/src/eot-backend-selector.cpp new file mode 100644 index 000000000..32bb9fe65 --- /dev/null +++ b/tools/omnivoice/src/eot-backend-selector.cpp @@ -0,0 +1,35 @@ +/* + * eot-backend-selector.cpp — registry + selection for the per-op end-of-turn + * scoring backend seam. A thin instantiation of + * eliza_backend::Registry (backend-registry.h) — the + * resolution logic is shared with every other modality. Inert by default: no + * -DELIZA_ENABLE_* EOT backend is compiled in (none exists yet), so nothing + * registers and eot_backend_select() returns nullptr, so + * eliza_inference_llm_eot_score keeps the in-tree ggml causal-scoring path. + */ + +#include "eot-backend.h" +#include "backend-registry.h" + +#include + +namespace { +eliza_backend::Registry g_registry; +std::once_flag g_builtins_once; +} // namespace + +void eot_backend_register(EotBackendFactory * factory) { + g_registry.register_factory(factory); +} + +void eot_backend_register_builtins() { + std::call_once(g_builtins_once, []() { + /* No EOT backend exists yet — the seam stays inert. */ + }); +} + +EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error) { + eot_backend_register_builtins(); + return g_registry.select("ELIZA_EOT_BACKEND", "ELIZA_BACKEND", "eot", + bundle_dir, out_error); +} diff --git a/tools/omnivoice/src/eot-backend.h b/tools/omnivoice/src/eot-backend.h new file mode 100644 index 000000000..1c51dcbb6 --- /dev/null +++ b/tools/omnivoice/src/eot-backend.h @@ -0,0 +1,62 @@ +#pragma once +/* + * eot-backend.h — per-op backend seam for end-of-turn scoring. 
+ * + * A one-shot op (eliza_inference_llm_eot_score) that an accelerator backend can + * serve when it ships an EOT artifact under `/eot/`, while every other + * op — and EOT itself when no artifact is present — stays on the in-tree ggml + * causal-scoring path. + * + * The factory mirrors the FFI 1:1 and the FFI delegates without translation. + * Selection reuses the shared eliza_backend::Registry (backend-registry.h): + * ELIZA_EOT_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the + * highest preference_rank among available()+can_serve() factories, else nullptr + * (the ggml EOT-scoring path). + */ + +#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */ + +#include +#include + +struct EliInferenceContext; + +/* One factory per linked-in EOT runtime (e.g. LiteRT). */ +struct EotBackendFactory { + virtual ~EotBackendFactory() = default; + + /* Stable lower-case id, e.g. "litert". Matched case-insensitively against + * ELIZA_EOT_BACKEND / ELIZA_BACKEND. */ + virtual const char * name() const = 0; + + /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate). + * Cheap — must not load a model. */ + virtual bool available() const = 0; + + /* The EOT artifact exists under `/eot/`. Cheap directory probe, + * no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0). + * An NPU-served EOT returns a high positive value; a GPU-delegate fallback a + * lower positive value. */ + virtual int preference_rank() const { return 0; } + + /* Mirrors eliza_inference_llm_eot_score 1:1. Returns ELIZA_OK and writes the + * next-token probabilities, or a negative ELIZA_* code with `*out_error` + * heap-allocated for the caller to free. 
*/ + virtual int eot_score(EliInferenceContext * ctx, const int32_t * token_ids, size_t num_tokens, + int32_t target_token_id, float * out_target_prob, int32_t * out_top_token, + float * out_top_prob, char ** out_error) = 0; +}; + +/* Register a factory (idempotent by name). */ +void eot_backend_register(EotBackendFactory * factory); + +/* Register every EOT backend compiled into THIS build (gated by the + * -DELIZA_ENABLE_* options). Idempotent; called by eot_backend_select. */ +void eot_backend_register_builtins(); + +/* Pick an EOT backend for the bundle at `bundle_dir`. nullptr + no error + * => use the in-tree ggml EOT-scoring path. nullptr + *out_error => hard failure. */ +EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error); diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp new file mode 100644 index 000000000..3ffe37680 --- /dev/null +++ b/tools/omnivoice/src/llm-backend-selector.cpp @@ -0,0 +1,140 @@ +/* + * llm-backend-selector.cpp — registry + selection for the multi-runtime + * streaming-LLM seam (cutover plan M3). + * + * On a default build (no -DELIZA_ENABLE_* gate) NO alternate backend is + * registered, so llm_backend_select() always returns nullptr and the FFI keeps + * the in-tree llama.cpp path. The seam is therefore inert-by-default: the + * library behaves exactly as before until an accelerator backend is compiled in. + */ + +#include "llm-backend.h" + +#include +#include +#include +#include +#include +#include + +/* Gated backend factory accessors. Declared only when the matching backend is + * compiled in; register_builtins() calls them under the same gate. Keeping the + * declarations gated means the default build has no unresolved symbols. 
*/ +#ifdef ELIZA_ENABLE_LITERT_LM +LlmBackendFactory * litert_backend_factory(); +#endif +#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) +LlmBackendFactory * mlx_coreml_backend_factory(); +#endif + +namespace { + +std::mutex g_reg_mutex; +std::vector g_factories; +std::once_flag g_builtins_once; + +/* Heap-allocate an error string with malloc so the caller can release it with + * eliza_inference_free_string() (which calls free()), matching the FFI contract. */ +char * dup_error(const std::string & msg) { + char * out = (char *) std::malloc(msg.size() + 1); + if (out) std::memcpy(out, msg.c_str(), msg.size() + 1); + return out; +} + +bool iequals(const char * a, const char * b) { + if (!a || !b) return false; + while (*a && *b) { + if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { + return false; + } + ++a; + ++b; + } + return *a == *b; +} + +bool is_llamacpp_name(const char * s) { + return iequals(s, "llama.cpp") || iequals(s, "llamacpp") || iequals(s, "llama"); +} + +} // namespace + +void llm_backend_register(LlmBackendFactory * factory) { + if (!factory) return; + std::lock_guard lock(g_reg_mutex); + for (LlmBackendFactory * f : g_factories) { + if (iequals(f->name(), factory->name())) return; /* idempotent by name */ + } + g_factories.push_back(factory); +} + +void llm_backend_register_builtins() { + std::call_once(g_builtins_once, []() { +#ifdef ELIZA_ENABLE_LITERT_LM + llm_backend_register(litert_backend_factory()); +#endif +#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) + llm_backend_register(mlx_coreml_backend_factory()); +#endif + }); +} + +LlmBackendFactory * llm_backend_select(const char * bundle_dir, + const eliza_llm_stream_config_t * /*cfg*/, + char ** out_error) { + llm_backend_register_builtins(); + + /* (1) ELIZA_LLM_BACKEND env: a HARD select. 
*/ + const char * forced = std::getenv("ELIZA_LLM_BACKEND"); + if (forced && forced[0] != '\0') { + if (is_llamacpp_name(forced)) { + return nullptr; /* force the in-tree path, not an error */ + } + std::lock_guard lock(g_reg_mutex); + for (LlmBackendFactory * f : g_factories) { + if (!iequals(f->name(), forced)) continue; + if (!f->available()) { + if (out_error) { + *out_error = dup_error( + std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + + " is not available in this build/host"); + } + return nullptr; + } + if (!f->can_serve(bundle_dir)) { + if (out_error) { + *out_error = dup_error( + std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + + " cannot serve the bundle at " + + (bundle_dir ? bundle_dir : "(null)")); + } + return nullptr; + } + return f; + } + if (out_error) { + *out_error = dup_error( + std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + + " is not a registered backend"); + } + return nullptr; + } + + /* (2) Auto-select: the highest preference_rank among available + can_serve. + * The in-tree llama.cpp path is the implicit rank-0 fallback, so an + * accelerator backend only wins when it returns a positive rank AND can + * serve this bundle. */ + std::lock_guard lock(g_reg_mutex); + LlmBackendFactory * best = nullptr; + int best_rank = 0; + for (LlmBackendFactory * f : g_factories) { + if (!f->available()) continue; + if (!f->can_serve(bundle_dir)) continue; + const int rank = f->preference_rank(); + if (rank > best_rank) { + best_rank = rank; + best = f; + } + } + return best; /* nullptr => in-tree llama.cpp */ +} diff --git a/tools/omnivoice/src/llm-backend.h b/tools/omnivoice/src/llm-backend.h new file mode 100644 index 000000000..0fad67f3c --- /dev/null +++ b/tools/omnivoice/src/llm-backend.h @@ -0,0 +1,167 @@ +#pragma once +/* + * llm-backend.h — multi-runtime streaming-LLM backend seam (cutover plan M3). 
+ * + * The libelizainference streaming-LLM FFI (`eliza_inference_llm_stream_*`) is + * ONE pipe that can be driven by more than one in-process inference runtime: + * + * - llama.cpp — the default / reference backend (CPU / CUDA / Vulkan-Mali- + * Adreno / Metal). Always present; the in-tree code path. + * - LiteRT-LM — Android NPU (Tensor / Qualcomm QNN / MediaTek NeuroPilot), + * optionally desktop/iOS GPU. Gated -DELIZA_ENABLE_LITERT_LM. + * - CoreML/MLX — Apple Silicon (mac first, iOS later). Gated -DELIZA_ENABLE_MLX. + * + * Per native/AGENTS.md §11 (reinterpreted by the Gemma-4 cutover): "one managed + * library, one pipe, no sidecar/subprocess/TCP." LiteRT-LM and MLX are + * EMBEDDABLE in-process C++ libraries linked INTO libelizainference and exposed + * behind the SAME FFI streaming symbols — never a child process or TCP server. + * (AICore / Apple Foundation stay opportunistic out-of-process adapters on the + * TS side, not owned backends — they are NOT registered here.) + * + * A backend supplies: + * - LlmBackendSession — the per-generation streaming session, mirroring the + * FFI pull contract (prefill -> next* -> close) 1:1 so + * the FFI functions delegate without translation. + * - LlmBackendFactory — names the runtime, reports availability + bundle fit, + * and opens sessions. + * + * `llm_backend_select()` picks a backend at `_open` time from the platform, the + * bundle contents, the build flags, and the `ELIZA_LLM_BACKEND` env override. + * When it returns nullptr (and no error) the FFI keeps the in-tree llama.cpp + * path — so a build with no alternate backend behaves exactly as before. + */ + +#include "eliza-inference-ffi.h" /* eliza_llm_stream_config_t, EliInferenceContext fwd */ + +#include +#include + +/* Defined in the FFI translation unit. Opaque to backends — a backend reaches + * the resident model/bundle through the accessors below, not the struct. */ +struct EliInferenceContext; + +/* The bundle directory the context was opened against.
A backend's open() + * resolves its own artifact under this root (e.g. `/text/*.litertlm`, + * `/text/*.mlpackage`) — the ONLY supported way to read the bundle path, + * since the struct itself is opaque here. Returns nullptr when ctx is null. + * Defined in eliza-inference-ffi.cpp; the pointer is owned by the context and + * stays valid for the session's lifetime. */ +const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx); + +/* ---- Per-generation streaming session ------------------------------------ * + * + * Lifetime: created by LlmBackendFactory::open(), destroyed via `delete` on the + * FFI `_close` path. Every method mirrors the matching FFI entry point so the + * FFI can `return session->method(...)` with no argument translation. Status + * conventions are identical to the FFI: >= 0 on success, the negative `ELIZA_*` + * constants on failure, with `*out_error` heap-allocated for the caller to free. + */ +struct LlmBackendSession { + virtual ~LlmBackendSession() = default; + + /* Mirrors eliza_inference_llm_stream_prefill. Copies the tokens it needs. */ + virtual int prefill(const int32_t * token_ids, size_t num_tokens, + char ** out_error) = 0; + + /* Mirrors eliza_inference_llm_stream_next. Returns 0 (more output), 1 (final + * step — EOS / cap), or a negative ELIZA_* code (ELIZA_ERR_CANCELLED on + * cancel). `drafter_*_out` carry per-step speculative stats (0 when the + * backend has no drafter). */ + virtual int next(int32_t * tokens_out, size_t tokens_cap, + size_t * num_tokens_out, char * text_out, size_t text_cap, + int32_t * drafter_drafted_out, int32_t * drafter_accepted_out, + char ** out_error) = 0; + + /* Mirrors eliza_inference_llm_stream_cancel. Publishes a flag an in-flight + * next() checks at a step boundary; safe to call from another thread. + * Returns ELIZA_OK whether or not a pass was running. 
*/ + virtual int cancel() = 0; + + /* Mirrors eliza_inference_llm_stream_reset: clear KV + sampler/counters so + * the next prefill starts a fresh prompt on the same warm session. */ + virtual int reset() = 0; + + /* Mirrors eliza_inference_llm_stream_reset_keep: keep the first `n_keep` + * tokens of state resident and drop the rest. Returns the n_keep actually + * applied (>= 0, may be clamped / 0 on a full-reset fallback). A backend + * that cannot do prefix reuse MUST fall back to a full reset and return 0 — + * never an error. */ + virtual int reset_keep(int32_t n_keep) = 0; + + /* Slot KV persistence — optional. Default: not supported. */ + virtual int save_slot(const char * /*filename*/, char ** /*out_error*/) { + return ELIZA_ERR_INVALID_ARG; + } + virtual int restore_slot(const char * /*filename*/, char ** /*out_error*/) { + return ELIZA_ERR_INVALID_ARG; + } +}; + +/* ---- Backend factory (one per linked-in runtime) ------------------------- */ +struct LlmBackendFactory { + virtual ~LlmBackendFactory() = default; + + /* Stable lower-case id: "llama.cpp", "litert-lm", "mlx-coreml". Matched + * case-insensitively against ELIZA_LLM_BACKEND. */ + virtual const char * name() const = 0; + + /* True only when this backend is compiled in AND its runtime dependencies + * are present on THIS host (the NPU delegate / Metal device / the linked + * lib). A scaffold whose build gate is OFF returns false. Cheap — must not + * load a model. */ + virtual bool available() const = 0; + + /* True when this backend can serve the bundle at `bundle_dir` — i.e. the + * backend-specific artifact exists (e.g. `text/*.litertlm`, `text/*.mlpackage`). + * Cheap directory probe, no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank used to order candidates when several can serve the + * same bundle and no env override is set. Higher wins. 
The in-tree llama.cpp + * path is rank 0 (the implicit fallback); an accelerator backend that is the + * preferred path on this device returns a positive value. */ + virtual int preference_rank() const { return 0; } + + /* Create a streaming session for (ctx, cfg). Returns nullptr + `*out_error` + * on failure. The returned session is owned by the caller (FFI `_close` + * deletes it). */ + virtual LlmBackendSession * open(EliInferenceContext * ctx, + const eliza_llm_stream_config_t * cfg, + char ** out_error) = 0; +}; + +/* ---- Registry + selection ------------------------------------------------ * + * + * Backends register their singleton factory (idempotent; the registry does not + * take ownership — factories are static-lifetime singletons). The FFI + * translation unit calls llm_backend_register_builtins() once to register every + * compiled-in backend, then calls llm_backend_select() per `_open`. + */ + +/* Register a factory (idempotent by name). Safe to call from static init. */ +void llm_backend_register(LlmBackendFactory * factory); + +/* Register every backend compiled into THIS build (gated by the -DELIZA_ENABLE_* + * CMake options). Idempotent; llm_backend_select() also calls it. Defined in + * llm-backend-selector.cpp, which calls each gated factory accessor directly. */ +void llm_backend_register_builtins(); + +/* Pick a backend for the bundle at `bundle_dir` with `cfg`. Resolution order: + * + * 1. ELIZA_LLM_BACKEND env (exact, case-insensitive backend name) — a HARD + * select. "llama.cpp" / "llamacpp" / "llama" forces the in-tree path (returns + * nullptr, no error). Any other name that is not registered+available, or + * cannot serve the bundle, is a hard error: returns nullptr AND sets + * `*out_error` so the FFI aborts rather than silently using llama.cpp. + * + * 2. No env override: among registered backends that are available() AND + * can_serve(bundle_dir), pick the highest preference_rank() above 0.
If none + * qualifies, return nullptr (use the in-tree llama.cpp path). + * + * A nullptr return with `*out_error == nullptr` means "use the in-tree llama.cpp + * path" — NOT an error. A nullptr return with `*out_error != nullptr` is a hard + * failure the caller must propagate. + */ +LlmBackendFactory * llm_backend_select(const char * bundle_dir, + const eliza_llm_stream_config_t * cfg, + char ** out_error); diff --git a/tools/omnivoice/src/tts-backend-selector.cpp b/tools/omnivoice/src/tts-backend-selector.cpp new file mode 100644 index 000000000..ad2d28447 --- /dev/null +++ b/tools/omnivoice/src/tts-backend-selector.cpp @@ -0,0 +1,34 @@ +/* + * tts-backend-selector.cpp — registry + selection for the per-op TTS backend + * seam. A thin instantiation of eliza_backend::Registry + * (backend-registry.h) — the resolution logic is shared with every other + * modality. Inert by default: no -DELIZA_ENABLE_* TTS backend is compiled in + * (none exists yet), so nothing registers and tts_backend_select() returns + * nullptr, so eliza_inference_tts_synthesize keeps the in-tree OmniVoice path. + */ + +#include "tts-backend.h" +#include "backend-registry.h" + +#include + +namespace { +eliza_backend::Registry g_registry; +std::once_flag g_builtins_once; +} // namespace + +void tts_backend_register(TtsBackendFactory * factory) { + g_registry.register_factory(factory); +} + +void tts_backend_register_builtins() { + std::call_once(g_builtins_once, []() { + /* No TTS backend exists yet — the seam stays inert. 
*/ + }); +} + +TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error) { + tts_backend_register_builtins(); + return g_registry.select("ELIZA_TTS_BACKEND", "ELIZA_BACKEND", "tts", + bundle_dir, out_error); +} diff --git a/tools/omnivoice/src/tts-backend.h b/tools/omnivoice/src/tts-backend.h new file mode 100644 index 000000000..127ce2a33 --- /dev/null +++ b/tools/omnivoice/src/tts-backend.h @@ -0,0 +1,61 @@ +#pragma once +/* + * tts-backend.h — per-op backend seam for text-to-speech synthesis. + * + * A one-shot op (eliza_inference_tts_synthesize) that an accelerator backend can + * serve when it ships a TTS artifact under `/tts/`, while every other + * op — and TTS itself when no artifact is present — stays on the in-tree + * OmniVoice/ggml path. + * + * The factory mirrors the FFI 1:1 and the FFI delegates without translation. + * Selection reuses the shared eliza_backend::Registry (backend-registry.h): + * ELIZA_TTS_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the + * highest preference_rank among available()+can_serve() factories, else nullptr + * (the in-tree OmniVoice path). + */ + +#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */ + +#include + +struct EliInferenceContext; + +/* One factory per linked-in TTS runtime (e.g. LiteRT). */ +struct TtsBackendFactory { + virtual ~TtsBackendFactory() = default; + + /* Stable lower-case id, e.g. "litert". Matched case-insensitively against + * ELIZA_TTS_BACKEND / ELIZA_BACKEND. */ + virtual const char * name() const = 0; + + /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate). + * Cheap — must not load a model. */ + virtual bool available() const = 0; + + /* The TTS artifact exists under `/tts/`. Cheap directory probe, + * no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0). 
+ * An NPU-served TTS returns a high positive value; a GPU-delegate fallback a + * lower positive value. */ + virtual int preference_rank() const { return 0; } + + /* Mirrors eliza_inference_tts_synthesize 1:1. Returns the number of fp32 PCM + * samples actually written (>= 0) on success, or a negative ELIZA_* code with + * `*out_error` heap-allocated for the caller to free. */ + virtual int tts_synthesize(EliInferenceContext * ctx, const char * text, size_t text_len, + const char * speaker_preset_id, float * out_pcm, + size_t max_samples, char ** out_error) = 0; +}; + +/* Register a factory (idempotent by name). */ +void tts_backend_register(TtsBackendFactory * factory); + +/* Register every TTS backend compiled into THIS build (gated by the + * -DELIZA_ENABLE_* options). Idempotent; called by tts_backend_select. */ +void tts_backend_register_builtins(); + +/* Pick a TTS backend for the bundle at `bundle_dir`. nullptr + no error + * => use the in-tree OmniVoice path. nullptr + *out_error => hard failure. */ +TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error); diff --git a/tools/omnivoice/src/vision-backend-selector.cpp b/tools/omnivoice/src/vision-backend-selector.cpp new file mode 100644 index 000000000..095450cab --- /dev/null +++ b/tools/omnivoice/src/vision-backend-selector.cpp @@ -0,0 +1,34 @@ +/* + * vision-backend-selector.cpp — registry + selection for the per-op vision + * backend seam. A thin instantiation of eliza_backend::Registry + * (backend-registry.h) — the resolution logic is shared with every other + * modality. Inert by default: no -DELIZA_ENABLE_* vision backend is compiled in + * (none exists yet), so nothing registers and vision_backend_select() returns + * nullptr, so eliza_inference_describe_image keeps the in-tree ggml mmproj path. 
+ */ + +#include "vision-backend.h" +#include "backend-registry.h" + +#include + +namespace { +eliza_backend::Registry g_registry; +std::once_flag g_builtins_once; +} // namespace + +void vision_backend_register(VisionBackendFactory * factory) { + g_registry.register_factory(factory); +} + +void vision_backend_register_builtins() { + std::call_once(g_builtins_once, []() { + /* No vision backend exists yet — the seam stays inert. */ + }); +} + +VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error) { + vision_backend_register_builtins(); + return g_registry.select("ELIZA_VISION_BACKEND", "ELIZA_BACKEND", "vision", + bundle_dir, out_error); +} diff --git a/tools/omnivoice/src/vision-backend.h b/tools/omnivoice/src/vision-backend.h new file mode 100644 index 000000000..51da0632a --- /dev/null +++ b/tools/omnivoice/src/vision-backend.h @@ -0,0 +1,61 @@ +#pragma once +/* + * vision-backend.h — per-op backend seam for mmproj image description. + * + * A one-shot op (eliza_inference_describe_image) that an accelerator backend can + * serve when it ships a vision artifact under `/vision/`, while every + * other op — and vision itself when no artifact is present — stays on the + * in-tree ggml mmproj path. + * + * The factory mirrors the FFI 1:1 and the FFI delegates without translation. + * Selection reuses the shared eliza_backend::Registry (backend-registry.h): + * ELIZA_VISION_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else + * the highest preference_rank among available()+can_serve() factories, else + * nullptr (the ggml mmproj path). + */ + +#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */ + +#include + +struct EliInferenceContext; + +/* One factory per linked-in vision runtime (e.g. LiteRT). */ +struct VisionBackendFactory { + virtual ~VisionBackendFactory() = default; + + /* Stable lower-case id, e.g. "litert". Matched case-insensitively against + * ELIZA_VISION_BACKEND / ELIZA_BACKEND. 
*/ + virtual const char * name() const = 0; + + /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate). + * Cheap — must not load a model. */ + virtual bool available() const = 0; + + /* The vision artifact exists under `/vision/`. Cheap directory + * probe, no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0). + * An NPU-served vision returns a high positive value; a GPU-delegate + * fallback a lower positive value. */ + virtual int preference_rank() const { return 0; } + + /* Mirrors eliza_inference_describe_image 1:1. Returns the number of bytes + * written (excluding the terminator) on success, or a negative ELIZA_* code + * with `*out_error` heap-allocated for the caller to free. */ + virtual int describe_image(EliInferenceContext * ctx, const unsigned char * image_bytes, + size_t n_bytes, const char * mmproj_path, const char * prompt, + char * out_text, size_t max_text_bytes, char ** out_error) = 0; +}; + +/* Register a factory (idempotent by name). */ +void vision_backend_register(VisionBackendFactory * factory); + +/* Register every vision backend compiled into THIS build (gated by the + * -DELIZA_ENABLE_* options). Idempotent; called by vision_backend_select. */ +void vision_backend_register_builtins(); + +/* Pick a vision backend for the bundle at `bundle_dir`. nullptr + no error + * => use the in-tree ggml mmproj path. nullptr + *out_error => hard failure. */ +VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error);