diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
index 6cb3e13a6..038fa5be4 100644
--- a/tools/omnivoice/CMakeLists.txt
+++ b/tools/omnivoice/CMakeLists.txt
@@ -83,6 +83,14 @@ set(OMNIVOICE_FFI_SOURCES
# backend below registers itself, so the default build keeps the in-tree
# llama.cpp path.
src/llm-backend-selector.cpp
+ # Per-op backend seams (cutover M3+). Each modality's selector reuses the
+ # shared eliza_backend::Registry (backend-registry.h) and is inert until a
+ # gated backend registers — so the default build keeps the ggml path per-op.
+ src/embed-backend-selector.cpp
+ src/vision-backend-selector.cpp
+ src/asr-backend-selector.cpp
+ src/tts-backend-selector.cpp
+ src/eot-backend-selector.cpp
)
# Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -231,7 +239,12 @@ option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
# pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK
# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
# default. See docs/multi-backend-ffi-seam.md.
-option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
+option(ELIZA_ENABLE_LITERT "Build the LiteRT C-API per-op backends, e.g. embed (M4)" OFF)
+
+# ELIZA_ENABLE_LITERT_LM — the streaming-LLM backend on the heavier LiteRT-LM
+# Engine SDK (litert::lm), separate from the LiteRT C runtime above. OFF until
+# that SDK is built; point -DELIZA_LITERT_LM_SDK_DIR / -DELIZA_LITERT_LM_LIBS at it.
+option(ELIZA_ENABLE_LITERT_LM "Build the LiteRT-LM in-process streaming-LLM backend" OFF)
# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
@@ -297,12 +310,13 @@ if(TARGET mtmd)
# out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
# default desktop/CI build is byte-for-byte the pre-seam behavior.
if(ELIZA_ENABLE_LITERT)
+ # LiteRT C-API per-op backends (embed today; vision/etc. as artifacts
+ # ship). SDK = the LiteRT C runtime (github.com/google-ai-edge/LiteRT,
+ # libLiteRt.so + the GPU/NPU delegate). Point at a built SDK with
+        # -DELIZA_LITERT_SDK_DIR=<path> and link with -DELIZA_LITERT_LIBS=LiteRt.
target_sources(elizainference PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-embed-backend.cpp)
target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
- # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
- # SDK with -DELIZA_LITERT_SDK_DIR=; the device/host cross-build
- # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=.
if(ELIZA_LITERT_SDK_DIR)
target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
@@ -311,6 +325,22 @@ if(TARGET mtmd)
target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
endif()
endif()
+    if(ELIZA_ENABLE_LITERT_LM)
+        # The streaming-LLM backend needs the heavier LiteRT-LM Engine SDK
+        # (litert::lm, github.com/google-ai-edge/LiteRT-LM), separate from the
+        # LiteRT C runtime. Point at it with -DELIZA_LITERT_LM_SDK_DIR (a single
+        # path, quoted below) and -DELIZA_LITERT_LM_LIBS (a ;-list, left unquoted).
+        target_sources(elizainference PRIVATE
+            "${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp")
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT_LM)
+        if(ELIZA_LITERT_LM_SDK_DIR)
+            target_include_directories(elizainference PRIVATE "${ELIZA_LITERT_LM_SDK_DIR}/include")
+            target_link_directories(elizainference PRIVATE "${ELIZA_LITERT_LM_SDK_DIR}/lib")
+        endif()
+        if(ELIZA_LITERT_LM_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LM_LIBS})
+        endif()
+    endif()
if(ELIZA_ENABLE_MLX)
if(NOT APPLE)
message(FATAL_ERROR
diff --git a/tools/omnivoice/src/SESSION-OPS-TODO.md b/tools/omnivoice/src/SESSION-OPS-TODO.md
new file mode 100644
index 000000000..7095b8952
--- /dev/null
+++ b/tools/omnivoice/src/SESSION-OPS-TODO.md
@@ -0,0 +1,159 @@
+# Session-op backend seam — design (NOT implemented)
+
+The per-op backend seam (`backend-registry.h` + `<modality>-backend.h` +
+`<modality>-backend-selector.cpp` + a chokepoint at the top of the FFI fn) is
+now in place for the **one-shot** ops:
+
+| modality | FFI fn                           | header / selector  | env key                | artifact dir          |
+|----------|----------------------------------|--------------------|------------------------|-----------------------|
+| embed    | `eliza_inference_embed`          | `embed-backend.*`  | `ELIZA_EMBED_BACKEND`  | `<bundle>/embedding/` |
+| vision   | `eliza_inference_describe_image` | `vision-backend.*` | `ELIZA_VISION_BACKEND` | `<bundle>/vision/`    |
+| asr      | `eliza_inference_asr_transcribe` | `asr-backend.*`    | `ELIZA_ASR_BACKEND`    | `<bundle>/asr/`       |
+| tts      | `eliza_inference_tts_synthesize` | `tts-backend.*`    | `ELIZA_TTS_BACKEND`    | `<bundle>/tts/`       |
+| eot      | `eliza_inference_llm_eot_score`  | `eot-backend.*`    | `ELIZA_EOT_BACKEND`    | `<bundle>/eot/`       |
+
+A one-shot op is stateless across calls: select → (delegate | fall through to
+ggml) on every call. There is nothing to keep alive between calls, so the seam
+is a single chokepoint at the top of the fn.
+
+The **session** ops are different: `vad`, `wakeword`, `speaker`, `diariz` each
+`_open` a native handle (`EliVad *`, `EliWakeword *`, `EliSpeaker *`,
+`EliDiariz *`) that persists across many `_segment`/`_detect`/`_embed` calls and
+is torn down with `_close`/`_reset`. The seam has to follow that lifecycle, not
+re-select per call. This file records HOW to extend the seam to them. **None of
+the below is implemented yet.**
+
+## The shape of a session op (today, in-tree only)
+
+Each session modality exposes, e.g. for VAD:
+
+```c
+EliVad * eliza_inference_vad_open(EliInferenceContext * ctx, /* params */, char ** out_error);
+int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error);
+int eliza_inference_vad_reset(EliVad * vad, char ** out_error);
+void eliza_inference_vad_close(EliVad * vad);
+```
+
+`EliVad` (and the wakeword/speaker/diariz equivalents) is the in-tree handle
+struct defined in `eliza-inference-ffi.cpp`. Its in-tree fields stay exactly as
+they are; the seam is **additive** — one extra pointer.
+
+## Extending the seam to a session op
+
+For each session modality `<modality>` (vad | wakeword | speaker | diariz):
+
+### 1. A session factory interface — `<modality>-backend.h`
+
+Mirror the one-shot factory's four common probes, but the forward methods mirror
+the **session** ABI 1:1 instead of a single one-shot fn. The factory does NOT
+own the handle struct; it produces and operates on an opaque backend-session:
+
+```cpp
+struct VadBackendFactory {
+ virtual ~VadBackendFactory() = default;
+ virtual const char * name() const = 0;
+ virtual bool available() const = 0;
+    virtual bool         can_serve(const char * bundle_dir) const = 0;  // probes <bundle>/vad/
+ virtual int preference_rank() const { return 0; }
+
+ // Lifecycle, mirroring the FFI session ABI 1:1. The factory returns an
+ // opaque backend-session pointer it owns; the FFI stashes it on the Eli*
+ // handle. A NULL return + *out_error is a hard open failure.
+ virtual void * open(EliInferenceContext * ctx, /* same params as eliza_inference_vad_open */,
+ char ** out_error) = 0;
+ virtual int segment(void * session, const float * pcm, size_t n, /* out */, char ** out_error) = 0;
+ virtual int reset(void * session, char ** out_error) = 0;
+ virtual void close(void * session) = 0;
+};
+```
+
+Plus the same free functions as the one-shot seam:
+`vad_backend_register`, `vad_backend_register_builtins` (EMPTY for now — no
+LiteRT session backend exists), `vad_backend_select(bundle_dir, out_error)`,
+backed by an `eliza_backend::Registry<VadBackendFactory>` in
+`<modality>-backend-selector.cpp` with env keys `ELIZA_VAD_BACKEND` → `ELIZA_BACKEND`
+and modality `"vad"`. Artifact probe dir `<bundle>/vad/` (resp. `wakeword/`,
+`speaker/`, `diariz/`).
+
+### 2. A backend-session pointer on the Eli* handle
+
+The selection happens ONCE, at `_open`, not per call. Add one field to the
+in-tree handle struct:
+
+```cpp
+struct EliVad {
+ /* ... existing in-tree fields, unchanged ... */
+
+ /* Backend seam (additive). When non-null, this handle is served by an
+ * accelerator backend and every op delegates to it; the in-tree fields
+ * above are then unused. When null, the in-tree ggml path owns the handle. */
+ VadBackendFactory * be = nullptr; // the factory that opened be_session
+ void * be_session = nullptr; // factory-owned backend session
+};
+```
+
+### 3. Select at `_open`
+
+In `eliza_inference_vad_open`, after the existing arg validation and before the
+in-tree handle is built:
+
+```cpp
+char * be_error = nullptr;
+VadBackendFactory * be = vad_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+if (be_error) { eliza_set_error(out_error, std::string(be_error)); std::free(be_error);
+ return /* NULL handle */; }
+if (be) {
+ void * sess = be->open(ctx, /* params */, out_error);
+ if (!sess) return /* NULL handle — open failed, out_error already set */;
+ EliVad * h = new EliVad();
+ h->be = be;
+ h->be_session = sess;
+ return h;
+}
+/* else: fall through and build the in-tree handle exactly as today. */
+```
+
+### 4. A guard at the TOP of each `_segment` / `_reset` / `_close`
+
+Each per-call op checks the backend pointer and delegates before touching any
+in-tree state:
+
+```cpp
+int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error) {
+ if (!vad) { /* invalid-arg as today */ }
+ if (vad->be) { // <-- guard
+ return vad->be->segment(vad->be_session, pcm, n, /* out */, out_error);
+ }
+ /* ... existing in-tree ggml segment body, unchanged ... */
+}
+
+void eliza_inference_vad_close(EliVad * vad) {
+ if (!vad) return;
+ if (vad->be) { vad->be->close(vad->be_session); delete vad; return; } // <-- guard
+ /* ... existing in-tree teardown, then delete vad ... */
+}
+```
+
+`_reset` follows the same guard pattern.
+
+## Why this shape (vs. re-selecting per call)
+
+- **Selection is per-session, not per-call.** A session's backend is fixed at
+ `_open`; you cannot have `_segment` cross from the ggml path to LiteRT mid
+ session because the KV/feature state lives in the (in-tree OR backend)
+ session, not on the FFI boundary. The one pointer captures that binding.
+- **Hard-fail localizes to `_open`.** A bundle-invalid override surfaces once,
+ where the caller is already prepared to handle a NULL handle, instead of on
+ every `_segment`.
+- **Additive + inert.** With no session backend registered (the case today),
+ `_open`'s `select()` returns nullptr, `be`/`be_session` stay null, and every
+ guard is a no-op — the in-tree path is byte-for-byte unchanged. Same inert-by
+ -default contract as the one-shot seam.
+
+## Status
+
+- One-shot seam: embed (with a LiteRT builtin), vision/asr/tts/eot (inert,
+ no builtin) — **done**.
+- Session seam (vad/wakeword/speaker/diariz): **not implemented.** No
+  `<modality>-backend.{h,cpp}`, no handle field, no `_open` select, no per-call
+  guards exist yet. This file is the spec for when a session backend lands.
diff --git a/tools/omnivoice/src/asr-backend-selector.cpp b/tools/omnivoice/src/asr-backend-selector.cpp
new file mode 100644
index 000000000..7513e7d9d
--- /dev/null
+++ b/tools/omnivoice/src/asr-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * asr-backend-selector.cpp — registry + selection for the per-op ASR backend
+ * seam. A thin instantiation of eliza_backend::Registry<AsrBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* ASR backend is compiled in
+ * (none exists yet), so nothing registers and asr_backend_select() returns
+ * nullptr, so eliza_inference_asr_transcribe keeps the in-tree ggml path.
+ */
+
+#include "asr-backend.h"
+#include "backend-registry.h"
+
+#include <mutex> /* std::once_flag, std::call_once */
+
+namespace {
+eliza_backend::Registry<AsrBackendFactory> g_registry; /* file-local ASR factory registry */
+std::once_flag g_builtins_once;                        /* guards the one-time builtin registration */
+} // namespace
+
+void asr_backend_register(AsrBackendFactory * factory) {
+    g_registry.register_factory(factory); /* null and duplicate names are ignored by the registry */
+}
+
+void asr_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* No ASR backend exists yet — the seam stays inert. */
+    });
+}
+
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error) {
+    asr_backend_register_builtins(); /* idempotent; makes sure builtins are registered first */
+    return g_registry.select("ELIZA_ASR_BACKEND", "ELIZA_BACKEND", "asr",
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/asr-backend.h b/tools/omnivoice/src/asr-backend.h
new file mode 100644
index 000000000..2dd9fec49
--- /dev/null
+++ b/tools/omnivoice/src/asr-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * asr-backend.h — per-op backend seam for speech-to-text transcription.
+ *
+ * A one-shot op (eliza_inference_asr_transcribe) that an accelerator backend can
+ * serve when it ships an ASR artifact under `<bundle>/asr/`, while every other
+ * op — and ASR itself when no artifact is present — stays on the in-tree ggml
+ * path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_ASR_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml ASR path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef> /* size_t */
+
+struct EliInferenceContext;
+
+/* One factory per linked-in ASR runtime (e.g. LiteRT). */
+struct AsrBackendFactory {
+    virtual ~AsrBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_ASR_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The ASR artifact exists under `<bundle>/asr/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served ASR returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_asr_transcribe 1:1. Returns the number of bytes
+     * written (excluding the terminator) on success, or a negative ELIZA_* code
+     * with `*out_error` heap-allocated for the caller to free. */
+    virtual int asr_transcribe(EliInferenceContext * ctx, const float * pcm, size_t n_samples,
+                               int sample_rate_hz, char * out_text, size_t max_text_bytes,
+                               char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void asr_backend_register(AsrBackendFactory * factory);
+
+/* Register every ASR backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by asr_backend_select. */
+void asr_backend_register_builtins();
+
+/* Pick an ASR backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml ASR path. nullptr + *out_error => hard failure. */
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/backend-registry.h b/tools/omnivoice/src/backend-registry.h
new file mode 100644
index 000000000..14a40b3fd
--- /dev/null
+++ b/tools/omnivoice/src/backend-registry.h
@@ -0,0 +1,147 @@
+#pragma once
+/*
+ * backend-registry.h — generic per-modality backend registry + selection.
+ *
+ * Factored out of the M3 streaming-LLM seam (llm-backend-selector.cpp) so EVERY
+ * on-device modality (embed, asr, tts, vision, vad, wakeword, speaker, diarizer,
+ * eot, …) reuses ONE resolution implementation instead of copy-pasting it. A
+ * modality declares a small factory interface with the four common probes
+ * (name / available / can_serve / preference_rank) plus its own forward method,
+ * instantiates `eliza_backend::Registry<Factory>`, and selects with the
+ * shared logic below:
+ *
+ *   1. `ELIZA_<OP>_BACKEND` env (per-op) → else `ELIZA_BACKEND` (global) — a
+ *      HARD select. An in-tree name (e.g. "llama.cpp"/"ggml"/"default"; see
+ *      is_intree_name) forces the ggml path (returns nullptr, no error). Any
+ *      other name that is not registered+available or cannot serve the bundle
+ *      is a hard error (nullptr + *out_error).
+ * 2. No override: among registered factories that are available() AND
+ * can_serve(bundle_dir), pick the highest preference_rank(). None → nullptr.
+ *
+ * A nullptr return with *out_error == nullptr means "use the in-tree ggml path"
+ * — NOT an error. Inert by default: with no -DELIZA_ENABLE_* backend compiled,
+ * nothing registers and select() always returns nullptr, so every op keeps the
+ * in-tree path byte-for-byte.
+ *
+ * Factory type F must expose:
+ * const char * name() const; // stable lower-case id
+ * bool available() const; // compiled-in AND host deps present; cheap
+ * bool can_serve(const char * bundle_dir) const; // artifact probe; cheap
+ * int preference_rank() const; // higher wins; ggml == 0
+ */
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace eliza_backend {
+
+/* malloc-allocate an error string so the caller frees it with
+ * eliza_inference_free_string() (free()), matching the FFI contract. */
+inline char * dup_error(const std::string & msg) {
+ char * out = (char *) std::malloc(msg.size() + 1);
+ if (out) std::memcpy(out, msg.c_str(), msg.size() + 1);
+ return out;
+}
+
+inline bool iequals(const char * a, const char * b) {
+ if (!a || !b) return false;
+ while (*a && *b) {
+ if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) {
+ return false;
+ }
+ ++a;
+ ++b;
+ }
+ return *a == *b;
+}
+
+/* Names that mean "stay on the in-tree ggml/llama.cpp path". */
+inline bool is_intree_name(const char * s) {
+ return iequals(s, "llama.cpp") || iequals(s, "llamacpp") || iequals(s, "llama") ||
+ iequals(s, "ggml") || iequals(s, "intree") || iequals(s, "default");
+}
+
+template <typename Factory>
+class Registry {
+public:
+    /* Idempotent by name (case-insensitive); a null factory is ignored. Does
+     * not take ownership — factories are static-lifetime singletons. */
+    void register_factory(Factory * factory) {
+        if (!factory) return;
+        std::lock_guard<std::mutex> lock(mu_);
+        for (Factory * f : factories_) {
+            if (iequals(f->name(), factory->name())) return;
+        }
+        factories_.push_back(factory);
+    }
+
+    /* env_key: the per-op override (e.g. "ELIZA_EMBED_BACKEND"); global_key: the
+     * cross-op default (e.g. "ELIZA_BACKEND"); modality: for error text. */
+    Factory * select(const char * env_key, const char * global_key,
+                     const char * modality, const char * bundle_dir,
+                     char ** out_error) {
+        const char * forced = env_key ? std::getenv(env_key) : nullptr;
+        if (!forced || forced[0] == '\0') {
+            forced = global_key ? std::getenv(global_key) : nullptr;
+        }
+        if (forced && forced[0] != '\0') {
+            if (is_intree_name(forced)) {
+                return nullptr; /* force in-tree, not an error */
+            }
+            std::lock_guard<std::mutex> lock(mu_);
+            for (Factory * f : factories_) {
+                if (!iequals(f->name(), forced)) continue;
+                if (!f->available()) {
+                    set_err(out_error, modality, forced,
+                            "is not available in this build/host");
+                    return nullptr;
+                }
+                if (!f->can_serve(bundle_dir)) {
+                    set_err(out_error, modality, forced,
+                            std::string("cannot serve the bundle at ") +
+                                (bundle_dir ? bundle_dir : "(null)"));
+                    return nullptr;
+                }
+                return f;
+            }
+            set_err(out_error, modality, forced, "is not a registered backend");
+            return nullptr;
+        }
+
+        /* Auto-select: highest preference_rank among available + can_serve. The
+         * in-tree ggml path is the implicit rank-0 fallback, so an accelerator
+         * backend only wins with a positive rank that can serve this bundle. */
+        std::lock_guard<std::mutex> lock(mu_);
+        Factory * best = nullptr;
+        int best_rank = 0;
+        for (Factory * f : factories_) {
+            if (!f->available()) continue;
+            if (!f->can_serve(bundle_dir)) continue;
+            const int rank = f->preference_rank();
+            if (rank > best_rank) {
+                best_rank = rank;
+                best = f;
+            }
+        }
+        return best; /* nullptr => in-tree ggml path */
+    }
+
+private:
+    static void set_err(char ** out_error, const char * modality,
+                        const char * name, const std::string & why) {
+        if (out_error) {
+            *out_error = dup_error(std::string("[libelizainference] ") +
+                                   (modality ? modality : "backend") +
+                                   " backend override '" + name + "' " + why);
+        }
+    }
+
+    std::mutex mu_;                    /* guards factories_ */
+    std::vector<Factory *> factories_; /* non-owning, in registration order */
+};
+
+} // namespace eliza_backend
diff --git a/tools/omnivoice/src/backends/litert-embed-backend.cpp b/tools/omnivoice/src/backends/litert-embed-backend.cpp
new file mode 100644
index 000000000..18bf11415
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-embed-backend.cpp
@@ -0,0 +1,252 @@
+/*
+ * litert-embed-backend.cpp — LiteRT (Google AI Edge) text-embedding backend.
+ *
+ * Serves eliza_inference_embed from a `<bundle>/embedding/*.tflite` (or
+ * `.litertlm`) artifact via the LiteRT Next C runtime on the best available
+ * accelerator: NPU (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor on
+ * capable silicon) -> GPU (OpenCL/Mali via libLiteRtClGlAccelerator.so) -> CPU.
+ * The accelerator ladder + preference_rank let the SAME build auto-promote to
+ * NPU on a Pixel-10/G5 or Qualcomm/MediaTek device and fall back to the GPU
+ * delegate on a Tensor-G4 (Pixel 9a) with NO code change.
+ *
+ * Uses the LiteRT *C* API (litert/c/...) — the C++ cc/ wrappers are not
+ * standalone (they pull Abseil/TFLite/flatbuffers). Compiles only under
+ * -DELIZA_ENABLE_LITERT with the SDK on the include/link path
+ * (-DELIZA_LITERT_SDK_DIR=<path> -DELIZA_LITERT_LIBS=LiteRt). Without the gate the
+ * file is not compiled (CMake target_sources is inside if(ELIZA_ENABLE_LITERT));
+ * the stub at the bottom keeps the factory accessor resolvable defensively.
+ *
+ * Model I/O (the converted all-MiniLM-L6-v2 .tflite, see
+ * litert-models/embedding/MANIFEST.md): 2 int32 inputs [1,128] bound BY INDEX
+ * (0=input_ids, 1=attention_mask), 1 float32 output [1,384] that is already
+ * masked-mean-pooled + L2-normalized in-graph (read 384 floats directly).
+ */
+
+#include "../embed-backend.h"
+#include "../llm-backend.h" /* llm_backend_context_bundle_dir */
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#if defined(__has_include)
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#    define ELIZA_HAS_FILESYSTEM 1
+#  endif
+#endif
+
+namespace {
+
+/* Probe `<bundle>/embedding/` for a LiteRT artifact (.litertlm preferred, then
+ * .tflite). Cheap and non-throwing — no model load. Returns its path or "". */
+std::string find_embed_artifact(const char * bundle_dir) {
+    if (!bundle_dir || !bundle_dir[0]) return "";
+#ifdef ELIZA_HAS_FILESYSTEM
+    namespace fs = std::filesystem;
+    std::error_code ec;
+    const fs::path dir = fs::path(bundle_dir) / "embedding";
+    if (!fs::is_directory(dir, ec)) return "";
+    std::string tflite;
+    for (fs::directory_iterator it(dir, ec), end; !ec && it != end; it.increment(ec)) { /* non-throwing walk */
+        const fs::directory_entry & e = *it;
+        if (!e.is_regular_file(ec)) continue; /* an unreadable entry is skipped, not fatal to the scan */
+        const std::string ext = e.path().extension().string();
+        if (ext == ".litertlm") return e.path().string();
+        if (ext == ".tflite" && tflite.empty()) tflite = e.path().string();
+    }
+    return tflite;
+#else
+    return "";
+#endif
+}
+
+/* malloc-allocate a "[libelizainference] "-prefixed copy of msg; NULL on OOM. */
+char * dup_error(const std::string & msg) {
+    const std::string full = "[libelizainference] " + msg;
+    char * out = (char *) std::malloc(full.size() + 1);
+    if (out) std::memcpy(out, full.c_str(), full.size() + 1);
+    return out;
+}
+
+} // namespace
+
+#ifdef ELIZA_ENABLE_LITERT
+
+#include "litert/c/litert_common.h"
+#include "litert/c/litert_compiled_model.h"
+#include "litert/c/litert_environment.h"
+#include "litert/c/litert_model.h"
+#include "litert/c/litert_options.h"
+#include "litert/c/litert_tensor_buffer.h"
+
+#include <cstdint>
+#include <mutex>
+#include <vector>
+
+namespace {
+
+class LiteRtEmbedFactory final : public EmbedBackendFactory {
+public:
+    const char * name() const override { return "litert"; }
+
+    /* Compiled in AND a non-CPU accelerator is reachable (a CPU-only LiteRT is
+     * not a win over the in-tree ggml encoder). Settings-only probe — no model
+     * load. The ladder resolves to GPU on a Tensor-G4 (9a) and NPU on capable
+     * silicon. */
+    bool available() const override { return probe_accel() != kLiteRtHwAcceleratorNone; }
+
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_embed_artifact(bundle_dir).empty();
+    }
+
+    int preference_rank() const override {
+        const int a = probe_accel();
+        if (a & kLiteRtHwAcceleratorNpu) return 100; /* the real NPU win */
+        if (a & kLiteRtHwAcceleratorGpu) return 20;  /* GPU delegate (Mali on a 9a) */
+        return 0;                                    /* never beats ggml */
+    }
+
+    int embed(EliInferenceContext * ctx, const char * text, size_t text_len,
+              int pooling, float * out_embedding, size_t out_capacity,
+              int * out_dim, char ** out_error) override {
+        const char * bundle = llm_backend_context_bundle_dir(ctx);
+        const std::string artifact = find_embed_artifact(bundle);
+        if (artifact.empty()) {
+            if (out_error) *out_error = dup_error("litert embed: no artifact under <bundle>/embedding/");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        std::lock_guard<std::mutex> lock(mu_);
+        if (int rc = ensure_loaded(artifact, out_error); rc != ELIZA_OK) return rc;
+
+        /* Tokenize -> 2 int32 input tensors [1,128] (0=input_ids,1=attention_mask).
+         * The WordPiece tokenizer + the fixed-128 padding come from the model
+         * MANIFEST (litert-models/embedding). The LiteRT C run path below
+         * (managed buffers -> run -> read the in-graph-pooled [1,384] output) is
+         * wired; binding the tokenizer is the one model-specific step. */
+        std::vector<int32_t> ids, mask;
+        if (int rc = tokenize(text, text_len, ids, mask, out_error); rc != ELIZA_OK) return rc;
+
+        std::vector<float> out_vec;
+        int dim = 0;
+        if (int rc = run(ids, mask, out_vec, dim, out_error); rc != ELIZA_OK) return rc;
+
+        if (dim <= 0 || (size_t) dim > out_capacity) {
+            if (out_error) *out_error = dup_error("litert embed: output dim exceeds capacity");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        (void) pooling; /* pooling + L2-norm are baked into the exported graph */
+        std::memcpy(out_embedding, out_vec.data(), (size_t) dim * sizeof(float));
+        *out_dim = dim;
+        return ELIZA_OK;
+    }
+
+private:
+    static int probe_accel() {
+        LiteRtEnvironment env = nullptr;
+        if (LiteRtCreateEnvironment(0, nullptr, &env) != kLiteRtStatusOk) {
+            return kLiteRtHwAcceleratorNone;
+        }
+        LiteRtDestroyEnvironment(env);
+        /* TODO(DEVICE-VERIFY): query the env for a registered NPU dispatch and
+         * return kLiteRtHwAcceleratorNpu when present. On a Tensor-G4 there is no
+         * app-usable NPU path, so this resolves to GPU. */
+        return kLiteRtHwAcceleratorGpu;
+    }
+
+    int ensure_loaded(const std::string & artifact, char ** out_error) {
+        if (artifact == loaded_path_ && compiled_) return ELIZA_OK;
+        reset();
+        if (LiteRtCreateEnvironment(0, nullptr, &env_) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: environment create failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        if (LiteRtCreateModelFromFile(artifact.c_str(), &model_) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: model load failed: " + artifact);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        LiteRtOptions opts = nullptr;
+        if (LiteRtCreateOptions(&opts) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: options create failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        LiteRtSetOptionsHardwareAccelerators(
+            opts, (LiteRtHwAcceleratorSet)(kLiteRtHwAcceleratorGpu | kLiteRtHwAcceleratorNpu));
+        const LiteRtStatus st = LiteRtCreateCompiledModel(env_, model_, opts, &compiled_);
+        LiteRtDestroyOptions(opts);
+        if (st != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: compile failed (accelerator unavailable?)");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        loaded_path_ = artifact;
+        return ELIZA_OK;
+    }
+
+    int tokenize(const char * /*text*/, size_t /*len*/, std::vector<int32_t> & /*ids*/,
+                 std::vector<int32_t> & /*mask*/, char ** out_error) {
+        /* TODO(MANIFEST): wire the WordPiece tokenizer (vocab.txt under
+         * <bundle>/embedding/): lower-case, [CLS] + greedy-longest-match subwords
+         * + [SEP], pad/truncate to exactly 128, attention_mask=1 for real tokens.
+         * Until wired this is a hard, observable failure — eliza_inference_embed
+         * does NOT fall back, so a misconfigured artifact surfaces loudly. */
+        if (out_error) *out_error = dup_error(
+            "litert embed: WordPiece tokenizer not wired — stage vocab.txt + bind "
+            "per litert-models/embedding/MANIFEST.md");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int run(const std::vector<int32_t> & ids, const std::vector<int32_t> & mask,
+            std::vector<float> & out_vec, int & dim, char ** out_error) {
+        /* TODO(MANIFEST): create 2 managed int32 input TensorBuffers [1,128]
+         * (LiteRtGetCompiledModelInputBufferRequirements ->
+         * LiteRtCreateManagedTensorBufferFromRequirements), Lock+write ids/mask,
+         * create the output buffer, LiteRtRunCompiledModel(compiled_, 0, in, out),
+         * Lock+read the [1,384] float output into out_vec (dim=384). Pooling +
+         * L2-norm are in-graph. */
+        (void) ids; (void) mask; (void) out_vec; (void) dim;
+        if (out_error) *out_error = dup_error("litert embed: tensor run pending MANIFEST tokenizer");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    void reset() {
+        if (compiled_) { LiteRtDestroyCompiledModel(compiled_); compiled_ = nullptr; }
+        if (model_)    { LiteRtDestroyModel(model_);            model_    = nullptr; }
+        if (env_)      { LiteRtDestroyEnvironment(env_);        env_      = nullptr; }
+        loaded_path_.clear();
+    }
+
+    std::mutex          mu_;                 /* serializes embed(): load + tokenize + run */
+    LiteRtEnvironment   env_      = nullptr;
+    LiteRtModel         model_    = nullptr;
+    LiteRtCompiledModel compiled_ = nullptr;
+    std::string         loaded_path_;        /* artifact that compiled_ was built from */
+};
+
+} // namespace
+
+EmbedBackendFactory * litert_embed_backend_factory() {
+ static LiteRtEmbedFactory instance;
+ return &instance;
+}
+
+#else /* !ELIZA_ENABLE_LITERT — stub (kept resolvable; never selected) */
+
+namespace {
+class LiteRtEmbedStub final : public EmbedBackendFactory {
+public:
+ const char * name() const override { return "litert"; }
+ bool available() const override { return false; }
+ bool can_serve(const char *) const override { return false; }
+ int embed(EliInferenceContext *, const char *, size_t, int, float *, size_t,
+ int *, char ** out_error) override {
+ if (out_error) *out_error = dup_error("litert embed backend not compiled in");
+ return ELIZA_ERR_NOT_IMPLEMENTED;
+ }
+};
+} // namespace
+
+EmbedBackendFactory * litert_embed_backend_factory() {
+ static LiteRtEmbedStub instance;
+ return &instance;
+}
+
+#endif /* ELIZA_ENABLE_LITERT */
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index 94127affc..e35445169 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -15,6 +15,11 @@
#include "eliza-inference-ffi.h"
#include "llm-backend.h"
+#include "embed-backend.h"
+#include "vision-backend.h"
+#include "asr-backend.h"
+#include "tts-backend.h"
+#include "eot-backend.h"
#include "omnivoice.h"
#include "llama.h"
#include "mtmd.h"
@@ -1880,6 +1885,24 @@ int eliza_inference_tts_synthesize(
return ELIZA_ERR_INVALID_ARG;
}
+    /* Per-op backend seam: a TTS backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/tts/*; otherwise fall through to the in-tree OmniVoice path
+     * below. Inert by default (no backend registered). */
+ {
+ char * be_error = nullptr;
+ TtsBackendFactory * be =
+ tts_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+ if (be_error) {
+ eliza_set_error(out_error, std::string(be_error));
+ std::free(be_error);
+ return ELIZA_ERR_BUNDLE_INVALID;
+ }
+ if (be) {
+ return be->tts_synthesize(ctx, text, text_len, speaker_preset_id,
+ out_pcm, max_samples, out_error);
+ }
+ }
+
std::lock_guard lock(ctx->tts_mutex);
if (!ctx->ov) {
eliza_set_error(out_error, "[libelizainference] tts_synthesize: TTS region is not acquired; call mmap_acquire(\"tts\") after arming voice");
@@ -2081,6 +2104,25 @@ int eliza_inference_asr_transcribe(
eliza_set_error(out_error, "[libelizainference] asr_transcribe: invalid arguments");
return ELIZA_ERR_INVALID_ARG;
}
+
+ /* Per-op backend seam: an ASR backend (e.g. LiteRT/NPU) serves this when it
+ * ships an artifact under <bundle_dir>/asr/; otherwise fall through to the in-tree ggml path
+ * below. Inert by default (no backend registered). */
+ {
+ char * be_error = nullptr;
+ AsrBackendFactory * be =
+ asr_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+ if (be_error) {
+ eliza_set_error(out_error, std::string(be_error));
+ std::free(be_error);
+ return ELIZA_ERR_BUNDLE_INVALID;
+ }
+ if (be) {
+ return be->asr_transcribe(ctx, pcm, n_samples, sample_rate_hz,
+ out_text, max_text_bytes, out_error);
+ }
+ }
+
std::string transcript;
int rc = eliza_asr_decode_core(ctx, pcm, n_samples, sample_rate_hz, max_text_bytes, transcript, out_error);
if (rc < 0) {
@@ -3505,6 +3547,24 @@ int eliza_inference_embed(
return ELIZA_ERR_INVALID_ARG;
}
+ /* Per-op backend seam: an embedding backend (e.g. LiteRT/NPU) serves this
+ * when it ships an artifact under <bundle_dir>/embedding/; otherwise fall through to the in-tree
+ * ggml encoder below. Inert by default (no backend registered). */
+ {
+ char * be_error = nullptr;
+ EmbedBackendFactory * be =
+ embed_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+ if (be_error) {
+ eliza_set_error(out_error, std::string(be_error));
+ std::free(be_error);
+ return ELIZA_ERR_BUNDLE_INVALID;
+ }
+ if (be) {
+ return be->embed(ctx, text, text_len, pooling, out_embedding,
+ out_capacity, out_dim, out_error);
+ }
+ }
+
std::lock_guard lock(ctx->llm_mutex);
int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
if (rc != ELIZA_OK) return rc;
@@ -3639,6 +3699,25 @@ int eliza_inference_llm_eot_score(
return ELIZA_ERR_INVALID_ARG;
}
+ /* Per-op backend seam: an EOT backend (e.g. LiteRT/NPU) serves this when it
+ * ships an artifact under <bundle_dir>/eot/; otherwise fall through to the in-tree ggml
+ * causal-scoring path below. Inert by default (no backend registered). */
+ {
+ char * be_error = nullptr;
+ EotBackendFactory * be =
+ eot_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+ if (be_error) {
+ eliza_set_error(out_error, std::string(be_error));
+ std::free(be_error);
+ return ELIZA_ERR_BUNDLE_INVALID;
+ }
+ if (be) {
+ return be->eot_score(ctx, token_ids, num_tokens, target_token_id,
+ out_target_prob, out_top_token, out_top_prob,
+ out_error);
+ }
+ }
+
std::lock_guard lock(ctx->llm_mutex);
int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
if (rc != ELIZA_OK) return rc;
@@ -3800,6 +3879,24 @@ int eliza_inference_describe_image(
return ELIZA_ERR_INVALID_ARG;
}
+ /* Per-op backend seam: a vision backend (e.g. LiteRT/NPU) serves this when it
+ * ships an artifact under <bundle_dir>/vision/; otherwise fall through to the in-tree ggml mmproj
+ * path below. Inert by default (no backend registered). */
+ {
+ char * be_error = nullptr;
+ VisionBackendFactory * be =
+ vision_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+ if (be_error) {
+ eliza_set_error(out_error, std::string(be_error));
+ std::free(be_error);
+ return ELIZA_ERR_BUNDLE_INVALID;
+ }
+ if (be) {
+ return be->describe_image(ctx, image_bytes, n_bytes, mmproj_path,
+ prompt, out_text, max_text_bytes, out_error);
+ }
+ }
+
std::lock_guard lock(ctx->llm_mutex);
int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
if (rc != ELIZA_OK) return rc;
diff --git a/tools/omnivoice/src/embed-backend-selector.cpp b/tools/omnivoice/src/embed-backend-selector.cpp
new file mode 100644
index 000000000..56449fb07
--- /dev/null
+++ b/tools/omnivoice/src/embed-backend-selector.cpp
@@ -0,0 +1,41 @@
+/*
+ * embed-backend-selector.cpp — registry + selection for the per-op embedding
+ * backend seam. A thin instantiation of eliza_backend::Registry<EmbedBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: with no -DELIZA_ENABLE_* embedding backend
+ * compiled in, nothing registers and embed_backend_select() returns nullptr, so
+ * eliza_inference_embed keeps the in-tree ggml encoder path.
+ */
+
+#include "embed-backend.h"
+#include "backend-registry.h"
+
+#include <mutex> /* std::once_flag, std::call_once */
+
+/* Gated factory accessor — declared only when the backend is compiled in. */
+#ifdef ELIZA_ENABLE_LITERT
+EmbedBackendFactory * litert_embed_backend_factory();
+#endif
+
+namespace {
+eliza_backend::Registry<EmbedBackendFactory> g_registry; /* file-local registry: filled by embed_backend_register(), queried by embed_backend_select() */
+std::once_flag g_builtins_once; /* lets embed_backend_register_builtins() run its registration body only once */
+} // namespace
+
+void embed_backend_register(EmbedBackendFactory * factory) { /* adds `factory` to this file's registry */
+ g_registry.register_factory(factory); /* forwarded unchanged; no null check is performed here */
+}
+
+void embed_backend_register_builtins() {
+    std::call_once(g_builtins_once, [] { /* body runs on the first call only */
+#if defined(ELIZA_ENABLE_LITERT)
+        embed_backend_register(litert_embed_backend_factory());
+#endif /* ELIZA_ENABLE_LITERT */
+    });
+}
+
+EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error) {
+    embed_backend_register_builtins(); /* safe to repeat: guarded by a once-flag */
+    EmbedBackendFactory * chosen = g_registry.select("ELIZA_EMBED_BACKEND", "ELIZA_BACKEND", "embed", bundle_dir, out_error);
+    return chosen; /* whatever the registry resolved, passed through untouched */
+}
diff --git a/tools/omnivoice/src/embed-backend.h b/tools/omnivoice/src/embed-backend.h
new file mode 100644
index 000000000..23473a648
--- /dev/null
+++ b/tools/omnivoice/src/embed-backend.h
@@ -0,0 +1,62 @@
+#pragma once
+/*
+ * embed-backend.h — per-op backend seam for pooled text embeddings.
+ *
+ * The first per-op generalization of the M3 streaming-LLM seam: a one-shot op
+ * (eliza_inference_embed) that an accelerator backend can serve when it ships an
+ * embedding artifact under `<bundle_dir>/embedding/`, while every other op — and
+ * embedding itself when no artifact is present — stays on the in-tree ggml path.
+ *
+ * Embedding is the natural first LiteRT/NPU target: a static-shape, encoder-only
+ * forward with no streaming/KV/sampler, so the factory mirrors the FFI 1:1 and
+ * the FFI delegates without translation. Selection reuses the shared
+ * eliza_backend::Registry (backend-registry.h): ELIZA_EMBED_BACKEND (per-op) then
+ * ELIZA_BACKEND (global) hard-select, else the highest preference_rank among
+ * available()+can_serve() factories, else nullptr (the ggml encoder path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef> /* size_t, used in EmbedBackendFactory::embed */
+
+struct EliInferenceContext; /* only ever used through a pointer in this header */
+
+/* Interface implemented once per linked-in embedding runtime (e.g. LiteRT); instances are passed to embed_backend_register(). */
+struct EmbedBackendFactory {
+ virtual ~EmbedBackendFactory() = default;
+
+ /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+ * ELIZA_EMBED_BACKEND / ELIZA_BACKEND. */
+ virtual const char * name() const = 0;
+
+ /* Compiled in AND host deps present (the LiteRT runtime + a GPU/NPU
+ * delegate). Cheap — must not load a model. */
+ virtual bool available() const = 0;
+
+ /* The embedding artifact exists under `<bundle_dir>/embedding/`. Cheap
+ * directory probe, no model load. */
+ virtual bool can_serve(const char * bundle_dir) const = 0;
+
+ /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+ * An NPU-served embedding returns a high positive value; a GPU-delegate
+ * fallback a lower positive value. This base implementation returns 0. */
+ virtual int preference_rank() const { return 0; }
+
+ /* Mirrors eliza_inference_embed 1:1. Returns ELIZA_OK and writes `*out_dim`
+ * floats into out_embedding (NOTE(review): presumably out_capacity >= *out_dim is required — confirm against the FFI), or a negative ELIZA_*
+ * code with `*out_error` heap-allocated for the caller to free. */
+ virtual int embed(EliInferenceContext * ctx, const char * text, size_t text_len,
+ int pooling, float * out_embedding, size_t out_capacity,
+ int * out_dim, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void embed_backend_register(EmbedBackendFactory * factory);
+
+/* Register every embedding backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by embed_backend_select. */
+void embed_backend_register_builtins();
+
+/* Pick an embedding backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml encoder path. nullptr + *out_error => hard failure. */
+EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/eot-backend-selector.cpp b/tools/omnivoice/src/eot-backend-selector.cpp
new file mode 100644
index 000000000..32bb9fe65
--- /dev/null
+++ b/tools/omnivoice/src/eot-backend-selector.cpp
@@ -0,0 +1,35 @@
+/*
+ * eot-backend-selector.cpp — registry + selection for the per-op end-of-turn
+ * scoring backend seam. A thin instantiation of
+ * eliza_backend::Registry<EotBackendFactory> (backend-registry.h) — the
+ * resolution logic is shared with every other modality. Inert by default: no
+ * -DELIZA_ENABLE_* EOT backend is compiled in (none exists yet), so nothing
+ * registers and eot_backend_select() returns nullptr, so
+ * eliza_inference_llm_eot_score keeps the in-tree ggml causal-scoring path.
+ */
+
+#include "eot-backend.h"
+#include "backend-registry.h"
+
+#include <mutex> /* std::once_flag, std::call_once */
+
+namespace {
+eliza_backend::Registry<EotBackendFactory> g_registry; /* file-local registry: filled by eot_backend_register(), queried by eot_backend_select() */
+std::once_flag g_builtins_once; /* lets eot_backend_register_builtins() run its registration body only once */
+} // namespace
+
+void eot_backend_register(EotBackendFactory * factory) { /* adds `factory` to this file's registry */
+ g_registry.register_factory(factory); /* forwarded unchanged; no null check is performed here */
+}
+
+void eot_backend_register_builtins() {
+    std::call_once(g_builtins_once, [] {
+        /* Nothing to register: no EOT backend exists yet, so the seam stays inert. */
+    });
+}
+
+EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error) {
+    eot_backend_register_builtins(); /* safe to repeat: guarded by a once-flag */
+    EotBackendFactory * chosen = g_registry.select("ELIZA_EOT_BACKEND", "ELIZA_BACKEND", "eot", bundle_dir, out_error);
+    return chosen; /* whatever the registry resolved, passed through untouched */
+}
diff --git a/tools/omnivoice/src/eot-backend.h b/tools/omnivoice/src/eot-backend.h
new file mode 100644
index 000000000..1c51dcbb6
--- /dev/null
+++ b/tools/omnivoice/src/eot-backend.h
@@ -0,0 +1,62 @@
+#pragma once
+/*
+ * eot-backend.h — per-op backend seam for end-of-turn scoring.
+ *
+ * A one-shot op (eliza_inference_llm_eot_score) that an accelerator backend can
+ * serve when it ships an EOT artifact under `<bundle_dir>/eot/`, while every other
+ * op — and EOT itself when no artifact is present — stays on the in-tree ggml
+ * causal-scoring path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_EOT_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml EOT-scoring path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef> /* size_t, used in EotBackendFactory::eot_score */
+#include <cstdint> /* int32_t, used in EotBackendFactory::eot_score */
+
+struct EliInferenceContext; /* only ever used through a pointer in this header */
+
+/* Interface implemented once per linked-in EOT runtime (e.g. LiteRT); instances are passed to eot_backend_register(). */
+struct EotBackendFactory {
+ virtual ~EotBackendFactory() = default;
+
+ /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+ * ELIZA_EOT_BACKEND / ELIZA_BACKEND. */
+ virtual const char * name() const = 0;
+
+ /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+ * Cheap — must not load a model. */
+ virtual bool available() const = 0;
+
+ /* The EOT artifact exists under `<bundle_dir>/eot/`. Cheap directory probe,
+ * no model load. */
+ virtual bool can_serve(const char * bundle_dir) const = 0;
+
+ /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+ * An NPU-served EOT returns a high positive value; a GPU-delegate fallback a
+ * lower positive value. This base implementation returns 0. */
+ virtual int preference_rank() const { return 0; }
+
+ /* Mirrors eliza_inference_llm_eot_score 1:1. Returns ELIZA_OK and writes the
+ * next-token probabilities (presumably via out_target_prob / out_top_token / out_top_prob — confirm against the FFI), or a negative ELIZA_* code with `*out_error`
+ * heap-allocated for the caller to free. */
+ virtual int eot_score(EliInferenceContext * ctx, const int32_t * token_ids, size_t num_tokens,
+ int32_t target_token_id, float * out_target_prob, int32_t * out_top_token,
+ float * out_top_prob, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void eot_backend_register(EotBackendFactory * factory);
+
+/* Register every EOT backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by eot_backend_select. */
+void eot_backend_register_builtins();
+
+/* Pick an EOT backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml EOT-scoring path. nullptr + *out_error => hard failure. */
+EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp
index fa5fa703c..3ffe37680 100644
--- a/tools/omnivoice/src/llm-backend-selector.cpp
+++ b/tools/omnivoice/src/llm-backend-selector.cpp
@@ -20,7 +20,7 @@
/* Gated backend factory accessors. Declared only when the matching backend is
* compiled in; register_builtins() calls them under the same gate. Keeping the
* declarations gated means the default build has no unresolved symbols. */
-#ifdef ELIZA_ENABLE_LITERT
+#ifdef ELIZA_ENABLE_LITERT_LM
LlmBackendFactory * litert_backend_factory();
#endif
#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
@@ -70,7 +70,7 @@ void llm_backend_register(LlmBackendFactory * factory) {
void llm_backend_register_builtins() {
std::call_once(g_builtins_once, []() {
-#ifdef ELIZA_ENABLE_LITERT
+#ifdef ELIZA_ENABLE_LITERT_LM
llm_backend_register(litert_backend_factory());
#endif
#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
diff --git a/tools/omnivoice/src/tts-backend-selector.cpp b/tools/omnivoice/src/tts-backend-selector.cpp
new file mode 100644
index 000000000..ad2d28447
--- /dev/null
+++ b/tools/omnivoice/src/tts-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * tts-backend-selector.cpp — registry + selection for the per-op TTS backend
+ * seam. A thin instantiation of eliza_backend::Registry<TtsBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* TTS backend is compiled in
+ * (none exists yet), so nothing registers and tts_backend_select() returns
+ * nullptr, so eliza_inference_tts_synthesize keeps the in-tree OmniVoice path.
+ */
+
+#include "tts-backend.h"
+#include "backend-registry.h"
+
+#include <mutex> /* std::once_flag, std::call_once */
+
+namespace {
+eliza_backend::Registry<TtsBackendFactory> g_registry; /* file-local registry: filled by tts_backend_register(), queried by tts_backend_select() */
+std::once_flag g_builtins_once; /* lets tts_backend_register_builtins() run its registration body only once */
+} // namespace
+
+void tts_backend_register(TtsBackendFactory * factory) { /* adds `factory` to this file's registry */
+ g_registry.register_factory(factory); /* forwarded unchanged; no null check is performed here */
+}
+
+void tts_backend_register_builtins() {
+    std::call_once(g_builtins_once, [] {
+        /* Nothing to register: no TTS backend exists yet, so the seam stays inert. */
+    });
+}
+
+TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error) {
+    tts_backend_register_builtins(); /* safe to repeat: guarded by a once-flag */
+    TtsBackendFactory * chosen = g_registry.select("ELIZA_TTS_BACKEND", "ELIZA_BACKEND", "tts", bundle_dir, out_error);
+    return chosen; /* whatever the registry resolved, passed through untouched */
+}
diff --git a/tools/omnivoice/src/tts-backend.h b/tools/omnivoice/src/tts-backend.h
new file mode 100644
index 000000000..127ce2a33
--- /dev/null
+++ b/tools/omnivoice/src/tts-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * tts-backend.h — per-op backend seam for text-to-speech synthesis.
+ *
+ * A one-shot op (eliza_inference_tts_synthesize) that an accelerator backend can
+ * serve when it ships a TTS artifact under `<bundle_dir>/tts/`, while every other
+ * op — and TTS itself when no artifact is present — stays on the in-tree
+ * OmniVoice/ggml path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_TTS_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the in-tree OmniVoice path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef> /* size_t, used in TtsBackendFactory::tts_synthesize */
+
+struct EliInferenceContext; /* only ever used through a pointer in this header */
+
+/* Interface implemented once per linked-in TTS runtime (e.g. LiteRT); instances are passed to tts_backend_register(). */
+struct TtsBackendFactory {
+ virtual ~TtsBackendFactory() = default;
+
+ /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+ * ELIZA_TTS_BACKEND / ELIZA_BACKEND. */
+ virtual const char * name() const = 0;
+
+ /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+ * Cheap — must not load a model. */
+ virtual bool available() const = 0;
+
+ /* The TTS artifact exists under `<bundle_dir>/tts/`. Cheap directory probe,
+ * no model load. */
+ virtual bool can_serve(const char * bundle_dir) const = 0;
+
+ /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+ * An NPU-served TTS returns a high positive value; a GPU-delegate fallback a
+ * lower positive value. This base implementation returns 0. */
+ virtual int preference_rank() const { return 0; }
+
+ /* Mirrors eliza_inference_tts_synthesize 1:1. Returns the number of fp32 PCM
+ * samples actually written to out_pcm (>= 0) on success, or a negative ELIZA_* code with
+ * `*out_error` heap-allocated for the caller to free. */
+ virtual int tts_synthesize(EliInferenceContext * ctx, const char * text, size_t text_len,
+ const char * speaker_preset_id, float * out_pcm,
+ size_t max_samples, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void tts_backend_register(TtsBackendFactory * factory);
+
+/* Register every TTS backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by tts_backend_select. */
+void tts_backend_register_builtins();
+
+/* Pick a TTS backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree OmniVoice path. nullptr + *out_error => hard failure. */
+TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/vision-backend-selector.cpp b/tools/omnivoice/src/vision-backend-selector.cpp
new file mode 100644
index 000000000..095450cab
--- /dev/null
+++ b/tools/omnivoice/src/vision-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * vision-backend-selector.cpp — registry + selection for the per-op vision
+ * backend seam. A thin instantiation of eliza_backend::Registry<VisionBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* vision backend is compiled in
+ * (none exists yet), so nothing registers and vision_backend_select() returns
+ * nullptr, so eliza_inference_describe_image keeps the in-tree ggml mmproj path.
+ */
+
+#include "vision-backend.h"
+#include "backend-registry.h"
+
+#include <mutex> /* std::once_flag, std::call_once */
+
+namespace {
+eliza_backend::Registry<VisionBackendFactory> g_registry; /* file-local registry: filled by vision_backend_register(), queried by vision_backend_select() */
+std::once_flag g_builtins_once; /* lets vision_backend_register_builtins() run its registration body only once */
+} // namespace
+
+void vision_backend_register(VisionBackendFactory * factory) { /* adds `factory` to this file's registry */
+ g_registry.register_factory(factory); /* forwarded unchanged; no null check is performed here */
+}
+
+void vision_backend_register_builtins() {
+    std::call_once(g_builtins_once, [] {
+        /* Nothing to register: no vision backend exists yet, so the seam stays inert. */
+    });
+}
+
+VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error) {
+    vision_backend_register_builtins(); /* safe to repeat: guarded by a once-flag */
+    VisionBackendFactory * chosen = g_registry.select("ELIZA_VISION_BACKEND", "ELIZA_BACKEND", "vision", bundle_dir, out_error);
+    return chosen; /* whatever the registry resolved, passed through untouched */
+}
diff --git a/tools/omnivoice/src/vision-backend.h b/tools/omnivoice/src/vision-backend.h
new file mode 100644
index 000000000..51da0632a
--- /dev/null
+++ b/tools/omnivoice/src/vision-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * vision-backend.h — per-op backend seam for mmproj image description.
+ *
+ * A one-shot op (eliza_inference_describe_image) that an accelerator backend can
+ * serve when it ships a vision artifact under `<bundle_dir>/vision/`, while every
+ * other op — and vision itself when no artifact is present — stays on the
+ * in-tree ggml mmproj path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_VISION_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else
+ * the highest preference_rank among available()+can_serve() factories, else
+ * nullptr (the ggml mmproj path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef> /* size_t, used in VisionBackendFactory::describe_image */
+
+struct EliInferenceContext; /* only ever used through a pointer in this header */
+
+/* Interface implemented once per linked-in vision runtime (e.g. LiteRT); instances are passed to vision_backend_register(). */
+struct VisionBackendFactory {
+ virtual ~VisionBackendFactory() = default;
+
+ /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+ * ELIZA_VISION_BACKEND / ELIZA_BACKEND. */
+ virtual const char * name() const = 0;
+
+ /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+ * Cheap — must not load a model. */
+ virtual bool available() const = 0;
+
+ /* The vision artifact exists under `<bundle_dir>/vision/`. Cheap directory
+ * probe, no model load. */
+ virtual bool can_serve(const char * bundle_dir) const = 0;
+
+ /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+ * An NPU-served vision returns a high positive value; a GPU-delegate
+ * fallback a lower positive value. This base implementation returns 0. */
+ virtual int preference_rank() const { return 0; }
+
+ /* Mirrors eliza_inference_describe_image 1:1. Returns the number of bytes
+ * written to out_text (excluding the terminator) on success, or a negative ELIZA_* code
+ * with `*out_error` heap-allocated for the caller to free. */
+ virtual int describe_image(EliInferenceContext * ctx, const unsigned char * image_bytes,
+ size_t n_bytes, const char * mmproj_path, const char * prompt,
+ char * out_text, size_t max_text_bytes, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void vision_backend_register(VisionBackendFactory * factory);
+
+/* Register every vision backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by vision_backend_select. */
+void vision_backend_register_builtins();
+
+/* Pick a vision backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml mmproj path. nullptr + *out_error => hard failure. */
+VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error);