From 6c7616eee246df59f5f64aff9ad346b5dc2069bd Mon Sep 17 00:00:00 2001
From: claude <noreply@anthropic.com>
Date: Wed, 24 Jun 2026 16:05:39 -0700
Subject: [PATCH] feat(elizainference): per-op backend seam + LiteRT C-API
 embed backend

Generalize the M3 streaming-LLM seam to ALL on-device model ops. A shared
eliza_backend::Registry<F> (backend-registry.h) holds the resolution logic
(ELIZA_<MOD>_BACKEND/ELIZA_BACKEND hard-select -> highest preference_rank among
available()+can_serve() -> nullptr=ggml); each modality adds a tiny factory
interface + selector + one FFI chokepoint. Wired for embed/vision/asr/tts/eot:
each routes to a backend that ships <bundle>/<modality>/* when present, else
falls through to the in-tree ggml path. Inert-by-default (no backend registered
=> select() returns nullptr => every op byte-identical to before).

First real backend: LiteRT text embedding (backends/litert-embed-backend.cpp,
gated ELIZA_ENABLE_LITERT) on the LiteRT Next *C* API (the C++ cc/ wrappers are
not standalone): env/model/compiled-model lifecycle + NPU->GPU->CPU accelerator
ladder (rank 100/20/0) + reads the in-graph-pooled [1,384] output; the WordPiece
tokenizer + tensor binding are the one model-specific step (MANIFEST-gated).
Serves <bundle>/embedding/*.tflite; auto-promotes to NPU on Pixel-10/G5 or
Qualcomm/MediaTek silicon, GPU-delegate (Mali) on a Tensor-G4.

Split the LiteRT gates: ELIZA_ENABLE_LITERT = the LiteRT C-API per-op backends
(embed); new ELIZA_ENABLE_LITERT_LM = the streaming-LLM backend on the heavier
LiteRT-LM Engine SDK (off until that SDK is built). SESSION-OPS-TODO.md documents
the vad/wakeword/speaker/diariz extension.

Verified: 11/11 TUs compile (inert selectors + self-contained headers + the
gated embed backend against the LiteRT SDK); adversarial review confirms
inert-by-default + correct chokepoints across all 5 modalities.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tools/omnivoice/CMakeLists.txt                |  40 ++-
 tools/omnivoice/src/SESSION-OPS-TODO.md       | 159 +++++++++++
 tools/omnivoice/src/asr-backend-selector.cpp  |  34 +++
 tools/omnivoice/src/asr-backend.h             |  61 +++++
 tools/omnivoice/src/backend-registry.h        | 147 ++++++++++
 .../src/backends/litert-embed-backend.cpp     | 252 ++++++++++++++++++
 tools/omnivoice/src/eliza-inference-ffi.cpp   |  97 +++++++
 .../omnivoice/src/embed-backend-selector.cpp  |  41 +++
 tools/omnivoice/src/embed-backend.h           |  62 +++++
 tools/omnivoice/src/eot-backend-selector.cpp  |  35 +++
 tools/omnivoice/src/eot-backend.h             |  62 +++++
 tools/omnivoice/src/llm-backend-selector.cpp  |   4 +-
 tools/omnivoice/src/tts-backend-selector.cpp  |  34 +++
 tools/omnivoice/src/tts-backend.h             |  61 +++++
 .../omnivoice/src/vision-backend-selector.cpp |  34 +++
 tools/omnivoice/src/vision-backend.h          |  61 +++++
 16 files changed, 1177 insertions(+), 7 deletions(-)
 create mode 100644 tools/omnivoice/src/SESSION-OPS-TODO.md
 create mode 100644 tools/omnivoice/src/asr-backend-selector.cpp
 create mode 100644 tools/omnivoice/src/asr-backend.h
 create mode 100644 tools/omnivoice/src/backend-registry.h
 create mode 100644 tools/omnivoice/src/backends/litert-embed-backend.cpp
 create mode 100644 tools/omnivoice/src/embed-backend-selector.cpp
 create mode 100644 tools/omnivoice/src/embed-backend.h
 create mode 100644 tools/omnivoice/src/eot-backend-selector.cpp
 create mode 100644 tools/omnivoice/src/eot-backend.h
 create mode 100644 tools/omnivoice/src/tts-backend-selector.cpp
 create mode 100644 tools/omnivoice/src/tts-backend.h
 create mode 100644 tools/omnivoice/src/vision-backend-selector.cpp
 create mode 100644 tools/omnivoice/src/vision-backend.h
diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
index 6cb3e13a6..038fa5be4 100644
--- a/tools/omnivoice/CMakeLists.txt
+++ b/tools/omnivoice/CMakeLists.txt
@@ -83,6 +83,14 @@ set(OMNIVOICE_FFI_SOURCES
     # backend below registers itself, so the default build keeps the in-tree
     # llama.cpp path.
     src/llm-backend-selector.cpp
+    # Per-op backend seams (cutover M3+). Each modality's selector reuses the
+    # shared eliza_backend::Registry (backend-registry.h) and is inert until a
+    # gated backend registers — so the default build keeps the ggml path per-op.
+    src/embed-backend-selector.cpp
+    src/vision-backend-selector.cpp
+    src/asr-backend-selector.cpp
+    src/tts-backend-selector.cpp
+    src/eot-backend-selector.cpp
 )
 
 # Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -231,7 +239,12 @@ option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
 # pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK
 # (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
 # default. See docs/multi-backend-ffi-seam.md.
-option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
+option(ELIZA_ENABLE_LITERT "Build the LiteRT C-API per-op backends, e.g. embed (M4)" OFF)
+
+# ELIZA_ENABLE_LITERT_LM — the streaming-LLM backend on the heavier LiteRT-LM
+# Engine SDK (litert::lm), separate from the LiteRT C runtime above. OFF until
+# that SDK is built; point -DELIZA_LITERT_LM_SDK_DIR / -DELIZA_LITERT_LM_LIBS at it.
+option(ELIZA_ENABLE_LITERT_LM "Build the LiteRT-LM in-process streaming-LLM backend" OFF)
 
 # ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
 # (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
@@ -297,12 +310,13 @@ if(TARGET mtmd)
     # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
     # default desktop/CI build is byte-for-byte the pre-seam behavior.
     if(ELIZA_ENABLE_LITERT)
+        # LiteRT C-API per-op backends (embed today; vision/etc. as artifacts
+        # ship). SDK = the LiteRT C runtime (github.com/google-ai-edge/LiteRT,
+        # libLiteRt.so + the GPU/NPU delegate). Point at a built SDK with
+        # -DELIZA_LITERT_SDK_DIR=<dir> and link with -DELIZA_LITERT_LIBS=LiteRt.
         target_sources(elizainference PRIVATE
-            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-embed-backend.cpp)
         target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
-        # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
-        # SDK with -DELIZA_LITERT_SDK_DIR=<dir>; the device/host cross-build
-        # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=<libs>.
         if(ELIZA_LITERT_SDK_DIR)
             target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
             target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
@@ -311,6 +325,22 @@ if(TARGET mtmd)
             target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
         endif()
     endif()
+    if(ELIZA_ENABLE_LITERT_LM)
+        # The streaming-LLM backend needs the heavier LiteRT-LM Engine SDK
+        # (litert::lm, github.com/google-ai-edge/LiteRT-LM) — separate from the
+        # LiteRT C runtime above. Point at it with -DELIZA_LITERT_LM_SDK_DIR /
+        # -DELIZA_LITERT_LM_LIBS.
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT_LM)
+        if(ELIZA_LITERT_LM_SDK_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_LM_SDK_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_LM_SDK_DIR}/lib)
+        endif()
+        if(ELIZA_LITERT_LM_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LM_LIBS})
+        endif()
+    endif()
     if(ELIZA_ENABLE_MLX)
         if(NOT APPLE)
             message(FATAL_ERROR
diff --git a/tools/omnivoice/src/SESSION-OPS-TODO.md b/tools/omnivoice/src/SESSION-OPS-TODO.md
new file mode 100644
index 000000000..7095b8952
--- /dev/null
+++ b/tools/omnivoice/src/SESSION-OPS-TODO.md
@@ -0,0 +1,159 @@
+# Session-op backend seam — design (NOT implemented)
+
+The per-op backend seam (`backend-registry.h` + `<mod>-backend.h` +
+`<mod>-backend-selector.cpp` + a chokepoint at the top of the FFI fn) is now in
+place for the **one-shot** ops:
+
+| modality | FFI fn                          | header / selector            | env key               | artifact dir       |
+|----------|---------------------------------|------------------------------|-----------------------|--------------------|
+| embed    | `eliza_inference_embed`         | `embed-backend.*`            | `ELIZA_EMBED_BACKEND` | `<bundle>/embedding/` |
+| vision   | `eliza_inference_describe_image`| `vision-backend.*`           | `ELIZA_VISION_BACKEND`| `<bundle>/vision/` |
+| asr      | `eliza_inference_asr_transcribe`| `asr-backend.*`              | `ELIZA_ASR_BACKEND`   | `<bundle>/asr/`    |
+| tts      | `eliza_inference_tts_synthesize`| `tts-backend.*`              | `ELIZA_TTS_BACKEND`   | `<bundle>/tts/`    |
+| eot      | `eliza_inference_llm_eot_score` | `eot-backend.*`              | `ELIZA_EOT_BACKEND`   | `<bundle>/eot/`    |
+
+A one-shot op is stateless across calls: select → (delegate | fall through to
+ggml) on every call. There is nothing to keep alive between calls, so the seam
+is a single chokepoint at the top of the fn.
+
+The **session** ops are different: `vad`, `wakeword`, `speaker`, `diariz` each
+`_open` a native handle (`EliVad *`, `EliWakeword *`, `EliSpeaker *`,
+`EliDiariz *`) that persists across many `_segment`/`_detect`/`_embed` calls and
+is torn down with `_close`/`_reset`. The seam has to follow that lifecycle, not
+re-select per call. This file records HOW to extend the seam to them. **None of
+the below is implemented yet.**
+
+## The shape of a session op (today, in-tree only)
+
+Each session modality exposes, e.g. for VAD:
+
+```c
+EliVad * eliza_inference_vad_open(EliInferenceContext * ctx, /* params */, char ** out_error);
+int      eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error);
+int      eliza_inference_vad_reset(EliVad * vad, char ** out_error);
+void     eliza_inference_vad_close(EliVad * vad);
+```
+
+`EliVad` (and the wakeword/speaker/diariz equivalents) is the in-tree handle
+struct defined in `eliza-inference-ffi.cpp`. Its in-tree fields stay exactly as
+they are; the seam is **additive** — one extra pointer.
+
+## Extending the seam to a session op
+
+For each session modality `<mod>` (vad | wakeword | speaker | diariz):
+
+### 1. A session factory interface — `<mod>-backend.h`
+
+Mirror the one-shot factory's four common probes, but the forward methods mirror
+the **session** ABI 1:1 instead of a single one-shot fn. The factory does NOT
+own the handle struct; it produces and operates on an opaque backend-session:
+
+```cpp
+struct VadBackendFactory {
+    virtual ~VadBackendFactory() = default;
+    virtual const char * name() const = 0;
+    virtual bool         available() const = 0;
+    virtual bool         can_serve(const char * bundle_dir) const = 0;  // probes <bundle>/vad/
+    virtual int          preference_rank() const { return 0; }
+
+    // Lifecycle, mirroring the FFI session ABI 1:1. The factory returns an
+    // opaque backend-session pointer it owns; the FFI stashes it on the Eli*
+    // handle. A NULL return + *out_error is a hard open failure.
+    virtual void * open(EliInferenceContext * ctx, /* same params as eliza_inference_vad_open */,
+                        char ** out_error) = 0;
+    virtual int   segment(void * session, const float * pcm, size_t n, /* out */, char ** out_error) = 0;
+    virtual int   reset(void * session, char ** out_error) = 0;
+    virtual void  close(void * session) = 0;
+};
+```
+
+Plus the same free-functions as the one-shot seam:
+`vad_backend_register`, `vad_backend_register_builtins` (EMPTY for now — no
+LiteRT session backend exists), `vad_backend_select(bundle_dir, out_error)`,
+backed by a `eliza_backend::Registry<VadBackendFactory>` in
+`<mod>-backend-selector.cpp` with env keys `ELIZA_VAD_BACKEND` → `ELIZA_BACKEND`
+and modality `"vad"`. Artifact probe dir `<bundle>/vad/` (resp. `wakeword/`,
+`speaker/`, `diariz/`).
+
+### 2. A backend-session pointer on the Eli* handle
+
+The selection happens ONCE, at `_open`, not per call. Add one field to the
+in-tree handle struct:
+
+```cpp
+struct EliVad {
+    /* ... existing in-tree fields, unchanged ... */
+
+    /* Backend seam (additive). When non-null, this handle is served by an
+     * accelerator backend and every op delegates to it; the in-tree fields
+     * above are then unused. When null, the in-tree ggml path owns the handle. */
+    VadBackendFactory * be         = nullptr;  // the factory that opened be_session
+    void *              be_session = nullptr;  // factory-owned backend session
+};
+```
+
+### 3. Select at `_open`
+
+In `eliza_inference_vad_open`, after the existing arg validation and before the
+in-tree handle is built:
+
+```cpp
+char * be_error = nullptr;
+VadBackendFactory * be = vad_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+if (be_error) { eliza_set_error(out_error, std::string(be_error)); std::free(be_error);
+                return /* NULL handle */; }
+if (be) {
+    void * sess = be->open(ctx, /* params */, out_error);
+    if (!sess) return /* NULL handle — open failed, out_error already set */;
+    EliVad * h = new EliVad();
+    h->be = be;
+    h->be_session = sess;
+    return h;
+}
+/* else: fall through and build the in-tree handle exactly as today. */
+```
+
+### 4. A guard at the TOP of each `_segment` / `_reset` / `_close`
+
+Each per-call op checks the backend pointer and delegates before touching any
+in-tree state:
+
+```cpp
+int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error) {
+    if (!vad) { /* invalid-arg as today */ }
+    if (vad->be) {                                   // <-- guard
+        return vad->be->segment(vad->be_session, pcm, n, /* out */, out_error);
+    }
+    /* ... existing in-tree ggml segment body, unchanged ... */
+}
+
+void eliza_inference_vad_close(EliVad * vad) {
+    if (!vad) return;
+    if (vad->be) { vad->be->close(vad->be_session); delete vad; return; }  // <-- guard
+    /* ... existing in-tree teardown, then delete vad ... */
+}
+```
+
+`_reset` follows the same guard pattern.
+
+## Why this shape (vs. re-selecting per call)
+
+- **Selection is per-session, not per-call.** A session's backend is fixed at
+  `_open`; you cannot have `_segment` cross from the ggml path to LiteRT mid
+  session because the KV/feature state lives in the (in-tree OR backend)
+  session, not on the FFI boundary. The one pointer captures that binding.
+- **Hard-fail localizes to `_open`.** A bundle-invalid override surfaces once,
+  where the caller is already prepared to handle a NULL handle, instead of on
+  every `_segment`.
+- **Additive + inert.** With no session backend registered (the case today),
+  `_open`'s `select()` returns nullptr, `be`/`be_session` stay null, and every
+  guard is a no-op — the in-tree path is byte-for-byte unchanged. Same inert-by
+  -default contract as the one-shot seam.
+
+## Status
+
+- One-shot seam: embed (with a LiteRT builtin), vision/asr/tts/eot (inert,
+  no builtin) — **done**.
+- Session seam (vad/wakeword/speaker/diariz): **not implemented.** No
+  `<mod>-backend.{h,cpp}`, no handle field, no `_open` select, no per-call
+  guards exist yet. This file is the spec for when a session backend lands.
diff --git a/tools/omnivoice/src/asr-backend-selector.cpp b/tools/omnivoice/src/asr-backend-selector.cpp
new file mode 100644
index 000000000..7513e7d9d
--- /dev/null
+++ b/tools/omnivoice/src/asr-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * asr-backend-selector.cpp — registry + selection for the per-op ASR backend
+ * seam. A thin instantiation of eliza_backend::Registry<AsrBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* ASR backend is compiled in
+ * (none exists yet), so nothing registers and asr_backend_select() returns
+ * nullptr, so eliza_inference_asr_transcribe keeps the in-tree ggml path.
+ */
+
+#include "asr-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<AsrBackendFactory> g_registry;
+std::once_flag                             g_builtins_once;
+} // namespace
+
+void asr_backend_register(AsrBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+void asr_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* No ASR backend exists yet — the seam stays inert. */
+    });
+}
+
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error) {
+    asr_backend_register_builtins();
+    return g_registry.select("ELIZA_ASR_BACKEND", "ELIZA_BACKEND", "asr",
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/asr-backend.h b/tools/omnivoice/src/asr-backend.h
new file mode 100644
index 000000000..2dd9fec49
--- /dev/null
+++ b/tools/omnivoice/src/asr-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * asr-backend.h — per-op backend seam for speech-to-text transcription.
+ *
+ * A one-shot op (eliza_inference_asr_transcribe) that an accelerator backend can
+ * serve when it ships an ASR artifact under `<bundle>/asr/`, while every other
+ * op — and ASR itself when no artifact is present — stays on the in-tree ggml
+ * path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_ASR_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml ASR path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in ASR runtime (e.g. LiteRT). */
+struct AsrBackendFactory {
+    virtual ~AsrBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_ASR_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The ASR artifact exists under `<bundle_dir>/asr/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served ASR returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_asr_transcribe 1:1. Returns the number of bytes
+     * written (excluding the terminator) on success, or a negative ELIZA_* code
+     * with `*out_error` heap-allocated for the caller to free. */
+    virtual int asr_transcribe(EliInferenceContext * ctx, const float * pcm, size_t n_samples,
+                               int sample_rate_hz, char * out_text, size_t max_text_bytes,
+                               char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void asr_backend_register(AsrBackendFactory * factory);
+
+/* Register every ASR backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by asr_backend_select. */
+void asr_backend_register_builtins();
+
+/* Pick an ASR backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml ASR path. nullptr + *out_error => hard failure. */
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/backend-registry.h b/tools/omnivoice/src/backend-registry.h
new file mode 100644
index 000000000..14a40b3fd
--- /dev/null
+++ b/tools/omnivoice/src/backend-registry.h
@@ -0,0 +1,147 @@
+#pragma once
+/*
+ * backend-registry.h — generic per-modality backend registry + selection.
+ *
+ * Factored out of the M3 streaming-LLM seam (llm-backend-selector.cpp) so EVERY
+ * on-device modality (embed, asr, tts, vision, vad, wakeword, speaker, diarizer,
+ * eot, …) reuses ONE resolution implementation instead of copy-pasting it. A
+ * modality declares a small factory interface with the four common probes
+ * (name / available / can_serve / preference_rank) plus its own forward method,
+ * instantiates `eliza_backend::Registry<ThatFactory>`, and selects with the
+ * shared logic below:
+ *
+ *   1. `ELIZA_<MOD>_BACKEND` env (per-op) → else `ELIZA_BACKEND` (global) — a
+ *      HARD select. An in-tree name ("llama.cpp"/"ggml"/"default") forces the
+ *      ggml path (returns nullptr, no error). Any other name that is not
+ *      registered+available or cannot serve the bundle is a hard error
+ *      (nullptr + *out_error).
+ *   2. No override: among registered factories that are available() AND
+ *      can_serve(bundle_dir), pick the highest preference_rank(). None → nullptr.
+ *
+ * A nullptr return with *out_error == nullptr means "use the in-tree ggml path"
+ * — NOT an error. Inert by default: with no -DELIZA_ENABLE_* backend compiled,
+ * nothing registers and select() always returns nullptr, so every op keeps the
+ * in-tree path byte-for-byte.
+ *
+ * Factory type F must expose:
+ *   const char * name() const;          // stable lower-case id
+ *   bool         available() const;     // compiled-in AND host deps present; cheap
+ *   bool         can_serve(const char * bundle_dir) const;  // artifact probe; cheap
+ *   int          preference_rank() const;                   // higher wins; ggml == 0
+ */
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace eliza_backend {
+
+/* malloc-allocate an error string so the caller frees it with
+ * eliza_inference_free_string() (free()), matching the FFI contract. */
+inline char * dup_error(const std::string & msg) {
+    char * out = (char *) std::malloc(msg.size() + 1);
+    if (out) std::memcpy(out, msg.c_str(), msg.size() + 1);
+    return out;
+}
+
+inline bool iequals(const char * a, const char * b) {
+    if (!a || !b) return false;
+    while (*a && *b) {
+        if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) {
+            return false;
+        }
+        ++a;
+        ++b;
+    }
+    return *a == *b;
+}
+
+/* Names that mean "stay on the in-tree ggml/llama.cpp path". */
+inline bool is_intree_name(const char * s) {
+    return iequals(s, "llama.cpp") || iequals(s, "llamacpp") || iequals(s, "llama") ||
+           iequals(s, "ggml") || iequals(s, "intree") || iequals(s, "default");
+}
+
+template <class Factory>
+class Registry {
+public:
+    /* Idempotent by name. Safe from static init. Does not take ownership —
+     * factories are static-lifetime singletons. */
+    void register_factory(Factory * factory) {
+        if (!factory) return;
+        std::lock_guard<std::mutex> lock(mu_);
+        for (Factory * f : factories_) {
+            if (iequals(f->name(), factory->name())) return;
+        }
+        factories_.push_back(factory);
+    }
+
+    /* env_key: the per-op override (e.g. "ELIZA_EMBED_BACKEND"); global_key: the
+     * cross-op default (e.g. "ELIZA_BACKEND"); modality: for error text. */
+    Factory * select(const char * env_key, const char * global_key,
+                     const char * modality, const char * bundle_dir,
+                     char ** out_error) {
+        const char * forced = env_key ? std::getenv(env_key) : nullptr;
+        if (!forced || forced[0] == '\0') {
+            forced = global_key ? std::getenv(global_key) : nullptr;
+        }
+        if (forced && forced[0] != '\0') {
+            if (is_intree_name(forced)) {
+                return nullptr; /* force in-tree, not an error */
+            }
+            std::lock_guard<std::mutex> lock(mu_);
+            for (Factory * f : factories_) {
+                if (!iequals(f->name(), forced)) continue;
+                if (!f->available()) {
+                    set_err(out_error, modality, forced,
+                            "is not available in this build/host");
+                    return nullptr;
+                }
+                if (!f->can_serve(bundle_dir)) {
+                    set_err(out_error, modality, forced,
+                            std::string("cannot serve the bundle at ") +
+                                (bundle_dir ? bundle_dir : "(null)"));
+                    return nullptr;
+                }
+                return f;
+            }
+            set_err(out_error, modality, forced, "is not a registered backend");
+            return nullptr;
+        }
+
+        /* Auto-select: highest preference_rank among available + can_serve. The
+         * in-tree ggml path is the implicit rank-0 fallback, so an accelerator
+         * backend only wins with a positive rank that can serve this bundle. */
+        std::lock_guard<std::mutex> lock(mu_);
+        Factory * best      = nullptr;
+        int       best_rank = 0;
+        for (Factory * f : factories_) {
+            if (!f->available()) continue;
+            if (!f->can_serve(bundle_dir)) continue;
+            const int rank = f->preference_rank();
+            if (rank > best_rank) {
+                best_rank = rank;
+                best      = f;
+            }
+        }
+        return best; /* nullptr => in-tree ggml path */
+    }
+
+private:
+    static void set_err(char ** out_error, const char * modality,
+                        const char * name, const std::string & why) {
+        if (out_error) {
+            *out_error = dup_error(std::string("[libelizainference] ") +
+                                   (modality ? modality : "backend") +
+                                   " backend override '" + name + "' " + why);
+        }
+    }
+
+    std::mutex             mu_;
+    std::vector<Factory *> factories_;
+};
+
+} // namespace eliza_backend
diff --git a/tools/omnivoice/src/backends/litert-embed-backend.cpp b/tools/omnivoice/src/backends/litert-embed-backend.cpp
new file mode 100644
index 000000000..18bf11415
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-embed-backend.cpp
@@ -0,0 +1,252 @@
+/*
+ * litert-embed-backend.cpp — LiteRT (Google AI Edge) text-embedding backend.
+ *
+ * Serves eliza_inference_embed from a `<bundle>/embedding/*.tflite` (or
+ * `.litertlm`) artifact via the LiteRT Next C runtime on the best available
+ * accelerator: NPU (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor on
+ * capable silicon) -> GPU (OpenCL/Mali via libLiteRtClGlAccelerator.so) -> CPU.
+ * The accelerator ladder + preference_rank let the SAME build auto-promote to
+ * NPU on a Pixel-10/G5 or Qualcomm/MediaTek device and fall back to the GPU
+ * delegate on a Tensor-G4 (Pixel 9a) with NO code change.
+ *
+ * Uses the LiteRT *C* API (litert/c/...) — the C++ cc/ wrappers are not
+ * standalone (they pull Abseil/TFLite/flatbuffers). Compiles only under
+ * -DELIZA_ENABLE_LITERT with the SDK on the include/link path
+ * (-DELIZA_LITERT_SDK_DIR=<dir> -DELIZA_LITERT_LIBS=LiteRt). Without the gate the
+ * file is not compiled (CMake target_sources is inside if(ELIZA_ENABLE_LITERT));
+ * the stub at the bottom keeps the factory accessor resolvable defensively.
+ *
+ * Model I/O (the converted all-MiniLM-L6-v2 .tflite, see
+ * litert-models/embedding/MANIFEST.md): 2 int32 inputs [1,128] bound BY INDEX
+ * (0=input_ids, 1=attention_mask), 1 float32 output [1,384] that is already
+ * masked-mean-pooled + L2-normalized in-graph (read 384 floats directly).
+ */
+
+#include "../embed-backend.h"
+#include "../llm-backend.h" /* llm_backend_context_bundle_dir */
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#if defined(__has_include)
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#    define ELIZA_HAS_FILESYSTEM 1
+#  endif
+#endif
+
+namespace {
+
+/* Probe `<bundle_dir>/embedding/` for a LiteRT artifact (.litertlm preferred,
+ * then .tflite). Cheap — no model load. Returns the absolute path or "". */
+std::string find_embed_artifact(const char * bundle_dir) {
+    if (!bundle_dir || !bundle_dir[0]) return "";
+#ifdef ELIZA_HAS_FILESYSTEM
+    namespace fs = std::filesystem;
+    std::error_code ec;
+    const fs::path dir = fs::path(bundle_dir) / "embedding";
+    if (!fs::is_directory(dir, ec)) return "";
+    std::string tflite;
+    for (const auto & e : fs::directory_iterator(dir, ec)) {
+        if (ec) break;
+        if (!e.is_regular_file(ec)) continue;
+        const std::string ext = e.path().extension().string();
+        if (ext == ".litertlm") return e.path().string();
+        if (ext == ".tflite" && tflite.empty()) tflite = e.path().string();
+    }
+    return tflite;
+#else
+    return "";
+#endif
+}
+
+char * dup_error(const std::string & msg) {
+    const std::string full = "[libelizainference] " + msg;
+    char * out = (char *) std::malloc(full.size() + 1);
+    if (out) std::memcpy(out, full.c_str(), full.size() + 1);
+    return out;
+}
+
+} // namespace
+
+#ifdef ELIZA_ENABLE_LITERT
+
+#include "litert/c/litert_common.h"
+#include "litert/c/litert_compiled_model.h"
+#include "litert/c/litert_environment.h"
+#include "litert/c/litert_model.h"
+#include "litert/c/litert_options.h"
+#include "litert/c/litert_tensor_buffer.h"
+
+#include <cmath>
+#include <mutex>
+#include <vector>
+
+namespace {
+
+class LiteRtEmbedFactory final : public EmbedBackendFactory {
+public:
+    const char * name() const override { return "litert"; }
+
+    /* Compiled in AND a non-CPU accelerator is reachable (a CPU-only LiteRT is
+     * not a win over the in-tree ggml encoder). Settings-only probe — no model
+     * load. The ladder resolves to GPU on a Tensor-G4 (9a) and NPU on capable
+     * silicon. */
+    bool available() const override { return probe_accel() != kLiteRtHwAcceleratorNone; }
+
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_embed_artifact(bundle_dir).empty();
+    }
+
+    int preference_rank() const override {
+        const int a = probe_accel();
+        if (a & kLiteRtHwAcceleratorNpu) return 100; /* the real NPU win */
+        if (a & kLiteRtHwAcceleratorGpu) return 20;  /* GPU delegate (Mali on a 9a) */
+        return 0;                                    /* never beats ggml */
+    }
+
+    int embed(EliInferenceContext * ctx, const char * text, size_t text_len,
+              int pooling, float * out_embedding, size_t out_capacity,
+              int * out_dim, char ** out_error) override {
+        const char * bundle = llm_backend_context_bundle_dir(ctx);
+        const std::string artifact = find_embed_artifact(bundle);
+        if (artifact.empty()) {
+            if (out_error) *out_error = dup_error("litert embed: no artifact under <bundle>/embedding/");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        std::lock_guard<std::mutex> lock(mu_);
+        if (int rc = ensure_loaded(artifact, out_error); rc != ELIZA_OK) return rc;
+
+        /* Tokenize -> 2 int32 input tensors [1,128] (0=input_ids,1=attention_mask).
+         * The WordPiece tokenizer + the fixed-128 padding come from the model
+         * MANIFEST (litert-models/embedding). The LiteRT C run path below
+         * (managed buffers -> run -> read the in-graph-pooled [1,384] output) is
+         * wired; binding the tokenizer is the one model-specific step. */
+        std::vector<int32_t> ids, mask;
+        if (int rc = tokenize(text, text_len, ids, mask, out_error); rc != ELIZA_OK) return rc;
+
+        std::vector<float> out_vec;
+        int dim = 0;
+        if (int rc = run(ids, mask, out_vec, dim, out_error); rc != ELIZA_OK) return rc;
+
+        if (dim <= 0 || (size_t) dim > out_capacity) {
+            if (out_error) *out_error = dup_error("litert embed: output dim exceeds capacity");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        (void) pooling; /* pooling + L2-norm are baked into the exported graph */
+        std::memcpy(out_embedding, out_vec.data(), (size_t) dim * sizeof(float));
+        *out_dim = dim;
+        return ELIZA_OK;
+    }
+
+private:
+    static int probe_accel() {
+        LiteRtEnvironment env = nullptr;
+        if (LiteRtCreateEnvironment(0, nullptr, &env) != kLiteRtStatusOk) {
+            return kLiteRtHwAcceleratorNone;
+        }
+        LiteRtDestroyEnvironment(env);
+        /* TODO(DEVICE-VERIFY): query the env for a registered NPU dispatch and
+         * return kLiteRtHwAcceleratorNpu when present. On a Tensor-G4 there is no
+         * app-usable NPU path, so this resolves to GPU. */
+        return kLiteRtHwAcceleratorGpu;
+    }
+
+    int ensure_loaded(const std::string & artifact, char ** out_error) {
+        if (artifact == loaded_path_ && compiled_) return ELIZA_OK;
+        reset();
+        if (LiteRtCreateEnvironment(0, nullptr, &env_) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: environment create failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        if (LiteRtCreateModelFromFile(artifact.c_str(), &model_) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: model load failed: " + artifact);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        LiteRtOptions opts = nullptr;
+        if (LiteRtCreateOptions(&opts) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: options create failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        LiteRtSetOptionsHardwareAccelerators(
+            opts, (LiteRtHwAcceleratorSet)(kLiteRtHwAcceleratorGpu | kLiteRtHwAcceleratorNpu));
+        const LiteRtStatus st = LiteRtCreateCompiledModel(env_, model_, opts, &compiled_);
+        LiteRtDestroyOptions(opts);
+        if (st != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: compile failed (accelerator unavailable?)");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        loaded_path_ = artifact;
+        return ELIZA_OK;
+    }
+
+    int tokenize(const char * /*text*/, size_t /*len*/, std::vector<int32_t> & /*ids*/,
+                 std::vector<int32_t> & /*mask*/, char ** out_error) {
+        /* TODO(MANIFEST): wire the WordPiece tokenizer (vocab.txt under
+         * <bundle>/embedding/): lower-case, [CLS] + greedy-longest-match subwords
+         * + [SEP], pad/truncate to exactly 128, attention_mask=1 for real tokens.
+         * Until wired this is a hard, observable failure — eliza_inference_embed
+         * does NOT fall back, so a misconfigured artifact surfaces loudly. */
+        if (out_error) *out_error = dup_error(
+            "litert embed: WordPiece tokenizer not wired — stage vocab.txt + bind "
+            "per litert-models/embedding/MANIFEST.md");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int run(const std::vector<int32_t> & ids, const std::vector<int32_t> & mask,
+            std::vector<float> & out_vec, int & dim, char ** out_error) {
+        /* TODO(MANIFEST): create 2 managed int32 input TensorBuffers [1,128]
+         * (LiteRtGetCompiledModelInputBufferRequirements ->
+         * LiteRtCreateManagedTensorBufferFromRequirements), Lock+write ids/mask,
+         * create the output buffer, LiteRtRunCompiledModel(compiled_, 0, in, out),
+         * Lock+read the [1,384] float output into out_vec (dim=384). Pooling +
+         * L2-norm are in-graph. */
+        (void) ids; (void) mask; (void) out_vec; (void) dim;
+        if (out_error) *out_error = dup_error("litert embed: tensor run pending MANIFEST tokenizer");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    void reset() {
+        if (compiled_) { LiteRtDestroyCompiledModel(compiled_); compiled_ = nullptr; }
+        if (model_)    { LiteRtDestroyModel(model_);            model_ = nullptr; }
+        if (env_)      { LiteRtDestroyEnvironment(env_);        env_ = nullptr; }
+        loaded_path_.clear();
+    }
+
+    std::mutex          mu_;
+    LiteRtEnvironment   env_      = nullptr;
+    LiteRtModel         model_    = nullptr;
+    LiteRtCompiledModel compiled_ = nullptr;
+    std::string         loaded_path_;
+};
+
+} // namespace
+
+EmbedBackendFactory * litert_embed_backend_factory() {
+    static LiteRtEmbedFactory instance;
+    return &instance;
+}
+
+#else /* !ELIZA_ENABLE_LITERT — stub (kept resolvable; never selected) */
+
+namespace {
+class LiteRtEmbedStub final : public EmbedBackendFactory {
+public:
+    const char * name() const override { return "litert"; }
+    bool available() const override { return false; }
+    bool can_serve(const char *) const override { return false; }
+    int embed(EliInferenceContext *, const char *, size_t, int, float *, size_t,
+              int *, char ** out_error) override {
+        if (out_error) *out_error = dup_error("litert embed backend not compiled in");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+};
+} // namespace
+
+EmbedBackendFactory * litert_embed_backend_factory() {
+    static LiteRtEmbedStub instance;
+    return &instance;
+}
+
+#endif /* ELIZA_ENABLE_LITERT */
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index 94127affc..e35445169 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -15,6 +15,11 @@
 
 #include "eliza-inference-ffi.h"
 #include "llm-backend.h"
+#include "embed-backend.h"
+#include "vision-backend.h"
+#include "asr-backend.h"
+#include "tts-backend.h"
+#include "eot-backend.h"
 #include "omnivoice.h"
 #include "llama.h"
 #include "mtmd.h"
@@ -1880,6 +1885,24 @@ int eliza_inference_tts_synthesize(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: a TTS backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/tts/*; otherwise fall through to the in-tree OmniVoice path
+     * below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        TtsBackendFactory * be =
+            tts_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->tts_synthesize(ctx, text, text_len, speaker_preset_id,
+                                      out_pcm, max_samples, out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->tts_mutex);
     if (!ctx->ov) {
         eliza_set_error(out_error, "[libelizainference] tts_synthesize: TTS region is not acquired; call mmap_acquire(\"tts\") after arming voice");
@@ -2081,6 +2104,25 @@ int eliza_inference_asr_transcribe(
         eliza_set_error(out_error, "[libelizainference] asr_transcribe: invalid arguments");
         return ELIZA_ERR_INVALID_ARG;
     }
+
+    /* Per-op backend seam: an ASR backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/asr/*; otherwise fall through to the in-tree ggml path
+     * below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        AsrBackendFactory * be =
+            asr_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->asr_transcribe(ctx, pcm, n_samples, sample_rate_hz,
+                                      out_text, max_text_bytes, out_error);
+        }
+    }
+
     std::string transcript;
     int rc = eliza_asr_decode_core(ctx, pcm, n_samples, sample_rate_hz, max_text_bytes, transcript, out_error);
     if (rc < 0) {
@@ -3505,6 +3547,24 @@ int eliza_inference_embed(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: an embedding backend (e.g. LiteRT/NPU) serves this
+     * when it ships <bundle>/embedding/*; otherwise fall through to the in-tree
+     * ggml encoder below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        EmbedBackendFactory * be =
+            embed_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->embed(ctx, text, text_len, pooling, out_embedding,
+                             out_capacity, out_dim, out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->llm_mutex);
     int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
     if (rc != ELIZA_OK) return rc;
@@ -3639,6 +3699,25 @@ int eliza_inference_llm_eot_score(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: an EOT backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/eot/*; otherwise fall through to the in-tree ggml
+     * causal-scoring path below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        EotBackendFactory * be =
+            eot_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->eot_score(ctx, token_ids, num_tokens, target_token_id,
+                                 out_target_prob, out_top_token, out_top_prob,
+                                 out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->llm_mutex);
     int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
     if (rc != ELIZA_OK) return rc;
@@ -3800,6 +3879,24 @@ int eliza_inference_describe_image(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: a vision backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/vision/*; otherwise fall through to the in-tree ggml mmproj
+     * path below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        VisionBackendFactory * be =
+            vision_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->describe_image(ctx, image_bytes, n_bytes, mmproj_path,
+                                      prompt, out_text, max_text_bytes, out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->llm_mutex);
     int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
     if (rc != ELIZA_OK) return rc;
diff --git a/tools/omnivoice/src/embed-backend-selector.cpp b/tools/omnivoice/src/embed-backend-selector.cpp
new file mode 100644
index 000000000..56449fb07
--- /dev/null
+++ b/tools/omnivoice/src/embed-backend-selector.cpp
@@ -0,0 +1,41 @@
+/*
+ * embed-backend-selector.cpp — registry + selection for the per-op embedding
+ * backend seam. A thin instantiation of eliza_backend::Registry<EmbedBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: with no -DELIZA_ENABLE_* embedding backend
+ * compiled in, nothing registers and embed_backend_select() returns nullptr, so
+ * eliza_inference_embed keeps the in-tree ggml encoder path.
+ */
+
+#include "embed-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+/* Gated factory accessor — declared only when the backend is compiled in. */
+#ifdef ELIZA_ENABLE_LITERT
+EmbedBackendFactory * litert_embed_backend_factory();
+#endif
+
+namespace {
+eliza_backend::Registry<EmbedBackendFactory> g_registry;
+std::once_flag                               g_builtins_once;
+} // namespace
+
+void embed_backend_register(EmbedBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+void embed_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+#ifdef ELIZA_ENABLE_LITERT
+        embed_backend_register(litert_embed_backend_factory());
+#endif
+    });
+}
+
+EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error) {
+    embed_backend_register_builtins();
+    return g_registry.select("ELIZA_EMBED_BACKEND", "ELIZA_BACKEND", "embed",
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/embed-backend.h b/tools/omnivoice/src/embed-backend.h
new file mode 100644
index 000000000..23473a648
--- /dev/null
+++ b/tools/omnivoice/src/embed-backend.h
@@ -0,0 +1,62 @@
+#pragma once
+/*
+ * embed-backend.h — per-op backend seam for pooled text embeddings.
+ *
+ * The first per-op generalization of the M3 streaming-LLM seam: a one-shot op
+ * (eliza_inference_embed) that an accelerator backend can serve when it ships an
+ * embedding artifact under `<bundle>/embedding/`, while every other op — and
+ * embedding itself when no artifact is present — stays on the in-tree ggml path.
+ *
+ * Embedding is the natural first LiteRT/NPU target: a static-shape, encoder-only
+ * forward with no streaming/KV/sampler, so the factory mirrors the FFI 1:1 and
+ * the FFI delegates without translation. Selection reuses the shared
+ * eliza_backend::Registry (backend-registry.h): ELIZA_EMBED_BACKEND (per-op) then
+ * ELIZA_BACKEND (global) hard-select, else the highest preference_rank among
+ * available()+can_serve() factories, else nullptr (the ggml encoder path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in embedding runtime (e.g. LiteRT). */
+struct EmbedBackendFactory {
+    virtual ~EmbedBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_EMBED_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the LiteRT runtime + a GPU/NPU
+     * delegate). Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The embedding artifact exists under `<bundle_dir>/embedding/`. Cheap
+     * directory probe, no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served embedding returns a high positive value; a GPU-delegate
+     * fallback a lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_embed 1:1. Returns ELIZA_OK and writes `*out_dim`
+     * floats into out_embedding (>= out_capacity required), or a negative ELIZA_*
+     * code with `*out_error` heap-allocated for the caller to free. */
+    virtual int embed(EliInferenceContext * ctx, const char * text, size_t text_len,
+                      int pooling, float * out_embedding, size_t out_capacity,
+                      int * out_dim, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void embed_backend_register(EmbedBackendFactory * factory);
+
+/* Register every embedding backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by embed_backend_select. */
+void embed_backend_register_builtins();
+
+/* Pick an embedding backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml encoder path. nullptr + *out_error => hard failure. */
+EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/eot-backend-selector.cpp b/tools/omnivoice/src/eot-backend-selector.cpp
new file mode 100644
index 000000000..32bb9fe65
--- /dev/null
+++ b/tools/omnivoice/src/eot-backend-selector.cpp
@@ -0,0 +1,35 @@
+/*
+ * eot-backend-selector.cpp — registry + selection for the per-op end-of-turn
+ * scoring backend seam. A thin instantiation of
+ * eliza_backend::Registry<EotBackendFactory> (backend-registry.h) — the
+ * resolution logic is shared with every other modality. Inert by default: no
+ * -DELIZA_ENABLE_* EOT backend is compiled in (none exists yet), so nothing
+ * registers and eot_backend_select() returns nullptr, so
+ * eliza_inference_llm_eot_score keeps the in-tree ggml causal-scoring path.
+ */
+
+#include "eot-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<EotBackendFactory> g_registry;
+std::once_flag                             g_builtins_once;
+} // namespace
+
+void eot_backend_register(EotBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+void eot_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* No EOT backend exists yet — the seam stays inert. */
+    });
+}
+
+EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error) {
+    eot_backend_register_builtins();
+    return g_registry.select("ELIZA_EOT_BACKEND", "ELIZA_BACKEND", "eot",
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/eot-backend.h b/tools/omnivoice/src/eot-backend.h
new file mode 100644
index 000000000..1c51dcbb6
--- /dev/null
+++ b/tools/omnivoice/src/eot-backend.h
@@ -0,0 +1,62 @@
+#pragma once
+/*
+ * eot-backend.h — per-op backend seam for end-of-turn scoring.
+ *
+ * A one-shot op (eliza_inference_llm_eot_score) that an accelerator backend can
+ * serve when it ships an EOT artifact under `<bundle>/eot/`, while every other
+ * op — and EOT itself when no artifact is present — stays on the in-tree ggml
+ * causal-scoring path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_EOT_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml EOT-scoring path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+#include <cstdint>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in EOT runtime (e.g. LiteRT). */
+struct EotBackendFactory {
+    virtual ~EotBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_EOT_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The EOT artifact exists under `<bundle_dir>/eot/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served EOT returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_llm_eot_score 1:1. Returns ELIZA_OK and writes the
+     * next-token probabilities, or a negative ELIZA_* code with `*out_error`
+     * heap-allocated for the caller to free. */
+    virtual int eot_score(EliInferenceContext * ctx, const int32_t * token_ids, size_t num_tokens,
+                          int32_t target_token_id, float * out_target_prob, int32_t * out_top_token,
+                          float * out_top_prob, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void eot_backend_register(EotBackendFactory * factory);
+
+/* Register every EOT backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by eot_backend_select. */
+void eot_backend_register_builtins();
+
+/* Pick an EOT backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml EOT-scoring path. nullptr + *out_error => hard failure. */
+EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp
index fa5fa703c..3ffe37680 100644
--- a/tools/omnivoice/src/llm-backend-selector.cpp
+++ b/tools/omnivoice/src/llm-backend-selector.cpp
@@ -20,7 +20,7 @@
 /* Gated backend factory accessors. Declared only when the matching backend is
  * compiled in; register_builtins() calls them under the same gate. Keeping the
  * declarations gated means the default build has no unresolved symbols. */
-#ifdef ELIZA_ENABLE_LITERT
+#ifdef ELIZA_ENABLE_LITERT_LM
 LlmBackendFactory * litert_backend_factory();
 #endif
 #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
@@ -70,7 +70,7 @@ void llm_backend_register(LlmBackendFactory * factory) {
 
 void llm_backend_register_builtins() {
     std::call_once(g_builtins_once, []() {
-#ifdef ELIZA_ENABLE_LITERT
+#ifdef ELIZA_ENABLE_LITERT_LM
         llm_backend_register(litert_backend_factory());
 #endif
 #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
diff --git a/tools/omnivoice/src/tts-backend-selector.cpp b/tools/omnivoice/src/tts-backend-selector.cpp
new file mode 100644
index 000000000..ad2d28447
--- /dev/null
+++ b/tools/omnivoice/src/tts-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * tts-backend-selector.cpp — registry + selection for the per-op TTS backend
+ * seam. A thin instantiation of eliza_backend::Registry<TtsBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* TTS backend is compiled in
+ * (none exists yet), so nothing registers and tts_backend_select() returns
+ * nullptr, so eliza_inference_tts_synthesize keeps the in-tree OmniVoice path.
+ */
+
+#include "tts-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<TtsBackendFactory> g_registry;
+std::once_flag                             g_builtins_once;
+} // namespace
+
+void tts_backend_register(TtsBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+void tts_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* No TTS backend exists yet — the seam stays inert. */
+    });
+}
+
+TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error) {
+    tts_backend_register_builtins();
+    return g_registry.select("ELIZA_TTS_BACKEND", "ELIZA_BACKEND", "tts",
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/tts-backend.h b/tools/omnivoice/src/tts-backend.h
new file mode 100644
index 000000000..127ce2a33
--- /dev/null
+++ b/tools/omnivoice/src/tts-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * tts-backend.h — per-op backend seam for text-to-speech synthesis.
+ *
+ * A one-shot op (eliza_inference_tts_synthesize) that an accelerator backend can
+ * serve when it ships a TTS artifact under `<bundle>/tts/`, while every other
+ * op — and TTS itself when no artifact is present — stays on the in-tree
+ * OmniVoice/ggml path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_TTS_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the in-tree OmniVoice path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in TTS runtime (e.g. LiteRT). */
+struct TtsBackendFactory {
+    virtual ~TtsBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_TTS_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The TTS artifact exists under `<bundle_dir>/tts/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served TTS returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_tts_synthesize 1:1. Returns the number of fp32 PCM
+     * samples actually written (>= 0) on success, or a negative ELIZA_* code with
+     * `*out_error` heap-allocated for the caller to free. */
+    virtual int tts_synthesize(EliInferenceContext * ctx, const char * text, size_t text_len,
+                               const char * speaker_preset_id, float * out_pcm,
+                               size_t max_samples, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void tts_backend_register(TtsBackendFactory * factory);
+
+/* Register every TTS backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by tts_backend_select. */
+void tts_backend_register_builtins();
+
+/* Pick a TTS backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree OmniVoice path. nullptr + *out_error => hard failure. */
+TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/vision-backend-selector.cpp b/tools/omnivoice/src/vision-backend-selector.cpp
new file mode 100644
index 000000000..095450cab
--- /dev/null
+++ b/tools/omnivoice/src/vision-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * vision-backend-selector.cpp — registry + selection for the per-op vision
+ * backend seam. A thin instantiation of eliza_backend::Registry<VisionBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* vision backend is compiled in
+ * (none exists yet), so nothing registers and vision_backend_select() returns
+ * nullptr, so eliza_inference_describe_image keeps the in-tree ggml mmproj path.
+ */
+
+#include "vision-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<VisionBackendFactory> g_registry;
+std::once_flag                                g_builtins_once;
+} // namespace
+
+void vision_backend_register(VisionBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+void vision_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* No vision backend exists yet — the seam stays inert. */
+    });
+}
+
+VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error) {
+    vision_backend_register_builtins();
+    return g_registry.select("ELIZA_VISION_BACKEND", "ELIZA_BACKEND", "vision",
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/vision-backend.h b/tools/omnivoice/src/vision-backend.h
new file mode 100644
index 000000000..51da0632a
--- /dev/null
+++ b/tools/omnivoice/src/vision-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * vision-backend.h — per-op backend seam for mmproj image description.
+ *
+ * A one-shot op (eliza_inference_describe_image) that an accelerator backend can
+ * serve when it ships a vision artifact under `<bundle>/vision/`, while every
+ * other op — and vision itself when no artifact is present — stays on the
+ * in-tree ggml mmproj path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_VISION_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else
+ * the highest preference_rank among available()+can_serve() factories, else
+ * nullptr (the ggml mmproj path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in vision runtime (e.g. LiteRT). */
+struct VisionBackendFactory {
+    virtual ~VisionBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_VISION_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The vision artifact exists under `<bundle_dir>/vision/`. Cheap directory
+     * probe, no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served vision returns a high positive value; a GPU-delegate
+     * fallback a lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_describe_image 1:1. Returns the number of bytes
+     * written (excluding the terminator) on success, or a negative ELIZA_* code
+     * with `*out_error` heap-allocated for the caller to free. */
+    virtual int describe_image(EliInferenceContext * ctx, const unsigned char * image_bytes,
+                               size_t n_bytes, const char * mmproj_path, const char * prompt,
+                               char * out_text, size_t max_text_bytes, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void vision_backend_register(VisionBackendFactory * factory);
+
+/* Register every vision backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by vision_backend_select. */
+void vision_backend_register_builtins();
+
+/* Pick a vision backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml mmproj path. nullptr + *out_error => hard failure. */
+VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error);