diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index cd9666a21..0eb9dad8a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1990,7 +1990,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla
         uint32_t cell_range_begin = cells.size();
 
         for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
+            bool add_cell = true;
+
+            add_cell = add_cell && !cells.is_empty(i);
+            add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id));
+
+            // check the cell is not SWA-masked
+            if (add_cell && seq_id != -1) {
+                const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id));
+
+                add_cell = !is_masked;
+            }
+
+            if (add_cell) {
                 ++cell_count;
                 if (cell_range_begin == cells.size()) {
                     cell_range_begin = i;
@@ -2246,7 +2258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
 
         sinfo = find_slot(ubatch, false);
         if (sinfo.empty()) {
-            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            LLAMA_LOG_ERROR("%s: failed to find %u available cells in kv cache\n", __func__, cell_count);
             return false;
         }
 
diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
index 72088dfa0..038fa5be4 100644
--- a/tools/omnivoice/CMakeLists.txt
+++ b/tools/omnivoice/CMakeLists.txt
@@ -78,6 +78,19 @@ set(OMNIVOICE_CORE_SOURCES
 # llama + mtmd into a single ABI-stable C surface.
 set(OMNIVOICE_FFI_SOURCES
     src/eliza-inference-ffi.cpp
+    # Multi-runtime streaming-LLM backend seam (cutover plan M3). The selector
+    # is always compiled; it is inert until a -DELIZA_ENABLE_* accelerator
+    # backend below registers itself, so the default build keeps the in-tree
+    # llama.cpp path.
+    src/llm-backend-selector.cpp
+    # Per-op backend seams (cutover M3+). Each modality's selector reuses the
+    # shared eliza_backend::Registry (backend-registry.h) and is inert until a
+    # gated backend registers — so the default build keeps the ggml path per-op.
+    src/embed-backend-selector.cpp
+    src/vision-backend-selector.cpp
+    src/asr-backend-selector.cpp
+    src/tts-backend-selector.cpp
+    src/eot-backend-selector.cpp
 )
 
 # Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -220,6 +233,24 @@ endif()
 # (the fused build links mtmd anyway), opt out with -DELIZA_ENABLE_VISION=OFF.
 option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
 
+# ELIZA_ENABLE_LITERT — compile the LiteRT C-API per-op backends (embed today;
+# cutover plan M4 — Android NPU: Tensor / Qualcomm QNN / MediaTek NeuroPilot).
+# OFF by default: no LiteRT per-op backend registers, so each per-op selector
+# keeps the in-tree ggml path. ON requires the LiteRT C runtime SDK
+# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
+# default. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_LITERT "Build the LiteRT C-API per-op backends, e.g. embed (M4)" OFF)
+
+# ELIZA_ENABLE_LITERT_LM — the streaming-LLM backend on the heavier LiteRT-LM
+# Engine SDK (litert::lm), separate from the LiteRT C runtime above. OFF until
+# that SDK is built; point -DELIZA_LITERT_LM_SDK_DIR / -DELIZA_LITERT_LM_LIBS at it.
+option(ELIZA_ENABLE_LITERT_LM "Build the LiteRT-LM in-process streaming-LLM backend" OFF)
+
+# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
+# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
+# requires the MLX / CoreML toolchain. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_MLX "Build the CoreML/MLX in-process LLM backend (M5)" OFF)
+
 if(TARGET mtmd)
     add_library(elizainference SHARED
         ${OMNIVOICE_CORE_SOURCES}
@@ -271,6 +302,65 @@ if(TARGET mtmd)
             ${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include)
         target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_KOKORO)
     endif()
+    # ── Multi-runtime streaming-LLM accelerator backends (cutover M4/M5) ────
+    # The M3 selector (src/llm-backend-selector.cpp) is always compiled in via
+    # OMNIVOICE_FFI_SOURCES. These accelerator backends each link an external
+    # SDK, so they are opt-in. When a gate is OFF its source is not compiled,
+    # the selector's `#ifdef`-guarded factory declaration + registration drop
+    # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
+    # default desktop/CI build is byte-for-byte the pre-seam behavior.
+    if(ELIZA_ENABLE_LITERT)
+        # LiteRT C-API per-op backends (embed today; vision/etc. as artifacts
+        # ship). SDK = the LiteRT C runtime (github.com/google-ai-edge/LiteRT,
+        # libLiteRt.so + the GPU/NPU delegate). Point at a built SDK with
+        # -DELIZA_LITERT_SDK_DIR=<dir> and link with -DELIZA_LITERT_LIBS=LiteRt.
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-embed-backend.cpp)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
+        if(ELIZA_LITERT_SDK_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
+        endif()
+        if(ELIZA_LITERT_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
+        endif()
+    endif()
+    if(ELIZA_ENABLE_LITERT_LM)
+        # The streaming-LLM backend needs the heavier LiteRT-LM Engine SDK
+        # (litert::lm, github.com/google-ai-edge/LiteRT-LM) — separate from the
+        # LiteRT C runtime above. Point at it with -DELIZA_LITERT_LM_SDK_DIR /
+        # -DELIZA_LITERT_LM_LIBS.
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT_LM)
+        if(ELIZA_LITERT_LM_SDK_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_LM_SDK_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_LM_SDK_DIR}/lib)
+        endif()
+        if(ELIZA_LITERT_LM_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LM_LIBS})
+        endif()
+    endif()
+    if(ELIZA_ENABLE_MLX)
+        if(NOT APPLE)
+            message(FATAL_ERROR
+                "ELIZA_ENABLE_MLX requires an Apple host (CoreML/MLX are Apple-only).")
+        endif()
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/mlx-coreml-backend.mm)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_MLX)
+        # MLX C API (ml-explore/mlx-c) via -DELIZA_MLX_C_DIR / -DELIZA_MLX_LIBS,
+        # plus the system CoreML / Metal / Foundation frameworks.
+        if(ELIZA_MLX_C_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/lib)
+        endif()
+        if(ELIZA_MLX_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_MLX_LIBS})
+        endif()
+        target_link_libraries(elizainference PRIVATE
+            "-framework Foundation" "-framework CoreML" "-framework Metal")
+    endif()
     set_target_properties(elizainference PROPERTIES
         OUTPUT_NAME              elizainference
         POSITION_INDEPENDENT_CODE ON)
diff --git a/tools/omnivoice/src/SESSION-OPS-TODO.md b/tools/omnivoice/src/SESSION-OPS-TODO.md
new file mode 100644
index 000000000..7095b8952
--- /dev/null
+++ b/tools/omnivoice/src/SESSION-OPS-TODO.md
@@ -0,0 +1,159 @@
+# Session-op backend seam — design (NOT implemented)
+
+The per-op backend seam (`backend-registry.h` + `<mod>-backend.h` +
+`<mod>-backend-selector.cpp` + a chokepoint at the top of the FFI fn) is now in
+place for the **one-shot** ops:
+
+| modality | FFI fn                          | header / selector            | env key               | artifact dir       |
+|----------|---------------------------------|------------------------------|-----------------------|--------------------|
+| embed    | `eliza_inference_embed`         | `embed-backend.*`            | `ELIZA_EMBED_BACKEND` | `<bundle>/embedding/` |
+| vision   | `eliza_inference_describe_image`| `vision-backend.*`           | `ELIZA_VISION_BACKEND`| `<bundle>/vision/` |
+| asr      | `eliza_inference_asr_transcribe`| `asr-backend.*`              | `ELIZA_ASR_BACKEND`   | `<bundle>/asr/`    |
+| tts      | `eliza_inference_tts_synthesize`| `tts-backend.*`              | `ELIZA_TTS_BACKEND`   | `<bundle>/tts/`    |
+| eot      | `eliza_inference_llm_eot_score` | `eot-backend.*`              | `ELIZA_EOT_BACKEND`   | `<bundle>/eot/`    |
+
+A one-shot op is stateless across calls: select → (delegate | fall through to
+ggml) on every call. There is nothing to keep alive between calls, so the seam
+is a single chokepoint at the top of the fn.
+
+The **session** ops are different: `vad`, `wakeword`, `speaker`, `diariz` each
+`_open` a native handle (`EliVad *`, `EliWakeword *`, `EliSpeaker *`,
+`EliDiariz *`) that persists across many `_segment`/`_detect`/`_embed` calls and
+is torn down with `_close`/`_reset`. The seam has to follow that lifecycle, not
+re-select per call. This file records HOW to extend the seam to them. **None of
+the below is implemented yet.**
+
+## The shape of a session op (today, in-tree only)
+
+Each session modality exposes, e.g. for VAD:
+
+```c
+EliVad * eliza_inference_vad_open(EliInferenceContext * ctx, /* params */, char ** out_error);
+int      eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error);
+int      eliza_inference_vad_reset(EliVad * vad, char ** out_error);
+void     eliza_inference_vad_close(EliVad * vad);
+```
+
+`EliVad` (and the wakeword/speaker/diariz equivalents) is the in-tree handle
+struct defined in `eliza-inference-ffi.cpp`. Its in-tree fields stay exactly as
+they are; the seam is **additive** — one extra pointer.
+
+## Extending the seam to a session op
+
+For each session modality `<mod>` (vad | wakeword | speaker | diariz):
+
+### 1. A session factory interface — `<mod>-backend.h`
+
+Mirror the one-shot factory's four common probes, but the forward methods mirror
+the **session** ABI 1:1 instead of a single one-shot fn. The factory does NOT
+own the handle struct; it produces and operates on an opaque backend-session:
+
+```cpp
+struct VadBackendFactory {
+    virtual ~VadBackendFactory() = default;
+    virtual const char * name() const = 0;
+    virtual bool         available() const = 0;
+    virtual bool         can_serve(const char * bundle_dir) const = 0;  // probes <bundle>/vad/
+    virtual int          preference_rank() const { return 0; }
+
+    // Lifecycle, mirroring the FFI session ABI 1:1. The factory returns an
+    // opaque backend-session pointer it owns; the FFI stashes it on the Eli*
+    // handle. A NULL return + *out_error is a hard open failure.
+    virtual void * open(EliInferenceContext * ctx, /* same params as eliza_inference_vad_open */,
+                        char ** out_error) = 0;
+    virtual int   segment(void * session, const float * pcm, size_t n, /* out */, char ** out_error) = 0;
+    virtual int   reset(void * session, char ** out_error) = 0;
+    virtual void  close(void * session) = 0;
+};
+```
+
+Plus the same free-functions as the one-shot seam:
+`vad_backend_register`, `vad_backend_register_builtins` (EMPTY for now — no
+LiteRT session backend exists), `vad_backend_select(bundle_dir, out_error)`,
+backed by an `eliza_backend::Registry<VadBackendFactory>` in
+`<mod>-backend-selector.cpp` with env keys `ELIZA_VAD_BACKEND` → `ELIZA_BACKEND`
+and modality `"vad"`. Artifact probe dir `<bundle>/vad/` (resp. `wakeword/`,
+`speaker/`, `diariz/`).
+
+### 2. A backend-session pointer on the Eli* handle
+
+The selection happens ONCE, at `_open`, not per call. Add one field to the
+in-tree handle struct:
+
+```cpp
+struct EliVad {
+    /* ... existing in-tree fields, unchanged ... */
+
+    /* Backend seam (additive). When non-null, this handle is served by an
+     * accelerator backend and every op delegates to it; the in-tree fields
+     * above are then unused. When null, the in-tree ggml path owns the handle. */
+    VadBackendFactory * be         = nullptr;  // the factory that opened be_session
+    void *              be_session = nullptr;  // factory-owned backend session
+};
+```
+
+### 3. Select at `_open`
+
+In `eliza_inference_vad_open`, after the existing arg validation and before the
+in-tree handle is built:
+
+```cpp
+char * be_error = nullptr;
+VadBackendFactory * be = vad_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+if (be_error) { eliza_set_error(out_error, std::string(be_error)); std::free(be_error);
+                return /* NULL handle */; }
+if (be) {
+    void * sess = be->open(ctx, /* params */, out_error);
+    if (!sess) return /* NULL handle — open failed, out_error already set */;
+    EliVad * h = new EliVad();
+    h->be = be;
+    h->be_session = sess;
+    return h;
+}
+/* else: fall through and build the in-tree handle exactly as today. */
+```
+
+### 4. A guard at the TOP of each `_segment` / `_reset` / `_close`
+
+Each per-call op checks the backend pointer and delegates before touching any
+in-tree state:
+
+```cpp
+int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error) {
+    if (!vad) { /* invalid-arg as today */ }
+    if (vad->be) {                                   // <-- guard
+        return vad->be->segment(vad->be_session, pcm, n, /* out */, out_error);
+    }
+    /* ... existing in-tree ggml segment body, unchanged ... */
+}
+
+void eliza_inference_vad_close(EliVad * vad) {
+    if (!vad) return;
+    if (vad->be) { vad->be->close(vad->be_session); delete vad; return; }  // <-- guard
+    /* ... existing in-tree teardown, then delete vad ... */
+}
+```
+
+`_reset` follows the same guard pattern.
+
+## Why this shape (vs. re-selecting per call)
+
+- **Selection is per-session, not per-call.** A session's backend is fixed at
+  `_open`; you cannot have `_segment` cross from the ggml path to LiteRT
+  mid-session because the KV/feature state lives in the (in-tree OR backend)
+  session, not on the FFI boundary. The one pointer captures that binding.
+- **Hard-fail localizes to `_open`.** A bundle-invalid override surfaces once,
+  where the caller is already prepared to handle a NULL handle, instead of on
+  every `_segment`.
+- **Additive + inert.** With no session backend registered (the case today),
+  `_open`'s `select()` returns nullptr, `be`/`be_session` stay null, and every
+  guard is a no-op — the in-tree path is byte-for-byte unchanged. Same
+  inert-by-default contract as the one-shot seam.
+
+## Status
+
+- One-shot seam: embed (with a LiteRT builtin), vision/asr/tts/eot (inert,
+  no builtin) — **done**.
+- Session seam (vad/wakeword/speaker/diariz): **not implemented.** No
+  `<mod>-backend.{h,cpp}`, no handle field, no `_open` select, no per-call
+  guards exist yet. This file is the spec for when a session backend lands.
diff --git a/tools/omnivoice/src/asr-backend-selector.cpp b/tools/omnivoice/src/asr-backend-selector.cpp
new file mode 100644
index 000000000..7513e7d9d
--- /dev/null
+++ b/tools/omnivoice/src/asr-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * asr-backend-selector.cpp — registry + selection for the per-op ASR backend
+ * seam. A thin instantiation of eliza_backend::Registry<AsrBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* ASR backend is compiled in
+ * (none exists yet), so nothing registers and asr_backend_select() returns
+ * nullptr, so eliza_inference_asr_transcribe keeps the in-tree ggml path.
+ */
+
+#include "asr-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<AsrBackendFactory> g_registry;
+std::once_flag                             g_builtins_once;
+} // namespace
+
+void asr_backend_register(AsrBackendFactory * factory) { /* add an ASR factory to this file's registry */
+    g_registry.register_factory(factory); /* forwards as-is; the registry does the bookkeeping */
+}
+
+void asr_backend_register_builtins() { /* one-time registration hook for compiled-in ASR backends */
+    std::call_once(g_builtins_once, []() { /* the body runs at most once, however often this is called */
+        /* No ASR backend exists yet — the seam stays inert. */
+    });
+}
+
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error) {
+    asr_backend_register_builtins(); /* make sure the one-time builtin registration ran before selecting */
+    return g_registry.select("ELIZA_ASR_BACKEND", "ELIZA_BACKEND", "asr", /* per-op env key, global env key, modality label */
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/asr-backend.h b/tools/omnivoice/src/asr-backend.h
new file mode 100644
index 000000000..2dd9fec49
--- /dev/null
+++ b/tools/omnivoice/src/asr-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * asr-backend.h — per-op backend seam for speech-to-text transcription.
+ *
+ * A one-shot op (eliza_inference_asr_transcribe) that an accelerator backend can
+ * serve when it ships an ASR artifact under `<bundle>/asr/`, while every other
+ * op — and ASR itself when no artifact is present — stays on the in-tree ggml
+ * path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_ASR_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml ASR path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in ASR runtime (e.g. LiteRT). */
+struct AsrBackendFactory {
+    virtual ~AsrBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_ASR_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The ASR artifact exists under `<bundle_dir>/asr/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served ASR returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. */
+    virtual int preference_rank() const { return 0; } /* NOTE: the default 0 only ties the implicit ggml rank, so it never wins auto-select — override with a positive value */
+
+    /* Mirrors eliza_inference_asr_transcribe 1:1. Returns the number of bytes
+     * written (excluding the terminator) on success, or a negative ELIZA_* code
+     * with `*out_error` heap-allocated for the caller to free. */
+    virtual int asr_transcribe(EliInferenceContext * ctx, const float * pcm, size_t n_samples,
+                               int sample_rate_hz, char * out_text, size_t max_text_bytes,
+                               char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void asr_backend_register(AsrBackendFactory * factory);
+
+/* Register every ASR backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by asr_backend_select. */
+void asr_backend_register_builtins();
+
+/* Pick an ASR backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml ASR path. nullptr + *out_error => hard failure. */
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/backend-registry.h b/tools/omnivoice/src/backend-registry.h
new file mode 100644
index 000000000..14a40b3fd
--- /dev/null
+++ b/tools/omnivoice/src/backend-registry.h
@@ -0,0 +1,147 @@
+#pragma once
+/*
+ * backend-registry.h — generic per-modality backend registry + selection.
+ *
+ * Factored out of the M3 streaming-LLM seam (llm-backend-selector.cpp) so EVERY
+ * on-device modality (embed, asr, tts, vision, vad, wakeword, speaker, diarizer,
+ * eot, …) reuses ONE resolution implementation instead of copy-pasting it. A
+ * modality declares a small factory interface with the four common probes
+ * (name / available / can_serve / preference_rank) plus its own forward method,
+ * instantiates `eliza_backend::Registry<ThatFactory>`, and selects with the
+ * shared logic below:
+ *
+ *   1. `ELIZA_<MOD>_BACKEND` env (per-op) → else `ELIZA_BACKEND` (global) — a
+ *      HARD select. An in-tree name ("llama.cpp"/"ggml"/"default") forces the
+ *      ggml path (returns nullptr, no error). Any other name that is not
+ *      registered+available or cannot serve the bundle is a hard error
+ *      (nullptr + *out_error).
+ *   2. No override: among registered factories that are available() AND
+ *      can_serve(bundle_dir), pick the highest preference_rank(). None → nullptr.
+ *
+ * A nullptr return with *out_error == nullptr means "use the in-tree ggml path"
+ * — NOT an error. Inert by default: with no -DELIZA_ENABLE_* backend compiled,
+ * nothing registers and select() always returns nullptr, so every op keeps the
+ * in-tree path byte-for-byte.
+ *
+ * Factory type F must expose:
+ *   const char * name() const;          // stable lower-case id
+ *   bool         available() const;     // compiled-in AND host deps present; cheap
+ *   bool         can_serve(const char * bundle_dir) const;  // artifact probe; cheap
+ *   int          preference_rank() const;                   // higher wins; ggml == 0
+ */
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace eliza_backend {
+
+/* malloc-allocate an error string so the caller frees it with
+ * eliza_inference_free_string() (free()), matching the FFI contract. */
+inline char * dup_error(const std::string & msg) {
+    char * out = (char *) std::malloc(msg.size() + 1); /* +1 for the terminating NUL */
+    if (out) std::memcpy(out, msg.c_str(), msg.size() + 1); /* copies the terminator too */
+    return out; /* nullptr when malloc failed */
+}
+
+inline bool iequals(const char * a, const char * b) { /* case-insensitive C-string equality */
+    if (!a || !b) return false; /* a null pointer never matches, not even another null */
+    while (*a && *b) {
+        if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { /* unsigned char cast keeps tolower() defined for high-bit bytes */
+            return false;
+        }
+        ++a;
+        ++b;
+    }
+    return *a == *b; /* equal only if both strings ended together (same length) */
+}
+
+/* Names that mean "stay on the in-tree ggml/llama.cpp path". */
+inline bool is_intree_name(const char * s) { /* case-insensitive match; a null s yields false */
+    return iequals(s, "llama.cpp") || iequals(s, "llamacpp") || iequals(s, "llama") ||
+           iequals(s, "ggml") || iequals(s, "intree") || iequals(s, "default");
+}
+
+template <class Factory>
+class Registry {
+public:
+    /* Idempotent by name. Safe from static init. Does not take ownership —
+     * factories are static-lifetime singletons. */
+    void register_factory(Factory * factory) {
+        if (!factory) return; /* null registrations are ignored */
+        std::lock_guard<std::mutex> lock(mu_); /* the same mutex select() takes */
+        for (Factory * f : factories_) {
+            if (iequals(f->name(), factory->name())) return; /* the first registration of a name wins */
+        }
+        factories_.push_back(factory);
+    }
+
+    /* env_key: the per-op override (e.g. "ELIZA_EMBED_BACKEND"); global_key: the
+     * cross-op default (e.g. "ELIZA_BACKEND"); modality: for error text. */
+    Factory * select(const char * env_key, const char * global_key,
+                     const char * modality, const char * bundle_dir,
+                     char ** out_error) {
+        const char * forced = env_key ? std::getenv(env_key) : nullptr;
+        if (!forced || forced[0] == '\0') { /* per-op override unset or empty: try the global key */
+            forced = global_key ? std::getenv(global_key) : nullptr;
+        }
+        if (forced && forced[0] != '\0') { /* hard select: every path in this branch returns, no auto-select fallback */
+            if (is_intree_name(forced)) {
+                return nullptr; /* force in-tree, not an error */
+            }
+            std::lock_guard<std::mutex> lock(mu_);
+            for (Factory * f : factories_) {
+                if (!iequals(f->name(), forced)) continue;
+                if (!f->available()) {
+                    set_err(out_error, modality, forced,
+                            "is not available in this build/host");
+                    return nullptr;
+                }
+                if (!f->can_serve(bundle_dir)) {
+                    set_err(out_error, modality, forced,
+                            std::string("cannot serve the bundle at ") +
+                                (bundle_dir ? bundle_dir : "(null)"));
+                    return nullptr;
+                }
+                return f;
+            }
+            set_err(out_error, modality, forced, "is not a registered backend"); /* no registered name matched */
+            return nullptr;
+        }
+
+        /* Auto-select: highest preference_rank among available + can_serve. The
+         * in-tree ggml path is the implicit rank-0 fallback, so an accelerator
+         * backend only wins with a positive rank that can serve this bundle. */
+        std::lock_guard<std::mutex> lock(mu_);
+        Factory * best      = nullptr;
+        int       best_rank = 0;
+        for (Factory * f : factories_) {
+            if (!f->available()) continue;
+            if (!f->can_serve(bundle_dir)) continue;
+            const int rank = f->preference_rank();
+            if (rank > best_rank) { /* strict compare: rank <= 0 never wins, ties keep the earlier factory */
+                best_rank = rank;
+                best      = f;
+            }
+        }
+        return best; /* nullptr => in-tree ggml path */
+    }
+
+private:
+    static void set_err(char ** out_error, const char * modality,
+                        const char * name, const std::string & why) {
+        if (out_error) { /* out_error is optional; a null pointer drops the message */
+            *out_error = dup_error(std::string("[libelizainference] ") +
+                                   (modality ? modality : "backend") +
+                                   " backend override '" + name + "' " + why);
+        }
+    }
+
+    std::mutex             mu_; /* locked by register_factory() and select() around factories_ access */
+    std::vector<Factory *> factories_; /* non-owning, in registration order */
+};
+
+} // namespace eliza_backend
diff --git a/tools/omnivoice/src/backends/litert-backend.cpp b/tools/omnivoice/src/backends/litert-backend.cpp
new file mode 100644
index 000000000..3b3dad137
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-backend.cpp
@@ -0,0 +1,471 @@
+/*
+ * litert-backend.cpp — LiteRT-LM in-process streaming-LLM backend (M4).
+ *
+ * See litert-backend.h for the targeted LiteRT-LM C++ API (repo + commit
+ * date cited there). The real implementation is gated behind
+ * `ELIZA_ENABLE_LITERT`; the default (Linux/desktop) build compiles the stub
+ * branch, which links zero LiteRT-LM SDK headers and reports
+ * `available() == false` so the selector keeps the in-tree llama.cpp path.
+ *
+ * Error contract (native/AGENTS.md §3 + §9): never log, never return a
+ * defaulted result on failure. Every failure path heap-allocates `*out_error`
+ * via litert_set_error() (matching the FFI cpp's eliza_strdup/eliza_set_error
+ * style) and returns the negative ELIZA_* code or nullptr.
+ */
+
+#include "litert-backend.h"
+
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+
+#if defined(__has_include)
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#    define LITERT_HAVE_FILESYSTEM 1
+#  endif
+#endif
+
+/* ── Heap-allocated error strings (mirror eliza-inference-ffi.cpp) ───────── */
+namespace {
+
+char * litert_strdup(const std::string & s) {
+    const size_t bytes = s.size() + 1; /* payload + NUL terminator */
+    void * mem = std::malloc(bytes);   /* heap block; nullptr on OOM */
+    if (mem != nullptr) std::memcpy(mem, s.c_str(), bytes);
+    return static_cast<char *>(mem);
+}
+
+void litert_set_error(char ** out_error, const std::string & msg) {
+    /* A null slot means the caller wants no message; otherwise hand over a copy. */
+    if (out_error != nullptr) *out_error = litert_strdup(msg);
+}
+
+#if defined(LITERT_HAVE_FILESYSTEM)
+/* Probe <bundle_dir>/text/ for a *.litertlm artifact. Cheap directory walk,
+ * no model load (LlmBackendFactory::can_serve contract). "" when none found. */
+std::string find_litertlm_artifact(const char * bundle_dir) {
+    if (!bundle_dir || bundle_dir[0] == '\0') return std::string();
+    namespace fs = std::filesystem;
+    std::error_code ec;
+    const fs::path text_dir = fs::path(bundle_dir) / LITERT_BUNDLE_TEXT_SUBDIR;
+    if (!fs::is_directory(text_dir, ec)) return std::string();
+    for (fs::directory_iterator it(text_dir, ec), end; !ec && it != end;
+         it.increment(ec)) {
+        /* Own error_code per entry: a failed status query must not end the scan. */
+        std::error_code entry_ec;
+        if (!it->is_regular_file(entry_ec)) continue;
+        if (it->path().extension() == LITERT_ARTIFACT_EXT) return it->path().string();
+    }
+    return std::string();
+}
+#else
+std::string find_litertlm_artifact(const char *) { return std::string(); }
+#endif
+
+}  // namespace
+
+/* ════════════════════════════════════════════════════════════════════════ *
+ *  REAL implementation — only when ELIZA_ENABLE_LITERT is defined.
+ *  Behind this gate we may include LiteRT-LM SDK headers; outside it we
+ *  include NONE so the file builds on a host without the SDK.
+ * ════════════════════════════════════════════════════════════════════════ */
+#ifdef ELIZA_ENABLE_LITERT
+
+#include <memory>
+#include <optional>
+#include <utility>
+#include <variant>
+#include <vector>
+
+/* LiteRT-LM cross-platform C++ runtime. Paths per the repo's bazel layout
+ * (github.com/google-ai-edge/LiteRT-LM, `main`, researched 2026-06-22). */
+#include "runtime/engine/engine.h"          // litert::lm::Engine, SessionInterface
+#include "runtime/engine/engine_settings.h" // EngineSettings, SessionConfig, ModelAssets
+#include "runtime/engine/io_types.h"        // InputData, InputText, Responses
+
+namespace {
+
+using litert::lm::Backend;
+using litert::lm::Engine;
+using litert::lm::EngineSettings;
+using litert::lm::InputData;
+using litert::lm::InputText;
+using litert::lm::ModelAssets;
+using litert::lm::Responses;
+using litert::lm::SessionConfig;
+
+/* The Session type the templated Engine hands back (Engine::Session is the
+ * public alias EngineT<SessionT> exposes; for Engine it is SessionInterface). */
+using Session = Engine::Session;
+
+/* Which accelerator rung the factory ended up on at open(); kept for
+ * diagnostics and preference reporting. DEVICE-VERIFY: the rung that really
+ * initializes depends on hardware and is only confirmable on an NPU/GPU device. */
+enum class ResolvedAccelerator { kNone, kNpu, kGpu, kCpu };
+
+/* Lower-case label for a resolved accelerator; anything that is not
+ * NPU/GPU/CPU (including kNone) reports "none". */
+const char * accelerator_name(ResolvedAccelerator a) {
+    if (a == ResolvedAccelerator::kNpu) return "npu";
+    if (a == ResolvedAccelerator::kGpu) return "gpu";
+    if (a == ResolvedAccelerator::kCpu) return "cpu";
+    return "none";
+}
+
+/* Attempt to construct an Engine for `artifact` on `backend`. A null return
+ * means this rung failed and the ladder should fall through to the next one;
+ * the failing step's status message is left in `last_err` so the caller can
+ * surface it. `last_err` is untouched on success. */
+std::unique_ptr<Engine> try_engine(const std::string & artifact,
+                                   Backend backend,
+                                   std::string & last_err) {
+    auto assets = ModelAssets::Create(artifact);
+    if (assets.ok()) {
+        auto engine_settings = EngineSettings::CreateDefault(*assets, backend);
+        if (engine_settings.ok()) {
+            auto built = Engine::CreateEngine(*engine_settings);
+            if (built.ok()) return std::move(*built);
+            last_err = std::string(built.status().message());
+        } else {
+            last_err = std::string(engine_settings.status().message());
+        }
+    } else {
+        last_err = std::string(assets.status().message());
+    }
+    /* Any failed step lands here with its message already recorded. */
+    return nullptr;
+}
+
+/* ── Session: mirrors the FFI streaming pull contract 1:1 ────────────────── */
+class LiteRtBackendSession final : public LlmBackendSession {
+public:
+    LiteRtBackendSession(std::unique_ptr<Engine> engine,
+                         std::unique_ptr<Session> session,
+                         const eliza_llm_stream_config_t & cfg,
+                         ResolvedAccelerator accel)
+        : engine_(std::move(engine)),
+          session_(std::move(session)),
+          accel_(accel),
+          max_tokens_(cfg.max_tokens > 0 ? cfg.max_tokens : 0) {}
+
+    /* prefill: copy the caller's tokens, detokenize through the engine's
+     * tokenizer, and run a LiteRT prefill pass. The FFI hands pre-tokenized
+     * ids (text-model vocab); LiteRT-LM's prefill consumes InputData (text),
+     * so ids round-trip to text via the engine tokenizer (the .litertlm graph
+     * carries its own) rather than assuming vocab parity. NULL token_ids with
+     * num_tokens > 0 is INVALID_ARG. DEVICE-VERIFY: round-trip fidelity. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!session_ || (!token_ids && num_tokens > 0)) {
+            litert_set_error(out_error, !session_
+                ? "[litert-lm] prefill: session is not open"
+                : "[litert-lm] prefill: token_ids is NULL");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancelled_.load(std::memory_order_acquire)) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        std::vector<int> ids;
+        ids.reserve(num_tokens);
+        for (size_t i = 0; i < num_tokens; ++i) ids.push_back(token_ids[i]);
+
+        const std::string text = engine_->GetTokenizer().Detokenize(ids);
+        std::vector<InputData> contents;
+        contents.emplace_back(InputText(std::string(text)));
+
+        absl::Status st = session_->RunPrefill(contents);
+        if (!st.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] RunPrefill failed: ") +
+                std::string(st.message()));
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        prefilled_ = true;
+        return ELIZA_OK;
+    }
+
+    /* next: one decode step. LiteRT-LM's RunDecode() returns a Responses
+     * batch; we emit the newly-produced UTF-8 delta as detokenized text and
+     * its token ids. LiteRT-LM has no in-process MTP drafter exposed through
+     * this surface, so drafted/accepted are always 0. Returns 1 (final) at
+     * EOS or the max-token cap, 0 otherwise. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        if (drafter_drafted_out)  *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!session_) {
+            litert_set_error(out_error, "[litert-lm] next: session not open");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!prefilled_) {
+            litert_set_error(out_error,
+                "[litert-lm] next: prefill must run before next");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancelled_.load(std::memory_order_acquire)) {
+            return ELIZA_ERR_CANCELLED;
+        }
+
+        auto responses = session_->RunDecode();
+        if (!responses.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] RunDecode failed: ") +
+                std::string(responses.status().message()));
+            return ELIZA_ERR_FFI_FAULT;
+        }
+
+        /* GetTexts()[0] is candidate 0's cumulative decode (RunDecode's running
+         * text); emit only the suffix added since the last step, as a delta. */
+        const std::vector<std::string> & texts = responses->GetTexts();
+        std::string cumulative = texts.empty() ? std::string() : texts.front();
+        std::string delta = compute_delta(cumulative);
+        emitted_chars_ = cumulative.size();
+
+        /* Re-tokenize the delta against the engine tokenizer so the FFI gets
+         * committed text-vocab ids (the same round-trip the prefill used). */
+        std::vector<int> delta_ids = engine_->GetTokenizer().Tokenize(delta);
+        /* Report only ids actually written: none when tokens_out is NULL, and
+         * never more than tokens_cap. */
+        size_t n_emit = tokens_out ? delta_ids.size() : 0;
+        if (n_emit > tokens_cap) n_emit = tokens_cap;
+        for (size_t i = 0; i < n_emit; ++i) {
+            tokens_out[i] = static_cast<int32_t>(delta_ids[i]);
+        }
+        if (num_tokens_out) *num_tokens_out = n_emit;
+        if (text_out && text_cap) {
+            size_t copy = delta.size() < text_cap - 1 ? delta.size() : text_cap - 1;
+            /* If truncating, back off to a UTF-8 code-point boundary. */
+            while (copy > 0 && copy < delta.size() &&
+                   (static_cast<unsigned char>(delta[copy]) & 0xC0) == 0x80) --copy;
+            std::memcpy(text_out, delta.data(), copy);
+            text_out[copy] = '\0';
+        }
+
+        decoded_tokens_ += static_cast<int32_t>(delta_ids.size());
+        const bool hit_cap =
+            max_tokens_ > 0 && decoded_tokens_ >= max_tokens_;
+        /* DEVICE-VERIFY: the precise EOS signal LiteRT-LM exposes per step is
+         * runtime-version-dependent. A done decode yields no new delta; treat
+         * an empty delta or the token cap as the final step. */
+        const bool eos = delta_ids.empty();
+        return (hit_cap || eos) ? 1 : 0;
+    }
+
+    /* cancel: publish a flag the next decode step observes. Thread-safe. */
+    int cancel() override {
+        cancelled_.store(true, std::memory_order_release);
+        return ELIZA_OK;
+    }
+
+    /* reset: drop a fresh Session from the same Engine (clears KV + sampler).
+     * Reuses the warm Engine (model weights stay resident) — only the
+     * per-generation Session is rebuilt. */
+    int reset() override {
+        auto cfg = SessionConfig::CreateDefault();
+        auto session = engine_->CreateSession(cfg);
+        if (!session.ok()) {
+            /* reset has no out_error param; a failed rebuild leaves the old
+             * session in place and surfaces on the next prefill/next. */
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        session_ = std::move(*session);
+        cancelled_.store(false, std::memory_order_release);
+        prefilled_ = false;
+        decoded_tokens_ = 0;
+        emitted_chars_ = 0;
+        return ELIZA_OK;
+    }
+
+    /* reset_keep: LiteRT-LM's Session does not expose prefix-preserving KV
+     * trimming through this surface, so fall back to a full reset and return 0
+     * (no prefix kept) — never an error (llm-backend.h contract). */
+    int reset_keep(int32_t /*n_keep*/) override {
+        reset();
+        return 0;
+    }
+
+    const char * accelerator() const { return accelerator_name(accel_); }
+
+private:
+    /* The suffix of `cumulative` produced since the last emitted step. */
+    std::string compute_delta(const std::string & cumulative) const {
+        if (cumulative.size() <= emitted_chars_) return std::string();
+        return cumulative.substr(emitted_chars_);
+    }
+
+    std::unique_ptr<Engine>  engine_;
+    std::unique_ptr<Session> session_;
+    std::atomic<bool>        cancelled_{false};
+    bool                     prefilled_ = false;
+    int32_t                  decoded_tokens_ = 0;
+    size_t                   emitted_chars_ = 0;
+    ResolvedAccelerator      accel_ = ResolvedAccelerator::kNone;
+    int32_t                  max_tokens_ = 0;
+};
+
+/* ── Factory ─────────────────────────────────────────────────────────────── */
+class LiteRtBackendFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return LITERT_BACKEND_NAME; }
+
+    /* available(): compiled in AND an accelerator (NPU or GPU) initializes on
+     * THIS host. Cheap — must not load a model. We probe by building a minimal
+     * EngineSettings on NPU then GPU with NO model assets; a backend whose
+     * delegate is missing fails settings validation. CPU alone does NOT make
+     * this backend "available" (CPU is the in-tree llama.cpp path's job).
+     * DEVICE-VERIFY: real delegate presence is only knowable on-device. */
+    bool available() const override {
+        return probe_accelerator() != ResolvedAccelerator::kNone;
+    }
+
+    /* can_serve(): a *.litertlm exists under <bundle_dir>/text/. Cheap probe,
+     * no caching — open() re-resolves the bundle from the context accessor. */
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_litertlm_artifact(bundle_dir).empty();
+    }
+
+    /* preference_rank(): high on Android NPU (the whole reason this backend
+     * exists), modest on a GPU-only fallback, 0 otherwise so llama.cpp wins. */
+    int preference_rank() const override {
+        switch (probe_accelerator()) {
+            case ResolvedAccelerator::kNpu: return 100;
+            case ResolvedAccelerator::kGpu: return 20;
+            default:                        return 0;
+        }
+    }
+
+    /* open(): resolve the .litertlm under the cached bundle, then walk the
+     * accelerator ladder NPU → GPU → CPU, recording which rung built the
+     * Engine. Builds a default Session and returns the streaming session. */
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        if (!cfg) {
+            litert_set_error(out_error, "[litert-lm] open: cfg is NULL");
+            return nullptr;
+        }
+        const char * bundle_dir = llm_backend_context_bundle_dir(ctx);
+        const std::string bundle = bundle_dir ? bundle_dir : std::string();
+        std::string artifact = find_litertlm_artifact(bundle.c_str());
+        if (artifact.empty()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: no ") + LITERT_ARTIFACT_EXT +
+                " artifact under " + bundle + "/" + LITERT_BUNDLE_TEXT_SUBDIR);
+            return nullptr;
+        }
+
+        /* Accelerator ladder — NPU first (Qualcomm QNN / MediaTek NeuroPilot /
+         * Google Tensor), then GPU (OpenCL/Metal/WebGPU), then CPU (XNNPACK).
+         * Each rung's failure text is preserved for the final diagnostic.
+         * DEVICE-VERIFY: rung availability is hardware-specific. */
+        struct Rung { Backend backend; ResolvedAccelerator accel; };
+        const Rung ladder[] = {
+            {Backend::NPU, ResolvedAccelerator::kNpu},
+            {Backend::GPU, ResolvedAccelerator::kGpu},
+            {Backend::CPU, ResolvedAccelerator::kCpu},
+        };
+
+        std::unique_ptr<Engine> engine;
+        ResolvedAccelerator resolved = ResolvedAccelerator::kNone;
+        std::string rung_errs;
+        for (const Rung & rung : ladder) {
+            std::string rung_err;
+            engine = try_engine(artifact, rung.backend, rung_err);
+            if (engine) { resolved = rung.accel; break; }
+            if (!rung_errs.empty()) rung_errs += "; ";
+            rung_errs += std::string(accelerator_name(rung.accel)) + ": " + rung_err;
+        }
+        if (!engine) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: no accelerator could build the "
+                            "engine (") + rung_errs + ")");
+            return nullptr;
+        }
+
+        auto session_cfg = SessionConfig::CreateDefault();
+        auto session = engine->CreateSession(session_cfg);
+        if (!session.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: CreateSession failed on ") +
+                accelerator_name(resolved) + ": " +
+                std::string(session.status().message()));
+            return nullptr;
+        }
+
+        return new LiteRtBackendSession(std::move(engine), std::move(*session),
+                                        *cfg, resolved);
+    }
+
+private:
+    /* Build a no-model EngineSettings on NPU then GPU; the first whose
+     * delegate validates marks that rung present. Result is memoized so the
+     * repeated available()/preference_rank() calls are cheap.
+     * DEVICE-VERIFY: settings-only validation is the cheapest honest probe;
+     * the true delegate handshake happens at open() on-device. */
+    ResolvedAccelerator probe_accelerator() const {
+        std::call_once(probe_once_, [this]() {
+            auto empty = ModelAssets::Create(std::string());
+            if (!empty.ok()) { probed_ = ResolvedAccelerator::kNone; return; }
+            if (EngineSettings::CreateDefault(*empty, Backend::NPU).ok()) {
+                probed_ = ResolvedAccelerator::kNpu;
+            } else if (EngineSettings::CreateDefault(*empty, Backend::GPU).ok()) {
+                probed_ = ResolvedAccelerator::kGpu;
+            } else {
+                probed_ = ResolvedAccelerator::kNone;
+            }
+        });
+        return probed_;
+    }
+
+    mutable std::once_flag      probe_once_;
+    mutable ResolvedAccelerator probed_ = ResolvedAccelerator::kNone;
+};
+
+}  // namespace
+
+LlmBackendFactory * litert_backend_factory() {
+    static LiteRtBackendFactory singleton; /* function-local: built on first call */
+    return static_cast<LlmBackendFactory *>(&singleton);
+}
+
+#else  /* ────────────────────────── STUB (no LiteRT-LM SDK) ──────────────── */
+
+/*
+ * Compiled-out stub: zero LiteRT-LM headers, so this builds on any host. The
+ * factory links in as a no-op — available() is false, can_serve() is false,
+ * preference_rank() is 0, and open() returns nullptr + sets `*out_error`
+ * "not compiled in" so the selector cleanly keeps the in-tree llama.cpp path.
+ */
+namespace {
+
+class LiteRtBackendFactoryStub final : public LlmBackendFactory {
+public:
+    const char * name() const override { return LITERT_BACKEND_NAME; }
+    int preference_rank() const override { return 0; }
+    bool can_serve(const char *) const override { return false; }
+    bool available() const override { return false; }
+
+    /* Always fails: returns nullptr and reports "not compiled in". */
+    LlmBackendSession * open(EliInferenceContext *, const eliza_llm_stream_config_t *,
+                             char ** out_error) override {
+        litert_set_error(out_error,
+            "[litert-lm] backend not compiled in "
+            "(build with -DELIZA_ENABLE_LITERT to enable the LiteRT-LM NPU path)");
+        return nullptr;
+    }
+};
+
+}  // namespace
+
+LlmBackendFactory * litert_backend_factory() {
+    static LiteRtBackendFactoryStub singleton; /* function-local: built on first call */
+    return static_cast<LlmBackendFactory *>(&singleton);
+}
+
+#endif  /* ELIZA_ENABLE_LITERT */
diff --git a/tools/omnivoice/src/backends/litert-backend.h b/tools/omnivoice/src/backends/litert-backend.h
new file mode 100644
index 000000000..9096b64d0
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-backend.h
@@ -0,0 +1,73 @@
+#pragma once
+/*
+ * litert-backend.h — LiteRT-LM in-process streaming-LLM backend (cutover plan M4).
+ *
+ * Implements the M3 backend seam (`llm-backend.h`) on top of Google's
+ * LiteRT-LM C++ inference runtime, the in-process path for the Android NPU
+ * tier (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor), with an
+ * optional desktop/iOS GPU fallback. LiteRT-LM is linked INTO
+ * `libelizainference` and exposed behind the same FFI streaming symbols —
+ * never a child process or TCP server (native/AGENTS.md §11, gemma4 cutover).
+ *
+ * The whole real implementation is gated behind the CMake define
+ * `ELIZA_ENABLE_LITERT`. When that flag is OFF this header pulls in NO
+ * LiteRT-LM SDK headers, so the file compiles on a host without the SDK and
+ * the factory links in as a no-op: `available()` is false and `open()`
+ * returns nullptr + sets `*out_error` "not compiled in".
+ *
+ * ── Targeted runtime API (researched 2026-06-22) ──────────────────────────
+ * Repo:    https://github.com/google-ai-edge/LiteRT-LM  (`main`)
+ * Docs:    https://developers.google.com/edge/litert-lm/cpp
+ *          https://ai.google.dev/edge/litert/next/litert_lm_npu
+ * Namespace: `litert::lm`
+ *
+ * Symbols this backend targets (verbatim from the headers above):
+ *   - runtime/engine/engine.h
+ *       using Engine = EngineT<SessionInterface>;
+ *       static absl::StatusOr<std::unique_ptr<Engine>>
+ *           Engine::CreateEngine(const EngineSettings&);
+ *       absl::StatusOr<std::unique_ptr<SessionT>>
+ *           EngineT::CreateSession(const SessionConfig&);
+ *   - runtime/engine/engine.h  (SessionInterface)
+ *       absl::Status        RunPrefill(const std::vector<InputData>&);
+ *       absl::StatusOr<Responses> RunDecode();
+ *       absl::StatusOr<Responses> RunDecode(const DecodeConfig&);
+ *       absl::Status        GenerateContentStream(
+ *                               const std::vector<InputData>&,
+ *                               absl::AnyInvocable<void(absl::StatusOr<Responses>)>);
+ *   - runtime/engine/engine_settings.h
+ *       static absl::StatusOr<EngineSettings> EngineSettings::CreateDefault(
+ *           ModelAssets, Backend backend = Backend::CPU,
+ *           std::optional<Backend> vision_backend  = std::nullopt,
+ *           std::optional<Backend> audio_backend   = std::nullopt,
+ *           std::optional<Backend> sampler_backend = std::nullopt);
+ *       static SessionConfig SessionConfig::CreateDefault();
+ *       absl::StatusOr<ModelAssets> ModelAssets::Create(<path>);   // .litertlm
+ *   - runtime/engine/io_types.h
+ *       using InputData = std::variant<InputText, InputImage, InputAudio, ...>;
+ *       class InputText { explicit InputText(std::variant<std::string, TensorBuffer>); };
+ *       class Responses  { const std::vector<std::string>& GetTexts() const; };
+ *   - runtime/proto/engine.pb.h
+ *       enum Backend { ... CPU, GPU, NPU, ... };   // litert::lm::Backend
+ *
+ * Accelerator ladder (Android NPU first): the factory tries NPU, then GPU,
+ * then CPU at `open()` and records which one initialized. Every
+ * hardware-gated assumption is tagged `DEVICE-VERIFY` in the .cpp — the
+ * accelerator ladder, the .litertlm graph fit, and tok/s can only be
+ * confirmed on a real NPU device, which this scaffold does not have.
+ */
+
+#include "../llm-backend.h"
+
+/* Stable backend id (matched case-insensitively against ELIZA_LLM_BACKEND),
+ * then the bundle subdirectory and the artifact extension the factory probes. */
+#define LITERT_BACKEND_NAME "litert-lm"
+#define LITERT_BUNDLE_TEXT_SUBDIR "text"
+#define LITERT_ARTIFACT_EXT ".litertlm"
+
+/* Singleton factory accessor. The selector (llm-backend-selector.cpp) calls
+ * this from `llm_backend_register_builtins()` to register the backend. The
+ * returned pointer is a static-lifetime singleton the registry does not own.
+ * Defined unconditionally — a build without ELIZA_ENABLE_LITERT returns a
+ * stub factory whose available() is false. */
+LlmBackendFactory * litert_backend_factory();
diff --git a/tools/omnivoice/src/backends/litert-embed-backend.cpp b/tools/omnivoice/src/backends/litert-embed-backend.cpp
new file mode 100644
index 000000000..18bf11415
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-embed-backend.cpp
@@ -0,0 +1,252 @@
+/*
+ * litert-embed-backend.cpp — LiteRT (Google AI Edge) text-embedding backend.
+ *
+ * Serves eliza_inference_embed from a `<bundle>/embedding/*.tflite` (or
+ * `.litertlm`) artifact via the LiteRT Next C runtime on the best available
+ * accelerator: NPU (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor on
+ * capable silicon) -> GPU (OpenCL/Mali via libLiteRtClGlAccelerator.so) -> CPU.
+ * The accelerator ladder + preference_rank let the SAME build auto-promote to
+ * NPU on a Pixel-10/G5 or Qualcomm/MediaTek device and fall back to the GPU
+ * delegate on a Tensor-G4 (Pixel 9a) with NO code change.
+ *
+ * Uses the LiteRT *C* API (litert/c/...) — the C++ cc/ wrappers are not
+ * standalone (they pull Abseil/TFLite/flatbuffers). Compiles only under
+ * -DELIZA_ENABLE_LITERT with the SDK on the include/link path
+ * (-DELIZA_LITERT_SDK_DIR=<dir> -DELIZA_LITERT_LIBS=LiteRt). Without the gate the
+ * file is not compiled (CMake target_sources is inside if(ELIZA_ENABLE_LITERT));
+ * the stub at the bottom keeps the factory accessor resolvable defensively.
+ *
+ * Model I/O (the converted all-MiniLM-L6-v2 .tflite, see
+ * litert-models/embedding/MANIFEST.md): 2 int32 inputs [1,128] bound BY INDEX
+ * (0=input_ids, 1=attention_mask), 1 float32 output [1,384] that is already
+ * masked-mean-pooled + L2-normalized in-graph (read 384 floats directly).
+ */
+
+#include "../embed-backend.h"
+#include "../llm-backend.h" /* llm_backend_context_bundle_dir */
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#if defined(__has_include)
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#    define ELIZA_HAS_FILESYSTEM 1
+#  endif
+#endif
+
+namespace {
+
+/* Probe `<bundle_dir>/embedding/` for a LiteRT artifact (.litertlm preferred,
+ * then .tflite). Cheap — no model load. Returns bundle_dir-joined path or "". */
+std::string find_embed_artifact(const char * bundle_dir) {
+    if (!bundle_dir || !bundle_dir[0]) return "";
+#ifdef ELIZA_HAS_FILESYSTEM
+    namespace fs = std::filesystem;
+    std::error_code ec;
+    const fs::path dir = fs::path(bundle_dir) / "embedding";
+    if (!fs::is_directory(dir, ec)) return "";
+    std::string tflite; /* first .tflite seen; used only if no .litertlm exists */
+    for (fs::directory_iterator it(dir, ec), end; !ec && it != end; it.increment(ec)) {
+        std::error_code entry_ec; /* per entry: a bad entry must not end the scan */
+        if (!it->is_regular_file(entry_ec)) continue;
+        const std::string ext = it->path().extension().string();
+        if (ext == ".litertlm") return it->path().string();
+        if (ext == ".tflite" && tflite.empty()) tflite = it->path().string();
+    }
+    return tflite;
+#else
+    return "";
+#endif
+}
+
+char * dup_error(const std::string & msg) {
+    const std::string tagged = std::string("[libelizainference] ").append(msg);
+    void * mem = std::malloc(tagged.size() + 1); /* +1 for the NUL terminator */
+    if (mem != nullptr) std::memcpy(mem, tagged.c_str(), tagged.size() + 1);
+    return static_cast<char *>(mem);
+}
+
+} // namespace
+
+#ifdef ELIZA_ENABLE_LITERT
+
+#include "litert/c/litert_common.h"
+#include "litert/c/litert_compiled_model.h"
+#include "litert/c/litert_environment.h"
+#include "litert/c/litert_model.h"
+#include "litert/c/litert_options.h"
+#include "litert/c/litert_tensor_buffer.h"
+
+#include <cmath>
+#include <mutex>
+#include <vector>
+
+namespace {
+
+class LiteRtEmbedFactory final : public EmbedBackendFactory {
+public:
+    const char * name() const override { return "litert"; }
+
+    /* Compiled in AND a non-CPU accelerator is reachable (a CPU-only LiteRT is
+     * not a win over the in-tree ggml encoder). Settings-only probe — no model
+     * load. The ladder resolves to GPU on a Tensor-G4 (9a) and NPU on capable
+     * silicon. */
+    bool available() const override { return probe_accel() != kLiteRtHwAcceleratorNone; }
+
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_embed_artifact(bundle_dir).empty();
+    }
+
+    int preference_rank() const override {
+        const int a = probe_accel();
+        if (a & kLiteRtHwAcceleratorNpu) return 100; /* the real NPU win */
+        if (a & kLiteRtHwAcceleratorGpu) return 20;  /* GPU delegate (Mali on a 9a) */
+        return 0;                                    /* never beats ggml */
+    }
+
+    int embed(EliInferenceContext * ctx, const char * text, size_t text_len,
+              int pooling, float * out_embedding, size_t out_capacity,
+              int * out_dim, char ** out_error) override {
+        const char * bundle = llm_backend_context_bundle_dir(ctx);
+        const std::string artifact = find_embed_artifact(bundle);
+        if (artifact.empty()) {
+            if (out_error) *out_error = dup_error("litert embed: no artifact under <bundle>/embedding/");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        std::lock_guard<std::mutex> lock(mu_);
+        if (int rc = ensure_loaded(artifact, out_error); rc != ELIZA_OK) return rc;
+
+        /* Tokenize -> 2 int32 input tensors [1,128] (0=input_ids,1=attention_mask).
+         * The WordPiece tokenizer + the fixed-128 padding come from the model
+         * MANIFEST (litert-models/embedding). NOTE: tokenize() and run() below
+         * are still TODO stubs that return ELIZA_ERR_NOT_IMPLEMENTED, so this
+         * call currently always fails at tokenize() after the model is loaded. */
+        std::vector<int32_t> ids, mask;
+        if (int rc = tokenize(text, text_len, ids, mask, out_error); rc != ELIZA_OK) return rc;
+
+        std::vector<float> out_vec;
+        int dim = 0;
+        if (int rc = run(ids, mask, out_vec, dim, out_error); rc != ELIZA_OK) return rc;
+
+        if (!out_embedding || !out_dim || dim <= 0 || (size_t) dim > out_capacity) {
+            if (out_error) *out_error = dup_error("litert embed: invalid output buffer or output dim exceeds capacity");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        (void) pooling; /* pooling + L2-norm are baked into the exported graph */
+        std::memcpy(out_embedding, out_vec.data(), (size_t) dim * sizeof(float));
+        *out_dim = dim;
+        return ELIZA_OK;
+    }
+
+private:
+    static int probe_accel() {
+        LiteRtEnvironment env = nullptr;
+        if (LiteRtCreateEnvironment(0, nullptr, &env) != kLiteRtStatusOk) {
+            return kLiteRtHwAcceleratorNone;
+        }
+        LiteRtDestroyEnvironment(env);
+        /* TODO(DEVICE-VERIFY): query the env for a registered NPU dispatch and
+         * return kLiteRtHwAcceleratorNpu when present. On a Tensor-G4 there is no
+         * app-usable NPU path, so this resolves to GPU. */
+        return kLiteRtHwAcceleratorGpu;
+    }
+
+    int ensure_loaded(const std::string & artifact, char ** out_error) {
+        if (artifact == loaded_path_ && compiled_) return ELIZA_OK;
+        reset();
+        if (LiteRtCreateEnvironment(0, nullptr, &env_) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: environment create failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        if (LiteRtCreateModelFromFile(artifact.c_str(), &model_) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: model load failed: " + artifact);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        LiteRtOptions opts = nullptr;
+        if (LiteRtCreateOptions(&opts) != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: options create failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        LiteRtSetOptionsHardwareAccelerators(
+            opts, (LiteRtHwAcceleratorSet)(kLiteRtHwAcceleratorGpu | kLiteRtHwAcceleratorNpu));
+        const LiteRtStatus st = LiteRtCreateCompiledModel(env_, model_, opts, &compiled_);
+        LiteRtDestroyOptions(opts);
+        if (st != kLiteRtStatusOk) {
+            if (out_error) *out_error = dup_error("litert embed: compile failed (accelerator unavailable?)");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        loaded_path_ = artifact;
+        return ELIZA_OK;
+    }
+
+    int tokenize(const char * /*text*/, size_t /*len*/, std::vector<int32_t> & /*ids*/,
+                 std::vector<int32_t> & /*mask*/, char ** out_error) {
+        /* TODO(MANIFEST): wire the WordPiece tokenizer (vocab.txt under
+         * <bundle>/embedding/): lower-case, [CLS] + greedy-longest-match subwords
+         * + [SEP], pad/truncate to exactly 128, attention_mask=1 for real tokens.
+         * Until wired this is a hard, observable failure — eliza_inference_embed
+         * does NOT fall back, so a misconfigured artifact surfaces loudly. */
+        if (out_error) *out_error = dup_error(
+            "litert embed: WordPiece tokenizer not wired — stage vocab.txt + bind "
+            "per litert-models/embedding/MANIFEST.md");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int run(const std::vector<int32_t> & ids, const std::vector<int32_t> & mask,
+            std::vector<float> & out_vec, int & dim, char ** out_error) {
+        /* TODO(MANIFEST): create 2 managed int32 input TensorBuffers [1,128]
+         * (LiteRtGetCompiledModelInputBufferRequirements ->
+         * LiteRtCreateManagedTensorBufferFromRequirements), Lock+write ids/mask,
+         * create the output buffer, LiteRtRunCompiledModel(compiled_, 0, in, out),
+         * Lock+read the [1,384] float output into out_vec (dim=384). Pooling +
+         * L2-norm are in-graph. */
+        (void) ids; (void) mask; (void) out_vec; (void) dim;
+        if (out_error) *out_error = dup_error("litert embed: tensor run pending MANIFEST tokenizer");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    void reset() {
+        if (compiled_) { LiteRtDestroyCompiledModel(compiled_); compiled_ = nullptr; }
+        if (model_)    { LiteRtDestroyModel(model_);            model_ = nullptr; }
+        if (env_)      { LiteRtDestroyEnvironment(env_);        env_ = nullptr; }
+        loaded_path_.clear();
+    }
+
+    std::mutex          mu_;
+    LiteRtEnvironment   env_      = nullptr;
+    LiteRtModel         model_    = nullptr;
+    LiteRtCompiledModel compiled_ = nullptr;
+    std::string         loaded_path_;
+};
+
+} // namespace
+
+EmbedBackendFactory * litert_embed_backend_factory() {
+    static LiteRtEmbedFactory singleton; /* function-local: built on first call */
+    return static_cast<EmbedBackendFactory *>(&singleton);
+}
+
+#else /* !ELIZA_ENABLE_LITERT — stub (kept resolvable; never selected) */
+
+namespace {
+class LiteRtEmbedStub final : public EmbedBackendFactory {
+public:
+    const char * name() const override { return "litert"; }
+    bool can_serve(const char * /*bundle_dir*/) const override { return false; }
+    bool available() const override { return false; } /* stub is never available */
+    int embed(EliInferenceContext *, const char *, size_t, int, float *, size_t, int *,
+              char ** out_error) override {
+        if (out_error != nullptr) *out_error = dup_error("litert embed backend not compiled in");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+};
+} // namespace
+
+EmbedBackendFactory * litert_embed_backend_factory() {
+    static LiteRtEmbedStub singleton; /* function-local: built on first call */
+    return static_cast<EmbedBackendFactory *>(&singleton);
+}
+
+#endif /* ELIZA_ENABLE_LITERT */
diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.h b/tools/omnivoice/src/backends/mlx-coreml-backend.h
new file mode 100644
index 000000000..36d048c00
--- /dev/null
+++ b/tools/omnivoice/src/backends/mlx-coreml-backend.h
@@ -0,0 +1,128 @@
+#pragma once
+/*
+ * mlx-coreml-backend.h — Apple-Silicon in-process streaming-LLM backend
+ * (Gemma-4 cutover plan M5). One of the alternate `LlmBackendSession` /
+ * `LlmBackendFactory` implementations behind the multi-runtime FFI seam
+ * defined in `../llm-backend.h` (cutover plan M3).
+ *
+ * Per native/AGENTS.md §11 ("one managed library, one pipe, no
+ * sidecar/subprocess/TCP") this backend is COMPILED INTO libelizainference
+ * and exposes the SAME `eliza_inference_llm_stream_*` FFI pull contract —
+ * it is the owned backend on Apple Silicon (mac first, iOS later), never a
+ * child process. Apple Foundation Models stays an opportunistic out-of-
+ * process adapter on the TS side and is NOT registered here.
+ *
+ * ── Two runtimes, one backend ─────────────────────────────────────────────
+ *
+ * The same `mlx-coreml` factory can serve a bundle through EITHER of two
+ * Apple on-device runtimes, picked at open() time from the artifact present
+ * under `<bundle_dir>/text/`:
+ *
+ *   • MLX  (PRIMARY)   — Apple's array framework for Apple Silicon. We drive
+ *                        it through the C API `mlx-c` (ml-explore/mlx-c). The
+ *                        text weights are an `mlx` weights dir (safetensors,
+ *                        the mlx-lm convention) OR a `*.gguf` MLX reads via
+ *                        `mlx_load_gguf`. Decode runs the transformer graph
+ *                        on the Metal GPU stream with `mlx_quantized_matmul`
+ *                        for the quantized weight banks,
+ *                        `mlx_fast_scaled_dot_product_attention` for
+ *                        attention, and `mlx_fast_rope` for position. The KV
+ *                        cache is a pair of resident `mlx_array`s we append to
+ *                        per step (host-side cache handle, GPU-resident data).
+ *                        This is the preferred path: it gives us full control
+ *                        of the sampler, supports the Gemma SWA/shared-KV
+ *                        geometry, and matches mlx-lm's published Gemma graph.
+ *
+ *   • CoreML (ALTERNATE) — Apple's MLModel runtime, which can place the graph
+ *                        on the ANE (Apple Neural Engine) as well as GPU/CPU.
+ *                        We load a compiled `*.mlmodelc` / `*.mlpackage`
+ *                        decoder and use the iOS-18 / macOS-15 **stateful**
+ *                        prediction API (`MLState`) so the KV cache lives
+ *                        inside CoreML and is updated in-place across decode
+ *                        steps (no per-token KV tensor marshalled across the
+ *                        ObjC boundary). CoreML needs Objective-C, which is
+ *                        why this whole backend is a `.mm` translation unit.
+ *
+ *   TRADE-OFF (documented per the task brief): MLX is the primary path
+ *   because it is the most flexible (custom sampler, exact Gemma geometry,
+ *   speculative-decode-ready) and tracks mlx-lm directly; its decode runs on
+ *   the GPU stream, not the ANE. CoreML's stateful MLModel can target the ANE
+ *   for lower power on phones, but the decoder graph must be pre-compiled
+ *   ahead of time, the sampler/KV layout is fixed by the converted model, and
+ *   ANE placement of large attention graphs is fragile across OS revisions.
+ *   We prefer MLX on mac/dev; CoreML is the alternate for ANE-bound iOS tiers
+ *   once a stateful decoder package is published. open() selects MLX when an
+ *   mlx weights dir / gguf is present, else falls back to the CoreML package.
+ *
+ * ── Build gate ────────────────────────────────────────────────────────────
+ *
+ * The REAL implementation is gated behind `ELIZA_ENABLE_MLX` (the CMake
+ * define for this backend, per the cutover plan: LiteRT → ELIZA_ENABLE_LITERT,
+ * MLX/CoreML → ELIZA_ENABLE_MLX) AND `__APPLE__`. When the gate is OFF the
+ * translation unit includes NO Apple/MLX SDK headers, so it compiles on a
+ * plain Linux host: `available()` returns false, `can_serve()` returns false,
+ * and `open()` returns nullptr after setting `*out_error` ("not compiled in").
+ * The default Linux build links it as a pure no-op and the selector skips it,
+ * keeping the in-tree llama.cpp path.
+ *
+ * ── API research (cited; symbols verified, not invented) ──────────────────
+ *
+ *   MLX C API — ml-explore/mlx-c, `mlx/c/` headers, main @ 2026-06 (docs MLX C
+ *   0.4.1, https://ml-explore.github.io/mlx-c/). Symbols used by the real path:
+ *     - device.h : `mlx_device mlx_device_new_type(mlx_device_type, int)` with
+ *                  `typedef enum { MLX_CPU, MLX_GPU } mlx_device_type;`
+ *     - stream.h : `mlx_stream mlx_default_gpu_stream_new(void)`,
+ *                  `mlx_stream mlx_default_cpu_stream_new(void)`
+ *     - io.h     : `int mlx_load_safetensors(mlx_map_string_to_array*,
+ *                  mlx_map_string_to_string*, const char* file, mlx_stream)`,
+ *                  `int mlx_load_gguf(mlx_io_gguf*, const char* file, mlx_stream)`
+ *     - array.h  : `mlx_array mlx_array_new_data(const void*, const int* shape,
+ *                  int dim, mlx_dtype)`, `int mlx_array_eval(mlx_array)`,
+ *                  `int mlx_array_item_int32(int32_t*, mlx_array)`,
+ *                  `const float* mlx_array_data_float32(mlx_array)`,
+ *                  `int mlx_array_free(mlx_array)`
+ *     - ops.h    : `int mlx_quantized_matmul(mlx_array*, x, w, scales, biases,
+ *                  bool transpose, mlx_optional_int group_size,
+ *                  mlx_optional_int bits, const char* mode, mlx_stream)`,
+ *                  `int mlx_matmul(...)`, `int mlx_softmax_axes(...)`,
+ *                  `int mlx_argmax_axis(mlx_array*, a, int axis, bool, stream)`,
+ *                  `int mlx_take(mlx_array*, a, indices, stream)`,
+ *                  `int mlx_astype(...)`, `int mlx_concatenate(...)`
+ *     - fast.h   : `int mlx_fast_scaled_dot_product_attention(mlx_array*, q, k,
+ *                  v, float scale, const char* mask_mode, mlx_array mask,
+ *                  mlx_array sinks, mlx_stream)`,
+ *                  `int mlx_fast_rope(mlx_array*, x, int dims, bool traditional,
+ *                  mlx_optional_float base, float scale, int offset,
+ *                  mlx_array freqs, mlx_stream)`
+ *   Gemma on MLX: ml-explore/mlx-lm (`mlx_lm/models/gemma*.py`) — the reference
+ *   for the dense SWA + shared-KV + dual-head-dim graph this backend mirrors.
+ *
+ *   CoreML stateful KV-cache — Apple Core ML, MLState API, macOS 15 / iOS 18
+ *   (WWDC24 "Bring your ML and AI models to Apple silicon"; coremltools
+ *   Stateful Models guide, https://apple.github.io/coremltools/docs-guides/
+ *   source/stateful-models.html). ObjC symbols used:
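+ *     - `+ (nullable instancetype)modelWithContentsOfURL:(NSURL*)url
+ *        configuration:(MLModelConfiguration*)configuration
+ *        error:(NSError**)error;`  (and, for a `*.mlpackage`, the compile step
+ *        `+ (NSURL*)compileModelAtURL:(NSURL*)modelURL error:(NSError**)error;`)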
+ *     - `- (MLState*)newState;`    (creates zeroed KV state buffers; MLState is
+ *        +new/-init UNAVAILABLE — only MLModel vends it)
+ *     - `- (nullable id<MLFeatureProvider>)predictionFromFeatures:
+ *        (id<MLFeatureProvider>)input usingState:(MLState*)state
+ *        error:(NSError**)error;`  (the in-place stateful decode step)
+ *   Apple's own "On-Device Llama 3.1 with Core ML" research post documents the
+ *   prefill-then-stateful-decode loop this backend's MLX/CoreML paths follow.
+ *
+ * Every hardware-specific assumption that can only be confirmed on Apple
+ * Silicon is marked `DEVICE-VERIFY` in the .mm. This header carries no SDK
+ * dependency and is safe to include anywhere.
+ */
+
+#include "../llm-backend.h"
+
+/* Free-function accessor returning the singleton `mlx-coreml` factory so the
+ * selector (llm-backend-selector.cpp, wired separately) can register it via
+ * `llm_backend_register(mlx_coreml_backend_factory())`. Defined in
+ * mlx-coreml-backend.mm. Always returns a valid non-null static-lifetime
+ * pointer — when the build gate is OFF the returned factory reports
+ * available()/can_serve() == false and open() == nullptr ("not compiled in"),
+ * so registering it unconditionally is safe. */
+LlmBackendFactory * mlx_coreml_backend_factory();
diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.mm b/tools/omnivoice/src/backends/mlx-coreml-backend.mm
new file mode 100644
index 000000000..4b705d719
--- /dev/null
+++ b/tools/omnivoice/src/backends/mlx-coreml-backend.mm
@@ -0,0 +1,797 @@
+/*
+ * mlx-coreml-backend.mm — Apple-Silicon streaming-LLM backend (cutover M5).
+ *
+ * Objective-C++ translation unit: CoreML's MLModel / MLState API is
+ * Objective-C, and the MLX C++ / mlx-c headers also compile cleanly in a
+ * `.mm`. See mlx-coreml-backend.h for the full API research + citations and
+ * the MLX-primary / CoreML-alternate trade-off.
+ *
+ * STRUCTURE
+ *   The whole real implementation sits behind
+ *     #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+ *   and is the ONLY place that includes any MLX / CoreML SDK header. With the
+ *   gate OFF (the default Linux build) this file pulls in no SDK header at all
+ *   and compiles to a pure no-op factory: available()/can_serve() == false,
+ *   open() returns nullptr after setting *out_error to "not compiled in".
+ *
+ * ERROR CONTRACT (native/AGENTS.md §3 + §9): never log, never return a
+ * defaulted result on failure. Out-error strings are heap-allocated with
+ * malloc (mirroring eliza-inference-ffi.cpp's `eliza_strdup`) so the FFI
+ * caller frees them with `eliza_inference_free_string` / free().
+ */
+
+#include "mlx-coreml-backend.h"
+
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+// ===========================================================================
+// Shared (gate-independent) helpers
+// ===========================================================================
+
+namespace {
+
+/* Heap-allocate an out-error string the way the FFI translation unit does
+ * (eliza-inference-ffi.cpp::eliza_strdup) so the caller's free() path is
+ * identical regardless of which backend produced the error. */
+void mlx_set_error(char ** out_error, const std::string & msg) {
+    if (out_error == nullptr) {
+        return; // the caller did not ask for a message
+    }
+    // size() + 1 so the terminating NUL is copied along with the text.
+    const size_t n_bytes = msg.size() + 1;
+    char * copy = static_cast<char *>(std::malloc(n_bytes));
+    if (copy != nullptr) {
+        std::memcpy(copy, msg.c_str(), n_bytes);
+    }
+    // Left as nullptr when the allocation failed.
+    *out_error = copy;
+}
+
+} // namespace
+
+// ===========================================================================
+// REAL IMPLEMENTATION — Apple Silicon only, gated on ELIZA_ENABLE_MLX
+// ===========================================================================
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+
+// --- Objective-C / Apple frameworks ---------------------------------------
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>     // MLModel, MLState, MLFeatureProvider, MLMultiArray
+#import <Metal/Metal.h>       // MTLCreateSystemDefaultDevice — Metal/ANE presence probe
+
+// --- MLX C API (ml-explore/mlx-c) ------------------------------------------
+// Only included behind the gate so a host without the MLX SDK still compiles.
+#include "mlx/c/array.h"
+#include "mlx/c/device.h"
+#include "mlx/c/stream.h"
+#include "mlx/c/io.h"
+#include "mlx/c/ops.h"
+#include "mlx/c/fast.h"
+#include "mlx/c/map.h"
+
+#include <dirent.h>
+#include <sys/stat.h>
+#include <cmath>
+#include <filesystem>
+#include <vector>
+
+namespace {
+
+namespace fs = std::filesystem;
+
+// --- bundle artifact discovery --------------------------------------------
+
+// Which Apple runtime, if any, detect_runtime() selected for a bundle.
+enum class AppleRuntime {
+    None,    // no servable artifact was found
+    Mlx,     // mlx weights dir (safetensors) or *.gguf under text/
+    CoreMl,  // *.mlmodelc / *.mlpackage under text/
+};
+
+/* True when `s` ends with `suffix` (an empty suffix always matches).
+ * `suffix` must be a non-null NUL-terminated string.
+ *
+ * Uses std::string::compare rather than std::equal: <algorithm> is not among
+ * this file's includes, so std::equal only resolved through transitive
+ * includes. compare() cannot throw here because the length check
+ * short-circuits before the offset is computed. */
+bool has_suffix(const std::string & s, const char * suffix) {
+    const size_t n = std::strlen(suffix);
+    if (s.size() < n) {
+        return false;
+    }
+    return s.compare(s.size() - n, n, suffix) == 0;
+}
+
+/* Probe `<bundle_dir>/text/` for an Apple-servable artifact and report which
+ * runtime would serve it. MLX is preferred when both kinds are present (an
+ * mlx weights dir / gguf wins over a CoreML package), matching the header's
+ * "MLX primary, CoreML alternate" rule. Cheap directory walk, no model load.
+ *
+ * Only the first artifact of each kind seen during the walk is remembered.
+ * `out_artifact` is cleared on entry and receives the chosen path; it stays
+ * empty when AppleRuntime::None is returned (null/empty bundle_dir, missing
+ * text/ directory, or nothing recognised inside it).
+ *
+ * Both CoreML formats are directories: a compiled `*.mlmodelc` and an
+ * `*.mlpackage` bundle. The package check therefore lives in the directory
+ * branch; the regular-file check is kept only so a path that used to match
+ * still does. */
+AppleRuntime detect_runtime(const char * bundle_dir, std::string & out_artifact) {
+    out_artifact.clear();
+    if (!bundle_dir || bundle_dir[0] == '\0') {
+        return AppleRuntime::None;
+    }
+    std::error_code ec;
+    const fs::path text_dir = fs::path(bundle_dir) / "text";
+    if (!fs::is_directory(text_dir, ec)) {
+        return AppleRuntime::None;
+    }
+
+    std::string gguf, mlpackage, mlmodelc, mlx_weights_dir;
+    // Error-code iteration throughout: a failing increment ends the walk
+    // instead of throwing across the FFI boundary.
+    for (fs::directory_iterator it(text_dir, ec), end; it != end && !ec; it.increment(ec)) {
+        const fs::path & p = it->path();
+        const std::string name = p.filename().string();
+        if (it->is_directory(ec)) {
+            if (has_suffix(name, ".mlmodelc")) {
+                if (mlmodelc.empty()) mlmodelc = p.string();
+            } else if (has_suffix(name, ".mlpackage")) {
+                // An ML package is a directory bundle, so it has to be
+                // recognised here or it is never found.
+                if (mlpackage.empty()) mlpackage = p.string();
+            } else if (name == "mlx" || fs::exists(p / "model.safetensors", ec) ||
+                       fs::exists(p / "weights.safetensors", ec)) {
+                // mlx-lm exports an `mlx` weights dir (model.safetensors + config.json).
+                if (mlx_weights_dir.empty()) mlx_weights_dir = p.string();
+            }
+        } else {
+            if (has_suffix(name, ".gguf")) {
+                if (gguf.empty()) gguf = p.string();
+            } else if (has_suffix(name, ".mlpackage")) {
+                if (mlpackage.empty()) mlpackage = p.string();
+            } else if (has_suffix(name, ".safetensors")) {
+                // Loose safetensors directly under text/: text/ is the weights dir.
+                if (mlx_weights_dir.empty()) mlx_weights_dir = text_dir.string();
+            }
+        }
+    }
+
+    // MLX primary: weights dir / safetensors first, then gguf.
+    if (!mlx_weights_dir.empty()) { out_artifact = mlx_weights_dir; return AppleRuntime::Mlx; }
+    if (!gguf.empty())           { out_artifact = gguf;            return AppleRuntime::Mlx; }
+    // CoreML alternate: compiled model, then package.
+    if (!mlmodelc.empty())       { out_artifact = mlmodelc;        return AppleRuntime::CoreMl; }
+    if (!mlpackage.empty())      { out_artifact = mlpackage;       return AppleRuntime::CoreMl; }
+    return AppleRuntime::None;
+}
+
+/* True when a Metal device (hence GPU + ANE on Apple Silicon) is present.
+ * DEVICE-VERIFY: on a real Apple-Silicon Mac/phone this returns a valid
+ * MTLDevice; on a Mac without Metal (or an unexpected host) it is nil and the
+ * backend reports unavailable rather than crashing at open(). */
+bool metal_device_present() {
+    // Only the probe's outcome matters, so the device reference is never
+    // kept; the pool scopes anything the probe autoreleases.
+    @autoreleasepool {
+        return MTLCreateSystemDefaultDevice() != nil;
+    }
+}
+
+// ===========================================================================
+// MLX-backed session (PRIMARY)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the decode graph below is structurally complete and uses the
+// real mlx-c symbols, but the exact per-layer wiring of the Gemma graph
+// (alternating local-SWA / global attention, dual head dims, shared-KV layer
+// reuse, Per-Layer-Embeddings) must be assembled + numerically validated on
+// Apple Silicon against mlx-lm's `gemma*` reference. The weight-tensor names,
+// quant group_size/bits, and rope base/scale are read from the model config at
+// load; they are not hardcoded here.
+
+class MlxLlmSession final : public LlmBackendSession {
+public:
+    /* Remembers the weights path that init() will load and copies `cfg` when
+     * one is supplied. Nothing is loaded here. */
+    MlxLlmSession(std::string artifact, const eliza_llm_stream_config_t * cfg)
+        : artifact_(std::move(artifact)) {
+        if (cfg) {
+            cfg_ = *cfg;
+            have_cfg_ = true;
+        }
+    }
+
+    // The session owns raw mlx-c handles; a copy would release them twice.
+    MlxLlmSession(const MlxLlmSession &) = delete;
+    MlxLlmSession & operator=(const MlxLlmSession &) = delete;
+
+    ~MlxLlmSession() override {
+        // mlx-c handles are value types wrapping a refcounted ctx; freeing
+        // releases our reference only. Release every handle this session
+        // created — KV, logits, weights/gguf and the stream handle.
+        free_kv();
+        free_logits();
+        free_weights();
+        if (have_stream_) {
+            mlx_stream_free(gpu_stream_);
+            have_stream_ = false;
+        }
+    }
+
+    /* Load weights + build the resident graph. Returns ELIZA_OK or negative.
+     *
+     * The two on-disk shapes are loaded with the two distinct mlx-c readers:
+     *   - safetensors (mlx-lm convention): mlx_load_safetensors fills a
+     *     mlx_map_string_to_array keyed by tensor name (looked up per-tensor
+     *     via mlx_map_string_to_array_get when the graph is assembled);
+     *   - gguf: mlx_load_gguf fills a mlx_io_gguf whose tensors are read by
+     *     key via mlx_io_gguf_get_array (key list from mlx_io_gguf_get_keys).
+     * We keep whichever handle we loaded resident (have_weights_ or
+     * have_gguf_); the per-tensor pulls happen inside run_forward when the
+     * Gemma graph is assembled on Metal.
+     *
+     * A failed load frees the handles it created and returns
+     * ELIZA_ERR_BUNDLE_INVALID with *out_error set. Calling init() again
+     * releases the previous load first. */
+    int init(char ** out_error) {
+        free_weights();
+        if (!have_stream_) {
+            // GPU stream (Metal). DEVICE-VERIFY: requires a Metal device.
+            gpu_stream_ = mlx_default_gpu_stream_new();
+            have_stream_ = true;
+        }
+
+        int rc;
+        if (has_suffix(artifact_, ".gguf")) {
+            gguf_ = mlx_io_gguf_new();
+            rc = mlx_load_gguf(&gguf_, artifact_.c_str(), gpu_stream_);
+            if (rc == 0) {
+                have_gguf_ = true;
+            } else {
+                // DEVICE-VERIFY: mlx_io_gguf_free is assumed from the mlx-c
+                // `_new`/`_free` pairing used by every other handle type.
+                mlx_io_gguf_free(gguf_);
+            }
+        } else {
+            // mlx weights dir / safetensors (the mlx-lm convention).
+            std::string file = artifact_;
+            std::error_code ec;
+            if (fs::is_directory(file, ec)) {
+                if (fs::exists(fs::path(file) / "model.safetensors", ec)) {
+                    file = (fs::path(file) / "model.safetensors").string();
+                } else if (fs::exists(fs::path(file) / "weights.safetensors", ec)) {
+                    file = (fs::path(file) / "weights.safetensors").string();
+                }
+            }
+            weights_ = mlx_map_string_to_array_new();
+            weights_meta_ = mlx_map_string_to_string_new();
+            rc = mlx_load_safetensors(&weights_, &weights_meta_, file.c_str(), gpu_stream_);
+            if (rc == 0) {
+                have_weights_ = true;
+            } else {
+                mlx_map_string_to_array_free(weights_);
+                mlx_map_string_to_string_free(weights_meta_);
+            }
+        }
+        if (rc != 0) {
+            mlx_set_error(out_error,
+                "[mlx-coreml] MLX failed to load weights from " + artifact_);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+
+        // DEVICE-VERIFY: parse the sibling config.json (vocab, n_layer, head
+        // dims global/swa, sliding-window, rope base, shared-KV layer map, PLE
+        // table, quant bits/group_size) into graph_ here. Mirrors
+        // mlx_lm.utils.load's config handling. Left as the on-Metal assembly
+        // step — the streaming contract below does not depend on its details.
+        return ELIZA_OK;
+    }
+
+    /* Start a new generation from `token_ids[0, num_tokens)`: copies the
+     * prompt, resets the position/generation counters and runs one forward
+     * pass over the whole prompt. Returns ELIZA_OK, ELIZA_ERR_INVALID_ARG for
+     * a session without loaded weights or an empty/oversized prompt, or
+     * whatever run_forward reports. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!weights_loaded()) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids || num_tokens == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        // mlx shapes are `int`: refuse a prompt whose length does not survive
+        // the round trip instead of building a wrapped or negative shape.
+        const int n_tokens = static_cast<int>(num_tokens);
+        if (n_tokens <= 0 || static_cast<size_t>(n_tokens) != num_tokens) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: prompt too long");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        cancel_.store(false);
+
+        // Copy the prompt (the contract says prefill copies the tokens it needs).
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // Build the [1, T] int32 input and run one forward pass that fills KV.
+        // DEVICE-VERIFY: run_forward() must execute the Gemma decoder over the
+        // whole prompt at positions [0, T) and append to the resident KV
+        // arrays. The final-position logits feed the first sampled token.
+        const int shape[2] = {1, n_tokens};
+        mlx_array input = mlx_array_new_data(prompt_.data(), shape, 2, MLX_INT32);
+        const int rc = run_forward(input, /*start_pos=*/0, &last_logits_, out_error);
+        mlx_array_free(input);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_ = n_tokens;
+        return ELIZA_OK;
+    }
+
+    /* Produce at most one token per call. All out-params are zeroed/emptied
+     * first. Returns 0 when more tokens may follow, 1 on the final step (EOS
+     * or the max-token cap), or a negative ELIZA_ERR_* code. The drafter
+     * counters are always 0 on this path. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        // No speculative drafter on the MLX path yet (M6 wires MTP).
+        if (drafter_drafted_out)  *drafter_drafted_out  = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!weights_loaded()) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        // Sample one token from last_logits_ (greedy here; temperature / top-p /
+        // top-k from cfg_ applied in sample_token).
+        // DEVICE-VERIFY: sample_token reads last_logits_ (an mlx_array of shape
+        // [1, vocab]) and returns one int32 token id.
+        int32_t next_id = 0;
+        int rc = sample_token(last_logits_, &next_id, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+
+        tokens_out[0] = next_id;
+        if (num_tokens_out) *num_tokens_out = 1;
+        generated_++;
+
+        // Detokenize the single committed token into text_out (UTF-8).
+        // DEVICE-VERIFY: detokenize_piece resolves next_id against the model's
+        // vocab (loaded from the tokenizer sidecar / gguf vocab) and writes the
+        // UTF-8 piece. Partial multi-byte pieces are buffered across calls.
+        detokenize_piece(next_id, text_out, text_cap);
+
+        const bool hit_eos = is_eos(next_id);
+        const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0)
+                                ? cfg_.max_tokens
+                                : default_max_tokens_;
+        const bool hit_cap = generated_ >= cap;
+        if (hit_eos || hit_cap) {
+            return 1; // final step
+        }
+
+        // Advance one position: forward pass for the just-sampled token only.
+        const int shape[2] = {1, 1};
+        mlx_array step_in = mlx_array_new_data(&next_id, shape, 2, MLX_INT32);
+        rc = run_forward(step_in, /*start_pos=*/n_past_, &last_logits_, out_error);
+        mlx_array_free(step_in);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_++;
+        return cancel_.load() ? ELIZA_ERR_CANCELLED : 0; // more
+    }
+
+    // Request cancellation; next() observes the flag on entry and on exit.
+    int cancel() override {
+        cancel_.store(true);
+        return ELIZA_OK;
+    }
+
+    // Drop the prompt, counters, KV and logits. The loaded weights stay.
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        free_kv();           // drop resident KV arrays
+        free_logits();
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // MLX KV is a resident pair of arrays we append to; trimming to a prefix
+        // is a tensor slice. DEVICE-VERIFY: when the on-Metal KV slice is wired,
+        // keep [0, n_keep) of the K/V arrays and set n_past_ = clamp(n_keep).
+        // Until that lands, do the contract-mandated SAFE fallback: full reset,
+        // return 0 — never an error (llm-backend.h reset_keep contract).
+        (void) n_keep;
+        reset();
+        return 0;
+    }
+
+private:
+    // True once init() has loaded either on-disk shape (safetensors or gguf).
+    bool weights_loaded() const {
+        return have_weights_ || have_gguf_;
+    }
+
+    void free_kv() {
+        if (have_kv_) {
+            mlx_array_free(kv_k_);
+            mlx_array_free(kv_v_);
+            have_kv_ = false;
+        }
+    }
+    void free_logits() {
+        if (have_logits_) {
+            mlx_array_free(last_logits_);
+            have_logits_ = false;
+        }
+    }
+    // Release whichever weight container init() left resident. Idempotent.
+    void free_weights() {
+        if (have_weights_) {
+            mlx_map_string_to_array_free(weights_);
+            mlx_map_string_to_string_free(weights_meta_);
+            have_weights_ = false;
+        }
+        if (have_gguf_) {
+            mlx_io_gguf_free(gguf_);
+            have_gguf_ = false;
+        }
+    }
+
+    /* One transformer forward pass over `input` ([1, T] int32) starting at
+     * position `start_pos`, appending to the resident KV cache and writing the
+     * final-position logits ([1, vocab]) into *out_logits.
+     *
+     * DEVICE-VERIFY: this is the Gemma decoder graph. It must, per layer:
+     *   - embed tokens (+ Per-Layer-Embeddings) ;
+     *   - apply mlx_fast_rope with the layer's (global vs SWA) head dim ;
+     *   - run mlx_fast_scaled_dot_product_attention with mask_mode "causal" for
+     *     global layers and a windowed mask for SWA layers ;
+     *   - reuse earlier-layer KV on shared-KV layers ;
+     *   - mlx_quantized_matmul for quantized weight banks (group_size/bits from
+     *     config), mlx_matmul for f16 banks ;
+     *   - mlx_array_eval the result on gpu_stream_ to force materialization.
+     * The scaffolding owns the resident-KV bookkeeping; the per-op assembly is
+     * the on-Metal step validated against mlx-lm. When it lands it must also
+     * set have_logits_ (and have_kv_), which sample_token / free_* rely on. */
+    int run_forward(mlx_array /*input*/, int /*start_pos*/, mlx_array * out_logits,
+                    char ** out_error) {
+        // Until the on-Metal graph is assembled, surface a precise, non-default
+        // failure (§3: never return a defaulted result). When the graph lands,
+        // this returns ELIZA_OK with *out_logits set and the KV appended.
+        free_logits();
+        (void) out_logits;
+        mlx_set_error(out_error,
+            "[mlx-coreml] MLX Gemma decode graph not assembled on this build "
+            "(DEVICE-VERIFY: requires Apple Silicon)");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    /* Greedy-sample one token id from `logits` into *out_id. Returns
+     * ELIZA_ERR_INVALID_ARG when no logits are resident and
+     * ELIZA_ERR_FFI_FAULT when any mlx-c call fails. */
+    int sample_token(mlx_array logits, int32_t * out_id, char ** out_error) {
+        if (!have_logits_) {
+            mlx_set_error(out_error, "[mlx-coreml] no logits to sample");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        // DEVICE-VERIFY: apply cfg_.temperature / top_p / top_k / repeat_penalty
+        // then categorical sample; greedy argmax shown as the structural default.
+        mlx_array arg = mlx_array_new();
+        if (mlx_argmax_axis(&arg, logits, /*axis=*/-1, /*keepdims=*/false, gpu_stream_) != 0) {
+            mlx_array_free(arg);
+            mlx_set_error(out_error, "[mlx-coreml] argmax failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        // A failed evaluation must not be read back as a token id.
+        if (mlx_array_eval(arg) != 0) {
+            mlx_array_free(arg);
+            mlx_set_error(out_error, "[mlx-coreml] argmax evaluation failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        int32_t id = 0;
+        const int rc = mlx_array_item_int32(&id, arg);
+        mlx_array_free(arg);
+        if (rc != 0) {
+            mlx_set_error(out_error, "[mlx-coreml] failed to read sampled token");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        *out_id = id;
+        return ELIZA_OK;
+    }
+
+    bool is_eos(int32_t id) const {
+        // DEVICE-VERIFY: compare against the model's EOS / <end_of_turn> ids
+        // (Gemma uses <end_of_turn>) read from the tokenizer config at load.
+        // eos_id_ stays -1 until that is wired.
+        return id == eos_id_;
+    }
+
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        // DEVICE-VERIFY: resolve the token piece from the loaded vocab and copy
+        // its UTF-8 bytes (buffering partial code points across calls). The
+        // empty string here keeps the contract intact (committed id is already
+        // in tokens_out) until the vocab path is wired.
+        if (text_out && text_cap) {
+            text_out[0] = '\0';
+        }
+    }
+
+    std::string artifact_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    // Stream handle created by init(); have_stream_ guards the matching free.
+    mlx_stream gpu_stream_{};
+    bool have_stream_ = false;
+
+    // Exactly one of the two weight containers is resident after init().
+    mlx_map_string_to_array weights_{};
+    mlx_map_string_to_string weights_meta_{};
+    bool have_weights_ = false;
+    mlx_io_gguf gguf_{};
+    bool have_gguf_ = false;
+
+    mlx_array kv_k_{};
+    mlx_array kv_v_{};
+    bool have_kv_ = false;
+
+    mlx_array last_logits_{};
+    bool have_logits_ = false;
+
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// CoreML-backed session (ALTERNATE — ANE-bound, stateful MLState KV cache)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the converted decoder package must expose (a) an input
+// feature for the current token id(s) and position, (b) an MLState-backed KV
+// cache, and (c) a logits output. Apple's "On-Device Llama 3.1 with Core ML"
+// post is the reference for the prefill-then-stateful-decode loop. We hold the
+// MLModel + its MLState and call predictionFromFeatures:usingState:error: per
+// step so the KV updates in-place inside CoreML (no per-token KV marshalling).
+
+class CoreMlLlmSession final : public LlmBackendSession {
+public:
+    /* Remembers the package path that init() will load and copies `cfg` when
+     * one is supplied. No CoreML object is created here. */
+    CoreMlLlmSession(std::string package, const eliza_llm_stream_config_t * cfg)
+        : package_(std::move(package)) {
+        if (cfg == nullptr) {
+            return; // no caller config: have_cfg_ is left untouched
+        }
+        cfg_ = *cfg;
+        have_cfg_ = true;
+    }
+
+    // Clears the MLState reference before the MLModel reference; the pool
+    // scopes anything autoreleased while they go away.
+    // NOTE(review): assigning nil only releases the objects when this file is
+    // built with ARC (-fobjc-arc) — confirm the build flags.
+    ~CoreMlLlmSession() override {
+        @autoreleasepool {
+            state_ = nil;
+            model_ = nil;
+        }
+    }
+
+    /* Load the CoreML decoder at package_ and create its MLState KV cache.
+     *
+     * A `*.mlpackage` is compiled first; anything else is handed to MLModel
+     * as-is. Returns ELIZA_OK on success. On failure *out_error is set and the
+     * result is ELIZA_ERR_BUNDLE_INVALID (bad path, compile/load failure, or a
+     * model without state) or ELIZA_ERR_NOT_IMPLEMENTED (the OS predates the
+     * stateful prediction API). No Objective-C exception is allowed to escape
+     * for a malformed path. */
+    int init(char ** out_error) {
+        @autoreleasepool {
+            // NSError text may be missing; never hand a null char* to std::string.
+            const auto describe = [](NSError * e) -> std::string {
+                const char * text = e ? e.localizedDescription.UTF8String : nullptr;
+                return text ? text : "unknown";
+            };
+
+            // stringWithUTF8String: yields nil for bytes that are not valid
+            // UTF-8, and fileURLWithPath: raises on nil — fail cleanly instead.
+            NSString * path = [NSString stringWithUTF8String:package_.c_str()];
+            if (path == nil || path.length == 0) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML package path is empty or not valid UTF-8");
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+
+            NSError * err = nil;
+            NSURL * url = [NSURL fileURLWithPath:path];
+
+            NSURL * compiled = url;
+            // A *.mlpackage must be compiled to *.mlmodelc before loading; a
+            // *.mlmodelc loads directly. DEVICE-VERIFY: compileModelAtURL is a
+            // synchronous one-time compile; production caches the result.
+            if ([path hasSuffix:@".mlpackage"]) {
+                NSURL * c = [MLModel compileModelAtURL:url error:&err];
+                if (!c) {
+                    mlx_set_error(out_error,
+                        "[mlx-coreml] CoreML compile failed: " + describe(err));
+                    return ELIZA_ERR_BUNDLE_INVALID;
+                }
+                compiled = c;
+            }
+
+            MLModelConfiguration * conf = [[MLModelConfiguration alloc] init];
+            // DEVICE-VERIFY: .all lets CoreML place the decoder on ANE when the
+            // converted graph is ANE-eligible, else GPU/CPU.
+            conf.computeUnits = MLComputeUnitsAll;
+
+            model_ = [MLModel modelWithContentsOfURL:compiled
+                                       configuration:conf
+                                               error:&err];
+            if (!model_) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML model load failed: " + describe(err));
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+
+            // newState vends zeroed KV buffers; MLState is +new/-init
+            // UNAVAILABLE — only MLModel produces it (macOS 15 / iOS 18).
+            // On an older OS the selector does not exist, so guard the call
+            // rather than letting it raise.
+            if (@available(macOS 15.0, iOS 18.0, *)) {
+                state_ = [model_ newState];
+            } else {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML stateful decode requires macOS 15 / iOS 18 or newer");
+                return ELIZA_ERR_NOT_IMPLEMENTED;
+            }
+            if (!state_) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML model has no stateful KV cache "
+                    "(newState returned nil) — needs a stateful decoder package");
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            return ELIZA_OK;
+        }
+    }
+
+    /* Start a new generation: validate the session and the prompt, copy the
+     * prompt, zero the counters, then hand the whole prompt to run_step at
+     * position 0. Returns ELIZA_ERR_INVALID_ARG for an un-initialised session
+     * or an empty prompt, otherwise whatever run_step returns. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        const bool ready = model_ && state_;
+        if (!ready) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        const bool prompt_missing = !token_ids || num_tokens == 0;
+        if (prompt_missing) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        // A fresh prompt clears any pending cancel and restarts the counters.
+        cancel_.store(false);
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // DEVICE-VERIFY: the whole prompt goes in as one prediction covering
+        // positions [0, T), so CoreML fills the MLState KV in a single pass and
+        // the final-position logits are kept for the first sampled token. The
+        // feature names ("input_ids", "position", "logits") come from the
+        // converted model's MLModelDescription — read them from
+        // model_.modelDescription.
+        const int32_t * const first_token = prompt_.data();
+        const size_t prompt_len = prompt_.size();
+        return run_step(first_token, prompt_len, /*start_pos=*/0, out_error);
+    }
+
+    /* Emit one token per call.
+     *
+     * All output slots are zeroed up front (token count, text, and the two
+     * drafter counters, which are never set to anything else on this path).
+     * The token is sampled via sample_from_last_logits(); unless it ends the
+     * stream it is then fed back through run_step() at position n_past_.
+     *
+     * Returns 1 when the emitted token is final (EOS, or the token budget is
+     * reached), 0 when more tokens may follow, ELIZA_ERR_CANCELLED when a
+     * cancel is pending, ELIZA_ERR_INVALID_ARG for a missing session or token
+     * buffer, or the code of whichever sampling/decode step failed. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out != nullptr) {
+            *num_tokens_out = 0;
+        }
+        if (text_out != nullptr && text_cap != 0) {
+            text_out[0] = '\0';
+        }
+        if (drafter_drafted_out != nullptr) {
+            *drafter_drafted_out = 0;
+        }
+        if (drafter_accepted_out != nullptr) {
+            *drafter_accepted_out = 0;
+        }
+
+        if (!(model_ && state_)) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (tokens_out == nullptr || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        // Draw the token first; nothing is written to the caller on failure.
+        int32_t sampled = 0;
+        const int sample_rc = sample_from_last_logits(&sampled, out_error);
+        if (sample_rc != ELIZA_OK) {
+            return sample_rc;
+        }
+
+        tokens_out[0] = sampled;
+        if (num_tokens_out != nullptr) {
+            *num_tokens_out = 1;
+        }
+        ++generated_;
+        detokenize_piece(sampled, text_out, text_cap);
+
+        // A positive per-stream max_tokens overrides the built-in default cap.
+        int32_t budget = default_max_tokens_;
+        if (have_cfg_ && cfg_.max_tokens > 0) {
+            budget = cfg_.max_tokens;
+        }
+        if (is_eos(sampled) || generated_ >= budget) {
+            return 1; // final
+        }
+
+        // One stateful decode step for the token that was just handed out.
+        const int step_rc = run_step(&sampled, 1, /*start_pos=*/n_past_, out_error);
+        if (step_rc != ELIZA_OK) {
+            return step_rc;
+        }
+        ++n_past_;
+
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        return 0; // more
+    }
+
+    /* Request cancellation by raising the flag that next() checks.
+     * Always returns ELIZA_OK. */
+    int cancel() override {
+        cancel_ = true; // atomic assignment, equivalent to cancel_.store(true)
+        return ELIZA_OK;
+    }
+
+    /* Drop all per-conversation state: pending cancel, stored prompt, and the
+     * position/generation counters. When a model is loaded, the MLState is
+     * replaced with a newly vended one. Always returns ELIZA_OK. */
+    int reset() override {
+        cancel_.store(false);
+        generated_ = 0;
+        n_past_ = 0;
+        prompt_.clear();
+
+        if (model_) {
+            @autoreleasepool {
+                // A fresh MLState zeroes the KV cache — the canonical CoreML reset.
+                state_ = [model_ newState];
+            }
+        }
+        return ELIZA_OK;
+    }
+
+    /* Prefix-preserving reset. MLState is opaque and offers no public way to
+     * truncate the KV to a prefix, so `n_keep` is ignored: this performs a
+     * full reset() and, per the llm-backend.h contract, returns 0 rather than
+     * an error. */
+    int reset_keep(int32_t n_keep) override {
+        static_cast<void>(n_keep);
+        static_cast<void>(reset());
+        return 0;
+    }
+
+private:
+    /* Intended to run one prediction (`n` tokens starting at `start_pos`)
+     * through the stateful model, updating the MLState KV in place and caching
+     * the final-position logits. DEVICE-VERIFY: the real implementation builds
+     * an MLFeatureProvider from the converted model's input descriptions and
+     * reads the logits MLMultiArray from the output provider.
+     *
+     * On this build it is a placeholder: every argument except `out_error` is
+     * ignored and the call always fails with ELIZA_ERR_NOT_IMPLEMENTED. */
+    int run_step(const int32_t * /*tokens*/, size_t /*n*/, int /*start_pos*/,
+                 char ** out_error) {
+        // Feature names are model-specific and can only be read off a real
+        // converted package, so report a precise failure (§3) instead of a
+        // defaulted success. Once wired, this is where
+        // predictionFromFeatures:usingState:error: runs and the logits are kept.
+        const char * const not_bound =
+            "[mlx-coreml] CoreML stateful decode not bound to a converted "
+            "decoder package on this build (DEVICE-VERIFY: requires a stateful "
+            "*.mlmodelc and Apple Silicon)";
+        mlx_set_error(out_error, not_bound);
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    /* Placeholder sampler. DEVICE-VERIFY: the real version picks a token
+     * (argmax / temperature sampling) from the cached logits MLMultiArray.
+     * Until run_step() populates those logits this never writes the output id
+     * and always fails with ELIZA_ERR_NOT_IMPLEMENTED. */
+    int sample_from_last_logits(int32_t * /*out_id*/, char ** out_error) {
+        const char * const no_logits = "[mlx-coreml] no CoreML logits to sample";
+        mlx_set_error(out_error, no_logits);
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    // True when `id` matches the session's end-of-sequence token id (eos_id_).
+    bool is_eos(int32_t id) const {
+        return eos_id_ == id;
+    }
+
+    /* Placeholder detokenizer: the token id is ignored and the caller's text
+     * buffer, when one with non-zero capacity is supplied, is set to the empty
+     * string. A null buffer or zero capacity is left untouched. */
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        if (text_out == nullptr || text_cap == 0) {
+            return;
+        }
+        text_out[0] = '\0';
+    }
+
+    std::string package_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    MLModel * model_ = nil;
+    MLState * state_ = nil;
+
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// Factory (real)
+// ===========================================================================
+
+/* Factory for the Apple in-process backends. It inspects a bundle with
+ * detect_runtime() and opens either an MLX session or a CoreML session,
+ * depending on which runtime that probe reports. */
+class MlxCoreMlFactory final : public LlmBackendFactory {
+public:
+    // Backend id string reported to the selector.
+    const char * name() const override {
+        return "mlx-coreml";
+    }
+
+    // Compiled in (we are inside the gate) AND a Metal device is present.
+    // DEVICE-VERIFY: true on Apple Silicon; false on a Mac without Metal.
+    bool available() const override {
+        return metal_device_present();
+    }
+
+    // A bundle is servable when detect_runtime() reports anything but None.
+    // The artifact path it fills in is discarded here.
+    bool can_serve(const char * bundle_dir) const override {
+        std::string ignored_artifact;
+        const AppleRuntime found = detect_runtime(bundle_dir, ignored_artifact);
+        return found != AppleRuntime::None;
+    }
+
+    // Highest on Apple Silicon: the in-process Metal/ANE path beats the
+    // in-tree llama.cpp Metal path for the Gemma geometry. > LiteRT(0 here).
+    int preference_rank() const override {
+        return 100;
+    }
+
+    /* Open a session for the bundle attached to `ctx`.
+     *
+     * The bundle root comes from the context accessor (the struct is otherwise
+     * opaque here); detect_runtime() then chooses MLX vs CoreML and supplies
+     * the artifact path handed to the session constructor. Returns the
+     * initialised session, or nullptr — with the reason reported through
+     * out_error — when the context has no bundle dir, the bundle carries no
+     * MLX/CoreML text artifact, or the session's init() fails. */
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        std::string bundle;
+        if (const char * dir = llm_backend_context_bundle_dir(ctx)) {
+            bundle = dir;
+        }
+        if (bundle.empty()) {
+            mlx_set_error(out_error,
+                "[mlx-coreml] open: context has no bundle dir");
+            return nullptr;
+        }
+
+        // Shared tail for both runtimes: initialise the freshly built session
+        // and destroy it again if initialisation does not succeed.
+        auto init_or_discard = [out_error](auto * session) -> LlmBackendSession * {
+            const int rc = session->init(out_error);
+            if (rc != ELIZA_OK) {
+                delete session;
+                return nullptr;
+            }
+            return session;
+        };
+
+        std::string artifact;
+        switch (detect_runtime(bundle.c_str(), artifact)) {
+            case AppleRuntime::Mlx:
+                return init_or_discard(new MlxLlmSession(artifact, cfg));
+            case AppleRuntime::CoreMl:
+                return init_or_discard(new CoreMlLlmSession(artifact, cfg));
+            default:
+                break;
+        }
+
+        mlx_set_error(out_error,
+            "[mlx-coreml] open: bundle has no MLX/CoreML text artifact under text/");
+        return nullptr;
+    }
+};
+
+MlxCoreMlFactory g_factory;
+
+} // namespace
+
+/* Accessor for the real MLX/CoreML factory on gated Apple builds. It returns
+ * the address of the namespace-scope g_factory object, so the caller must not
+ * delete the result. */
+LlmBackendFactory * mlx_coreml_backend_factory() {
+    return &g_factory;
+}
+
+// ===========================================================================
+// STUB IMPLEMENTATION — every non-Apple / gate-OFF build
+// ===========================================================================
+#else // !(ELIZA_ENABLE_MLX && __APPLE__)
+
+namespace {
+
+/* No SDK header is included on this path, so the file compiles on a plain
+ * Linux host. The factory reports itself unavailable and refuses to open. */
+class MlxCoreMlStubFactory final : public LlmBackendFactory {
+public:
+    // Reports the same id string as the real factory.
+    const char * name() const override {
+        return "mlx-coreml";
+    }
+
+    // The backend is not compiled in, so it is never available...
+    bool available() const override {
+        return false;
+    }
+
+    // ...and cannot serve any bundle, whatever the bundle contains.
+    bool can_serve(const char * /*bundle_dir*/) const override {
+        return false;
+    }
+
+    int preference_rank() const override {
+        return 0;
+    }
+
+    // Always refuses: reports why through out_error and returns nullptr.
+    LlmBackendSession * open(EliInferenceContext * /*ctx*/,
+                             const eliza_llm_stream_config_t * /*cfg*/,
+                             char ** out_error) override {
+        const char * const reason =
+            "[mlx-coreml] backend not compiled in "
+            "(needs -DELIZA_ENABLE_MLX on Apple Silicon)";
+        mlx_set_error(out_error, reason);
+        return nullptr;
+    }
+};
+
+MlxCoreMlStubFactory g_stub_factory;
+
+} // namespace
+
+/* Stub-build counterpart of the factory accessor: returns the address of the
+ * namespace-scope g_stub_factory, whose available()/can_serve() are always
+ * false. The caller must not delete the result. */
+LlmBackendFactory * mlx_coreml_backend_factory() {
+    return &g_stub_factory;
+}
+
+#endif // ELIZA_ENABLE_MLX && __APPLE__
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index 345c87cb0..e35445169 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -14,6 +14,12 @@
 // resolve `eliza_inference_*` symbols from this object.
 
 #include "eliza-inference-ffi.h"
+#include "llm-backend.h"
+#include "embed-backend.h"
+#include "vision-backend.h"
+#include "asr-backend.h"
+#include "tts-backend.h"
+#include "eot-backend.h"
 #include "omnivoice.h"
 #include "llama.h"
 #include "mtmd.h"
@@ -173,6 +179,13 @@ struct EliInferenceContext {
 #endif
 };
 
+/* M3 seam accessor (declared in llm-backend.h): gives a backend's open() the
+ * bundle root without exposing the struct layout. It lives here because this
+ * is where EliInferenceContext is a complete type. Returns nullptr for a null
+ * context; otherwise the pointer refers to storage owned by ctx->bundle_dir,
+ * so it is only usable while that string is alive and unmodified. */
+const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx) {
+    if (ctx == nullptr) {
+        return nullptr;
+    }
+    return ctx->bundle_dir.c_str();
+}
+
 /* ELZ2 magic 'ELZ1' (the ascii bytes 'E','L','Z','1' little-endian).
  * The magic stays 'ELZ1' across format versions — only the version
  * word at offset 4 changes between v1 and v2. */
@@ -1135,6 +1148,11 @@ static void reset_engine(Engine * e) {
 
 struct EliLlmStream {
     EliInferenceContext * ctx = nullptr;
+    /* Multi-backend seam (M3): when non-NULL, this session is driven by an
+     * alternate in-process runtime (LiteRT-LM / MLX-CoreML) and the llama.cpp
+     * fields below (lctx/sampler/mtp) are unused — every FFI streaming entry
+     * delegates to `backend` and returns before touching the llama.cpp path. */
+    LlmBackendSession * backend = nullptr;
     llama_context * lctx = nullptr;
     llama_sampler * sampler = nullptr;
     int n_past = 0;
@@ -1867,6 +1885,24 @@ int eliza_inference_tts_synthesize(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: a TTS backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/tts/*; otherwise fall through to the in-tree OmniVoice path
+     * below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        TtsBackendFactory * be =
+            tts_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->tts_synthesize(ctx, text, text_len, speaker_preset_id,
+                                      out_pcm, max_samples, out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->tts_mutex);
     if (!ctx->ov) {
         eliza_set_error(out_error, "[libelizainference] tts_synthesize: TTS region is not acquired; call mmap_acquire(\"tts\") after arming voice");
@@ -2068,6 +2104,25 @@ int eliza_inference_asr_transcribe(
         eliza_set_error(out_error, "[libelizainference] asr_transcribe: invalid arguments");
         return ELIZA_ERR_INVALID_ARG;
     }
+
+    /* Per-op backend seam: an ASR backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/asr/*; otherwise fall through to the in-tree ggml path
+     * below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        AsrBackendFactory * be =
+            asr_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->asr_transcribe(ctx, pcm, n_samples, sample_rate_hz,
+                                      out_text, max_text_bytes, out_error);
+        }
+    }
+
     std::string transcript;
     int rc = eliza_asr_decode_core(ctx, pcm, n_samples, sample_rate_hz, max_text_bytes, transcript, out_error);
     if (rc < 0) {
@@ -2887,6 +2942,40 @@ EliLlmStream * eliza_inference_llm_stream_open(
         return nullptr;
     }
 
+    /* Multi-backend seam (M3): an alternate in-process runtime (LiteRT-LM /
+     * MLX-CoreML) may serve this bundle. The selector returns nullptr with NO
+     * error to keep the in-tree llama.cpp path below; nullptr WITH an error is a
+     * hard env-select failure to propagate. */
+    {
+        char * sel_err = nullptr;
+        LlmBackendFactory * factory =
+            llm_backend_select(ctx->bundle_dir.c_str(), cfg, &sel_err);
+        if (!factory && sel_err) {
+            if (out_error) {
+                *out_error = sel_err;
+            } else {
+                eliza_inference_free_string(sel_err);
+            }
+            return nullptr;
+        }
+        if (factory) {
+            EliLlmStream * bstream = new (std::nothrow) EliLlmStream();
+            if (!bstream) {
+                eliza_set_error(out_error,
+                    "[libelizainference] llm_stream_open: out of memory");
+                return nullptr;
+            }
+            bstream->ctx = ctx;
+            bstream->max_tokens = cfg->max_tokens > 0 ? cfg->max_tokens : 0;
+            bstream->backend = factory->open(ctx, cfg, out_error);
+            if (!bstream->backend) {
+                delete bstream;
+                return nullptr;
+            }
+            return bstream;
+        }
+    }
+
     llama_model * model = nullptr;
     {
         std::lock_guard<std::mutex> lock(ctx->llm_mutex);
@@ -2988,6 +3077,9 @@ int eliza_inference_llm_stream_prefill(
     const int32_t * token_ids,
     size_t num_tokens,
     char ** out_error) {
+    if (stream && stream->backend) {
+        return stream->backend->prefill(token_ids, num_tokens, out_error);
+    }
     if (!stream || (!stream->lctx && !stream->mtp)) {
         eliza_set_error(out_error,
             "[libelizainference] llm_stream_prefill: invalid session");
@@ -3056,6 +3148,11 @@ int eliza_inference_llm_stream_next(
     if (drafter_accepted_out) *drafter_accepted_out = 0;
     if (text_out && text_cap > 0) text_out[0] = '\0';
 
+    if (stream && stream->backend) {
+        return stream->backend->next(tokens_out, tokens_cap, num_tokens_out,
+                                     text_out, text_cap, drafter_drafted_out,
+                                     drafter_accepted_out, out_error);
+    }
     if (!stream || (!stream->mtp && (!stream->lctx || !stream->sampler))) {
         eliza_set_error(out_error,
             "[libelizainference] llm_stream_next: invalid session");
@@ -3245,6 +3342,9 @@ int eliza_inference_llm_stream_next(
 }
 
 int eliza_inference_llm_stream_cancel(EliLlmStream * stream) {
+    if (stream && stream->backend) {
+        return stream->backend->cancel();
+    }
     if (stream) {
         stream->cancel.store(true, std::memory_order_release);
     }
@@ -3255,6 +3355,9 @@ int eliza_inference_llm_stream_save_slot(
     EliLlmStream * stream,
     const char * filename,
     char ** out_error) {
+    if (stream && stream->backend) {
+        return stream->backend->save_slot(filename, out_error);
+    }
     (void) stream;
     (void) filename;
     /* v1: cross-launch slot KV persistence is not wired. Return a structured
@@ -3269,6 +3372,9 @@ int eliza_inference_llm_stream_restore_slot(
     EliLlmStream * stream,
     const char * filename,
     char ** out_error) {
+    if (stream && stream->backend) {
+        return stream->backend->restore_slot(filename, out_error);
+    }
     (void) stream;
     (void) filename;
     eliza_set_error(out_error,
@@ -3285,6 +3391,7 @@ int eliza_inference_llm_stream_reset(EliLlmStream * stream) {
      * created/destroyed repeatedly. Handles both the plain fixed-KV stream and
      * the MTP speculative engine (which owns its own target/draft KV). */
     if (!stream) return ELIZA_ERR_INVALID_ARG;
+    if (stream->backend) return stream->backend->reset();
     if (!stream->mtp && !stream->lctx) return ELIZA_ERR_INVALID_ARG;
     if (stream->mtp) {
         /* MTP stream: clear both the target and draft KV caches, reset the
@@ -3319,6 +3426,7 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep)
      * separate (riskier) handling — prefix-reuse mode opens the resident stream
      * without MTP, trading MTP's ~1.5x decode for the much larger prefill cut. */
     if (!stream) return ELIZA_ERR_INVALID_ARG;
+    if (stream->backend) return stream->backend->reset_keep(n_keep);
     if (stream->mtp || !stream->lctx) return ELIZA_ERR_INVALID_ARG;
     if (n_keep < 0) n_keep = 0;
     if (n_keep > stream->n_past) n_keep = stream->n_past;
@@ -3339,6 +3447,10 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep)
 
 void eliza_inference_llm_stream_close(EliLlmStream * stream) {
     if (!stream) return;
+    if (stream->backend) {
+        delete stream->backend;
+        stream->backend = nullptr;
+    }
     if (stream->mtp) {
         eliza_mtp::free_engine(stream->mtp);
         stream->mtp = nullptr;
@@ -3435,6 +3547,24 @@ int eliza_inference_embed(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: an embedding backend (e.g. LiteRT/NPU) serves this
+     * when it ships <bundle>/embedding/*; otherwise fall through to the in-tree
+     * ggml encoder below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        EmbedBackendFactory * be =
+            embed_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->embed(ctx, text, text_len, pooling, out_embedding,
+                             out_capacity, out_dim, out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->llm_mutex);
     int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
     if (rc != ELIZA_OK) return rc;
@@ -3569,6 +3699,25 @@ int eliza_inference_llm_eot_score(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: an EOT backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/eot/*; otherwise fall through to the in-tree ggml
+     * causal-scoring path below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        EotBackendFactory * be =
+            eot_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->eot_score(ctx, token_ids, num_tokens, target_token_id,
+                                 out_target_prob, out_top_token, out_top_prob,
+                                 out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->llm_mutex);
     int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
     if (rc != ELIZA_OK) return rc;
@@ -3730,6 +3879,24 @@ int eliza_inference_describe_image(
         return ELIZA_ERR_INVALID_ARG;
     }
 
+    /* Per-op backend seam: a vision backend (e.g. LiteRT/NPU) serves this when it
+     * ships <bundle>/vision/*; otherwise fall through to the in-tree ggml mmproj
+     * path below. Inert by default (no backend registered). */
+    {
+        char * be_error = nullptr;
+        VisionBackendFactory * be =
+            vision_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+        if (be_error) {
+            eliza_set_error(out_error, std::string(be_error));
+            std::free(be_error);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+        if (be) {
+            return be->describe_image(ctx, image_bytes, n_bytes, mmproj_path,
+                                      prompt, out_text, max_text_bytes, out_error);
+        }
+    }
+
     std::lock_guard<std::mutex> lock(ctx->llm_mutex);
     int rc = eliza_load_llm_model_locked(ctx, /* n_gpu_layers= */ -1, out_error);
     if (rc != ELIZA_OK) return rc;
diff --git a/tools/omnivoice/src/embed-backend-selector.cpp b/tools/omnivoice/src/embed-backend-selector.cpp
new file mode 100644
index 000000000..56449fb07
--- /dev/null
+++ b/tools/omnivoice/src/embed-backend-selector.cpp
@@ -0,0 +1,41 @@
+/*
+ * embed-backend-selector.cpp — registry + selection for the per-op embedding
+ * backend seam. A thin instantiation of eliza_backend::Registry<EmbedBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: with no -DELIZA_ENABLE_* embedding backend
+ * compiled in, nothing registers and embed_backend_select() returns nullptr, so
+ * eliza_inference_embed keeps the in-tree ggml encoder path.
+ */
+
+#include "embed-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+/* Gated factory accessor — declared only when the backend is compiled in. */
+#ifdef ELIZA_ENABLE_LITERT
+EmbedBackendFactory * litert_embed_backend_factory();
+#endif
+
+namespace {
+eliza_backend::Registry<EmbedBackendFactory> g_registry;
+std::once_flag                               g_builtins_once;
+} // namespace
+
+/* Add `factory` to the embedding-backend registry. This is a straight forward
+ * to Registry::register_factory; any de-duplication or null handling happens
+ * there, not in this wrapper. */
+void embed_backend_register(EmbedBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+/* Register the embedding backends compiled into this build. std::call_once
+ * makes the body run a single time however often this is called. */
+void embed_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* Only the LiteRT factory is wired here; without the gate the lambda
+         * body is empty and nothing is registered. */
+#ifdef ELIZA_ENABLE_LITERT
+        embed_backend_register(litert_embed_backend_factory());
+#endif
+    });
+}
+
+/* Resolve the embedding backend for `bundle_dir`. Built-in backends are
+ * registered first, then the shared registry decides, consulting the per-op
+ * variable ELIZA_EMBED_BACKEND and the global ELIZA_BACKEND. The result and
+ * `out_error` are passed through from Registry::select unchanged. */
+EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error) {
+    embed_backend_register_builtins();
+
+    EmbedBackendFactory * const chosen = g_registry.select(
+        "ELIZA_EMBED_BACKEND", "ELIZA_BACKEND", "embed", bundle_dir, out_error);
+    return chosen;
+}
diff --git a/tools/omnivoice/src/embed-backend.h b/tools/omnivoice/src/embed-backend.h
new file mode 100644
index 000000000..23473a648
--- /dev/null
+++ b/tools/omnivoice/src/embed-backend.h
@@ -0,0 +1,62 @@
+#pragma once
+/*
+ * embed-backend.h — per-op backend seam for pooled text embeddings.
+ *
+ * The first per-op generalization of the M3 streaming-LLM seam: a one-shot op
+ * (eliza_inference_embed) that an accelerator backend can serve when it ships an
+ * embedding artifact under `<bundle>/embedding/`, while every other op — and
+ * embedding itself when no artifact is present — stays on the in-tree ggml path.
+ *
+ * Embedding is the natural first LiteRT/NPU target: a static-shape, encoder-only
+ * forward with no streaming/KV/sampler, so the factory mirrors the FFI 1:1 and
+ * the FFI delegates without translation. Selection reuses the shared
+ * eliza_backend::Registry (backend-registry.h): ELIZA_EMBED_BACKEND (per-op) then
+ * ELIZA_BACKEND (global) hard-select, else the highest preference_rank among
+ * available()+can_serve() factories, else nullptr (the ggml encoder path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* Interface implemented once per linked-in embedding runtime (e.g. LiteRT).
+ * Instances are handed to embed_backend_register() and later returned by
+ * embed_backend_select(). */
+struct EmbedBackendFactory {
+    virtual ~EmbedBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_EMBED_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* True when the backend is compiled in AND its host dependencies are
+     * present (the LiteRT runtime + a GPU/NPU delegate). Must be cheap: an
+     * implementation must not load a model here. */
+    virtual bool available() const = 0;
+
+    /* True when the embedding artifact exists under `<bundle_dir>/embedding/`.
+     * A cheap directory probe; no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served embedding returns a high positive value; a GPU-delegate
+     * fallback a lower positive value. The default implementation returns 0. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_embed 1:1. On success returns ELIZA_OK, stores
+     * the embedding dimension in `*out_dim` and writes that many floats to
+     * out_embedding. On failure returns a negative ELIZA_* code with
+     * `*out_error` heap-allocated for the caller to free.
+     * NOTE(review): out_capacity is presumably the number of floats
+     * out_embedding can hold and must be at least the embedding dimension —
+     * confirm against eliza_inference_embed. */
+    virtual int embed(EliInferenceContext * ctx, const char * text, size_t text_len,
+                      int pooling, float * out_embedding, size_t out_capacity,
+                      int * out_dim, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void embed_backend_register(EmbedBackendFactory * factory);
+
+/* Register every embedding backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by embed_backend_select. */
+void embed_backend_register_builtins();
+
+/* Pick an embedding backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml encoder path. nullptr + *out_error => hard failure. */
+EmbedBackendFactory * embed_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/eot-backend-selector.cpp b/tools/omnivoice/src/eot-backend-selector.cpp
new file mode 100644
index 000000000..32bb9fe65
--- /dev/null
+++ b/tools/omnivoice/src/eot-backend-selector.cpp
@@ -0,0 +1,35 @@
+/*
+ * eot-backend-selector.cpp — registry + selection for the per-op end-of-turn
+ * scoring backend seam. A thin instantiation of
+ * eliza_backend::Registry<EotBackendFactory> (backend-registry.h) — the
+ * resolution logic is shared with every other modality. Inert by default: no
+ * -DELIZA_ENABLE_* EOT backend is compiled in (none exists yet), so nothing
+ * registers and eot_backend_select() returns nullptr, so
+ * eliza_inference_llm_eot_score keeps the in-tree ggml causal-scoring path.
+ */
+
+#include "eot-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<EotBackendFactory> g_registry;
+std::once_flag                             g_builtins_once;
+} // namespace
+
+/* Add `factory` to the EOT-backend registry. This is a straight forward to
+ * Registry::register_factory; any de-duplication or null handling happens
+ * there, not in this wrapper. */
+void eot_backend_register(EotBackendFactory * factory) {
+    g_registry.register_factory(factory);
+}
+
+/* Register the EOT backends compiled into this build. The std::call_once
+ * scaffold is already in place, but its body is currently empty, so calling
+ * this registers nothing. */
+void eot_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* No EOT backend exists yet — the seam stays inert. */
+    });
+}
+
+/* Resolve the EOT-scoring backend for `bundle_dir`. Built-in backends are
+ * registered first, then the shared registry decides, consulting the per-op
+ * variable ELIZA_EOT_BACKEND and the global ELIZA_BACKEND. The result and
+ * `out_error` are passed through from Registry::select unchanged. */
+EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error) {
+    eot_backend_register_builtins();
+
+    EotBackendFactory * const chosen = g_registry.select(
+        "ELIZA_EOT_BACKEND", "ELIZA_BACKEND", "eot", bundle_dir, out_error);
+    return chosen;
+}
diff --git a/tools/omnivoice/src/eot-backend.h b/tools/omnivoice/src/eot-backend.h
new file mode 100644
index 000000000..1c51dcbb6
--- /dev/null
+++ b/tools/omnivoice/src/eot-backend.h
@@ -0,0 +1,62 @@
+#pragma once
+/*
+ * eot-backend.h — per-op backend seam for end-of-turn scoring.
+ *
+ * A one-shot op (eliza_inference_llm_eot_score) that an accelerator backend can
+ * serve when it ships an EOT artifact under `<bundle>/eot/`, while every other
+ * op — and EOT itself when no artifact is present — stays on the in-tree ggml
+ * causal-scoring path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_EOT_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml EOT-scoring path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+#include <cstdint>
+
+struct EliInferenceContext;
+
+/* Interface implemented once per linked-in EOT runtime (e.g. LiteRT).
+ * Instances are handed to eot_backend_register() and later returned by
+ * eot_backend_select(). */
+struct EotBackendFactory {
+    virtual ~EotBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_EOT_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* True when the backend is compiled in AND its host dependencies are
+     * present (the runtime + a GPU/NPU delegate). Must be cheap: an
+     * implementation must not load a model here. */
+    virtual bool available() const = 0;
+
+    /* True when the EOT artifact exists under `<bundle_dir>/eot/`. A cheap
+     * directory probe; no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served EOT returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. The default implementation returns 0. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_llm_eot_score 1:1. On success returns ELIZA_OK and
+     * writes the next-token probabilities to the out_* parameters; on failure
+     * returns a negative ELIZA_* code with `*out_error` heap-allocated for the
+     * caller to free.
+     * NOTE(review): presumably out_target_prob is the probability of
+     * target_token_id and out_top_token/out_top_prob describe the most likely
+     * next token — confirm against eliza_inference_llm_eot_score. */
+    virtual int eot_score(EliInferenceContext * ctx, const int32_t * token_ids, size_t num_tokens,
+                          int32_t target_token_id, float * out_target_prob, int32_t * out_top_token,
+                          float * out_top_prob, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void eot_backend_register(EotBackendFactory * factory);
+
+/* Register every EOT backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by eot_backend_select. */
+void eot_backend_register_builtins();
+
+/* Pick an EOT backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml EOT-scoring path. nullptr + *out_error => hard failure. */
+EotBackendFactory * eot_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp
new file mode 100644
index 000000000..3ffe37680
--- /dev/null
+++ b/tools/omnivoice/src/llm-backend-selector.cpp
@@ -0,0 +1,140 @@
+/*
+ * llm-backend-selector.cpp — registry + selection for the multi-runtime
+ * streaming-LLM seam (cutover plan M3).
+ *
+ * On a default build (no -DELIZA_ENABLE_* gate) NO alternate backend is
+ * registered, so llm_backend_select() always returns nullptr and the FFI keeps
+ * the in-tree llama.cpp path. The seam is therefore inert-by-default: the
+ * library behaves exactly as before until an accelerator backend is compiled in.
+ */
+
+#include "llm-backend.h"
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+/* Gated backend factory accessors. Declared only when the matching backend is
+ * compiled in; llm_backend_register_builtins() calls them under the same gate.
+ * Keeping the declarations gated means the default build has no unresolved symbols. */
+#ifdef ELIZA_ENABLE_LITERT_LM
+LlmBackendFactory * litert_backend_factory();
+#endif
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+LlmBackendFactory * mlx_coreml_backend_factory();
+#endif
+
+namespace {
+
+/* Registry state: g_reg_mutex guards g_factories; g_builtins_once gates the
+ * one-time registration of the compiled-in backends. */
+std::mutex                       g_reg_mutex;
+std::vector<LlmBackendFactory *> g_factories;
+std::once_flag                   g_builtins_once;
+
+/* Copy msg into a malloc()ed, NUL-terminated buffer (nullptr if malloc fails) so
+ * the caller can release it with eliza_inference_free_string(), i.e. free(). */
+char * dup_error(const std::string & msg) {
+    const std::size_t n_bytes = msg.size() + 1; /* payload + terminator */
+    void * const      raw     = std::malloc(n_bytes);
+    return raw ? static_cast<char *>(std::memcpy(raw, msg.c_str(), n_bytes)) : nullptr;
+}
+
+/* Case-insensitive C-string equality; a null operand never compares equal. */
+bool iequals(const char * a, const char * b) {
+    if (a == nullptr || b == nullptr) return false;
+    for (; *a != '\0' && *b != '\0'; ++a, ++b) {
+        if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) return false;
+    }
+    return *a == *b; /* true only when both strings ended together */
+}
+
+/* True when s is one of the accepted spellings of the in-tree llama.cpp path. */
+bool is_llamacpp_name(const char * s) {
+    return iequals(s, "llama") || iequals(s, "llamacpp") || iequals(s, "llama.cpp");
+}
+
+} // namespace
+
+/* Append factory to the registry; a null or same-named (case-insensitive) one is ignored. */
+void llm_backend_register(LlmBackendFactory * factory) {
+    if (factory == nullptr) return;
+    std::lock_guard<std::mutex> guard(g_reg_mutex);
+    auto it = g_factories.begin(); /* stops on the first same-named factory */
+    while (it != g_factories.end() && !iequals((*it)->name(), factory->name())) ++it;
+    if (it == g_factories.end()) g_factories.push_back(factory);
+}
+
+void llm_backend_register_builtins() { /* registers each compiled-in backend factory */
+    std::call_once(g_builtins_once, []() { /* body is empty when no backend gate is defined */
+#ifdef ELIZA_ENABLE_LITERT_LM
+        llm_backend_register(litert_backend_factory());
+#endif /* ELIZA_ENABLE_LITERT_LM */
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+        llm_backend_register(mlx_coreml_backend_factory());
+#endif /* ELIZA_ENABLE_MLX && __APPLE__ */
+    });
+}
+
+/* Choose the backend for the bundle at bundle_dir (cfg is currently unused).
+ *
+ * Returns a registered factory, or nullptr. nullptr with *out_error still null
+ * means "stay on the in-tree llama.cpp path"; nullptr with *out_error set is a
+ * hard failure whose malloc()ed message the caller frees. *out_error is cleared
+ * on entry so a caller that passed an uninitialised pointer cannot misread the
+ * fallback as a failure. If the message allocation itself fails, *out_error
+ * stays null (see dup_error). */
+LlmBackendFactory * llm_backend_select(const char * bundle_dir,
+                                       const eliza_llm_stream_config_t * /*cfg*/,
+                                       char ** out_error) {
+    if (out_error) *out_error = nullptr;
+    llm_backend_register_builtins();
+
+    /* (1) ELIZA_LLM_BACKEND env: a HARD select. */
+    const char * forced = std::getenv("ELIZA_LLM_BACKEND");
+    if (forced && forced[0] != '\0') {
+        if (is_llamacpp_name(forced)) {
+            return nullptr; /* force the in-tree path, not an error */
+        }
+        /* Every hard-select failure shares one prefix + the requested name. */
+        const auto fail = [&](const std::string & why) -> LlmBackendFactory * {
+            if (out_error) {
+                *out_error = dup_error(
+                    std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + why);
+            }
+            return nullptr;
+        };
+        std::lock_guard<std::mutex> lock(g_reg_mutex);
+        for (LlmBackendFactory * f : g_factories) {
+            if (!iequals(f->name(), forced)) continue;
+            if (!f->available()) return fail(" is not available in this build/host");
+            if (!f->can_serve(bundle_dir)) {
+                return fail(std::string(" cannot serve the bundle at ") +
+                            (bundle_dir ? bundle_dir : "(null)"));
+            }
+            return f;
+        }
+        return fail(" is not a registered backend");
+    }
+
+    /* (2) Auto-select: the highest preference_rank among available + can_serve.
+     * The in-tree llama.cpp path is the implicit rank-0 fallback, so an
+     * accelerator backend only wins when it returns a positive rank AND can
+     * serve this bundle. */
+    std::lock_guard<std::mutex> lock(g_reg_mutex);
+    LlmBackendFactory * best      = nullptr;
+    int                 best_rank = 0;
+    for (LlmBackendFactory * f : g_factories) {
+        if (!f->available()) continue;
+        if (!f->can_serve(bundle_dir)) continue;
+        const int rank = f->preference_rank();
+        if (rank > best_rank) {
+            best_rank = rank;
+            best      = f;
+        }
+    }
+    return best; /* nullptr => in-tree llama.cpp */
+}
diff --git a/tools/omnivoice/src/llm-backend.h b/tools/omnivoice/src/llm-backend.h
new file mode 100644
index 000000000..0fad67f3c
--- /dev/null
+++ b/tools/omnivoice/src/llm-backend.h
@@ -0,0 +1,167 @@
+#pragma once
+/*
+ * llm-backend.h — multi-runtime streaming-LLM backend seam (cutover plan M3).
+ *
+ * The libelizainference streaming-LLM FFI (`eliza_inference_llm_stream_*`) is
+ * ONE pipe that can be driven by more than one in-process inference runtime:
+ *
+ *   - llama.cpp   — the default / reference backend (CPU / CUDA / Vulkan-Mali-
+ *                   Adreno / Metal). Always present; the in-tree code path.
+ *   - LiteRT-LM   — Android NPU (Tensor / Qualcomm QNN / MediaTek NeuroPilot),
+ *                   optionally desktop/iOS GPU. Gated -DELIZA_ENABLE_LITERT_LM.
+ *   - CoreML/MLX  — Apple Silicon (mac first, iOS later). Gated -DELIZA_ENABLE_MLX.
+ *
+ * Per native/AGENTS.md §11 (reinterpreted by the Gemma-4 cutover): "one managed
+ * library, one pipe, no sidecar/subprocess/TCP." LiteRT-LM and MLX are
+ * EMBEDDABLE in-process C++ libraries linked INTO libelizainference and exposed
+ * behind the SAME FFI streaming symbols — never a child process or TCP server.
+ * (AICore / Apple Foundation stay opportunistic out-of-process adapters on the
+ * TS side, not owned backends — they are NOT registered here.)
+ *
+ * A backend supplies:
+ *   - LlmBackendSession  — the per-generation streaming session, mirroring the
+ *                          FFI pull contract (prefill -> next* -> close) 1:1 so
+ *                          the FFI functions delegate without translation.
+ *   - LlmBackendFactory  — names the runtime, reports availability + bundle fit,
+ *                          and opens sessions.
+ *
+ * `llm_backend_select()` picks a backend at `_open` time from the platform, the
+ * bundle contents, the build flags, and the `ELIZA_LLM_BACKEND` env override.
+ * When it returns nullptr (and no error) the FFI keeps the in-tree llama.cpp
+ * path — so a build with no alternate backend behaves exactly as before.
+ */
+
+#include "eliza-inference-ffi.h" /* eliza_llm_stream_config_t, EliInferenceContext fwd */
+
+#include <cstddef>
+#include <cstdint>
+
+/* Defined in the FFI translation unit. Opaque to backends — a backend reaches
+ * the resident model/bundle through the accessors below, not the struct. */
+struct EliInferenceContext;
+
+/* The bundle directory the context was opened against. A backend's open()
+ * resolves its own artifact under this root (e.g. `<dir>/text/*.litertlm`,
+ * `<dir>/text/*.mlpackage`) — the ONLY supported way to read the bundle path,
+ * since the struct itself is opaque here. Returns nullptr when ctx is null.
+ * Defined in eliza-inference-ffi.cpp; the pointer is owned by the context and
+ * stays valid for the session's lifetime. */
+const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx);
+
+/* ---- Per-generation streaming session ------------------------------------ *
+ *
+ * Lifetime: created by LlmBackendFactory::open(), destroyed via `delete` on the
+ * FFI `_close` path. Every method mirrors the matching FFI entry point so the
+ * FFI can `return session->method(...)` with no argument translation. Status
+ * conventions are identical to the FFI: >= 0 on success, the negative `ELIZA_*`
+ * constants on failure, with `*out_error` heap-allocated for the caller to free.
+ */
+struct LlmBackendSession {
+    virtual ~LlmBackendSession() = default; /* virtual: the owner deletes sessions through this base type */
+
+    /* Mirrors eliza_inference_llm_stream_prefill. Copies the tokens it needs. */
+    virtual int prefill(const int32_t * token_ids, size_t num_tokens,
+                        char ** out_error) = 0;
+
+    /* Mirrors eliza_inference_llm_stream_next. Returns 0 (more output), 1 (final
+     * step — EOS / cap), or a negative ELIZA_* code (ELIZA_ERR_CANCELLED on
+     * cancel). `drafter_*_out` carry per-step speculative stats (0 when the
+     * backend has no drafter). */
+    virtual int next(int32_t * tokens_out, size_t tokens_cap,
+                     size_t * num_tokens_out, char * text_out, size_t text_cap,
+                     int32_t * drafter_drafted_out, int32_t * drafter_accepted_out,
+                     char ** out_error) = 0;
+
+    /* Mirrors eliza_inference_llm_stream_cancel. Publishes a flag an in-flight
+     * next() checks at a step boundary; safe to call from another thread.
+     * Returns ELIZA_OK whether or not a pass was running. */
+    virtual int cancel() = 0;
+
+    /* Mirrors eliza_inference_llm_stream_reset: clear KV + sampler/counters so
+     * the next prefill starts a fresh prompt on the same warm session. */
+    virtual int reset() = 0;
+
+    /* Mirrors eliza_inference_llm_stream_reset_keep: keep the first `n_keep`
+     * tokens of state resident and drop the rest. Returns the n_keep actually
+     * applied (>= 0, may be clamped / 0 on a full-reset fallback). A backend
+     * that cannot do prefix reuse MUST fall back to a full reset and return 0 —
+     * never an error. */
+    virtual int reset_keep(int32_t n_keep) = 0;
+
+    /* Slot KV persistence — optional. Defaults: ELIZA_ERR_INVALID_ARG, *out_error untouched. */
+    virtual int save_slot(const char * /*filename*/, char ** /*out_error*/) {
+        return ELIZA_ERR_INVALID_ARG;
+    }
+    virtual int restore_slot(const char * /*filename*/, char ** /*out_error*/) {
+        return ELIZA_ERR_INVALID_ARG;
+    }
+};
+
+/* ---- Backend factory (one per linked-in runtime) ------------------------- */
+struct LlmBackendFactory {
+    virtual ~LlmBackendFactory() = default;
+
+    /* Stable lower-case id: "llama.cpp", "litert-lm", "mlx-coreml". Matched ignoring case
+     * against ELIZA_LLM_BACKEND, where llama.cpp spellings always mean the in-tree path. */
+    virtual const char * name() const = 0;
+
+    /* True only when this backend is compiled in AND its runtime dependencies
+     * are present on THIS host (the NPU delegate / Metal device / the linked
+     * lib). A scaffold whose build gate is OFF returns false. Cheap — must not
+     * load a model. */
+    virtual bool available() const = 0;
+
+    /* True when this backend can serve the bundle at `bundle_dir` — i.e. the
+     * backend-specific artifact exists (e.g. `text/*.litertlm`, `text/*.mlpackage`).
+     * Cheap directory probe, no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank used to order candidates when several can serve the
+     * same bundle and no env override is set. Higher wins. The in-tree llama.cpp
+     * path is the implicit rank 0, and auto-select requires rank > 0 — so this
+     * default (0) leaves a backend reachable only through ELIZA_LLM_BACKEND. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Create a streaming session for (ctx, cfg). Returns nullptr + `*out_error`
+     * on failure. The returned session is owned by the caller (FFI `_close`
+     * deletes it). */
+    virtual LlmBackendSession * open(EliInferenceContext * ctx,
+                                     const eliza_llm_stream_config_t * cfg,
+                                     char ** out_error) = 0;
+};
+
+/* ---- Registry + selection ------------------------------------------------ *
+ *
+ * Backends register their singleton factory (idempotent; the registry does not
+ * take ownership — factories are static-lifetime singletons). The FFI
+ * translation unit calls llm_backend_select() per `_open`; that call first runs
+ * llm_backend_register_builtins(), which registers every compiled-in backend once.
+ */
+
+/* Register a factory (idempotent by name; nullptr is ignored). NOTE(review):
+ * static-init callers rely on the registry's namespace-scope containers in
+ * llm-backend-selector.cpp already being constructed — confirm before relying on it. */
+void llm_backend_register(LlmBackendFactory * factory);
+
+/* Register every backend compiled into THIS build (gated by the -DELIZA_ENABLE_*
+ * CMake options). Idempotent (std::call_once); llm_backend_select() calls it. Defined
+ * in llm-backend-selector.cpp, which registers each gated backend's factory accessor. */
+void llm_backend_register_builtins();
+
+/* Pick a backend for the bundle at `bundle_dir` (`cfg` is currently unused):
+ *
+ *   1. ELIZA_LLM_BACKEND env, when set and non-empty (case-insensitive backend
+ *      name) — a HARD select. "llama.cpp" / "llamacpp" / "llama" forces the
+ *      in-tree path (returns nullptr, no error). Any other name that is not
+ *      registered+available, or cannot serve the bundle, is a hard error:
+ *      returns nullptr AND sets `*out_error` so the FFI aborts rather than
+ *      silently using llama.cpp.
+ *   2. Otherwise: among registered backends that are available() AND
+ *      can_serve(bundle_dir), pick the highest preference_rank() that is > 0.
+ *      If none qualifies, return nullptr (use the in-tree llama.cpp path).
+ *
+ * nullptr with `*out_error == nullptr` (initialise it before the call) means "use the
+ * in-tree llama.cpp path", NOT an error; with `*out_error != nullptr`, a hard failure. */
+LlmBackendFactory * llm_backend_select(const char * bundle_dir,
+                                       const eliza_llm_stream_config_t * cfg,
+                                       char ** out_error);
diff --git a/tools/omnivoice/src/tts-backend-selector.cpp b/tools/omnivoice/src/tts-backend-selector.cpp
new file mode 100644
index 000000000..ad2d28447
--- /dev/null
+++ b/tools/omnivoice/src/tts-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * tts-backend-selector.cpp — registry + selection for the per-op TTS backend
+ * seam. A thin instantiation of eliza_backend::Registry<TtsBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* TTS backend is compiled in
+ * (none exists yet), so nothing registers, tts_backend_select() returns
+ * nullptr, and eliza_inference_tts_synthesize keeps the in-tree OmniVoice path.
+ */
+
+#include "tts-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<TtsBackendFactory> g_registry;      /* file-local TTS factory registry */
+std::once_flag                             g_builtins_once; /* gates tts_backend_register_builtins */
+} // namespace
+
+void tts_backend_register(TtsBackendFactory * factory) {
+    g_registry.register_factory(factory); /* forwarded unchanged to the shared Registry */
+}
+
+void tts_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* Intentionally empty: no TTS backend exists yet, so the seam stays inert. */
+    });
+}
+
+TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error) {
+    tts_backend_register_builtins(); /* runs before every resolution; the body executes once */
+    return g_registry.select("ELIZA_TTS_BACKEND", "ELIZA_BACKEND", "tts",
+                             bundle_dir, out_error); /* env names: per-op first, then global */
+}
diff --git a/tools/omnivoice/src/tts-backend.h b/tools/omnivoice/src/tts-backend.h
new file mode 100644
index 000000000..127ce2a33
--- /dev/null
+++ b/tools/omnivoice/src/tts-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * tts-backend.h — per-op backend seam for text-to-speech synthesis.
+ *
+ * A one-shot op (eliza_inference_tts_synthesize) that an accelerator backend can
+ * serve when it ships a TTS artifact under `<bundle>/tts/`, while every other
+ * op — and TTS itself when no artifact is present — stays on the in-tree
+ * OmniVoice/ggml path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_TTS_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the in-tree OmniVoice path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in TTS runtime (e.g. LiteRT). */
+struct TtsBackendFactory {
+    virtual ~TtsBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_TTS_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The TTS artifact exists under `<bundle_dir>/tts/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0): high
+     * positive for an NPU-served TTS, lower positive for a GPU-delegate fallback.
+     * NOTE(review): the default 0 presumably never beats the ggml path — confirm in Registry. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_tts_synthesize 1:1. Returns the number of fp32 PCM
+     * samples actually written (>= 0) on success, or a negative ELIZA_* code with
+     * `*out_error` heap-allocated for the caller to free. */
+    virtual int tts_synthesize(EliInferenceContext * ctx, const char * text, size_t text_len,
+                               const char * speaker_preset_id, float * out_pcm,
+                               size_t max_samples, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void tts_backend_register(TtsBackendFactory * factory);
+
+/* Register every TTS backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by tts_backend_select. */
+void tts_backend_register_builtins();
+
+/* Pick a TTS backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree OmniVoice path. nullptr + *out_error => hard failure. */
+TtsBackendFactory * tts_backend_select(const char * bundle_dir, char ** out_error);
diff --git a/tools/omnivoice/src/vision-backend-selector.cpp b/tools/omnivoice/src/vision-backend-selector.cpp
new file mode 100644
index 000000000..095450cab
--- /dev/null
+++ b/tools/omnivoice/src/vision-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * vision-backend-selector.cpp — registry + selection for the per-op vision
+ * backend seam. A thin instantiation of eliza_backend::Registry<VisionBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* vision backend is compiled in
+ * (none exists yet), so nothing registers, vision_backend_select() returns
+ * nullptr, and eliza_inference_describe_image keeps the in-tree ggml mmproj path.
+ */
+
+#include "vision-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<VisionBackendFactory> g_registry;      /* file-local vision factory registry */
+std::once_flag                                g_builtins_once; /* gates vision_backend_register_builtins */
+} // namespace
+
+void vision_backend_register(VisionBackendFactory * factory) {
+    g_registry.register_factory(factory); /* forwarded unchanged to the shared Registry */
+}
+
+void vision_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {
+        /* Intentionally empty: no vision backend exists yet, so the seam stays inert. */
+    });
+}
+
+VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error) {
+    vision_backend_register_builtins(); /* runs before every resolution; the body executes once */
+    return g_registry.select("ELIZA_VISION_BACKEND", "ELIZA_BACKEND", "vision",
+                             bundle_dir, out_error); /* env names: per-op first, then global */
+}
diff --git a/tools/omnivoice/src/vision-backend.h b/tools/omnivoice/src/vision-backend.h
new file mode 100644
index 000000000..51da0632a
--- /dev/null
+++ b/tools/omnivoice/src/vision-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * vision-backend.h — per-op backend seam for mmproj image description.
+ *
+ * A one-shot op (eliza_inference_describe_image) that an accelerator backend can
+ * serve when it ships a vision artifact under `<bundle>/vision/`, while every
+ * other op — and vision itself when no artifact is present — stays on the
+ * in-tree ggml mmproj path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_VISION_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else
+ * the highest preference_rank among available()+can_serve() factories, else
+ * nullptr (the ggml mmproj path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* One factory per linked-in vision runtime (e.g. LiteRT). */
+struct VisionBackendFactory {
+    virtual ~VisionBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_VISION_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The vision artifact exists under `<bundle_dir>/vision/`. Cheap directory
+     * probe, no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0): high
+     * positive for an NPU-served vision op, lower positive for a GPU-delegate fallback.
+     * NOTE(review): the default 0 presumably never beats the ggml path — confirm in Registry. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_describe_image 1:1. Returns the number of bytes
+     * written (excluding the terminator) on success, or a negative ELIZA_* code
+     * with `*out_error` heap-allocated for the caller to free. */
+    virtual int describe_image(EliInferenceContext * ctx, const unsigned char * image_bytes,
+                               size_t n_bytes, const char * mmproj_path, const char * prompt,
+                               char * out_text, size_t max_text_bytes, char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void vision_backend_register(VisionBackendFactory * factory);
+
+/* Register every vision backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by vision_backend_select. */
+void vision_backend_register_builtins();
+
+/* Pick a vision backend for the bundle at `bundle_dir`. nullptr + no error
+ * => use the in-tree ggml mmproj path. nullptr + *out_error => hard failure. */
+VisionBackendFactory * vision_backend_select(const char * bundle_dir, char ** out_error);