elizaOS · lalalune · Jun 24, 2026 · Jun 24, 2026
diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
@@ -83,6 +83,14 @@ set(OMNIVOICE_FFI_SOURCES
     # backend below registers itself, so the default build keeps the in-tree
     # llama.cpp path.
     src/llm-backend-selector.cpp
+    # Per-op backend seams (cutover M3+). Each modality's selector reuses the
+    # shared eliza_backend::Registry (backend-registry.h) and is inert until a
+    # gated backend registers — so the default build keeps the ggml path per-op.
+    src/embed-backend-selector.cpp
+    src/vision-backend-selector.cpp
+    src/asr-backend-selector.cpp
+    src/tts-backend-selector.cpp
+    src/eot-backend-selector.cpp
 )
 
 # Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -231,7 +239,12 @@ option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
 # pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK
 # (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
 # default. See docs/multi-backend-ffi-seam.md.
-option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
+option(ELIZA_ENABLE_LITERT "Build the LiteRT C-API per-op backends, e.g. embed (M4)" OFF)
+
+# ELIZA_ENABLE_LITERT_LM — streaming-LLM backend on the heavier LiteRT-LM Engine
+# SDK (litert::lm); independent of ELIZA_ENABLE_LITERT (LiteRT C runtime) above.
+# OFF until that SDK is built; point -DELIZA_LITERT_LM_SDK_DIR / -DELIZA_LITERT_LM_LIBS at it.
+option(ELIZA_ENABLE_LITERT_LM "Build the LiteRT-LM in-process streaming-LLM backend" OFF)
 
 # ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
 # (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
@@ -297,12 +310,13 @@ if(TARGET mtmd)
     # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
     # default desktop/CI build is byte-for-byte the pre-seam behavior.
     if(ELIZA_ENABLE_LITERT)
+        # LiteRT C-API per-op backends (embed today; vision/etc. as artifacts
+        # ship). SDK = the LiteRT C runtime (github.com/google-ai-edge/LiteRT,
+        # libLiteRt.so + the GPU/NPU delegate). Point at a built SDK with
+        # -DELIZA_LITERT_SDK_DIR=<dir> and link with -DELIZA_LITERT_LIBS=LiteRt.
         target_sources(elizainference PRIVATE
-            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-embed-backend.cpp)
         target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
-        # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
-        # SDK with -DELIZA_LITERT_SDK_DIR=<dir>; the device/host cross-build
-        # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=<libs>.
         if(ELIZA_LITERT_SDK_DIR)
             target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
             target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
@@ -311,6 +325,22 @@ if(TARGET mtmd)
             target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
         endif()
     endif()
+    if(ELIZA_ENABLE_LITERT_LM)
+        # Streaming-LLM backend built on the LiteRT-LM Engine SDK (litert::lm,
+        # github.com/google-ai-edge/LiteRT-LM). That SDK is heavier than, and
+        # separate from, the LiteRT C runtime wired up above. Locate it with
+        # -DELIZA_LITERT_LM_SDK_DIR=<dir>; name its libs via -DELIZA_LITERT_LM_LIBS.
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT_LM)
+        target_sources(elizainference PRIVATE
+            "${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp")
+        if(ELIZA_LITERT_LM_SDK_DIR)
+            target_link_directories(elizainference PRIVATE "${ELIZA_LITERT_LM_SDK_DIR}/lib")
+            target_include_directories(elizainference PRIVATE "${ELIZA_LITERT_LM_SDK_DIR}/include")
+        endif()
+        if(ELIZA_LITERT_LM_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LM_LIBS})
+        endif()
+    endif()
     if(ELIZA_ENABLE_MLX)
         if(NOT APPLE)
             message(FATAL_ERROR

diff --git a/tools/omnivoice/src/SESSION-OPS-TODO.md b/tools/omnivoice/src/SESSION-OPS-TODO.md
@@ -0,0 +1,159 @@
+# Session-op backend seam — design (NOT implemented)
+
+The per-op backend seam (`backend-registry.h` + `<mod>-backend.h` +
+`<mod>-backend-selector.cpp` + a chokepoint at the top of the FFI fn) is now in
+place for the **one-shot** ops:
+
+| modality | FFI fn                          | header / selector            | env key               | artifact dir       |
+|----------|---------------------------------|------------------------------|-----------------------|--------------------|
+| embed    | `eliza_inference_embed`         | `embed-backend.*`            | `ELIZA_EMBED_BACKEND` | `<bundle>/embedding/` |
+| vision   | `eliza_inference_describe_image`| `vision-backend.*`           | `ELIZA_VISION_BACKEND`| `<bundle>/vision/` |
+| asr      | `eliza_inference_asr_transcribe`| `asr-backend.*`              | `ELIZA_ASR_BACKEND`   | `<bundle>/asr/`    |
+| tts      | `eliza_inference_tts_synthesize`| `tts-backend.*`              | `ELIZA_TTS_BACKEND`   | `<bundle>/tts/`    |
+| eot      | `eliza_inference_llm_eot_score` | `eot-backend.*`              | `ELIZA_EOT_BACKEND`   | `<bundle>/eot/`    |
+
+A one-shot op is stateless across calls: select → (delegate | fall through to
+ggml) on every call. There is nothing to keep alive between calls, so the seam
+is a single chokepoint at the top of the fn.
+
+The **session** ops are different: `vad`, `wakeword`, `speaker`, `diariz` each
+`_open` a native handle (`EliVad *`, `EliWakeword *`, `EliSpeaker *`,
+`EliDiariz *`) that persists across many `_segment`/`_detect`/`_embed` calls,
+can be `_reset`, and is torn down with `_close`. The seam has to follow that
+lifecycle, not re-select per call. This file records HOW to extend the seam to
+them. **None of the below is implemented yet.**
+
+## The shape of a session op (today, in-tree only)
+
+Each session modality exposes, e.g. for VAD:
+
+```c
+EliVad * eliza_inference_vad_open(EliInferenceContext * ctx, /* params */, char ** out_error);
+int      eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error);
+int      eliza_inference_vad_reset(EliVad * vad, char ** out_error);
+void     eliza_inference_vad_close(EliVad * vad);
+```
+
+`EliVad` (and the wakeword/speaker/diariz equivalents) is the in-tree handle
+struct defined in `eliza-inference-ffi.cpp`. Its in-tree fields stay exactly as
+they are; the seam is **additive** — two extra pointers (factory + session).
+
+## Extending the seam to a session op
+
+For each session modality `<mod>` (vad | wakeword | speaker | diariz):
+
+### 1. A session factory interface — `<mod>-backend.h`
+
+Mirror the one-shot factory's four common probes, but the forward methods mirror
+the **session** ABI 1:1 instead of a single one-shot fn. The factory does NOT
+own the handle struct; it produces and operates on an opaque backend-session:
+
+```cpp
+struct VadBackendFactory {
+    virtual ~VadBackendFactory() = default;
+    virtual const char * name() const = 0;
+    virtual bool         available() const = 0;
+    virtual bool         can_serve(const char * bundle_dir) const = 0;  // probes <bundle>/vad/
+    virtual int          preference_rank() const { return 0; }
+
+    // Lifecycle, mirroring the FFI session ABI 1:1. The factory returns an
+    // opaque backend-session pointer it owns; the FFI stashes it on the Eli*
+    // handle. A NULL return + *out_error is a hard open failure.
+    virtual void * open(EliInferenceContext * ctx, /* same params as eliza_inference_vad_open */,
+                        char ** out_error) = 0;
+    virtual int   segment(void * session, const float * pcm, size_t n, /* out */, char ** out_error) = 0;
+    virtual int   reset(void * session, char ** out_error) = 0;
+    virtual void  close(void * session) = 0;
+};
+```
+
+Plus the same free functions as the one-shot seam:
+`vad_backend_register`, `vad_backend_register_builtins` (EMPTY for now — no
+LiteRT session backend exists), `vad_backend_select(bundle_dir, out_error)`,
+backed by an `eliza_backend::Registry<VadBackendFactory>` in
+`<mod>-backend-selector.cpp` with env keys `ELIZA_VAD_BACKEND` (per-op), then
+`ELIZA_BACKEND` (global), and modality `"vad"`. Artifact probe dir
+`<bundle>/vad/` (resp. `wakeword/`, `speaker/`, `diariz/`).
+
+### 2. A backend-session pointer on the Eli* handle
+
+The selection happens ONCE, at `_open`, not per call. Add two fields to the
+in-tree handle struct — the selected factory and the session it opened:
+
+```cpp
+struct EliVad {
+    /* ... existing in-tree fields, unchanged ... */
+
+    /* Backend seam (additive). When non-null, this handle is served by an
+     * accelerator backend and every op delegates to it; the in-tree fields
+     * above are then unused. When null, the in-tree ggml path owns the handle. */
+    VadBackendFactory * be         = nullptr;  // the factory that opened be_session
+    void *              be_session = nullptr;  // factory-owned backend session
+};
+```
+
+### 3. Select at `_open`
+
+In `eliza_inference_vad_open`, after the existing arg validation and before the
+in-tree handle is built:
+
+```cpp
+char * be_error = nullptr;
+VadBackendFactory * be = vad_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
+if (be_error) { eliza_set_error(out_error, std::string(be_error)); std::free(be_error);
+                return /* NULL handle */; }
+if (be) {
+    void * sess = be->open(ctx, /* params */, out_error);
+    if (!sess) return /* NULL handle — open failed, out_error already set */;
+    EliVad * h = new EliVad();
+    h->be = be;
+    h->be_session = sess;
+    return h;
+}
+/* else: fall through and build the in-tree handle exactly as today. */
+```
+
+### 4. A guard at the TOP of each `_segment` / `_reset` / `_close`
+
+Each per-call op checks the backend pointer and delegates before touching any
+in-tree state:
+
+```cpp
+int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error) {
+    if (!vad) { /* invalid-arg as today */ }
+    if (vad->be) {                                   // <-- guard
+        return vad->be->segment(vad->be_session, pcm, n, /* out */, out_error);
+    }
+    /* ... existing in-tree ggml segment body, unchanged ... */
+}
+
+void eliza_inference_vad_close(EliVad * vad) {
+    if (!vad) return;
+    if (vad->be) { vad->be->close(vad->be_session); delete vad; return; }  // <-- guard
+    /* ... existing in-tree teardown, then delete vad ... */
+}
+```
+
+`_reset` follows the same guard pattern.
+
+## Why this shape (vs. re-selecting per call)
+
+- **Selection is per-session, not per-call.** A session's backend is fixed at
+  `_open`; `_segment` cannot cross from the ggml path to LiteRT mid-session
+  because the KV/feature state lives in the (in-tree OR backend) session, not
+  on the FFI boundary. The `be`/`be_session` pair captures that binding.
+- **Hard-fail localizes to `_open`.** A bundle-invalid override surfaces once,
+  where the caller is already prepared to handle a NULL handle, instead of on
+  every `_segment`.
+- **Additive + inert.** With no session backend registered (the case today),
+  `_open`'s `select()` returns nullptr, `be`/`be_session` stay null, and every
+  guard is a no-op — the in-tree path is byte-for-byte unchanged. This is the
+  same inert-by-default contract as the one-shot seam.
+
+## Status
+
+- One-shot seam: embed (with a LiteRT builtin), vision/asr/tts/eot (inert,
+  no builtin) — **done**.
+- Session seam (vad/wakeword/speaker/diariz): **not implemented.** No
+  `<mod>-backend.{h,cpp}`, no handle field, no `_open` select, no per-call
+  guards exist yet. This file is the spec for when a session backend lands.
diff --git a/tools/omnivoice/src/asr-backend-selector.cpp b/tools/omnivoice/src/asr-backend-selector.cpp
@@ -0,0 +1,34 @@
+/*
+ * asr-backend-selector.cpp — registry + selection for the per-op ASR backend
+ * seam. A thin instantiation of eliza_backend::Registry<AsrBackendFactory>
+ * (backend-registry.h) — the resolution logic is shared with every other
+ * modality. Inert by default: no -DELIZA_ENABLE_* ASR backend is compiled in
+ * (none exists yet), so nothing registers, asr_backend_select() returns
+ * nullptr, and eliza_inference_asr_transcribe keeps the in-tree ggml path.
+ */
+
+#include "asr-backend.h"
+#include "backend-registry.h"
+
+#include <mutex>
+
+namespace {
+eliza_backend::Registry<AsrBackendFactory> g_registry;      /* file-local ASR factory registry */
+std::once_flag                             g_builtins_once; /* latch for builtin registration */
+} // namespace
+
+void asr_backend_register(AsrBackendFactory * factory) {
+    g_registry.register_factory(factory); /* plain forward to the shared Registry */
+}
+
+void asr_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() { /* body runs once, however often we are called */
+        /* No ASR backend exists yet — the seam stays inert. */
+    });
+}
+
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error) {
+    asr_backend_register_builtins(); /* once-guarded, so safe to call on every select */
+    return g_registry.select("ELIZA_ASR_BACKEND", "ELIZA_BACKEND", "asr", /* per-op key, global key, modality */
+                             bundle_dir, out_error);
+}
diff --git a/tools/omnivoice/src/asr-backend.h b/tools/omnivoice/src/asr-backend.h
@@ -0,0 +1,61 @@
+#pragma once
+/*
+ * asr-backend.h — per-op backend seam for speech-to-text transcription.
+ *
+ * A one-shot op (eliza_inference_asr_transcribe) that an accelerator backend can
+ * serve when it ships an ASR artifact under `<bundle>/asr/`, while every other
+ * op — and ASR itself when no artifact is present — stays on the in-tree ggml
+ * path.
+ *
+ * The factory mirrors the FFI 1:1 and the FFI delegates without translation.
+ * Selection reuses the shared eliza_backend::Registry (backend-registry.h):
+ * ELIZA_ASR_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
+ * highest preference_rank among available()+can_serve() factories, else nullptr
+ * (the ggml ASR path).
+ */
+
+#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */
+
+#include <cstddef>
+
+struct EliInferenceContext;
+
+/* Interface: one factory per linked-in ASR runtime (e.g. LiteRT). */
+struct AsrBackendFactory {
+    virtual ~AsrBackendFactory() = default;
+
+    /* Stable lower-case id, e.g. "litert". Matched case-insensitively against
+     * ELIZA_ASR_BACKEND / ELIZA_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
+     * Cheap — must not load a model. */
+    virtual bool available() const = 0;
+
+    /* The ASR artifact exists under `<bundle_dir>/asr/`. Cheap directory probe,
+     * no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
+     * An NPU-served ASR returns a high positive value; a GPU-delegate fallback a
+     * lower positive value. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Mirrors eliza_inference_asr_transcribe 1:1 (same arguments, same return):
+     * the number of bytes written (excluding the terminator) on success, or a
+     * negative ELIZA_* code with `*out_error` heap-allocated for the caller to free. */
+    virtual int asr_transcribe(EliInferenceContext * ctx, const float * pcm, size_t n_samples,
+                               int sample_rate_hz, char * out_text, size_t max_text_bytes,
+                               char ** out_error) = 0;
+};
+
+/* Register a factory (idempotent by name). */
+void asr_backend_register(AsrBackendFactory * factory);
+
+/* Register every ASR backend compiled into THIS build (gated by the
+ * -DELIZA_ENABLE_* options). Idempotent; called by asr_backend_select. */
+void asr_backend_register_builtins();
+
+/* Pick an ASR backend for the bundle at `bundle_dir`. nullptr with `*out_error`
+ * unset => use the in-tree ggml ASR path; nullptr with it set => hard failure. */
+AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error);