Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions tools/omnivoice/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ set(OMNIVOICE_FFI_SOURCES
# backend below registers itself, so the default build keeps the in-tree
# llama.cpp path.
src/llm-backend-selector.cpp
# Per-op backend seams (cutover M3+). Each modality's selector reuses the
# shared eliza_backend::Registry (backend-registry.h) and is inert until a
# gated backend registers — so the default build keeps the ggml path per-op.
src/embed-backend-selector.cpp
src/vision-backend-selector.cpp
src/asr-backend-selector.cpp
src/tts-backend-selector.cpp
src/eot-backend-selector.cpp
)

# Vendored standalone voice-classifier forward graphs (pure scalar C, no
Expand Down Expand Up @@ -231,7 +239,12 @@ option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
# pipe keeps the in-tree llama.cpp path. ON requires the LiteRT C runtime SDK
# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
# default. See docs/multi-backend-ffi-seam.md.
option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
option(ELIZA_ENABLE_LITERT "Build the LiteRT C-API per-op backends, e.g. embed (M4)" OFF)

# ELIZA_ENABLE_LITERT_LM — the streaming-LLM backend on the heavier LiteRT-LM
# Engine SDK (litert::lm), separate from the LiteRT C runtime above. OFF until
# that SDK is built; point -DELIZA_LITERT_LM_SDK_DIR / -DELIZA_LITERT_LM_LIBS at it.
option(ELIZA_ENABLE_LITERT_LM "Build the LiteRT-LM in-process streaming-LLM backend" OFF)

# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
Expand Down Expand Up @@ -297,12 +310,13 @@ if(TARGET mtmd)
# out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
# default desktop/CI build is byte-for-byte the pre-seam behavior.
if(ELIZA_ENABLE_LITERT)
# LiteRT C-API per-op backends (embed today; vision/etc. as artifacts
# ship). SDK = the LiteRT C runtime (github.com/google-ai-edge/LiteRT,
# libLiteRt.so + the GPU/NPU delegate). Point at a built SDK with
# -DELIZA_LITERT_SDK_DIR=<dir> and link with -DELIZA_LITERT_LIBS=LiteRt.
target_sources(elizainference PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-embed-backend.cpp)
target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
# LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
# SDK with -DELIZA_LITERT_SDK_DIR=<dir>; the device/host cross-build
# links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=<libs>.
if(ELIZA_LITERT_SDK_DIR)
target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
Expand All @@ -311,6 +325,22 @@ if(TARGET mtmd)
target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
endif()
endif()
if(ELIZA_ENABLE_LITERT_LM)
    # Streaming-LLM backend built on the LiteRT-LM Engine SDK (litert::lm,
    # github.com/google-ai-edge/LiteRT-LM). That SDK is separate from the
    # LiteRT C runtime, so it has its own pair of knobs:
    #   -DELIZA_LITERT_LM_SDK_DIR=<dir>  adds <dir>/include and <dir>/lib
    #   -DELIZA_LITERT_LM_LIBS=<libs>    libraries to link into elizainference
    # Both knobs are optional here; when neither is set only the source file
    # and the ELIZA_ENABLE_LITERT_LM compile definition are added.
    target_sources(elizainference PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp")
    target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT_LM)
    if(ELIZA_LITERT_LM_SDK_DIR)
        # Quoted so an SDK path containing spaces or semicolons stays a single
        # argument instead of being split into several directories.
        target_include_directories(elizainference PRIVATE
            "${ELIZA_LITERT_LM_SDK_DIR}/include")
        target_link_directories(elizainference PRIVATE
            "${ELIZA_LITERT_LM_SDK_DIR}/lib")
    endif()
    if(ELIZA_LITERT_LM_LIBS)
        # Deliberately unquoted: a ;-separated value must expand into one
        # argument per library.
        target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LM_LIBS})
    endif()
endif()
if(ELIZA_ENABLE_MLX)
if(NOT APPLE)
message(FATAL_ERROR
Expand Down
159 changes: 159 additions & 0 deletions tools/omnivoice/src/SESSION-OPS-TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Session-op backend seam — design (NOT implemented)

The per-op backend seam (`backend-registry.h` + `<mod>-backend.h` +
`<mod>-backend-selector.cpp` + a chokepoint at the top of the FFI fn) is now in
place for the **one-shot** ops:

| modality | FFI fn | header / selector | env key | artifact dir |
|----------|---------------------------------|------------------------------|-----------------------|--------------------|
| embed | `eliza_inference_embed` | `embed-backend.*` | `ELIZA_EMBED_BACKEND` | `<bundle>/embedding/` |
| vision | `eliza_inference_describe_image`| `vision-backend.*` | `ELIZA_VISION_BACKEND`| `<bundle>/vision/` |
| asr | `eliza_inference_asr_transcribe`| `asr-backend.*` | `ELIZA_ASR_BACKEND` | `<bundle>/asr/` |
| tts | `eliza_inference_tts_synthesize`| `tts-backend.*` | `ELIZA_TTS_BACKEND` | `<bundle>/tts/` |
| eot | `eliza_inference_llm_eot_score` | `eot-backend.*` | `ELIZA_EOT_BACKEND` | `<bundle>/eot/` |

A one-shot op is stateless across calls: select → (delegate | fall through to
ggml) on every call. There is nothing to keep alive between calls, so the seam
is a single chokepoint at the top of the fn.

The **session** ops are different: `vad`, `wakeword`, `speaker`, `diariz` each
`_open` a native handle (`EliVad *`, `EliWakeword *`, `EliSpeaker *`,
`EliDiariz *`) that persists across many `_segment`/`_detect`/`_embed` calls, can
be reset with `_reset`, and is torn down with `_close`. The seam has to follow
that lifecycle, not
re-select per call. This file records HOW to extend the seam to them. **None of
the below is implemented yet.**

## The shape of a session op (today, in-tree only)

Each session modality exposes, e.g. for VAD:

```c
EliVad * eliza_inference_vad_open(EliInferenceContext * ctx, /* params */, char ** out_error);
int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error);
int eliza_inference_vad_reset(EliVad * vad, char ** out_error);
void eliza_inference_vad_close(EliVad * vad);
```

`EliVad` (and the wakeword/speaker/diariz equivalents) is the in-tree handle
struct defined in `eliza-inference-ffi.cpp`. Its in-tree fields stay exactly as
they are; the seam is **additive** — one extra pointer.

## Extending the seam to a session op

For each session modality `<mod>` (vad | wakeword | speaker | diariz):

### 1. A session factory interface — `<mod>-backend.h`

Mirror the one-shot factory's four common probes, but the forward methods mirror
the **session** ABI 1:1 instead of a single one-shot fn. The factory does NOT
own the handle struct; it produces and operates on an opaque backend-session:

```cpp
struct VadBackendFactory {
virtual ~VadBackendFactory() = default;
virtual const char * name() const = 0;
virtual bool available() const = 0;
virtual bool can_serve(const char * bundle_dir) const = 0; // probes <bundle>/vad/
virtual int preference_rank() const { return 0; }

// Lifecycle, mirroring the FFI session ABI 1:1. The factory returns an
// opaque backend-session pointer it owns; the FFI stashes it on the Eli*
// handle. A NULL return + *out_error is a hard open failure.
virtual void * open(EliInferenceContext * ctx, /* same params as eliza_inference_vad_open */,
char ** out_error) = 0;
virtual int segment(void * session, const float * pcm, size_t n, /* out */, char ** out_error) = 0;
virtual int reset(void * session, char ** out_error) = 0;
virtual void close(void * session) = 0;
};
```

Plus the same free-functions as the one-shot seam:
`vad_backend_register`, `vad_backend_register_builtins` (EMPTY for now — no
LiteRT session backend exists), `vad_backend_select(bundle_dir, out_error)`,
backed by an `eliza_backend::Registry<VadBackendFactory>` in
`vad-backend-selector.cpp`, with env keys `ELIZA_VAD_BACKEND` (per-op) then
`ELIZA_BACKEND` (global)
and modality `"vad"`. Artifact probe dir `<bundle>/vad/` (resp. `wakeword/`,
`speaker/`, `diariz/`).

### 2. A backend-session pointer on the Eli* handle

The selection happens ONCE, at `_open`, not per call. Add one field to the
in-tree handle struct:

```cpp
struct EliVad {
/* ... existing in-tree fields, unchanged ... */

/* Backend seam (additive). When non-null, this handle is served by an
* accelerator backend and every op delegates to it; the in-tree fields
* above are then unused. When null, the in-tree ggml path owns the handle. */
VadBackendFactory * be = nullptr; // the factory that opened be_session
void * be_session = nullptr; // factory-owned backend session
};
```

### 3. Select at `_open`

In `eliza_inference_vad_open`, after the existing arg validation and before the
in-tree handle is built:

```cpp
char * be_error = nullptr;
VadBackendFactory * be = vad_backend_select(llm_backend_context_bundle_dir(ctx), &be_error);
if (be_error) { eliza_set_error(out_error, std::string(be_error)); std::free(be_error);
return /* NULL handle */; }
if (be) {
void * sess = be->open(ctx, /* params */, out_error);
if (!sess) return /* NULL handle — open failed, out_error already set */;
EliVad * h = new EliVad();
h->be = be;
h->be_session = sess;
return h;
}
/* else: fall through and build the in-tree handle exactly as today. */
```

### 4. A guard at the TOP of each `_segment` / `_reset` / `_close`

Each per-call op checks the backend pointer and delegates before touching any
in-tree state:

```cpp
int eliza_inference_vad_segment(EliVad * vad, const float * pcm, size_t n, /* out */, char ** out_error) {
if (!vad) { /* invalid-arg as today */ }
if (vad->be) { // <-- guard
return vad->be->segment(vad->be_session, pcm, n, /* out */, out_error);
}
/* ... existing in-tree ggml segment body, unchanged ... */
}

void eliza_inference_vad_close(EliVad * vad) {
if (!vad) return;
if (vad->be) { vad->be->close(vad->be_session); delete vad; return; } // <-- guard
/* ... existing in-tree teardown, then delete vad ... */
}
```

`_reset` follows the same guard pattern.

## Why this shape (vs. re-selecting per call)

- **Selection is per-session, not per-call.** A session's backend is fixed at
`_open`; `_segment` cannot cross from the ggml path to LiteRT mid-session,
because the KV/feature state lives in the (in-tree OR backend)
session, not on the FFI boundary. The one pointer captures that binding.
- **Hard-fail localizes to `_open`.** A bundle-invalid override surfaces once,
where the caller is already prepared to handle a NULL handle, instead of on
every `_segment`.
- **Additive + inert.** With no session backend registered (the case today),
`_open`'s `select()` returns nullptr, `be`/`be_session` stay null, and every
guard is a no-op — the in-tree path is byte-for-byte unchanged. Same
inert-by-default contract as the one-shot seam.

## Status

- One-shot seam: embed (with a LiteRT builtin), vision/asr/tts/eot (inert,
no builtin) — **done**.
- Session seam (vad/wakeword/speaker/diariz): **not implemented.** No
`<mod>-backend.h` / `<mod>-backend-selector.cpp`, no handle field, no `_open` select, no per-call
guards exist yet. This file is the spec for when a session backend lands.
34 changes: 34 additions & 0 deletions tools/omnivoice/src/asr-backend-selector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* asr-backend-selector.cpp — registry + selection for the per-op ASR backend
* seam. A thin instantiation of eliza_backend::Registry<AsrBackendFactory>
* (backend-registry.h) — the resolution logic is shared with every other
* modality. Inert by default: no -DELIZA_ENABLE_* ASR backend is compiled in
* (none exists yet), so nothing registers, asr_backend_select() returns
* nullptr, and eliza_inference_asr_transcribe keeps the in-tree ggml path.
*/

#include "asr-backend.h"
#include "backend-registry.h"

#include <mutex>

namespace {
/* File-local registry of ASR backend factories. Written by
 * asr_backend_register() and queried by asr_backend_select(). */
eliza_backend::Registry<AsrBackendFactory> g_registry;
/* Passed to std::call_once so the builtin-registration body in
 * asr_backend_register_builtins() runs at most once. */
std::once_flag g_builtins_once;
} // namespace

/* Adds `factory` to this file's ASR registry by forwarding it, unchanged, to
 * Registry::register_factory(). */
void asr_backend_register(AsrBackendFactory * factory) {
    auto & registry = g_registry;
    registry.register_factory(factory);
}

/* One-time hook for registering the ASR backends compiled into this build.
 * The body is run through std::call_once on g_builtins_once, so repeated calls
 * are harmless. It currently registers nothing. */
void asr_backend_register_builtins() {
    const auto register_compiled_in = []() {
        /* No ASR backend exists yet — the seam stays inert. */
    };
    std::call_once(g_builtins_once, register_compiled_in);
}

/* Resolves the ASR backend for the bundle at `bundle_dir`. Builtins get their
 * one-time chance to register first; the decision itself is delegated to the
 * shared registry using the per-op key ELIZA_ASR_BACKEND, the global key
 * ELIZA_BACKEND and the modality tag "asr". The registry's result is returned
 * as-is, and `out_error` is passed straight through to it. */
AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error) {
    asr_backend_register_builtins();

    AsrBackendFactory * const chosen =
        g_registry.select("ELIZA_ASR_BACKEND", "ELIZA_BACKEND", "asr",
                          bundle_dir, out_error);
    return chosen;
}
61 changes: 61 additions & 0 deletions tools/omnivoice/src/asr-backend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once
/*
* asr-backend.h — per-op backend seam for speech-to-text transcription.
*
* A one-shot op (eliza_inference_asr_transcribe) that an accelerator backend can
* serve when it ships an ASR artifact under `<bundle>/asr/`, while every other
* op — and ASR itself when no artifact is present — stays on the in-tree ggml
* path.
*
* The factory mirrors the FFI 1:1 and the FFI delegates without translation.
* Selection reuses the shared eliza_backend::Registry (backend-registry.h):
* ELIZA_ASR_BACKEND (per-op) then ELIZA_BACKEND (global) hard-select, else the
* highest preference_rank among available()+can_serve() factories, else nullptr
* (the ggml ASR path).
*/

#include "eliza-inference-ffi.h" /* EliInferenceContext fwd, ELIZA_* codes */

#include <cstddef>

struct EliInferenceContext;

/* One factory per linked-in ASR runtime (e.g. LiteRT). */
struct AsrBackendFactory {
virtual ~AsrBackendFactory() = default;

/* Stable lower-case id, e.g. "litert". Matched case-insensitively against
* ELIZA_ASR_BACKEND / ELIZA_BACKEND. */
virtual const char * name() const = 0;

/* Compiled in AND host deps present (the runtime + a GPU/NPU delegate).
* Cheap — must not load a model. */
virtual bool available() const = 0;

/* The ASR artifact exists under `<bundle_dir>/asr/`. Cheap directory probe,
* no model load. */
virtual bool can_serve(const char * bundle_dir) const = 0;

/* Platform-affinity rank (higher wins; the ggml path is the implicit rank 0).
* An NPU-served ASR returns a high positive value; a GPU-delegate fallback a
* lower positive value. */
virtual int preference_rank() const { return 0; }

/* Mirrors eliza_inference_asr_transcribe 1:1. Returns the number of bytes
* written (excluding the terminator) on success, or a negative ELIZA_* code
* with `*out_error` heap-allocated for the caller to free. */
virtual int asr_transcribe(EliInferenceContext * ctx, const float * pcm, size_t n_samples,
int sample_rate_hz, char * out_text, size_t max_text_bytes,
char ** out_error) = 0;
};

/* Register a factory (idempotent by name). */
void asr_backend_register(AsrBackendFactory * factory);

/* Register every ASR backend compiled into THIS build (gated by the
* -DELIZA_ENABLE_* options). Idempotent; called by asr_backend_select. */
void asr_backend_register_builtins();

/* Pick an ASR backend for the bundle at `bundle_dir`. nullptr + no error
* => use the in-tree ggml ASR path. nullptr + *out_error => hard failure. */
AsrBackendFactory * asr_backend_select(const char * bundle_dir, char ** out_error);
Loading
Loading