From efad622b7992f33f98c559433bb1ecd69966a28b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 7 Jun 2026 08:39:38 +0000 Subject: [PATCH] feat(capi): ABI v4 segment-timestamp support (frame_sec + streaming JSON) Add the data LocalAI needs to build NeMo-faithful segment timestamps: - Offline JSON (transcribe_*_json) now carries "frame_sec", the encoder frame stride in seconds, so a consumer can convert NeMo's frame-unit segment_gap_threshold into the seconds gap between words. - New streaming JSON entry points parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json return {text, eou, frame_sec, words} by surfacing the streaming session's existing drain_words() per-word start/end/conf alongside the newly-finalized text and EOU flag. Bumps PARAKEET_CAPI_ABI_VERSION to 4. All existing entry points are unchanged; the new symbols are additive (consumers probe for them). tests/test_capi_stream_json.cpp drives the new streaming JSON path on the EOU model (skips with 77 when PARAKEET_TEST_GGUF_EOU is unset, like the sibling streaming tests). 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- README.md | 3 +- include/parakeet_capi.h | 31 ++++++++- src/parakeet_capi.cpp | 112 +++++++++++++++++++++++++++++++- tests/CMakeLists.txt | 5 +- tests/test_capi_stream_json.cpp | 107 ++++++++++++++++++++++++++++++ 5 files changed, 252 insertions(+), 6 deletions(-) create mode 100644 tests/test_capi_stream_json.cpp diff --git a/README.md b/README.md index a7cd0f3..c21bf27 100644 --- a/README.md +++ b/README.md @@ -275,11 +275,12 @@ Timestamps and confidence as JSON (matches NeMo `timestamps=True` + `max_prob`): ```c char *json = parakeet_capi_transcribe_path_json(ctx, "audio.wav", 0 /*default*/); // {"text":"...", +// "frame_sec":0.080000, // "words":[{"w":"Well,","start":0.480,"end":0.640,"conf":0.7859}, ...], // "tokens":[{"id":639,"t":0.480,"conf":0.9969}, ...]} if (json) { printf("%s\n", json); parakeet_capi_free_string(json); } ``` -`start`/`end`/`t` are in seconds; `conf` is the rescaled softmax probability of the emitted token in `(0,1]` (a word's `conf` is the `min` over its tokens). +`start`/`end`/`t` are in seconds; `conf` is the rescaled softmax probability of the emitted token in `(0,1]` (a word's `conf` is the `min` over its tokens). `frame_sec` is the encoder frame stride in seconds (`hop x subsampling / sample_rate`); multiply a frame-unit segment gap threshold (NeMo's `segment_gap_threshold`) by it to get the seconds gap between words when forming segments. ### Streaming (cache-aware EOU model) diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h index 8926197..b082455 100644 --- a/include/parakeet_capi.h +++ b/include/parakeet_capi.h @@ -24,6 +24,12 @@ typedef struct parakeet_ctx parakeet_ctx; // parakeet_capi_transcribe_pcm_batch_lang) for multilingual // prompt-conditioned (nemotron) models. The original non-lang entry points // are unchanged and delegate with the model default language. 
+// +// v4: added the streaming JSON entry points (parakeet_capi_stream_feed_json, +// parakeet_capi_stream_finalize_json) that surface per-word timestamps +// (start/end/conf) plus frame_sec alongside the newly-finalized text, and +// added "frame_sec" to the transcribe_*_json documents. The original entry +// points are unchanged. int parakeet_capi_abi_version(void); // Load a GGUF model. Returns an owning context, or NULL on failure. @@ -101,11 +107,15 @@ int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx, // parakeet_capi_transcribe_path. The JSON shape is: // // {"text":"...", +// "frame_sec":0.080000, // "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...], // "tokens":[{"id":123,"t":0.480,"conf":0.9100}, ...]} // // where "start"/"end"/"t" are seconds (3 decimals) and "conf" is the -// confidence in (0,1] (4 decimals). The "w"/"text" strings are JSON-escaped +// confidence in (0,1] (4 decimals). "frame_sec" is the encoder frame stride in +// seconds (hop_length * subsampling_factor / sample_rate); multiply a frame-unit +// segment gap threshold by it to get the seconds gap between words. The +// "w"/"text" strings are JSON-escaped // (", \\, and control chars). On success returns the malloc'd string (free with // parakeet_capi_free_string); on error returns NULL and sets the context's last // error. @@ -180,6 +190,25 @@ char* parakeet_capi_stream_feed(parakeet_stream* s, const float* pcm, // complete. Does NOT fabricate an <EOU> NeMo's streaming would not emit. 
char* parakeet_capi_stream_finalize(parakeet_stream* s); +// Like parakeet_capi_stream_feed but returns a malloc'd UTF-8 JSON document +// instead of bare text: +// {"text":"...","eou":0,"frame_sec":0.080000, +// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]} +// "text" is the newly-finalized text since the last call ("" if none); "eou" is +// 1 iff an <EOU>/<EOB> fired during this feed; "frame_sec" is the encoder frame +// stride in seconds; "words" are the words finalized this call with absolute +// (stream-relative) start/end seconds and 'min'-aggregate confidence (the same +// drain as the offline pk::group_words). Returns NULL only on error (see +// parakeet_capi_last_error). Free with parakeet_capi_free_string. +char* parakeet_capi_stream_feed_json(parakeet_stream* s, const float* pcm, + int n_samples); + +// Like parakeet_capi_stream_finalize but returns the same JSON document shape as +// parakeet_capi_stream_feed_json (flushing the end-of-stream tail; "eou" is +// typically 0 — finalize does not fabricate an <EOU>). Free with +// parakeet_capi_free_string; NULL only on error. +char* parakeet_capi_stream_finalize_json(parakeet_stream* s); + // Free a streaming session. Safe on NULL. void parakeet_capi_stream_free(parakeet_stream* s); diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp index 7871d0e..01de213 100644 --- a/src/parakeet_capi.cpp +++ b/src/parakeet_capi.cpp @@ -20,7 +20,11 @@ // stream_begin_lang / transcribe_pcm_batch_json_lang / // transcribe_pcm_batch_lang) for multilingual prompt-conditioned (nemotron) // models. -#define PARAKEET_CAPI_ABI_VERSION 3 +// v4: streaming JSON entry points (stream_feed_json / stream_finalize_json) that +// surface per-word timestamps (start/end/conf) plus frame_sec alongside the +// newly-finalized text + eou flag, and a "frame_sec" field added to the +// transcribe_*_json documents. Original entry points unchanged. 
+#define PARAKEET_CAPI_ABI_VERSION 4
 
 // The opaque context: a loaded model plus a buffer for the last error message.
 struct parakeet_ctx {
@@ -150,9 +154,14 @@ void append_json_float(std::string& out, const char* fmt, float v) {
 // (word start/end, token t) with %.3f, confidences with %.4f.
 std::string transcription_to_json(const pk::Transcription& tr, float frame_sec) {
     std::string out;
-    out.reserve(64 + tr.words.size() * 48 + tr.tokens.size() * 40);
+    out.reserve(80 + tr.words.size() * 48 + tr.tokens.size() * 40);
     out += "{\"text\":";
     append_json_string(out, tr.text);
+    // Encoder frame stride in seconds; lets consumers convert a frame-unit
+    // segment gap threshold (NeMo segment_gap_threshold) to the seconds gap
+    // between words when forming segments.
+    out += ",\"frame_sec\":";
+    append_json_float(out, "%.6f", frame_sec);
     out += ",\"words\":[";
     for (size_t i = 0; i < tr.words.size(); ++i) {
         if (i) out += ',';
@@ -573,6 +582,140 @@ extern "C" char* parakeet_capi_stream_finalize(parakeet_stream* s) {
     }
 }
 
+namespace {
+
+// Serialize one streaming feed/finalize result as a JSON document:
+//   {"text":"...","eou":0,"frame_sec":0.080000,
+//    "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
+// `text` is the newly-finalized text, `eou` is written as 1 for any non-zero
+// value, and `words` are the words drained this call. Word times use %.3f,
+// confidences %.4f and frame_sec %.6f.
+//
+// Templated on the word container so the helper depends only on what it reads:
+// size(), iteration, and elements exposing text/start/end/conf.
+template <typename Words>
+std::string stream_json(const std::string& text, int eou, float frame_sec,
+                        const Words& words) {
+    std::string out;
+    out.reserve(80 + words.size() * 48);
+    out += "{\"text\":";
+    append_json_string(out, text);
+    out += ",\"eou\":";
+    out += (eou ? "1" : "0");
+    out += ",\"frame_sec\":";
+    append_json_float(out, "%.6f", frame_sec);
+    out += ",\"words\":[";
+    bool first = true;
+    for (const auto& word : words) {
+        if (!first) out += ',';
+        first = false;
+        out += "{\"w\":";
+        append_json_string(out, word.text);
+        out += ",\"start\":";
+        append_json_float(out, "%.3f", word.start);
+        out += ",\"end\":";
+        append_json_float(out, "%.3f", word.end);
+        out += ",\"conf\":";
+        append_json_float(out, "%.4f", word.conf);
+        out += '}';
+    }
+    out += "]}";
+    return out;
+}
+
+// Encoder frame stride in seconds for the stream's model:
+// hop_length * subsampling_factor / sample_rate. Yields 0 for a non-positive
+// sample_rate: that avoids a division by zero, whose inf/nan result cannot be
+// represented as a JSON number.
+float stream_frame_sec(const parakeet_stream* s) {
+    const pk::ParakeetConfig& cfg = s->ctx->model->config();
+    if (cfg.sample_rate <= 0) return 0.0f;
+    return static_cast<float>(cfg.hop_length) *
+           static_cast<float>(cfg.subsampling_factor) /
+           static_cast<float>(cfg.sample_rate);
+}
+
+}  // namespace
+
+// JSON variant of the streaming feed: pushes n_samples of PCM through the mel
+// frontend, then returns {text, eou, frame_sec, words} built from
+// feed_available() and the session's drain_words(). Returns NULL on error; once
+// a context and model are attached the reason is stored in the context's
+// last_error. The returned string is produced by dup_to_c().
+extern "C" char* parakeet_capi_stream_feed_json(parakeet_stream* s,
+                                                const float* pcm, int n_samples) {
+    // A missing stream, context or model fails without touching last_error.
+    if (!s) return nullptr;
+    if (!s->ctx || !s->ctx->model) return nullptr;
+    if (n_samples < 0 || (!pcm && n_samples > 0)) {
+        s->ctx->last_error = "invalid PCM buffer";
+        return nullptr;
+    }
+    try {
+        if (n_samples > 0) {
+            // finalize_json treats a missing mel frontend as possible; fail
+            // cleanly here instead of dereferencing a null pointer.
+            if (!s->mel) {
+                s->ctx->last_error = "stream has no mel frontend";
+                return nullptr;
+            }
+            int n_new = 0;
+            auto frames = s->mel->feed(pcm, n_samples, n_new);
+            append_mel_frames(s, frames, n_new);
+        }
+        int eou = 0;
+        const std::string delta = feed_available(s, /*flush=*/false, eou);
+        const auto words = s->sess->drain_words();
+        const std::string json = stream_json(delta, eou, stream_frame_sec(s), words);
+        s->ctx->last_error.clear();
+        char* out = dup_to_c(json);
+        if (!out) { s->ctx->last_error = "out of memory"; return nullptr; }
+        return out;
+    } catch (const std::exception& e) {
+        s->ctx->last_error = e.what();
+        return nullptr;
+    } catch (...) {
+        s->ctx->last_error = "unknown error";
+        return nullptr;
+    }
+}
+
+// JSON variant of the streaming finalize: flushes the mel tail (when a mel
+// frontend is attached), runs feed_available() with flush=true, appends the
+// session's finalize() text and returns the same document shape as
+// parakeet_capi_stream_feed_json. Returns NULL on error.
+// NOTE(review): `finalized` is only written here in this hunk; confirm whether
+// the JSON feed path should reject calls once it is set.
+extern "C" char* parakeet_capi_stream_finalize_json(parakeet_stream* s) {
+    if (!s) return nullptr;
+    if (!s->ctx || !s->ctx->model) return nullptr;
+    try {
+        if (s->mel) {
+            int n_tail = 0;
+            auto tail = s->mel->finalize(n_tail);
+            append_mel_frames(s, tail, n_tail);
+        }
+        int eou = 0;
+        std::string delta = feed_available(s, /*flush=*/true, eou);
+        delta += s->sess->finalize();
+        const auto words = s->sess->drain_words();
+        const std::string json = stream_json(delta, eou, stream_frame_sec(s), words);
+        // Marked only after the document was built, so an exception above
+        // leaves the flag untouched.
+        s->finalized = true;
+        s->ctx->last_error.clear();
+        char* out = dup_to_c(json);
+        if (!out) { s->ctx->last_error = "out of memory"; return nullptr; }
+        return out;
+    } catch (const std::exception& e) {
+        s->ctx->last_error = e.what();
+        return nullptr;
+    } catch (...) {
+        s->ctx->last_error = "unknown error";
+        return nullptr;
+    }
+}
+
 extern "C" void parakeet_capi_stream_free(parakeet_stream* s) {
     delete s; // safe on nullptr
 }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f40a34a..9d71015 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -58,6 +58,7 @@ pk_add_test(test_streaming_mel) pk_add_test(test_capi) pk_add_test(test_capi_batch) pk_add_test(test_capi_stream) +pk_add_test(test_capi_stream_json) pk_add_test(test_capi_timestamps) pk_add_test(test_capi_batch_json) set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked @@ -69,7 +70,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te test_timestamps_tokens test_timestamps test_transcribe_batch_ts test_tokenizer test_transcribe test_transcribe_speech test_transcribe_tdt test_transcribe_0_6b test_transcribe_ctc test_transcribe_rnnt test_transcribe_eou test_transcribe_nemotron - test_streaming_decode 
test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream + test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream test_capi_stream_json test_capi_timestamps test_capi_batch_json PROPERTIES LABELS "model") # These tests read fixtures/baselines via paths relative to the project root. @@ -84,7 +85,7 @@ set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_bat test_tokenizer test_transcribe test_transcribe_speech test_transcribe_tdt test_transcribe_0_6b test_transcribe_ctc test_transcribe_rnnt test_transcribe_eou test_transcribe_nemotron - test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream + test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream test_capi_stream_json test_capi_timestamps test_capi_batch_json PROPERTIES WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
diff --git a/tests/test_capi_stream_json.cpp b/tests/test_capi_stream_json.cpp
new file mode 100644
index 0000000..785a37e
--- /dev/null
+++ b/tests/test_capi_stream_json.cpp
@@ -0,0 +1,119 @@
+#include "parakeet_capi.h"
+#include "audio_io.hpp" // pk::load_audio_16k_mono (test links the parakeet lib)
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+// Streaming JSON C-API smoke test (segment-timestamp support).
+//
+// Drives parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json on
+// the cache-aware EOU streaming model and asserts the returned documents carry
+// "frame_sec" (> 0) and a "words" array (the per-word start/end timestamps from
+// the streaming session's drain_words) — the data LocalAI needs to build
+// timestamped per-utterance segments.
+//
+// Skips (exit 77) unless PARAKEET_TEST_GGUF_EOU is set (the streaming EOU model
+// is a ~480MB download, not in CI).
+//
+// LABEL model
+// WORKING_DIRECTORY (run from project root; wav path is relative)
+
+static bool contains(const std::string& hay, const char* needle) {
+    return hay.find(needle) != std::string::npos;
+}
+
+// True when at least one occurrence of `key` in `hay` is immediately followed
+// by a number greater than zero. `key` is the quoted member name including the
+// trailing colon (e.g. "\"end\":"); no whitespace is expected after it.
+static bool has_positive_value(const std::string& hay, const std::string& key) {
+    for (std::size_t pos = hay.find(key); pos != std::string::npos;
+         pos = hay.find(key, pos + key.size())) {
+        const char* num = hay.c_str() + pos + key.size();
+        if (*num >= '0' && *num <= '9' && std::strtod(num, nullptr) > 0.0) return true;
+    }
+    return false;
+}
+
+int main() {
+    const char* gguf = std::getenv("PARAKEET_TEST_GGUF_EOU");
+    if (!gguf) {
+        std::fprintf(stderr,
+                     "test_capi_stream_json: PARAKEET_TEST_GGUF_EOU not set; skip "
+                     "(streaming EOU model is a ~480MB download, not in CI)\n");
+        return 77;
+    }
+
+    pk::Audio audio;
+    if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", audio)) {
+        std::fprintf(stderr, "test_capi_stream_json: failed to load speech.wav\n");
+        return 1;
+    }
+
+    parakeet_ctx* ctx = parakeet_capi_load(gguf);
+    if (!ctx) {
+        std::fprintf(stderr, "test_capi_stream_json: load failed for %s\n", gguf);
+        return 1;
+    }
+    parakeet_stream* s = parakeet_capi_stream_begin(ctx);
+    if (!s) {
+        std::fprintf(stderr, "test_capi_stream_json: stream_begin failed: %s\n",
+                     parakeet_capi_last_error(ctx));
+        parakeet_capi_free(ctx);
+        return 1;
+    }
+
+    // Feed the PCM in real-time-sized chunks (~100 ms = 1600 samples).
+    const int chunk = 1600;
+    const int n = (int)audio.samples.size();
+    std::string acc;
+    for (int off = 0; off < n; off += chunk) {
+        const int len = std::min(chunk, n - off);
+        char* t = parakeet_capi_stream_feed_json(s, audio.samples.data() + off, len);
+        if (!t) {
+            std::fprintf(stderr, "test_capi_stream_json: feed_json NULL: %s\n",
+                         parakeet_capi_last_error(ctx));
+            parakeet_capi_stream_free(s);
+            parakeet_capi_free(ctx);
+            return 1;
+        }
+        acc += t;
+        parakeet_capi_free_string(t);
+    }
+
+    char* fin = parakeet_capi_stream_finalize_json(s);
+    if (!fin) {
+        std::fprintf(stderr, "test_capi_stream_json: finalize_json NULL: %s\n",
+                     parakeet_capi_last_error(ctx));
+        parakeet_capi_stream_free(s);
+        parakeet_capi_free(ctx);
+        return 1;
+    }
+    acc += fin;
+    parakeet_capi_free_string(fin);
+
+    parakeet_capi_stream_free(s);
+    parakeet_capi_free(ctx);
+
+    std::fprintf(stderr, "test_capi_stream_json: concatenated docs:\n%s\n", acc.c_str());
+
+    if (!has_positive_value(acc, "\"frame_sec\":")) {
+        std::fprintf(stderr, "test_capi_stream_json: FAIL — no positive frame_sec in output\n");
+        return 1;
+    }
+    if (!contains(acc, "\"words\"")) {
+        std::fprintf(stderr, "test_capi_stream_json: FAIL — no words array in output\n");
+        return 1;
+    }
+    // A real transcription must finalize at least one word with a non-zero end.
+    if (!has_positive_value(acc, "\"end\":")) {
+        std::fprintf(stderr, "test_capi_stream_json: FAIL — no word with a non-zero end timestamp\n");
+        return 1;
+    }
+
+    std::fprintf(stderr, "test_capi_stream_json: PASS — streaming JSON carries "
+                         "frame_sec + per-word timestamps\n");
+    return 0;
+}