From efad622b7992f33f98c559433bb1ecd69966a28b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sun, 7 Jun 2026 08:39:38 +0000
Subject: [PATCH] feat(capi): ABI v4 segment-timestamp support (frame_sec +
 streaming JSON)

Add the data LocalAI needs to build NeMo-faithful segment timestamps:

- Offline JSON (transcribe_*_json) now carries "frame_sec", the encoder
  frame stride in seconds, so a consumer can convert NeMo's frame-unit
  segment_gap_threshold into the seconds gap between words.

- New streaming JSON entry points parakeet_capi_stream_feed_json /
  parakeet_capi_stream_finalize_json return {text, eou, frame_sec, words}
  by surfacing the streaming session's existing drain_words() per-word
  start/end/conf alongside the newly-finalized text and EOU flag.

Bumps PARAKEET_CAPI_ABI_VERSION to 4. All existing entry points are
unchanged; the new symbols are additive (consumers probe for them).

tests/test_capi_stream_json.cpp drives the new streaming JSON path on the
EOU model (skips with exit code 77 when PARAKEET_TEST_GGUF_EOU is unset, like
the sibling streaming tests).

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 README.md                       |   3 +-
 include/parakeet_capi.h         |  31 ++++++++-
 src/parakeet_capi.cpp           | 112 +++++++++++++++++++++++++++++++-
 tests/CMakeLists.txt            |   5 +-
 tests/test_capi_stream_json.cpp | 107 ++++++++++++++++++++++++++++++
 5 files changed, 252 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_capi_stream_json.cpp

diff --git a/README.md b/README.md
index a7cd0f3..c21bf27 100644
--- a/README.md
+++ b/README.md
@@ -275,11 +275,12 @@ Timestamps and confidence as JSON (matches NeMo `timestamps=True` + `max_prob`):
 ```c
 char *json = parakeet_capi_transcribe_path_json(ctx, "audio.wav", 0 /*default*/);
 // {"text":"...",
+//  "frame_sec":0.080000,
 //  "words":[{"w":"Well,","start":0.480,"end":0.640,"conf":0.7859}, ...],
 //  "tokens":[{"id":639,"t":0.480,"conf":0.9969}, ...]}
 if (json) { printf("%s\n", json); parakeet_capi_free_string(json); }
 ```
-`start`/`end`/`t` are in seconds; `conf` is the rescaled softmax probability of the emitted token in `(0,1]` (a word's `conf` is the `min` over its tokens).
+`start`/`end`/`t` are in seconds; `conf` is the rescaled softmax probability of the emitted token in `(0,1]` (a word's `conf` is the `min` over its tokens). `frame_sec` is the encoder frame stride in seconds (`hop_length * subsampling_factor / sample_rate`); multiply a frame-unit segment gap threshold (NeMo's `segment_gap_threshold`) by it to get the seconds gap between words when forming segments.
 
 ### Streaming (cache-aware EOU model)
 
diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h
index 8926197..b082455 100644
--- a/include/parakeet_capi.h
+++ b/include/parakeet_capi.h
@@ -24,6 +24,12 @@ typedef struct parakeet_ctx parakeet_ctx;
 //     parakeet_capi_transcribe_pcm_batch_lang) for multilingual
 //     prompt-conditioned (nemotron) models. The original non-lang entry points
 //     are unchanged and delegate with the model default language.
+//
+// v4: added the streaming JSON entry points (parakeet_capi_stream_feed_json,
+//     parakeet_capi_stream_finalize_json) that surface per-word timestamps
+//     (start/end/conf) plus frame_sec alongside the newly-finalized text, and
+//     added "frame_sec" to the transcribe_*_json documents. The original entry
+//     points are unchanged.
 int parakeet_capi_abi_version(void);
 
 // Load a GGUF model. Returns an owning context, or NULL on failure.
@@ -101,11 +107,15 @@ int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx,
 // parakeet_capi_transcribe_path. The JSON shape is:
 //
 //   {"text":"...",
+//    "frame_sec":0.080000,
 //    "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...],
 //    "tokens":[{"id":123,"t":0.480,"conf":0.9100}, ...]}
 //
 // where "start"/"end"/"t" are seconds (3 decimals) and "conf" is the
-// confidence in (0,1] (4 decimals). The "w"/"text" strings are JSON-escaped
+// confidence in (0,1] (4 decimals). "frame_sec" is the encoder frame stride in
+// seconds (hop_length * subsampling_factor / sample_rate); multiply a frame-unit
+// segment gap threshold by it to get the seconds gap between words. The
+// "w"/"text" strings are JSON-escaped
 // (", \\, and control chars). On success returns the malloc'd string (free with
 // parakeet_capi_free_string); on error returns NULL and sets the context's last
 // error.
@@ -180,6 +190,25 @@ char* parakeet_capi_stream_feed(parakeet_stream* s, const float* pcm,
 // complete. Does NOT fabricate an <EOU> NeMo's streaming would not emit.
 char* parakeet_capi_stream_finalize(parakeet_stream* s);
 
+// Like parakeet_capi_stream_feed but returns a malloc'd UTF-8 JSON document
+// instead of bare text:
+//   {"text":"...","eou":0,"frame_sec":0.080000,
+//    "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
+// "text" is the newly-finalized text since the last call ("" if none); "eou" is
+// 1 iff an <EOU>/<EOB> fired during this feed; "frame_sec" is the encoder frame
+// stride in seconds; "words" are the words finalized this call with absolute
+// (stream-relative) start/end seconds and 'min'-aggregate confidence (the same
+// drain as the offline pk::group_words). Returns NULL only on error (see
+// parakeet_capi_last_error). Free with parakeet_capi_free_string.
+char* parakeet_capi_stream_feed_json(parakeet_stream* s, const float* pcm,
+                                     int n_samples);
+
+// Like parakeet_capi_stream_finalize but returns the same JSON document shape as
+// parakeet_capi_stream_feed_json (flushing the end-of-stream tail; "eou" is
+// typically 0 — finalize does not fabricate an <EOU>). Free with
+// parakeet_capi_free_string; NULL only on error.
+char* parakeet_capi_stream_finalize_json(parakeet_stream* s);
+
 // Free a streaming session. Safe on NULL.
 void parakeet_capi_stream_free(parakeet_stream* s);
 
diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp
index 7871d0e..01de213 100644
--- a/src/parakeet_capi.cpp
+++ b/src/parakeet_capi.cpp
@@ -20,7 +20,11 @@
 //     stream_begin_lang / transcribe_pcm_batch_json_lang /
 //     transcribe_pcm_batch_lang) for multilingual prompt-conditioned (nemotron)
 //     models.
-#define PARAKEET_CAPI_ABI_VERSION 3
+// v4: streaming JSON entry points (stream_feed_json / stream_finalize_json) that
+//     surface per-word timestamps (start/end/conf) plus frame_sec alongside the
+//     newly-finalized text + eou flag, and a "frame_sec" field added to the
+//     transcribe_*_json documents. Original entry points unchanged.
+#define PARAKEET_CAPI_ABI_VERSION 4
 
 // The opaque context: a loaded model plus a buffer for the last error message.
 struct parakeet_ctx {
@@ -150,9 +154,14 @@ void append_json_float(std::string& out, const char* fmt, float v) {
 // (word start/end, token t) with %.3f, confidences with %.4f.
 std::string transcription_to_json(const pk::Transcription& tr, float frame_sec) {
     std::string out;
-    out.reserve(64 + tr.words.size() * 48 + tr.tokens.size() * 40);
+    out.reserve(80 + tr.words.size() * 48 + tr.tokens.size() * 40);
     out += "{\"text\":";
     append_json_string(out, tr.text);
+    // Encoder frame stride in seconds; lets consumers convert a frame-unit
+    // segment gap threshold (NeMo segment_gap_threshold) to the seconds gap
+    // between words when forming segments.
+    out += ",\"frame_sec\":";
+    append_json_float(out, "%.6f", frame_sec);
     out += ",\"words\":[";
     for (size_t i = 0; i < tr.words.size(); ++i) {
         if (i) out += ',';
@@ -573,6 +582,105 @@ extern "C" char* parakeet_capi_stream_finalize(parakeet_stream* s) {
     }
 }
 
+namespace {
+
+// Builds the JSON document returned by the streaming feed/finalize entry points:
+// newly-finalized text, eou flag (0/1), frame_sec, then the words drained this
+// call. Key order follows the header doc on parakeet_capi_stream_feed_json.
+std::string stream_json(const std::string& text, int eou, float frame_sec,
+                        const std::vector<pk::Word>& words) {
+    std::string doc;
+    doc.reserve(80 + words.size() * 48);
+    doc += "{\"text\":";
+    append_json_string(doc, text);
+    doc += ",\"eou\":";
+    doc += (eou != 0) ? '1' : '0';
+    doc += ",\"frame_sec\":";
+    append_json_float(doc, "%.6f", frame_sec);
+    doc += ",\"words\":[";
+    for (const pk::Word& word : words) {
+        if (&word != &words.front()) doc += ',';  // separator before all but the first
+        doc += "{\"w\":";
+        append_json_string(doc, word.text);
+        doc += ",\"start\":";
+        append_json_float(doc, "%.3f", word.start);
+        doc += ",\"end\":";
+        append_json_float(doc, "%.3f", word.end);
+        doc += ",\"conf\":";
+        append_json_float(doc, "%.4f", word.conf);
+        doc += '}';
+    }
+    doc += "]}";
+    return doc;
+}
+
+// Encoder frame stride in seconds, derived from the config of the stream's model.
+float stream_frame_sec(const parakeet_stream* s) {
+    const pk::ParakeetConfig& c = s->ctx->model->config();
+    return static_cast<float>(c.hop_length) * static_cast<float>(c.subsampling_factor) / static_cast<float>(c.sample_rate);
+}
+
+} // namespace
+
+extern "C" char* parakeet_capi_stream_feed_json(parakeet_stream* s,
+                                                const float* pcm, int n_samples) {
+    // Without a stream, context and model there is nowhere to record an error.
+    if (!s || !s->ctx || !s->ctx->model) return nullptr;
+    if (n_samples < 0 || (n_samples > 0 && pcm == nullptr)) {
+        s->ctx->last_error = "invalid PCM buffer";
+        return nullptr;
+    }
+    try {
+        if (n_samples > 0) {  // an empty feed pushes nothing through the mel frontend
+            int n_frames = 0;
+            std::vector<float> mel_frames = s->mel->feed(pcm, n_samples, n_frames);
+            append_mel_frames(s, mel_frames, n_frames);
+        }
+        int eou_flag = 0;
+        std::string new_text = feed_available(s, /*flush=*/false, eou_flag);
+        std::vector<pk::Word> new_words = s->sess->drain_words();
+        std::string doc = stream_json(new_text, eou_flag, stream_frame_sec(s), new_words);
+        s->ctx->last_error.clear();
+        char* result = dup_to_c(doc);
+        if (result == nullptr) s->ctx->last_error = "out of memory";
+        return result;
+    } catch (const std::exception& ex) {
+        s->ctx->last_error = ex.what();
+        return nullptr;
+    } catch (...) {
+        s->ctx->last_error = "unknown error";
+        return nullptr;
+    }
+}
+
+extern "C" char* parakeet_capi_stream_finalize_json(parakeet_stream* s) {
+    if (!s || !s->ctx || !s->ctx->model) return nullptr;
+    try {
+        // Append the mel frontend's end-of-stream tail frames, if a frontend exists.
+        if (s->mel) {
+            int n_frames = 0;
+            std::vector<float> tail_frames = s->mel->finalize(n_frames);
+            append_mel_frames(s, tail_frames, n_frames);
+        }
+        int eou_flag = 0;
+        std::string new_text = feed_available(s, /*flush=*/true, eou_flag);
+        new_text += s->sess->finalize();
+        std::vector<pk::Word> new_words = s->sess->drain_words();
+        std::string doc = stream_json(new_text, eou_flag, stream_frame_sec(s), new_words);
+        s->finalized = true;
+        s->ctx->last_error.clear();
+        char* result = dup_to_c(doc);
+        if (result == nullptr) s->ctx->last_error = "out of memory";
+        return result;
+    } catch (const std::exception& ex) {
+        s->ctx->last_error = ex.what();
+        return nullptr;
+    } catch (...) {
+        s->ctx->last_error = "unknown error";
+        return nullptr;
+    }
+}
+
 extern "C" void parakeet_capi_stream_free(parakeet_stream* s) {
     delete s;  // safe on nullptr
 }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f40a34a..9d71015 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -58,6 +58,7 @@ pk_add_test(test_streaming_mel)
 pk_add_test(test_capi)
 pk_add_test(test_capi_batch)
 pk_add_test(test_capi_stream)
+pk_add_test(test_capi_stream_json)
 pk_add_test(test_capi_timestamps)
 pk_add_test(test_capi_batch_json)
 set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked
@@ -69,7 +70,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te
                      test_timestamps_tokens test_timestamps test_transcribe_batch_ts test_tokenizer test_transcribe
                      test_transcribe_speech test_transcribe_tdt test_transcribe_0_6b
                      test_transcribe_ctc test_transcribe_rnnt test_transcribe_eou test_transcribe_nemotron
-                     test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream
+                     test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream test_capi_stream_json
                      test_capi_timestamps test_capi_batch_json
                      PROPERTIES LABELS "model")
 # These tests read fixtures/baselines via paths relative to the project root.
@@ -84,7 +85,7 @@ set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_bat
                      test_tokenizer test_transcribe
                      test_transcribe_speech test_transcribe_tdt test_transcribe_0_6b
                      test_transcribe_ctc test_transcribe_rnnt test_transcribe_eou test_transcribe_nemotron
-                     test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream
+                     test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream test_capi_stream_json
                      test_capi_timestamps test_capi_batch_json
                      PROPERTIES WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
 
diff --git a/tests/test_capi_stream_json.cpp b/tests/test_capi_stream_json.cpp
new file mode 100644
index 0000000..785a37e
--- /dev/null
+++ b/tests/test_capi_stream_json.cpp
@@ -0,0 +1,107 @@
+#include "parakeet_capi.h"
+#include "audio_io.hpp"   // pk::load_audio_16k_mono (test links the parakeet lib)
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+// Streaming JSON C-API smoke test (segment-timestamp support).
+//
+// Drives parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json on
+// the cache-aware EOU streaming model and asserts the returned documents carry
+// "frame_sec" (> 0) and a "words" array (the per-word start/end timestamps from
+// the streaming session's drain_words) — the data LocalAI needs to build
+// timestamped per-utterance segments.
+//
+// Skips (exit 77) unless PARAKEET_TEST_GGUF_EOU is set (the streaming EOU model
+// is a ~480MB download, not in CI).
+//
+// LABEL model
+// WORKING_DIRECTORY (run from project root; wav path is relative)
+
+static bool contains(const std::string& hay, const char* needle) {
+    return std::string::npos != hay.find(needle);  // true iff needle occurs in hay
+}
+
+int main() {
+    const char* gguf = std::getenv("PARAKEET_TEST_GGUF_EOU");
+    if (!gguf) {
+        std::fprintf(stderr,
+            "test_capi_stream_json: PARAKEET_TEST_GGUF_EOU not set; skip "
+            "(streaming EOU model is a ~480MB download, not in CI)\n");
+        return 77;
+    }
+
+    pk::Audio audio;
+    if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", audio)) {
+        std::fprintf(stderr, "test_capi_stream_json: failed to load speech.wav\n");
+        return 1;
+    }
+
+    parakeet_ctx* ctx = parakeet_capi_load(gguf);
+    if (!ctx) {
+        std::fprintf(stderr, "test_capi_stream_json: load failed for %s\n", gguf);
+        return 1;
+    }
+    parakeet_stream* s = parakeet_capi_stream_begin(ctx);
+    if (!s) {
+        std::fprintf(stderr, "test_capi_stream_json: stream_begin failed: %s\n",
+                     parakeet_capi_last_error(ctx));
+        parakeet_capi_free(ctx);
+        return 1;
+    }
+
+    // Feed the PCM in real-time-sized chunks (~100 ms = 1600 samples).
+    const int chunk = 1600;
+    const int n = (int)audio.samples.size();
+    std::string acc;
+    for (int off = 0; off < n; off += chunk) {
+        const int len = std::min(chunk, n - off);
+        char* t = parakeet_capi_stream_feed_json(s, audio.samples.data() + off, len);
+        if (!t) {
+            std::fprintf(stderr, "test_capi_stream_json: feed_json NULL: %s\n",
+                         parakeet_capi_last_error(ctx));
+            parakeet_capi_stream_free(s);
+            parakeet_capi_free(ctx);
+            return 1;
+        }
+        acc += t;
+        parakeet_capi_free_string(t);
+    }
+
+    char* fin = parakeet_capi_stream_finalize_json(s);
+    if (!fin) {
+        std::fprintf(stderr, "test_capi_stream_json: finalize_json NULL: %s\n",
+                     parakeet_capi_last_error(ctx));
+        parakeet_capi_stream_free(s);
+        parakeet_capi_free(ctx);
+        return 1;
+    }
+    acc += fin;
+    parakeet_capi_free_string(fin);
+
+    parakeet_capi_stream_free(s);
+    parakeet_capi_free(ctx);
+
+    std::fprintf(stderr, "test_capi_stream_json: concatenated docs:\n%s\n", acc.c_str());
+
+    // "frame_sec" is serialized with %.6f, so a zero stride reads 0.000000.
+    if (!contains(acc, "\"frame_sec\":") || contains(acc, "\"frame_sec\":0.000000")) {
+        std::fprintf(stderr, "test_capi_stream_json: FAIL — frame_sec missing or zero\n");
+        return 1;
+    }
+    // A real transcription must finalize at least one word with a non-zero end.
+    bool nonzero_end = false;  // "end" is serialized with %.3f, followed by ','
+    for (std::size_t p = acc.find("\"end\":"); p != std::string::npos; p = acc.find("\"end\":", p + 1))
+        nonzero_end = nonzero_end || acc.compare(p + 6, 6, "0.000,") != 0;
+    if (!nonzero_end) {
+        std::fprintf(stderr, "test_capi_stream_json: FAIL — no word with a non-zero end\n");
+        return 1;
+    }
+
+    std::fprintf(stderr, "test_capi_stream_json: PASS — streaming JSON carries "
+                         "frame_sec + per-word timestamps\n");
+    return 0;
+}