Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,11 +275,12 @@ Timestamps and confidence as JSON (matches NeMo `timestamps=True` + `max_prob`):
```c
char *json = parakeet_capi_transcribe_path_json(ctx, "audio.wav", 0 /*default*/);
// {"text":"...",
// "frame_sec":0.080000,
// "words":[{"w":"Well,","start":0.480,"end":0.640,"conf":0.7859}, ...],
// "tokens":[{"id":639,"t":0.480,"conf":0.9969}, ...]}
if (json) { printf("%s\n", json); parakeet_capi_free_string(json); }
```
`start`/`end`/`t` are in seconds; `conf` is the rescaled softmax probability of the emitted token in `(0,1]` (a word's `conf` is the `min` over its tokens).
`start`/`end`/`t` are in seconds; `conf` is the rescaled softmax probability of the emitted token in `(0,1]` (a word's `conf` is the `min` over its tokens). `frame_sec` is the encoder frame stride in seconds (`hop x subsampling / sample_rate`); multiply a frame-unit segment gap threshold (NeMo's `segment_gap_threshold`) by it to get the seconds gap between words when forming segments.

### Streaming (cache-aware EOU model)

Expand Down
31 changes: 30 additions & 1 deletion include/parakeet_capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ typedef struct parakeet_ctx parakeet_ctx;
// parakeet_capi_transcribe_pcm_batch_lang) for multilingual
// prompt-conditioned (nemotron) models. The original non-lang entry points
// are unchanged and delegate with the model default language.
//
// v4: added the streaming JSON entry points (parakeet_capi_stream_feed_json,
// parakeet_capi_stream_finalize_json) that surface per-word timestamps
// (start/end/conf) plus frame_sec alongside the newly-finalized text, and
// added "frame_sec" to the transcribe_*_json documents. The original entry
// points are unchanged.
int parakeet_capi_abi_version(void);

// Load a GGUF model. Returns an owning context, or NULL on failure.
Expand Down Expand Up @@ -101,11 +107,15 @@ int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx,
// parakeet_capi_transcribe_path. The JSON shape is:
//
// {"text":"...",
// "frame_sec":0.080000,
// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...],
// "tokens":[{"id":123,"t":0.480,"conf":0.9100}, ...]}
//
// where "start"/"end"/"t" are seconds (3 decimals) and "conf" is the
// confidence in (0,1] (4 decimals). The "w"/"text" strings are JSON-escaped
// confidence in (0,1] (4 decimals). "frame_sec" is the encoder frame stride in
// seconds (hop_length * subsampling_factor / sample_rate); multiply a frame-unit
// segment gap threshold by it to get the seconds gap between words. The
// "w"/"text" strings are JSON-escaped
// (", \\, and control chars). On success returns the malloc'd string (free with
// parakeet_capi_free_string); on error returns NULL and sets the context's last
// error.
Expand Down Expand Up @@ -180,6 +190,25 @@ char* parakeet_capi_stream_feed(parakeet_stream* s, const float* pcm,
// complete. Does NOT fabricate an <EOU> NeMo's streaming would not emit.
char* parakeet_capi_stream_finalize(parakeet_stream* s);

// Like parakeet_capi_stream_feed but returns a malloc'd UTF-8 JSON document
// instead of bare text:
// {"text":"...","eou":0,"frame_sec":0.080000,
// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
// "text" is the newly-finalized text since the last call ("" if none); "eou" is
// 1 iff an <EOU>/<EOB> fired during this feed; "frame_sec" is the encoder frame
// stride in seconds; "words" are the words finalized this call with absolute
// (stream-relative) start/end seconds and 'min'-aggregate confidence (the same
// drain as the offline pk::group_words). Returns NULL only on error (see
// parakeet_capi_last_error). Free with parakeet_capi_free_string.
char* parakeet_capi_stream_feed_json(parakeet_stream* s, const float* pcm,
int n_samples);

// Like parakeet_capi_stream_finalize but returns the same JSON document shape as
// parakeet_capi_stream_feed_json (flushing the end-of-stream tail; "eou" is
// typically 0 — finalize does not fabricate an <EOU>). Free with
// parakeet_capi_free_string; NULL only on error.
char* parakeet_capi_stream_finalize_json(parakeet_stream* s);

// Free a streaming session. Safe on NULL.
void parakeet_capi_stream_free(parakeet_stream* s);

Expand Down
112 changes: 110 additions & 2 deletions src/parakeet_capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@
// stream_begin_lang / transcribe_pcm_batch_json_lang /
// transcribe_pcm_batch_lang) for multilingual prompt-conditioned (nemotron)
// models.
#define PARAKEET_CAPI_ABI_VERSION 3
// v4: streaming JSON entry points (stream_feed_json / stream_finalize_json) that
// surface per-word timestamps (start/end/conf) plus frame_sec alongside the
// newly-finalized text + eou flag, and a "frame_sec" field added to the
// transcribe_*_json documents. Original entry points unchanged.
#define PARAKEET_CAPI_ABI_VERSION 4

// The opaque context: a loaded model plus a buffer for the last error message.
struct parakeet_ctx {
Expand Down Expand Up @@ -150,9 +154,14 @@ void append_json_float(std::string& out, const char* fmt, float v) {
// (word start/end, token t) with %.3f, confidences with %.4f.
std::string transcription_to_json(const pk::Transcription& tr, float frame_sec) {
std::string out;
out.reserve(64 + tr.words.size() * 48 + tr.tokens.size() * 40);
out.reserve(80 + tr.words.size() * 48 + tr.tokens.size() * 40);
out += "{\"text\":";
append_json_string(out, tr.text);
// Encoder frame stride in seconds; lets consumers convert a frame-unit
// segment gap threshold (NeMo segment_gap_threshold) to the seconds gap
// between words when forming segments.
out += ",\"frame_sec\":";
append_json_float(out, "%.6f", frame_sec);
out += ",\"words\":[";
for (size_t i = 0; i < tr.words.size(); ++i) {
if (i) out += ',';
Expand Down Expand Up @@ -573,6 +582,105 @@ extern "C" char* parakeet_capi_stream_finalize(parakeet_stream* s) {
}
}

namespace {

// Serialize a streaming feed/finalize result to JSON: the newly-finalized text,
// the eou flag, frame_sec, and the words drained this call (absolute seconds).
// Shape matches the header doc on parakeet_capi_stream_feed_json.
std::string stream_json(const std::string& text, int eou, float frame_sec,
const std::vector<pk::Word>& words) {
std::string out;
out.reserve(80 + words.size() * 48);
out += "{\"text\":";
append_json_string(out, text);
out += ",\"eou\":";
out += (eou ? "1" : "0");
out += ",\"frame_sec\":";
append_json_float(out, "%.6f", frame_sec);
out += ",\"words\":[";
for (size_t i = 0; i < words.size(); ++i) {
if (i) out += ',';
out += "{\"w\":";
append_json_string(out, words[i].text);
out += ",\"start\":";
append_json_float(out, "%.3f", words[i].start);
out += ",\"end\":";
append_json_float(out, "%.3f", words[i].end);
out += ",\"conf\":";
append_json_float(out, "%.4f", words[i].conf);
out += '}';
}
out += "]}";
return out;
}

// frame_sec for the stream's model (encoder frame stride in seconds).
float stream_frame_sec(const parakeet_stream* s) {
const pk::ParakeetConfig& cfg = s->ctx->model->config();
return (float)cfg.hop_length * (float)cfg.subsampling_factor / (float)cfg.sample_rate;
}

} // namespace

// JSON variant of the streaming feed entry point.
//
// Pushes `n_samples` floats from `pcm` through the stream's mel front end (when
// n_samples > 0), runs feed_available without flushing, drains the session's
// words, and returns the stream_json document as a malloc'd C string produced
// by dup_to_c.
//
// Returns nullptr when:
//   - s, s->ctx or s->ctx->model is null (no error message is recorded);
//   - n_samples is negative, or pcm is null while n_samples > 0
//     (last_error = "invalid PCM buffer");
//   - dup_to_c returns null (last_error = "out of memory");
//   - an exception escapes the pipeline (last_error = e.what() or
//     "unknown error").
// On the success path last_error is cleared before the string is duplicated.
//
// NOTE(review): s->mel and s->sess are dereferenced without a null check here;
// presumably the stream constructor guarantees them — confirm.
extern "C" char* parakeet_capi_stream_feed_json(parakeet_stream* s,
                                                const float* pcm, int n_samples) {
    if (s == nullptr || !s->ctx || !s->ctx->model) return nullptr;

    auto& last_error = s->ctx->last_error;

    const bool buffer_ok = n_samples >= 0 && (pcm != nullptr || n_samples == 0);
    if (!buffer_ok) {
        last_error = "invalid PCM buffer";
        return nullptr;
    }

    try {
        // An empty feed skips the mel front end but still runs the decode step.
        if (n_samples > 0) {
            int new_frame_count = 0;
            std::vector<float> mel_frames = s->mel->feed(pcm, n_samples, new_frame_count);
            append_mel_frames(s, mel_frames, new_frame_count);
        }

        int eou_flag = 0;
        std::string new_text = feed_available(s, /*flush=*/false, eou_flag);
        std::vector<pk::Word> new_words = s->sess->drain_words();
        std::string doc = stream_json(new_text, eou_flag, stream_frame_sec(s), new_words);

        last_error.clear();
        char* result = dup_to_c(doc);
        if (result == nullptr) last_error = "out of memory";
        return result;
    } catch (const std::exception& e) {
        last_error = e.what();
        return nullptr;
    } catch (...) {
        last_error = "unknown error";
        return nullptr;
    }
}

// JSON variant of the streaming finalize entry point.
//
// Flushes the mel front end's tail (when s->mel is set), runs feed_available
// with flush=true, appends the session's finalize() text, drains the remaining
// words and returns the stream_json document as a malloc'd C string produced by
// dup_to_c. The stream is marked finalized once the document has been built.
//
// Returns nullptr when s, s->ctx or s->ctx->model is null (no error message is
// recorded), when dup_to_c returns null (last_error = "out of memory"), or when
// an exception escapes (last_error = e.what() or "unknown error").
//
// NOTE(review): s->finalized is written but never read here, so calling this
// twice is not rejected by this function — confirm that is intended.
extern "C" char* parakeet_capi_stream_finalize_json(parakeet_stream* s) {
    if (s == nullptr || !s->ctx || !s->ctx->model) return nullptr;

    auto& last_error = s->ctx->last_error;

    try {
        if (s->mel) {
            int tail_frame_count = 0;
            std::vector<float> tail_frames = s->mel->finalize(tail_frame_count);
            append_mel_frames(s, tail_frames, tail_frame_count);
        }

        int eou_flag = 0;
        std::string final_text = feed_available(s, /*flush=*/true, eou_flag);
        final_text += s->sess->finalize();
        std::vector<pk::Word> final_words = s->sess->drain_words();
        std::string doc = stream_json(final_text, eou_flag, stream_frame_sec(s), final_words);

        s->finalized = true;
        last_error.clear();

        char* result = dup_to_c(doc);
        if (result == nullptr) last_error = "out of memory";
        return result;
    } catch (const std::exception& e) {
        last_error = e.what();
        return nullptr;
    } catch (...) {
        last_error = "unknown error";
        return nullptr;
    }
}

// Release a streaming session handle with delete. A null handle is accepted
// and does nothing.
extern "C" void parakeet_capi_stream_free(parakeet_stream* s) {
    if (s == nullptr) return;
    delete s;
}
Expand Down
5 changes: 3 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ pk_add_test(test_streaming_mel)
pk_add_test(test_capi)
pk_add_test(test_capi_batch)
pk_add_test(test_capi_stream)
pk_add_test(test_capi_stream_json)
pk_add_test(test_capi_timestamps)
pk_add_test(test_capi_batch_json)
set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked
Expand All @@ -69,7 +70,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te
test_timestamps_tokens test_timestamps test_transcribe_batch_ts test_tokenizer test_transcribe
test_transcribe_speech test_transcribe_tdt test_transcribe_0_6b
test_transcribe_ctc test_transcribe_rnnt test_transcribe_eou test_transcribe_nemotron
test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream
test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream test_capi_stream_json
test_capi_timestamps test_capi_batch_json
PROPERTIES LABELS "model")
# These tests read fixtures/baselines via paths relative to the project root.
Expand All @@ -84,7 +85,7 @@ set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_bat
test_tokenizer test_transcribe
test_transcribe_speech test_transcribe_tdt test_transcribe_0_6b
test_transcribe_ctc test_transcribe_rnnt test_transcribe_eou test_transcribe_nemotron
test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream
test_streaming_decode test_streaming_eou_reset test_streaming_nemotron test_streaming_mel test_capi test_capi_batch test_capi_stream test_capi_stream_json
test_capi_timestamps test_capi_batch_json
PROPERTIES WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

Expand Down
107 changes: 107 additions & 0 deletions tests/test_capi_stream_json.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#include "parakeet_capi.h"
#include "audio_io.hpp" // pk::load_audio_16k_mono (test links the parakeet lib)

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Streaming JSON C-API smoke test (segment-timestamp support).
//
// Drives parakeet_capi_stream_feed_json / parakeet_capi_stream_finalize_json on
// the cache-aware EOU streaming model and asserts the returned documents carry
// "frame_sec" (> 0) and a "words" array (the per-word start/end timestamps from
// the streaming session's drain_words) — the data LocalAI needs to build
// timestamped per-utterance segments.
//
// Skips (exit 77) unless PARAKEET_TEST_GGUF_EOU is set (the streaming EOU model
// is a ~480MB download, not in CI).
//
// LABEL model
// WORKING_DIRECTORY (run from project root; wav path is relative)

// True when `needle` occurs anywhere inside `hay`; an empty needle always
// matches.
static bool contains(const std::string& hay, const char* needle) {
    const std::string::size_type at = hay.find(needle);
    return at != std::string::npos;
}

// Streams tests/fixtures/speech.wav through the streaming JSON C API in ~100 ms
// chunks, concatenates every returned document (feed + finalize), and checks
// the concatenation.
//
// Exit codes: 77 = skipped (PARAKEET_TEST_GGUF_EOU unset), 1 = failure, 0 = pass.
//
// Beyond key presence, the checks verify the values the test promises:
// "frame_sec" must be > 0 and at least one word must have an "end" > 0.
// (The "words" key is emitted even for an empty array, so presence alone does
// not prove that any word was finalized.)
int main() {
    const char* gguf = std::getenv("PARAKEET_TEST_GGUF_EOU");
    if (!gguf) {
        std::fprintf(stderr,
                     "test_capi_stream_json: PARAKEET_TEST_GGUF_EOU not set; skip "
                     "(streaming EOU model is a ~480MB download, not in CI)\n");
        return 77;
    }

    pk::Audio audio;
    if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", audio)) {
        std::fprintf(stderr, "test_capi_stream_json: failed to load speech.wav\n");
        return 1;
    }

    parakeet_ctx* ctx = parakeet_capi_load(gguf);
    if (!ctx) {
        std::fprintf(stderr, "test_capi_stream_json: load failed for %s\n", gguf);
        return 1;
    }
    parakeet_stream* s = parakeet_capi_stream_begin(ctx);
    if (!s) {
        std::fprintf(stderr, "test_capi_stream_json: stream_begin failed: %s\n",
                     parakeet_capi_last_error(ctx));
        parakeet_capi_free(ctx);
        return 1;
    }

    // Feed the PCM in real-time-sized chunks (~100 ms = 1600 samples).
    const int chunk = 1600;
    const int n = (int)audio.samples.size();
    std::string acc;
    for (int off = 0; off < n; off += chunk) {
        const int len = std::min(chunk, n - off);
        char* t = parakeet_capi_stream_feed_json(s, audio.samples.data() + off, len);
        if (!t) {
            std::fprintf(stderr, "test_capi_stream_json: feed_json NULL: %s\n",
                         parakeet_capi_last_error(ctx));
            parakeet_capi_stream_free(s);
            parakeet_capi_free(ctx);
            return 1;
        }
        acc += t;
        parakeet_capi_free_string(t);
    }

    char* fin = parakeet_capi_stream_finalize_json(s);
    if (!fin) {
        std::fprintf(stderr, "test_capi_stream_json: finalize_json NULL: %s\n",
                     parakeet_capi_last_error(ctx));
        parakeet_capi_stream_free(s);
        parakeet_capi_free(ctx);
        return 1;
    }
    acc += fin;
    parakeet_capi_free_string(fin);

    parakeet_capi_stream_free(s);
    parakeet_capi_free(ctx);

    std::fprintf(stderr, "test_capi_stream_json: concatenated docs:\n%s\n", acc.c_str());

    // True when some occurrence of `key` in `doc` is directly followed by a
    // number greater than zero. std::strtod stops at the first character that
    // cannot be part of the number (',' or '}' in these documents) and yields 0
    // when no number is present, so a missing value never counts as positive.
    const auto has_positive_value = [](const std::string& doc,
                                       const std::string& key) -> bool {
        for (std::string::size_type at = doc.find(key); at != std::string::npos;
             at = doc.find(key, at + key.size())) {
            if (std::strtod(doc.c_str() + at + key.size(), nullptr) > 0.0) return true;
        }
        return false;
    };

    if (!contains(acc, "\"frame_sec\"")) {
        std::fprintf(stderr, "test_capi_stream_json: FAIL — no frame_sec in output\n");
        return 1;
    }
    // Presence is not enough: a zero stride would make every frame-to-seconds
    // conversion downstream collapse to zero.
    if (!has_positive_value(acc, "\"frame_sec\":")) {
        std::fprintf(stderr, "test_capi_stream_json: FAIL — frame_sec is not > 0\n");
        return 1;
    }
    if (!contains(acc, "\"words\"")) {
        std::fprintf(stderr, "test_capi_stream_json: FAIL — no words array in output\n");
        return 1;
    }
    // A real transcription must finalize at least one word with a non-zero end.
    if (!contains(acc, "\"end\":")) {
        std::fprintf(stderr, "test_capi_stream_json: FAIL — no word end timestamps\n");
        return 1;
    }
    if (!has_positive_value(acc, "\"end\":")) {
        std::fprintf(stderr, "test_capi_stream_json: FAIL — no word with end > 0\n");
        return 1;
    }

    std::fprintf(stderr, "test_capi_stream_json: PASS — streaming JSON carries "
                         "frame_sec + per-word timestamps\n");
    return 0;
}
Loading