Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 31 additions & 3 deletions include/parakeet_capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ typedef struct parakeet_ctx parakeet_ctx;
// function signatures or semantics below.
//
// v3: added the target_lang variants (parakeet_capi_transcribe_path_lang,
// parakeet_capi_transcribe_pcm_lang, parakeet_capi_stream_begin_lang) for
// multilingual prompt-conditioned (nemotron) models. The original non-lang
// entry points are unchanged and delegate with the model default language.
// parakeet_capi_transcribe_pcm_lang, parakeet_capi_stream_begin_lang,
// parakeet_capi_transcribe_pcm_batch_json_lang,
// parakeet_capi_transcribe_pcm_batch_lang) for multilingual
// prompt-conditioned (nemotron) models. The original non-lang entry points
// are unchanged and delegate with the model default language.
int parakeet_capi_abi_version(void);

// Load a GGUF model. Returns an owning context, or NULL on failure.
Expand Down Expand Up @@ -79,6 +81,20 @@ int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
int sample_rate, int decoder,
char** out);

// Like parakeet_capi_transcribe_pcm_batch but selects the language prompt for
// multilingual (nemotron) models. ONE `target_lang` applies to the whole batch:
// a locale string (e.g. "en", "de", "auto"); NULL or "" uses the model's
// default ("auto"). Ignored by non-prompt models. On an unknown locale (for a
// prompt model) returns nonzero, sets the context's last error, and leaves
// every out[] entry NULL. parakeet_capi_transcribe_pcm_batch delegates here
// with the model default.
//
// Argument validation: a NULL `ctx` returns nonzero without recording an error
// (there is no context to record it on). A context with no loaded model, a
// NULL `samples` / `n_samples` / `out`, or a negative `n_clips` returns
// nonzero and sets the context's last error. NOTE: these validation failures
// return before out[] is written, so out[] keeps whatever the caller stored
// there; initialise out[] to NULL before calling if you free its entries
// unconditionally afterwards.
int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx,
const float* const* samples,
const int* n_samples, int n_clips,
int sample_rate, int decoder,
const char* target_lang,
char** out);

// Transcribe a WAV file returning a malloc'd UTF-8 JSON document with per-word
// and per-token timestamps + confidence (matching NeMo timestamps=True and the
// 'max_prob' confidence method). `decoder` is as in
Expand Down Expand Up @@ -111,6 +127,18 @@ char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
const int* n_samples, int n_clips,
int sample_rate, int decoder);

// Like parakeet_capi_transcribe_pcm_batch_json but selects the language prompt
// for multilingual (nemotron) models. ONE `target_lang` applies to the whole
// batch: a locale string (e.g. "en", "de", "auto"); NULL or "" uses the model's
// default ("auto"). Ignored by non-prompt models. On an unknown locale (for a
// prompt model) returns NULL and sets the context's last error.
// parakeet_capi_transcribe_pcm_batch_json delegates here with the model default.
//
// Argument validation: a NULL `ctx` returns NULL without recording an error
// (there is no context to record it on). A context with no loaded model, a
// NULL `samples_concat` / `n_samples`, or a negative `n_clips` returns NULL
// and sets the context's last error.
char* parakeet_capi_transcribe_pcm_batch_json_lang(parakeet_ctx* ctx,
const float* samples_concat,
const int* n_samples, int n_clips,
int sample_rate, int decoder,
const char* target_lang);

// ---------------------------------------------------------------------------
// Streaming API (cache-aware streaming RNN-T, e.g. the EOU model
// nvidia/parakeet_realtime_eou_120m-v1). The stream session buffers incoming
Expand Down
46 changes: 36 additions & 10 deletions src/parakeet_capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

// ABI version. Bump on breaking changes.
// v3: target_lang variants (transcribe_path_lang / transcribe_pcm_lang /
// stream_begin_lang) for multilingual prompt-conditioned (nemotron) models.
// stream_begin_lang / transcribe_pcm_batch_json_lang /
// transcribe_pcm_batch_lang) for multilingual prompt-conditioned (nemotron)
// models.
#define PARAKEET_CAPI_ABI_VERSION 3

// The opaque context: a loaded model plus a buffer for the last error message.
Expand Down Expand Up @@ -268,17 +270,20 @@ extern "C" char* parakeet_capi_transcribe_pcm(parakeet_ctx* ctx, const float* sa
decoder, nullptr);
}

extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
const float* const* samples,
const int* n_samples, int n_clips,
int sample_rate, int decoder,
char** out) {
extern "C" int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx,
const float* const* samples,
const int* n_samples, int n_clips,
int sample_rate, int decoder,
const char* target_lang,
char** out) {
if (!ctx) return 1;
if (!ctx->model) { ctx->last_error = "context has no loaded model"; return 1; }
if (!samples || !n_samples || !out || n_clips < 0) {
ctx->last_error = "invalid batch arguments";
return 1;
}
// NULL / "" -> model default language (ignored by non-prompt models).
const std::string lang = target_lang ? target_lang : "";
// Contract: on any error path (validation, exception, OOM) every out[]
// entry is left NULL, so the caller owns nothing and frees nothing.
for (int i = 0; i < n_clips; ++i) out[i] = nullptr;
Expand All @@ -292,7 +297,7 @@ extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
pcms[i].assign(samples[i], samples[i] + n_samples[i]);
}
std::vector<std::string> texts =
ctx->model->transcribe_pcm_batch(pcms, sample_rate, to_decoder(decoder));
ctx->model->transcribe_pcm_batch(pcms, sample_rate, to_decoder(decoder), lang);
ctx->last_error.clear();
for (int i = 0; i < n_clips; ++i) {
char* s = dup_to_c(texts[i]);
Expand All @@ -315,6 +320,16 @@ extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
}
}

extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
                                                  const float* const* samples,
                                                  const int* n_samples, int n_clips,
                                                  int sample_rate, int decoder,
                                                  char** out) {
    // Legacy (non-lang) entry point. Every argument is forwarded untouched; a
    // NULL target_lang tells the _lang variant to use the model default
    // language, so all validation and error reporting happens there.
    const char* const default_lang = nullptr;
    return parakeet_capi_transcribe_pcm_batch_lang(
        ctx, samples, n_samples, n_clips, sample_rate, decoder, default_lang, out);
}

extern "C" char* parakeet_capi_transcribe_path_json(parakeet_ctx* ctx,
const char* wav_path,
int decoder) {
Expand Down Expand Up @@ -342,14 +357,16 @@ extern "C" char* parakeet_capi_transcribe_path_json(parakeet_ctx* ctx,
}
}

extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
extern "C" char* parakeet_capi_transcribe_pcm_batch_json_lang(parakeet_ctx* ctx,
const float* samples_concat, const int* n_samples, int n_clips,
int sample_rate, int decoder) {
int sample_rate, int decoder, const char* target_lang) {
if (!ctx) return nullptr;
if (!ctx->model) { ctx->last_error = "context has no loaded model"; return nullptr; }
if (!samples_concat || !n_samples || n_clips < 0) {
ctx->last_error = "invalid batch arguments"; return nullptr;
}
// NULL / "" -> model default language (ignored by non-prompt models).
const std::string lang = target_lang ? target_lang : "";
try {
std::vector<std::vector<float>> pcms(n_clips);
size_t off = 0;
Expand All @@ -360,7 +377,7 @@ extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
}
std::vector<pk::Transcription> trs =
ctx->model->transcribe_pcm_batch_with_timestamps(pcms, sample_rate,
to_decoder(decoder));
to_decoder(decoder), lang);
const pk::ParakeetConfig& cfg = ctx->model->config();
const float frame_sec =
(float)cfg.hop_length * (float)cfg.subsampling_factor / (float)cfg.sample_rate;
Expand All @@ -381,6 +398,15 @@ extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
}
}

extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
                                                         const float* samples_concat,
                                                         const int* n_samples, int n_clips,
                                                         int sample_rate, int decoder) {
    // Legacy (non-lang) entry point. Every argument is forwarded untouched; a
    // NULL target_lang tells the _lang variant to use the model default
    // language, so all validation and error reporting happens there.
    const char* const default_lang = nullptr;
    return parakeet_capi_transcribe_pcm_batch_json_lang(
        ctx, samples_concat, n_samples, n_clips, sample_rate, decoder, default_lang);
}

// ---------------------------------------------------------------------------
// Streaming API
// ---------------------------------------------------------------------------
Expand Down
10 changes: 6 additions & 4 deletions src/subsampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ ggml_tensor* Subsampling::build_graph_batched(ggml_context* ctx,
const int F = n_mels; // feature dim (80)
const ModelLoader& ml = ml_;

// This task targets the NON-causal (offline) model only. Batched causal
// subsampling (per-stage time masking with a batch axis) is out of scope:
// the causal branch below still operates on the single-item assumption.
GGML_ASSERT(!(causal_ && B > 1) && "batched causal subsampling not supported");
// Batched causal subsampling IS supported: the causal branch below applies
// the leading ggml_pad_ext (lp1=2/rp1=1 on time) uniformly across the batch,
// and the per-item trailing-pad time masking (mask_time on the batch axis)
// plus the all_paddings=3 valid-length recurrence reproduce, per item, the
// exact standalone causal boundary. A clip in a B>1 batch is byte-identical
// to the same clip transcribed standalone (see test_subsampling_batch_causal).

// --- Input (host-side): ggml conv data layout is [W=feat, H=T, IC=1, N=B].
// NeMo conv input is [B,1,T,feat] (H=T, W=feat). We must feed
Expand Down
5 changes: 3 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pk_add_test(test_mel)
pk_add_test(test_mel_gpu)
pk_add_test(test_subsampling)
pk_add_test(test_subsampling_batch)
pk_add_test(test_subsampling_batch_causal)
pk_add_test(test_relpos_attention)
pk_add_test(test_relpos_attention_local)
pk_add_test(test_relpos_attention_local_chunked)
Expand Down Expand Up @@ -58,7 +59,7 @@ pk_add_test(test_capi_batch)
pk_add_test(test_capi_stream)
pk_add_test(test_capi_timestamps)
pk_add_test(test_capi_batch_json)
set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked
set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked
test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou
test_streaming_encoder test_ctc test_prediction
test_prediction_step test_prediction_step_batch
Expand All @@ -71,7 +72,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te
test_capi_timestamps test_capi_batch_json
PROPERTIES LABELS "model")
# These tests read fixtures/baselines via paths relative to the project root.
set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch
set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch
test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder
test_ctc test_prediction test_prediction_step test_prediction_step_batch
test_joint test_joint_step_batch test_prompt_kernel
Expand Down
Loading
Loading