diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h index def9d2d..8926197 100644 --- a/include/parakeet_capi.h +++ b/include/parakeet_capi.h @@ -19,9 +19,11 @@ typedef struct parakeet_ctx parakeet_ctx; // function signatures or semantics below. // // v3: added the target_lang variants (parakeet_capi_transcribe_path_lang, -// parakeet_capi_transcribe_pcm_lang, parakeet_capi_stream_begin_lang) for -// multilingual prompt-conditioned (nemotron) models. The original non-lang -// entry points are unchanged and delegate with the model default language. +// parakeet_capi_transcribe_pcm_lang, parakeet_capi_stream_begin_lang, +// parakeet_capi_transcribe_pcm_batch_json_lang, +// parakeet_capi_transcribe_pcm_batch_lang) for multilingual +// prompt-conditioned (nemotron) models. The original non-lang entry points +// are unchanged and delegate with the model default language. int parakeet_capi_abi_version(void); // Load a GGUF model. Returns an owning context, or NULL on failure. @@ -79,6 +81,20 @@ int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx, int sample_rate, int decoder, char** out); +// Like parakeet_capi_transcribe_pcm_batch but selects the language prompt for +// multilingual (nemotron) models. ONE `target_lang` applies to the whole batch: +// a locale string (e.g. "en", "de", "auto"); NULL or "" uses the model's +// default ("auto"). Ignored by non-prompt models. On an unknown locale (for a +// prompt model) returns nonzero, sets the context's last error, and leaves +// every out[] entry NULL. parakeet_capi_transcribe_pcm_batch delegates here +// with the model default. 
+int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx, + const float* const* samples, + const int* n_samples, int n_clips, + int sample_rate, int decoder, + const char* target_lang, + char** out); + // Transcribe a WAV file returning a malloc'd UTF-8 JSON document with per-word // and per-token timestamps + confidence (matching NeMo timestamps=True and the // 'max_prob' confidence method). `decoder` is as in @@ -111,6 +127,18 @@ char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx, const int* n_samples, int n_clips, int sample_rate, int decoder); +// Like parakeet_capi_transcribe_pcm_batch_json but selects the language prompt +// for multilingual (nemotron) models. ONE `target_lang` applies to the whole +// batch: a locale string (e.g. "en", "de", "auto"); NULL or "" uses the model's +// default ("auto"). Ignored by non-prompt models. On an unknown locale (for a +// prompt model) returns NULL and sets the context's last error. +// parakeet_capi_transcribe_pcm_batch_json delegates here with the model default. +char* parakeet_capi_transcribe_pcm_batch_json_lang(parakeet_ctx* ctx, + const float* samples_concat, + const int* n_samples, int n_clips, + int sample_rate, int decoder, + const char* target_lang); + // --------------------------------------------------------------------------- // Streaming API (cache-aware streaming RNN-T, e.g. the EOU model // nvidia/parakeet_realtime_eou_120m-v1). The stream session buffers incoming diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp index 40e8577..7871d0e 100644 --- a/src/parakeet_capi.cpp +++ b/src/parakeet_capi.cpp @@ -17,7 +17,9 @@ // ABI version. Bump on breaking changes. // v3: target_lang variants (transcribe_path_lang / transcribe_pcm_lang / -// stream_begin_lang) for multilingual prompt-conditioned (nemotron) models. +// stream_begin_lang / transcribe_pcm_batch_json_lang / +// transcribe_pcm_batch_lang) for multilingual prompt-conditioned (nemotron) +// models. 
#define PARAKEET_CAPI_ABI_VERSION 3 // The opaque context: a loaded model plus a buffer for the last error message. @@ -268,17 +270,20 @@ extern "C" char* parakeet_capi_transcribe_pcm(parakeet_ctx* ctx, const float* sa decoder, nullptr); } -extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx, - const float* const* samples, - const int* n_samples, int n_clips, - int sample_rate, int decoder, - char** out) { +extern "C" int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx, + const float* const* samples, + const int* n_samples, int n_clips, + int sample_rate, int decoder, + const char* target_lang, + char** out) { if (!ctx) return 1; if (!ctx->model) { ctx->last_error = "context has no loaded model"; return 1; } if (!samples || !n_samples || !out || n_clips < 0) { ctx->last_error = "invalid batch arguments"; return 1; } + // NULL / "" -> model default (ignored by non-prompt models); a C string, so nothing can throw here. + const char* const lang = target_lang ? target_lang : ""; // Contract: on any error path (validation, exception, OOM) every out[] // entry is left NULL, so the caller owns nothing and frees nothing. for (int i = 0; i < n_clips; ++i) out[i] = nullptr; @@ -292,7 +297,7 @@ extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx, pcms[i].assign(samples[i], samples[i] + n_samples[i]); } std::vector texts = - ctx->model->transcribe_pcm_batch(pcms, sample_rate, to_decoder(decoder)); + ctx->model->transcribe_pcm_batch(pcms, sample_rate, to_decoder(decoder), lang); ctx->last_error.clear(); for (int i = 0; i < n_clips; ++i) { char* s = dup_to_c(texts[i]); @@ -315,6 +320,16 @@ extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx, } } +extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx, + const float* const* samples, + const int* n_samples, int n_clips, + int sample_rate, int decoder, + char** out) { + // Delegate with the model default language. 
+ return parakeet_capi_transcribe_pcm_batch_lang(ctx, samples, n_samples, n_clips, + sample_rate, decoder, nullptr, out); +} + extern "C" char* parakeet_capi_transcribe_path_json(parakeet_ctx* ctx, const char* wav_path, int decoder) { @@ -342,14 +357,16 @@ extern "C" char* parakeet_capi_transcribe_path_json(parakeet_ctx* ctx, } } -extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx, +extern "C" char* parakeet_capi_transcribe_pcm_batch_json_lang(parakeet_ctx* ctx, const float* samples_concat, const int* n_samples, int n_clips, - int sample_rate, int decoder) { + int sample_rate, int decoder, const char* target_lang) { if (!ctx) return nullptr; if (!ctx->model) { ctx->last_error = "context has no loaded model"; return nullptr; } if (!samples_concat || !n_samples || n_clips < 0) { ctx->last_error = "invalid batch arguments"; return nullptr; } + // NULL / "" -> model default (ignored by non-prompt models); a C string, so nothing throws outside the try. + const char* const lang = target_lang ? target_lang : ""; try { std::vector> pcms(n_clips); size_t off = 0; @@ -360,7 +377,7 @@ extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx, } std::vector trs = ctx->model->transcribe_pcm_batch_with_timestamps(pcms, sample_rate, - to_decoder(decoder)); + to_decoder(decoder), lang); const pk::ParakeetConfig& cfg = ctx->model->config(); const float frame_sec = (float)cfg.hop_length * (float)cfg.subsampling_factor / (float)cfg.sample_rate; @@ -381,6 +398,15 @@ extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx, } } +extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx, + const float* samples_concat, const int* n_samples, int n_clips, + int sample_rate, int decoder) { + // Delegate with the model default language. 
+ return parakeet_capi_transcribe_pcm_batch_json_lang(ctx, samples_concat, n_samples, + n_clips, sample_rate, decoder, + nullptr); +} + // --------------------------------------------------------------------------- // Streaming API // --------------------------------------------------------------------------- diff --git a/src/subsampling.cpp b/src/subsampling.cpp index f06b881..7af909a 100644 --- a/src/subsampling.cpp +++ b/src/subsampling.cpp @@ -54,10 +54,12 @@ ggml_tensor* Subsampling::build_graph_batched(ggml_context* ctx, const int F = n_mels; // feature dim (80) const ModelLoader& ml = ml_; - // This task targets the NON-causal (offline) model only. Batched causal - // subsampling (per-stage time masking with a batch axis) is out of scope: - // the causal branch below still operates on the single-item assumption. - GGML_ASSERT(!(causal_ && B > 1) && "batched causal subsampling not supported"); + // Batched causal subsampling IS supported: the causal branch below applies + // the leading ggml_pad_ext (lp1=2/rp1=1 on time) uniformly across the batch, + // and the per-item trailing-pad time masking (mask_time on the batch axis) + // plus the all_paddings=3 valid-length recurrence reproduce, per item, the + // exact standalone causal boundary. A clip in a B>1 batch is byte-identical + // to the same clip transcribed standalone (see test_subsampling_batch_causal). // --- Input (host-side): ggml conv data layout is [W=feat, H=T, IC=1, N=B]. // NeMo conv input is [B,1,T,feat] (H=T, W=feat). 
We must feed diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 71a3999..b83f76a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -14,6 +14,7 @@ pk_add_test(test_mel) pk_add_test(test_mel_gpu) pk_add_test(test_subsampling) pk_add_test(test_subsampling_batch) +pk_add_test(test_subsampling_batch_causal) pk_add_test(test_relpos_attention) pk_add_test(test_relpos_attention_local) pk_add_test(test_relpos_attention_local_chunked) @@ -58,7 +59,7 @@ pk_add_test(test_capi_batch) pk_add_test(test_capi_stream) pk_add_test(test_capi_timestamps) pk_add_test(test_capi_batch_json) -set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked +set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder test_ctc test_prediction test_prediction_step test_prediction_step_batch @@ -71,7 +72,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te test_capi_timestamps test_capi_batch_json PROPERTIES LABELS "model") # These tests read fixtures/baselines via paths relative to the project root. 
-set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch +set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder test_ctc test_prediction test_prediction_step test_prediction_step_batch test_joint test_joint_step_batch test_prompt_kernel diff --git a/tests/test_capi_batch_json.cpp b/tests/test_capi_batch_json.cpp index 0ccb38c..038603d 100644 --- a/tests/test_capi_batch_json.cpp +++ b/tests/test_capi_batch_json.cpp @@ -4,42 +4,124 @@ #include #include #include + +// The two model env vars are independent: the offline block (PARAKEET_TEST_GGUF) +// exercises the non-lang batch JSON path and its language-aware delegate on a +// real offline model; the prompt block (PARAKEET_TEST_GGUF_NEMOTRON) exercises +// the batched target_lang error handling on a multilingual prompt model. Each +// block runs only when its env var is set; if neither is set we skip (77). int main() { - const char* gguf = std::getenv("PARAKEET_TEST_GGUF"); - if (!gguf) { std::fprintf(stderr, "env not set; skip\n"); return 77; } - parakeet_ctx* ctx = parakeet_capi_load(gguf); - if (!ctx) { std::fprintf(stderr, "load failed\n"); return 1; } + bool ran_any = false; + + // Build a 2-clip batch (speech.wav twice) reused by both blocks. 
pk::Audio a; if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", a) || a.samples.empty()) { - std::fprintf(stderr, "wav load failed\n"); parakeet_capi_free(ctx); return 1; + std::fprintf(stderr, "wav load failed\n"); return 1; } - char* single = parakeet_capi_transcribe_path_json(ctx, "tests/fixtures/speech.wav", 0); - if (!single) { std::fprintf(stderr, "single failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; } - std::string single_doc(single); - parakeet_capi_free_string(single); - std::vector concat; concat.insert(concat.end(), a.samples.begin(), a.samples.end()); concat.insert(concat.end(), a.samples.begin(), a.samples.end()); int n_samples[2] = { (int)a.samples.size(), (int)a.samples.size() }; - char* batch = parakeet_capi_transcribe_pcm_batch_json(ctx, concat.data(), n_samples, 2, 16000, 0); - if (!batch) { std::fprintf(stderr, "batch failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; } - std::string doc(batch); - parakeet_capi_free_string(batch); - - auto text_field = [](const std::string& s) -> std::string { - size_t p = s.find("\"text\":\""); - if (p == std::string::npos) return ""; - p += 8; size_t q = s.find('"', p); - return s.substr(p, q - p); - }; - std::string t = text_field(single_doc); - bool is_array = !doc.empty() && doc.front() == '[' && doc.back() == ']'; - size_t cnt = 0, pos = 0; - std::string needle = "\"text\":\"" + t + "\""; - while ((pos = doc.find(needle, pos)) != std::string::npos) { ++cnt; pos += needle.size(); } - bool ok = is_array && !t.empty() && cnt == 2; - std::fprintf(stderr, "array=%d text='%s' count=%zu -> %s\n", is_array, t.c_str(), cnt, ok?"OK":"FAIL"); - parakeet_capi_free(ctx); - return ok ? 
0 : 1; + + const char* gguf = std::getenv("PARAKEET_TEST_GGUF"); + if (gguf) { + ran_any = true; + parakeet_ctx* ctx = parakeet_capi_load(gguf); + if (!ctx) { std::fprintf(stderr, "load failed\n"); return 1; } + + char* single = parakeet_capi_transcribe_path_json(ctx, "tests/fixtures/speech.wav", 0); + if (!single) { std::fprintf(stderr, "single failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; } + std::string single_doc(single); + parakeet_capi_free_string(single); + + char* batch = parakeet_capi_transcribe_pcm_batch_json(ctx, concat.data(), n_samples, 2, 16000, 0); + if (!batch) { std::fprintf(stderr, "batch failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; } + std::string doc(batch); + parakeet_capi_free_string(batch); + + auto text_field = [](const std::string& s) -> std::string { + size_t p = s.find("\"text\":\""); + if (p == std::string::npos) return ""; + p += 8; size_t q = s.find('"', p); + return s.substr(p, q - p); + }; + std::string t = text_field(single_doc); + bool is_array = !doc.empty() && doc.front() == '[' && doc.back() == ']'; + size_t cnt = 0, pos = 0; + std::string needle = "\"text\":\"" + t + "\""; + while ((pos = doc.find(needle, pos)) != std::string::npos) { ++cnt; pos += needle.size(); } + bool ok = is_array && !t.empty() && cnt == 2; + std::fprintf(stderr, "array=%d text='%s' count=%zu -> %s\n", is_array, t.c_str(), cnt, ok?"OK":"FAIL"); + + // The language-aware delegate with the model default (NULL) must match the + // non-lang path byte-for-byte on an offline (non-prompt) model. 
+ char* batch_lang = parakeet_capi_transcribe_pcm_batch_json_lang(ctx, concat.data(), n_samples, 2, 16000, 0, nullptr); + bool lang_ok = batch_lang && doc == std::string(batch_lang); + std::fprintf(stderr, "batch_json_lang(NULL)==batch_json -> %s\n", lang_ok?"OK":"FAIL"); + parakeet_capi_free_string(batch_lang); + + parakeet_capi_free(ctx); + if (!ok || !lang_ok) return 1; + } + + // Prompt (multilingual / nemotron) model: exercise the batched target_lang + // variant. This fixture is a CAUSAL streaming prompt model + // (causal_downsampling=True). Batched causal subsampling is now supported + // (byte-identical to per-item), so a valid-language 2-clip batch runs through + // the batched encoder and returns a JSON array of length 2. We also assert + // the catchable error path: an unknown locale is rejected by + // resolve_prompt_index (which runs before the encoder) -> NULL + non-empty + // last_error, proving target_lang is threaded through the batched C-API. + // Skipped cleanly when the env var is unset. + const char* nemotron = std::getenv("PARAKEET_TEST_GGUF_NEMOTRON"); + if (nemotron) { + ran_any = true; + parakeet_ctx* nctx = parakeet_capi_load(nemotron); + if (!nctx) { std::fprintf(stderr, "nemotron load failed\n"); return 1; } + + // A valid locale ("de") must return a non-NULL JSON array of length 2. + char* ngood = parakeet_capi_transcribe_pcm_batch_json_lang( + nctx, concat.data(), n_samples, 2, 16000, 0, "de"); + if (!ngood) { + std::fprintf(stderr, "nemotron batch_json_lang(de) returned NULL: %s\n", + parakeet_capi_last_error(nctx)); + parakeet_capi_free(nctx); + return 1; + } + std::string ndoc(ngood); + parakeet_capi_free_string(ngood); + bool nis_array = !ndoc.empty() && ndoc.front() == '[' && ndoc.back() == ']'; + // Two JSON objects in the array (one per clip). 
+ size_t nobj = 0, npos = 0; + while ((npos = ndoc.find("\"text\":", npos)) != std::string::npos) { ++nobj; npos += 7; } + bool ngood_ok = nis_array && nobj == 2; + std::fprintf(stderr, "nemotron batch_json_lang(de) array=%d objects=%zu -> %s\n", + nis_array, nobj, ngood_ok ? "OK" : "FAIL"); + if (!ngood_ok) { parakeet_capi_free(nctx); return 1; } + + // An unknown locale must fail cleanly: NULL + non-empty last_error. + char* nbad = parakeet_capi_transcribe_pcm_batch_json_lang( + nctx, concat.data(), n_samples, 2, 16000, 0, "zzz"); + if (nbad != nullptr) { + std::fprintf(stderr, "nemotron batch_json_lang(zzz) returned non-NULL\n"); + parakeet_capi_free_string(nbad); + parakeet_capi_free(nctx); + return 1; + } + const char* nerr = parakeet_capi_last_error(nctx); + if (!nerr || nerr[0] == '\0') { + std::fprintf(stderr, "nemotron unknown-lang did not set last_error\n"); + parakeet_capi_free(nctx); + return 1; + } + std::fprintf(stderr, "nemotron unknown-lang error = %s\n", nerr); + parakeet_capi_free(nctx); + std::fprintf(stderr, "PASS nemotron batch_json_lang error path\n"); + } + + if (!ran_any) { + std::fprintf(stderr, "no model env var set (PARAKEET_TEST_GGUF / PARAKEET_TEST_GGUF_NEMOTRON); skip\n"); + return 77; + } + return 0; } diff --git a/tests/test_subsampling_batch_causal.cpp b/tests/test_subsampling_batch_causal.cpp new file mode 100644 index 0000000..d9b79fe --- /dev/null +++ b/tests/test_subsampling_batch_causal.cpp @@ -0,0 +1,131 @@ +#include "model.hpp" +#include "audio_io.hpp" +#include +#include +#include +#include + +// Batched CAUSAL subsampling parity for the multilingual streaming nemotron +// model (causal_downsampling=True). Greedy decode is deterministic and the +// per-item causal path is NeMo-validated at WER 0, so the gold check is +// byte-identical equivalence: a clip transcribed inside a B>1 batch MUST equal +// the SAME clip transcribed standalone. 
Mixed-length batches exercise the +// per-item trailing-pad masking that interacts with the causal right pad of 1. +// +// Skips (77) unless PARAKEET_TEST_GGUF_NEMOTRON points at a causal nemotron gguf. +int main() { + const char* gguf = std::getenv("PARAKEET_TEST_GGUF_NEMOTRON"); + if (!gguf) { std::fprintf(stderr, "PARAKEET_TEST_GGUF_NEMOTRON not set; skip\n"); return 77; } + + auto model = pk::Model::load(gguf); + if (!model) { std::fprintf(stderr, "load failed\n"); return 1; } + + pk::Audio speech, clip; + if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", speech) || speech.samples.empty()) { + std::fprintf(stderr, "speech.wav load failed\n"); return 1; + } + if (!pk::load_audio_16k_mono("tests/fixtures/clip.wav", clip) || clip.samples.empty()) { + std::fprintf(stderr, "clip.wav load failed\n"); return 1; + } + + const std::string lang = "en"; + + // A truncated speech slice: a SHORTER clip that still carries real content + // (non-empty transcript), so when it is the padded/masked item in a batch a + // per-item masking bug would corrupt its tokens. This is the strongest + // mixed-length masking probe. + std::vector half(speech.samples.begin(), + speech.samples.begin() + (speech.samples.size() * 3) / 5); + + // Per-item standalone references (the ground truth for byte-identity). + std::string ref_speech = model->transcribe_16k(speech.samples, pk::Decoder::kDefault, lang); + std::string ref_clip = model->transcribe_16k(clip.samples, pk::Decoder::kDefault, lang); + std::string ref_half = model->transcribe_16k(half, pk::Decoder::kDefault, lang); + std::fprintf(stderr, "ref_speech='%s'\nref_clip='%s'\nref_half='%s'\n", + ref_speech.c_str(), ref_clip.c_str(), ref_half.c_str()); + + bool ok = true; + + // (a) Uniform batch: both items identical to the standalone speech transcript. 
+ { + auto out = model->transcribe_pcm_batch({speech.samples, speech.samples}, 16000, + pk::Decoder::kDefault, lang); + bool pass = out.size() == 2 && out[0] == ref_speech && out[1] == ref_speech; + std::fprintf(stderr, "(a) uniform batch: size=%zu item0=%s item1=%s -> %s\n", + out.size(), + (out.size() > 0 && out[0] == ref_speech) ? "OK" : "DIFF", + (out.size() > 1 && out[1] == ref_speech) ? "OK" : "DIFF", + pass ? "PASS" : "FAIL"); + ok = ok && pass; + } + + // (b) MIXED-LENGTH batch (the real test): different lengths exercise the + // per-item causal masking; each item must be byte-identical to standalone. + { + auto out = model->transcribe_pcm_batch({speech.samples, clip.samples}, 16000, + pk::Decoder::kDefault, lang); + bool pass = out.size() == 2 && out[0] == ref_speech && out[1] == ref_clip; + std::fprintf(stderr, "(b) mixed batch: size=%zu speech=%s clip=%s -> %s\n", + out.size(), + (out.size() > 0 && out[0] == ref_speech) ? "OK" : "DIFF", + (out.size() > 1 && out[1] == ref_clip) ? "OK" : "DIFF", + pass ? "PASS" : "FAIL"); + if (out.size() > 0 && out[0] != ref_speech) + std::fprintf(stderr, " batched speech='%s'\n", out[0].c_str()); + if (out.size() > 1 && out[1] != ref_clip) + std::fprintf(stderr, " batched clip='%s'\n", out[1].c_str()); + ok = ok && pass; + } + + // Reversed order too (clip first): order must not perturb per-item identity. + { + auto out = model->transcribe_pcm_batch({clip.samples, speech.samples}, 16000, + pk::Decoder::kDefault, lang); + bool pass = out.size() == 2 && out[0] == ref_clip && out[1] == ref_speech; + std::fprintf(stderr, "(b') reversed mixed batch: clip=%s speech=%s -> %s\n", + (out.size() > 0 && out[0] == ref_clip) ? "OK" : "DIFF", + (out.size() > 1 && out[1] == ref_speech) ? "OK" : "DIFF", + pass ? "PASS" : "FAIL"); + ok = ok && pass; + } + + // (b'') MIXED-LENGTH with a non-empty shorter item, both orderings. 
The half + // slice is the padded/masked item in one ordering; its tokens must be + // byte-identical to its standalone transcript. + { + auto out1 = model->transcribe_pcm_batch({speech.samples, half}, 16000, + pk::Decoder::kDefault, lang); + auto out2 = model->transcribe_pcm_batch({half, speech.samples}, 16000, + pk::Decoder::kDefault, lang); + bool pass = out1.size() == 2 && out1[0] == ref_speech && out1[1] == ref_half + && out2.size() == 2 && out2[0] == ref_half && out2[1] == ref_speech; + std::fprintf(stderr, + "(b'') [speech,half]: speech=%s half=%s ; [half,speech]: half=%s speech=%s -> %s\n", + (out1.size() > 0 && out1[0] == ref_speech) ? "OK" : "DIFF", + (out1.size() > 1 && out1[1] == ref_half) ? "OK" : "DIFF", + (out2.size() > 0 && out2[0] == ref_half) ? "OK" : "DIFF", + (out2.size() > 1 && out2[1] == ref_speech) ? "OK" : "DIFF", + pass ? "PASS" : "FAIL"); + if (out2.size() > 0 && out2[0] != ref_half) + std::fprintf(stderr, " batched half (padded item)='%s'\n", out2[0].c_str()); + ok = ok && pass; + } + + // (c) Batched timestamps consistency: per-item text matches the single-clip + // timestamped text. + { + auto rs = model->transcribe_with_timestamps(speech.samples, 16000, pk::Decoder::kDefault, lang); + auto rc = model->transcribe_with_timestamps(clip.samples, 16000, pk::Decoder::kDefault, lang); + auto out = model->transcribe_pcm_batch_with_timestamps({speech.samples, clip.samples}, 16000, + pk::Decoder::kDefault, lang); + bool pass = out.size() == 2 && out[0].text == rs.text && out[1].text == rc.text; + std::fprintf(stderr, "(c) batched timestamps: speech=%s clip=%s -> %s\n", + (out.size() > 0 && out[0].text == rs.text) ? "OK" : "DIFF", + (out.size() > 1 && out[1].text == rc.text) ? "OK" : "DIFF", + pass ? "PASS" : "FAIL"); + ok = ok && pass; + } + + std::fprintf(stderr, "%s\n", ok ? "ALL PASS" : "FAILURES"); + return ok ? 0 : 1; +}