diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h
index def9d2d..8926197 100644
--- a/include/parakeet_capi.h
+++ b/include/parakeet_capi.h
@@ -19,9 +19,11 @@ typedef struct parakeet_ctx parakeet_ctx;
 // function signatures or semantics below.
 //
 // v3: added the target_lang variants (parakeet_capi_transcribe_path_lang,
-//     parakeet_capi_transcribe_pcm_lang, parakeet_capi_stream_begin_lang) for
-//     multilingual prompt-conditioned (nemotron) models. The original non-lang
-//     entry points are unchanged and delegate with the model default language.
+//     parakeet_capi_transcribe_pcm_lang, parakeet_capi_stream_begin_lang,
+//     parakeet_capi_transcribe_pcm_batch_json_lang,
+//     parakeet_capi_transcribe_pcm_batch_lang) for multilingual
+//     prompt-conditioned (nemotron) models. The original non-lang entry points
+//     are unchanged and delegate with the model default language.
 int parakeet_capi_abi_version(void);
 
 // Load a GGUF model. Returns an owning context, or NULL on failure.
@@ -79,6 +81,20 @@ int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
                                        int sample_rate, int decoder,
                                        char** out);
 
+// Like parakeet_capi_transcribe_pcm_batch but selects the language prompt for
+// multilingual (nemotron) models. ONE `target_lang` applies to the whole batch:
+// a locale string (e.g. "en", "de", "auto"); NULL or "" uses the model's
+// default ("auto"). Ignored by non-prompt models. On an unknown locale (for a
+// prompt model) returns nonzero, sets the context's last error, and leaves
+// every out[] entry NULL. parakeet_capi_transcribe_pcm_batch delegates here
+// with the model default.
+int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx,
+                                            const float* const* samples,
+                                            const int* n_samples, int n_clips,
+                                            int sample_rate, int decoder,
+                                            const char* target_lang,
+                                            char** out);
+
 // Transcribe a WAV file returning a malloc'd UTF-8 JSON document with per-word
 // and per-token timestamps + confidence (matching NeMo timestamps=True and the
 // 'max_prob' confidence method). `decoder` is as in
@@ -111,6 +127,18 @@ char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
                                               const int* n_samples, int n_clips,
                                               int sample_rate, int decoder);
 
+// Like parakeet_capi_transcribe_pcm_batch_json but selects the language prompt
+// for multilingual (nemotron) models. ONE `target_lang` applies to the whole
+// batch: a locale string (e.g. "en", "de", "auto"); NULL or "" uses the model's
+// default ("auto"). Ignored by non-prompt models. On an unknown locale (for a
+// prompt model) returns NULL and sets the context's last error.
+// parakeet_capi_transcribe_pcm_batch_json delegates here with the model default.
+char* parakeet_capi_transcribe_pcm_batch_json_lang(parakeet_ctx* ctx,
+                                                   const float* samples_concat,
+                                                   const int* n_samples, int n_clips,
+                                                   int sample_rate, int decoder,
+                                                   const char* target_lang);
+
 // ---------------------------------------------------------------------------
 // Streaming API (cache-aware streaming RNN-T, e.g. the EOU model
 // nvidia/parakeet_realtime_eou_120m-v1). The stream session buffers incoming
diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp
index 40e8577..7871d0e 100644
--- a/src/parakeet_capi.cpp
+++ b/src/parakeet_capi.cpp
@@ -17,7 +17,9 @@
 
 // ABI version. Bump on breaking changes.
 // v3: target_lang variants (transcribe_path_lang / transcribe_pcm_lang /
-//     stream_begin_lang) for multilingual prompt-conditioned (nemotron) models.
+//     stream_begin_lang / transcribe_pcm_batch_json_lang /
+//     transcribe_pcm_batch_lang) for multilingual prompt-conditioned (nemotron)
+//     models.
 #define PARAKEET_CAPI_ABI_VERSION 3
 
 // The opaque context: a loaded model plus a buffer for the last error message.
@@ -268,17 +270,20 @@ extern "C" char* parakeet_capi_transcribe_pcm(parakeet_ctx* ctx, const float* sa
                                              decoder, nullptr);
 }
 
-extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
-                                                  const float* const* samples,
-                                                  const int* n_samples, int n_clips,
-                                                  int sample_rate, int decoder,
-                                                  char** out) {
+extern "C" int parakeet_capi_transcribe_pcm_batch_lang(parakeet_ctx* ctx,
+                                                       const float* const* samples,
+                                                       const int* n_samples, int n_clips,
+                                                       int sample_rate, int decoder,
+                                                       const char* target_lang,
+                                                       char** out) {
     if (!ctx) return 1;
     if (!ctx->model) { ctx->last_error = "context has no loaded model"; return 1; }
     if (!samples || !n_samples || !out || n_clips < 0) {
         ctx->last_error = "invalid batch arguments";
         return 1;
     }
+    // NULL / "" -> model default (ignored by non-prompt models). C string: cannot throw before out[] is cleared.
+    const char* const lang = target_lang ? target_lang : "";
     // Contract: on any error path (validation, exception, OOM) every out[]
     // entry is left NULL, so the caller owns nothing and frees nothing.
     for (int i = 0; i < n_clips; ++i) out[i] = nullptr;
@@ -292,7 +297,7 @@ extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
             pcms[i].assign(samples[i], samples[i] + n_samples[i]);
         }
         std::vector<std::string> texts =
-            ctx->model->transcribe_pcm_batch(pcms, sample_rate, to_decoder(decoder));
+            ctx->model->transcribe_pcm_batch(pcms, sample_rate, to_decoder(decoder), lang);
         ctx->last_error.clear();
         for (int i = 0; i < n_clips; ++i) {
             char* s = dup_to_c(texts[i]);
@@ -315,6 +320,16 @@ extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
     }
 }
 
+extern "C" int parakeet_capi_transcribe_pcm_batch(parakeet_ctx* ctx,
+                                                  const float* const* samples,
+                                                  const int* n_samples, int n_clips,
+                                                  int sample_rate, int decoder,
+                                                  char** out) {
+    // Thin wrapper: a NULL target_lang makes the _lang variant use the model default language.
+    return parakeet_capi_transcribe_pcm_batch_lang(ctx, samples, n_samples, n_clips,
+                                                   sample_rate, decoder, nullptr, out);
+}
+
 extern "C" char* parakeet_capi_transcribe_path_json(parakeet_ctx* ctx,
                                                     const char* wav_path,
                                                     int decoder) {
@@ -342,14 +357,16 @@ extern "C" char* parakeet_capi_transcribe_path_json(parakeet_ctx* ctx,
     }
 }
 
-extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
+extern "C" char* parakeet_capi_transcribe_pcm_batch_json_lang(parakeet_ctx* ctx,
         const float* samples_concat, const int* n_samples, int n_clips,
-        int sample_rate, int decoder) {
+        int sample_rate, int decoder, const char* target_lang) {
     if (!ctx) return nullptr;
     if (!ctx->model) { ctx->last_error = "context has no loaded model"; return nullptr; }
     if (!samples_concat || !n_samples || n_clips < 0) {
         ctx->last_error = "invalid batch arguments"; return nullptr;
     }
+    // NULL / "" -> model default language (ignored by non-prompt models).
+    const std::string lang = target_lang ? target_lang : "";
     try {
         std::vector<std::vector<float>> pcms(n_clips);
         size_t off = 0;
@@ -360,7 +377,7 @@ extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
         }
         std::vector<pk::Transcription> trs =
             ctx->model->transcribe_pcm_batch_with_timestamps(pcms, sample_rate,
-                                                             to_decoder(decoder));
+                                                             to_decoder(decoder), lang);
         const pk::ParakeetConfig& cfg = ctx->model->config();
         const float frame_sec =
             (float)cfg.hop_length * (float)cfg.subsampling_factor / (float)cfg.sample_rate;
@@ -381,6 +398,15 @@ extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
     }
 }
 
+extern "C" char* parakeet_capi_transcribe_pcm_batch_json(parakeet_ctx* ctx,
+        const float* samples_concat, const int* n_samples, int n_clips,
+        int sample_rate, int decoder) {
+    // Thin wrapper: a NULL target_lang makes the _lang variant use the model default language.
+    return parakeet_capi_transcribe_pcm_batch_json_lang(ctx, samples_concat, n_samples,
+                                                        n_clips, sample_rate, decoder,
+                                                        nullptr);
+}
+
 // ---------------------------------------------------------------------------
 // Streaming API
 // ---------------------------------------------------------------------------
diff --git a/src/subsampling.cpp b/src/subsampling.cpp
index f06b881..7af909a 100644
--- a/src/subsampling.cpp
+++ b/src/subsampling.cpp
@@ -54,10 +54,12 @@ ggml_tensor* Subsampling::build_graph_batched(ggml_context* ctx,
     const int F = n_mels;            // feature dim (80)
     const ModelLoader& ml = ml_;
 
-    // This task targets the NON-causal (offline) model only. Batched causal
-    // subsampling (per-stage time masking with a batch axis) is out of scope:
-    // the causal branch below still operates on the single-item assumption.
-    GGML_ASSERT(!(causal_ && B > 1) && "batched causal subsampling not supported");
+    // Batched causal subsampling IS supported: the causal branch below applies
+    // the leading ggml_pad_ext (lp1=2/rp1=1 on time) uniformly across the batch,
+    // and the per-item trailing-pad time masking (mask_time on the batch axis)
+    // plus the all_paddings=3 valid-length recurrence reproduce, per item, the
+    // standalone causal boundary. A clip in a B>1 batch must yield the same
+    // transcript as the same clip standalone (see test_subsampling_batch_causal).
 
     // --- Input (host-side): ggml conv data layout is [W=feat, H=T, IC=1, N=B].
     // NeMo conv input is [B,1,T,feat] (H=T, W=feat). We must feed
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 71a3999..b83f76a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -14,6 +14,7 @@ pk_add_test(test_mel)
 pk_add_test(test_mel_gpu)
 pk_add_test(test_subsampling)
 pk_add_test(test_subsampling_batch)
+pk_add_test(test_subsampling_batch_causal)
 pk_add_test(test_relpos_attention)
 pk_add_test(test_relpos_attention_local)
 pk_add_test(test_relpos_attention_local_chunked)
@@ -58,7 +59,7 @@ pk_add_test(test_capi_batch)
 pk_add_test(test_capi_stream)
 pk_add_test(test_capi_timestamps)
 pk_add_test(test_capi_batch_json)
-set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked
+set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_relpos_attention_local_chunked
                      test_conformer test_conformer_batch test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou
                      test_streaming_encoder test_ctc test_prediction
                      test_prediction_step test_prediction_step_batch
@@ -71,7 +72,7 @@ set_tests_properties(test_model_loader test_mel test_mel_gpu test_subsampling te
                      test_capi_timestamps test_capi_batch_json
                      PROPERTIES LABELS "model")
 # These tests read fixtures/baselines via paths relative to the project root.
-set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch
+set_tests_properties(test_mel test_mel_gpu test_subsampling test_subsampling_batch test_subsampling_batch_causal test_relpos_attention test_relpos_attention_batch test_conformer test_conformer_batch
                      test_conv_eou test_encoder test_encoder_batch test_encoder_batch_local test_encoder_eou test_streaming_encoder
                      test_ctc test_prediction test_prediction_step test_prediction_step_batch
                      test_joint test_joint_step_batch test_prompt_kernel
diff --git a/tests/test_capi_batch_json.cpp b/tests/test_capi_batch_json.cpp
index 0ccb38c..038603d 100644
--- a/tests/test_capi_batch_json.cpp
+++ b/tests/test_capi_batch_json.cpp
@@ -4,42 +4,124 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+
+// The two model env vars are independent: the offline block (PARAKEET_TEST_GGUF)
+// exercises the non-lang batch JSON path and its language-aware delegate on a
+// real offline model; the prompt block (PARAKEET_TEST_GGUF_NEMOTRON) exercises
+// the batched target_lang error handling on a multilingual prompt model. Each
+// block runs only when its env var is set; if neither is set we skip (77).
 int main() {
-    const char* gguf = std::getenv("PARAKEET_TEST_GGUF");
-    if (!gguf) { std::fprintf(stderr, "env not set; skip\n"); return 77; }
-    parakeet_ctx* ctx = parakeet_capi_load(gguf);
-    if (!ctx) { std::fprintf(stderr, "load failed\n"); return 1; }
+    bool ran_any = false;
+
+    // Build a 2-clip batch (speech.wav twice) reused by both blocks.
     pk::Audio a;
     if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", a) || a.samples.empty()) {
-        std::fprintf(stderr, "wav load failed\n"); parakeet_capi_free(ctx); return 1;
+        std::fprintf(stderr, "wav load failed\n"); return 1;
     }
-    char* single = parakeet_capi_transcribe_path_json(ctx, "tests/fixtures/speech.wav", 0);
-    if (!single) { std::fprintf(stderr, "single failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; }
-    std::string single_doc(single);
-    parakeet_capi_free_string(single);
-
     std::vector<float> concat;
     concat.insert(concat.end(), a.samples.begin(), a.samples.end());
     concat.insert(concat.end(), a.samples.begin(), a.samples.end());
     int n_samples[2] = { (int)a.samples.size(), (int)a.samples.size() };
-    char* batch = parakeet_capi_transcribe_pcm_batch_json(ctx, concat.data(), n_samples, 2, 16000, 0);
-    if (!batch) { std::fprintf(stderr, "batch failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; }
-    std::string doc(batch);
-    parakeet_capi_free_string(batch);
-
-    auto text_field = [](const std::string& s) -> std::string {
-        size_t p = s.find("\"text\":\"");
-        if (p == std::string::npos) return "";
-        p += 8; size_t q = s.find('"', p);
-        return s.substr(p, q - p);
-    };
-    std::string t = text_field(single_doc);
-    bool is_array = !doc.empty() && doc.front() == '[' && doc.back() == ']';
-    size_t cnt = 0, pos = 0;
-    std::string needle = "\"text\":\"" + t + "\"";
-    while ((pos = doc.find(needle, pos)) != std::string::npos) { ++cnt; pos += needle.size(); }
-    bool ok = is_array && !t.empty() && cnt == 2;
-    std::fprintf(stderr, "array=%d text='%s' count=%zu -> %s\n", is_array, t.c_str(), cnt, ok?"OK":"FAIL");
-    parakeet_capi_free(ctx);
-    return ok ? 0 : 1;
+
+    const char* gguf = std::getenv("PARAKEET_TEST_GGUF");
+    if (gguf) {
+        ran_any = true;
+        parakeet_ctx* ctx = parakeet_capi_load(gguf);
+        if (!ctx) { std::fprintf(stderr, "load failed\n"); return 1; }
+
+        char* single = parakeet_capi_transcribe_path_json(ctx, "tests/fixtures/speech.wav", 0);
+        if (!single) { std::fprintf(stderr, "single failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; }
+        std::string single_doc(single);
+        parakeet_capi_free_string(single);
+
+        char* batch = parakeet_capi_transcribe_pcm_batch_json(ctx, concat.data(), n_samples, 2, 16000, 0);
+        if (!batch) { std::fprintf(stderr, "batch failed: %s\n", parakeet_capi_last_error(ctx)); parakeet_capi_free(ctx); return 1; }
+        std::string doc(batch);
+        parakeet_capi_free_string(batch);
+
+        auto text_field = [](const std::string& s) -> std::string {
+            size_t p = s.find("\"text\":\"");
+            if (p == std::string::npos) return "";
+            p += 8; size_t q = s.find('"', p);
+            return s.substr(p, q - p);
+        };
+        std::string t = text_field(single_doc);
+        bool is_array = !doc.empty() && doc.front() == '[' && doc.back() == ']';
+        size_t cnt = 0, pos = 0;
+        std::string needle = "\"text\":\"" + t + "\"";
+        while ((pos = doc.find(needle, pos)) != std::string::npos) { ++cnt; pos += needle.size(); }
+        bool ok = is_array && !t.empty() && cnt == 2;
+        std::fprintf(stderr, "array=%d text='%s' count=%zu -> %s\n", is_array, t.c_str(), cnt, ok?"OK":"FAIL");
+
+        // The language-aware delegate with the model default (NULL) must match the
+        // non-lang path byte-for-byte on an offline (non-prompt) model.
+        char* batch_lang = parakeet_capi_transcribe_pcm_batch_json_lang(ctx, concat.data(), n_samples, 2, 16000, 0, nullptr);
+        if (!batch_lang) std::fprintf(stderr, "batch_json_lang(NULL) failed: %s\n", parakeet_capi_last_error(ctx));
+        bool lang_ok = batch_lang && doc == std::string(batch_lang);
+        std::fprintf(stderr, "batch_json_lang(NULL)==batch_json -> %s\n", lang_ok?"OK":"FAIL");
+        if (batch_lang) parakeet_capi_free_string(batch_lang);  // guarded like the other call sites in this test
+        parakeet_capi_free(ctx);
+        if (!ok || !lang_ok) return 1;
+    }
+
+    // Prompt (multilingual / nemotron) model: exercise the batched target_lang
+    // variant. This fixture is a CAUSAL streaming prompt model
+    // (causal_downsampling=True). Batched causal subsampling is now supported
+    // (byte-identical to per-item), so a valid-language 2-clip batch runs through
+    // the batched encoder and returns a JSON array of length 2. We also assert
+    // the catchable error path: an unknown locale is rejected by
+    // resolve_prompt_index (which runs before the encoder) -> NULL + non-empty
+    // last_error, proving target_lang is threaded through the batched C-API.
+    // Skipped cleanly when the env var is unset.
+    const char* nemotron = std::getenv("PARAKEET_TEST_GGUF_NEMOTRON");
+    if (nemotron) {
+        ran_any = true;
+        parakeet_ctx* nctx = parakeet_capi_load(nemotron);
+        if (!nctx) { std::fprintf(stderr, "nemotron load failed\n"); return 1; }
+
+        // A valid locale ("de") must return a non-NULL JSON array of length 2.
+        char* ngood = parakeet_capi_transcribe_pcm_batch_json_lang(
+            nctx, concat.data(), n_samples, 2, 16000, 0, "de");
+        if (!ngood) {
+            std::fprintf(stderr, "nemotron batch_json_lang(de) returned NULL: %s\n",
+                         parakeet_capi_last_error(nctx));
+            parakeet_capi_free(nctx);
+            return 1;
+        }
+        std::string ndoc(ngood);
+        parakeet_capi_free_string(ngood);
+        bool nis_array = !ndoc.empty() && ndoc.front() == '[' && ndoc.back() == ']';
+        // Two JSON objects in the array (one per clip).
+        size_t nobj = 0, npos = 0;
+        while ((npos = ndoc.find("\"text\":", npos)) != std::string::npos) { ++nobj; npos += 7; }
+        bool ngood_ok = nis_array && nobj == 2;
+        std::fprintf(stderr, "nemotron batch_json_lang(de) array=%d objects=%zu -> %s\n",
+                     nis_array, nobj, ngood_ok ? "OK" : "FAIL");
+        if (!ngood_ok) { parakeet_capi_free(nctx); return 1; }
+
+        // An unknown locale must fail cleanly: NULL + non-empty last_error.
+        char* nbad = parakeet_capi_transcribe_pcm_batch_json_lang(
+            nctx, concat.data(), n_samples, 2, 16000, 0, "zzz");
+        if (nbad != nullptr) {
+            std::fprintf(stderr, "nemotron batch_json_lang(zzz) returned non-NULL\n");
+            parakeet_capi_free_string(nbad);
+            parakeet_capi_free(nctx);
+            return 1;
+        }
+        const char* nerr = parakeet_capi_last_error(nctx);
+        if (!nerr || nerr[0] == '\0') {
+            std::fprintf(stderr, "nemotron unknown-lang did not set last_error\n");
+            parakeet_capi_free(nctx);
+            return 1;
+        }
+        std::fprintf(stderr, "nemotron unknown-lang error = %s\n", nerr);
+        parakeet_capi_free(nctx);
+        std::fprintf(stderr, "PASS nemotron batch_json_lang error path\n");
+    }
+
+    if (!ran_any) {
+        std::fprintf(stderr, "no model env var set (PARAKEET_TEST_GGUF / PARAKEET_TEST_GGUF_NEMOTRON); skip\n");
+        return 77;
+    }
+    return 0;
 }
diff --git a/tests/test_subsampling_batch_causal.cpp b/tests/test_subsampling_batch_causal.cpp
new file mode 100644
index 0000000..d9b79fe
--- /dev/null
+++ b/tests/test_subsampling_batch_causal.cpp
@@ -0,0 +1,131 @@
+#include "model.hpp"
+#include "audio_io.hpp"
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// Batched CAUSAL subsampling parity for the multilingual streaming nemotron
+// model (causal_downsampling=True). Greedy decode is deterministic and the
+// per-item causal path is NeMo-validated at WER 0, so the gold check is
+// byte-identical equivalence: a clip transcribed inside a B>1 batch MUST equal
+// the SAME clip transcribed standalone. Mixed-length batches exercise the
+// per-item trailing-pad masking that interacts with the causal right pad of 1.
+//
+// Skips (77) unless PARAKEET_TEST_GGUF_NEMOTRON points at a causal nemotron gguf.
+int main() {
+    const char* gguf = std::getenv("PARAKEET_TEST_GGUF_NEMOTRON");
+    if (!gguf) { std::fprintf(stderr, "PARAKEET_TEST_GGUF_NEMOTRON not set; skip\n"); return 77; }
+
+    auto model = pk::Model::load(gguf);
+    if (!model) { std::fprintf(stderr, "load failed\n"); return 1; }
+
+    pk::Audio speech, clip;
+    if (!pk::load_audio_16k_mono("tests/fixtures/speech.wav", speech) || speech.samples.empty()) {
+        std::fprintf(stderr, "speech.wav load failed\n"); return 1;
+    }
+    if (!pk::load_audio_16k_mono("tests/fixtures/clip.wav", clip) || clip.samples.empty()) {
+        std::fprintf(stderr, "clip.wav load failed\n"); return 1;
+    }
+
+    const std::string lang = "en";
+
+    // A truncated speech slice (the first 3/5 of speech.wav, despite the name
+    // `half`): a SHORTER clip that still carries real content (non-empty
+    // transcript), so when it is the padded/masked item in a batch a per-item
+    // masking bug would corrupt its tokens. The strongest mixed-length probe.
+    std::vector<float> half(speech.samples.begin(),
+                            speech.samples.begin() + (speech.samples.size() * 3) / 5);
+
+    // Per-item standalone references (the ground truth for byte-identity).
+    std::string ref_speech = model->transcribe_16k(speech.samples, pk::Decoder::kDefault, lang);
+    std::string ref_clip   = model->transcribe_16k(clip.samples,   pk::Decoder::kDefault, lang);
+    std::string ref_half   = model->transcribe_16k(half,           pk::Decoder::kDefault, lang);
+    std::fprintf(stderr, "ref_speech='%s'\nref_clip='%s'\nref_half='%s'\n",
+                 ref_speech.c_str(), ref_clip.c_str(), ref_half.c_str());
+
+    bool ok = true;
+
+    // (a) Uniform batch: both items identical to the standalone speech transcript.
+    {
+        auto out = model->transcribe_pcm_batch({speech.samples, speech.samples}, 16000,
+                                               pk::Decoder::kDefault, lang);
+        bool pass = out.size() == 2 && out[0] == ref_speech && out[1] == ref_speech;
+        std::fprintf(stderr, "(a) uniform batch: size=%zu item0=%s item1=%s -> %s\n",
+                     out.size(),
+                     (out.size() > 0 && out[0] == ref_speech) ? "OK" : "DIFF",
+                     (out.size() > 1 && out[1] == ref_speech) ? "OK" : "DIFF",
+                     pass ? "PASS" : "FAIL");
+        ok = ok && pass;
+    }
+
+    // (b) MIXED-LENGTH batch (the real test): different lengths exercise the
+    //     per-item causal masking; each item must be byte-identical to standalone.
+    {
+        auto out = model->transcribe_pcm_batch({speech.samples, clip.samples}, 16000,
+                                               pk::Decoder::kDefault, lang);
+        bool pass = out.size() == 2 && out[0] == ref_speech && out[1] == ref_clip;
+        std::fprintf(stderr, "(b) mixed batch: size=%zu speech=%s clip=%s -> %s\n",
+                     out.size(),
+                     (out.size() > 0 && out[0] == ref_speech) ? "OK" : "DIFF",
+                     (out.size() > 1 && out[1] == ref_clip) ? "OK" : "DIFF",
+                     pass ? "PASS" : "FAIL");
+        if (out.size() > 0 && out[0] != ref_speech)
+            std::fprintf(stderr, "    batched speech='%s'\n", out[0].c_str());
+        if (out.size() > 1 && out[1] != ref_clip)
+            std::fprintf(stderr, "    batched clip='%s'\n", out[1].c_str());
+        ok = ok && pass;
+    }
+
+    // Reversed order too (clip first): order must not perturb per-item identity.
+    {
+        auto out = model->transcribe_pcm_batch({clip.samples, speech.samples}, 16000,
+                                               pk::Decoder::kDefault, lang);
+        bool pass = out.size() == 2 && out[0] == ref_clip && out[1] == ref_speech;
+        std::fprintf(stderr, "(b') reversed mixed batch: clip=%s speech=%s -> %s\n",
+                     (out.size() > 0 && out[0] == ref_clip) ? "OK" : "DIFF",
+                     (out.size() > 1 && out[1] == ref_speech) ? "OK" : "DIFF",
+                     pass ? "PASS" : "FAIL");
+        ok = ok && pass;
+    }
+
+    // (b'') MIXED-LENGTH with a non-empty shorter item, both orderings. The half
+    //       slice is the shorter item at batch index 1 and then at index 0; its
+    //       transcript must be byte-identical to its standalone transcript.
+    {
+        auto out1 = model->transcribe_pcm_batch({speech.samples, half}, 16000,
+                                                pk::Decoder::kDefault, lang);
+        auto out2 = model->transcribe_pcm_batch({half, speech.samples}, 16000,
+                                                pk::Decoder::kDefault, lang);
+        bool pass = out1.size() == 2 && out1[0] == ref_speech && out1[1] == ref_half
+                 && out2.size() == 2 && out2[0] == ref_half   && out2[1] == ref_speech;
+        std::fprintf(stderr,
+                     "(b'') [speech,half]: speech=%s half=%s ; [half,speech]: half=%s speech=%s -> %s\n",
+                     (out1.size() > 0 && out1[0] == ref_speech) ? "OK" : "DIFF",
+                     (out1.size() > 1 && out1[1] == ref_half) ? "OK" : "DIFF",
+                     (out2.size() > 0 && out2[0] == ref_half) ? "OK" : "DIFF",
+                     (out2.size() > 1 && out2[1] == ref_speech) ? "OK" : "DIFF",
+                     pass ? "PASS" : "FAIL");
+        if (out1.size() > 1 && out1[1] != ref_half) std::fprintf(stderr, "    [speech,half] batched half='%s'\n", out1[1].c_str());
+        if (out2.size() > 0 && out2[0] != ref_half) std::fprintf(stderr, "    [half,speech] batched half='%s'\n", out2[0].c_str());
+        ok = ok && pass;
+    }
+
+    // (c) Batched timestamps consistency: per-item text matches the single-clip
+    //     timestamped text.
+    {
+        auto rs = model->transcribe_with_timestamps(speech.samples, 16000, pk::Decoder::kDefault, lang);
+        auto rc = model->transcribe_with_timestamps(clip.samples,   16000, pk::Decoder::kDefault, lang);
+        auto out = model->transcribe_pcm_batch_with_timestamps({speech.samples, clip.samples}, 16000,
+                                                               pk::Decoder::kDefault, lang);
+        bool pass = out.size() == 2 && out[0].text == rs.text && out[1].text == rc.text;
+        std::fprintf(stderr, "(c) batched timestamps: speech=%s clip=%s -> %s\n",
+                     (out.size() > 0 && out[0].text == rs.text) ? "OK" : "DIFF",
+                     (out.size() > 1 && out[1].text == rc.text) ? "OK" : "DIFF",
+                     pass ? "PASS" : "FAIL");
+        ok = ok && pass;
+    }
+
+    std::fprintf(stderr, "%s\n", ok ? "ALL PASS" : "FAILURES");
+    return ok ? 0 : 1;
+}