From b3fb33a7b435993cf921fadf515afc4e0ff04292 Mon Sep 17 00:00:00 2001
From: VelvetBeans <velvetbeanvibes@gmail.com>
Date: Sat, 30 May 2026 00:22:24 -0500
Subject: [PATCH] tts: chunked streaming acoustic decode to bound decode VRAM

The TTS acoustic decode upsampled the entire latent sequence to 24 kHz
audio in a single ggml graph, so peak VRAM grew ~linearly with clip
length (~161 MiB per latent frame on this build). A 12 GB GPU OOMs in
the decode once a clip passes ~20 frames (~2.7 s), even though the LM
and encoder fit comfortably -- making long clips and the Q8_0 1.5B model
undecodable on CUDA. There was no knob to chunk it.

Decode the latent sequence in fixed-size frame chunks through a streaming
decoder that keeps a small per-conv left-context cache, mirroring the
existing long-form ASR encoder path (encoder_forward_streaming +
StreamingCache). Each chunk pushes only C frames through the decoder, so
peak activation memory is bounded to one chunk regardless of total
length. The per-conv caches carry (kernel - 1) frames of context for the
regular convs and ceil((K-1)/stride) frames for the transposed
upsamplers, so the concatenated output matches a single-shot decode
(bit-exact on CPU, float rounding only on CUDA; see Verification).

New primitive: sconv_transpose1d_causal_streaming, the streaming causal
transposed convolution for the decoder's upsamplers (the ASR path only
streams downsampling convs, so the existing helper could not be reused).
Everything else reuses the established streaming building blocks
(sconv1d_causal_streaming, block1d_forward_streaming). Sequences <= chunk
still take the original single-shot path (renamed
decode_latent_single_shot), so short clips are byte-for-byte unchanged.
Default chunk: 15 frames on CUDA (safely under ggml-cuda's IM2COL
gridDim.y 65535 cap), 64 on CPU; override with VIBEVOICE_DECODE_CHUNK_FRAMES.

Verification:
  * CPU streaming vs single-shot: bit-exact (max abs diff 0.0,
    0/35200 samples differ).
  * CUDA: differs only by float rounding (RMS diff 0.082% of signal).
  * A 26.7 s clip and the Q8_0 1.5B model -- both previously OOM in the
    decode -- now complete end-to-end on a 12 GB RTX 4070.
  * ctest green, no regressions.

Signed-off-by: VelvetBeans <velvetbeanvibes@gmail.com>
Assisted-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/acoustic_tokenizer.cpp |  54 ++++++++++++++
 src/acoustic_tokenizer.hpp |  12 +++
 src/conv1d.cpp             |  73 ++++++++++++++++++
 src/conv1d.hpp             |  16 ++++
 src/vibevoice_tts.cpp      | 148 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 299 insertions(+), 4 deletions(-)

diff --git a/src/acoustic_tokenizer.cpp b/src/acoustic_tokenizer.cpp
index 511c640..aa3ab37 100644
--- a/src/acoustic_tokenizer.cpp
+++ b/src/acoustic_tokenizer.cpp
@@ -351,4 +351,58 @@ struct ggml_tensor* decoder_forward(struct ggml_context*  ctx,
     return y;
 }
 
+namespace {
+
+// Upsampling step of the streaming decoder: unpacks one StridedConvWeights and
+// hands it, with `cache` and `layer_id`, to the streaming transposed conv.
+struct ggml_tensor* convtr_step_streaming(struct ggml_context*       ctx,
+                                          struct ggml_tensor*        x,
+                                          const StridedConvWeights&  w,
+                                          StreamingCache&            cache,
+                                          const std::string&         layer_id) {
+    return sconv_transpose1d_causal_streaming(
+        ctx, x, w.kernel, w.bias, w.stride, cache, layer_id);
+}
+
+}  // namespace
+
+struct ggml_tensor* decoder_forward_streaming(struct ggml_context*  ctx,
+                                              struct ggml_tensor*   z,
+                                              const DecoderWeights& w,
+                                              const AcousticConfig& cfg,
+                                              StreamingCache&       cache) {
+    // Cache keys must be identical from chunk to chunk so each layer finds its context.
+    char layer_key[80];
+    struct ggml_tensor* act = sconv1d_causal_streaming(
+        ctx, z, w.stem.kernel, w.stem.bias, w.stem.stride, /*dilation=*/1, /*groups=*/1,
+        cache, "dec.stem");
+
+    // Applies every block of stage `stage_idx` to `act`, in order.
+    const auto run_stage = [&](size_t stage_idx) {
+        const auto& blocks = w.stages[stage_idx];
+        for (size_t blk = 0; blk < blocks.size(); ++blk) {
+            std::snprintf(layer_key, sizeof(layer_key), "dec.stage_%zu_block_%zu", stage_idx, blk);
+            act = block1d_forward_streaming(ctx, act, blocks[blk], cfg.eps, cache, layer_key);
+        }
+    };
+
+    // Stage 0 has no upsampler in front of it; every later stage does.
+    run_stage(0);
+    for (size_t stage_idx = 1; stage_idx < cfg.depths.size(); ++stage_idx) {
+        std::snprintf(layer_key, sizeof(layer_key), "dec.up_%zu", stage_idx);
+        act = convtr_step_streaming(ctx, act, w.ups[stage_idx - 1], cache, layer_key);
+        run_stage(stage_idx);
+    }
+
+    // Optional final RMS norm, applied with the first two axes swapped.
+    if (w.final_norm) {
+        struct ggml_tensor* t = ggml_cont(ctx, ggml_permute(ctx, act, 1, 0, 2, 3));
+        t   = ggml_mul(ctx, ggml_rms_norm(ctx, t, cfg.eps), w.final_norm);
+        act = ggml_cont(ctx, ggml_permute(ctx, t, 1, 0, 2, 3));
+    }
+    return sconv1d_causal_streaming(
+        ctx, act, w.head.kernel, w.head.bias, w.head.stride, /*dilation=*/1, /*groups=*/1,
+        cache, "dec.head");
+}
+
 }  // namespace vv
diff --git a/src/acoustic_tokenizer.hpp b/src/acoustic_tokenizer.hpp
index 89f7a5f..b658c6a 100644
--- a/src/acoustic_tokenizer.hpp
+++ b/src/acoustic_tokenizer.hpp
@@ -120,6 +120,18 @@ struct ggml_tensor* encoder_forward_streaming(struct ggml_context*    ctx,
                                               const AcousticConfig&   cfg,
                                               StreamingCache&         cache);
 
+// Streaming decoder: same math as decoder_forward, but every causal conv
+// (stem, per-block mixers, transposed upsamplers, head) keeps its left
+// context in `cache`. Feed chunks in order; the caller sets
+// cache.is_first_chunk / is_final_chunk, fills each entry's `prefix` after
+// allocation and copies `next_view` back after compute. Concatenated audio
+// matches single-shot (up to backend rounding) with one-chunk peak memory.
+struct ggml_tensor* decoder_forward_streaming(struct ggml_context*  ctx,
+                                              struct ggml_tensor*   z,
+                                              const DecoderWeights& w,
+                                              const AcousticConfig& cfg,
+                                              StreamingCache&       cache);
+
 }  // namespace vv
 
 #endif  // VIBEVOICE_ACOUSTIC_TOKENIZER_HPP
diff --git a/src/conv1d.cpp b/src/conv1d.cpp
index f35476d..38a9d04 100644
--- a/src/conv1d.cpp
+++ b/src/conv1d.cpp
@@ -175,4 +175,77 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx,
     return maybe_add_bias_t(ctx, y, bias);
 }
 
+// Streaming causal transposed conv; see conv1d.hpp for the chunking contract.
+struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx,
+                                                       struct ggml_tensor*  x,
+                                                       struct ggml_tensor*  kernel,
+                                                       struct ggml_tensor*  bias,
+                                                       int stride,
+                                                       StreamingCache&      cache,
+                                                       const std::string&   layer_id) {
+    const int K     = static_cast<int>(kernel->ne[0]);
+    const int C_in  = static_cast<int>(x->ne[1]);
+    const int B     = static_cast<int>(x->ne[2]);
+    const int T_in  = static_cast<int>(x->ne[0]);
+    const int S     = stride;
+
+    // Left-context input frames carried between chunks. Output sample p sums
+    // in[i] * kernel[p - i*S] over ceil((p - K + 1) / S) <= i <= floor(p / S), so
+    // the first kept sample (p = context*S) reaches back floor((K-1)/S) frames.
+    // ceil((K-1)/S) is a safe upper bound on that: a surplus frame only feeds
+    // warm-up outputs below context*S, which the slice further down discards.
+    const int context = (K - 1 + S - 1) / S;
+
+    auto& entry = cache[layer_id];
+    entry.T = context;
+    entry.C = C_in;
+
+    // Cache-prefix leaf (filled by caller post-alloc): zeros on the first
+    // chunk, the previous chunk's trailing `context` input frames otherwise.
+    struct ggml_tensor* prefix = nullptr;
+    if (context > 0) {
+        prefix = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context, C_in, B);
+        ggml_set_name(prefix, ("cache_prefix_" + layer_id).c_str());
+    }
+    entry.prefix = prefix;
+
+    struct ggml_tensor* xp = x;
+    if (prefix) xp = ggml_concat(ctx, prefix, x, /*dim=*/0);
+
+    // ggml-cuda's conv_transpose_1d requires F32 kernels (see the single-shot
+    // path); standardize on F32 so the same graph runs on CPU and CUDA.
+    struct ggml_tensor* k =
+        (kernel->type == GGML_TYPE_F32) ? kernel : ggml_cast(ctx, kernel, GGML_TYPE_F32);
+
+    struct ggml_tensor* y_full = ggml_conv_transpose_1d(ctx, k, xp, S, /*p0=*/0, /*d0=*/1);
+    // y_full length = (context + T_in - 1) * S + K. The output frames that
+    // belong to the NEW input frames live at [context*S, (context+T_in)*S);
+    // that upper bound equals the single-shot trimmed length, so the slice
+    // both drops the warmup prefix and applies the causal right-trim.
+    const int64_t out_start = static_cast<int64_t>(context) * S;
+    const int64_t out_len   = static_cast<int64_t>(T_in)    * S;
+    struct ggml_tensor* y = ggml_view_3d(ctx, y_full,
+                                         /*ne0=*/out_len,
+                                         /*ne1=*/y_full->ne[1],
+                                         /*ne2=*/y_full->ne[2],
+                                         /*nb1=*/y_full->nb[1],
+                                         /*nb2=*/y_full->nb[2],
+                                         /*offset=*/static_cast<size_t>(out_start) * y_full->nb[0]);
+    y = ggml_cont(ctx, y);
+
+    // Register the last `context` frames of the concatenated stream (prefix + x)
+    // for the caller to carry into the next chunk. xp holds context + T_in
+    // frames, so that tail starts at frame T_in -- even when T_in < context.
+    if (context > 0) {
+        struct ggml_tensor* view = ggml_view_3d(
+            ctx, xp,
+            /*ne0=*/context, /*ne1=*/C_in, /*ne2=*/B,
+            /*nb1=*/xp->nb[1], /*nb2=*/xp->nb[2],
+            /*offset=*/static_cast<size_t>(T_in) * xp->nb[0]);
+        entry.next_view = ggml_cont(ctx, view);
+    }
+
+    return maybe_add_bias_t(ctx, y, bias);
+}
+
 }  // namespace vv
diff --git a/src/conv1d.hpp b/src/conv1d.hpp
index 5ff2e27..bbc72bf 100644
--- a/src/conv1d.hpp
+++ b/src/conv1d.hpp
@@ -110,6 +110,22 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx,
                                              struct ggml_tensor*  bias,   // [C_out] or null
                                              int stride);
 
+// Streaming variant of sconv_transpose1d_causal. Mirrors
+// sconv1d_causal_streaming: prepends `context = ceil((K-1)/stride)` input
+// frames (a new `prefix` leaf the caller fills post-alloc: zeros on the
+// first chunk, else the cached tail), runs the transposed conv, then keeps
+// only outputs [context*stride, (context+T_in)*stride) -- those of the NEW
+// frames -- so chunk outputs concatenate to match a single-shot pass.
+// Sets cache[layer_id].{T, C, prefix} and, when context > 0, .next_view (the
+// last `context` frames of prefix+x) for the caller to copy out post-compute.
+struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx,
+                                                       struct ggml_tensor*  x,      // [T, C_in, B]
+                                                       struct ggml_tensor*  kernel, // [K, C_out, C_in]
+                                                       struct ggml_tensor*  bias,   // [C_out] or null
+                                                       int stride,
+                                                       StreamingCache&      cache,
+                                                       const std::string&   layer_id);
+
 }  // namespace vv
 
 #endif  // VIBEVOICE_CONV1D_HPP
diff --git a/src/vibevoice_tts.cpp b/src/vibevoice_tts.cpp
index 2d5beec..fb69be4 100644
--- a/src/vibevoice_tts.cpp
+++ b/src/vibevoice_tts.cpp
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <random>
 #include <regex>
@@ -575,10 +576,14 @@ void add_input_type_embedding(const VibeVoiceConfig& cfg,
 // `scaled_latents` has shape [vae_dim * n_frames] in row-major (latent
 // fastest), matching what `ggml_new_tensor_3d(ctx, F32, n_frames, vae_dim, 1)`
 // expects when ne[0] = n_frames is the contiguous dim.
-std::vector<float> decode_latent_sequence(const VibeVoiceConfig&  cfg,
-                                          const VibeVoiceWeights& w,
-                                          const float*            scaled_latents,
-                                          int                     n_frames) {
+// Single-shot acoustic decode: the whole latent sequence is upsampled to
+// audio in one graph. Peak activation memory scales with n_frames (the last
+// decoder stages run at 24 kHz), so this is only used for short sequences;
+// longer ones go through decode_latent_streaming below.
+std::vector<float> decode_latent_single_shot(const VibeVoiceConfig&  cfg,
+                                             const VibeVoiceWeights& w,
+                                             const float*            scaled_latents,
+                                             int                     n_frames) {
     if (n_frames <= 0) return {};
     // Backend-aware compute: build the graph in a no_alloc ctx, allocate
     // leaf tensors on the active backend's buffer, upload input via
@@ -614,6 +619,141 @@ std::vector<float> decode_latent_sequence(const VibeVoiceConfig&  cfg,
     return samples;
 }
 
+// Default streaming-decode chunk size, in latent frames. Each frame becomes
+// 3200 audio samples, so peak decoder activation memory and the IM2COL grid
+// ggml-cuda launches (gridDim.y, capped at 65535, spans the 24 kHz time axis)
+// both grow with it: 15 frames (48 k samples) on CUDA, 64 on CPU. Override with
+// VIBEVOICE_DECODE_CHUNK_FRAMES (ignored unless > 0; saturates at 2^30).
+int decode_chunk_frames() {
+    if (const char* env = std::getenv("VIBEVOICE_DECODE_CHUNK_FRAMES")) {
+        // Unlike atoi, strtol is well defined (saturates) when the value overflows.
+        const long v = std::strtol(env, nullptr, 10);
+        if (v > 0) return static_cast<int>(std::min(v, 1L << 30));
+    }
+    const ggml_backend_t b = vv::backend();
+    const bool is_cuda = b &&
+        std::string(ggml_backend_name(b)).find("CUDA") != std::string::npos;
+    return is_cuda ? 15 : 64;
+}
+
+// Decodes one chunk: builds the streaming decoder graph against `cache`,
+// computes it, then copies every conv's trailing context into `cache` for the
+// next call. `chunk` is the packed latent slice [vae_dim * C] in ggml ne layout
+// (time fastest: data[d * C + t]). Returns this chunk's C * 3200 audio
+// samples (y->ne[0] floats), or an empty vector if any step fails.
+std::vector<float> run_decoder_chunk_streaming(const VibeVoiceConfig&  cfg,
+                                               const VibeVoiceWeights& w,
+                                               const std::vector<float>& chunk,
+                                               int                       C,
+                                               StreamingCache&           cache) {
+    struct ggml_init_params p {};
+    p.mem_size = ggml_tensor_overhead() * 32768 + ggml_graph_overhead_custom(32768, false);
+    p.no_alloc = true;  // metadata only; tensor data is allocated further down
+    struct ggml_context* ctx = ggml_init(p);
+    if (!ctx) return {};
+
+    struct ggml_tensor* z = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, C, cfg.vae_dim, 1);
+    ggml_set_name(z, "decode_z_chunk");
+    struct ggml_tensor* y = decoder_forward_streaming(ctx, z, w.at_dec, cfg.acoustic, cache);
+
+    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx, 32768, false);
+    ggml_build_forward_expand(gf, y);
+    // Also expand each conv's "trailing context" node into the graph so it is
+    // computed and can be read back into the cache after the run.
+    for (auto& kv : cache) {
+        if (kv.second.next_view) ggml_build_forward_expand(gf, kv.second.next_view);
+    }
+
+    ggml_backend_buffer_t in_buf = vv::allocate_ctx_tensors(ctx);
+    if (!in_buf) { ggml_free(ctx); return {}; }
+    ggml_backend_tensor_set(z, chunk.data(), 0, sizeof(float) * cfg.vae_dim * C);
+
+    // Fill the per-conv prefixes (only possible after the alloc above): zeros on
+    // the first chunk or on a cached-size mismatch, else the previous tail.
+    for (auto& kv : cache) {
+        StreamingCacheEntry& e = kv.second;
+        if (!e.prefix || e.T == 0) continue;
+        const size_t need = static_cast<size_t>(e.T) * e.C;  // T*C floats; assumes batch == 1
+        if (cache.is_first_chunk || e.data.size() != need) {
+            std::vector<float> zeros(need, 0.0f);
+            ggml_backend_tensor_set(e.prefix, zeros.data(), 0, sizeof(float) * need);
+        } else {
+            ggml_backend_tensor_set(e.prefix, e.data.data(), 0, sizeof(float) * need);
+        }
+    }
+
+    if (!vv::compute_graph(gf)) {
+        ggml_backend_buffer_free(in_buf);
+        ggml_free(ctx);
+        return {};
+    }
+    const int T_full = static_cast<int>(y->ne[0]);  // assumes y is [T, 1, 1] -- TODO confirm
+    std::vector<float> samples(T_full);
+    ggml_backend_tensor_get(y, samples.data(), 0, sizeof(float) * T_full);
+
+    // Save each conv's tail for the next chunk; drop pointers into the soon-freed ctx.
+    for (auto& kv : cache) {
+        StreamingCacheEntry& e = kv.second;
+        if (!e.next_view || e.T == 0) continue;
+        const size_t n = static_cast<size_t>(e.T) * e.C;
+        e.data.assign(n, 0.0f);
+        ggml_backend_tensor_get(e.next_view, e.data.data(), 0, sizeof(float) * n);
+        e.next_view = nullptr;
+        e.prefix    = nullptr;
+    }
+    cache.is_first_chunk = false;
+    ggml_backend_buffer_free(in_buf);
+    ggml_free(ctx);
+    return samples;
+}
+
+// Top-level acoustic decode. Inputs of at most decode_chunk_frames() frames
+// go unchanged to decode_latent_single_shot; longer ones are cut into
+// consecutive chunks that share one StreamingCache, and the per-chunk audio is
+// appended in order. `scaled_latents` is packed [vae_dim * n_frames] with time
+// fastest (data[d * n_frames + t]). Returns {} as soon as any chunk fails.
+std::vector<float> decode_latent_sequence(const VibeVoiceConfig&  cfg,
+                                          const VibeVoiceWeights& w,
+                                          const float*            scaled_latents,
+                                          int                     n_frames) {
+    if (n_frames <= 0) return {};
+
+    const int frames_per_chunk = decode_chunk_frames();
+    if (frames_per_chunk >= n_frames) {
+        return decode_latent_single_shot(cfg, w, scaled_latents, n_frames);
+    }
+
+    // Samples per latent frame = product of the decoder upsample ratios; it is
+    // only used to pre-size the result buffer.
+    size_t samples_per_frame = 1;
+    for (const int ratio : cfg.acoustic.ratios) samples_per_frame *= static_cast<size_t>(ratio);
+    std::vector<float> audio;
+    audio.reserve(static_cast<size_t>(n_frames) * samples_per_frame);
+
+    StreamingCache cache;
+    cache.is_first_chunk = true;
+    const int n_channels = cfg.vae_dim;
+
+    for (int begin = 0; begin < n_frames; begin += frames_per_chunk) {
+        const int stop = std::min(begin + frames_per_chunk, n_frames);
+        const int len  = stop - begin;
+        cache.is_final_chunk = (stop == n_frames);
+
+        // Gather frames [begin, stop) of every latent channel into a dense
+        // [vae_dim * len] buffer (time fastest), the layout of the chunk's z.
+        std::vector<float> packed(static_cast<size_t>(n_channels) * len);
+        for (int d = 0; d < n_channels; ++d) {
+            const size_t row = static_cast<size_t>(d);
+            std::copy_n(scaled_latents + row * n_frames + begin, len, packed.begin() + row * len);
+        }
+
+        const std::vector<float> pcm = run_decoder_chunk_streaming(cfg, w, packed, len, cache);
+        if (pcm.empty()) return {};
+        audio.insert(audio.end(), pcm.begin(), pcm.end());
+    }
+    return audio;
+}
+
 }  // namespace
 
 // Forward declaration — implementation later in this file.