diff --git a/src/acoustic_tokenizer.cpp b/src/acoustic_tokenizer.cpp index 511c640..aa3ab37 100644 --- a/src/acoustic_tokenizer.cpp +++ b/src/acoustic_tokenizer.cpp @@ -351,4 +351,58 @@ struct ggml_tensor* decoder_forward(struct ggml_context* ctx, return y; } +namespace { + +// Streaming causal SConvTranspose1d (upsample) — cache-backed counterpart of +// convtr_step. +struct ggml_tensor* convtr_step_streaming(struct ggml_context* ctx, + struct ggml_tensor* x, + const StridedConvWeights& w, + StreamingCache& cache, + const std::string& layer_id) { + return sconv_transpose1d_causal_streaming(ctx, x, w.kernel, w.bias, w.stride, + cache, layer_id); +} + +} // namespace + +struct ggml_tensor* decoder_forward_streaming(struct ggml_context* ctx, + struct ggml_tensor* z, + const DecoderWeights& w, + const AcousticConfig& cfg, + StreamingCache& cache) { + char buf[80]; + + struct ggml_tensor* h = sconv1d_causal_streaming( + ctx, z, w.stem.kernel, w.stem.bias, w.stem.stride, /*dilation=*/1, /*groups=*/1, + cache, "dec.stem"); + for (size_t j = 0; j < w.stages[0].size(); ++j) { + std::snprintf(buf, sizeof(buf), "dec.stage_0_block_%zu", j); + h = block1d_forward_streaming(ctx, h, w.stages[0][j], cfg.eps, cache, buf); + } + + for (size_t i = 1; i < cfg.depths.size(); ++i) { + std::snprintf(buf, sizeof(buf), "dec.up_%zu", i); + h = convtr_step_streaming(ctx, h, w.ups[i - 1], cache, buf); + for (size_t j = 0; j < w.stages[i].size(); ++j) { + std::snprintf(buf, sizeof(buf), "dec.stage_%zu_block_%zu", i, j); + h = block1d_forward_streaming(ctx, h, w.stages[i][j], cfg.eps, cache, buf); + } + } + + struct ggml_tensor* y = h; + if (w.final_norm) { + struct ggml_tensor* p = ggml_permute(ctx, y, 1, 0, 2, 3); + p = ggml_cont(ctx, p); + p = ggml_rms_norm(ctx, p, cfg.eps); + p = ggml_mul(ctx, p, w.final_norm); + p = ggml_permute(ctx, p, 1, 0, 2, 3); + y = ggml_cont(ctx, p); + } + y = sconv1d_causal_streaming( + ctx, y, w.head.kernel, w.head.bias, w.head.stride, /*dilation=*/1, /*groups=*/1, + cache, "dec.head"); + return y; +} + } // namespace vv diff --git a/src/acoustic_tokenizer.hpp b/src/acoustic_tokenizer.hpp index 89f7a5f..b658c6a 100644 --- a/src/acoustic_tokenizer.hpp +++ b/src/acoustic_tokenizer.hpp @@ -120,6 +120,18 @@ struct ggml_tensor* encoder_forward_streaming(struct ggml_context* ctx, const AcousticConfig& cfg, StreamingCache& cache); +// Streaming decoder: same math as decoder_forward, but every causal conv +// (stem, per-block depthwise mixers, the transposed upsamplers, and the +// head) reads/writes its left context through `cache`. Driven in chunk +// order with cache.is_first_chunk / is_final_chunk set by the caller, the +// concatenated per-chunk audio is bit-exact with a single-shot decode while +// keeping peak activation memory bounded to one chunk's worth of frames. +struct ggml_tensor* decoder_forward_streaming(struct ggml_context* ctx, + struct ggml_tensor* z, + const DecoderWeights& w, + const AcousticConfig& cfg, + StreamingCache& cache); + } // namespace vv #endif // VIBEVOICE_ACOUSTIC_TOKENIZER_HPP diff --git a/src/conv1d.cpp b/src/conv1d.cpp index f35476d..38a9d04 100644 --- a/src/conv1d.cpp +++ b/src/conv1d.cpp @@ -175,4 +175,77 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx, return maybe_add_bias_t(ctx, y, bias); } +struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* kernel, + struct ggml_tensor* bias, + int stride, + StreamingCache& cache, + const std::string& layer_id) { + const int K = static_cast(kernel->ne[0]); + const int C_in = static_cast(x->ne[1]); + const int B = static_cast(x->ne[2]); + const int T_in = static_cast(x->ne[0]); + const int S = stride; + + // Left-context input frames needed so the first kept output sample is + // exact: output position p depends on input frames down to + // floor((p - K + 1) / S). For the first kept position p = context*S this + // reaches context - ceil((K-1)/S), so context = ceil((K-1)/S) frames of + // history make the kept region bit-exact vs single-shot. + const int context = (K - 1 + S - 1) / S; + + auto& entry = cache[layer_id]; + entry.T = context; + entry.C = C_in; + + // Cache-prefix leaf (filled by caller post-alloc): zeros on the first + // chunk, the previous chunk's trailing `context` input frames otherwise. + struct ggml_tensor* prefix = nullptr; + if (context > 0) { + prefix = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context, C_in, B); + ggml_set_name(prefix, ("cache_prefix_" + layer_id).c_str()); + } + entry.prefix = prefix; + + struct ggml_tensor* xp = x; + if (prefix) xp = ggml_concat(ctx, prefix, x, /*dim=*/0); + + // ggml-cuda's conv_transpose_1d requires F32 kernels (see the single-shot + // path); standardize on F32 so the same graph runs on CPU and CUDA. + struct ggml_tensor* k = (kernel->type == GGML_TYPE_F32) + ? kernel + : ggml_cast(ctx, kernel, GGML_TYPE_F32); + + struct ggml_tensor* y_full = ggml_conv_transpose_1d(ctx, k, xp, S, /*p0=*/0, /*d0=*/1); + // y_full length = (context + T_in - 1) * S + K. The output frames that + // belong to the NEW input frames live at [context*S, (context+T_in)*S); + // that upper bound equals the single-shot trimmed length, so the slice + // both drops the warmup prefix and applies the causal right-trim. + const int64_t out_start = static_cast(context) * S; + const int64_t out_len = static_cast(T_in) * S; + struct ggml_tensor* y = ggml_view_3d(ctx, y_full, + /*ne0=*/out_len, + /*ne1=*/y_full->ne[1], + /*ne2=*/y_full->ne[2], + /*nb1=*/y_full->nb[1], + /*nb2=*/y_full->nb[2], + /*offset=*/static_cast(out_start) * y_full->nb[0]); + y = ggml_cont(ctx, y); + + // Register a view of the last `context` input frames (of the concatenated + // stream) so the caller can carry them into the next chunk. + if (context > 0) { + const int start = std::max(0, (T_in + context) - context); // = T_in + struct ggml_tensor* view = ggml_view_3d( + ctx, xp, + /*ne0=*/context, /*ne1=*/C_in, /*ne2=*/B, + /*nb1=*/xp->nb[1], /*nb2=*/xp->nb[2], + /*offset=*/static_cast(start) * xp->nb[0]); + entry.next_view = ggml_cont(ctx, view); + } + + return maybe_add_bias_t(ctx, y, bias); +} + } // namespace vv diff --git a/src/conv1d.hpp b/src/conv1d.hpp index 5ff2e27..bbc72bf 100644 --- a/src/conv1d.hpp +++ b/src/conv1d.hpp @@ -110,6 +110,22 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx, struct ggml_tensor* bias, // [C_out] or null int stride); +// Streaming variant of sconv_transpose1d_causal. Mirrors +// sconv1d_causal_streaming: prepends `context = ceil((K-1)/stride)` input +// frames from the cache (zeros on the first chunk), runs the transposed +// conv on the concatenated input, then slices the output region that +// belongs to the NEW frames ([context*stride, (context+T_in)*stride)) so +// the per-chunk outputs concatenate bit-exact with a single-shot pass. +// Registers a view of the last `context` input frames as +// cache[layer_id].next_view for the caller to copy out post-compute. +struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx, + struct ggml_tensor* x, // [T, C_in, B] + struct ggml_tensor* kernel, // [K, C_out, C_in] + struct ggml_tensor* bias, // [C_out] or null + int stride, + StreamingCache& cache, + const std::string& layer_id); + } // namespace vv #endif // VIBEVOICE_CONV1D_HPP diff --git a/src/vibevoice_tts.cpp b/src/vibevoice_tts.cpp index 2d5beec..fb69be4 100644 --- a/src/vibevoice_tts.cpp +++ b/src/vibevoice_tts.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -575,10 +576,14 @@ void add_input_type_embedding(const VibeVoiceConfig& cfg, // `scaled_latents` has shape [vae_dim * n_frames] in row-major (latent // fastest), matching what `ggml_new_tensor_3d(ctx, F32, n_frames, vae_dim, 1)` // expects when ne[0] = n_frames is the contiguous dim. -std::vector decode_latent_sequence(const VibeVoiceConfig& cfg, - const VibeVoiceWeights& w, - const float* scaled_latents, - int n_frames) { +// Single-shot acoustic decode: the whole latent sequence is upsampled to +// audio in one graph. Peak activation memory scales with n_frames (the last +// decoder stages run at 24 kHz), so this is only used for short sequences; +// longer ones go through decode_latent_streaming below. +std::vector decode_latent_single_shot(const VibeVoiceConfig& cfg, + const VibeVoiceWeights& w, + const float* scaled_latents, + int n_frames) { if (n_frames <= 0) return {}; // Backend-aware compute: build the graph in a no_alloc ctx, allocate // leaf tensors on the active backend's buffer, upload input via @@ -614,6 +619,141 @@ std::vector decode_latent_sequence(const VibeVoiceConfig& cfg, return samples; } +// Per-chunk default for the streaming decode, in latent frames. Each frame +// upsamples to 3200 audio samples, so peak decoder activation memory and +// ggml-cuda's IM2COL gridDim.y limit (65535 over the +// 24 kHz time axis) both scale with this. CUDA stays well under the limit at +// 15 frames (48 k samples); CPU can afford much larger chunks. Override with +// VIBEVOICE_DECODE_CHUNK_FRAMES. +int decode_chunk_frames() { + if (const char* env = std::getenv("VIBEVOICE_DECODE_CHUNK_FRAMES")) { + const int v = std::atoi(env); + if (v > 0) return v; + } + const ggml_backend_t b = vv::backend(); + const bool is_cuda = b && + std::string(ggml_backend_name(b)).find("CUDA") != std::string::npos; + return is_cuda ? 15 : 64; +} + +// One streaming decode chunk: builds the decoder graph against `cache`, runs +// it, and pulls each conv's trailing context into the cache for the next +// call. `chunk` is the packed latent slice [vae_dim * C] in ggml ne layout +// (time fastest: data[d * C + t]). Returns this chunk's C * 3200 audio +// samples. +std::vector run_decoder_chunk_streaming(const VibeVoiceConfig& cfg, + const VibeVoiceWeights& w, + const std::vector& chunk, + int C, + StreamingCache& cache) { + struct ggml_init_params p {}; + p.mem_size = ggml_tensor_overhead() * 32768 + ggml_graph_overhead_custom(32768, false); + p.no_alloc = true; + struct ggml_context* ctx = ggml_init(p); + if (!ctx) return {}; + + struct ggml_tensor* z = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, C, cfg.vae_dim, 1); + ggml_set_name(z, "decode_z_chunk"); + struct ggml_tensor* y = decoder_forward_streaming(ctx, z, w.at_dec, cfg.acoustic, cache); + + struct ggml_cgraph* gf = ggml_new_graph_custom(ctx, 32768, false); + ggml_build_forward_expand(gf, y); + // Keep each conv's "trailing context" view alive in the graph so its + // memory survives until we copy it back into the cache. + for (auto& kv : cache) { + if (kv.second.next_view) ggml_build_forward_expand(gf, kv.second.next_view); + } + + ggml_backend_buffer_t in_buf = vv::allocate_ctx_tensors(ctx); + if (!in_buf) { ggml_free(ctx); return {}; } + ggml_backend_tensor_set(z, chunk.data(), 0, sizeof(float) * cfg.vae_dim * C); + + // Populate the per-conv prefixes: zeros on the first chunk, the previous + // chunk's tail thereafter. (.data is only valid after the alloc above.) + for (auto& kv : cache) { + StreamingCacheEntry& e = kv.second; + if (!e.prefix || e.T == 0) continue; + const size_t need = static_cast(e.T) * e.C; + if (cache.is_first_chunk || e.data.size() != need) { + std::vector zeros(need, 0.0f); + ggml_backend_tensor_set(e.prefix, zeros.data(), 0, sizeof(float) * need); + } else { + ggml_backend_tensor_set(e.prefix, e.data.data(), 0, sizeof(float) * need); + } + } + + if (!vv::compute_graph(gf)) { + ggml_backend_buffer_free(in_buf); + ggml_free(ctx); + return {}; + } + const int T_full = static_cast(y->ne[0]); + std::vector samples(T_full); + ggml_backend_tensor_get(y, samples.data(), 0, sizeof(float) * T_full); + + // Carry each conv's trailing context into the cache for the next chunk. + for (auto& kv : cache) { + StreamingCacheEntry& e = kv.second; + if (!e.next_view || e.T == 0) continue; + const size_t n = static_cast(e.T) * e.C; + e.data.assign(n, 0.0f); + ggml_backend_tensor_get(e.next_view, e.data.data(), 0, sizeof(float) * n); + e.next_view = nullptr; + e.prefix = nullptr; + } + cache.is_first_chunk = false; + ggml_backend_buffer_free(in_buf); + ggml_free(ctx); + return samples; +} + +// Acoustic decode entry point. Dispatches short sequences to the single-shot +// path and longer ones to a chunked streaming decode whose peak memory is +// bounded to one chunk. The streaming cache makes the concatenated output +// bit-exact with single-shot. `scaled_latents` is packed [vae_dim * n_frames] +// in ggml ne layout (time fastest: data[d * n_frames + t]). +std::vector decode_latent_sequence(const VibeVoiceConfig& cfg, + const VibeVoiceWeights& w, + const float* scaled_latents, + int n_frames) { + if (n_frames <= 0) return {}; + + const int chunk = decode_chunk_frames(); + if (n_frames <= chunk) { + return decode_latent_single_shot(cfg, w, scaled_latents, n_frames); + } + + const int vae = cfg.vae_dim; + StreamingCache cache; + cache.is_first_chunk = true; + + // Audio samples per latent frame = product of the decoder upsample ratios. + size_t upsample = 1; + for (int r : cfg.acoustic.ratios) upsample *= static_cast(r); + + std::vector out; + out.reserve(static_cast(n_frames) * upsample); + + for (int off = 0; off < n_frames; off += chunk) { + const int end = std::min(off + chunk, n_frames); + const int C = end - off; + cache.is_final_chunk = (end == n_frames); + + // Slice this chunk into a contiguous [vae_dim * C] (time fastest) + // buffer matching the z tensor layout. + std::vector cp(static_cast(vae) * C); + for (int d = 0; d < vae; ++d) { + const float* src = scaled_latents + static_cast(d) * n_frames + off; + std::copy(src, src + C, cp.begin() + static_cast(d) * C); + } + + std::vector seg = run_decoder_chunk_streaming(cfg, w, cp, C, cache); + if (seg.empty()) return {}; + out.insert(out.end(), seg.begin(), seg.end()); + } + return out; +} + } // namespace // Forward declaration — implementation later in this file.