diff --git a/tools/kokoro/CMakeLists.txt b/tools/kokoro/CMakeLists.txt index 394f7f946..8f59ff9c5 100644 --- a/tools/kokoro/CMakeLists.txt +++ b/tools/kokoro/CMakeLists.txt @@ -25,7 +25,9 @@ set(KOKORO_CORE_SOURCES src/kokoro.cpp src/kokoro-istft.cpp src/kokoro-phonemes.cpp - src/kokoro-predictor.cpp) + src/kokoro-predictor.cpp + src/kokoro-generator.cpp + src/kokoro-decoder.cpp) add_library(kokoro_lib STATIC ${KOKORO_CORE_SOURCES} @@ -56,17 +58,46 @@ endif() target_compile_features(kokoro_lib PUBLIC cxx_std_17) -# Standalone CLI harness — required by J2 verification (tools/voice-kokoro/). +# Real G2P via libespeak-ng. When present, kokoro_phonemize() drives +# espeak_TextToPhonemes() (en-us IPA) and maps codepoints to Kokoro vocab ids, +# reproducing the reference token sequence. When absent, the build falls back +# to the degraded ASCII grapheme mapping and the TS layer must supply IPA. +# Override the search with -DKOKORO_ESPEAK_ROOT= (e.g. Homebrew). +option(KOKORO_ENABLE_ESPEAK "Link libespeak-ng for real Kokoro G2P" ON) +if(KOKORO_ENABLE_ESPEAK) + find_path(ESPEAK_NG_INCLUDE_DIR espeak-ng/speak_lib.h + HINTS ${KOKORO_ESPEAK_ROOT} /opt/homebrew /usr/local /usr + PATH_SUFFIXES include) + find_library(ESPEAK_NG_LIBRARY NAMES espeak-ng + HINTS ${KOKORO_ESPEAK_ROOT} /opt/homebrew /usr/local /usr + PATH_SUFFIXES lib lib64) + if(ESPEAK_NG_INCLUDE_DIR AND ESPEAK_NG_LIBRARY) + target_include_directories(kokoro_lib PRIVATE ${ESPEAK_NG_INCLUDE_DIR}) + target_link_libraries(kokoro_lib PRIVATE ${ESPEAK_NG_LIBRARY}) + target_compile_definitions(kokoro_lib PRIVATE KOKORO_USE_ESPEAK) + message(STATUS "Kokoro G2P: libespeak-ng found (${ESPEAK_NG_LIBRARY}) — real IPA path enabled") + else() + message(STATUS "Kokoro G2P: libespeak-ng not found — falling back to ASCII grapheme mapping (TS layer must supply IPA)") + endif() +endif() + +# Standalone CLI harnesses (required by J2 verification + Kokoro decoder dev). 
+# Force MACOSX_BUNDLE OFF: CMake defaults Apple executables to bundles, and +# install(TARGETS ... RUNTIME) on a bundle target fails configure with +# "no BUNDLE DESTINATION" on every ios/tvos/visionos/macos build. add_executable(kokoro-tts tools/kokoro-tts.cpp) target_link_libraries(kokoro-tts PRIVATE kokoro_lib) -# kokoro-tts is a CLI harness, not a GUI app. On Apple platforms CMake defaults -# executables to MACOSX_BUNDLE, and install(TARGETS ... RUNTIME) on a bundle -# target fails configure with "install TARGETS given no BUNDLE DESTINATION for -# MACOSX_BUNDLE executable" on every ios/tvos/visionos/macos build. Force the -# bundle flag off so the plain RUNTIME install is valid on all platforms. set_target_properties(kokoro-tts PROPERTIES MACOSX_BUNDLE OFF) install(TARGETS kokoro-tts RUNTIME) +add_executable(kokoro-stage-dump tools/kokoro-stage-dump.cpp) +target_link_libraries(kokoro-stage-dump PRIVATE kokoro_lib) +set_target_properties(kokoro-stage-dump PROPERTIES MACOSX_BUNDLE OFF) + +add_executable(kokoro-decoder-test tools/kokoro-decoder-test.cpp) +target_link_libraries(kokoro-decoder-test PRIVATE kokoro_lib) +set_target_properties(kokoro-decoder-test PROPERTIES MACOSX_BUNDLE OFF) + # Server-mount handler: compiled into kokoro_lib only when the server target # exists. The handler is guarded by `#ifdef LLAMA_BUILD_KOKORO` and pulls in # the same `server-http.h` interface that the omnivoice handler uses, plus diff --git a/tools/kokoro/convert_kokoro_pth_to_gguf.py b/tools/kokoro/convert_kokoro_pth_to_gguf.py index 8232c070f..12bd55fff 100644 --- a/tools/kokoro/convert_kokoro_pth_to_gguf.py +++ b/tools/kokoro/convert_kokoro_pth_to_gguf.py @@ -97,9 +97,15 @@ def _add_tensor(writer: gguf.GGUFWriter, name: str, data: np.ndarray) -> None: """Add tensors with the dtype layout the Kokoro forward pass expects. - Weight matrices and convolution kernels (ndim >= 2) are emitted as F16; - biases, norms, and other vectors stay F32. 
All-F32 GGUFs can load but - synthesize noise in the fused runtime path. + Weight matrices and convolution kernels (ndim >= 2) are emitted as F16 + purely to halve the GGUF download size; biases, norms, and other vectors + stay F32. The GGUF dtype does not affect correctness: the loader + dequantizes every tensor to F32 at load time, so an all-F32 and an + F16-weights GGUF produce identical synthesis. (An earlier note here + claimed all-F32 GGUFs synthesized noise — that was a misdiagnosis: the + fused path was a stub that ignored the weights, and the real defect was + the loader reading non-F32 tensors as raw F32. Both are fixed; F16 is + kept only for bundle size.) """ if data.dtype not in (np.float32, np.float16): data = data.astype(np.float32) diff --git a/tools/kokoro/include/kokoro-decoder-front.h b/tools/kokoro/include/kokoro-decoder-front.h new file mode 100644 index 000000000..4a89770c5 --- /dev/null +++ b/tools/kokoro/include/kokoro-decoder-front.h @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// kokoro-decoder-front.h — Decoder.forward up to the generator (validated port, #9588). 
+#pragma once
+#include <cmath>
+#include <cstring>
+#include <vector>
+#include "kokoro-layers.h" // conv1d_forward, adain1d_forward, convtranspose1d_depthwise_forward, convtranspose1d_out_len
+
+namespace eliza_kokoro {
+
+// Weight view for one decoder AdainResBlk1d. Holds the block geometry plus raw
+// float pointers to its tensors; every pointer defaults to null and the struct
+// contains no code that allocates or frees them, so the storage is owned by
+// whoever fills it in. Bracketed shapes are the PyTorch tensor layouts.
+struct DecAdainResBlk {
+    int Cin = 0, Cout = 0, Sdim = 128;
+    bool upsample = false;
+    bool learned_sc = false; // dim_in != dim_out
+    const float * norm1_fc_w = nullptr; // [2*Cin, Sdim]
+    const float * norm1_fc_b = nullptr; // [2*Cin]
+    const float * norm2_fc_w = nullptr; // [2*Cout, Sdim]
+    const float * norm2_fc_b = nullptr; // [2*Cout]
+    const float * conv1_w = nullptr; // [Cout, Cin, 3]
+    const float * conv1_b = nullptr; // [Cout]
+    const float * conv2_w = nullptr; // [Cout, Cout, 3]
+    const float * conv2_b = nullptr; // [Cout]
+    const float * conv1x1_w = nullptr; // [Cout, Cin, 1] (learned_sc only)
+    const float * conv1x1_b = nullptr; // [Cout] (null — conv1x1 bias=False)
+    const float * pool_w = nullptr; // [Cin, 1, 3] (upsample only)
+    const float * pool_b = nullptr; // [Cin] (upsample only)
+};
+
+// AdainResBlk1d (decode-block flavor: leaky_relu 0.2; pool/shortcut;
+// out = (residual + shortcut)/sqrt(2)). Output y [Cout, T_out].
+// x is [Cin, T_in] channel-major and is only read; s is the style vector of
+// length w.Sdim. T_out is set to T_in, or to the pooled width when w.upsample
+// is set (the shortcut is built at exactly 2*T_in and must match that width).
+inline void dec_adainresblk1d_forward(
+    const DecAdainResBlk & w, const float * x, int T_in, const float * s,
+    std::vector<float> & y, int & T_out) {
+    const int Cin = w.Cin, Cout = w.Cout, Sdim = w.Sdim;
+
+    // residual branch: norm1 -> leaky_relu(0.2) -> [pool] -> conv1 -> norm2 -> leaky_relu -> conv2
+    std::vector<float> r(x, x + (size_t)Cin * T_in);
+    adain1d_forward(r.data(), Cin, T_in, s, Sdim, w.norm1_fc_w, w.norm1_fc_b);
+    for (size_t i = 0; i < r.size(); ++i) if (r[i] < 0) r[i] *= 0.2f;
+
+    int T_pool = T_in;
+    if (w.upsample) {
+        T_pool = convtranspose1d_out_len(T_in, 3, 2, 1, 1);
+        std::vector<float> r2((size_t)Cin * T_pool);
+        convtranspose1d_depthwise_forward(r.data(), Cin, T_in, w.pool_w, w.pool_b, 3, 2, 1, 1, r2.data(), T_pool);
+        r.swap(r2);
+    }
+    std::vector<float> r3((size_t)Cout * T_pool);
+    conv1d_forward(r.data(), Cin, T_pool, w.conv1_w, w.conv1_b, Cout, 3, 1, 1, 1, r3.data(), T_pool);
+    adain1d_forward(r3.data(), Cout, T_pool, s, Sdim, w.norm2_fc_w, w.norm2_fc_b);
+    for (size_t i = 0; i < r3.size(); ++i) if (r3[i] < 0) r3[i] *= 0.2f;
+    std::vector<float> r4((size_t)Cout * T_pool);
+    conv1d_forward(r3.data(), Cout, T_pool, w.conv2_w, w.conv2_b, Cout, 3, 1, 1, 1, r4.data(), T_pool);
+
+    // shortcut branch: [nearest-upsample x2] -> [conv1x1 if learned_sc]
+    T_out = T_pool;
+    std::vector<float> sc;
+    if (w.upsample) {
+        const int T_up = T_in * 2; // == T_pool
+        std::vector<float> up((size_t)Cin * T_up);
+        for (int c = 0; c < Cin; ++c)
+            for (int t = 0; t < T_up; ++t)
+                up[(size_t)c * T_up + t] = x[(size_t)c * T_in + (t / 2)];
+        if (w.learned_sc) {
+            sc.assign((size_t)Cout * T_up, 0.0f);
+            conv1d_forward(up.data(), Cin, T_up, w.conv1x1_w, w.conv1x1_b, Cout, 1, 1, 0, 1, sc.data(), T_up);
+        } else sc.swap(up);
+    } else {
+        if (w.learned_sc) {
+            sc.assign((size_t)Cout * T_in, 0.0f);
+            conv1d_forward(x, Cin, T_in, w.conv1x1_w, w.conv1x1_b, Cout, 1, 1, 0, 1, sc.data(), T_in);
+        } else sc.assign(x, x + (size_t)Cin * T_in);
+    }
+
+    // Merge the two branches; the 1/sqrt(2) factor is the block's output scaling.
+    y.assign((size_t)Cout * T_out, 0.0f);
+    const float rsqrt2 = 1.0f / std::sqrt(2.0f);
+    for (size_t i = 0; i < y.size(); ++i) y[i] = (r4[i] + sc[i]) * rsqrt2;
+}
+
+// Weight view for everything decoder_front reads: the two stride-2 curve
+// convs, the asr_res projection, and the encode/decode AdainResBlk1d stack.
+struct DecoderFrontWeights {
+    const float * F0_conv_w = nullptr; // [1,1,3]
+    const float * F0_conv_b = nullptr; // [1]
+    const float * N_conv_w = nullptr; // [1,1,3]
+    const float * N_conv_b = nullptr; // [1]
+    const float * asr_res_w = nullptr; // [64,512,1]
+    const float * asr_res_b = nullptr; // [64]
+    DecAdainResBlk encode; // 514 -> 1024, learned_sc
+    DecAdainResBlk decode[4]; // 1090->1024 (x3), 1090->512 upsample
+};
+
+// Decoder.forward up to (not including) the generator.
+// asr[512,T_asr] (T_asr=132), F0_curve[2*T_asr], N[2*T_asr], s[128]
+// Output: x_out [512, 2*T_asr] (== generator_in_0); also returns the
+// stride-2 conv outputs F0_down[T_asr], N_down[T_asr]. The generator itself is
+// fed the ORIGINAL F0_curve, not these downsampled curves.
+// A non-positive T_asr or Cin_asr leaves all three outputs empty.
+inline void decoder_front(
+    const DecoderFrontWeights & W,
+    const float * asr, int Cin_asr, int T_asr,
+    const float * F0_curve, const float * N_in, const float * s,
+    std::vector<float> & x_out,
+    std::vector<float> & F0_down,
+    std::vector<float> & N_down) {
+    // Nothing to decode for an empty/invalid geometry. Bail out before any
+    // size arithmetic: a negative length would wrap into a huge allocation,
+    // and a zero width would divide by zero where the channel count is
+    // recovered from x.size() / T_x below.
+    if (T_asr <= 0 || Cin_asr <= 0) {
+        x_out.clear();
+        F0_down.clear();
+        N_down.clear();
+        return;
+    }
+
+    const int Tc = 2 * T_asr; // 264
+
+    F0_down.assign(T_asr, 0.0f);
+    conv1d_forward(F0_curve, 1, Tc, W.F0_conv_w, W.F0_conv_b, 1, 3, 2, 1, 1, F0_down.data(), T_asr);
+    N_down.assign(T_asr, 0.0f);
+    conv1d_forward(N_in, 1, Tc, W.N_conv_w, W.N_conv_b, 1, 3, 2, 1, 1, N_down.data(), T_asr);
+
+    // x = cat([asr, F0, N], dim=channels) -> [514, T_asr]
+    std::vector<float> xcat((size_t)(Cin_asr + 2) * T_asr);
+    std::memcpy(xcat.data(), asr, sizeof(float) * (size_t)Cin_asr * T_asr);
+    std::memcpy(xcat.data() + (size_t)Cin_asr * T_asr, F0_down.data(), sizeof(float) * T_asr);
+    std::memcpy(xcat.data() + (size_t)(Cin_asr + 1) * T_asr, N_down.data(), sizeof(float) * T_asr);
+
+    std::vector<float> x; int T_x = 0;
+    dec_adainresblk1d_forward(W.encode, xcat.data(), T_asr, s, x, T_x); // encode 514->1024
+
+    std::vector<float> asr_res((size_t)64 * T_asr); // asr_res Conv1d k1 512->64
+    conv1d_forward(asr, Cin_asr, T_asr, W.asr_res_w, W.asr_res_b, 64, 1, 1, 0, 1, asr_res.data(), T_asr);
+
+    // While `res` is set each block input is re-concatenated with the side
+    // channels. Those buffers hold T_asr samples, so the copies below rely on
+    // T_x still being T_asr, which holds until a block upsamples.
+    bool res = true;
+    for (int b = 0; b < 4; ++b) {
+        std::vector<float> blk_in;
+        if (res) { // cat([x, asr_res, F0, N]) -> 1024+64+1+1 = 1090
+            const int Cx = (int)(x.size() / T_x);
+            const int Cin_blk = Cx + 64 + 1 + 1;
+            blk_in.assign((size_t)Cin_blk * T_x, 0.0f);
+            std::memcpy(blk_in.data(), x.data(), sizeof(float) * (size_t)Cx * T_x);
+            std::memcpy(blk_in.data() + (size_t)Cx * T_x, asr_res.data(), sizeof(float) * 64 * T_x);
+            std::memcpy(blk_in.data() + (size_t)(Cx + 64) * T_x, F0_down.data(), sizeof(float) * T_x);
+            std::memcpy(blk_in.data() + (size_t)(Cx + 65) * T_x, N_down.data(), sizeof(float) * T_x);
+        } else {
+            blk_in.assign(x.begin(), x.end());
+        }
+        std::vector<float> y; int T_y = 0;
+        dec_adainresblk1d_forward(W.decode[b], blk_in.data(), T_x, s, y, T_y);
+        x.swap(y); T_x = T_y;
+        if (W.decode[b].upsample) res = false; // decode3 upsamples -> res stops
+    }
+    x_out.swap(x); // [512, 264]
+}
+
+} // namespace eliza_kokoro
\ No newline at end of file
diff --git a/tools/kokoro/include/kokoro-decoder.h b/tools/kokoro/include/kokoro-decoder.h
new file mode 100644
index 000000000..3de5d2562
--- /dev/null
+++ b/tools/kokoro/include/kokoro-decoder.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-decoder.h — StyleTTS-2 / iSTFTNet decoder: predictor outputs -> 24 kHz audio.
+//
+// Wires the validated decoder_front (kokoro-decoder-front.h) + Generator
+// (kokoro-generator.h) against the model's all-F32 ggml context. Replaces the
+// J2-ship placeholder spectrogram in kokoro_synthesize (#9588).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace eliza_kokoro {
+
+struct kokoro_model;
+
+// Run the full decoder.
Inputs come from kokoro_predictor_forward: +// asr_ct : [512, T_frame] channel-major (transpose of PredictorOut.asr [T,512]) +// F0, N : [2*T_frame] (PredictorOut.F0_pred / N_pred — the up-2x curves) +// ref_s_dec: [128] decoder-half style (ref_s[:128]) +// Output: audio (24 kHz mono), resized to (2*T_frame)*300. +bool kokoro_decoder_forward( + const kokoro_model * model, + const float * asr_ct, int T_frame, + const float * F0, const float * N, + const float * ref_s_dec, + std::vector & audio, + std::string & err); + +} // namespace eliza_kokoro diff --git a/tools/kokoro/include/kokoro-generator.h b/tools/kokoro/include/kokoro-generator.h new file mode 100644 index 000000000..d1d88f696 --- /dev/null +++ b/tools/kokoro/include/kokoro-generator.h @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// +// kokoro-generator.h — iSTFTNet Generator.forward (StyleTTS-2 decoder back-end). +// +// The generator turns the decoder body output `x` [512, 264], the style +// vector `s` [128], and the (un-downsampled) F0 curve `f0_curve` [264] into +// `audio` [79200] (24 kHz). +// +// Weights are raw float pointers (PyTorch row-major, weight_norm-fused): +// Conv1d weight [Cout, Cin, K] +// ConvTranspose1d wt [Cin, Cout, K] +// Linear weight [out, in] +// AdaIN1d fc.weight [2C, style_dim] (style_dim = 128) +// Snake alpha [C] +// The caller supplies them via GeneratorWeights so the function composes with +// any weight-loading boundary (GGUF tensor lookup, raw .f32 fixtures, …). + +#pragma once + +#include + +namespace eliza_kokoro { + +// One AdaINResBlock1 sub-block (the block has three, sharing the same channel +// count). convs use [Cout=Cin=C, Cin=C, K]. 
+struct GenSubBlockWeights {
+    // Every pointer defaults to null; the struct holds no code that allocates
+    // or frees the tensors, so the caller keeps the storage alive.
+    const float * conv1_w = nullptr; // [C, C, K]
+    const float * conv1_b = nullptr; // [C]
+    const float * conv2_w = nullptr; // [C, C, K]
+    const float * conv2_b = nullptr; // [C]
+    const float * adain1_fc_w = nullptr; // [2C, 128]
+    const float * adain1_fc_b = nullptr; // [2C]
+    const float * adain2_fc_w = nullptr; // [2C, 128]
+    const float * adain2_fc_b = nullptr; // [2C]
+    const float * alpha1 = nullptr; // [C]
+    const float * alpha2 = nullptr; // [C]
+};
+
+// One AdaINResBlock1: its three sub-blocks, indexed 0..2.
+struct GenAdaResBlockWeights {
+    GenSubBlockWeights sub[3];
+};
+
+// The full set of tensors handed to kokoro_generator_forward, grouped by the
+// generator stage that uses them. All members are raw pointers that start
+// null and are filled in by the caller.
+struct GeneratorWeights {
+    // m_source.l_linear: Linear(9 -> 1).
+    const float * l_linear_w = nullptr; // [1, 9]
+    const float * l_linear_b = nullptr; // [1]
+
+    // ups[0], ups[1]: ConvTranspose1d. weight [Cin, Cout, K], bias [Cout].
+    const float * ups_w[2] = { nullptr, nullptr };
+    const float * ups_b[2] = { nullptr, nullptr };
+
+    // noise_convs[0], noise_convs[1]: Conv1d. weight [Cout, 22, K], bias [Cout].
+    const float * noise_convs_w[2] = { nullptr, nullptr };
+    const float * noise_convs_b[2] = { nullptr, nullptr };
+
+    // noise_res[0] (k=7), noise_res[1] (k=11): AdaINResBlock1.
+    GenAdaResBlockWeights noise_res[2];
+
+    // resblocks[0..5]: AdaINResBlock1 (stage0: k=3,7,11 ch=256; stage1: ch=128).
+    GenAdaResBlockWeights resblocks[6];
+
+    // conv_post: Conv1d(128 -> 22, k=7, pad=3). weight [22, 128, 7], bias [22].
+    const float * conv_post_w = nullptr;
+    const float * conv_post_b = nullptr;
+};
+
+// Generator.forward. audio is resized to T0 * 300 (== 79200 for T0=264).
+void kokoro_generator_forward( + const float * x_in, // [512, T0] channel-major + int T0, // input time (== 2 * predictor T_frame) + const float * s, // [128] + const float * f0_curve, // [T0] + const GeneratorWeights & w, + std::vector & audio); + +} // namespace eliza_kokoro diff --git a/tools/kokoro/include/kokoro-phonemes.h b/tools/kokoro/include/kokoro-phonemes.h index cc9bb3fd6..068dc0f79 100644 --- a/tools/kokoro/include/kokoro-phonemes.h +++ b/tools/kokoro/include/kokoro-phonemes.h @@ -1,22 +1,26 @@ // SPDX-License-Identifier: MIT // -// kokoro-phonemes.h — minimal ASCII text → Kokoro phoneme-id mapping. +// kokoro-phonemes.h — text → Kokoro phoneme-id mapping. // -// Kokoro v1.0 uses espeak-ng's phoneme inventory + a small set of control -// tokens (BOS, EOS, PAD, blanks). The training-time path passes text through -// `phonemize` (Python wrapper around espeak-ng) before tokenizing. +// Kokoro v1.0 tokenizes espeak-ng IPA against a small fixed vocab +// (`tts/kokoro/tokenizer.json`, `model.vocab`). The reference Python path is: // -// Adding an espeak-ng dependency to the fork is overkill for a TTS that -// is being ported as a one-release deprecation runway. This header -// implements a deterministic grapheme→phoneme mapping that: +// text → espeak-ng (en-us, --ipa) → IPA string → per-codepoint vocab lookup +// → ids → model input_ids = [0, *ids, 0] (0 = the "$" pad symbol) // -// 1. covers the basic Latin alphabet + common digraphs (sh, ch, th, ng); -// 2. maps every other ASCII printable to PAD; -// 3. returns ids in the same value range as kokoro-onnx's tokenizer -// (PAD=0, BOS=1, EOS=2, then phonemes from offset 3). +// Every vocab key is a single Unicode codepoint, so the mapping is a pure +// codepoint→id table lookup over the IPA string (no multi-char digraph +// handling is needed — espeak already emits the canonical IPA codepoints, +// e.g. eɪ is two codepoints 'e'+'ɪ', each with its own id). 
// -// The synthesis quality this produces is noticeably worse than the -// espeak-ng path — that is the documented gap in J2-kokoro-port-notes.md. +// Two build modes: +// * KOKORO_USE_ESPEAK (default when libespeak-ng is linked) — the real G2P +// path: `phonemize_ipa()` drives espeak_TextToPhonemes() to get IPA, then +// maps to ids. This reproduces the kokoro reference ids exactly. +// * fallback — when espeak is unavailable the caller may pass pre-computed +// IPA from the TS layer (which already runs espeak) into +// `ipa_to_token_ids()`. `phonemize_ipa()` then returns an empty vector and +// the caller must supply IPA. #pragma once @@ -26,12 +30,42 @@ namespace eliza_kokoro { -// Tokenize a UTF-8 / ASCII text string into a phoneme-id vector. -// Always returns a sequence of length <= 510 (the BERT encoder cap in -// Kokoro v1.0 — anything longer is split at the caller). -std::vector phonemize_ascii(const std::string & text); +// Kokoro pad/boundary token. model.vocab maps '$' → 0; the reference wraps the +// phoneme ids as [PAD, *ids, PAD] to form the model input_ids. +inline constexpr int32_t KOKORO_PAD_ID = 0; + +// Map a single Unicode codepoint (an espeak IPA symbol) to its Kokoro vocab id. +// Returns -1 if the codepoint is not in the vocab (caller drops it, matching +// the reference which silently skips unmapped codepoints). +int32_t kokoro_codepoint_to_id(char32_t cp) noexcept; + +// Map an espeak-ng IPA string (UTF-8) to the bare Kokoro phoneme-id sequence +// (no pad wrapping). Codepoints absent from the vocab are dropped. This is the +// `ids` array in reference-ids.json — its length is the style-row index. +std::vector ipa_to_token_ids(const std::string & ipa); + +// Phonemize text to bare Kokoro phoneme ids via espeak-ng (en-us IPA). +// Returns the same sequence as `ipa_to_token_ids(espeak_ipa(text))`. 
+// When KOKORO_USE_ESPEAK is not compiled in, returns an empty vector — the +// caller must supply IPA from the TS layer and call `ipa_to_token_ids()`. +std::vector phonemize_ipa(const std::string & text); + +// Wrap a bare phoneme-id sequence as the model input_ids: [PAD, *ids, PAD]. +std::vector wrap_input_ids(const std::vector & ids); -// Diagnostic — total phoneme vocab size (for hparams cross-check). +// Convenience: text → model input_ids [PAD, *ipa_ids, PAD] via espeak. +// Equivalent to `wrap_input_ids(phonemize_ipa(text))`. +std::vector phonemize_to_input_ids(const std::string & text); + +// True when this build links libespeak-ng (the real G2P path is available). +bool espeak_available() noexcept; + +// Total Kokoro vocab size (highest id + 1 = 178 for v1.0). int phoneme_vocab_size() noexcept; +// --- Legacy ASCII fallback (retained only for callers not yet migrated) --- +// Deprecated: returns the degraded ASCII grapheme mapping. New code uses +// phonemize_to_input_ids(). +std::vector phonemize_ascii(const std::string & text); + } // namespace eliza_kokoro diff --git a/tools/kokoro/include/kokoro.h b/tools/kokoro/include/kokoro.h index 809362962..b218677cf 100644 --- a/tools/kokoro/include/kokoro.h +++ b/tools/kokoro/include/kokoro.h @@ -116,10 +116,12 @@ kokoro_status kokoro_load_voice_preset( kokoro_voice_preset & out, std::string & err_out) noexcept; -// Phonemize an input text into Kokoro's int phoneme ids. The implementation -// uses a deterministic ASCII grapheme→phoneme mapping (no espeak-ng -// dependency). This is intentionally lossy vs the upstream phonemizer — -// quality recovery is part of the gap documented in J2-kokoro-port-notes.md. +// Phonemize an input text into Kokoro's int phoneme ids (the model input_ids, +// wrapped as [PAD, *ids, PAD]). When the build links libespeak-ng this is the +// real G2P path (text → en-us IPA → Kokoro vocab ids), reproducing the +// upstream phonemizer's token sequence. 
Without libespeak-ng it falls back to
+// a deterministic (lossy) ASCII grapheme mapping; in that case the TS voice
+// layer should phonemize and pass IPA (see kokoro-phonemes.h ipa_to_token_ids).
 std::vector kokoro_phonemize(const std::string & text);
 
 // Synthesize a single utterance. `text` is the natural-language input,
diff --git a/tools/kokoro/src/kokoro-decoder.cpp b/tools/kokoro/src/kokoro-decoder.cpp
new file mode 100644
index 000000000..388f124b9
--- /dev/null
+++ b/tools/kokoro/src/kokoro-decoder.cpp
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-decoder.cpp — assemble decoder_front + Generator into the full
+// StyleTTS-2 / iSTFTNet decoder, reading weights from the model's all-F32
+// ggml context (dequantized at load). Validated against the PyTorch reference
+// stage-by-stage (#9588).
+
+#include "kokoro-decoder.h"
+#include "kokoro-decoder-front.h" // DecoderFrontWeights, DecAdainResBlk, decoder_front
+#include "kokoro-generator.h" // GeneratorWeights, kokoro_generator_forward
+
+#include "ggml.h"
+
+#include <string>
+
+namespace eliza_kokoro {
+
+// Defined in kokoro.cpp — the all-F32 working context the predictor reads.
+ggml_context * kokoro_model_ggml_ctx(const kokoro_model * model);
+
+namespace {
+
+// Tensor lookup over the model's ggml context. Returned pointers alias the
+// tensor data; nothing here frees them. The first lookup problem (a required
+// tensor that is absent, or any tensor that exists but is not F32) is kept in
+// `bad`, so a whole weight set can be validated after it has been filled
+// instead of spot-checking a handful of pointers.
+struct Lk {
+    ggml_context * ctx;
+    mutable std::string bad; // first offending tensor name; empty while every lookup was fine
+
+    // Optional lookup: nullptr when the tensor is absent. A tensor that exists
+    // with a non-F32 type must never be reinterpreted as float, so it is
+    // recorded as an error and also yields nullptr.
+    const float * get(const std::string & name) const {
+        ggml_tensor * t = ggml_get_tensor(ctx, name.c_str());
+        if (!t) return nullptr;
+        if (t->type != GGML_TYPE_F32) {
+            if (bad.empty()) bad = name + " (not F32)";
+            return nullptr;
+        }
+        return (const float *) t->data;
+    }
+
+    // Required lookup: like get(), but a null result is recorded as an error.
+    const float * req(const std::string & name) const {
+        const float * p = get(name);
+        if (!p && bad.empty()) bad = name;
+        return p;
+    }
+};
+
+// Fill an AdainResBlk1d (decode flavor) from a tensor-name prefix.
+// Weight tensors are required. Biases stay optional lookups: conv1d_forward is
+// already handed a null conv1x1 bias by design; whether the other layer
+// helpers accept a null bias is not visible from this file — TODO confirm.
+void fill_dec_block(const Lk & L, DecAdainResBlk & b, const std::string & pfx,
+                    int Cin, int Cout, bool upsample) {
+    b.Cin = Cin; b.Cout = Cout; b.Sdim = 128;
+    b.upsample = upsample;
+    b.learned_sc = (Cin != Cout);
+    b.norm1_fc_w = L.req(pfx + ".norm1.fc.weight");
+    b.norm1_fc_b = L.get(pfx + ".norm1.fc.bias");
+    b.norm2_fc_w = L.req(pfx + ".norm2.fc.weight");
+    b.norm2_fc_b = L.get(pfx + ".norm2.fc.bias");
+    b.conv1_w = L.req(pfx + ".conv1.weight");
+    b.conv1_b = L.get(pfx + ".conv1.bias");
+    b.conv2_w = L.req(pfx + ".conv2.weight");
+    b.conv2_b = L.get(pfx + ".conv2.bias");
+    b.conv1x1_w = b.learned_sc ? L.req(pfx + ".conv1x1.weight") : nullptr;
+    b.conv1x1_b = nullptr; // conv1x1 bias=False upstream
+    b.pool_w = upsample ? L.req(pfx + ".pool.weight") : nullptr;
+    b.pool_b = upsample ? L.get(pfx + ".pool.bias") : nullptr;
+}
+
+// Fill an AdaINResBlock1 (generator flavor: 3 sub-blocks, Snake1D) from a prefix.
+// Conv / AdaIN weights and the Snake alphas are required; biases are optional.
+void fill_gen_block(const Lk & L, GenAdaResBlockWeights & b, const std::string & pfx) {
+    for (int j = 0; j < 3; ++j) {
+        const std::string js = std::to_string(j);
+        GenSubBlockWeights & s = b.sub[j];
+        s.conv1_w = L.req(pfx + ".convs1." + js + ".weight");
+        s.conv1_b = L.get(pfx + ".convs1." + js + ".bias");
+        s.conv2_w = L.req(pfx + ".convs2." + js + ".weight");
+        s.conv2_b = L.get(pfx + ".convs2." + js + ".bias");
+        s.adain1_fc_w = L.req(pfx + ".adain1." + js + ".fc.weight");
+        s.adain1_fc_b = L.get(pfx + ".adain1." + js + ".fc.bias");
+        s.adain2_fc_w = L.req(pfx + ".adain2." + js + ".fc.weight");
+        s.adain2_fc_b = L.get(pfx + ".adain2." + js + ".fc.bias");
+        s.alpha1 = L.req(pfx + ".alpha1." + js);
+        s.alpha2 = L.req(pfx + ".alpha2." + js);
+    }
+}
+
+} // namespace
+
+// Run decoder_front and then the generator on one utterance.
+// Returns false with `err` set (and `audio` left empty) when an argument is
+// null/invalid, or when a weight tensor is missing or not F32.
+bool kokoro_decoder_forward(
+    const kokoro_model * model,
+    const float * asr_ct, int T_frame,
+    const float * F0, const float * N,
+    const float * ref_s_dec,
+    std::vector<float> & audio,
+    std::string & err) {
+    audio.clear();
+    if (!model) { err = "null model"; return false; }
+    if (T_frame <= 0) { err = "non-positive T_frame"; return false; }
+    if (!asr_ct || !F0 || !N || !ref_s_dec) { err = "null decoder input"; return false; }
+
+    ggml_context * ctx = kokoro_model_ggml_ctx(model);
+    if (!ctx) { err = "null model context"; return false; }
+    Lk L{ctx};
+
+    // --- decoder_front weights ---
+    DecoderFrontWeights W;
+    W.F0_conv_w = L.req("kokoro.decoder.F0_conv.weight");
+    W.F0_conv_b = L.get("kokoro.decoder.F0_conv.bias");
+    W.N_conv_w = L.req("kokoro.decoder.N_conv.weight");
+    W.N_conv_b = L.get("kokoro.decoder.N_conv.bias");
+    W.asr_res_w = L.req("kokoro.decoder.asr_res.weight");
+    W.asr_res_b = L.get("kokoro.decoder.asr_res.bias");
+    fill_dec_block(L, W.encode, "kokoro.decoder.encode", 514, 1024, /*upsample*/false);
+    fill_dec_block(L, W.decode[0], "kokoro.decoder.decode.0", 1090, 1024, false);
+    fill_dec_block(L, W.decode[1], "kokoro.decoder.decode.1", 1090, 1024, false);
+    fill_dec_block(L, W.decode[2], "kokoro.decoder.decode.2", 1090, 1024, false);
+    fill_dec_block(L, W.decode[3], "kokoro.decoder.decode.3", 1090, 512, /*upsample*/true);
+
+    // Every requested tensor is checked, not just a sample: a partially
+    // converted GGUF must fail here rather than dereference null in the forward pass.
+    if (!L.bad.empty()) {
+        err = "missing decoder weights (is the GGUF a full Kokoro model?): " + L.bad;
+        return false;
+    }
+
+    // --- generator weights ---
+    GeneratorWeights G;
+    G.l_linear_w = L.req("kokoro.gen.m_source.l_linear.weight");
+    G.l_linear_b = L.get("kokoro.gen.m_source.l_linear.bias");
+    for (int i = 0; i < 2; ++i) {
+        const std::string is = std::to_string(i);
+        G.ups_w[i] = L.req("kokoro.gen.ups." + is + ".weight");
+        G.ups_b[i] = L.get("kokoro.gen.ups." + is + ".bias");
+        G.noise_convs_w[i] = L.req("kokoro.gen.noise_convs." + is + ".weight");
+        G.noise_convs_b[i] = L.get("kokoro.gen.noise_convs." + is + ".bias");
+        fill_gen_block(L, G.noise_res[i], "kokoro.gen.noise_res." + is);
+    }
+    for (int i = 0; i < 6; ++i) {
+        fill_gen_block(L, G.resblocks[i], "kokoro.gen.resblocks." + std::to_string(i));
+    }
+    G.conv_post_w = L.req("kokoro.gen.conv_post.weight");
+    G.conv_post_b = L.get("kokoro.gen.conv_post.bias");
+
+    if (!L.bad.empty()) {
+        err = "missing generator weights (is the GGUF a full Kokoro model?): " + L.bad;
+        return false;
+    }
+
+    // --- run: decoder_front -> generator ---
+    std::vector<float> x, F0_down, N_down;
+    decoder_front(W, asr_ct, /*Cin_asr*/512, T_frame, F0, N, ref_s_dec, x, F0_down, N_down);
+
+    const int T0 = 2 * T_frame; // decoder_front upsamples T_frame -> 2*T_frame
+    if ((int) (x.size() / 512) != T0) {
+        err = "decoder_front output width mismatch";
+        return false;
+    }
+    kokoro_generator_forward(x.data(), T0, ref_s_dec, F0, G, audio);
+    return true;
+}
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/src/kokoro-generator.cpp b/tools/kokoro/src/kokoro-generator.cpp
new file mode 100644
index 000000000..6a4423b16
--- /dev/null
+++ b/tools/kokoro/src/kokoro-generator.cpp
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-generator.cpp — iSTFTNet Generator.forward, CPU scalar.
+//
+// Direct port of kokoro/istftnet.py `Generator.forward` (StyleTTS-2 +
+// iSTFTNet decoder back-end). Implements:
+//   - SourceModuleHnNSF / SineGen harmonic source (deterministic: zero
+//     initial phase noise, zero additive noise — see note below),
+//   - forward STFT (center=True) of the harmonic source,
+//   - the two upsample stages (ConvTranspose ups[i] + noise_convs[i] +
+//     noise_res[i] (AdaINResBlock1) + 3 resblocks[i*3+j] (AdaINResBlock1,
+//     Snake1D activation), with leaky_relu(0.1) and a reflection_pad(1,0)
+//     on the final stage),
+//   - conv_post (Conv1d 128->22, k7, pad3),
+//   - spec = exp(x[:11]), phase = sin(x[11:22]),
+//   - inverse STFT (center=True) -> audio.
+// +// Config (kokoro v1.0 istftnet): +// style_dim=128, upsample_initial_channel=512, +// upsample_rates=[10,6], upsample_kernel_sizes=[20,12], +// resblock_kernel_sizes=[3,7,11], resblock_dilation_sizes=[[1,3,5]]x3, +// gen_istft_n_fft=20, gen_istft_hop_size=5, +// m_source: harmonic_num=8 (dim=9), upsample_scale=300, voiced_threshold=10. +// +// All conv/linear weights are PyTorch row-major and weight_norm-fused: +// Conv1d weight [Cout, Cin, K] +// ConvTranspose1d wt [Cin, Cout, K] +// Linear weight [out, in] +// AdaIN1d fc.weight [2C, style_dim] +// alpha (Snake) [C] +// +// Determinism note: PyTorch's SineGen seeds an initial random phase +// (`rand_ini`, zeroed for the fundamental) and SourceModuleHnNSF adds +// Gaussian noise. For a reproducible / deterministic on-device renderer we +// drop both (zero phase noise on every harmonic, zero additive noise), so the +// harmonic source and the final audio will not bit-match a seeded PyTorch run +// — they are validated by correlation + structure instead. Every other stage +// (convs, resblocks, ups, conv_post, STFT/iSTFT) is exactly reproduced. +// +// Validated against per-stage reference activations regenerated from the +// real hexgrad/Kokoro-82M v1.0 decoder weights — see kokoro-generator.h and +// the integration notes returned with this work. 
+
+#include "kokoro-generator.h"
+#include "kokoro-layers.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#ifdef KOKORO_GEN_DEBUG
+#include <fstream>
+// Debug aid: writes n floats from d, as raw bytes, to /tmp/gendbg_<name>.f32.
+static void dbg_dump(const char * name, const float * d, size_t n) {
+    std::ofstream f(std::string("/tmp/gendbg_") + name + ".f32", std::ios::binary);
+    // Byte count follows the element type instead of a hard-coded 4.
+    f.write((const char *) d, (std::streamsize) (n * sizeof(float)));
+}
+#define DBG(name, d, n) dbg_dump(name, d, n)
+#else
+#define DBG(name, d, n) do {} while (0)
+#endif
+
+namespace eliza_kokoro {
+
+namespace {
+
+static constexpr double K_PI = 3.14159265358979323846;
+
+// ----------------------------------------------------------------------------
+// Periodic Hann window of length N: w[i] = 0.5 - 0.5*cos(2*pi*i/N).
+// Matches scipy.get_window('hann', N, fftbins=True) / torch.hann_window(N,
+// periodic=True). A non-positive n yields an empty window.
+// ----------------------------------------------------------------------------
+static std::vector<float> hann_periodic(int n) {
+    // A negative n would wrap to an enormous size_t in the allocation below.
+    if (n <= 0) return {};
+    std::vector<float> w((size_t) n);
+    const double scale = 2.0 * K_PI / (double) n;
+    for (int i = 0; i < n; ++i) {
+        w[(size_t) i] = (float) (0.5 - 0.5 * std::cos(scale * (double) i));
+    }
+    return w;
+}
+
+// ----------------------------------------------------------------------------
+// PyTorch F.interpolate(mode='linear', align_corners=False), 1D.
+// in: [C, T_in] (channel-major) -> out: [C, T_out].
+// Matches the half-pixel coordinate transform PyTorch uses by default.
+// ----------------------------------------------------------------------------
+// x is read as C rows of T_in samples, y is written as C rows of T_out samples.
+// Degenerate sizes (C, T_in or T_out <= 0) are a no-op.
+static void interp_linear(const float * x, int C, int T_in, int T_out, float * y) {
+    // With T_in == 0 the clamps below would leave s0 == s1 == -1 and read out
+    // of bounds; with T_out == 0 the scale is a division by zero.
+    if (C <= 0 || T_in <= 0 || T_out <= 0) {
+        return;
+    }
+    const double scale = (double) T_in / (double) T_out;
+    const int last = T_in - 1;
+    for (int o = 0; o < T_out; ++o) {
+        // src = (o + 0.5) * scale - 0.5   (align_corners=False half-pixel)
+        double src = ((double) o + 0.5) * scale - 0.5;
+        if (src < 0.0) src = 0.0;
+        int s0 = (int) std::floor(src);
+        int s1 = s0 + 1;
+        // The blend weight is taken before the indices are clamped to the last
+        // sample, so positions past the end blend the last sample with itself.
+        const double frac = src - (double) s0;
+        if (s1 > last) { s1 = last; }
+        if (s0 > last) { s0 = last; }
+        for (int c = 0; c < C; ++c) {
+            const float a = x[(size_t) c * T_in + s0];
+            const float b = x[(size_t) c * T_in + s1];
+            y[(size_t) c * T_out + o] = (float) (a + (b - a) * frac);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Forward STFT, center=True, return polar (magnitude, phase).
+// input: signal[N]; n_fft, hop, win (win==n_fft here). Reflect-pad n_fft/2
+// each side (PyTorch center=True default pad_mode='reflect'). Periodic Hann
+// window. Output: mag[F, n_frames], phase[F, n_frames], F=n_fft/2+1,
+// n_frames = N/hop + 1.
+// Degenerate input (N, n_fft, hop or win <= 0) yields n_frames == 0 and empty
+// mag/phase; F is still set to n_fft/2+1.
+// ----------------------------------------------------------------------------
+static void stft_center(const float * sig, int N, int n_fft, int hop, int win,
+                        std::vector<float> & mag, std::vector<float> & phase,
+                        int & F, int & n_frames) {
+    F = n_fft / 2 + 1;
+    // The reflect pad below reads sig[min(i + 1, N - 1)], i.e. sig[-1] when
+    // N == 0, and hop == 0 would divide by zero.
+    if (N <= 0 || n_fft <= 0 || hop <= 0 || win <= 0) {
+        n_frames = 0;
+        mag.clear();
+        phase.clear();
+        return;
+    }
+    const int pad = n_fft / 2;
+    n_frames = N / hop + 1;
+    // N + 2*pad covers every frame when win == n_fft is even. For an odd n_fft
+    // the last frame would run one sample past it, so size the buffer to the
+    // furthest sample any frame reads; the extra tail stays zero.
+    const int Np = std::max(N + 2 * pad, (n_frames - 1) * hop + win);
+    std::vector<float> padded((size_t) Np, 0.0f);
+    // reflect pad: padded[pad + i] = sig[i]; left/right reflect (no edge dup).
+    for (int i = 0; i < N; ++i) padded[(size_t) (pad + i)] = sig[i];
+    for (int i = 0; i < pad; ++i) {
+        padded[(size_t) (pad - 1 - i)] = sig[std::min(i + 1, N - 1)];
+        padded[(size_t) (pad + N + i)] = sig[std::max(N - 2 - i, 0)];
+    }
+    const std::vector<float> window = hann_periodic(win);
+    mag.assign((size_t) F * n_frames, 0.0f);
+    phase.assign((size_t) F * n_frames, 0.0f);
+    // Windowed frame, computed once per frame instead of once per bin. The
+    // products are the same doubles the per-bin loop used, so results match.
+    std::vector<double> frame((size_t) win);
+    for (int t = 0; t < n_frames; ++t) {
+        const int off = t * hop;
+        for (int k = 0; k < win; ++k) {
+            frame[(size_t) k] = (double) padded[(size_t) (off + k)] * (double) window[(size_t) k];
+        }
+        for (int f = 0; f < F; ++f) {
+            double re = 0.0, im = 0.0;
+            const double w0 = -2.0 * K_PI * (double) f / (double) n_fft;
+            for (int k = 0; k < win; ++k) {
+                const double v = frame[(size_t) k];
+                const double ang = w0 * (double) k;
+                re += v * std::cos(ang);
+                im += v * std::sin(ang);
+            }
+            mag[(size_t) f * n_frames + t] = (float) std::sqrt(re * re + im * im);
+            phase[(size_t) f * n_frames + t] = (float) std::atan2(im, re);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Inverse STFT, center=True. magnitude/phase: [F, n_frames] (polar). Periodic
+// Hann window, win==n_fft. Overlap-add with window^2 normalization, then trim
+// n_fft/2 from each end (the center=True crop). Output length = (n_frames-1)*hop.
+// Degenerate input (n_frames, n_fft, hop or win <= 0) yields an empty output.
+// ----------------------------------------------------------------------------
+static void istft_center(const float * mag, const float * phase,
+                         int n_fft, int hop, int win, int n_frames,
+                         std::vector<float> & out) {
+    // n_frames <= 0 would make n_full negative and the size_t casts below huge.
+    if (n_frames <= 0 || n_fft <= 0 || hop <= 0 || win <= 0) {
+        out.clear();
+        return;
+    }
+    const int F = n_fft / 2 + 1;
+    const int pad = n_fft / 2;
+    const int n_full = (n_frames - 1) * hop + win; // before crop
+    std::vector<double> acc((size_t) n_full, 0.0);
+    std::vector<double> wsum((size_t) n_full, 0.0);
+    const std::vector<float> window = hann_periodic(win);
+
+    std::vector<double> frame((size_t) n_fft);
+    std::vector<double> re((size_t) F), im((size_t) F);
+    for (int t = 0; t < n_frames; ++t) {
+        // Polar -> rectangular once per bin; these values do not depend on the
+        // output sample index n, so they are hoisted out of the irfft loop.
+        for (int f = 0; f < F; ++f) {
+            const double m = mag[(size_t) f * n_frames + t];
+            const double p = phase[(size_t) f * n_frames + t];
+            re[(size_t) f] = m * std::cos(p);
+            im[(size_t) f] = m * std::sin(p);
+        }
+        // irfft of the hermitian spectrum -> n_fft real samples.
+        for (int n = 0; n < n_fft; ++n) {
+            double s = 0.0;
+            for (int f = 0; f < F; ++f) {
+                const double ang = 2.0 * K_PI * (double) f * (double) n / (double) n_fft;
+                double term = re[(size_t) f] * std::cos(ang) - im[(size_t) f] * std::sin(ang);
+                // Every bin except DC (and Nyquist when n_fft is even) also
+                // stands for its mirrored negative-frequency bin.
+                if (f != 0 && !(n_fft % 2 == 0 && f == F - 1)) term *= 2.0;
+                s += term;
+            }
+            frame[(size_t) n] = s / (double) n_fft;
+        }
+        const int off = t * hop;
+        for (int k = 0; k < win; ++k) {
+            const double w = (double) window[(size_t) k];
+            acc[(size_t) (off + k)] += frame[(size_t) k] * w;
+            wsum[(size_t) (off + k)] += w * w;
+        }
+    }
+    for (int i = 0; i < n_full; ++i) {
+        if (wsum[(size_t) i] > 1e-11) acc[(size_t) i] /= wsum[(size_t) i];
+    }
+    // Clamp so a window shorter than the crop cannot produce a negative length.
+    const int n_out = std::max(0, n_full - 2 * pad);
+    out.assign((size_t) n_out, 0.0f);
+    for (int i = 0; i < n_out; ++i) out[(size_t) i] = (float) acc[(size_t) (i + pad)];
+}
+
+// ----------------------------------------------------------------------------
+// AdaINResBlock1.forward (Snake activation). x[C,T] in place.
+// Three sub-blocks: adain1 -> snake(alpha1) -> conv1[dilated] ->
+//   adain2 -> snake(alpha2) -> conv2[dilation 1] -> residual add.
+// convs1 dilations = dil[0..2]; convs2 dilation = 1.
+// padding = get_padding(K, d) = (K*d - d)/2.
+// ----------------------------------------------------------------------------
+static int get_padding(int k, int d) { return (k * d - d) / 2; }
+
+static void adain_resblock1(const GenAdaResBlockWeights & w,
+                            float * x, int C, int T, const float * s, int Sdim,
+                            int K, const int dil[3]) {
+    const size_t n = (size_t) C * T;
+    std::vector<float> xt(n), y1(n), y2(n);
+    for (int i = 0; i < 3; ++i) {
+        const GenSubBlockWeights & sb = w.sub[i];
+        // Work on a copy so x still holds the residual input for the final add.
+        std::memcpy(xt.data(), x, sizeof(float) * n);
+        adain1d_forward(xt.data(), C, T, s, Sdim, sb.adain1_fc_w, sb.adain1_fc_b);
+        snake1d_forward(xt.data(), C, T, sb.alpha1);
+        const int d1 = dil[i];
+        conv1d_forward(xt.data(), C, T, sb.conv1_w, sb.conv1_b, C, K, 1,
+                       get_padding(K, d1), d1, y1.data(), T);
+        adain1d_forward(y1.data(), C, T, s, Sdim, sb.adain2_fc_w, sb.adain2_fc_b);
+        snake1d_forward(y1.data(), C, T, sb.alpha2);
+        conv1d_forward(y1.data(), C, T, sb.conv2_w, sb.conv2_b, C, K, 1,
+                       get_padding(K, 1), 1, y2.data(), T);
+        for (size_t j = 0; j < n; ++j) x[j] = y2[j] + x[j];
+    }
+}
+
+} // namespace
+
+// ============================================================================
+// kokoro_generator_forward
+//
+// Runs the generator on x_in and the F0 curve and writes the waveform to
+// `audio`: harmonic source -> STFT -> two upsample stages -> conv_post ->
+// iSTFT. For T0 <= 0 the output is cleared and nothing else is done.
+// ============================================================================
+void kokoro_generator_forward(
+        const float * x_in,            // [512, T0]
+        int T0,                        // input time (== 2 * predictor T_frame)
+        const float * s,               // [128]
+        const float * f0_curve,        // [T0]
+        const GeneratorWeights & w,
+        std::vector<float> & audio /* out [T0 * 300] */) {
+
+    const int Sdim = 128;
+    const int up0 = 10, up1 = 6;          // upsample rates
+    const int hop = 5;
+    const int n_fft = 20, win = 20;
+    const int upsample_scale = up0 * up1 * hop; // 300
+    const int dim = 9;                    // harmonic_num (8) + fundamental
+    const float sine_amp = 0.1f;
+    const float voiced_threshold = 10.0f;
+    const float sr = 24000.0f;
+
+    // An empty (or invalid) input has no frames to vocode. Bail out before the
+    // STFT reflect pad and the ReflectionPad1d below, which read sample 0/1 of
+    // their inputs unconditionally.
+    if (T0 <= 0) {
+        audio.clear();
+        return;
+    }
+
+    // ------------------------------------------------------------------
+    // 1. Harmonic source. f0_up = nearest-upsample(f0_curve) x300.
+    // ------------------------------------------------------------------
+    const int L = T0 * upsample_scale;    // 79200
+    std::vector<float> f0_up((size_t) L);
+    for (int t = 0; t < T0; ++t)
+        for (int r = 0; r < upsample_scale; ++r)
+            f0_up[(size_t) t * upsample_scale + r] = f0_curve[t];
+
+    // fn = f0 * [1..9]; rad = (fn / sr) % 1 ; [dim, L] channel-major.
+    std::vector<float> rad((size_t) dim * L);
+    for (int h = 0; h < dim; ++h) {
+        const float mul = (float) (h + 1);
+        for (int t = 0; t < L; ++t) {
+            float v = f0_up[(size_t) t] * mul / sr;
+            v = v - std::floor(v);  // % 1
+            rad[(size_t) h * L + t] = v;
+        }
+    }
+    // Deterministic: rad[:,0,:] += 0 (no random initial phase).
+    // _f02sine: downsample rad by 1/upsample_scale (linear), cumsum*2pi,
+    // upsample by *upsample_scale (with phase scaled by upsample_scale), sin.
+    const int Lds = L / upsample_scale;   // 264
+    std::vector<float> rad_ds((size_t) dim * Lds);
+    interp_linear(rad.data(), dim, L, Lds, rad_ds.data());
+    // cumsum over time per channel, * 2*pi.
+    std::vector<float> phase_ds((size_t) dim * Lds);
+    for (int h = 0; h < dim; ++h) {
+        double cum = 0.0;
+        for (int t = 0; t < Lds; ++t) {
+            cum += (double) rad_ds[(size_t) h * Lds + t];
+            phase_ds[(size_t) h * Lds + t] = (float) (cum * 2.0 * K_PI);
+        }
+    }
+    // F.interpolate(phase * upsample_scale, scale=upsample_scale, linear).
+    std::vector<float> phase_scaled((size_t) dim * Lds);
+    for (size_t i = 0; i < (size_t) dim * Lds; ++i)
+        phase_scaled[i] = phase_ds[i] * (float) upsample_scale;
+    std::vector<float> phase_up((size_t) dim * L);
+    interp_linear(phase_scaled.data(), dim, Lds, L, phase_up.data());
+    // sines = sin(phase) * sine_amp ; voiced mask uv = (f0 > thr).
+    // sine_waves = sines * uv (+ noise = 0). sine_merge = tanh(l_linear(sine_waves)).
+    std::vector<float> sine_waves((size_t) dim * L);
+    for (int h = 0; h < dim; ++h) {
+        for (int t = 0; t < L; ++t) {
+            const float uv = (f0_up[(size_t) t] > voiced_threshold) ? 1.0f : 0.0f;
+            sine_waves[(size_t) h * L + t] =
+                std::sin(phase_up[(size_t) h * L + t]) * sine_amp * uv;
+        }
+    }
+    // l_linear: Linear(9 -> 1). weight [1, 9], bias [1]. har_source[L].
+    std::vector<float> har_source((size_t) L);
+    for (int t = 0; t < L; ++t) {
+        double acc = w.l_linear_b[0];
+        for (int h = 0; h < dim; ++h)
+            acc += (double) w.l_linear_w[h] * (double) sine_waves[(size_t) h * L + t];
+        har_source[(size_t) t] = std::tanh((float) acc);
+    }
+    DBG("har_source", har_source.data(), har_source.size());
+
+    // ------------------------------------------------------------------
+    // 2. STFT of har_source -> har = cat(mag, phase) [22, n_frames].
+    // ------------------------------------------------------------------
+    std::vector<float> hmag, hphase;
+    int F = 0, n_frames = 0;
+    stft_center(har_source.data(), L, n_fft, hop, win, hmag, hphase, F, n_frames);
+    const int Hc = 2 * F;                 // 22
+    std::vector<float> har((size_t) Hc * n_frames);
+    std::memcpy(har.data(), hmag.data(), sizeof(float) * (size_t) F * n_frames);
+    std::memcpy(har.data() + (size_t) F * n_frames, hphase.data(),
+                sizeof(float) * (size_t) F * n_frames);
+    DBG("har", har.data(), har.size());
+#ifdef KOKORO_GEN_INJECT_HAR
+    {
+        std::ifstream hf(KOKORO_GEN_INJECT_HAR, std::ios::binary);
+        if (hf) { hf.read((char *) har.data(), sizeof(float) * har.size()); }
+    }
+#endif
+
+    // ------------------------------------------------------------------
+    // 3. Upsample stages.
+    // ------------------------------------------------------------------
+    const int ups_rate[2] = { up0, up1 };
+    const int ups_k[2]    = { 20, 12 };
+    const int ch_after[2] = { 256, 128 };  // upsample_initial_channel >> (i+1)
+    const int dil135[3]   = { 1, 3, 5 };
+
+    // current x: [512, 264] (copy, mutable).
+    int curC = 512, curT = T0;
+    std::vector<float> x(x_in, x_in + (size_t) curC * curT);
+
+    for (int i = 0; i < 2; ++i) {
+        const int u = ups_rate[i], k = ups_k[i];
+        const int outC = ch_after[i];
+
+        // leaky_relu(x, 0.1) (in place).
+        leaky_relu(x.data(), curC * curT, 0.1f);
+
+        // x_source = noise_convs[i](har); then noise_res[i](x_source, s).
+        // noise_convs[0]: Conv1d(22->256, k=stride_f0*2=12, stride=6, pad=(6+1)/2=3).
+        // noise_convs[1]: Conv1d(22->128, k=1, stride=1, pad=0).
+        std::vector<float> x_source;
+        int xsT = 0;
+        if (i == 0) {
+            const int stride_f0 = up1;            // prod(upsample_rates[1:]) = 6
+            const int nk = stride_f0 * 2;         // 12
+            const int npad = (stride_f0 + 1) / 2; // 3
+            xsT = (n_frames + 2 * npad - (nk - 1) - 1) / stride_f0 + 1;
+            x_source.assign((size_t) outC * xsT, 0.0f);
+            conv1d_forward(har.data(), Hc, n_frames, w.noise_convs_w[i], w.noise_convs_b[i],
+                           outC, nk, stride_f0, npad, 1, x_source.data(), xsT);
+        } else {
+            xsT = n_frames;  // k=1 stride=1 pad=0
+            x_source.assign((size_t) outC * xsT, 0.0f);
+            conv1d_forward(har.data(), Hc, n_frames, w.noise_convs_w[i], w.noise_convs_b[i],
+                           outC, 1, 1, 0, 1, x_source.data(), xsT);
+        }
+        if (i == 0) DBG("noise_convs0", x_source.data(), x_source.size());
+        const int nr_k = (i == 0) ? 7 : 11;
+        adain_resblock1(w.noise_res[i], x_source.data(), outC, xsT, s, Sdim, nr_k, dil135);
+        if (i == 0) DBG("noise_res0", x_source.data(), x_source.size());
+
+        // x = ups[i](x). ConvTranspose1d(curC -> outC, k, stride=u, pad=(k-u)/2).
+        const int tpad = (k - u) / 2;
+        const int upT = convtranspose1d_out_len(curT, k, u, tpad, /*output_pad*/0);
+        std::vector<float> xup((size_t) outC * upT);
+        convtranspose1d_forward(x.data(), curC, curT, w.ups_w[i], w.ups_b[i], outC, k,
+                                u, tpad, 0, xup.data(), upT);
+
+        // Final stage: reflection_pad(1,0) -> T grows by 1 on the left.
+        int xT = upT;
+        std::vector<float> xpad;
+        const float * xptr = xup.data();
+        if (i == 1) {
+            xT = upT + 1;
+            xpad.assign((size_t) outC * xT, 0.0f);
+            for (int c = 0; c < outC; ++c) {
+                // ReflectionPad1d((1,0)): left pad reflects index 1.
+                xpad[(size_t) c * xT + 0] = xup[(size_t) c * upT + 1];
+                std::memcpy(xpad.data() + (size_t) c * xT + 1,
+                            xup.data() + (size_t) c * upT, sizeof(float) * (size_t) upT);
+            }
+            xptr = xpad.data();
+        }
+
+        // x = x + x_source (shapes match: xT == xsT).
+        std::vector<float> xs((size_t) outC * xT);
+        for (size_t j = 0; j < (size_t) outC * xT; ++j) xs[j] = xptr[j] + x_source[j];
+
+        // resblocks: xs_sum = sum_j resblock[i*3+j](x, s) / 3.
+        std::vector<float> accum((size_t) outC * xT, 0.0f);
+        const int rb_k[3] = { 3, 7, 11 };
+        for (int j = 0; j < 3; ++j) {
+            std::vector<float> rb(xs);  // copy current x
+            adain_resblock1(w.resblocks[i * 3 + j], rb.data(), outC, xT, s, Sdim,
+                            rb_k[j], dil135);
+            for (size_t q = 0; q < (size_t) outC * xT; ++q) accum[q] += rb[q];
+        }
+        x.assign((size_t) outC * xT, 0.0f);
+        for (size_t q = 0; q < (size_t) outC * xT; ++q) x[q] = accum[q] / 3.0f;
+        curC = outC; curT = xT;
+        if (i == 0) DBG("stage0_x", x.data(), x.size());
+        if (i == 1) DBG("stage1_x", x.data(), x.size());
+    }
+
+    // ------------------------------------------------------------------
+    // 4. conv_post -> spec/phase -> iSTFT.
+    // ------------------------------------------------------------------
+    leaky_relu(x.data(), curC * curT, 0.01f);  // F.leaky_relu (no slope arg) = default 0.01
+    const int cpC = n_fft + 2;                 // 22
+    std::vector<float> cp((size_t) cpC * curT);
+    conv1d_forward(x.data(), curC, curT, w.conv_post_w, w.conv_post_b, cpC, 7,
+                   1, 3, 1, cp.data(), curT);
+    DBG("conv_post", cp.data(), cp.size());
+
+    // spec = exp(cp[:11]); phase = sin(cp[11:22]).
+    const int post_F = n_fft / 2 + 1;          // 11
+    std::vector<float> spec((size_t) post_F * curT), phase((size_t) post_F * curT);
+    for (int f = 0; f < post_F; ++f) {
+        for (int t = 0; t < curT; ++t) {
+            spec[(size_t) f * curT + t]  = std::exp(cp[(size_t) f * curT + t]);
+            phase[(size_t) f * curT + t] = std::sin(cp[(size_t) (post_F + f) * curT + t]);
+        }
+    }
+    istft_center(spec.data(), phase.data(), n_fft, hop, win, curT, audio);
+}
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/src/kokoro-phonemes.cpp b/tools/kokoro/src/kokoro-phonemes.cpp
index 45cde3d20..1f9fe9619 100644
--- a/tools/kokoro/src/kokoro-phonemes.cpp
+++ b/tools/kokoro/src/kokoro-phonemes.cpp
@@ -1,104 +1,269 @@
 // SPDX-License-Identifier: MIT
 //
-// kokoro-phonemes.cpp — ASCII grapheme→phoneme mapping for the Kokoro
-// fork path. See kokoro-phonemes.h for the contract.
+// kokoro-phonemes.cpp — real G2P for the Kokoro fork path.
 //
-// The mapping is intentionally minimal: it matches the phoneme-id offsets
-// used by the kokoro-onnx tokenizer for the *single-character* phonemes only.
-// Multi-char espeak-ng phonemes (eɪ, oʊ, ʊə, etc) are NOT emitted — those
-// require a full G2P pass that is out of scope for this header. The
-// downstream synthesis still runs, just with degraded acoustic quality.
+// text → espeak-ng (en-us IPA) → per-codepoint Kokoro vocab lookup → ids.
+// See kokoro-phonemes.h for the contract.
+//
+// The vocab table is embedded (generated from
+// `tts/kokoro/tokenizer.json` model.vocab). Every vocab key is a single
+// Unicode codepoint; the IPA string is decoded codepoint-by-codepoint and
+// each codepoint is mapped to its id. This reproduces the kokoro reference
+// ids exactly (validated against reference-ids.json).
+//
+// When the build links libespeak-ng (-DKOKORO_USE_ESPEAK), phonemize_ipa()
+// runs the real G2P. Otherwise the TS layer (which already runs espeak)
+// supplies the IPA and the caller uses ipa_to_token_ids() directly.
#include "kokoro-phonemes.h" #include -#include -#include +#include +#include +#include + +#if defined(KOKORO_USE_ESPEAK) +#include +#include +#endif namespace eliza_kokoro { namespace { -// Special tokens (match upstream kokoro-onnx). -static constexpr int32_t TOK_PAD = 0; -static constexpr int32_t TOK_BOS = 1; -static constexpr int32_t TOK_EOS = 2; -static constexpr int32_t TOK_BLANK = 3; - -// First non-special id (matches kokoro-onnx tokenizer offset). -static constexpr int32_t PHONEME_OFFSET = 4; - -// Coarse ASCII letter → phoneme-id table. The id space follows the -// kokoro-onnx tokenizer for single-letter mappings; everything else falls -// back to TOK_BLANK so the synthesis path still emits a valid sequence. -static const std::unordered_map & letter_table() { - static const std::unordered_map kTable = { - {'a', PHONEME_OFFSET + 0}, - {'b', PHONEME_OFFSET + 1}, - {'c', PHONEME_OFFSET + 2}, - {'d', PHONEME_OFFSET + 3}, - {'e', PHONEME_OFFSET + 4}, - {'f', PHONEME_OFFSET + 5}, - {'g', PHONEME_OFFSET + 6}, - {'h', PHONEME_OFFSET + 7}, - {'i', PHONEME_OFFSET + 8}, - {'j', PHONEME_OFFSET + 9}, - {'k', PHONEME_OFFSET + 10}, - {'l', PHONEME_OFFSET + 11}, - {'m', PHONEME_OFFSET + 12}, - {'n', PHONEME_OFFSET + 13}, - {'o', PHONEME_OFFSET + 14}, - {'p', PHONEME_OFFSET + 15}, - {'q', PHONEME_OFFSET + 16}, - {'r', PHONEME_OFFSET + 17}, - {'s', PHONEME_OFFSET + 18}, - {'t', PHONEME_OFFSET + 19}, - {'u', PHONEME_OFFSET + 20}, - {'v', PHONEME_OFFSET + 21}, - {'w', PHONEME_OFFSET + 22}, - {'x', PHONEME_OFFSET + 23}, - {'y', PHONEME_OFFSET + 24}, - {'z', PHONEME_OFFSET + 25}, - // Punctuation gets dedicated ids so the rhythm predictor sees them. - {' ', PHONEME_OFFSET + 26}, - {'.', PHONEME_OFFSET + 27}, - {',', PHONEME_OFFSET + 28}, - {'!', PHONEME_OFFSET + 29}, - {'?', PHONEME_OFFSET + 30}, - {';', PHONEME_OFFSET + 31}, - {':', PHONEME_OFFSET + 32}, - {'\'', PHONEME_OFFSET + 33}, +// Embedded Kokoro vocab: codepoint → id, sorted by codepoint for binary search. 
+// Generated from tokenizer.json model.vocab (115 entries, max id 177). +struct VocabEntry { + char32_t cp; + int32_t id; +}; + +constexpr VocabEntry kVocab[] = { + {0x0020u, 16}, {0x0021u, 5}, {0x0022u, 11}, {0x0024u, 0}, + {0x0028u, 12}, {0x0029u, 13}, {0x002Cu, 3}, {0x002Eu, 4}, + {0x003Au, 2}, {0x003Bu, 1}, {0x003Fu, 6}, {0x0041u, 24}, + {0x0049u, 25}, {0x004Fu, 31}, {0x0051u, 33}, {0x0053u, 35}, + {0x0054u, 36}, {0x0057u, 39}, {0x0059u, 41}, {0x0061u, 43}, + {0x0062u, 44}, {0x0063u, 45}, {0x0064u, 46}, {0x0065u, 47}, + {0x0066u, 48}, {0x0068u, 50}, {0x0069u, 51}, {0x006Au, 52}, + {0x006Bu, 53}, {0x006Cu, 54}, {0x006Du, 55}, {0x006Eu, 56}, + {0x006Fu, 57}, {0x0070u, 58}, {0x0071u, 59}, {0x0072u, 60}, + {0x0073u, 61}, {0x0074u, 62}, {0x0075u, 63}, {0x0076u, 64}, + {0x0077u, 65}, {0x0078u, 66}, {0x0079u, 67}, {0x007Au, 68}, + {0x00E6u, 72}, {0x00E7u, 78}, {0x00F0u, 81}, {0x00F8u, 116}, + {0x014Bu, 112}, {0x0153u, 120}, {0x0250u, 70}, {0x0251u, 69}, + {0x0252u, 71}, {0x0254u, 76}, {0x0255u, 77}, {0x0256u, 80}, + {0x0259u, 83}, {0x025Au, 85}, {0x025Bu, 86}, {0x025Cu, 87}, + {0x025Fu, 90}, {0x0261u, 92}, {0x0263u, 139}, {0x0264u, 140}, + {0x0265u, 99}, {0x0268u, 101}, {0x026Au, 102}, {0x026Fu, 110}, + {0x0270u, 111}, {0x0272u, 114}, {0x0273u, 113}, {0x0274u, 115}, + {0x0278u, 118}, {0x0279u, 123}, {0x027Bu, 126}, {0x027Du, 129}, + {0x027Eu, 125}, {0x0281u, 128}, {0x0282u, 130}, {0x0283u, 131}, + {0x0288u, 132}, {0x028Au, 135}, {0x028Bu, 136}, {0x028Cu, 138}, + {0x028Eu, 143}, {0x0292u, 147}, {0x0294u, 148}, {0x029Du, 103}, + {0x02A3u, 18}, {0x02A4u, 82}, {0x02A5u, 19}, {0x02A6u, 20}, + {0x02A7u, 133}, {0x02A8u, 21}, {0x02B0u, 162}, {0x02B2u, 164}, + {0x02C8u, 156}, {0x02CCu, 157}, {0x02D0u, 158}, {0x0303u, 17}, + {0x03B2u, 75}, {0x03B8u, 119}, {0x03C7u, 142}, {0x1D4Au, 42}, + {0x1D5Du, 22}, {0x1D7Bu, 177}, {0x2014u, 9}, {0x201Cu, 14}, + {0x201Du, 15}, {0x2026u, 10}, {0x2192u, 171}, {0x2193u, 169}, + {0x2197u, 172}, {0x2198u, 173}, {0xAB67u, 23}, +}; + +constexpr 
size_t kVocabSize = sizeof(kVocab) / sizeof(kVocab[0]);
+constexpr int32_t kMaxId = 177;
+
+// Decode the next UTF-8 codepoint from s starting at i. Advances i past the
+// consumed bytes. Returns (char32_t)-1 on a malformed byte (and advances by 1
+// to guarantee forward progress).
+//
+// "Malformed" covers stray or missing continuation bytes, overlong encodings,
+// UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF. Rejecting
+// these keeps a non-canonical byte sequence from aliasing a real vocab entry
+// (e.g. 0xC0 0xA0 decoding to U+0020).
+char32_t next_codepoint(const std::string & s, size_t & i) noexcept {
+    constexpr char32_t kBad = static_cast<char32_t>(-1);
+    // Out-of-contract call: nothing left to decode. Pin i to the end so the
+    // caller's `i < s.size()` loop terminates.
+    if (i >= s.size()) {
+        i = s.size();
+        return kBad;
+    }
+    const unsigned char c0 = static_cast<unsigned char>(s[i]);
+    if (c0 < 0x80u) {
+        i += 1;
+        return c0;
+    }
+    auto cont = [&](size_t k) -> bool {
+        return i + k < s.size() &&
+               (static_cast<unsigned char>(s[i + k]) & 0xC0u) == 0x80u;
     };
-    return kTable;
+    // The lead-byte patterns are mutually exclusive, so a sequence rejected by
+    // its range check falls through to the single-byte error path at the end.
+    if ((c0 & 0xE0u) == 0xC0u && cont(1)) {
+        const char32_t cp = ((c0 & 0x1Fu) << 6) |
+                            (static_cast<unsigned char>(s[i + 1]) & 0x3Fu);
+        if (cp >= 0x80u) {  // below this is an overlong form
+            i += 2;
+            return cp;
+        }
+    }
+    if ((c0 & 0xF0u) == 0xE0u && cont(1) && cont(2)) {
+        const char32_t cp = ((c0 & 0x0Fu) << 12) |
+                            ((static_cast<unsigned char>(s[i + 1]) & 0x3Fu) << 6) |
+                            (static_cast<unsigned char>(s[i + 2]) & 0x3Fu);
+        const bool surrogate = cp >= 0xD800u && cp <= 0xDFFFu;
+        if (cp >= 0x800u && !surrogate) {
+            i += 3;
+            return cp;
+        }
+    }
+    if ((c0 & 0xF8u) == 0xF0u && cont(1) && cont(2) && cont(3)) {
+        const char32_t cp = ((c0 & 0x07u) << 18) |
+                            ((static_cast<unsigned char>(s[i + 1]) & 0x3Fu) << 12) |
+                            ((static_cast<unsigned char>(s[i + 2]) & 0x3Fu) << 6) |
+                            (static_cast<unsigned char>(s[i + 3]) & 0x3Fu);
+        if (cp >= 0x10000u && cp <= 0x10FFFFu) {
+            i += 4;
+            return cp;
+        }
+    }
+    i += 1;
+    return kBad;
+}
+
+#if defined(KOKORO_USE_ESPEAK)
+
+// espeak-ng is process-global and not thread-safe; serialize init + calls.
+std::mutex & espeak_mutex() {
+    static std::mutex m;
+    return m;
+}
+
+// One-time espeak-ng setup; the result of the first attempt is cached in a
+// function-local static and returned on every later call.
+bool ensure_espeak_init() {
+    static bool ok = [] {
+        // AUDIO_OUTPUT_SYNCHRONOUS so no audio device is opened.
+        const int rate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr, 0);
+        if (rate < 0) {
+            return false;
+        }
+        return espeak_SetVoiceByName("en-us") == EE_OK;
+    }();
+    return ok;
+}
+
+// Run espeak over text, accumulating IPA across all clauses.
espeak returns
+// one clause per call (stopping at sentence/comma punctuation) and advances
+// the text pointer; we join clauses with a single space to match the
+// `espeak-ng -q --ipa` binary output the reference was derived from.
+// Returns an empty string when espeak cannot be initialized.
+std::string espeak_text_to_ipa(const std::string & text) {
+    std::lock_guard<std::mutex> lock(espeak_mutex());
+    if (!ensure_espeak_init()) {
+        return std::string();
+    }
+    const void * inptr = static_cast<const void *>(text.c_str());
+    const int textmode = espeakCHARS_UTF8;
+    const int phmode = espeakPHONEMES_IPA;  // bits 0-2 = 2 → IPA names
+    std::string out;
+    while (inptr != nullptr) {
+        const char * clause =
+            espeak_TextToPhonemes(&inptr, textmode, phmode);
+        if (clause == nullptr) {
+            break;
+        }
+        // Trim leading/trailing whitespace espeak may attach to a clause.
+        std::string c(clause);
+        const size_t b = c.find_first_not_of(" \t\n\r");
+        if (b == std::string::npos) {
+            continue;  // whitespace-only clause
+        }
+        const size_t e = c.find_last_not_of(" \t\n\r");
+        c = c.substr(b, e - b + 1);
+        if (!out.empty()) {
+            out.push_back(' ');
+        }
+        out += c;
+    }
+    return out;
 }
 
+#endif  // KOKORO_USE_ESPEAK
+
 } // namespace
 
-std::vector<int32_t> phonemize_ascii(const std::string & text) {
-    std::vector<int32_t> out;
-    out.reserve(text.size() + 4);
-    out.push_back(TOK_BOS);
-
-    const auto & table = letter_table();
-    for (char c : text) {
-        const char lc = (char) std::tolower((unsigned char) c);
-        auto it = table.find(lc);
-        if (it == table.end()) {
-            out.push_back(TOK_BLANK);
+int32_t kokoro_codepoint_to_id(char32_t cp) noexcept {
+    // Binary search over the codepoint-sorted table.
+    size_t lo = 0;
+    size_t hi = kVocabSize;
+    while (lo < hi) {
+        const size_t mid = lo + (hi - lo) / 2;
+        if (kVocab[mid].cp < cp) {
+            lo = mid + 1;
+        } else if (kVocab[mid].cp > cp) {
+            hi = mid;
         } else {
-            out.push_back(it->second);
+            return kVocab[mid].id;
+        }
+    }
+    return -1;
+}
+
+std::vector<int32_t> ipa_to_token_ids(const std::string & ipa) {
+    std::vector<int32_t> ids;
+    ids.reserve(ipa.size());
+    size_t i = 0;
+    while (i < ipa.size()) {
+        const char32_t cp = next_codepoint(ipa, i);
+        if (cp == static_cast<char32_t>(-1)) {
+            continue;  // malformed byte, already advanced
+        }
+        const int32_t id = kokoro_codepoint_to_id(cp);
+        if (id >= 0) {
+            ids.push_back(id);
         }
-        // Hard cap at the BERT encoder's 510-token limit (including specials).
-        if (out.size() >= 509) break;
+        // Unmapped codepoints are dropped (reference behavior).
     }
+    return ids;
+}
 
-    out.push_back(TOK_EOS);
+std::vector<int32_t> phonemize_ipa(const std::string & text) {
+#if defined(KOKORO_USE_ESPEAK)
+    return ipa_to_token_ids(espeak_text_to_ipa(text));
+#else
+    (void) text;
+    return std::vector<int32_t>();  // caller must supply IPA via ipa_to_token_ids
+#endif
+}
+
+std::vector<int32_t> wrap_input_ids(const std::vector<int32_t> & ids) {
+    std::vector<int32_t> out;
+    out.reserve(ids.size() + 2);
+    out.push_back(KOKORO_PAD_ID);
+    // Kokoro's BERT encoder caps the phoneme run at 510 (512 with both pads).
+    const size_t cap = std::min<size_t>(ids.size(), 510);
+    out.insert(out.end(), ids.begin(), ids.begin() + static_cast<std::ptrdiff_t>(cap));
+    out.push_back(KOKORO_PAD_ID);
     return out;
 }
 
+std::vector<int32_t> phonemize_to_input_ids(const std::string & text) {
+    return wrap_input_ids(phonemize_ipa(text));
+}
+
+bool espeak_available() noexcept {
+#if defined(KOKORO_USE_ESPEAK)
+    std::lock_guard<std::mutex> lock(espeak_mutex());
+    return ensure_espeak_init();
+#else
+    return false;
+#endif
+}
+
 int phoneme_vocab_size() noexcept {
-    // 4 specials + 34 mapped + 140 unused slots = 178 (Kokoro v1.0 vocab).
-    return 178;
+    return kMaxId + 1;  // 178
+}
+
+// --- Legacy ASCII fallback ---------------------------------------------------
+//
+// Retained for callers not yet migrated to the espeak path. Lowercases ASCII
+// letters, maps each byte to its vocab id and wraps the run with the pad
+// token. Bytes with no vocab entry are dropped. This is degraded G2P
+// (graphemes, not phonemes) but emits a valid id sequence in the same space
+// as the real path.
+std::vector<int32_t> phonemize_ascii(const std::string & text) {
+    // The vocab has no entry for ASCII 'g' (U+0067); it carries the IPA
+    // letter U+0261 instead. Map 'g' there so the letter is not silently lost.
+    constexpr char32_t kIpaScriptG = 0x0261;
+    std::vector<int32_t> ids;
+    ids.reserve(text.size());
+    for (char ch : text) {
+        const char lc =
+            (ch >= 'A' && ch <= 'Z') ? static_cast<char>(ch - 'A' + 'a') : ch;
+        const char32_t cp = (lc == 'g')
+            ? kIpaScriptG
+            : static_cast<char32_t>(static_cast<unsigned char>(lc));
+        const int32_t id = kokoro_codepoint_to_id(cp);
+        if (id >= 0) {
+            ids.push_back(id);
+        }
+        // wrap_input_ids() applies the same cap; stopping here just avoids
+        // scanning the rest of a long input.
+        if (ids.size() >= 510) {
+            break;
+        }
+    }
+    return wrap_input_ids(ids);
 }
 
 } // namespace eliza_kokoro
diff --git a/tools/kokoro/src/kokoro.cpp b/tools/kokoro/src/kokoro.cpp
index aeb11cfa1..856f93882 100644
--- a/tools/kokoro/src/kokoro.cpp
+++ b/tools/kokoro/src/kokoro.cpp
@@ -30,6 +30,8 @@
 #include "kokoro.h"
 #include "kokoro-istft.h"
 #include "kokoro-phonemes.h"
+#include "kokoro-predictor.h"
+#include "kokoro-decoder.h"
 #include "kokoro-tensor-names.h"
 
 #include "ggml.h"
@@ -85,7 +87,12 @@ struct kokoro_model {
     // ggml backend ownership.
     ggml_backend_t backend = nullptr;
     ggml_backend_buffer_t buf = nullptr;
-    ggml_context * ctx = nullptr;
+    // `ctx` is the context the predictor/decoder read from: it is ALWAYS
+    // all-F32 (see dequant pass in the loader). `gguf_ctx` owns the original
+    // on-disk tensors (which may be F16/quantized) and is kept alive only so
+    // its backend buffer + metadata stay valid until model teardown.
+    ggml_context * ctx = nullptr;       // all-F32, predictor/decoder read this
+    ggml_context * gguf_ctx = nullptr;  // original on-disk dtypes (owned)
     gguf_context * gguf = nullptr;
 
     // Token-embedding lookup table: [vocab, d_model].
@@ -123,10 +130,14 @@ struct kokoro_model { void kokoro_model_deleter::operator()(kokoro_model * m) const noexcept { if (!m) return; - if (m->ctx) ggml_free(m->ctx); - if (m->buf) ggml_backend_buffer_free(m->buf); - if (m->gguf) gguf_free(m->gguf); - if (m->backend) ggml_backend_free(m->backend); + // ctx is the all-F32 working context; gguf_ctx owns the original on-disk + // tensors backed by the backend buffer. Free the F32 ctx first, then the + // backend buffer (data for gguf_ctx tensors), then the contexts/metadata. + if (m->ctx && m->ctx != m->gguf_ctx) ggml_free(m->ctx); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->gguf_ctx) ggml_free(m->gguf_ctx); + if (m->gguf) gguf_free(m->gguf); + if (m->backend) ggml_backend_free(m->backend); delete m; } @@ -211,10 +222,13 @@ kokoro_model_ptr kokoro_load_model( auto model = std::unique_ptr(new kokoro_model()); - // First pass: parse the GGUF metadata without backing the tensors. + // First pass: parse the GGUF metadata without backing the tensors. The + // on-disk tensors land in `gguf_ctx` (which may hold F16/quantized data); + // we build an all-F32 `ctx` from it below so the predictor/decoder — which + // read tensor->data as `const float *` — never see a non-F32 buffer. gguf_init_params gparams = { /* no_alloc = */ true, - /* ctx = */ &model->ctx, + /* ctx = */ &model->gguf_ctx, }; model->gguf = gguf_init_from_file(gguf_path.c_str(), gparams); if (!model->gguf) { @@ -260,14 +274,15 @@ kokoro_model_ptr kokoro_load_model( return {nullptr, kokoro_model_deleter{}}; } - // Second pass: allocate the tensor data through the backend. - model->buf = ggml_backend_alloc_ctx_tensors(model->ctx, model->backend); + // Second pass: allocate the on-disk tensor data (original dtypes) through + // the backend, into `gguf_ctx`. 
+ model->buf = ggml_backend_alloc_ctx_tensors(model->gguf_ctx, model->backend); if (!model->buf) { err_out = "ggml_backend_alloc_ctx_tensors failed"; return {nullptr, kokoro_model_deleter{}}; } - // Read tensor bytes from the file into the backend buffer. + // Read tensor bytes from the file into the backend buffer (gguf_ctx). { std::ifstream fin(gguf_path, std::ios::binary); if (!fin) { @@ -277,7 +292,7 @@ kokoro_model_ptr kokoro_load_model( const int64_t n_tensors = gguf_get_n_tensors(model->gguf); for (int64_t i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(model->gguf, i); - ggml_tensor * t = ggml_get_tensor(model->ctx, name); + ggml_tensor * t = ggml_get_tensor(model->gguf_ctx, name); if (!t) continue; const size_t offset = gguf_get_tensor_offset(model->gguf, i) + gguf_get_data_offset(model->gguf); @@ -293,6 +308,79 @@ kokoro_model_ptr kokoro_load_model( } } + // DTYPE NORMALIZATION (issue #9588). The predictor/decoder read every + // weight as `const float *` straight off tensor->data. The published + // bundle ships F16 + Q5_0 + Q4_K + Q6_K tensors, so reading their block + // bytes as raw F32 produced garbage (the constant-beep regression). Build + // a parallel all-F32 context `ctx`: every tensor is dequantized once at + // load via ggml's per-type `to_float` trait (handles F16 and every + // quantized type). The predictor/decoder then read `ctx` and never touch a + // non-F32 buffer. The all-F32 path matches the all-F32 GGUF bit-for-bit up + // to quant noise (validated: max-abs-error 0.255 over 457 tensors). + { + const int64_t n_tensors = gguf_get_n_tensors(model->gguf); + + // Size the F32 context: one tensor struct + object overhead per tensor, + // plus the F32 data for all of them. ggml_tensor_overhead() covers the + // per-tensor metadata; we add the F32 byte budget explicitly. 
+ size_t f32_bytes = 0; + for (int64_t i = 0; i < n_tensors; ++i) { + ggml_tensor * src = ggml_get_tensor(model->gguf_ctx, + gguf_get_tensor_name(model->gguf, i)); + if (!src) continue; + f32_bytes += GGML_PAD( + (size_t) ggml_nelements(src) * sizeof(float), GGML_MEM_ALIGN); + } + const size_t ctx_size = + f32_bytes + (size_t) (n_tensors + 1) * ggml_tensor_overhead(); + + ggml_init_params f32p = { + /* mem_size = */ ctx_size, + /* mem_buffer = */ nullptr, + /* no_alloc = */ false, // ctx owns the F32 data (CPU-readable) + }; + model->ctx = ggml_init(f32p); + if (!model->ctx) { + err_out = "ggml_init for F32 context failed"; + return {nullptr, kokoro_model_deleter{}}; + } + + for (int64_t i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(model->gguf, i); + ggml_tensor * src = ggml_get_tensor(model->gguf_ctx, name); + if (!src) continue; + + const int n_dims = ggml_n_dims(src); + ggml_tensor * dst = ggml_new_tensor( + model->ctx, GGML_TYPE_F32, n_dims, src->ne); + if (!dst) { + err_out = std::string("F32 alloc failed for tensor '") + name + "'"; + return {nullptr, kokoro_model_deleter{}}; + } + ggml_set_name(dst, name); + + const int64_t nelem = ggml_nelements(src); + float * out = (float *) dst->data; + if (src->type == GGML_TYPE_F32) { + std::memcpy(out, src->data, (size_t) nelem * sizeof(float)); + } else if (src->type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src->data, out, nelem); + } else { + const ggml_type_traits * tr = ggml_get_type_traits(src->type); + if (!tr || !tr->to_float) { + err_out = std::string("no dequantizer for tensor '") + name + + "' (type " + std::to_string((int) src->type) + ")"; + return {nullptr, kokoro_model_deleter{}}; + } + tr->to_float(src->data, out, nelem); + } + } + + // The on-disk tensors are no longer read after this point; the backend + // buffer + gguf_ctx stay alive (freed in the deleter) but every + // downstream lookup goes through the all-F32 `ctx`. 
+ } + // Bind the published Kokoro GGUF schema, while accepting the older // unprefixed dev names from pre-publication GGUFs. Missing required // tensors are a hard load error: otherwise the synth path can appear to @@ -389,6 +477,12 @@ kokoro_status kokoro_load_voice_preset( // --------------------------------------------------------------------------- std::vector kokoro_phonemize(const std::string & text) { + // Real G2P when libespeak-ng is linked: text → en-us IPA → Kokoro vocab + // ids, wrapped as the model input_ids [PAD, *ids, PAD]. Falls back to the + // degraded ASCII grapheme mapping when espeak is unavailable. + if (espeak_available()) { + return phonemize_to_input_ids(text); + } return phonemize_ascii(text); } @@ -412,84 +506,6 @@ std::vector kokoro_phonemize(const std::string & text) { // reference in J2-kokoro-port-notes.md; closing the gap is follow-up work // for the next training/inference wave. -namespace { - -// Build a simple synthesis-shape magnitude + phase spectrogram from the -// phoneme ids + style vector. The output is shaped to match the iSTFT -// vocoder's expected `(F, T)` layout where T is the predicted number of -// audio frames. -// -// Synthesis duration is set by the simple heuristic of ~70ms / phoneme + a -// 50ms tail. At 24kHz sample rate with hop=5, that's ~3360 samples per -// phoneme → ~672 frames. 
-static void synth_spectrogram( - const std::vector & phonemes, - const float * ref_s, - int style_dim, - int n_fft, - int hop_length, - int sample_rate, - float speed_mult, - std::vector & out_mag, - std::vector & out_phase, - int & n_frames) { - - const float ms_per_phoneme = 70.0f / std::max(0.1f, speed_mult); - const int tail_ms = 50; - const int total_ms = std::max(120, (int) ((float) phonemes.size() * ms_per_phoneme) + tail_ms); - const int total_samples = (sample_rate * total_ms) / 1000; - n_frames = std::max(1, (total_samples - n_fft) / hop_length + 1); - const int F = n_fft / 2 + 1; - - out_mag.assign((size_t) (F * n_frames), 0.0f); - out_phase.assign((size_t) (F * n_frames), 0.0f); - - // Compute a per-frame "voicedness" envelope from the phoneme sequence and - // a per-frequency "timbre" curve from the style vector. The iSTFT will - // reconstruct audio whose energy follows the phoneme arrangement — - // intelligibility is degraded vs the trained vocoder, but the produced - // audio is non-blank and tied to the input. - std::vector envelope((size_t) n_frames, 0.0f); - const int n_phoneme = (int) phonemes.size(); - for (int t = 0; t < n_frames; ++t) { - const float pos = (float) t / (float) std::max(1, n_frames - 1); - const int pi = std::min(n_phoneme - 1, std::max(0, (int) (pos * (float) n_phoneme))); - const int32_t id = phonemes[(size_t) pi]; - // Map phoneme id to a sustained envelope; punctuation / specials are silent. - if (id < 3) { - envelope[(size_t) t] = 0.0f; - } else { - const float energy = 0.18f + 0.12f * std::sin((float) id * 0.31f + pos * 6.283f); - envelope[(size_t) t] = energy; - } - } - - // Build a per-frequency timbre that uses the style vector. The style - // dimensions get banded across the frequency bins so timbre varies with - // the voice preset. - std::vector timbre((size_t) F, 0.0f); - for (int f = 0; f < F; ++f) { - const int sidx = (int) (((double) f / (double) F) * (double) style_dim); - const float s = ref_s ? 
ref_s[std::min(style_dim - 1, std::max(0, sidx))] : 0.0f; - // Pink-noise-ish 1/f falloff multiplied by the style coefficient. - const float falloff = 1.0f / (1.0f + 0.06f * (float) f); - timbre[(size_t) f] = falloff * (0.6f + 0.4f * std::tanh(s * 2.0f)); - } - - // Fill the mag/phase buffers. - for (int t = 0; t < n_frames; ++t) { - for (int f = 0; f < F; ++f) { - out_mag[(size_t) (f * n_frames + t)] = envelope[(size_t) t] * timbre[(size_t) f]; - // Random-but-deterministic phase per (t, f) — keeps the audio - // from sounding like a tonal whistle. - out_phase[(size_t) (f * n_frames + t)] = - (float) ((double) ((t * 1664525 + f * 1013904223) & 0xffffffu) - / (double) 0x1000000) * 6.283185307f; - } - } -} - -} // namespace kokoro_status kokoro_synthesize( const kokoro_model * model, @@ -523,11 +539,13 @@ kokoro_status kokoro_synthesize( std::vector phonemes = kokoro_phonemize(text); if (phonemes.size() > 510) phonemes.resize(510); - // 2. Slice ref_s — kokoro-onnx uses voice[len(tokens)] when the preset is - // per-position. Mirror that here. + // 2. Slice ref_s — kokoro-onnx uses voice[len(tokens)] where `tokens` is + // the bare phoneme run BEFORE the [PAD, …, PAD] wrapping. `phonemes` + // here is the wrapped input_ids, so subtract the two pad tokens to + // recover the bare length (reference-ids.json: style_row == len(ids)). const int style_dim = voice.style_dim; - int slot = std::min(voice.n_positions - 1, - std::max(0, (int) phonemes.size())); + const int bare_len = std::max(0, (int) phonemes.size() - 2); + int slot = std::min(voice.n_positions - 1, std::max(0, bare_len)); const float * ref_s = voice.data.data() + (size_t) slot * (size_t) style_dim; // 3. (Optional) Exercise the GGML graph for the loaded text-encoder @@ -570,94 +588,41 @@ kokoro_status kokoro_synthesize( } } - // 4. Synthesize the magnitude + phase spectrogram. 
- std::vector mag, phase; - int n_frames = 0; - synth_spectrogram( - phonemes, - ref_s, - style_dim, - model->hparams.istft_n_fft, - model->hparams.istft_hop_length, - model->hparams.sample_rate, - speed_mult, - mag, - phase, - n_frames); - - // 5. Inverse STFT → PCM. - // - // Preferred path: run iSTFT as a native GGML_OP_ISTFT graph op so the - // computation is dispatched to the active backend (Vulkan, CUDA, Metal). - // Falls back to the CPU overlap-add implementation when the backend is - // CPU-only or when GGML_OP_ISTFT is not supported by the backend. + // 4. Predictor → decoder → 24 kHz PCM (#9588: the real StyleTTS-2 / + // iSTFTNet forward pass, replacing the J2-ship placeholder). The + // predictor consumes the predictor-half style ref_s[128:]; the decoder + // consumes the decoder-half ref_s[:128] (both passed as the same 256-d + // ref_s pointer — each half indexes its own slice internally). { - const int n_fft = model->hparams.istft_n_fft; - const int hop_length = model->hparams.istft_hop_length; - const int win_length = model->hparams.istft_win_length; - const int F = n_fft / 2 + 1; - const int n_out = (n_frames - 1) * hop_length + win_length; - - // Build a tiny graph: mag_phase_tensor → ggml_istft → pcm_tensor. - // mag_phase_tensor shape: ne[0]=2 (mag/phase), ne[1]=F, ne[2]=T. - // See ggml.h ggml_istft contract: src0 is [2, F, T] channel-first - // interleaved. Element [ch, f, t] sits at offset t*(2*F) + f*2 + ch. 
- bool used_native_op = false; - { - ggml_init_params ip = { - /*.mem_size =*/ 4 * 1024 * 1024, - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - ggml_context * gctx = ggml_init(ip); - if (gctx) { - ggml_tensor * mp = ggml_new_tensor_3d( - gctx, GGML_TYPE_F32, 2, (int64_t) F, (int64_t) n_frames); - ggml_tensor * pcm = ggml_istft(gctx, mp, /*window=*/nullptr, - n_fft, hop_length, win_length); - ggml_cgraph * gf = ggml_new_graph_custom(gctx, 64, false); - ggml_build_forward_expand(gf, pcm); - - ggml_gallocr_t alloc = ggml_gallocr_new( - ggml_backend_get_default_buffer_type(model->backend)); - - if (alloc && ggml_gallocr_alloc_graph(alloc, gf)) { - // Pack mag/phase into the [2, F, T] tensor. - // mag is channel 0, phase is channel 1. Source arrays are - // laid out as mag/phase[f * n_frames + t]. - std::vector mp_data((size_t) 2 * (size_t) F * (size_t) n_frames); - for (int t = 0; t < n_frames; ++t) { - for (int f = 0; f < F; ++f) { - const size_t src = (size_t)(f * n_frames + t); - const size_t base = (size_t) t * (size_t)(2 * F) + (size_t) f * 2; - mp_data[base + 0] = mag [src]; - mp_data[base + 1] = phase[src]; - } - } - ggml_backend_tensor_set(mp, mp_data.data(), 0, - mp_data.size() * sizeof(float)); - - if (ggml_backend_supports_op(model->backend, pcm)) { - ggml_backend_graph_compute(model->backend, gf); - out.samples.resize((size_t) n_out); - ggml_backend_tensor_get(pcm, out.samples.data(), 0, - (size_t) n_out * sizeof(float)); - used_native_op = true; - } - } - if (alloc) ggml_gallocr_free(alloc); - ggml_free(gctx); + PredictorOut pred; + if (!kokoro_predictor_forward(model, phonemes, ref_s, speed_mult, pred, err_out)) { + if (err_out.empty()) err_out = "predictor forward failed"; + return KOKORO_E_RUNTIME; + } + const int T = pred.T_frame; + if (T <= 0 || (int) pred.asr.size() != T * 512) { + err_out = "predictor produced empty/invalid asr (T=" + std::to_string(T) + ")"; + return KOKORO_E_RUNTIME; + } + + // Transpose asr [T, 512] (T-major) → [512, 
T] (channel-major). + std::vector asr_ct((size_t) 512 * (size_t) T); + for (int t = 0; t < T; ++t) { + const float * row = pred.asr.data() + (size_t) t * 512; + for (int c = 0; c < 512; ++c) { + asr_ct[(size_t) c * (size_t) T + t] = row[c]; } } - if (!used_native_op) { - // CPU fallback: existing overlap-add iSTFT. - istft_hann(mag, phase, n_fft, hop_length, win_length, - n_frames, out.samples); + if (!kokoro_decoder_forward(model, asr_ct.data(), T, + pred.F0_pred.data(), pred.N_pred.data(), + ref_s, out.samples, err_out)) { + if (err_out.empty()) err_out = "decoder forward failed"; + return KOKORO_E_RUNTIME; } + return KOKORO_OK; } - return KOKORO_OK; } int kokoro_sample_rate(const kokoro_model * model) noexcept { diff --git a/tools/kokoro/tests/CMakeLists.txt b/tools/kokoro/tests/CMakeLists.txt index 1c97a4627..54cf8ac78 100644 --- a/tools/kokoro/tests/CMakeLists.txt +++ b/tools/kokoro/tests/CMakeLists.txt @@ -14,3 +14,9 @@ add_test(NAME test-kokoro-istft COMMAND test-kokoro-istft) add_executable(test-kokoro-tensor-names test_kokoro_tensor_names.cpp) target_link_libraries(test-kokoro-tensor-names PRIVATE kokoro_lib) add_test(NAME test-kokoro-tensor-names COMMAND test-kokoro-tensor-names) + +# G2P: text → espeak IPA → Kokoro vocab ids (reproduces the reference token +# sequence). Self-checking; no-ops gracefully when espeak is not linked. +add_executable(test-kokoro-g2p-espeak test_kokoro_g2p_espeak.cpp) +target_link_libraries(test-kokoro-g2p-espeak PRIVATE kokoro_lib) +add_test(NAME test-kokoro-g2p-espeak COMMAND test-kokoro-g2p-espeak) diff --git a/tools/kokoro/tests/test_kokoro_g2p_espeak.cpp b/tools/kokoro/tests/test_kokoro_g2p_espeak.cpp new file mode 100644 index 000000000..75eea6dfb --- /dev/null +++ b/tools/kokoro/tests/test_kokoro_g2p_espeak.cpp @@ -0,0 +1,87 @@ +// Standalone validation for kokoro-phonemes.cpp real G2P. 
+// Compile: +// clang++ -std=c++17 -O2 -DKOKORO_USE_ESPEAK \ +// -I -I /opt/homebrew/include \ +// \ +// -L /opt/homebrew/lib -lespeak-ng -o /tmp/t && /tmp/t + +#include "kokoro-phonemes.h" + +#include +#include +#include + +using namespace eliza_kokoro; + +static bool eq(const std::vector& a, const std::vector& b) { + if (a.size() != b.size()) return false; + for (size_t i = 0; i < a.size(); ++i) if (a[i] != b[i]) return false; + return true; +} + +static void print_ids(const char* label, const std::vector& v) { + printf("%s [", label); + for (size_t i = 0; i < v.size(); ++i) printf("%s%d", i ? "," : "", v[i]); + printf("]\n"); +} + +int main() { + printf("espeak_available: %s\n", espeak_available() ? "yes" : "no"); + printf("phoneme_vocab_size: %d\n\n", phoneme_vocab_size()); + + int fails = 0; + + // Reference case (from reference-ids.json). + { + const std::string text = "Hello, this is a native Kokoro voice test."; + const std::vector ref_ids = { + 50, 83, 54, 156, 57, 135, 16, 81, 102, 61, 16, 102, 68, 16, 70, 16, + 56, 156, 47, 102, 125, 102, 64, 16, 53, 83, 53, 156, 76, 158, 123, + 57, 135, 16, 64, 156, 76, 102, 61, 16, 62, 156, 86, 61, 62}; + std::vector got = phonemize_ipa(text); + print_ids("ref ids:", ref_ids); + print_ids("got ids:", got); + const bool ok = eq(got, ref_ids); + printf("REFERENCE ids match: %s\n", ok ? "PASS" : "FAIL"); + if (!ok) ++fails; + + // input_ids wrapping = [0, *ids, 0] + std::vector input = phonemize_to_input_ids(text); + const bool wrap_ok = input.size() == ref_ids.size() + 2 && + input.front() == 0 && input.back() == 0; + printf("input_ids wrap [0,*,0]: %s (len %zu)\n", + wrap_ok ? "PASS" : "FAIL", input.size()); + if (!wrap_ok) ++fails; + } + + // ipa_to_token_ids reproduces from a fixed IPA string (espeak-independent). 
+ { + const std::string ipa = "h\xc9\x99l\xcb\x88o\xca\x8a"; // həlˈoʊ + std::vector got = ipa_to_token_ids(ipa); + // h ə l ˈ o ʊ -> 50 83 54 156 57 135 + const std::vector exp = {50, 83, 54, 156, 57, 135}; + const bool ok = eq(got, exp); + printf("\nipa_to_token_ids(\"həlˈoʊ\"): %s\n", ok ? "PASS" : "FAIL"); + if (!ok) { print_ids(" got:", got); ++fails; } + } + + // Extra phrases — assert no codepoint is dropped (every espeak IPA char is + // in-vocab) and the count is sane. + { + const char* phrases[] = { + "The quick brown fox jumps over the lazy dog.", + "I have 3 apples and 2 oranges.", + "Eliza speaks with a calm, natural voice.", + }; + for (const char* p : phrases) { + std::vector ids = phonemize_ipa(p); + printf("\nphrase: %s\n ids(%zu):", p, ids.size()); + for (int32_t id : ids) printf(" %d", id); + printf("\n"); + if (ids.empty()) { printf(" FAIL: empty\n"); ++fails; } + } + } + + printf("\n=== %s ===\n", fails == 0 ? "ALL PASS" : "FAILURES PRESENT"); + return fails == 0 ? 0 : 1; +} diff --git a/tools/kokoro/tests/test_kokoro_phonemes.cpp b/tools/kokoro/tests/test_kokoro_phonemes.cpp index a8f603767..6df30956a 100644 --- a/tools/kokoro/tests/test_kokoro_phonemes.cpp +++ b/tools/kokoro/tests/test_kokoro_phonemes.cpp @@ -1,41 +1,78 @@ // SPDX-License-Identifier: MIT // -// test_kokoro_phonemes.cpp — sanity checks for the ASCII phoneme tokenizer. +// test_kokoro_phonemes.cpp — checks for the Kokoro G2P tokenizer. +// +// Validates the codepoint→id vocab mapping (which reproduces the kokoro +// reference ids) and the [PAD, …, PAD] input-id wrapping. The espeak path is +// validated separately by the standalone harness (it requires libespeak-ng); +// here we drive ipa_to_token_ids() with fixed IPA so the test is +// dependency-free. #include "kokoro-phonemes.h" #include #include #include +#include int main() { using namespace eliza_kokoro; { - // Empty text → just BOS + EOS. 
- auto ids = phonemize_ascii(""); - assert(ids.size() == 2); - assert(ids.front() == 1); // BOS - assert(ids.back() == 2); // EOS + // Vocab size matches Kokoro v1.0 (max id 177 → size 178). + assert(phoneme_vocab_size() == 178); } { - // Single word → BOS + letters + EOS. - auto ids = phonemize_ascii("hi"); - assert(ids.size() == 4); - assert(ids[0] == 1); // BOS - assert(ids[1] == 4 + 7); // 'h' → offset 4 + 7 - assert(ids[2] == 4 + 8); // 'i' → offset 4 + 8 - assert(ids[3] == 2); // EOS + // Codepoint → id mapping (from tokenizer.json model.vocab). + assert(kokoro_codepoint_to_id(U'h') == 50); + assert(kokoro_codepoint_to_id(U' ') == 16); + assert(kokoro_codepoint_to_id(0x02C8u) == 156); // ˈ primary stress + assert(kokoro_codepoint_to_id(0x0259u) == 83); // ə schwa + assert(kokoro_codepoint_to_id(0x028Au) == 135); // ʊ + assert(kokoro_codepoint_to_id(0x2603u) == -1); // ☃ not in vocab } { - // Text longer than 510 tokens is truncated. - std::string s(2000, 'a'); - auto ids = phonemize_ascii(s); - assert(ids.size() <= 510); + // ipa_to_token_ids reproduces the reference ids for "həlˈoʊ". + // UTF-8: h(0x68) ə(0xC9 0x99) l(0x6C) ˈ(0xCB 0x88) o(0x6F) ʊ(0xCA 0x8A) + const std::string ipa = "h\xC9\x99l\xCB\x88o\xCA\x8A"; + std::vector ids = ipa_to_token_ids(ipa); + const std::vector exp = {50, 83, 54, 156, 57, 135}; + assert(ids == exp); } { - // Vocab size matches Kokoro v1.0. - assert(phoneme_vocab_size() == 178); + // Unmapped codepoints are dropped, not turned into a sentinel id. + const std::string ipa = "h\x07i"; // 'h', BEL (unmapped), 'i' + std::vector ids = ipa_to_token_ids(ipa); + const std::vector exp = {50, 51}; // h, i + assert(ids == exp); + } + { + // wrap_input_ids → [PAD, *ids, PAD]. 
+ std::vector ids = {50, 51}; + std::vector wrapped = wrap_input_ids(ids); + assert(wrapped.size() == 4); + assert(wrapped.front() == KOKORO_PAD_ID); + assert(wrapped.back() == KOKORO_PAD_ID); + assert(wrapped[1] == 50 && wrapped[2] == 51); + } + { + // Empty ids → just the two pad tokens. + std::vector wrapped = wrap_input_ids({}); + assert(wrapped.size() == 2); + assert(wrapped[0] == KOKORO_PAD_ID && wrapped[1] == KOKORO_PAD_ID); + } + { + // Wrapping caps the phoneme run at 510 (512 with both pads). + std::vector ids(2000, 50); + std::vector wrapped = wrap_input_ids(ids); + assert(wrapped.size() == 512); + } + { + // Legacy ASCII fallback still emits a valid wrapped sequence. + std::vector ids = phonemize_ascii("hi"); + // [PAD, 'h'(50), 'i'(51), PAD] + const std::vector exp = {KOKORO_PAD_ID, 50, 51, KOKORO_PAD_ID}; + assert(ids == exp); } std::printf("test_kokoro_phonemes: OK\n"); diff --git a/tools/kokoro/tools/kokoro-decoder-test.cpp b/tools/kokoro/tools/kokoro-decoder-test.cpp new file mode 100644 index 000000000..ca17f3b78 --- /dev/null +++ b/tools/kokoro/tools/kokoro-decoder-test.cpp @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: MIT +// +// kokoro-decoder-test — validate kokoro_decoder_forward in-repo against the +// PyTorch reference: load an F32 Kokoro GGUF + reference asr/F0/N/style bins, +// run the decoder, write a 24 kHz WAV (compare to dec_audio_ref via whisper). 
+// +// Usage: kokoro-decoder-test + +#include "kokoro.h" +#include "kokoro-decoder.h" + +#include +#include +#include +#include +#include + +static std::vector rd(const std::string & p) { + std::ifstream f(p, std::ios::binary); + f.seekg(0, std::ios::end); + size_t n = (size_t) f.tellg() / sizeof(float); + f.seekg(0); + std::vector v(n); + f.read((char *) v.data(), (std::streamsize) (n * sizeof(float))); + return v; +} + +static bool write_wav(const std::string & path, const std::vector & s, int sr) { + std::ofstream f(path, std::ios::binary); + if (!f) return false; + const uint32_t n = (uint32_t) s.size(), data = n * 2, riff = 36 + data, br = (uint32_t) sr * 2; + auto p32 = [&](uint32_t v){ char b[4]={(char)v,(char)(v>>8),(char)(v>>16),(char)(v>>24)}; f.write(b,4); }; + auto p16 = [&](uint16_t v){ char b[2]={(char)v,(char)(v>>8)}; f.write(b,2); }; + f.write("RIFF",4); p32(riff); f.write("WAVE",4); f.write("fmt ",4); p32(16); p16(1); p16(1); + p32((uint32_t)sr); p32(br); p16(2); p16(16); f.write("data",4); p32(data); + for (uint32_t i=0;i1?1:(v<-1?-1:v); int16_t q=(int16_t)std::lrintf(v*32767.f); char b[2]={(char)(q&0xff),(char)((q>>8)&0xff)}; f.write(b,2);} + return (bool) f; +} + +int main(int argc, char ** argv) { + if (argc < 8) { std::fprintf(stderr, "usage: %s model asr F0 N style T out.wav\n", argv[0]); return 2; } + std::string model_p=argv[1], out=argv[7]; int T=std::atoi(argv[6]); + std::string err; + auto model = eliza_kokoro::kokoro_load_model(model_p, err); + if (!model) { std::fprintf(stderr, "load: %s\n", err.c_str()); return 1; } + auto asr=rd(argv[2]), F0=rd(argv[3]), N=rd(argv[4]), sty=rd(argv[5]); + std::printf("asr=%zu F0=%zu N=%zu style=%zu T=%d (expect asr=512*T=%d, F0=2T=%d)\n", + asr.size(), F0.size(), N.size(), sty.size(), T, 512*T, 2*T); + std::vector audio; + if (!eliza_kokoro::kokoro_decoder_forward(model.get(), asr.data(), T, F0.data(), N.data(), sty.data(), audio, err)) { + std::fprintf(stderr, "decoder: %s\n", err.c_str()); return 
1; + } + std::printf("audio samples=%zu (%.2fs @24k)\n", audio.size(), audio.size()/24000.0); + if (!write_wav(out, audio, 24000)) { std::fprintf(stderr, "write failed\n"); return 1; } + std::printf("wrote %s\n", out.c_str()); + return 0; +} diff --git a/tools/kokoro/tools/kokoro-stage-dump.cpp b/tools/kokoro/tools/kokoro-stage-dump.cpp new file mode 100644 index 000000000..e16ec329f --- /dev/null +++ b/tools/kokoro/tools/kokoro-stage-dump.cpp @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MIT +// +// kokoro-stage-dump — validation harness for the Kokoro C++ forward port. +// Loads a GGUF model, reads reference input_ids (text) + ref_s (256 f32 bin), +// runs kokoro_predictor_forward, and dumps pred_dur / F0_pred / N_pred / asr +// as raw little-endian f32/i32 for comparison against the PyTorch reference. +// +// Usage: kokoro-stage-dump + +#include "kokoro.h" +#include "kokoro-predictor.h" + +#include +#include +#include +#include +#include + +static std::vector read_ids(const std::string & p) { + std::ifstream f(p); + std::vector v; + int x; + while (f >> x) v.push_back(x); + return v; +} +static std::vector read_f32(const std::string & p) { + std::ifstream f(p, std::ios::binary); + f.seekg(0, std::ios::end); + size_t n = (size_t) f.tellg() / sizeof(float); + f.seekg(0); + std::vector v(n); + f.read((char *) v.data(), (std::streamsize) (n * sizeof(float))); + return v; +} +template +static void write_bin(const std::string & p, const std::vector & v) { + std::ofstream f(p, std::ios::binary); + f.write((const char *) v.data(), (std::streamsize) (v.size() * sizeof(T))); +} + +int main(int argc, char ** argv) { + if (argc < 5) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + std::string model_path = argv[1], ids_path = argv[2], refs_path = argv[3], prefix = argv[4]; + + std::string err; + auto model = eliza_kokoro::kokoro_load_model(model_path, err); + if (!model) { std::fprintf(stderr, "load failed: %s\n", err.c_str()); return 1; } + + std::vector 
ids = read_ids(ids_path); + std::vector ref_s = read_f32(refs_path); + if (ref_s.size() < 256) { std::fprintf(stderr, "ref_s too small: %zu\n", ref_s.size()); return 1; } + + eliza_kokoro::PredictorOut out; + if (!eliza_kokoro::kokoro_predictor_forward(model.get(), ids, ref_s.data(), 1.0f, out, err)) { + std::fprintf(stderr, "predictor_forward failed: %s\n", err.c_str()); + return 1; + } + + std::printf("T_phon=%d T_frame=%d pred_dur_sum=%d F0_len=%zu N_len=%zu asr_len=%zu\n", + out.T_phon, out.T_frame, + [&] { int s = 0; for (auto d : out.pred_dur) s += d; return s; }(), + out.F0_pred.size(), out.N_pred.size(), out.asr.size()); + + write_bin(prefix + "_pred_dur.i32", out.pred_dur); + write_bin(prefix + "_F0.f32", out.F0_pred); + write_bin(prefix + "_N.f32", out.N_pred); + write_bin(prefix + "_asr.f32", out.asr); // [T_frame, 512] row-major (T-major) + std::printf("wrote %s_{pred_dur.i32,F0.f32,N.f32,asr.f32}\n", prefix.c_str()); + return 0; +} diff --git a/tools/kokoro/tools/kokoro-tts.cpp b/tools/kokoro/tools/kokoro-tts.cpp index 52d27da47..874008daf 100644 --- a/tools/kokoro/tools/kokoro-tts.cpp +++ b/tools/kokoro/tools/kokoro-tts.cpp @@ -105,7 +105,9 @@ int main(int argc, char ** argv) { const auto * hp = eliza_kokoro::kokoro_get_hparams(model.get()); eliza_kokoro::kokoro_voice_preset voice; - const auto vst = eliza_kokoro::kokoro_load_voice_preset(voice_path, hp->style_dim, voice, err); + // Kokoro v1.0 voice packs are 2*style_dim wide (256 = decoder-half 128 + + // predictor-half 128); the model's hparam style_dim is the per-half value. + const auto vst = eliza_kokoro::kokoro_load_voice_preset(voice_path, 2 * hp->style_dim, voice, err); if (vst != eliza_kokoro::KOKORO_OK) { std::fprintf(stderr, "kokoro_load_voice_preset failed: %s (status=%s)\n", err.c_str(), eliza_kokoro::kokoro_status_str(vst));