elizaOS · lalalune · Jun 25, 2026 · Jun 25, 2026
diff --git a/tools/kokoro/CMakeLists.txt b/tools/kokoro/CMakeLists.txt
@@ -25,7 +25,9 @@ set(KOKORO_CORE_SOURCES
     src/kokoro.cpp
     src/kokoro-istft.cpp
     src/kokoro-phonemes.cpp
-    src/kokoro-predictor.cpp)
+    src/kokoro-predictor.cpp
+    src/kokoro-generator.cpp
+    src/kokoro-decoder.cpp)
 
 add_library(kokoro_lib STATIC
     ${KOKORO_CORE_SOURCES}
@@ -56,17 +58,46 @@ endif()
 
 target_compile_features(kokoro_lib PUBLIC cxx_std_17)
 
-# Standalone CLI harness — required by J2 verification (tools/voice-kokoro/).
+# Real G2P via libespeak-ng. When present, kokoro_phonemize() drives
+# espeak_TextToPhonemes() (en-us IPA) and maps codepoints to Kokoro vocab ids,
+# reproducing the reference token sequence. When absent, the build falls back
+# to the degraded ASCII grapheme mapping and the TS layer must supply IPA.
+# Override with -DKOKORO_ESPEAK_ROOT=<prefix> (HINTS); fixed prefixes are fallback PATHS.
+option(KOKORO_ENABLE_ESPEAK "Link libespeak-ng for real Kokoro G2P" ON)
+if(KOKORO_ENABLE_ESPEAK)
+    find_path(ESPEAK_NG_INCLUDE_DIR espeak-ng/speak_lib.h
+        HINTS ${KOKORO_ESPEAK_ROOT} PATHS /opt/homebrew /usr/local /usr
+        PATH_SUFFIXES include)
+    find_library(ESPEAK_NG_LIBRARY NAMES espeak-ng
+        HINTS ${KOKORO_ESPEAK_ROOT} PATHS /opt/homebrew /usr/local /usr
+        PATH_SUFFIXES lib lib64)
+    if(ESPEAK_NG_INCLUDE_DIR AND ESPEAK_NG_LIBRARY)
+        target_include_directories(kokoro_lib PRIVATE "${ESPEAK_NG_INCLUDE_DIR}")
+        target_link_libraries(kokoro_lib PRIVATE "${ESPEAK_NG_LIBRARY}")
+        target_compile_definitions(kokoro_lib PRIVATE KOKORO_USE_ESPEAK)
+        message(STATUS "Kokoro G2P: libespeak-ng found (${ESPEAK_NG_LIBRARY}) — real IPA path enabled")
+    else()
+        message(STATUS "Kokoro G2P: libespeak-ng not found — falling back to ASCII grapheme mapping (TS layer must supply IPA)")
+    endif()
+endif()
+
+# Standalone CLI harnesses (required by J2 verification + Kokoro decoder dev).
+# Force MACOSX_BUNDLE OFF: CMake defaults Apple executables to bundles, and
+# install(TARGETS ... RUNTIME) on a bundle target fails configure with
+# "no BUNDLE DESTINATION" on every ios/tvos/visionos/macos build.
 add_executable(kokoro-tts tools/kokoro-tts.cpp)
 target_link_libraries(kokoro-tts PRIVATE kokoro_lib)
-# kokoro-tts is a CLI harness, not a GUI app. On Apple platforms CMake defaults
-# executables to MACOSX_BUNDLE, and install(TARGETS ... RUNTIME) on a bundle
-# target fails configure with "install TARGETS given no BUNDLE DESTINATION for
-# MACOSX_BUNDLE executable" on every ios/tvos/visionos/macos build. Force the
-# bundle flag off so the plain RUNTIME install is valid on all platforms.
 set_target_properties(kokoro-tts PROPERTIES MACOSX_BUNDLE OFF)
 install(TARGETS kokoro-tts RUNTIME)
 
+add_executable(kokoro-stage-dump tools/kokoro-stage-dump.cpp)
+add_executable(kokoro-decoder-test tools/kokoro-decoder-test.cpp)
+target_link_libraries(kokoro-stage-dump PRIVATE kokoro_lib)
+target_link_libraries(kokoro-decoder-test PRIVATE kokoro_lib)
+# Both harnesses are forced to plain (non-bundle) executables in one call.
+set_target_properties(kokoro-stage-dump kokoro-decoder-test
+    PROPERTIES MACOSX_BUNDLE OFF)
+
 # Server-mount handler: compiled into kokoro_lib only when the server target
 # exists. The handler is guarded by `#ifdef LLAMA_BUILD_KOKORO` and pulls in
 # the same `server-http.h` interface that the omnivoice handler uses, plus

diff --git a/tools/kokoro/convert_kokoro_pth_to_gguf.py b/tools/kokoro/convert_kokoro_pth_to_gguf.py
@@ -97,9 +97,15 @@
 def _add_tensor(writer: gguf.GGUFWriter, name: str, data: np.ndarray) -> None:
     """Add tensors with the dtype layout the Kokoro forward pass expects.
 
-    Weight matrices and convolution kernels (ndim >= 2) are emitted as F16;
-    biases, norms, and other vectors stay F32. All-F32 GGUFs can load but
-    synthesize noise in the fused runtime path.
+    Weight matrices and convolution kernels (ndim >= 2) are emitted as F16
+    purely to halve the GGUF download size; biases, norms, and other vectors
+    stay F32. The GGUF dtype does not affect correctness: the loader
+    dequantizes every tensor to F32 at load time, so an all-F32 and an
+    F16-weights GGUF produce identical synthesis. (An earlier note here
+    claimed all-F32 GGUFs synthesized noise — that was a misdiagnosis: the
+    fused path was a stub that ignored the weights, and the real defect was
+    the loader reading non-F32 tensors as raw F32. Both are fixed; F16 is
+    kept only for bundle size.)
     """
     if data.dtype not in (np.float32, np.float16):
         data = data.astype(np.float32)

diff --git a/tools/kokoro/include/kokoro-decoder-front.h b/tools/kokoro/include/kokoro-decoder-front.h
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: MIT
+// kokoro-decoder-front.h — Decoder.forward up to the generator (validated port, #9588).
+#pragma once
+#include <cmath>
+#include <cstring>
+#include <vector>
+#include "kokoro-layers.h"   // conv1d_forward, adain1d_forward, convtranspose1d_depthwise_forward, convtranspose1d_out_len
+
+namespace eliza_kokoro {
+
+struct DecAdainResBlk {                  // non-owning weights + shape flags for one decoder AdainResBlk1d
+    int Cin = 0, Cout = 0, Sdim = 128;   // input / output channels; length of the style vector s
+    bool upsample = false;               // true: residual goes through pool, shortcut is nearest-upsampled x2
+    bool learned_sc = false;             // dim_in != dim_out -> shortcut is projected by conv1x1
+    const float * norm1_fc_w = nullptr;  // [2*Cin, Sdim]
+    const float * norm1_fc_b = nullptr;  // [2*Cin]
+    const float * norm2_fc_w = nullptr;  // [2*Cout, Sdim]
+    const float * norm2_fc_b = nullptr;  // [2*Cout]
+    const float * conv1_w    = nullptr;  // [Cout, Cin, 3]
+    const float * conv1_b    = nullptr;  // [Cout]
+    const float * conv2_w    = nullptr;  // [Cout, Cout, 3]
+    const float * conv2_b    = nullptr;  // [Cout]
+    const float * conv1x1_w  = nullptr;  // [Cout, Cin, 1] (learned_sc only)
+    const float * conv1x1_b  = nullptr;  // [Cout] (null — conv1x1 bias=False)
+    const float * pool_w     = nullptr;  // [Cin, 1, 3] (upsample only)
+    const float * pool_b     = nullptr;  // [Cin] (upsample only)
+};
+
+// AdainResBlk1d, decode-block flavor (leaky_relu slope 0.2, optional pool + shortcut).
+// Writes y [Cout, T_out] = (residual + shortcut) / sqrt(2); T_out is the residual branch length.
+inline void dec_adainresblk1d_forward(
+        const DecAdainResBlk & w, const float * x, int T_in, const float * s,
+        std::vector<float> & y, int & T_out) {
+    const int Cin = w.Cin, Cout = w.Cout, Sdim = w.Sdim;
+    // In-place leaky ReLU with negative slope 0.2.
+    auto leaky = [](std::vector<float> & v) { for (float & e : v) if (e < 0) e *= 0.2f; };
+
+    // ---- residual branch: norm1 -> leaky -> [pool] -> conv1 -> norm2 -> leaky -> conv2 ----
+    std::vector<float> h(x, x + (size_t)Cin * T_in);
+    adain1d_forward(h.data(), Cin, T_in, s, Sdim, w.norm1_fc_w, w.norm1_fc_b);
+    leaky(h);
+    int T_res = T_in;
+    if (w.upsample) {  // depthwise transposed conv; its length comes from convtranspose1d_out_len
+        T_res = convtranspose1d_out_len(T_in, 3, 2, 1, 1);
+        std::vector<float> pooled((size_t)Cin * T_res);
+        convtranspose1d_depthwise_forward(h.data(), Cin, T_in, w.pool_w, w.pool_b, 3, 2, 1, 1, pooled.data(), T_res);
+        h.swap(pooled);
+    }
+    std::vector<float> mid((size_t)Cout * T_res);
+    conv1d_forward(h.data(), Cin, T_res, w.conv1_w, w.conv1_b, Cout, 3, 1, 1, 1, mid.data(), T_res);
+    adain1d_forward(mid.data(), Cout, T_res, s, Sdim, w.norm2_fc_w, w.norm2_fc_b);
+    leaky(mid);
+    std::vector<float> res((size_t)Cout * T_res);
+    conv1d_forward(mid.data(), Cout, T_res, w.conv2_w, w.conv2_b, Cout, 3, 1, 1, 1, res.data(), T_res);
+
+    // ---- shortcut branch: [nearest-upsample x2] -> [conv1x1 if learned_sc] ----
+    const int T_sc = w.upsample ? T_in * 2 : T_in;  // expected to equal T_res
+    std::vector<float> stretched;                   // only populated when upsampling
+    if (w.upsample) {
+        stretched.resize((size_t)Cin * T_sc);
+        for (int c = 0; c < Cin; ++c)
+            for (int t = 0; t < T_sc; ++t)
+                stretched[(size_t)c * T_sc + t] = x[(size_t)c * T_in + (t / 2)];
+    }
+    const float * sc_in = w.upsample ? stretched.data() : x;
+    std::vector<float> sc;
+    if (w.learned_sc) {
+        sc.assign((size_t)Cout * T_sc, 0.0f);
+        conv1d_forward(sc_in, Cin, T_sc, w.conv1x1_w, w.conv1x1_b, Cout, 1, 1, 0, 1, sc.data(), T_sc);
+    } else {
+        sc.assign(sc_in, sc_in + (size_t)Cin * T_sc);
+    }
+
+    T_out = T_res;
+    const float inv_sqrt2 = 1.0f / std::sqrt(2.0f);
+    y.assign((size_t)Cout * T_out, 0.0f);
+    for (size_t i = 0, n = y.size(); i < n; ++i) y[i] = (res[i] + sc[i]) * inv_sqrt2;
+}
+
+struct DecoderFrontWeights {            // non-owning weight pointers consumed by decoder_front()
+    const float * F0_conv_w = nullptr;  // [1,1,3]  conv applied to F0_curve
+    const float * F0_conv_b = nullptr;  // [1]
+    const float * N_conv_w  = nullptr;  // [1,1,3]  conv applied to N_in
+    const float * N_conv_b  = nullptr;  // [1]
+    const float * asr_res_w = nullptr;  // [64,512,1]  k=1 conv, asr 512 -> 64
+    const float * asr_res_b = nullptr;  // [64]
+    DecAdainResBlk encode;              // 514 -> 1024, learned_sc
+    DecAdainResBlk decode[4];           // 1090->1024 (x3), 1090->512 upsample
+};
+
+// Decoder.forward up to (not including) the generator.
+//   asr[Cin_asr,T_asr] (512 x 132 in the reference run), F0_curve[2*T_asr], N_in[2*T_asr], s[128]
+// Output: x_out [512, 2*T_asr] (== generator_in_0); also returns the stride-2 conv outputs
+// F0_down[T_asr], N_down[T_asr] (caller passes them, together with the ORIGINAL F0_curve, into the generator).
+// T_asr <= 0 yields three empty outputs instead of dividing by a zero frame count.
+inline void decoder_front(
+        const DecoderFrontWeights & W,
+        const float * asr, int Cin_asr, int T_asr,
+        const float * F0_curve, const float * N_in, const float * s,
+        std::vector<float> & x_out,
+        std::vector<float> & F0_down,
+        std::vector<float> & N_down) {
+    if (T_asr <= 0) { x_out.clear(); F0_down.clear(); N_down.clear(); return; }  // avoids x.size() / T_x with T_x == 0
+    const int Tc = 2 * T_asr;  // 264
+    const int Cres = 64;       // asr_res output channels
+
+    F0_down.assign(T_asr, 0.0f);
+    conv1d_forward(F0_curve, 1, Tc, W.F0_conv_w, W.F0_conv_b, 1, 3, 2, 1, 1, F0_down.data(), T_asr);
+    N_down.assign(T_asr, 0.0f);
+    conv1d_forward(N_in, 1, Tc, W.N_conv_w, W.N_conv_b, 1, 3, 2, 1, 1, N_down.data(), T_asr);
+
+    // x = cat([asr, F0, N], dim=channels) -> [514, T_asr]
+    std::vector<float> xcat((size_t)(Cin_asr + 2) * T_asr);
+    std::memcpy(xcat.data(), asr, sizeof(float) * (size_t)Cin_asr * T_asr);
+    std::memcpy(xcat.data() + (size_t)Cin_asr * T_asr, F0_down.data(), sizeof(float) * T_asr);
+    std::memcpy(xcat.data() + (size_t)(Cin_asr + 1) * T_asr, N_down.data(), sizeof(float) * T_asr);
+
+    std::vector<float> x; int T_x = 0;
+    dec_adainresblk1d_forward(W.encode, xcat.data(), T_asr, s, x, T_x);   // encode 514->1024
+
+    std::vector<float> asr_res((size_t)Cres * T_asr);                     // asr_res Conv1d k1 512->64
+    conv1d_forward(asr, Cin_asr, T_asr, W.asr_res_w, W.asr_res_b, Cres, 1, 1, 0, 1, asr_res.data(), T_asr);
+
+    bool res = true;  // while true, each block input is re-concatenated with asr_res / F0 / N
+    for (int b = 0; b < 4; ++b) {
+        std::vector<float> blk_in;
+        if (res) {  // cat([x, asr_res, F0, N]) -> 1024+64+1+1 = 1090
+            const int Cx = (int)(x.size() / T_x);
+            const int Cin_blk = Cx + Cres + 1 + 1;
+            blk_in.assign((size_t)Cin_blk * T_x, 0.0f);
+            std::memcpy(blk_in.data(), x.data(), sizeof(float) * (size_t)Cx * T_x);
+            std::memcpy(blk_in.data() + (size_t)Cx * T_x, asr_res.data(), sizeof(float) * (size_t)Cres * T_x);
+            std::memcpy(blk_in.data() + (size_t)(Cx + Cres) * T_x, F0_down.data(), sizeof(float) * T_x);
+            std::memcpy(blk_in.data() + (size_t)(Cx + Cres + 1) * T_x, N_down.data(), sizeof(float) * T_x);
+        } else { blk_in.swap(x); }  // x is overwritten below, so take its buffer instead of copying
+        std::vector<float> y; int T_y = 0;
+        dec_adainresblk1d_forward(W.decode[b], blk_in.data(), T_x, s, y, T_y);
+        x.swap(y); T_x = T_y;
+        if (W.decode[b].upsample) res = false;  // decode3 upsamples -> res stops
+    }
+    x_out.swap(x);  // [512, 264]
+}
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/include/kokoro-decoder.h b/tools/kokoro/include/kokoro-decoder.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-decoder.h — StyleTTS-2 / iSTFTNet decoder: predictor outputs -> 24 kHz audio.
+//
+// Wires the validated decoder_front (kokoro-decoder-front.h) + Generator
+// (kokoro-generator.h) against the model's all-F32 ggml context. Replaces the
+// J2-ship placeholder spectrogram in kokoro_synthesize (#9588).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace eliza_kokoro {
+
+struct kokoro_model;  // forward declaration; this header only uses it through a pointer
+
+// Run the full decoder. Inputs come from kokoro_predictor_forward:
+//   asr_ct   : [512, T_frame] channel-major (transpose of PredictorOut.asr [T,512])
+//   F0, N    : [2*T_frame]    (PredictorOut.F0_pred / N_pred — the up-2x curves)
+//   ref_s_dec: [128]          decoder-half style (ref_s[:128])
+// Output: audio (24 kHz mono), resized to (2*T_frame)*300. Presumably returns false with `err` set on failure — TODO confirm.
+bool kokoro_decoder_forward(
+        const kokoro_model * model,
+        const float * asr_ct, int T_frame,
+        const float * F0, const float * N,
+        const float * ref_s_dec,
+        std::vector<float> & audio,   // out: synthesized samples
+        std::string & err);           // out: presumably an error description — verify against the .cpp
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/include/kokoro-generator.h b/tools/kokoro/include/kokoro-generator.h
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-generator.h — iSTFTNet Generator.forward (StyleTTS-2 decoder back-end).
+//
+// The generator turns the decoder body output `x` [512, 264], the style
+// vector `s` [128], and the (un-downsampled) F0 curve `f0_curve` [264] into
+// `audio` [79200] (24 kHz).
+//
+// Weights are raw float pointers (PyTorch row-major, weight_norm-fused):
+//   Conv1d weight       [Cout, Cin, K]
+//   ConvTranspose1d wt  [Cin,  Cout, K]
+//   Linear weight       [out,  in]
+//   AdaIN1d fc.weight    [2C,   style_dim]   (style_dim = 128)
+//   Snake alpha          [C]
+// The caller supplies them via GeneratorWeights so the function composes with
+// any weight-loading boundary (GGUF tensor lookup, raw .f32 fixtures, …).
+
+#pragma once
+
+#include <vector>
+
+namespace eliza_kokoro {
+
+// One AdaINResBlock1 sub-block (the block has three, sharing the same channel
+// count C). Both convs are square: weight [Cout=C, Cin=C, K]. Pointers are non-owning.
+struct GenSubBlockWeights {
+    const float * conv1_w = nullptr;     // [C, C, K]
+    const float * conv1_b = nullptr;     // [C]
+    const float * conv2_w = nullptr;     // [C, C, K]
+    const float * conv2_b = nullptr;     // [C]
+    const float * adain1_fc_w = nullptr; // [2C, 128]
+    const float * adain1_fc_b = nullptr; // [2C]
+    const float * adain2_fc_w = nullptr; // [2C, 128]
+    const float * adain2_fc_b = nullptr; // [2C]
+    const float * alpha1 = nullptr;      // [C]
+    const float * alpha2 = nullptr;      // [C]
+};
+
+struct GenAdaResBlockWeights {  // weights for one AdaINResBlock1
+    GenSubBlockWeights sub[3];  // its three sub-blocks
+};
+
+struct GeneratorWeights {  // non-owning weight pointers passed to kokoro_generator_forward()
+    // m_source.l_linear: Linear(9 -> 1).
+    const float * l_linear_w = nullptr;  // [1, 9]
+    const float * l_linear_b = nullptr;  // [1]
+
+    // ups[0], ups[1]: ConvTranspose1d. weight [Cin, Cout, K], bias [Cout].
+    const float * ups_w[2] = { nullptr, nullptr };  // each [Cin, Cout, K]
+    const float * ups_b[2] = { nullptr, nullptr };  // each [Cout]
+
+    // noise_convs[0], noise_convs[1]: Conv1d. weight [Cout, 22, K], bias [Cout].
+    const float * noise_convs_w[2] = { nullptr, nullptr };  // each [Cout, 22, K]
+    const float * noise_convs_b[2] = { nullptr, nullptr };  // each [Cout]
+
+    // noise_res[0] (k=7), noise_res[1] (k=11): AdaINResBlock1.
+    GenAdaResBlockWeights noise_res[2];
+
+    // resblocks[0..5]: AdaINResBlock1 (stage0: k=3,7,11 ch=256; stage1: ch=128).
+    GenAdaResBlockWeights resblocks[6];
+
+    // conv_post: Conv1d(128 -> 22, k=7, pad=3). weight [22, 128, 7], bias [22].
+    const float * conv_post_w = nullptr;  // [22, 128, 7]
+    const float * conv_post_b = nullptr;  // [22]
+};
+
+// Generator.forward. audio is resized to T0 * 300 (== 79200 for T0=264).
+void kokoro_generator_forward(
+        const float * x_in,      // [512, T0] channel-major
+        int T0,                  // input time (== 2 * predictor T_frame)
+        const float * s,         // [128] style vector
+        const float * f0_curve,  // [T0] un-downsampled F0 curve
+        const GeneratorWeights & w,   // weight pointers (see GeneratorWeights)
+        std::vector<float> & audio);  // out: T0 * 300 samples (24 kHz)
+
+
+} // namespace eliza_kokoro