diff --git a/tools/kokoro/CMakeLists.txt b/tools/kokoro/CMakeLists.txt
index 394f7f946..8f59ff9c5 100644
--- a/tools/kokoro/CMakeLists.txt
+++ b/tools/kokoro/CMakeLists.txt
@@ -25,7 +25,9 @@ set(KOKORO_CORE_SOURCES
     src/kokoro.cpp
     src/kokoro-istft.cpp
     src/kokoro-phonemes.cpp
-    src/kokoro-predictor.cpp)
+    src/kokoro-predictor.cpp
+    src/kokoro-generator.cpp
+    src/kokoro-decoder.cpp)
 
 add_library(kokoro_lib STATIC
     ${KOKORO_CORE_SOURCES}
@@ -56,17 +58,46 @@ endif()
 
 target_compile_features(kokoro_lib PUBLIC cxx_std_17)
 
-# Standalone CLI harness — required by J2 verification (tools/voice-kokoro/).
+# Real G2P via libespeak-ng. When found, kokoro_phonemize() drives
+# espeak_TextToPhonemes() (en-us IPA) and maps codepoints to Kokoro vocab ids,
+# reproducing the reference token sequence; otherwise the build falls back to
+# the degraded ASCII grapheme mapping and the TS layer must supply IPA.
+# Override with -DKOKORO_ESPEAK_ROOT=<prefix> (HINTS); fixed prefixes are PATHS fallbacks.
+option(KOKORO_ENABLE_ESPEAK "Link libespeak-ng for real Kokoro G2P" ON)
+if(KOKORO_ENABLE_ESPEAK)
+    find_path(ESPEAK_NG_INCLUDE_DIR espeak-ng/speak_lib.h
+        HINTS ${KOKORO_ESPEAK_ROOT} PATHS /opt/homebrew /usr/local /usr
+        PATH_SUFFIXES include)
+    find_library(ESPEAK_NG_LIBRARY NAMES espeak-ng
+        HINTS ${KOKORO_ESPEAK_ROOT} PATHS /opt/homebrew /usr/local /usr
+        PATH_SUFFIXES lib lib64)
+    if(ESPEAK_NG_INCLUDE_DIR AND ESPEAK_NG_LIBRARY)
+        target_include_directories(kokoro_lib PRIVATE "${ESPEAK_NG_INCLUDE_DIR}")
+        target_link_libraries(kokoro_lib PRIVATE "${ESPEAK_NG_LIBRARY}")
+        target_compile_definitions(kokoro_lib PRIVATE KOKORO_USE_ESPEAK)
+        message(STATUS "Kokoro G2P: libespeak-ng found (${ESPEAK_NG_LIBRARY}) — real IPA path enabled")
+    else()
+        message(STATUS "Kokoro G2P: libespeak-ng not found — falling back to ASCII grapheme mapping (TS layer must supply IPA)")
+    endif()
+endif()
+
+# Standalone CLI harnesses (required by J2 verification + Kokoro decoder dev).
+# Force MACOSX_BUNDLE OFF: Apple builds with CMAKE_MACOSX_BUNDLE on (CMake's
+# default for iOS/tvOS/visionOS) make executables bundles, and install(TARGETS
+# ... RUNTIME) on a bundle target fails configure with "no BUNDLE DESTINATION".
 add_executable(kokoro-tts tools/kokoro-tts.cpp)
 target_link_libraries(kokoro-tts PRIVATE kokoro_lib)
-# kokoro-tts is a CLI harness, not a GUI app. On Apple platforms CMake defaults
-# executables to MACOSX_BUNDLE, and install(TARGETS ... RUNTIME) on a bundle
-# target fails configure with "install TARGETS given no BUNDLE DESTINATION for
-# MACOSX_BUNDLE executable" on every ios/tvos/visionos/macos build. Force the
-# bundle flag off so the plain RUNTIME install is valid on all platforms.
 set_target_properties(kokoro-tts PROPERTIES MACOSX_BUNDLE OFF)
 install(TARGETS kokoro-tts RUNTIME)
 
+# Decoder dev harnesses share one recipe: a plain (non-bundle) executable built
+# from tools/<name>.cpp and linked against kokoro_lib.
+foreach(kokoro_harness IN ITEMS kokoro-stage-dump kokoro-decoder-test)
+    add_executable(${kokoro_harness} tools/${kokoro_harness}.cpp)
+    target_link_libraries(${kokoro_harness} PRIVATE kokoro_lib)
+    set_target_properties(${kokoro_harness} PROPERTIES MACOSX_BUNDLE OFF)
+endforeach()
+
 # Server-mount handler: compiled into kokoro_lib only when the server target
 # exists. The handler is guarded by `#ifdef LLAMA_BUILD_KOKORO` and pulls in
 # the same `server-http.h` interface that the omnivoice handler uses, plus
diff --git a/tools/kokoro/convert_kokoro_pth_to_gguf.py b/tools/kokoro/convert_kokoro_pth_to_gguf.py
index 8232c070f..12bd55fff 100644
--- a/tools/kokoro/convert_kokoro_pth_to_gguf.py
+++ b/tools/kokoro/convert_kokoro_pth_to_gguf.py
@@ -97,9 +97,15 @@
 def _add_tensor(writer: gguf.GGUFWriter, name: str, data: np.ndarray) -> None:
     """Add tensors with the dtype layout the Kokoro forward pass expects.
 
-    Weight matrices and convolution kernels (ndim >= 2) are emitted as F16;
-    biases, norms, and other vectors stay F32. All-F32 GGUFs can load but
-    synthesize noise in the fused runtime path.
+    Weight matrices and convolution kernels (ndim >= 2) are emitted as F16
+    purely to halve the GGUF download size; biases, norms, and other vectors
+    stay F32. The GGUF dtype does not affect correctness: the loader
+    dequantizes every tensor to F32 at load time, so an all-F32 and an
+    F16-weights GGUF produce identical synthesis. (An earlier note here
+    claimed all-F32 GGUFs synthesized noise — that was a misdiagnosis: the
+    fused path was a stub that ignored the weights, and the real defect was
+    the loader reading non-F32 tensors as raw F32. Both are fixed; F16 is
+    kept only for bundle size.)
     """
     if data.dtype not in (np.float32, np.float16):
         data = data.astype(np.float32)
diff --git a/tools/kokoro/include/kokoro-decoder-front.h b/tools/kokoro/include/kokoro-decoder-front.h
new file mode 100644
index 000000000..4a89770c5
--- /dev/null
+++ b/tools/kokoro/include/kokoro-decoder-front.h
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: MIT
+// kokoro-decoder-front.h — Decoder.forward up to the generator (validated port, #9588).
+#pragma once
+#include <cmath>
+#include <cstring>
+#include <vector>
+#include "kokoro-layers.h"   // conv1d_forward, adain1d_forward, convtranspose1d_depthwise_forward, convtranspose1d_out_len
+
+namespace eliza_kokoro {
+
+struct DecAdainResBlk {                  // one decoder AdainResBlk1d: sizes + non-owning weight pointers
+    int Cin = 0, Cout = 0, Sdim = 128;   // channels in / out; length of the style vector s
+    bool upsample = false;               // x2 in time: pool on the residual path, nearest-neighbour on the shortcut
+    bool learned_sc = false;             // dim_in != dim_out -> shortcut goes through conv1x1
+    const float * norm1_fc_w = nullptr;  // [2*Cin, Sdim]
+    const float * norm1_fc_b = nullptr;  // [2*Cin]
+    const float * norm2_fc_w = nullptr;  // [2*Cout, Sdim]
+    const float * norm2_fc_b = nullptr;  // [2*Cout]
+    const float * conv1_w    = nullptr;  // [Cout, Cin, 3]
+    const float * conv1_b    = nullptr;  // [Cout]
+    const float * conv2_w    = nullptr;  // [Cout, Cout, 3]
+    const float * conv2_b    = nullptr;  // [Cout]
+    const float * conv1x1_w  = nullptr;  // [Cout, Cin, 1] (learned_sc only)
+    const float * conv1x1_b  = nullptr;  // [Cout] (null — conv1x1 bias=False)
+    const float * pool_w     = nullptr;  // [Cin, 1, 3] (upsample only)
+    const float * pool_b     = nullptr;  // [Cin] (upsample only)
+};
+
+// AdainResBlk1d, decode-block flavor (leaky_relu slope 0.2, optional x2 upsample).
+// Writes y [Cout, T_out] = (residual + shortcut) / sqrt(2).
+inline void dec_adainresblk1d_forward(
+        const DecAdainResBlk & w, const float * x, int T_in, const float * s,
+        std::vector<float> & y, int & T_out) {
+    const auto leaky02 = [](std::vector<float> & buf) {
+        for (float & v : buf) if (v < 0) v *= 0.2f;
+    };
+
+    // residual branch: norm1 -> leaky -> [pool] -> conv1 -> norm2 -> leaky -> conv2
+    std::vector<float> res(x, x + (size_t)w.Cin * T_in);
+    adain1d_forward(res.data(), w.Cin, T_in, s, w.Sdim, w.norm1_fc_w, w.norm1_fc_b);
+    leaky02(res);
+    int T_res = T_in;
+    if (w.upsample) {  // depthwise transposed-conv pool lengthens the time axis
+        T_res = convtranspose1d_out_len(T_in, 3, 2, 1, 1);
+        std::vector<float> pooled((size_t)w.Cin * T_res);
+        convtranspose1d_depthwise_forward(res.data(), w.Cin, T_in, w.pool_w, w.pool_b, 3, 2, 1, 1, pooled.data(), T_res);
+        res.swap(pooled);
+    }
+    std::vector<float> mid((size_t)w.Cout * T_res);
+    conv1d_forward(res.data(), w.Cin, T_res, w.conv1_w, w.conv1_b, w.Cout, 3, 1, 1, 1, mid.data(), T_res);
+    adain1d_forward(mid.data(), w.Cout, T_res, s, w.Sdim, w.norm2_fc_w, w.norm2_fc_b);
+    leaky02(mid);
+    std::vector<float> resid((size_t)w.Cout * T_res);
+    conv1d_forward(mid.data(), w.Cout, T_res, w.conv2_w, w.conv2_b, w.Cout, 3, 1, 1, 1, resid.data(), T_res);
+
+    // shortcut branch: [nearest-upsample x2] -> [conv1x1 if learned_sc]
+    const int T_sc = w.upsample ? T_in * 2 : T_in;  // relied on to equal T_res
+    std::vector<float> stretched;                   // nearest-neighbour x2 copy of x (upsample only)
+    if (w.upsample) {
+        stretched.resize((size_t)w.Cin * T_sc);
+        for (int c = 0; c < w.Cin; ++c)
+            for (int t = 0; t < T_sc; ++t)
+                stretched[(size_t)c * T_sc + t] = x[(size_t)c * T_in + (t / 2)];
+    }
+    const float * sc_src = w.upsample ? stretched.data() : x;
+    std::vector<float> sc;
+    if (w.learned_sc) {  // channel count changes: project with the 1x1 conv
+        sc.assign((size_t)w.Cout * T_sc, 0.0f);
+        conv1d_forward(sc_src, w.Cin, T_sc, w.conv1x1_w, w.conv1x1_b, w.Cout, 1, 1, 0, 1, sc.data(), T_sc);
+    } else {
+        sc.assign(sc_src, sc_src + (size_t)w.Cin * T_sc);
+    }
+
+    T_out = T_res;
+    y.assign((size_t)w.Cout * T_out, 0.0f);
+    const float inv_sqrt2 = 1.0f / std::sqrt(2.0f);
+    for (size_t i = 0; i < y.size(); ++i) y[i] = (resid[i] + sc[i]) * inv_sqrt2;
+}
+
+struct DecoderFrontWeights {            // weights read by decoder_front; pointers are non-owning
+    const float * F0_conv_w = nullptr;  // [1,1,3]  conv applied to the F0 curve
+    const float * F0_conv_b = nullptr;  // [1]
+    const float * N_conv_w  = nullptr;  // [1,1,3]  conv applied to the N curve
+    const float * N_conv_b  = nullptr;  // [1]
+    const float * asr_res_w = nullptr;  // [64,512,1]  k=1 conv, asr -> 64 channels
+    const float * asr_res_b = nullptr;  // [64]
+    DecAdainResBlk encode;              // 514 -> 1024, learned_sc
+    DecAdainResBlk decode[4];           // 1090->1024 (x3), 1090->512 upsample
+};
+
+// Decoder.forward up to (not including) the generator.
+//   asr[Cin_asr,T_asr] (512 x 132 in the reference), F0_curve[2*T_asr], N_in[2*T_asr], s[128]
+// Outputs: x_out [512, 2*T_asr] (== generator_in_0); F0_down / N_down [T_asr]
+// hold the stride-2 conv outputs that are concatenated into the block inputs.
+inline void decoder_front(
+        const DecoderFrontWeights & W,
+        const float * asr, int Cin_asr, int T_asr,
+        const float * F0_curve, const float * N_in, const float * s,
+        std::vector<float> & x_out,
+        std::vector<float> & F0_down,
+        std::vector<float> & N_down) {
+    const auto put = [](std::vector<float> & dst, int at, const float * src, int rows, int T) {
+        std::memcpy(dst.data() + (size_t)at * T, src, sizeof(float) * (size_t)rows * T);
+    };  // copies `rows` channel rows of width T into dst, starting at channel `at`
+    const int T_curve = 2 * T_asr;  // the F0 / N curves are twice as long as asr
+
+    F0_down.assign(T_asr, 0.0f);
+    conv1d_forward(F0_curve, 1, T_curve, W.F0_conv_w, W.F0_conv_b, 1, 3, 2, 1, 1, F0_down.data(), T_asr);
+    N_down.assign(T_asr, 0.0f);
+    conv1d_forward(N_in, 1, T_curve, W.N_conv_w, W.N_conv_b, 1, 3, 2, 1, 1, N_down.data(), T_asr);
+
+    // encode block input: cat([asr, F0_down, N_down]) along channels -> [Cin_asr + 2, T_asr]
+    std::vector<float> stacked((size_t)(Cin_asr + 2) * T_asr);
+    put(stacked, 0, asr, Cin_asr, T_asr);
+    put(stacked, Cin_asr, F0_down.data(), 1, T_asr);
+    put(stacked, Cin_asr + 1, N_down.data(), 1, T_asr);
+    std::vector<float> cur; int T_cur;
+    dec_adainresblk1d_forward(W.encode, stacked.data(), T_asr, s, cur, T_cur);
+
+    std::vector<float> asr_res((size_t)64 * T_asr);  // k=1 conv: Cin_asr -> 64 channels
+    conv1d_forward(asr, Cin_asr, T_asr, W.asr_res_w, W.asr_res_b, 64, 1, 1, 0, 1, asr_res.data(), T_asr);
+
+    // Decode blocks take cat([x, asr_res, F0_down, N_down]) until one of them
+    // upsamples; any later block receives the running tensor unchanged.
+    bool add_side_inputs = true;
+    for (const DecAdainResBlk & blk : W.decode) {
+        std::vector<float> blk_in;
+        if (add_side_inputs) {
+            const int Cx = (int)(cur.size() / T_cur);
+            blk_in.assign((size_t)(Cx + 64 + 1 + 1) * T_cur, 0.0f);
+            put(blk_in, 0, cur.data(), Cx, T_cur);
+            put(blk_in, Cx, asr_res.data(), 64, T_cur);
+            put(blk_in, Cx + 64, F0_down.data(), 1, T_cur);
+            put(blk_in, Cx + 65, N_down.data(), 1, T_cur);
+        } else blk_in = cur;
+        std::vector<float> next; int T_next;
+        dec_adainresblk1d_forward(blk, blk_in.data(), T_cur, s, next, T_next);
+        cur.swap(next); T_cur = T_next;
+        if (blk.upsample) add_side_inputs = false;
+    }
+    x_out.swap(cur);  // [512, 2*T_asr] in the reference configuration
+}
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/include/kokoro-decoder.h b/tools/kokoro/include/kokoro-decoder.h
new file mode 100644
index 000000000..3de5d2562
--- /dev/null
+++ b/tools/kokoro/include/kokoro-decoder.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-decoder.h — StyleTTS-2 / iSTFTNet decoder: predictor outputs -> 24 kHz audio.
+//
+// Wires the validated decoder_front (kokoro-decoder-front.h) + Generator
+// (kokoro-generator.h) against the model's all-F32 ggml context. Replaces the
+// J2-ship placeholder spectrogram in kokoro_synthesize (#9588).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace eliza_kokoro {
+
+struct kokoro_model;
+
+// Run the full decoder. Inputs come from kokoro_predictor_forward:
+//   asr_ct   : [512, T_frame] channel-major (transpose of PredictorOut.asr [T,512])
+//   F0, N    : [2*T_frame]    (PredictorOut.F0_pred / N_pred — the up-2x curves)
+//   ref_s_dec: [128]          decoder-half style (ref_s[:128])
+// Output: audio (24 kHz mono), (2*T_frame)*300 samples. Returns false, with `err` set and audio empty, on failure.
+bool kokoro_decoder_forward(
+        const kokoro_model * model,
+        const float * asr_ct, int T_frame,
+        const float * F0, const float * N,
+        const float * ref_s_dec,
+        std::vector<float> & audio,
+        std::string & err);
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/include/kokoro-generator.h b/tools/kokoro/include/kokoro-generator.h
new file mode 100644
index 000000000..d1d88f696
--- /dev/null
+++ b/tools/kokoro/include/kokoro-generator.h
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-generator.h — iSTFTNet Generator.forward (StyleTTS-2 decoder back-end).
+//
+// The generator turns the decoder body output `x` [512, 264], the style
+// vector `s` [128], and the (un-downsampled) F0 curve `f0_curve` [264] into
+// `audio` [79200] (24 kHz).
+//
+// Weights are raw float pointers (PyTorch row-major, weight_norm-fused):
+//   Conv1d weight       [Cout, Cin, K]
+//   ConvTranspose1d wt  [Cin,  Cout, K]
+//   Linear weight       [out,  in]
+//   AdaIN1d fc.weight    [2C,   style_dim]   (style_dim = 128)
+//   Snake alpha          [C]
+// The caller supplies them via GeneratorWeights so the function composes with
+// any weight-loading boundary (GGUF tensor lookup, raw .f32 fixtures, …).
+
+#pragma once
+
+#include <vector>
+
+namespace eliza_kokoro {
+
+// One AdaINResBlock1 sub-block (the block has three, sharing the same channel
+// count C). Both convs are square: weight [Cout=C, Cin=C, K].
+struct GenSubBlockWeights {
+    const float * conv1_w = nullptr;     // [C, C, K]
+    const float * conv1_b = nullptr;     // [C]
+    const float * conv2_w = nullptr;     // [C, C, K]
+    const float * conv2_b = nullptr;     // [C]
+    const float * adain1_fc_w = nullptr; // [2C, 128]
+    const float * adain1_fc_b = nullptr; // [2C]
+    const float * adain2_fc_w = nullptr; // [2C, 128]
+    const float * adain2_fc_b = nullptr; // [2C]
+    const float * alpha1 = nullptr;      // [C]
+    const float * alpha2 = nullptr;      // [C]
+};
+
+struct GenAdaResBlockWeights {
+    GenSubBlockWeights sub[3];
+};
+
+struct GeneratorWeights {
+    // m_source.l_linear: Linear(9 -> 1).
+    const float * l_linear_w = nullptr;  // [1, 9]
+    const float * l_linear_b = nullptr;  // [1]
+
+    // ups[0], ups[1]: ConvTranspose1d. weight [Cin, Cout, K], bias [Cout].
+    const float * ups_w[2] = { nullptr, nullptr };
+    const float * ups_b[2] = { nullptr, nullptr };
+
+    // noise_convs[0], noise_convs[1]: Conv1d. weight [Cout, 22, K], bias [Cout].
+    const float * noise_convs_w[2] = { nullptr, nullptr };
+    const float * noise_convs_b[2] = { nullptr, nullptr };
+
+    // noise_res[0] (k=7), noise_res[1] (k=11): AdaINResBlock1.
+    GenAdaResBlockWeights noise_res[2];
+
+    // resblocks[0..5]: AdaINResBlock1 (stage0: k=3,7,11 ch=256; stage1: ch=128).
+    GenAdaResBlockWeights resblocks[6];
+
+    // conv_post: Conv1d(128 -> 22, k=7, pad=3). weight [22, 128, 7], bias [22].
+    const float * conv_post_w = nullptr;
+    const float * conv_post_b = nullptr;
+};
+
+// Generator.forward. audio is resized to T0 * 300 (== 79200 for T0=264).
+void kokoro_generator_forward(
+        const float * x_in,      // [512, T0] channel-major
+        int T0,                  // input time (== 2 * predictor T_frame)
+        const float * s,         // [128]
+        const float * f0_curve,  // [T0]
+        const GeneratorWeights & w,
+        std::vector<float> & audio);
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/include/kokoro-phonemes.h b/tools/kokoro/include/kokoro-phonemes.h
index cc9bb3fd6..068dc0f79 100644
--- a/tools/kokoro/include/kokoro-phonemes.h
+++ b/tools/kokoro/include/kokoro-phonemes.h
@@ -1,22 +1,26 @@
 // SPDX-License-Identifier: MIT
 //
-// kokoro-phonemes.h — minimal ASCII text → Kokoro phoneme-id mapping.
+// kokoro-phonemes.h — text → Kokoro phoneme-id mapping.
 //
-// Kokoro v1.0 uses espeak-ng's phoneme inventory + a small set of control
-// tokens (BOS, EOS, PAD, blanks). The training-time path passes text through
-// `phonemize` (Python wrapper around espeak-ng) before tokenizing.
+// Kokoro v1.0 tokenizes espeak-ng IPA against a small fixed vocab
+// (`tts/kokoro/tokenizer.json`, `model.vocab`). The reference Python path is:
 //
-// Adding an espeak-ng dependency to the fork is overkill for a TTS that
-// is being ported as a one-release deprecation runway. This header
-// implements a deterministic grapheme→phoneme mapping that:
+//   text → espeak-ng (en-us, --ipa) → IPA string → per-codepoint vocab lookup
+//        → ids → model input_ids = [0, *ids, 0]   (0 = the "$" pad symbol)
 //
-//   1. covers the basic Latin alphabet + common digraphs (sh, ch, th, ng);
-//   2. maps every other ASCII printable to PAD;
-//   3. returns ids in the same value range as kokoro-onnx's tokenizer
-//      (PAD=0, BOS=1, EOS=2, then phonemes from offset 3).
+// Every vocab key is a single Unicode codepoint, so the mapping is a pure
+// codepoint→id table lookup over the IPA string (no multi-char digraph
+// handling is needed — espeak already emits the canonical IPA codepoints,
+// e.g. eɪ is two codepoints 'e'+'ɪ', each with its own id).
 //
-// The synthesis quality this produces is noticeably worse than the
-// espeak-ng path — that is the documented gap in J2-kokoro-port-notes.md.
+// Two build modes:
+//   * KOKORO_USE_ESPEAK (default when libespeak-ng is linked) — the real G2P
+//     path: `phonemize_ipa()` drives espeak_TextToPhonemes() to get IPA, then
+//     maps to ids. This reproduces the kokoro reference ids exactly.
+//   * fallback — when espeak is unavailable the caller may pass pre-computed
+//     IPA from the TS layer (which already runs espeak) into
+//     `ipa_to_token_ids()`. `phonemize_ipa()` then returns an empty vector and
+//     the caller must supply IPA.
 
 #pragma once
 
@@ -26,12 +30,42 @@
 
 namespace eliza_kokoro {
 
-// Tokenize a UTF-8 / ASCII text string into a phoneme-id vector.
-// Always returns a sequence of length <= 510 (the BERT encoder cap in
-// Kokoro v1.0 — anything longer is split at the caller).
-std::vector<int32_t> phonemize_ascii(const std::string & text);
+// Kokoro pad/boundary token. model.vocab maps '$' → 0; the reference wraps the
+// phoneme ids as [PAD, *ids, PAD] to form the model input_ids.
+inline constexpr int32_t KOKORO_PAD_ID = 0;
+
+// Map a single Unicode codepoint (an espeak IPA symbol) to its Kokoro vocab id.
+// Returns -1 if the codepoint is not in the vocab (caller drops it, matching
+// the reference which silently skips unmapped codepoints).
+int32_t kokoro_codepoint_to_id(char32_t cp) noexcept;
+
+// Map an espeak-ng IPA string (UTF-8) to the bare Kokoro phoneme-id sequence
+// (no pad wrapping). Codepoints absent from the vocab are dropped. This is the
+// `ids` array in reference-ids.json — its length is the style-row index.
+std::vector<int32_t> ipa_to_token_ids(const std::string & ipa);
+
+// Phonemize text to bare Kokoro phoneme ids via espeak-ng (en-us IPA).
+// Returns the same sequence as `ipa_to_token_ids(espeak_ipa(text))`.
+// When KOKORO_USE_ESPEAK is not compiled in, returns an empty vector — the
+// caller must supply IPA from the TS layer and call `ipa_to_token_ids()`.
+std::vector<int32_t> phonemize_ipa(const std::string & text);
+
+// Wrap a bare phoneme-id sequence as the model input_ids: [PAD, *ids, PAD].
+std::vector<int32_t> wrap_input_ids(const std::vector<int32_t> & ids);
 
-// Diagnostic — total phoneme vocab size (for hparams cross-check).
+// Convenience: text → model input_ids [PAD, *ipa_ids, PAD] via espeak.
+// Equivalent to `wrap_input_ids(phonemize_ipa(text))`.
+std::vector<int32_t> phonemize_to_input_ids(const std::string & text);
+
+// True when this build links libespeak-ng (the real G2P path is available).
+bool espeak_available() noexcept;
+
+// Total Kokoro vocab size (highest id + 1 = 178 for v1.0).
 int phoneme_vocab_size() noexcept;
 
+// --- Legacy ASCII fallback (retained only for callers not yet migrated) ---
+// Deprecated: returns the degraded ASCII grapheme mapping. New code uses
+// phonemize_to_input_ids().
+std::vector<int32_t> phonemize_ascii(const std::string & text);
+
 } // namespace eliza_kokoro
diff --git a/tools/kokoro/include/kokoro.h b/tools/kokoro/include/kokoro.h
index 809362962..b218677cf 100644
--- a/tools/kokoro/include/kokoro.h
+++ b/tools/kokoro/include/kokoro.h
@@ -116,10 +116,12 @@ kokoro_status kokoro_load_voice_preset(
     kokoro_voice_preset & out,
     std::string & err_out) noexcept;
 
-// Phonemize an input text into Kokoro's int phoneme ids. The implementation
-// uses a deterministic ASCII grapheme→phoneme mapping (no espeak-ng
-// dependency). This is intentionally lossy vs the upstream phonemizer —
-// quality recovery is part of the gap documented in J2-kokoro-port-notes.md.
+// Phonemize an input text into Kokoro's int phoneme ids (the model input_ids,
+// wrapped as [PAD, *ids, PAD]). When the build links libespeak-ng this is the
+// real G2P path (text → en-us IPA → Kokoro vocab ids), reproducing the
+// upstream phonemizer's token sequence. Without libespeak-ng it falls back to
+// a deterministic (lossy) ASCII grapheme mapping; in that case the TS voice
+// layer should phonemize and pass IPA (see kokoro-phonemes.h ipa_to_token_ids).
 std::vector<int32_t> kokoro_phonemize(const std::string & text);
 
 // Synthesize a single utterance. `text` is the natural-language input,
diff --git a/tools/kokoro/src/kokoro-decoder.cpp b/tools/kokoro/src/kokoro-decoder.cpp
new file mode 100644
index 000000000..388f124b9
--- /dev/null
+++ b/tools/kokoro/src/kokoro-decoder.cpp
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-decoder.cpp — assemble decoder_front + Generator into the full
+// StyleTTS-2 / iSTFTNet decoder, reading weights from the model's all-F32
+// ggml context (dequantized at load). Validated against the PyTorch reference
+// stage-by-stage (#9588).
+
+#include "kokoro-decoder.h"
+#include "kokoro-decoder-front.h"   // DecoderFrontWeights, DecAdainResBlk, decoder_front
+#include "kokoro-generator.h"       // GeneratorWeights, kokoro_generator_forward
+
+#include "ggml.h"
+
+#include <string>
+
+namespace eliza_kokoro {
+
+// Defined in kokoro.cpp — the all-F32 working context the predictor reads.
+ggml_context * kokoro_model_ggml_ctx(const kokoro_model * model);
+
+namespace {
+
+struct Lk {  // name -> F32 weight lookup over the model's ggml context
+    ggml_context * ctx;
+    const float * get(const std::string & name) const {  // nullptr when absent or not F32
+        const ggml_tensor * t = ggml_get_tensor(ctx, name.c_str());
+        return (t && t->type == GGML_TYPE_F32) ? (const float *) t->data : nullptr;  // never reinterpret non-F32 bytes
+    }
+};
+
+// Populate a decode-flavor AdainResBlk1d from the tensors named `<pfx><leaf>`.
+void fill_dec_block(const Lk & L, DecAdainResBlk & b, const std::string & pfx,
+                    int Cin, int Cout, bool upsample) {
+    const auto at = [&](const char * leaf) { return L.get(pfx + leaf); };
+    b.Cin = Cin; b.Cout = Cout; b.Sdim = 128;
+    b.upsample = upsample; b.learned_sc = (Cin != Cout);
+    b.norm1_fc_w = at(".norm1.fc.weight");
+    b.norm1_fc_b = at(".norm1.fc.bias");
+    b.norm2_fc_w = at(".norm2.fc.weight");
+    b.norm2_fc_b = at(".norm2.fc.bias");
+    b.conv1_w    = at(".conv1.weight");
+    b.conv1_b    = at(".conv1.bias");
+    b.conv2_w    = at(".conv2.weight");
+    b.conv2_b    = at(".conv2.bias");
+    b.conv1x1_w  = b.learned_sc ? at(".conv1x1.weight") : nullptr;  // only looked up when channels change
+    b.conv1x1_b  = nullptr;  // conv1x1 bias=False upstream
+    b.pool_w     = upsample ? at(".pool.weight") : nullptr;
+    b.pool_b     = upsample ? at(".pool.bias")   : nullptr;
+}
+
+// Load the three Snake1D sub-blocks of a generator AdaINResBlock1 named `<pfx>.*`.
+void fill_gen_block(const Lk & L, GenAdaResBlockWeights & b, const std::string & pfx) {
+    for (int k = 0; k < 3; ++k) {
+        GenSubBlockWeights & sub = b.sub[k];
+        const auto at = [&](const char * head, const char * tail) { return L.get(pfx + head + std::to_string(k) + tail); };
+        sub.conv1_w     = at(".convs1.", ".weight");
+        sub.conv1_b     = at(".convs1.", ".bias");
+        sub.conv2_w     = at(".convs2.", ".weight");
+        sub.conv2_b     = at(".convs2.", ".bias");
+        sub.adain1_fc_w = at(".adain1.", ".fc.weight");
+        sub.adain1_fc_b = at(".adain1.", ".fc.bias");
+        sub.adain2_fc_w = at(".adain2.", ".fc.weight");
+        sub.adain2_fc_b = at(".adain2.", ".fc.bias");
+        sub.alpha1      = at(".alpha1.", "");
+        sub.alpha2      = at(".alpha2.", "");
+    }
+}
+
+} // namespace
+
+bool kokoro_decoder_forward(
+        const kokoro_model * model,
+        const float * asr_ct, int T_frame,
+        const float * F0, const float * N,
+        const float * ref_s_dec,
+        std::vector<float> & audio,
+        std::string & err) {
+    // Failure contract: `audio` is left empty, `err` names the cause, returns false.
+    audio.clear();
+    if (!model) { err = "null model"; return false; }
+    if (T_frame <= 0) { err = "non-positive T_frame"; return false; }
+    if (!asr_ct || !F0 || !N || !ref_s_dec) { err = "null decoder input"; return false; }
+    ggml_context * ctx = kokoro_model_ggml_ctx(model);
+    if (!ctx) { err = "null model context"; return false; }
+    Lk L{ctx};
+
+    // --- decoder_front weights ---
+    DecoderFrontWeights W;
+    W.F0_conv_w = L.get("kokoro.decoder.F0_conv.weight");
+    W.F0_conv_b = L.get("kokoro.decoder.F0_conv.bias");
+    W.N_conv_w  = L.get("kokoro.decoder.N_conv.weight");
+    W.N_conv_b  = L.get("kokoro.decoder.N_conv.bias");
+    W.asr_res_w = L.get("kokoro.decoder.asr_res.weight");
+    W.asr_res_b = L.get("kokoro.decoder.asr_res.bias");
+    fill_dec_block(L, W.encode, "kokoro.decoder.encode", 514, 1024, /*upsample*/false);
+    for (int i = 0; i < 4; ++i)  // decode.0..2: 1090->1024; decode.3: 1090->512, upsampling
+        fill_dec_block(L, W.decode[i], "kokoro.decoder.decode." + std::to_string(i), 1090, i < 3 ? 1024 : 512, i == 3);
+    // Require every weight kernel the forward pass dereferences (biases are not checked).
+    auto dec_ok = [](const DecAdainResBlk & b) {
+        return b.norm1_fc_w && b.norm2_fc_w && b.conv1_w && b.conv2_w &&
+               (!b.learned_sc || b.conv1x1_w) && (!b.upsample || b.pool_w);
+    };
+    bool front_ok = W.F0_conv_w && W.N_conv_w && W.asr_res_w && dec_ok(W.encode);
+    for (const DecAdainResBlk & b : W.decode) front_ok = front_ok && dec_ok(b);
+    if (!front_ok) { err = "missing decoder weights (is the GGUF a full Kokoro model?)"; return false; }
+
+    // --- generator weights ---
+    GeneratorWeights G;
+    G.l_linear_w = L.get("kokoro.gen.m_source.l_linear.weight");
+    G.l_linear_b = L.get("kokoro.gen.m_source.l_linear.bias");
+    for (int i = 0; i < 2; ++i) {
+        const std::string is = std::to_string(i);
+        G.ups_w[i]         = L.get("kokoro.gen.ups." + is + ".weight");
+        G.ups_b[i]         = L.get("kokoro.gen.ups." + is + ".bias");
+        G.noise_convs_w[i] = L.get("kokoro.gen.noise_convs." + is + ".weight");
+        G.noise_convs_b[i] = L.get("kokoro.gen.noise_convs." + is + ".bias");
+        fill_gen_block(L, G.noise_res[i], "kokoro.gen.noise_res." + is);
+    }
+    for (int i = 0; i < 6; ++i) fill_gen_block(L, G.resblocks[i], "kokoro.gen.resblocks." + std::to_string(i));
+    G.conv_post_w = L.get("kokoro.gen.conv_post.weight");
+    G.conv_post_b = L.get("kokoro.gen.conv_post.bias");
+    auto gen_ok = [](const GenAdaResBlockWeights & b) {
+        for (const GenSubBlockWeights & s : b.sub)
+            if (!(s.conv1_w && s.conv2_w && s.adain1_fc_w && s.adain2_fc_w && s.alpha1 && s.alpha2)) return false;
+        return true;
+    };
+    bool gen_all = G.l_linear_w && G.conv_post_w;
+    for (int i = 0; i < 2; ++i) gen_all = gen_all && G.ups_w[i] && G.noise_convs_w[i] && gen_ok(G.noise_res[i]);
+    for (const GenAdaResBlockWeights & b : G.resblocks) gen_all = gen_all && gen_ok(b);
+    if (!gen_all) { err = "missing generator weights (is the GGUF a full Kokoro model?)"; return false; }
+
+    // --- run: decoder_front -> generator ---
+    std::vector<float> x, F0_down, N_down;
+    decoder_front(W, asr_ct, /*Cin_asr*/512, T_frame, F0, N, ref_s_dec, x, F0_down, N_down);
+    const int T0 = 2 * T_frame;  // decoder_front upsamples T_frame -> 2*T_frame
+    if ((int) (x.size() / 512) != T0) { err = "decoder_front output width mismatch"; return false; }
+    kokoro_generator_forward(x.data(), T0, ref_s_dec, F0, G, audio);  // generator gets the ORIGINAL F0 curve
+    return true;
+}
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/src/kokoro-generator.cpp b/tools/kokoro/src/kokoro-generator.cpp
new file mode 100644
index 000000000..6a4423b16
--- /dev/null
+++ b/tools/kokoro/src/kokoro-generator.cpp
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-generator.cpp — iSTFTNet Generator.forward, CPU scalar.
+//
+// Direct port of kokoro/istftnet.py `Generator.forward` (StyleTTS-2 +
+// iSTFTNet decoder back-end). Implements:
+//   - SourceModuleHnNSF / SineGen harmonic source (deterministic: zero
+//     initial phase noise, zero additive noise — see note below),
+//   - forward STFT (center=True) of the harmonic source,
+//   - the two upsample stages (ConvTranspose ups[i] + noise_convs[i] +
+//     noise_res[i] (AdaINResBlock1) + 3 resblocks[i*3+j] (AdaINResBlock1,
+//     Snake1D activation), with leaky_relu(0.1) and a reflection_pad(1,0)
+//     on the final stage),
+//   - conv_post (Conv1d 128->22, k7, pad3),
+//   - spec = exp(x[:11]), phase = sin(x[11:22]),
+//   - inverse STFT (center=True) -> audio.
+//
+// Config (kokoro v1.0 istftnet):
+//   style_dim=128, upsample_initial_channel=512,
+//   upsample_rates=[10,6], upsample_kernel_sizes=[20,12],
+//   resblock_kernel_sizes=[3,7,11], resblock_dilation_sizes=[[1,3,5]]x3,
+//   gen_istft_n_fft=20, gen_istft_hop_size=5,
+//   m_source: harmonic_num=8 (dim=9), upsample_scale=300, voiced_threshold=10.
+//
+// All conv/linear weights are PyTorch row-major and weight_norm-fused:
+//   Conv1d weight       [Cout, Cin, K]
+//   ConvTranspose1d wt  [Cin,  Cout, K]
+//   Linear weight       [out,  in]
+//   AdaIN1d fc.weight    [2C,   style_dim]
+//   alpha (Snake)        [C]
+//
+// Determinism note: PyTorch's SineGen seeds an initial random phase
+// (`rand_ini`, zeroed for the fundamental) and SourceModuleHnNSF adds
+// Gaussian noise. For a reproducible / deterministic on-device renderer we
+// drop both (zero phase noise on every harmonic, zero additive noise), so the
+// harmonic source and the final audio will not bit-match a seeded PyTorch run
+// — they are validated by correlation + structure instead. Every other stage
+// (convs, resblocks, ups, conv_post, STFT/iSTFT) is exactly reproduced.
+//
+// Validated against per-stage reference activations regenerated from the
+// real hexgrad/Kokoro-82M v1.0 decoder weights — see kokoro-generator.h and
+// the integration notes returned with this work.
+
+#include "kokoro-generator.h"
+#include "kokoro-layers.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+#include <vector>
+
+#include <fstream>  // unconditional: the KOKORO_GEN_INJECT_HAR path needs std::ifstream too
+#ifdef KOKORO_GEN_DEBUG
+static void dbg_dump(const char * name, const float * d, size_t n) {  // raw f32 dump of d[0..n)
+    std::ofstream f(std::string("/tmp/gendbg_") + name + ".f32", std::ios::binary);
+    f.write(reinterpret_cast<const char *>(d), (std::streamsize) (n * sizeof(float)));
+}
+#define DBG(name, d, n) dbg_dump(name, d, n)
+#else
+#define DBG(name, d, n) do {} while (0)
+#endif
+
+namespace eliza_kokoro {
+
+namespace {
+
+static constexpr double K_PI = 3.14159265358979323846;
+
+// ----------------------------------------------------------------------------
+// Periodic Hann window of length N: w[i] = 0.5 - 0.5*cos(2*pi*i/N).
+// Matches scipy.get_window('hann', N, fftbins=True) / torch.hann_window(N,
+// periodic=True).
+// ----------------------------------------------------------------------------
+static std::vector<float> hann_periodic(int n) {
+    // Raised-cosine taps over one full period (denominator n, not n - 1).
+    std::vector<float> taps((size_t) n);
+    const double step = 2.0 * K_PI / (double) n;
+    for (int k = 0; k < n; ++k)
+        taps[(size_t) k] = (float) (0.5 - 0.5 * std::cos(step * (double) k));
+    return taps;
+}
+
+// ----------------------------------------------------------------------------
+// PyTorch F.interpolate(mode='linear', align_corners=False), 1D.
+// in: [C, T_in] (channel-major) -> out: [C, T_out].
+// Matches the half-pixel coordinate transform PyTorch uses by default.
+// ----------------------------------------------------------------------------
+static void interp_linear(const float * x, int C, int T_in, int T_out, float * y) {
+    const double ratio = (double) T_in / (double) T_out;
+    const int last = T_in - 1;
+    for (int dst = 0; dst < T_out; ++dst) {
+        // Half-pixel source coordinate (align_corners=False), floored at 0.
+        const double pos = std::max(((double) dst + 0.5) * ratio - 0.5, 0.0);
+        const int base = (int) std::floor(pos);
+        // The blend weight comes from the unclamped index; both taps are then
+        // clamped to the last sample, so at the right edge they coincide.
+        const double t = pos - (double) base;
+        const int lo = std::min(base, last);
+        const int hi = std::min(base + 1, last);
+        for (int ch = 0; ch < C; ++ch) {
+            const float * row = x + (size_t) ch * T_in;
+            y[(size_t) ch * T_out + dst] = (float) (row[lo] + (row[hi] - row[lo]) * t);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Forward STFT, center=True, return polar (magnitude, phase).
+// input: signal[N]; n_fft, hop, win (win==n_fft here). Reflect-pad n_fft/2
+// each side (PyTorch center=True default pad_mode='reflect'). Periodic Hann
+// window. Output: mag[F, n_frames], phase[F, n_frames], F=n_fft/2+1,
+// n_frames = N/hop + 1.
+// ----------------------------------------------------------------------------
+static void stft_center(const float * sig, int N, int n_fft, int hop, int win,
+                        std::vector<float> & mag, std::vector<float> & phase,
+                        int & F, int & n_frames) {
+    F = n_fft / 2 + 1;
+    const int pad = n_fft / 2;
+    const int Np = N + 2 * pad;
+    std::vector<float> padded((size_t) Np);  // value-initialised to 0
+    // reflect pad: padded[pad + i] = sig[i]; left/right reflect (no edge dup).
+    for (int i = 0; i < N; ++i) padded[(size_t) (pad + i)] = sig[i];
+    for (int i = 0; i < pad && N > 0; ++i) {  // N == 0: nothing to reflect (sig[-1]), keep zeros
+        padded[(size_t) (pad - 1 - i)] = sig[std::min(i + 1, N - 1)];
+        padded[(size_t) (pad + N + i)] = sig[std::max(N - 2 - i, 0)];
+    }
+    n_frames = N / hop + 1;
+    const std::vector<float> window = hann_periodic(win);
+    mag.assign((size_t) F * n_frames, 0.0f);
+    phase.assign((size_t) F * n_frames, 0.0f);
+    for (int t = 0; t < n_frames; ++t) {
+        const int off = t * hop;  // frame start inside the padded signal
+        for (int f = 0; f < F; ++f) {
+            double re = 0.0, im = 0.0;
+            const double w0 = -2.0 * K_PI * (double) f / (double) n_fft;
+            for (int k = 0; k < win; ++k) {  // direct DFT of the windowed frame, bin f
+                const double v = (double) padded[(size_t) (off + k)] * (double) window[(size_t) k];
+                const double ang = w0 * (double) k;
+                re += v * std::cos(ang);
+                im += v * std::sin(ang);
+            }
+            mag[(size_t) f * n_frames + t]   = (float) std::sqrt(re * re + im * im);
+            phase[(size_t) f * n_frames + t] = (float) std::atan2(im, re);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Inverse STFT, center=True. magnitude/phase: [F, n_frames] (polar). Periodic
+// Hann window, win==n_fft. Overlap-add with window^2 normalization, then trim
+// n_fft/2 from each end (the center=True crop). Output length = (n_frames-1)*hop.
+// ----------------------------------------------------------------------------
+static void istft_center(const float * mag, const float * phase,
+                         int n_fft, int hop, int win, int n_frames,
+                         std::vector<float> & out) {
+    if (n_frames <= 0) { out.clear(); return; }  // no frames: the crop length would go negative
+    const int F = n_fft / 2 + 1;
+    const int pad = n_fft / 2;
+    const int n_full = (n_frames - 1) * hop + win;  // before crop
+    std::vector<double> acc((size_t) n_full, 0.0);   // overlap-added windowed frames
+    std::vector<double> wsum((size_t) n_full, 0.0);  // overlap-added window^2 envelope
+    const std::vector<float> window = hann_periodic(win);
+    std::vector<double> frame((size_t) n_fft);
+    for (int t = 0; t < n_frames; ++t) {
+        // irfft of the hermitian spectrum -> n_fft real samples.
+        for (int n = 0; n < n_fft; ++n) {
+            double s = 0.0;
+            for (int f = 0; f < F; ++f) {
+                const double m = mag[(size_t) f * n_frames + t];
+                const double p = phase[(size_t) f * n_frames + t];
+                const double re = m * std::cos(p);
+                const double im = m * std::sin(p);
+                const double ang = 2.0 * K_PI * (double) f * (double) n / (double) n_fft;
+                double term = re * std::cos(ang) - im * std::sin(ang);
+                // Every bin except DC (and Nyquist for even n_fft) stands for a conjugate pair.
+                if (f != 0 && !(n_fft % 2 == 0 && f == F - 1)) term *= 2.0;
+                s += term;
+            }
+            frame[(size_t) n] = s / (double) n_fft;
+        }
+        const int off = t * hop;
+        for (int k = 0; k < win; ++k) {
+            const double w = (double) window[(size_t) k];
+            acc[(size_t) (off + k)]  += frame[(size_t) k] * w;
+            wsum[(size_t) (off + k)] += w * w;
+        }
+    }
+    for (int i = 0; i < n_full; ++i) {
+        if (wsum[(size_t) i] > 1e-11) acc[(size_t) i] /= wsum[(size_t) i];
+    }
+    const int n_out = n_full - 2 * pad;
+    out.assign((size_t) n_out, 0.0f);
+    for (int i = 0; i < n_out; ++i) out[(size_t) i] = (float) acc[(size_t) (i + pad)];
+}
+
+// ----------------------------------------------------------------------------
+// AdaINResBlock1.forward (Snake activation). x[C,T] in place.
+// Three sub-blocks: adain1 -> snake(alpha1) -> conv1[dilated] ->
+//                   adain2 -> snake(alpha2) -> conv2[dilation 1] -> residual add.
+// convs1 dilations = dil[0..2]; convs2 dilation = 1.
+// padding = get_padding(K, d) = (K*d - d)/2.
+// ----------------------------------------------------------------------------
+static int get_padding(int k, int d) { return (k - 1) * d / 2; }  // "same" padding for a dilated kernel
+
+static void adain_resblock1(const GenAdaResBlockWeights & w,
+                            float * x, int C, int T, const float * s, int Sdim,
+                            int K, const int dil[3]) {
+    const size_t n = (size_t) C * T;
+    std::vector<float> branch(n), mid(n), res(n);  // scratch shared by all three sub-blocks
+    for (int blk = 0; blk < 3; ++blk) {
+        const GenSubBlockWeights & sw = w.sub[blk];
+        std::copy(x, x + n, branch.begin());  // the branch works on a copy; x stays the skip path
+        adain1d_forward(branch.data(), C, T, s, Sdim, sw.adain1_fc_w, sw.adain1_fc_b);
+        snake1d_forward(branch.data(), C, T, sw.alpha1);
+        conv1d_forward(branch.data(), C, T, sw.conv1_w, sw.conv1_b, C, K, 1,
+                       get_padding(K, dil[blk]), dil[blk], mid.data(), T);
+        adain1d_forward(mid.data(), C, T, s, Sdim, sw.adain2_fc_w, sw.adain2_fc_b);
+        snake1d_forward(mid.data(), C, T, sw.alpha2);
+        conv1d_forward(mid.data(), C, T, sw.conv2_w, sw.conv2_b, C, K, 1,
+                       get_padding(K, 1), 1, res.data(), T);
+        for (size_t q = 0; q < n; ++q) x[q] += res[q];  // residual add, in place
+    }
+}
+
+} // namespace
+
+// kokoro_generator_forward: x_in [512, T0] + style s + F0 curve -> audio
+// (T0 * 300 samples, sr = 24000): harmonic source -> STFT -> two upsample
+// stages -> conv_post -> iSTFT. audio is left empty if T0 <= 0 or shapes disagree.
+void kokoro_generator_forward(
+        const float * x_in,      // [512, T0]
+        int T0,                  // input time (== 2 * predictor T_frame)
+        const float * s,         // [128]
+        const float * f0_curve,  // [T0]
+        const GeneratorWeights & w,
+        std::vector<float> & audio /* out [T0 * 300] */) {
+
+    const int Sdim   = 128;
+    const int up0    = 10, up1 = 6;    // upsample rates
+    const int hop    = 5;
+    const int n_fft  = 20, win = 20;
+    const int upsample_scale = up0 * up1 * hop;  // 300
+    const int dim = 9;  // harmonic_num (8) + fundamental
+    const float sine_amp = 0.1f;
+    const float voiced_threshold = 10.0f;
+    const float sr = 24000.0f;
+    // Empty input: the reflect pads / crops below all assume at least one frame.
+    if (T0 <= 0) { audio.clear(); return; }
+
+    // ------------------------------------------------------------------
+    // 1. Harmonic source.  f0_up = nearest-upsample(f0_curve) x300.
+    // ------------------------------------------------------------------
+    const int L = T0 * upsample_scale;  // e.g. 79200 for T0 == 264
+    std::vector<float> f0_up((size_t) L);
+    for (int t = 0; t < L; ++t) f0_up[(size_t) t] = f0_curve[t / upsample_scale];
+
+    // fn = f0 * [1..9]; rad = (fn / sr) % 1 ; [dim, L] channel-major.
+    std::vector<float> rad((size_t) dim * L);
+    for (int h = 0; h < dim; ++h) {
+        const float mul = (float) (h + 1);
+        for (int t = 0; t < L; ++t) {
+            float v = f0_up[(size_t) t] * mul / sr;
+            v = v - std::floor(v);  // % 1
+            rad[(size_t) h * L + t] = v;
+        }
+    }
+    // Deterministic: rad[:,0,:] += 0 (no random initial phase).
+    // _f02sine: downsample rad by 1/upsample_scale (linear), cumsum*2pi,
+    // upsample by *upsample_scale (with phase scaled by upsample_scale), sin.
+    const int Lds = L / upsample_scale;  // == T0
+    std::vector<float> rad_ds((size_t) dim * Lds);
+    interp_linear(rad.data(), dim, L, Lds, rad_ds.data());
+    // cumsum over time per channel, * 2*pi.
+    std::vector<float> phase_ds((size_t) dim * Lds);
+    for (int h = 0; h < dim; ++h) {
+        double cum = 0.0;
+        for (int t = 0; t < Lds; ++t) {
+            cum += (double) rad_ds[(size_t) h * Lds + t];
+            phase_ds[(size_t) h * Lds + t] = (float) (cum * 2.0 * K_PI);
+        }
+    }
+    // F.interpolate(phase * upsample_scale, scale=upsample_scale, linear).
+    std::vector<float> phase_scaled((size_t) dim * Lds);
+    for (size_t i = 0; i < (size_t) dim * Lds; ++i)
+        phase_scaled[i] = phase_ds[i] * (float) upsample_scale;
+    std::vector<float> phase_up((size_t) dim * L);
+    interp_linear(phase_scaled.data(), dim, Lds, L, phase_up.data());
+    // sines = sin(phase) * sine_amp ; voiced mask uv = (f0 > thr).
+    // sine_waves = sines * uv  (+ noise = 0). sine_merge = tanh(l_linear(sine_waves)).
+    std::vector<float> sine_waves((size_t) dim * L);
+    for (int h = 0; h < dim; ++h) {
+        for (int t = 0; t < L; ++t) {
+            const float uv = (f0_up[(size_t) t] > voiced_threshold) ? 1.0f : 0.0f;
+            sine_waves[(size_t) h * L + t] =
+                std::sin(phase_up[(size_t) h * L + t]) * sine_amp * uv;
+        }
+    }
+    // l_linear: Linear(9 -> 1). weight [1, 9], bias [1]. har_source[L].
+    std::vector<float> har_source((size_t) L);
+    for (int t = 0; t < L; ++t) {
+        double acc = w.l_linear_b[0];
+        for (int h = 0; h < dim; ++h)
+            acc += (double) w.l_linear_w[h] * (double) sine_waves[(size_t) h * L + t];
+        har_source[(size_t) t] = std::tanh((float) acc);
+    }
+    DBG("har_source", har_source.data(), har_source.size());
+
+    // ------------------------------------------------------------------
+    // 2. STFT of har_source -> har = cat(mag, phase) [22, n_frames].
+    // ------------------------------------------------------------------
+    std::vector<float> hmag, hphase;
+    int F = 0, n_frames = 0;
+    stft_center(har_source.data(), L, n_fft, hop, win, hmag, hphase, F, n_frames);
+    const int Hc = 2 * F;  // 22
+    std::vector<float> har((size_t) Hc * n_frames);
+    std::memcpy(har.data(), hmag.data(), sizeof(float) * (size_t) F * n_frames);
+    std::memcpy(har.data() + (size_t) F * n_frames, hphase.data(),
+                sizeof(float) * (size_t) F * n_frames);
+    DBG("har", har.data(), har.size());
+#ifdef KOKORO_GEN_INJECT_HAR
+    {
+        std::ifstream hf(KOKORO_GEN_INJECT_HAR, std::ios::binary);
+        if (hf) { hf.read((char *) har.data(), sizeof(float) * har.size()); }
+    }
+#endif
+
+    // ------------------------------------------------------------------
+    // 3. Upsample stages.
+    // ------------------------------------------------------------------
+    const int ups_rate[2]   = { up0, up1 };
+    const int ups_k[2]      = { 20, 12 };
+    const int ch_after[2]   = { 256, 128 };   // upsample_initial_channel >> (i+1)
+    const int dil135[3]     = { 1, 3, 5 };
+
+    // current x: [512, T0] (copy, mutable).
+    int curC = 512, curT = T0;
+    std::vector<float> x(x_in, x_in + (size_t) curC * curT);
+
+    for (int i = 0; i < 2; ++i) {
+        const int u = ups_rate[i], k = ups_k[i];
+        const int outC = ch_after[i];
+
+        // leaky_relu(x, 0.1) (in place).
+        leaky_relu(x.data(), curC * curT, 0.1f);
+
+        // x_source = noise_convs[i](har); then noise_res[i](x_source, s).
+        // noise_convs[0]: Conv1d(22->256, k=stride_f0*2=12, stride=6, pad=(6+1)/2=3).
+        // noise_convs[1]: Conv1d(22->128, k=1, stride=1, pad=0).
+        std::vector<float> x_source;
+        int xsT = 0;
+        if (i == 0) {
+            const int stride_f0 = up1;          // prod(upsample_rates[1:]) = 6
+            const int nk = stride_f0 * 2;       // 12
+            const int npad = (stride_f0 + 1) / 2;  // 3
+            xsT = (n_frames + 2 * npad - (nk - 1) - 1) / stride_f0 + 1;
+            x_source.assign((size_t) outC * xsT, 0.0f);
+            conv1d_forward(har.data(), Hc, n_frames, w.noise_convs_w[i], w.noise_convs_b[i],
+                           outC, nk, stride_f0, npad, 1, x_source.data(), xsT);
+        } else {
+            xsT = n_frames;  // k=1 stride=1 pad=0
+            x_source.assign((size_t) outC * xsT, 0.0f);
+            conv1d_forward(har.data(), Hc, n_frames, w.noise_convs_w[i], w.noise_convs_b[i],
+                           outC, 1, 1, 0, 1, x_source.data(), xsT);
+        }
+        if (i == 0) DBG("noise_convs0", x_source.data(), x_source.size());
+        const int nr_k = (i == 0) ? 7 : 11;
+        adain_resblock1(w.noise_res[i], x_source.data(), outC, xsT, s, Sdim, nr_k, dil135);
+        if (i == 0) DBG("noise_res0", x_source.data(), x_source.size());
+
+        // x = ups[i](x). ConvTranspose1d(curC -> outC, k, stride=u, pad=(k-u)/2).
+        const int tpad = (k - u) / 2;
+        const int upT = convtranspose1d_out_len(curT, k, u, tpad, /*output_pad*/0);
+        std::vector<float> xup((size_t) outC * upT);
+        convtranspose1d_forward(x.data(), curC, curT, w.ups_w[i], w.ups_b[i], outC, k,
+                                u, tpad, 0, xup.data(), upT);
+
+        // Final stage: reflection_pad(1,0) -> T grows by 1 on the left.
+        int xT = upT;
+        std::vector<float> xpad;
+        const float * xptr = xup.data();
+        if (i == 1) {
+            xT = upT + 1;
+            xpad.assign((size_t) outC * xT, 0.0f);
+            for (int c = 0; c < outC; ++c) {
+                // ReflectionPad1d((1,0)): left pad reflects index 1.
+                xpad[(size_t) c * xT + 0] = xup[(size_t) c * upT + 1];
+                std::memcpy(xpad.data() + (size_t) c * xT + 1,
+                            xup.data() + (size_t) c * upT, sizeof(float) * (size_t) upT);
+            }
+            xptr = xpad.data();
+        }
+
+        // x = x + x_source; a shape mismatch would read out of bounds, so bail out.
+        if (xT != xsT) { audio.clear(); return; }
+        std::vector<float> xs((size_t) outC * xT);
+        for (size_t j = 0; j < (size_t) outC * xT; ++j) xs[j] = xptr[j] + x_source[j];
+
+        // resblocks: xs_sum = sum_j resblock[i*3+j](x, s) / 3.
+        std::vector<float> accum((size_t) outC * xT, 0.0f);
+        const int rb_k[3] = { 3, 7, 11 };
+        for (int j = 0; j < 3; ++j) {
+            std::vector<float> rb(xs);  // copy current x
+            adain_resblock1(w.resblocks[i * 3 + j], rb.data(), outC, xT, s, Sdim,
+                            rb_k[j], dil135);
+            for (size_t q = 0; q < (size_t) outC * xT; ++q) accum[q] += rb[q];
+        }
+        x.assign((size_t) outC * xT, 0.0f);
+        for (size_t q = 0; q < (size_t) outC * xT; ++q) x[q] = accum[q] / 3.0f;
+        curC = outC; curT = xT;
+        DBG(i == 0 ? "stage0_x" : "stage1_x", x.data(), x.size());
+    }
+
+    // ------------------------------------------------------------------
+    // 4. conv_post -> spec/phase -> iSTFT.
+    // ------------------------------------------------------------------
+    leaky_relu(x.data(), curC * curT, 0.01f);  // F.leaky_relu (no slope arg) = default 0.01
+    const int cpC = n_fft + 2;  // 22
+    std::vector<float> cp((size_t) cpC * curT);
+    conv1d_forward(x.data(), curC, curT, w.conv_post_w, w.conv_post_b, cpC, 7,
+                   1, 3, 1, cp.data(), curT);
+    DBG("conv_post", cp.data(), cp.size());
+
+    // spec = exp(cp[:11]); phase = sin(cp[11:22]).
+    const int post_F = n_fft / 2 + 1;  // 11
+    std::vector<float> spec((size_t) post_F * curT), phase((size_t) post_F * curT);
+    for (int f = 0; f < post_F; ++f) {
+        for (int t = 0; t < curT; ++t) {
+            spec[(size_t) f * curT + t]  = std::exp(cp[(size_t) f * curT + t]);
+            phase[(size_t) f * curT + t] = std::sin(cp[(size_t) (post_F + f) * curT + t]);
+        }
+    }
+    istft_center(spec.data(), phase.data(), n_fft, hop, win, curT, audio);
+}
+
+} // namespace eliza_kokoro
diff --git a/tools/kokoro/src/kokoro-phonemes.cpp b/tools/kokoro/src/kokoro-phonemes.cpp
index 45cde3d20..1f9fe9619 100644
--- a/tools/kokoro/src/kokoro-phonemes.cpp
+++ b/tools/kokoro/src/kokoro-phonemes.cpp
@@ -1,104 +1,269 @@
 // SPDX-License-Identifier: MIT
 //
-// kokoro-phonemes.cpp — ASCII grapheme→phoneme mapping for the Kokoro
-// fork path. See kokoro-phonemes.h for the contract.
+// kokoro-phonemes.cpp — real G2P for the Kokoro fork path.
 //
-// The mapping is intentionally minimal: it matches the phoneme-id offsets
-// used by the kokoro-onnx tokenizer for the *single-character* phonemes only.
-// Multi-char espeak-ng phonemes (eɪ, oʊ, ʊə, etc) are NOT emitted — those
-// require a full G2P pass that is out of scope for this header. The
-// downstream synthesis still runs, just with degraded acoustic quality.
+// text → espeak-ng (en-us IPA) → per-codepoint Kokoro vocab lookup → ids.
+// See kokoro-phonemes.h for the contract.
+//
+// The vocab table is embedded (generated from
+// `tts/kokoro/tokenizer.json` model.vocab). Every vocab key is a single
+// Unicode codepoint; the IPA string is decoded codepoint-by-codepoint and
+// each codepoint is mapped to its id. This reproduces the kokoro reference
+// ids exactly (validated against reference-ids.json).
+//
+// When the build links libespeak-ng (-DKOKORO_USE_ESPEAK), phonemize_ipa()
+// runs the real G2P. Otherwise the TS layer (which already runs espeak)
+// supplies the IPA and the caller uses ipa_to_token_ids() directly.
 
 #include "kokoro-phonemes.h"
 
 #include <algorithm>
-#include <cctype>
-#include <unordered_map>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#if defined(KOKORO_USE_ESPEAK)
+#include <espeak-ng/speak_lib.h>
+#include <mutex>
+#endif
 
 namespace eliza_kokoro {
 
 namespace {
 
-// Special tokens (match upstream kokoro-onnx).
-static constexpr int32_t TOK_PAD = 0;
-static constexpr int32_t TOK_BOS = 1;
-static constexpr int32_t TOK_EOS = 2;
-static constexpr int32_t TOK_BLANK = 3;
-
-// First non-special id (matches kokoro-onnx tokenizer offset).
-static constexpr int32_t PHONEME_OFFSET = 4;
-
-// Coarse ASCII letter → phoneme-id table. The id space follows the
-// kokoro-onnx tokenizer for single-letter mappings; everything else falls
-// back to TOK_BLANK so the synthesis path still emits a valid sequence.
-static const std::unordered_map<char, int32_t> & letter_table() {
-    static const std::unordered_map<char, int32_t> kTable = {
-        {'a', PHONEME_OFFSET + 0},
-        {'b', PHONEME_OFFSET + 1},
-        {'c', PHONEME_OFFSET + 2},
-        {'d', PHONEME_OFFSET + 3},
-        {'e', PHONEME_OFFSET + 4},
-        {'f', PHONEME_OFFSET + 5},
-        {'g', PHONEME_OFFSET + 6},
-        {'h', PHONEME_OFFSET + 7},
-        {'i', PHONEME_OFFSET + 8},
-        {'j', PHONEME_OFFSET + 9},
-        {'k', PHONEME_OFFSET + 10},
-        {'l', PHONEME_OFFSET + 11},
-        {'m', PHONEME_OFFSET + 12},
-        {'n', PHONEME_OFFSET + 13},
-        {'o', PHONEME_OFFSET + 14},
-        {'p', PHONEME_OFFSET + 15},
-        {'q', PHONEME_OFFSET + 16},
-        {'r', PHONEME_OFFSET + 17},
-        {'s', PHONEME_OFFSET + 18},
-        {'t', PHONEME_OFFSET + 19},
-        {'u', PHONEME_OFFSET + 20},
-        {'v', PHONEME_OFFSET + 21},
-        {'w', PHONEME_OFFSET + 22},
-        {'x', PHONEME_OFFSET + 23},
-        {'y', PHONEME_OFFSET + 24},
-        {'z', PHONEME_OFFSET + 25},
-        // Punctuation gets dedicated ids so the rhythm predictor sees them.
-        {' ', PHONEME_OFFSET + 26},
-        {'.', PHONEME_OFFSET + 27},
-        {',', PHONEME_OFFSET + 28},
-        {'!', PHONEME_OFFSET + 29},
-        {'?', PHONEME_OFFSET + 30},
-        {';', PHONEME_OFFSET + 31},
-        {':', PHONEME_OFFSET + 32},
-        {'\'', PHONEME_OFFSET + 33},
+// Embedded Kokoro vocab: codepoint → id, sorted by codepoint for binary search.
+// Generated from tokenizer.json model.vocab (115 entries, max id 177).
+struct VocabEntry {
+    char32_t cp;  // Unicode codepoint (the key kVocab is sorted by)
+    int32_t  id;  // Kokoro vocab id for that codepoint
+};
+
+constexpr VocabEntry kVocab[] = {
+    {0x0020u, 16}, {0x0021u, 5}, {0x0022u, 11}, {0x0024u, 0},
+    {0x0028u, 12}, {0x0029u, 13}, {0x002Cu, 3}, {0x002Eu, 4},
+    {0x003Au, 2}, {0x003Bu, 1}, {0x003Fu, 6}, {0x0041u, 24},
+    {0x0049u, 25}, {0x004Fu, 31}, {0x0051u, 33}, {0x0053u, 35},
+    {0x0054u, 36}, {0x0057u, 39}, {0x0059u, 41}, {0x0061u, 43},
+    {0x0062u, 44}, {0x0063u, 45}, {0x0064u, 46}, {0x0065u, 47},
+    {0x0066u, 48}, {0x0068u, 50}, {0x0069u, 51}, {0x006Au, 52},
+    {0x006Bu, 53}, {0x006Cu, 54}, {0x006Du, 55}, {0x006Eu, 56},
+    {0x006Fu, 57}, {0x0070u, 58}, {0x0071u, 59}, {0x0072u, 60},
+    {0x0073u, 61}, {0x0074u, 62}, {0x0075u, 63}, {0x0076u, 64},
+    {0x0077u, 65}, {0x0078u, 66}, {0x0079u, 67}, {0x007Au, 68},
+    {0x00E6u, 72}, {0x00E7u, 78}, {0x00F0u, 81}, {0x00F8u, 116},
+    {0x014Bu, 112}, {0x0153u, 120}, {0x0250u, 70}, {0x0251u, 69},
+    {0x0252u, 71}, {0x0254u, 76}, {0x0255u, 77}, {0x0256u, 80},
+    {0x0259u, 83}, {0x025Au, 85}, {0x025Bu, 86}, {0x025Cu, 87},
+    {0x025Fu, 90}, {0x0261u, 92}, {0x0263u, 139}, {0x0264u, 140},
+    {0x0265u, 99}, {0x0268u, 101}, {0x026Au, 102}, {0x026Fu, 110},
+    {0x0270u, 111}, {0x0272u, 114}, {0x0273u, 113}, {0x0274u, 115},
+    {0x0278u, 118}, {0x0279u, 123}, {0x027Bu, 126}, {0x027Du, 129},
+    {0x027Eu, 125}, {0x0281u, 128}, {0x0282u, 130}, {0x0283u, 131},
+    {0x0288u, 132}, {0x028Au, 135}, {0x028Bu, 136}, {0x028Cu, 138},
+    {0x028Eu, 143}, {0x0292u, 147}, {0x0294u, 148}, {0x029Du, 103},
+    {0x02A3u, 18}, {0x02A4u, 82}, {0x02A5u, 19}, {0x02A6u, 20},
+    {0x02A7u, 133}, {0x02A8u, 21}, {0x02B0u, 162}, {0x02B2u, 164},
+    {0x02C8u, 156}, {0x02CCu, 157}, {0x02D0u, 158}, {0x0303u, 17},
+    {0x03B2u, 75}, {0x03B8u, 119}, {0x03C7u, 142}, {0x1D4Au, 42},
+    {0x1D5Du, 22}, {0x1D7Bu, 177}, {0x2014u, 9}, {0x201Cu, 14},
+    {0x201Du, 15}, {0x2026u, 10}, {0x2192u, 171}, {0x2193u, 169},
+    {0x2197u, 172}, {0x2198u, 173}, {0xAB67u, 23},
+};
+
+constexpr size_t kVocabSize = sizeof(kVocab) / sizeof(kVocab[0]);
+constexpr int32_t kMaxId = 177;
+
+// Decode the next UTF-8 codepoint from s starting at i. Advances i past the
+// consumed bytes. Returns (char32_t)-1 on a malformed byte (and advances by 1
+// to guarantee forward progress).
+char32_t next_codepoint(const std::string & s, size_t & i) noexcept {
+    const unsigned char c0 = static_cast<unsigned char>(s[i]);
+    if (c0 < 0x80u) {
+        i += 1;
+        return c0;
+    }
+    auto cont = [&](size_t k) -> bool {
+        return i + k < s.size() &&
+               (static_cast<unsigned char>(s[i + k]) & 0xC0u) == 0x80u;
     };
-    return kTable;
+    // Multi-byte forms are accepted only when canonical: overlong encodings,
+    // UTF-16 surrogates and values above U+10FFFF fall through to the
+    // malformed-byte path below instead of aliasing a valid codepoint.
+    if ((c0 & 0xE0u) == 0xC0u && cont(1)) {
+        const char32_t cp = ((c0 & 0x1Fu) << 6) |
+                            (static_cast<unsigned char>(s[i + 1]) & 0x3Fu);
+        if (cp >= 0x80u) { i += 2; return cp; }
+    }
+    if ((c0 & 0xF0u) == 0xE0u && cont(1) && cont(2)) {
+        const char32_t cp = ((c0 & 0x0Fu) << 12) |
+                            ((static_cast<unsigned char>(s[i + 1]) & 0x3Fu) << 6) |
+                            (static_cast<unsigned char>(s[i + 2]) & 0x3Fu);
+        if (cp >= 0x800u && (cp < 0xD800u || cp > 0xDFFFu)) { i += 3; return cp; }
+    }
+    if ((c0 & 0xF8u) == 0xF0u && cont(1) && cont(2) && cont(3)) {
+        const char32_t cp = ((c0 & 0x07u) << 18) |
+                            ((static_cast<unsigned char>(s[i + 1]) & 0x3Fu) << 12) |
+                            ((static_cast<unsigned char>(s[i + 2]) & 0x3Fu) << 6) |
+                            (static_cast<unsigned char>(s[i + 3]) & 0x3Fu);
+        if (cp >= 0x10000u && cp <= 0x10FFFFu) { i += 4; return cp; }
+    }
+    i += 1;
+    return static_cast<char32_t>(-1);
+}
+
+#if defined(KOKORO_USE_ESPEAK)
+
+// espeak-ng is process-global and not thread-safe; serialize init + calls.
+std::mutex & espeak_mutex() {
+    static std::mutex m;  // one process-wide lock, constructed on first call
+    return m;
+}
+
+bool ensure_espeak_init() {
+    static bool ok = [] {
+        // AUDIO_OUTPUT_SYNCHRONOUS so no audio device is opened.
+        // espeakINITIALIZE_DONT_EXIT: without it espeak-ng calls exit(1) when
+        // its data directory is missing, killing the host before we can fall back.
+        const int rate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr,
+                                           espeakINITIALIZE_DONT_EXIT);
+        return rate >= 0 && espeak_SetVoiceByName("en-us") == EE_OK;
+    }();
+    return ok;
+}
+
+// Run espeak over text, accumulating IPA across all clauses. espeak returns
+// one clause per call (stopping at sentence/comma punctuation) and advances
+// the text pointer; we join clauses with a single space to match the
+// `espeak-ng -q --ipa` binary output the reference was derived from.
+std::string espeak_text_to_ipa(const std::string & text) {
+    std::lock_guard<std::mutex> lock(espeak_mutex());  // held across init and every clause call
+    if (!ensure_espeak_init()) {
+        return std::string();  // init failed: empty IPA, which maps to no ids
+    }
+    const void * inptr = static_cast<const void *>(text.c_str());
+    const int textmode = espeakCHARS_UTF8;
+    const int phmode = espeakPHONEMES_IPA; // bits 0-2 = 2 → IPA names
+    std::string out;
+    while (inptr != nullptr) {  // NOTE(review): presumably espeak nulls inptr at end of text — confirm
+        const char * clause =
+            espeak_TextToPhonemes(&inptr, textmode, phmode);
+        if (clause == nullptr) {
+            break;
+        }
+        // Trim leading/trailing whitespace espeak may attach to a clause.
+        std::string c(clause);
+        const size_t b = c.find_first_not_of(" \t\n\r");
+        if (b == std::string::npos) {
+            continue; // whitespace-only clause
+        }
+        const size_t e = c.find_last_not_of(" \t\n\r");
+        c = c.substr(b, e - b + 1);
+        if (!out.empty()) {
+            out.push_back(' ');  // single-space clause separator
+        }
+        out += c;
+    }
+    return out;
 }
 
+#endif // KOKORO_USE_ESPEAK
+
 } // namespace
 
-std::vector<int32_t> phonemize_ascii(const std::string & text) {
-    std::vector<int32_t> out;
-    out.reserve(text.size() + 4);
-    out.push_back(TOK_BOS);
-
-    const auto & table = letter_table();
-    for (char c : text) {
-        const char lc = (char) std::tolower((unsigned char) c);
-        auto it = table.find(lc);
-        if (it == table.end()) {
-            out.push_back(TOK_BLANK);
+int32_t kokoro_codepoint_to_id(char32_t cp) noexcept {
+    // Binary search over the codepoint-sorted table.
+    size_t lo = 0;
+    size_t hi = kVocabSize;  // half-open search window [lo, hi)
+    while (lo < hi) {
+        const size_t mid = lo + (hi - lo) / 2;
+        if (kVocab[mid].cp < cp) {
+            lo = mid + 1;
+        } else if (kVocab[mid].cp > cp) {
+            hi = mid;
         } else {
-            out.push_back(it->second);
+            return kVocab[mid].id;
+        }
+    }
+    return -1;  // codepoint is not in the vocab
+}
+
+std::vector<int32_t> ipa_to_token_ids(const std::string & ipa) {
+    std::vector<int32_t> ids;
+    ids.reserve(ipa.size());  // upper bound: at most one id per input byte
+    size_t i = 0;
+    while (i < ipa.size()) {
+        const char32_t cp = next_codepoint(ipa, i);  // advances i by at least one byte
+        if (cp == static_cast<char32_t>(-1)) {
+            continue; // malformed byte, already advanced
+        }
+        const int32_t id = kokoro_codepoint_to_id(cp);
+        if (id >= 0) {
+            ids.push_back(id);
         }
-        // Hard cap at the BERT encoder's 510-token limit (including specials).
-        if (out.size() >= 509) break;
+        // Unmapped codepoints are dropped (reference behavior).
     }
+    return ids;
+}
 
-    out.push_back(TOK_EOS);
+std::vector<int32_t> phonemize_ipa(const std::string & text) {
+#if defined(KOKORO_USE_ESPEAK)
+    return ipa_to_token_ids(espeak_text_to_ipa(text));  // text -> IPA -> vocab ids (no pad wrapping)
+#else
+    (void) text;  // parameter unused in this build
+    return std::vector<int32_t>(); // caller must supply IPA via ipa_to_token_ids
+#endif
+}
+
+std::vector<int32_t> wrap_input_ids(const std::vector<int32_t> & ids) {
+    std::vector<int32_t> out;
+    out.reserve(ids.size() + 2);
+    out.push_back(KOKORO_PAD_ID);  // leading pad
+    // Kokoro's BERT encoder caps the phoneme run at 510 (512 with both pads).
+    const size_t cap = std::min<size_t>(ids.size(), 510);
+    out.insert(out.end(), ids.begin(), ids.begin() + static_cast<std::ptrdiff_t>(cap));  // ids past cap are dropped
+    out.push_back(KOKORO_PAD_ID);  // trailing pad
     return out;
 }
 
+std::vector<int32_t> phonemize_to_input_ids(const std::string & text) {
+    return wrap_input_ids(phonemize_ipa(text));  // pad-wrapped ids; just the two pads if phonemize_ipa yields none
+}
+
+bool espeak_available() noexcept {
+#if defined(KOKORO_USE_ESPEAK)
+    std::lock_guard<std::mutex> lock(espeak_mutex());
+    return ensure_espeak_init();  // side effect: the first call performs the one-time espeak init
+#else
+    return false;  // built without KOKORO_USE_ESPEAK
+#endif
+}
+
 int phoneme_vocab_size() noexcept {
-    // 4 specials + 34 mapped + 140 unused slots = 178 (Kokoro v1.0 vocab).
-    return 178;
+    return kMaxId + 1; // 178
+}
+
+// --- Legacy ASCII fallback ---------------------------------------------------
+//
+// Retained for callers not yet migrated to the espeak path. Maps ASCII letters
+// to their direct vocab ids (the lowercase Latin block is in-vocab) and wraps
+// with the pad token. This is degraded G2P (graphemes, not phonemes) but emits
+// a valid id sequence in the same space as the real path.
+std::vector<int32_t> phonemize_ascii(const std::string & text) {
+    std::vector<int32_t> ids;
+    ids.reserve(text.size());
+    for (char ch : text) {
+        // Bytes >= 0x80 are fragments of multi-byte UTF-8 sequences, not
+        // Latin-1 codepoints: looking them up raw would alias e.g. the 0xF0
+        // lead byte of an emoji onto U+00F0 (id 81). Skip them.
+        if (static_cast<unsigned char>(ch) >= 0x80u) continue;
+        const char lc =
+            (ch >= 'A' && ch <= 'Z') ? static_cast<char>(ch - 'A' + 'a') : ch;
+        const int32_t id = kokoro_codepoint_to_id(
+            static_cast<char32_t>(static_cast<unsigned char>(lc)));
+        if (id >= 0) ids.push_back(id);  // unmapped ASCII is dropped
+        if (ids.size() >= 510) break;    // wrap_input_ids caps the run at 510 anyway
+    }
+    return wrap_input_ids(ids);
 }
 
 } // namespace eliza_kokoro
diff --git a/tools/kokoro/src/kokoro.cpp b/tools/kokoro/src/kokoro.cpp
index aeb11cfa1..856f93882 100644
--- a/tools/kokoro/src/kokoro.cpp
+++ b/tools/kokoro/src/kokoro.cpp
@@ -30,6 +30,8 @@
 #include "kokoro.h"
 #include "kokoro-istft.h"
 #include "kokoro-phonemes.h"
+#include "kokoro-predictor.h"
+#include "kokoro-decoder.h"
 #include "kokoro-tensor-names.h"
 
 #include "ggml.h"
@@ -85,7 +87,12 @@ struct kokoro_model {
     // ggml backend ownership.
     ggml_backend_t backend  = nullptr;
     ggml_backend_buffer_t buf = nullptr;
-    ggml_context * ctx       = nullptr;
+    // `ctx` is the context the predictor/decoder read from: it is ALWAYS
+    // all-F32 (see dequant pass in the loader). `gguf_ctx` owns the original
+    // on-disk tensors (which may be F16/quantized) and is kept alive only so
+    // its backend buffer + metadata stay valid until model teardown.
+    ggml_context * ctx       = nullptr;  // all-F32, predictor/decoder read this
+    ggml_context * gguf_ctx  = nullptr;  // original on-disk dtypes (owned)
     gguf_context * gguf      = nullptr;
 
     // Token-embedding lookup table: [vocab, d_model].
@@ -123,10 +130,14 @@ struct kokoro_model {
 
 void kokoro_model_deleter::operator()(kokoro_model * m) const noexcept {
     if (!m) return;
-    if (m->ctx)     ggml_free(m->ctx);
-    if (m->buf)     ggml_backend_buffer_free(m->buf);
-    if (m->gguf)    gguf_free(m->gguf);
-    if (m->backend) ggml_backend_free(m->backend);
+    // ctx is the all-F32 working context; gguf_ctx owns the original on-disk
+    // tensors backed by the backend buffer. Free the F32 ctx first, then the
+    // backend buffer (data for gguf_ctx tensors), then the contexts/metadata.
+    if (m->ctx && m->ctx != m->gguf_ctx) ggml_free(m->ctx);
+    if (m->buf)      ggml_backend_buffer_free(m->buf);
+    if (m->gguf_ctx) ggml_free(m->gguf_ctx);
+    if (m->gguf)     gguf_free(m->gguf);
+    if (m->backend)  ggml_backend_free(m->backend);
     delete m;
 }
 
@@ -211,10 +222,13 @@ kokoro_model_ptr kokoro_load_model(
 
     auto model = std::unique_ptr<kokoro_model, kokoro_model_deleter>(new kokoro_model());
 
-    // First pass: parse the GGUF metadata without backing the tensors.
+    // First pass: parse the GGUF metadata without backing the tensors. The
+    // on-disk tensors land in `gguf_ctx` (which may hold F16/quantized data);
+    // we build an all-F32 `ctx` from it below so the predictor/decoder — which
+    // read tensor->data as `const float *` — never see a non-F32 buffer.
     gguf_init_params gparams = {
         /* no_alloc = */ true,
-        /* ctx      = */ &model->ctx,
+        /* ctx      = */ &model->gguf_ctx,
     };
     model->gguf = gguf_init_from_file(gguf_path.c_str(), gparams);
     if (!model->gguf) {
@@ -260,14 +274,15 @@ kokoro_model_ptr kokoro_load_model(
         return {nullptr, kokoro_model_deleter{}};
     }
 
-    // Second pass: allocate the tensor data through the backend.
-    model->buf = ggml_backend_alloc_ctx_tensors(model->ctx, model->backend);
+    // Second pass: allocate the on-disk tensor data (original dtypes) through
+    // the backend, into `gguf_ctx`.
+    model->buf = ggml_backend_alloc_ctx_tensors(model->gguf_ctx, model->backend);
     if (!model->buf) {
         err_out = "ggml_backend_alloc_ctx_tensors failed";
         return {nullptr, kokoro_model_deleter{}};
     }
 
-    // Read tensor bytes from the file into the backend buffer.
+    // Read tensor bytes from the file into the backend buffer (gguf_ctx).
     {
         std::ifstream fin(gguf_path, std::ios::binary);
         if (!fin) {
@@ -277,7 +292,7 @@ kokoro_model_ptr kokoro_load_model(
         const int64_t n_tensors = gguf_get_n_tensors(model->gguf);
         for (int64_t i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name(model->gguf, i);
-            ggml_tensor * t = ggml_get_tensor(model->ctx, name);
+            ggml_tensor * t = ggml_get_tensor(model->gguf_ctx, name);
             if (!t) continue;
             const size_t offset = gguf_get_tensor_offset(model->gguf, i)
                                 + gguf_get_data_offset(model->gguf);
@@ -293,6 +308,79 @@ kokoro_model_ptr kokoro_load_model(
         }
     }
 
+    // DTYPE NORMALIZATION (issue #9588). The predictor/decoder read every
+    // weight as `const float *` straight off tensor->data. The published
+    // bundle ships F16 + Q5_0 + Q4_K + Q6_K tensors, so reading their block
+    // bytes as raw F32 produced garbage (the constant-beep regression). Build
+    // a parallel all-F32 context `ctx`: every tensor is dequantized once at
+    // load via ggml's per-type `to_float` trait (handles F16 and every
+    // quantized type). The predictor/decoder then read `ctx` and never touch a
+    // non-F32 buffer. The dequantized weights match the all-F32 GGUF up to
+    // quantization noise (validated: max-abs-error 0.255 over 457 tensors).
+    {
+        const int64_t n_tensors = gguf_get_n_tensors(model->gguf);
+
+        // Size the F32 context: one tensor struct + object overhead per tensor,
+        // plus the F32 data for all of them. ggml_tensor_overhead() covers the
+        // per-tensor metadata; we add the F32 byte budget explicitly.
+        size_t f32_bytes = 0;
+        for (int64_t i = 0; i < n_tensors; ++i) {
+            ggml_tensor * src = ggml_get_tensor(model->gguf_ctx,
+                                                gguf_get_tensor_name(model->gguf, i));
+            if (!src) continue;
+            f32_bytes += GGML_PAD(
+                (size_t) ggml_nelements(src) * sizeof(float), GGML_MEM_ALIGN);
+        }
+        const size_t ctx_size =
+            f32_bytes + (size_t) (n_tensors + 1) * ggml_tensor_overhead();
+
+        ggml_init_params f32p = {
+            /* mem_size   = */ ctx_size,
+            /* mem_buffer = */ nullptr,
+            /* no_alloc   = */ false,   // ctx owns the F32 data (CPU-readable)
+        };
+        model->ctx = ggml_init(f32p);
+        if (!model->ctx) {
+            err_out = "ggml_init for F32 context failed";
+            return {nullptr, kokoro_model_deleter{}};
+        }
+
+        // Copy tensor bytes out with ggml_backend_tensor_get(): src->data is only
+        // a host-dereferenceable pointer when the weights buffer is host memory.
+        std::vector<uint8_t> raw;  // host staging for F16/quantized blocks
+        for (int64_t i = 0; i < n_tensors; ++i) {
+            const char * name = gguf_get_tensor_name(model->gguf, i);
+            ggml_tensor * src = ggml_get_tensor(model->gguf_ctx, name);
+            if (!src) continue;
+            ggml_tensor * dst = ggml_new_tensor(
+                model->ctx, GGML_TYPE_F32, ggml_n_dims(src), src->ne);
+            if (!dst) {
+                err_out = std::string("F32 alloc failed for tensor '") + name + "'";
+                return {nullptr, kokoro_model_deleter{}};
+            }
+            ggml_set_name(dst, name);
+            const int64_t nelem = ggml_nelements(src);
+            float * out = (float *) dst->data;
+            if (src->type == GGML_TYPE_F32) {  // already F32: copy straight into dst
+                ggml_backend_tensor_get(src, out, 0, (size_t) nelem * sizeof(float));
+                continue;
+            }
+            raw.resize(ggml_nbytes(src));
+            ggml_backend_tensor_get(src, raw.data(), 0, raw.size());
+            const ggml_type_traits * tr = ggml_get_type_traits(src->type);
+            if (src->type == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((const ggml_fp16_t *) raw.data(), out, nelem);
+            } else if (tr && tr->to_float) {
+                tr->to_float(raw.data(), out, nelem);
+            } else {
+                err_out = std::string("no dequantizer for tensor '") + name + "' (type " + std::to_string((int) src->type) + ")";
+                return {nullptr, kokoro_model_deleter{}};
+            }
+        }
+        // The on-disk tensors are not read after this point; the backend buffer and
+        // gguf_ctx stay alive until the deleter, but lookups use the all-F32 `ctx`.
+    }
+
     // Bind the published Kokoro GGUF schema, while accepting the older
     // unprefixed dev names from pre-publication GGUFs. Missing required
     // tensors are a hard load error: otherwise the synth path can appear to
@@ -389,6 +477,12 @@ kokoro_status kokoro_load_voice_preset(
 // ---------------------------------------------------------------------------
 
 std::vector<int32_t> kokoro_phonemize(const std::string & text) {
+    // Real G2P when libespeak-ng is linked: text → en-us IPA → Kokoro vocab
+    // ids, wrapped as the model input_ids [PAD, *ids, PAD]. Falls back to the
+    // degraded ASCII grapheme mapping when espeak is unavailable.
+    if (espeak_available()) {
+        return phonemize_to_input_ids(text);
+    }
     return phonemize_ascii(text);
 }
 
@@ -412,84 +506,6 @@ std::vector<int32_t> kokoro_phonemize(const std::string & text) {
 // reference in J2-kokoro-port-notes.md; closing the gap is follow-up work
 // for the next training/inference wave.
 
-namespace {
-
-// Build a simple synthesis-shape magnitude + phase spectrogram from the
-// phoneme ids + style vector. The output is shaped to match the iSTFT
-// vocoder's expected `(F, T)` layout where T is the predicted number of
-// audio frames.
-//
-// Synthesis duration is set by the simple heuristic of ~70ms / phoneme + a
-// 50ms tail. At 24kHz sample rate with hop=5, that's ~3360 samples per
-// phoneme → ~672 frames.
-static void synth_spectrogram(
-        const std::vector<int32_t> & phonemes,
-        const float * ref_s,
-        int style_dim,
-        int n_fft,
-        int hop_length,
-        int sample_rate,
-        float speed_mult,
-        std::vector<float> & out_mag,
-        std::vector<float> & out_phase,
-        int & n_frames) {
-
-    const float ms_per_phoneme = 70.0f / std::max(0.1f, speed_mult);
-    const int tail_ms = 50;
-    const int total_ms = std::max(120, (int) ((float) phonemes.size() * ms_per_phoneme) + tail_ms);
-    const int total_samples = (sample_rate * total_ms) / 1000;
-    n_frames = std::max(1, (total_samples - n_fft) / hop_length + 1);
-    const int F = n_fft / 2 + 1;
-
-    out_mag.assign((size_t) (F * n_frames), 0.0f);
-    out_phase.assign((size_t) (F * n_frames), 0.0f);
-
-    // Compute a per-frame "voicedness" envelope from the phoneme sequence and
-    // a per-frequency "timbre" curve from the style vector. The iSTFT will
-    // reconstruct audio whose energy follows the phoneme arrangement —
-    // intelligibility is degraded vs the trained vocoder, but the produced
-    // audio is non-blank and tied to the input.
-    std::vector<float> envelope((size_t) n_frames, 0.0f);
-    const int n_phoneme = (int) phonemes.size();
-    for (int t = 0; t < n_frames; ++t) {
-        const float pos = (float) t / (float) std::max(1, n_frames - 1);
-        const int pi = std::min(n_phoneme - 1, std::max(0, (int) (pos * (float) n_phoneme)));
-        const int32_t id = phonemes[(size_t) pi];
-        // Map phoneme id to a sustained envelope; punctuation / specials are silent.
-        if (id < 3) {
-            envelope[(size_t) t] = 0.0f;
-        } else {
-            const float energy = 0.18f + 0.12f * std::sin((float) id * 0.31f + pos * 6.283f);
-            envelope[(size_t) t] = energy;
-        }
-    }
-
-    // Build a per-frequency timbre that uses the style vector. The style
-    // dimensions get banded across the frequency bins so timbre varies with
-    // the voice preset.
-    std::vector<float> timbre((size_t) F, 0.0f);
-    for (int f = 0; f < F; ++f) {
-        const int sidx = (int) (((double) f / (double) F) * (double) style_dim);
-        const float s = ref_s ? ref_s[std::min(style_dim - 1, std::max(0, sidx))] : 0.0f;
-        // Pink-noise-ish 1/f falloff multiplied by the style coefficient.
-        const float falloff = 1.0f / (1.0f + 0.06f * (float) f);
-        timbre[(size_t) f] = falloff * (0.6f + 0.4f * std::tanh(s * 2.0f));
-    }
-
-    // Fill the mag/phase buffers.
-    for (int t = 0; t < n_frames; ++t) {
-        for (int f = 0; f < F; ++f) {
-            out_mag[(size_t) (f * n_frames + t)]   = envelope[(size_t) t] * timbre[(size_t) f];
-            // Random-but-deterministic phase per (t, f) — keeps the audio
-            // from sounding like a tonal whistle.
-            out_phase[(size_t) (f * n_frames + t)] =
-                (float) ((double) ((t * 1664525 + f * 1013904223) & 0xffffffu)
-                       / (double) 0x1000000) * 6.283185307f;
-        }
-    }
-}
-
-} // namespace
 
 kokoro_status kokoro_synthesize(
         const kokoro_model * model,
@@ -523,11 +539,13 @@ kokoro_status kokoro_synthesize(
     std::vector<int32_t> phonemes = kokoro_phonemize(text);
     if (phonemes.size() > 510) phonemes.resize(510);
 
-    // 2. Slice ref_s — kokoro-onnx uses voice[len(tokens)] when the preset is
-    //    per-position. Mirror that here.
+    // 2. Slice ref_s — kokoro-onnx uses voice[len(tokens)] where `tokens` is
+    //    the bare phoneme run BEFORE the [PAD, …, PAD] wrapping. `phonemes`
+    //    here is the wrapped input_ids, so subtract the two pad tokens to
+    //    recover the bare length (reference-ids.json: style_row == len(ids)).
     const int style_dim = voice.style_dim;
-    int slot = std::min(voice.n_positions - 1,
-                        std::max(0, (int) phonemes.size()));
+    const int bare_len = std::max(0, (int) phonemes.size() - 2);
+    int slot = std::min(voice.n_positions - 1, std::max(0, bare_len));
     const float * ref_s = voice.data.data() + (size_t) slot * (size_t) style_dim;
 
     // 3. (Optional) Exercise the GGML graph for the loaded text-encoder
@@ -570,94 +588,41 @@ kokoro_status kokoro_synthesize(
         }
     }
 
-    // 4. Synthesize the magnitude + phase spectrogram.
-    std::vector<float> mag, phase;
-    int n_frames = 0;
-    synth_spectrogram(
-        phonemes,
-        ref_s,
-        style_dim,
-        model->hparams.istft_n_fft,
-        model->hparams.istft_hop_length,
-        model->hparams.sample_rate,
-        speed_mult,
-        mag,
-        phase,
-        n_frames);
-
-    // 5. Inverse STFT → PCM.
-    //
-    // Preferred path: run iSTFT as a native GGML_OP_ISTFT graph op so the
-    // computation is dispatched to the active backend (Vulkan, CUDA, Metal).
-    // Falls back to the CPU overlap-add implementation when the backend is
-    // CPU-only or when GGML_OP_ISTFT is not supported by the backend.
+    // 4. Predictor → decoder → 24 kHz PCM (#9588: the real StyleTTS-2 /
+    //    iSTFTNet forward pass, replacing the J2-ship placeholder). The
+    //    predictor consumes the predictor-half style ref_s[128:]; the decoder
+    //    consumes the decoder-half ref_s[:128] (both passed as the same 256-d
+    //    ref_s pointer — each half indexes its own slice internally).
     {
-        const int n_fft      = model->hparams.istft_n_fft;
-        const int hop_length = model->hparams.istft_hop_length;
-        const int win_length = model->hparams.istft_win_length;
-        const int F          = n_fft / 2 + 1;
-        const int n_out      = (n_frames - 1) * hop_length + win_length;
-
-        // Build a tiny graph: mag_phase_tensor → ggml_istft → pcm_tensor.
-        // mag_phase_tensor shape: ne[0]=2 (mag/phase), ne[1]=F, ne[2]=T.
-        // See ggml.h ggml_istft contract: src0 is [2, F, T] channel-first
-        // interleaved. Element [ch, f, t] sits at offset t*(2*F) + f*2 + ch.
-        bool used_native_op = false;
-        {
-            ggml_init_params ip = {
-                /*.mem_size   =*/ 4 * 1024 * 1024,
-                /*.mem_buffer =*/ nullptr,
-                /*.no_alloc   =*/ true,
-            };
-            ggml_context * gctx = ggml_init(ip);
-            if (gctx) {
-                ggml_tensor * mp = ggml_new_tensor_3d(
-                    gctx, GGML_TYPE_F32, 2, (int64_t) F, (int64_t) n_frames);
-                ggml_tensor * pcm = ggml_istft(gctx, mp, /*window=*/nullptr,
-                                               n_fft, hop_length, win_length);
-                ggml_cgraph * gf = ggml_new_graph_custom(gctx, 64, false);
-                ggml_build_forward_expand(gf, pcm);
-
-                ggml_gallocr_t alloc = ggml_gallocr_new(
-                    ggml_backend_get_default_buffer_type(model->backend));
-
-                if (alloc && ggml_gallocr_alloc_graph(alloc, gf)) {
-                    // Pack mag/phase into the [2, F, T] tensor.
-                    // mag is channel 0, phase is channel 1. Source arrays are
-                    // laid out as mag/phase[f * n_frames + t].
-                    std::vector<float> mp_data((size_t) 2 * (size_t) F * (size_t) n_frames);
-                    for (int t = 0; t < n_frames; ++t) {
-                        for (int f = 0; f < F; ++f) {
-                            const size_t src = (size_t)(f * n_frames + t);
-                            const size_t base = (size_t) t * (size_t)(2 * F) + (size_t) f * 2;
-                            mp_data[base + 0] = mag  [src];
-                            mp_data[base + 1] = phase[src];
-                        }
-                    }
-                    ggml_backend_tensor_set(mp, mp_data.data(), 0,
-                                            mp_data.size() * sizeof(float));
-
-                    if (ggml_backend_supports_op(model->backend, pcm)) {
-                        ggml_backend_graph_compute(model->backend, gf);
-                        out.samples.resize((size_t) n_out);
-                        ggml_backend_tensor_get(pcm, out.samples.data(), 0,
-                                                (size_t) n_out * sizeof(float));
-                        used_native_op = true;
-                    }
-                }
-                if (alloc) ggml_gallocr_free(alloc);
-                ggml_free(gctx);
+        PredictorOut pred;
+        if (!kokoro_predictor_forward(model, phonemes, ref_s, speed_mult, pred, err_out)) {
+            if (err_out.empty()) err_out = "predictor forward failed";
+            return KOKORO_E_RUNTIME;
+        }
+        const int T = pred.T_frame;
+        if (T <= 0 || (int) pred.asr.size() != T * 512) {
+            err_out = "predictor produced empty/invalid asr (T=" + std::to_string(T) + ")";
+            return KOKORO_E_RUNTIME;
+        }
+
+        // Transpose asr [T, 512] (T-major) → [512, T] (channel-major).
+        std::vector<float> asr_ct((size_t) 512 * (size_t) T);
+        for (int t = 0; t < T; ++t) {
+            const float * row = pred.asr.data() + (size_t) t * 512;
+            for (int c = 0; c < 512; ++c) {
+                asr_ct[(size_t) c * (size_t) T + t] = row[c];
             }
         }
 
-        if (!used_native_op) {
-            // CPU fallback: existing overlap-add iSTFT.
-            istft_hann(mag, phase, n_fft, hop_length, win_length,
-                       n_frames, out.samples);
+        if (!kokoro_decoder_forward(model, asr_ct.data(), T,
+                                    pred.F0_pred.data(), pred.N_pred.data(),
+                                    ref_s, out.samples, err_out)) {
+            if (err_out.empty()) err_out = "decoder forward failed";
+            return KOKORO_E_RUNTIME;
         }
+        return KOKORO_OK;
     }
 
-    return KOKORO_OK;
 }
 
 int kokoro_sample_rate(const kokoro_model * model) noexcept {
diff --git a/tools/kokoro/tests/CMakeLists.txt b/tools/kokoro/tests/CMakeLists.txt
index 1c97a4627..54cf8ac78 100644
--- a/tools/kokoro/tests/CMakeLists.txt
+++ b/tools/kokoro/tests/CMakeLists.txt
@@ -14,3 +14,9 @@ add_test(NAME test-kokoro-istft COMMAND test-kokoro-istft)
 add_executable(test-kokoro-tensor-names test_kokoro_tensor_names.cpp)
 target_link_libraries(test-kokoro-tensor-names PRIVATE kokoro_lib)
 add_test(NAME test-kokoro-tensor-names COMMAND test-kokoro-tensor-names)
+
+# G2P: text → espeak IPA → Kokoro vocab ids (reproduces the reference token
+# sequence). Self-checking; no-ops gracefully when espeak is not linked.
+add_executable(test-kokoro-g2p-espeak test_kokoro_g2p_espeak.cpp)
+target_link_libraries(test-kokoro-g2p-espeak PRIVATE kokoro_lib)
+add_test(NAME test-kokoro-g2p-espeak COMMAND test-kokoro-g2p-espeak)
diff --git a/tools/kokoro/tests/test_kokoro_g2p_espeak.cpp b/tools/kokoro/tests/test_kokoro_g2p_espeak.cpp
new file mode 100644
index 000000000..75eea6dfb
--- /dev/null
+++ b/tools/kokoro/tests/test_kokoro_g2p_espeak.cpp
@@ -0,0 +1,87 @@
+// Standalone validation for kokoro-phonemes.cpp real G2P.
+// Compile:
+//   clang++ -std=c++17 -O2 -DKOKORO_USE_ESPEAK \
+//     -I <kokoro/include> -I /opt/homebrew/include \
+//     <this> <kokoro/src/kokoro-phonemes.cpp> \
+//     -L /opt/homebrew/lib -lespeak-ng -o /tmp/t && /tmp/t
+
+#include "kokoro-phonemes.h"
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+using namespace eliza_kokoro;
+
+// True when both id sequences have the same length and the same elements.
+static bool eq(const std::vector<int32_t>& a, const std::vector<int32_t>& b) {
+    // std::vector's operator== checks the sizes first, then compares pairwise.
+    return a == b;
+}
+
+static void print_ids(const char* label, const std::vector<int32_t>& v) {
+    printf("%s [", label);  // output shape: `<label> [id,id,...]`
+    for (size_t k = 0, n = v.size(); k != n; ++k) printf(k == 0 ? "%d" : ",%d", v[k]);
+    printf("]\n");
+}
+
+int main() {
+    const bool have_espeak = espeak_available();
+    printf("espeak_available: %s\n", have_espeak ? "yes" : "no");
+    printf("phoneme_vocab_size: %d\n\n", phoneme_vocab_size());
+    int fails = 0;
+
+    // Reference case (from reference-ids.json). Skipped without a working espeak.
+    if (have_espeak) {
+        const std::string text = "Hello, this is a native Kokoro voice test.";
+        const std::vector<int32_t> ref_ids = {
+            50, 83, 54, 156, 57, 135, 16, 81, 102, 61, 16, 102, 68, 16, 70, 16,
+            56, 156, 47, 102, 125, 102, 64, 16, 53, 83, 53, 156, 76, 158, 123,
+            57, 135, 16, 64, 156, 76, 102, 61, 16, 62, 156, 86, 61, 62};
+        std::vector<int32_t> got = phonemize_ipa(text);
+        print_ids("ref ids:", ref_ids);
+        print_ids("got ids:", got);
+        const bool ok = eq(got, ref_ids);
+        printf("REFERENCE ids match: %s\n", ok ? "PASS" : "FAIL");
+        if (!ok) ++fails;
+
+        // input_ids wrapping = [0, *ids, 0]
+        std::vector<int32_t> input = phonemize_to_input_ids(text);
+        const bool wrap_ok = input.size() == ref_ids.size() + 2 &&
+                             input.front() == 0 && input.back() == 0;
+        printf("input_ids wrap [0,*,0]: %s (len %zu)\n",
+               wrap_ok ? "PASS" : "FAIL", input.size());
+        if (!wrap_ok) ++fails;
+    } else { printf("REFERENCE ids: SKIP (espeak unavailable)\n"); }
+
+    // ipa_to_token_ids reproduces from a fixed IPA string (espeak-independent).
+    {
+        const std::string ipa = "h\xc9\x99l\xcb\x88o\xca\x8a"; // həlˈoʊ
+        std::vector<int32_t> got = ipa_to_token_ids(ipa);
+        // h ə l ˈ o ʊ -> 50 83 54 156 57 135
+        const std::vector<int32_t> exp = {50, 83, 54, 156, 57, 135};
+        const bool ok = eq(got, exp);
+        printf("\nipa_to_token_ids(\"həlˈoʊ\"): %s\n", ok ? "PASS" : "FAIL");
+        if (!ok) { print_ids("  got:", got); ++fails; }
+    }
+
+    // Extra phrases: smoke-check that espeak phonemization yields a non-empty
+    // id run for each phrase. Skipped without a working espeak.
+    if (have_espeak) {
+        const char* phrases[] = {
+            "The quick brown fox jumps over the lazy dog.",
+            "I have 3 apples and 2 oranges.",
+            "Eliza speaks with a calm, natural voice.",
+        };
+        for (const char* p : phrases) {
+            std::vector<int32_t> ids = phonemize_ipa(p);
+            printf("\nphrase: %s\n  ids(%zu):", p, ids.size());
+            for (int32_t id : ids) printf(" %d", id);
+            printf("\n");
+            if (ids.empty()) { printf("  FAIL: empty\n"); ++fails; }
+        }
+    }
+
+    printf("\n=== %s ===\n", fails == 0 ? "ALL PASS" : "FAILURES PRESENT");
+    return fails == 0 ? 0 : 1;
+}
diff --git a/tools/kokoro/tests/test_kokoro_phonemes.cpp b/tools/kokoro/tests/test_kokoro_phonemes.cpp
index a8f603767..6df30956a 100644
--- a/tools/kokoro/tests/test_kokoro_phonemes.cpp
+++ b/tools/kokoro/tests/test_kokoro_phonemes.cpp
@@ -1,41 +1,78 @@
 // SPDX-License-Identifier: MIT
 //
-// test_kokoro_phonemes.cpp — sanity checks for the ASCII phoneme tokenizer.
+// test_kokoro_phonemes.cpp — checks for the Kokoro G2P tokenizer.
+//
+// Validates the codepoint→id vocab mapping (which reproduces the kokoro
+// reference ids) and the [PAD, …, PAD] input-id wrapping. The espeak path is
+// validated separately by the standalone harness (it requires libespeak-ng);
+// here we drive ipa_to_token_ids() with fixed IPA so the test is
+// dependency-free.
 
 #include "kokoro-phonemes.h"
 
 #include <cassert>
 #include <cstdio>
 #include <string>
+#include <vector>
 
 int main() {
     using namespace eliza_kokoro;
 
     {
-        // Empty text → just BOS + EOS.
-        auto ids = phonemize_ascii("");
-        assert(ids.size() == 2);
-        assert(ids.front() == 1);   // BOS
-        assert(ids.back() == 2);    // EOS
+        // Vocab size matches Kokoro v1.0 (max id 177 → size 178).
+        assert(phoneme_vocab_size() == 178);
     }
     {
-        // Single word → BOS + letters + EOS.
-        auto ids = phonemize_ascii("hi");
-        assert(ids.size() == 4);
-        assert(ids[0] == 1);        // BOS
-        assert(ids[1] == 4 + 7);    // 'h' → offset 4 + 7
-        assert(ids[2] == 4 + 8);    // 'i' → offset 4 + 8
-        assert(ids[3] == 2);        // EOS
+        // Codepoint → id mapping (from tokenizer.json model.vocab).
+        assert(kokoro_codepoint_to_id(U'h') == 50);
+        assert(kokoro_codepoint_to_id(U' ') == 16);
+        assert(kokoro_codepoint_to_id(0x02C8u) == 156); // ˈ primary stress
+        assert(kokoro_codepoint_to_id(0x0259u) == 83);  // ə schwa
+        assert(kokoro_codepoint_to_id(0x028Au) == 135); // ʊ
+        assert(kokoro_codepoint_to_id(0x2603u) == -1);  // ☃ not in vocab
     }
     {
-        // Text longer than 510 tokens is truncated.
-        std::string s(2000, 'a');
-        auto ids = phonemize_ascii(s);
-        assert(ids.size() <= 510);
+        // ipa_to_token_ids reproduces the reference ids for "həlˈoʊ".
+        // UTF-8: h(0x68) ə(0xC9 0x99) l(0x6C) ˈ(0xCB 0x88) o(0x6F) ʊ(0xCA 0x8A)
+        const std::string ipa = "h\xC9\x99l\xCB\x88o\xCA\x8A";
+        std::vector<int32_t> ids = ipa_to_token_ids(ipa);
+        const std::vector<int32_t> exp = {50, 83, 54, 156, 57, 135};
+        assert(ids == exp);
     }
     {
-        // Vocab size matches Kokoro v1.0.
-        assert(phoneme_vocab_size() == 178);
+        // Unmapped codepoints are dropped, not turned into a sentinel id.
+        const std::string ipa = "h\x07i"; // 'h', BEL (unmapped), 'i'
+        std::vector<int32_t> ids = ipa_to_token_ids(ipa);
+        const std::vector<int32_t> exp = {50, 51}; // h, i
+        assert(ids == exp);
+    }
+    {
+        // wrap_input_ids → [PAD, *ids, PAD].
+        std::vector<int32_t> ids = {50, 51};
+        std::vector<int32_t> wrapped = wrap_input_ids(ids);
+        assert(wrapped.size() == 4);
+        assert(wrapped.front() == KOKORO_PAD_ID);
+        assert(wrapped.back() == KOKORO_PAD_ID);
+        assert(wrapped[1] == 50 && wrapped[2] == 51);
+    }
+    {
+        // Empty ids → just the two pad tokens.
+        std::vector<int32_t> wrapped = wrap_input_ids({});
+        assert(wrapped.size() == 2);
+        assert(wrapped[0] == KOKORO_PAD_ID && wrapped[1] == KOKORO_PAD_ID);
+    }
+    {
+        // Wrapping caps the phoneme run at 510 (512 with both pads).
+        std::vector<int32_t> ids(2000, 50);
+        std::vector<int32_t> wrapped = wrap_input_ids(ids);
+        assert(wrapped.size() == 512);
+    }
+    {
+        // Legacy ASCII fallback still emits a valid wrapped sequence.
+        std::vector<int32_t> ids = phonemize_ascii("hi");
+        // [PAD, 'h'(50), 'i'(51), PAD]
+        const std::vector<int32_t> exp = {KOKORO_PAD_ID, 50, 51, KOKORO_PAD_ID};
+        assert(ids == exp);
     }
 
     std::printf("test_kokoro_phonemes: OK\n");
diff --git a/tools/kokoro/tools/kokoro-decoder-test.cpp b/tools/kokoro/tools/kokoro-decoder-test.cpp
new file mode 100644
index 000000000..ca17f3b78
--- /dev/null
+++ b/tools/kokoro/tools/kokoro-decoder-test.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-decoder-test — validate kokoro_decoder_forward in-repo against the
+// PyTorch reference: load an F32 Kokoro GGUF + reference asr/F0/N/style bins,
+// run the decoder, write a 24 kHz WAV (compare to dec_audio_ref via whisper).
+//
+// Usage: kokoro-decoder-test <model.gguf> <asr.f32[512,T]> <F0.f32[2T]> <N.f32[2T]> <style.f32[128]> <T> <out.wav>
+
+#include "kokoro.h"
+#include "kokoro-decoder.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+// Reads a raw f32 file into a vector. Returns an empty vector when the file
+// cannot be opened, sized, or fully read (tellg() reports -1 on failure, which
+// must never be turned into an element count).
+static std::vector<float> rd(const std::string & p) {
+    std::ifstream f(p, std::ios::binary | std::ios::ate);
+    const std::streamoff bytes = f ? (std::streamoff) f.tellg() : -1;
+    if (bytes <= 0) return {};
+    std::vector<float> v((size_t) bytes / sizeof(float));
+    f.seekg(0);
+    if (!f.read((char *) v.data(), (std::streamsize) (v.size() * sizeof(float)))) v.clear();
+    return v;
+}
+
+// Writes `s` as a 16-bit mono PCM WAV at sample rate `sr`; samples are clamped to [-1, 1].
+static bool write_wav(const std::string & path, const std::vector<float> & s, int sr) {
+    std::ofstream f(path, std::ios::binary);
+    if (!f) return false;
+    const uint32_t n = (uint32_t) s.size(), data = n * 2, riff = 36 + data, br = (uint32_t) sr * 2;
+    auto p32 = [&](uint32_t v){ char b[4]={(char)v,(char)(v>>8),(char)(v>>16),(char)(v>>24)}; f.write(b,4); };
+    auto p16 = [&](uint16_t v){ char b[2]={(char)v,(char)(v>>8)}; f.write(b,2); };
+    f.write("RIFF",4); p32(riff); f.write("WAVE",4); f.write("fmt ",4); p32(16); p16(1); p16(1);
+    p32((uint32_t)sr); p32(br); p16(2); p16(16); f.write("data",4); p32(data);
+    for (uint32_t i=0;i<n;++i){ float v=s[i]; v=v>1?1:(v<-1?-1:v); int16_t q=(int16_t)std::lrintf(v*32767.f); char b[2]={(char)(q&0xff),(char)((q>>8)&0xff)}; f.write(b,2);}
+    return (bool) f;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 8) { std::fprintf(stderr, "usage: %s model asr F0 N style T out.wav\n", argv[0]); return 2; }
+    std::string model_p=argv[1], out=argv[7]; int T=std::atoi(argv[6]);
+    if (T <= 0) { std::fprintf(stderr, "invalid T '%s' (expected a positive integer)\n", argv[6]); return 2; }
+    std::string err;
+    auto model = eliza_kokoro::kokoro_load_model(model_p, err);
+    if (!model) { std::fprintf(stderr, "load: %s\n", err.c_str()); return 1; }
+    auto asr=rd(argv[2]), F0=rd(argv[3]), N=rd(argv[4]), sty=rd(argv[5]);
+    std::printf("asr=%zu F0=%zu N=%zu style=%zu T=%d (expect asr=512*T=%d, F0=2T=%d)\n",
+                asr.size(), F0.size(), N.size(), sty.size(), T, 512*T, 2*T);
+    // The decoder is handed raw pointers plus T, so reject unreadable or short inputs up front.
+    const size_t frames = (size_t) T;
+    if (asr.size() < 512*frames || F0.size() < 2*frames || N.size() < 2*frames || sty.size() < 128) {
+        std::fprintf(stderr, "input bins are missing or smaller than T=%d requires\n", T); return 1;
+    }
+    std::vector<float> audio;
+    if (!eliza_kokoro::kokoro_decoder_forward(model.get(), asr.data(), T, F0.data(), N.data(), sty.data(), audio, err)) {
+        std::fprintf(stderr, "decoder: %s\n", err.c_str()); return 1;
+    }
+    std::printf("audio samples=%zu (%.2fs @24k)\n", audio.size(), audio.size()/24000.0);
+    if (!write_wav(out, audio, 24000)) { std::fprintf(stderr, "write failed\n"); return 1; }
+    std::printf("wrote %s\n", out.c_str());
+    return 0;
+}
diff --git a/tools/kokoro/tools/kokoro-stage-dump.cpp b/tools/kokoro/tools/kokoro-stage-dump.cpp
new file mode 100644
index 000000000..e16ec329f
--- /dev/null
+++ b/tools/kokoro/tools/kokoro-stage-dump.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+//
+// kokoro-stage-dump — validation harness for the Kokoro C++ forward port.
+// Loads a GGUF model, reads reference input_ids (text) + ref_s (256 f32 bin),
+// runs kokoro_predictor_forward, and dumps pred_dur / F0_pred / N_pred / asr
+// as raw little-endian f32/i32 for comparison against the PyTorch reference.
+//
+// Usage: kokoro-stage-dump <model.gguf> <input_ids.txt> <ref_s.f32> <out-prefix>
+
+#include "kokoro.h"
+#include "kokoro-predictor.h"
+
+#include <cstdio>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Parses whitespace-separated integers from the text file at `p`.
+// Reading stops at end of file or at the first token that does not parse as
+// an int, so a file that cannot be opened simply yields an empty vector.
+static std::vector<int32_t> read_ids(const std::string & p) {
+    std::vector<int32_t> ids;
+    std::ifstream in(p);
+    for (int tok = 0; in >> tok; ) {
+        ids.push_back(tok);
+    }
+    return ids;
+}
+// Loads the binary file at `p` as a flat array of 32-bit floats in host byte
+// order. Trailing bytes that do not fill a whole float are ignored.
+// Returns an empty vector when the file cannot be opened or sized; if fewer
+// bytes can be read than the reported size, the result is truncated to the
+// floats actually read.
+static std::vector<float> read_f32(const std::string & p) {
+    std::ifstream f(p, std::ios::binary);
+    if (!f) return {};
+    f.seekg(0, std::ios::end);
+    // tellg() reports -1 on failure; casting that to size_t would request an
+    // enormous allocation and throw, so bail out before sizing the buffer.
+    const std::streamoff bytes = f.tellg();
+    if (bytes <= 0) return {};
+    f.seekg(0);
+    std::vector<float> v((size_t) bytes / sizeof(float));
+    f.read((char *) v.data(), (std::streamsize) (v.size() * sizeof(float)));
+    // Keep only the floats that were really read if the stream came up short.
+    v.resize((size_t) f.gcount() / sizeof(float));
+    return v;
+}
+// Writes the elements of `v` to the file at `p` as raw bytes (the in-memory
+// representation of each T, back to back).
+// Returns true on success. If the file cannot be opened or the write fails,
+// prints a diagnostic to stderr and returns false so a missing or truncated
+// dump does not go unnoticed.
+template <typename T>
+static bool write_bin(const std::string & p, const std::vector<T> & v) {
+    std::ofstream f(p, std::ios::binary);
+    if (f) {
+        f.write((const char *) v.data(), (std::streamsize) (v.size() * sizeof(T)));
+        f.close();  // flush now so a buffered write error shows up in the stream state
+    }
+    if (!f) {
+        std::fprintf(stderr, "write_bin: failed to write %s\n", p.c_str());
+        return false;
+    }
+    return true;
+}
+
+// Entry point: loads the model, reads the reference token ids and the ref_s
+// style vector, runs kokoro_predictor_forward, prints a one-line summary and
+// dumps each predictor output to "<out-prefix>_<name>".
+// Exit codes: 0 on success, 1 on a load/input/forward failure, 2 on bad usage.
+int main(int argc, char ** argv) {
+    if (argc < 5) {
+        std::fprintf(stderr, "usage: %s <model.gguf> <ids.txt> <ref_s.f32> <out-prefix>\n", argv[0]);
+        return 2;
+    }
+    const std::string model_path = argv[1];
+    const std::string ids_path   = argv[2];
+    const std::string refs_path  = argv[3];
+    const std::string prefix     = argv[4];
+
+    std::string err;
+    auto model = eliza_kokoro::kokoro_load_model(model_path, err);
+    if (!model) { std::fprintf(stderr, "load failed: %s\n", err.c_str()); return 1; }
+
+    // read_ids() yields an empty vector for a missing or unparsable file, so
+    // report that here instead of handing the predictor zero tokens.
+    std::vector<int32_t> ids = read_ids(ids_path);
+    if (ids.empty()) { std::fprintf(stderr, "no input ids read from %s\n", ids_path.c_str()); return 1; }
+
+    std::vector<float> ref_s = read_f32(refs_path);
+    if (ref_s.size() < 256) { std::fprintf(stderr, "ref_s too small: %zu\n", ref_s.size()); return 1; }
+
+    eliza_kokoro::PredictorOut out;
+    if (!eliza_kokoro::kokoro_predictor_forward(model.get(), ids, ref_s.data(), 1.0f, out, err)) {
+        std::fprintf(stderr, "predictor_forward failed: %s\n", err.c_str());
+        return 1;
+    }
+
+    // Sum of the predicted durations, printed next to T_frame for comparison.
+    int pred_dur_sum = 0;
+    for (auto d : out.pred_dur) pred_dur_sum += d;
+
+    std::printf("T_phon=%d T_frame=%d pred_dur_sum=%d F0_len=%zu N_len=%zu asr_len=%zu\n",
+                out.T_phon, out.T_frame, pred_dur_sum,
+                out.F0_pred.size(), out.N_pred.size(), out.asr.size());
+
+    write_bin(prefix + "_pred_dur.i32", out.pred_dur);
+    write_bin(prefix + "_F0.f32", out.F0_pred);
+    write_bin(prefix + "_N.f32", out.N_pred);
+    write_bin(prefix + "_asr.f32", out.asr);      // [T_frame, 512] row-major (T-major)
+    std::printf("wrote %s_{pred_dur.i32,F0.f32,N.f32,asr.f32}\n", prefix.c_str());
+    return 0;
+}
diff --git a/tools/kokoro/tools/kokoro-tts.cpp b/tools/kokoro/tools/kokoro-tts.cpp
index 52d27da47..874008daf 100644
--- a/tools/kokoro/tools/kokoro-tts.cpp
+++ b/tools/kokoro/tools/kokoro-tts.cpp
@@ -105,7 +105,9 @@ int main(int argc, char ** argv) {
     const auto * hp = eliza_kokoro::kokoro_get_hparams(model.get());
 
     eliza_kokoro::kokoro_voice_preset voice;
-    const auto vst = eliza_kokoro::kokoro_load_voice_preset(voice_path, hp->style_dim, voice, err);
+    // Kokoro v1.0 voice packs are 2*style_dim wide (256 = decoder-half 128 +
+    // predictor-half 128); the model's hparam style_dim is the per-half value.
+    const auto vst = eliza_kokoro::kokoro_load_voice_preset(voice_path, 2 * hp->style_dim, voice, err);
     if (vst != eliza_kokoro::KOKORO_OK) {
         std::fprintf(stderr, "kokoro_load_voice_preset failed: %s (status=%s)\n",
                      err.c_str(), eliza_kokoro::kokoro_status_str(vst));