Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 38 additions & 7 deletions tools/kokoro/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ set(KOKORO_CORE_SOURCES
src/kokoro.cpp
src/kokoro-istft.cpp
src/kokoro-phonemes.cpp
src/kokoro-predictor.cpp)
src/kokoro-predictor.cpp
src/kokoro-generator.cpp
src/kokoro-decoder.cpp)

add_library(kokoro_lib STATIC
${KOKORO_CORE_SOURCES}
Expand Down Expand Up @@ -56,17 +58,46 @@ endif()

target_compile_features(kokoro_lib PUBLIC cxx_std_17)

# Standalone CLI harness — required by J2 verification (tools/voice-kokoro/).
# Real G2P via libespeak-ng. When present, kokoro_phonemize() drives
# espeak_TextToPhonemes() (en-us IPA) and maps codepoints to Kokoro vocab ids,
# reproducing the reference token sequence. When absent, the build falls back
# to the degraded ASCII grapheme mapping and the TS layer must supply IPA.
# Override the search with -DKOKORO_ESPEAK_ROOT=<prefix> (e.g. Homebrew).
option(KOKORO_ENABLE_ESPEAK "Link libespeak-ng for real Kokoro G2P" ON)
if(KOKORO_ENABLE_ESPEAK)
  # Search order matters here:
  #   * KOKORO_ESPEAK_ROOT is an explicit user override, so it stays a HINT
  #     and is consulted ahead of the standard system locations. It is left
  #     unquoted on purpose: when unset it expands to no argument at all
  #     instead of an empty-string search path.
  #   * The Homebrew / system prefixes are hard-coded guesses. They go under
  #     PATHS, which CMake consults last, so they can no longer shadow a
  #     location supplied through CMAKE_PREFIX_PATH or the toolchain.
  find_path(ESPEAK_NG_INCLUDE_DIR
    NAMES espeak-ng/speak_lib.h
    HINTS ${KOKORO_ESPEAK_ROOT}
    PATHS /opt/homebrew /usr/local /usr
    PATH_SUFFIXES include)
  find_library(ESPEAK_NG_LIBRARY
    NAMES espeak-ng
    HINTS ${KOKORO_ESPEAK_ROOT}
    PATHS /opt/homebrew /usr/local /usr
    PATH_SUFFIXES lib lib64)

  if(ESPEAK_NG_INCLUDE_DIR AND ESPEAK_NG_LIBRARY)
    # Everything is PRIVATE: espeak-ng is an implementation detail of the
    # phonemizer and KOKORO_USE_ESPEAK only gates code inside kokoro_lib.
    target_include_directories(kokoro_lib PRIVATE "${ESPEAK_NG_INCLUDE_DIR}")
    target_link_libraries(kokoro_lib PRIVATE "${ESPEAK_NG_LIBRARY}")
    target_compile_definitions(kokoro_lib PRIVATE KOKORO_USE_ESPEAK)
    message(STATUS "Kokoro G2P: libespeak-ng found (${ESPEAK_NG_LIBRARY}) — real IPA path enabled")
  else()
    # Not fatal by design: the ASCII grapheme fallback keeps the build usable.
    message(STATUS "Kokoro G2P: libespeak-ng not found — falling back to ASCII grapheme mapping (TS layer must supply IPA)")
  endif()
endif()

# Standalone CLI harnesses (required by J2 verification + Kokoro decoder dev).
#
# None of these is a GUI app. On Apple platforms CMake defaults executables to
# MACOSX_BUNDLE, and install(TARGETS ... RUNTIME) on a bundle target fails
# configure with "install TARGETS given no BUNDLE DESTINATION for
# MACOSX_BUNDLE executable" on every ios/tvos/visionos/macos build. The bundle
# flag is therefore forced off on every harness so that a plain RUNTIME
# install is valid on all platforms.
#
# Each harness <name> is built from tools/<name>.cpp and links kokoro_lib.
foreach(kokoro_harness IN ITEMS kokoro-tts kokoro-stage-dump kokoro-decoder-test)
  add_executable(${kokoro_harness} tools/${kokoro_harness}.cpp)
  target_link_libraries(${kokoro_harness} PRIVATE kokoro_lib)
  set_target_properties(${kokoro_harness} PROPERTIES MACOSX_BUNDLE OFF)
endforeach()

# Only the TTS harness is installed; the stage-dump and decoder-test tools
# are build-tree-only.
install(TARGETS kokoro-tts RUNTIME)

# Server-mount handler: compiled into kokoro_lib only when the server target
# exists. The handler is guarded by `#ifdef LLAMA_BUILD_KOKORO` and pulls in
# the same `server-http.h` interface that the omnivoice handler uses, plus
Expand Down
12 changes: 9 additions & 3 deletions tools/kokoro/convert_kokoro_pth_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,15 @@
def _add_tensor(writer: gguf.GGUFWriter, name: str, data: np.ndarray) -> None:
"""Add tensors with the dtype layout the Kokoro forward pass expects.

Weight matrices and convolution kernels (ndim >= 2) are emitted as F16;
biases, norms, and other vectors stay F32. All-F32 GGUFs can load but
synthesize noise in the fused runtime path.
Weight matrices and convolution kernels (ndim >= 2) are emitted as F16
purely to halve the GGUF download size; biases, norms, and other vectors
stay F32. The GGUF dtype does not affect correctness: the loader
dequantizes every tensor to F32 at load time, so an all-F32 and an
F16-weights GGUF produce identical synthesis. (An earlier note here
claimed all-F32 GGUFs synthesized noise — that was a misdiagnosis: the
fused path was a stub that ignored the weights, and the real defect was
the loader reading non-F32 tensors as raw F32. Both are fixed; F16 is
kept only for bundle size.)
"""
if data.dtype not in (np.float32, np.float16):
data = data.astype(np.float32)
Expand Down
144 changes: 144 additions & 0 deletions tools/kokoro/include/kokoro-decoder-front.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// SPDX-License-Identifier: MIT
// kokoro-decoder-front.h — Decoder.forward up to the generator (validated port, #9588).
#pragma once
#include <cmath>
#include <cstring>
#include <vector>
#include "kokoro-layers.h" // conv1d_forward, adain1d_forward, convtranspose1d_depthwise_forward, convtranspose1d_out_len

namespace eliza_kokoro {

// Weights and configuration for one decoder AdainResBlk1d.
// All pointers are non-owning views onto float data; they default to null
// and are expected to be filled in by whoever loads the model.
struct DecAdainResBlk {
    // --- dimensions ---
    int Cin  = 0;    // input channels
    int Cout = 0;    // output channels
    int Sdim = 128;  // style-vector width

    // --- block flavour ---
    bool upsample   = false;  // block carries a pool (transposed-conv) stage
    bool learned_sc = false;  // dim_in != dim_out

    // --- AdaIN style projections ---
    const float * norm1_fc_w = nullptr;  // [2*Cin, Sdim]
    const float * norm1_fc_b = nullptr;  // [2*Cin]
    const float * norm2_fc_w = nullptr;  // [2*Cout, Sdim]
    const float * norm2_fc_b = nullptr;  // [2*Cout]

    // --- residual-branch convolutions (kernel size 3) ---
    const float * conv1_w = nullptr;  // [Cout, Cin, 3]
    const float * conv1_b = nullptr;  // [Cout]
    const float * conv2_w = nullptr;  // [Cout, Cout, 3]
    const float * conv2_b = nullptr;  // [Cout]

    // --- shortcut projection (learned_sc only) ---
    const float * conv1x1_w = nullptr;  // [Cout, Cin, 1]
    const float * conv1x1_b = nullptr;  // [Cout] (null — conv1x1 bias=False)

    // --- depthwise transposed-conv pool (upsample only) ---
    const float * pool_w = nullptr;  // [Cin, 1, 3]
    const float * pool_b = nullptr;  // [Cin]
};

// AdainResBlk1d (decode-block flavor: leaky_relu 0.2; pool/shortcut;
// out = (residual + shortcut)/sqrt(2)). Output y [Cout, T_out].
inline void dec_adainresblk1d_forward(
const DecAdainResBlk & w, const float * x, int T_in, const float * s,
std::vector<float> & y, int & T_out) {
const int Cin = w.Cin, Cout = w.Cout, Sdim = w.Sdim;

// residual branch: norm1 -> leaky_relu(0.2) -> [pool] -> conv1 -> norm2 -> leaky_relu -> conv2
std::vector<float> r(x, x + (size_t)Cin * T_in);
adain1d_forward(r.data(), Cin, T_in, s, Sdim, w.norm1_fc_w, w.norm1_fc_b);
for (size_t i = 0; i < r.size(); ++i) if (r[i] < 0) r[i] *= 0.2f;

int T_pool = T_in;
if (w.upsample) {
T_pool = convtranspose1d_out_len(T_in, 3, 2, 1, 1);
std::vector<float> r2((size_t)Cin * T_pool);
convtranspose1d_depthwise_forward(r.data(), Cin, T_in, w.pool_w, w.pool_b, 3, 2, 1, 1, r2.data(), T_pool);
r.swap(r2);
}
std::vector<float> r3((size_t)Cout * T_pool);
conv1d_forward(r.data(), Cin, T_pool, w.conv1_w, w.conv1_b, Cout, 3, 1, 1, 1, r3.data(), T_pool);
adain1d_forward(r3.data(), Cout, T_pool, s, Sdim, w.norm2_fc_w, w.norm2_fc_b);
for (size_t i = 0; i < r3.size(); ++i) if (r3[i] < 0) r3[i] *= 0.2f;
std::vector<float> r4((size_t)Cout * T_pool);
conv1d_forward(r3.data(), Cout, T_pool, w.conv2_w, w.conv2_b, Cout, 3, 1, 1, 1, r4.data(), T_pool);

// shortcut branch: [nearest-upsample x2] -> [conv1x1 if learned_sc]
T_out = T_pool;
std::vector<float> sc;
if (w.upsample) {
const int T_up = T_in * 2; // == T_pool
std::vector<float> up((size_t)Cin * T_up);
for (int c = 0; c < Cin; ++c)
for (int t = 0; t < T_up; ++t)
up[(size_t)c * T_up + t] = x[(size_t)c * T_in + (t / 2)];
if (w.learned_sc) {
sc.assign((size_t)Cout * T_up, 0.0f);
conv1d_forward(up.data(), Cin, T_up, w.conv1x1_w, w.conv1x1_b, Cout, 1, 1, 0, 1, sc.data(), T_up);
} else sc.swap(up);
} else {
if (w.learned_sc) {
sc.assign((size_t)Cout * T_in, 0.0f);
conv1d_forward(x, Cin, T_in, w.conv1x1_w, w.conv1x1_b, Cout, 1, 1, 0, 1, sc.data(), T_in);
} else sc.assign(x, x + (size_t)Cin * T_in);
}

y.assign((size_t)Cout * T_out, 0.0f);
const float rsqrt2 = 1.0f / std::sqrt(2.0f);
for (size_t i = 0; i < y.size(); ++i) y[i] = (r4[i] + sc[i]) * rsqrt2;
}

// Every weight decoder_front() needs: the two stride-2 curve convolutions,
// the asr residual projection, and the encode/decode AdainResBlk1d stack.
// Pointers are non-owning and default to null.
struct DecoderFrontWeights {
    // F0 curve down-sampling conv.
    const float * F0_conv_w = nullptr;  // [1,1,3]
    const float * F0_conv_b = nullptr;  // [1]

    // N curve down-sampling conv.
    const float * N_conv_w = nullptr;   // [1,1,3]
    const float * N_conv_b = nullptr;   // [1]

    // asr residual projection (kernel size 1).
    const float * asr_res_w = nullptr;  // [64,512,1]
    const float * asr_res_b = nullptr;  // [64]

    // Residual-block stack.
    DecAdainResBlk encode;     // 514 -> 1024, learned_sc
    DecAdainResBlk decode[4];  // 1090->1024 (x3), 1090->512 upsample
};

// Decoder.forward up to (not including) the generator.
//
// Inputs
//   W        : decoder-front weights (non-owning pointers).
//   asr      : [Cin_asr, T_asr] channel-major (reference config: 512 x 132).
//   F0_curve : [2*T_asr] pitch curve.
//   N_in     : [2*T_asr] noise/energy curve.
//   s        : [128] style vector, forwarded to every AdainResBlk1d.
//
// Outputs
//   x_out    : [512, 2*T_asr] in the reference config (== generator_in_0).
//   F0_down  : [T_asr] stride-2 conv of F0_curve.
//   N_down   : [T_asr] stride-2 conv of N_in.
//   The caller passes x_out, together with the ORIGINAL F0_curve, into the
//   generator.
//
// Degenerate sizes (T_asr <= 0 or Cin_asr < 0) leave all three outputs empty
// and return immediately. Without that guard a zero T_asr reaches the
// `x.size() / T_x` division below, and negative sizes turn into enormous
// size_t allocation / memcpy lengths.
inline void decoder_front(
        const DecoderFrontWeights & W,
        const float * asr, int Cin_asr, int T_asr,
        const float * F0_curve, const float * N_in, const float * s,
        std::vector<float> & x_out,
        std::vector<float> & F0_down,
        std::vector<float> & N_down) {
    if (T_asr <= 0 || Cin_asr < 0) {
        x_out.clear();
        F0_down.clear();
        N_down.clear();
        return;
    }

    const int Tc        = 2 * T_asr;  // 264 for T_asr = 132
    const int kAsrResCh = 64;         // output channels of the asr_res projection

    // Stride-2, k=3, pad=1 convs bring both curves down to T_asr frames.
    F0_down.assign(T_asr, 0.0f);
    conv1d_forward(F0_curve, 1, Tc, W.F0_conv_w, W.F0_conv_b, 1, 3, 2, 1, 1, F0_down.data(), T_asr);
    N_down.assign(T_asr, 0.0f);
    conv1d_forward(N_in, 1, Tc, W.N_conv_w, W.N_conv_b, 1, 3, 2, 1, 1, N_down.data(), T_asr);

    // x = cat([asr, F0, N], dim=channels) -> [Cin_asr + 2, T_asr]  (514 rows)
    std::vector<float> xcat((size_t)(Cin_asr + 2) * T_asr);
    std::memcpy(xcat.data(), asr, sizeof(float) * (size_t)Cin_asr * T_asr);
    std::memcpy(xcat.data() + (size_t)Cin_asr * T_asr, F0_down.data(), sizeof(float) * T_asr);
    std::memcpy(xcat.data() + (size_t)(Cin_asr + 1) * T_asr, N_down.data(), sizeof(float) * T_asr);

    std::vector<float> x;
    int T_x = 0;
    dec_adainresblk1d_forward(W.encode, xcat.data(), T_asr, s, x, T_x);  // encode 514->1024

    // asr_res: Conv1d k=1, Cin_asr -> 64.
    std::vector<float> asr_res((size_t)kAsrResCh * T_asr);
    conv1d_forward(asr, Cin_asr, T_asr, W.asr_res_w, W.asr_res_b, kAsrResCh, 1, 1, 0, 1, asr_res.data(), T_asr);

    // While `res` is set, each decode block sees x concatenated with the
    // residual features. The copies below take T_x frames from buffers that
    // hold T_asr frames, i.e. they rely on no block before the first
    // upsampling decode block having changed the time length.
    bool res = true;
    for (int b = 0; b < 4; ++b) {
        std::vector<float> blk_in;
        if (res) {  // cat([x, asr_res, F0, N]) -> 1024+64+1+1 = 1090
            const int Cx      = (int)(x.size() / T_x);
            const int Cin_blk = Cx + kAsrResCh + 1 + 1;
            blk_in.assign((size_t)Cin_blk * T_x, 0.0f);
            std::memcpy(blk_in.data(), x.data(), sizeof(float) * (size_t)Cx * T_x);
            std::memcpy(blk_in.data() + (size_t)Cx * T_x, asr_res.data(), sizeof(float) * (size_t)kAsrResCh * T_x);
            std::memcpy(blk_in.data() + (size_t)(Cx + kAsrResCh) * T_x, F0_down.data(), sizeof(float) * T_x);
            std::memcpy(blk_in.data() + (size_t)(Cx + kAsrResCh + 1) * T_x, N_down.data(), sizeof(float) * T_x);
        } else {
            blk_in.assign(x.begin(), x.end());
        }

        std::vector<float> y;
        int T_y = 0;
        dec_adainresblk1d_forward(W.decode[b], blk_in.data(), T_x, s, y, T_y);
        x.swap(y);
        T_x = T_y;

        if (W.decode[b].upsample) res = false;  // decode3 upsamples -> res stops
    }

    x_out.swap(x);  // [512, 264]
}

} // namespace eliza_kokoro
31 changes: 31 additions & 0 deletions tools/kokoro/include/kokoro-decoder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// SPDX-License-Identifier: MIT
//
// kokoro-decoder.h — StyleTTS-2 / iSTFTNet decoder: predictor outputs -> 24 kHz audio.
//
// Wires the validated decoder_front (kokoro-decoder-front.h) + Generator
// (kokoro-generator.h) against the model's all-F32 ggml context. Replaces the
// J2-ship placeholder spectrogram in kokoro_synthesize (#9588).

#pragma once

#include <string>
#include <vector>

namespace eliza_kokoro {

struct kokoro_model;

// Run the full decoder: predictor outputs in, 24 kHz mono audio out.
//
// Inputs come from kokoro_predictor_forward:
//   model     : model whose weights the decoder reads.
//   asr_ct    : [512, T_frame] channel-major (transpose of PredictorOut.asr [T,512]).
//   T_frame   : number of predictor frames.
//   F0, N     : [2*T_frame] each (PredictorOut.F0_pred / N_pred — the up-2x curves).
//   ref_s_dec : [128] decoder-half style (ref_s[:128]).
//
// Outputs:
//   audio     : 24 kHz mono samples, resized to (2*T_frame)*300.
//   err       : NOTE(review): presumably receives a human-readable message
//               when the call fails — confirm against kokoro-decoder.cpp.
//
// Returns: NOTE(review): presumably true on success and false on failure —
// the contract is not visible from this header; confirm against the
// implementation.
bool kokoro_decoder_forward(
const kokoro_model * model,
const float * asr_ct, int T_frame,
const float * F0, const float * N,
const float * ref_s_dec,
std::vector<float> & audio,
std::string & err);

} // namespace eliza_kokoro
76 changes: 76 additions & 0 deletions tools/kokoro/include/kokoro-generator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-License-Identifier: MIT
//
// kokoro-generator.h — iSTFTNet Generator.forward (StyleTTS-2 decoder back-end).
//
// The generator turns the decoder body output `x` [512, 264], the style
// vector `s` [128], and the (un-downsampled) F0 curve `f0_curve` [264] into
// `audio` [79200] (24 kHz).
//
// Weights are raw float pointers (PyTorch row-major, weight_norm-fused):
// Conv1d weight [Cout, Cin, K]
// ConvTranspose1d wt [Cin, Cout, K]
// Linear weight [out, in]
// AdaIN1d fc.weight [2C, style_dim] (style_dim = 128)
// Snake alpha [C]
// The caller supplies them via GeneratorWeights so the function composes with
// any weight-loading boundary (GGUF tensor lookup, raw .f32 fixtures, …).

#pragma once

#include <vector>

namespace eliza_kokoro {

// Weights for a single sub-block of an AdaINResBlock1. A block is made of
// three such sub-blocks that all share one channel count C; both convs are
// square in channels, i.e. [Cout=Cin=C, Cin=C, K].
// Pointers are non-owning and default to null.
struct GenSubBlockWeights {
    // First / second convolution.
    const float * conv1_w = nullptr;      // [C, C, K]
    const float * conv1_b = nullptr;      // [C]
    const float * conv2_w = nullptr;      // [C, C, K]
    const float * conv2_b = nullptr;      // [C]

    // AdaIN style projections paired with conv1 / conv2.
    const float * adain1_fc_w = nullptr;  // [2C, 128]
    const float * adain1_fc_b = nullptr;  // [2C]
    const float * adain2_fc_w = nullptr;  // [2C, 128]
    const float * adain2_fc_b = nullptr;  // [2C]

    // Per-channel Snake alphas.
    const float * alpha1 = nullptr;       // [C]
    const float * alpha2 = nullptr;       // [C]
};

// One full AdaINResBlock1: its three sub-blocks, in order.
struct GenAdaResBlockWeights {
    GenSubBlockWeights sub[3];  // sub-blocks 0..2
};

// Complete weight set consumed by kokoro_generator_forward().
// Pointers are non-owning and default to null.
struct GeneratorWeights {
    // --- m_source.l_linear: Linear(9 -> 1) ---
    const float * l_linear_w = nullptr;  // [1, 9]
    const float * l_linear_b = nullptr;  // [1]

    // --- ups[0], ups[1]: ConvTranspose1d ---
    const float * ups_w[2] = {};  // weight [Cin, Cout, K]
    const float * ups_b[2] = {};  // bias   [Cout]

    // --- noise_convs[0], noise_convs[1]: Conv1d ---
    const float * noise_convs_w[2] = {};  // weight [Cout, 22, K]
    const float * noise_convs_b[2] = {};  // bias   [Cout]

    // --- noise_res[0] (k=7), noise_res[1] (k=11): AdaINResBlock1 ---
    GenAdaResBlockWeights noise_res[2];

    // --- resblocks[0..5]: AdaINResBlock1 ---
    // stage0: k=3,7,11 ch=256; stage1: ch=128.
    GenAdaResBlockWeights resblocks[6];

    // --- conv_post: Conv1d(128 -> 22, k=7, pad=3) ---
    const float * conv_post_w = nullptr;  // [22, 128, 7]
    const float * conv_post_b = nullptr;  // [22]
};

// Generator.forward: turns the decoder body output, the style vector and the
// (un-downsampled) F0 curve into 24 kHz audio.
//
//   x_in     : decoder body output, [512, T0] channel-major.
//   T0       : input time length (== 2 * predictor T_frame).
//   s        : style vector, [128].
//   f0_curve : F0 curve, [T0].
//   w        : generator weights (raw float pointers, see GeneratorWeights).
//   audio    : output samples; resized to T0 * 300 (== 79200 for T0=264).
void kokoro_generator_forward(
const float * x_in, // [512, T0] channel-major
int T0, // input time (== 2 * predictor T_frame)
const float * s, // [128]
const float * f0_curve, // [T0]
const GeneratorWeights & w,
std::vector<float> & audio);

} // namespace eliza_kokoro
Loading
Loading