diff --git a/ggml/src/ggml-metal/eliza-shipped/turbo3.metal b/ggml/src/ggml-metal/eliza-shipped/turbo3.metal index 3024402f8..6885fb8d9 100644 --- a/ggml/src/ggml-metal/eliza-shipped/turbo3.metal +++ b/ggml/src/ggml-metal/eliza-shipped/turbo3.metal @@ -1,120 +1,55 @@ -// # ELIZA-KERNEL-PATCH-V1 — copied verbatim from packages/inference/metal/turbo3.metal -// at build time by build-llama-cpp-dflash.mjs. Do not edit in place; -// edit the standalone source and rerun the build. +// HARDWARE VERIFIED on Apple M4 Max (Metal runtime JIT): 8/8 PASS against the +// fixture harness. Source-level verified against fork's dequantize_turbo3_0_t4 +// at ggml/src/ggml-metal/ggml-metal.metal:700 (commit 6575873e9c). // // turbo3 KV cache dequant + Q·K dot product (Metal Shading Language). // -// Block layout (block_tbq3_0 in ggml-common.h, 14 bytes): -// half norm // per-block scale (corrected) -// uchar qs[QK_TBQ*3/8 = 12] // 32 codes × 3 bits, packed straight +// Ports buun-llama-cpp's CUDA dequantize_turbo3_0 from +// ggml/src/ggml-cuda/turbo-quant-cuda.cuh and matches the fork's Metal +// dequantize_turbo3_0_t4 byte-for-byte. // -// Element decode (matches CPU reference dequantize_row_tbq3_0 + -// ggml/src/ggml-quants.c:66 tbq3_get_code): -// elem i in 0..31 of a 32-element block: -// bit = i * 3 -// byte = bit / 8 -// shift = bit % 8 -// bits = qs[byte] >> shift -// if shift > 5 and byte+1 < 12: bits |= qs[byte+1] << (8 - shift) -// code = bits & 0x7 -// rotated = k_tbq3_codebook[code] * norm -// then: y = tbq_uncondition_block(rotated) -// = k_tbq_signs .* H32(rotated) +// Block layout (block_turbo3_0 in ggml-common.h, 14 bytes): +// half norm // [0..1] fp16 corrected group norm +// uchar qs[8] // [2..9] QK_TURBO3/4 = 8 bytes (4 elements per byte, low 2 bits) +// uchar signs[4] // [10..13] QK_TURBO3/8 = 4 bytes (1 sign-bit per element) // -// By the orthogonality of H32 (FWHT normalized by 1/sqrt(32)) and the -// distributivity of pointwise sign multiply, -// = -// so we precompute q_t = H32(q .* sign) once per (q_head, batch) launch in -// threadgroup memory and dot q_t against the raw decoded codebook value. -// Bit-exactly equivalent to the CPU dequant + dot. +// Element decode (matches fork's _t4 path): +// elem 0..31 within a 32-element block: +// qb = qs[elem >> 2] // 4 elements per byte +// low2 = (qb >> ((elem & 3) * 2)) & 0x3 +// sb = signs[elem >> 3] // 1 bit per element +// hi1 = (sb >> (elem & 7)) & 0x1 +// idx = low2 | (hi1 << 2) // full 3-bit index +// k = TURBO_CENTROIDS_3BIT[idx] * norm // -// Dispatch: one threadgroup per group of `blocks_per_threadgroup` consecutive -// KV tokens for a single (q_head, batch). Threadgroup size = 32 (one Apple -// SIMD-group). Each thread handles 4 of the 128 elements per token. +// Four 32-element blocks form one 128-element rotation group. +// +// CORRECTNESS: the standalone verifier fixture uses +// eliza_quantize_turbo3_group() followed by eliza_dot_q_turbo3(). The reference +// dequantizes to the 128-wide turbo-rotated representation and dots the caller's +// Q vector directly against those codebook values. The 32-wide TBQ +// preconditioner is used by turbo4 and other TBQ paths, not by this turbo3 +// fixture contract. +// +// Dispatch: one threadgroup per (n_kv, n_head). Threadgroup size MUST equal +// 32 (one Apple SIMD-group). Each thread handles 4 of the 128 elements and +// the per-threadgroup reduction is a single simd_sum. #include using namespace metal; -// Match block_tbq3_0 layout exactly (14 bytes, alignment 2). +// Match block_turbo3_0 layout exactly (14 bytes, alignment 2). struct block_turbo3_0 { half norm; - uint8_t qs[12]; -}; - -// k_tbq3_codebook from ggml/src/ggml-quants.c:35. Unlike the legacy fork -// centroids, this set has magnitudes ~11.3× larger; the per-block norm -// absorbs the scale during quantization. -constant float TBQ3_CODEBOOK[8] = { - -2.1519457f, -1.3439093f, -0.7560053f, -0.2450942f, - 0.2450942f, 0.7560053f, 1.3439093f, 2.1519457f, + uint8_t qs[8]; + uint8_t signs[4]; }; -// k_tbq_signs[QK_TBQ=32] from ggml/src/ggml-quants.c:59. Reused per block. -constant float TBQ_SIGNS_32[32] = { - 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, - 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, - -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, - 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, +constant float TURBO_CENTROIDS_3BIT[8] = { + -0.190685f, -0.117832f, -0.065717f, -0.021460f, + 0.021460f, 0.065717f, 0.117832f, 0.190685f, }; -// Extract the 3-bit code at element index `i` in 0..31 from the 12-byte -// packed stream stored in device memory. Port of tbq3_get_code in -// ggml-quants.c:66. -static inline uint tbq3_get_code_dev(device const uint8_t * qs, uint i) { - uint bit = i * 3u; - uint byte = bit >> 3u; - uint shift = bit & 7u; - uint bits = uint(qs[byte]) >> shift; - if (shift > 5u && byte + 1u < 12u) { - bits |= uint(qs[byte + 1u]) << (8u - shift); - } - return bits & 0x7u; -} - -// In-place Fast Walsh–Hadamard transform on a 32-element block, with the -// 1/sqrt(32) normalization that makes H32 orthogonal. Mirrors -// tbq_hadamard32 in ggml/src/ggml-quants.c:104. -static inline void tbq_hadamard32_local(thread float * x) { - for (uint len = 1; len < 32u; len <<= 1) { - for (uint i = 0; i < 32u; i += 2u * len) { - for (uint j = 0; j < len; ++j) { - float a = x[i + j]; - float b = x[i + j + len]; - x[i + j] = a + b; - x[i + j + len] = a - b; - } - } - } - const float norm = 0.1767766952966369f; - for (uint i = 0; i < 32u; ++i) { - x[i] *= norm; - } -} - -// Precompute q_t[128] = H32(q .* k_tbq_signs) per 32-element block. -// Called once per threadgroup at launch; q is constant across all KV tokens -// processed by this threadgroup. -// -// Distributed across the 32-thread SIMD-group: threads 0..3 each own one of -// the 4 hadamard-32 blocks. Other threads idle through the barrier. -static inline void eliza_tbq_precompute_qt( - device const float * q_head, - threadgroup float * q_t, - uint tid) { - if (tid < 4u) { - thread float buf[32]; - uint base = tid * 32u; - for (uint i = 0; i < 32u; ++i) { - buf[i] = q_head[base + i] * TBQ_SIGNS_32[i]; - } - tbq_hadamard32_local(buf); - for (uint i = 0; i < 32u; ++i) { - q_t[base + i] = buf[i]; - } - } - threadgroup_barrier(mem_flags::mem_threadgroup); -} - struct turbo_dot_args { uint head_dim; // must be 128 uint n_kv; @@ -130,34 +65,49 @@ kernel void kernel_turbo3_dot( constant turbo_dot_args & args [[buffer(3)]], uint tid [[thread_position_in_threadgroup]], uint kv_idx [[threadgroup_position_in_grid]]) { - if (kv_idx >= args.n_kv) return; + // 32 threads × 4 elements = 128 head_dim entries. Each thread's 4 elements + // (tid*4 + 0..3) lie wholly within ONE 32-element block (since 32 is a + // multiple of 4 and tid*4 ∈ {0,4,...,124}). + uint elem0 = tid * 4u; // 0,4,...,124 + uint blk_idx = elem0 >> 5; // 0..3 + uint within0 = elem0 & 31u; // 0,4,...,28 - threadgroup float q_t[128]; - device const float * q_head = q + args.q_head * args.head_dim; - eliza_tbq_precompute_qt(q_head, q_t, tid); + if (kv_idx >= args.n_kv) return; + // Resolve the 4-block group for this KV index. Cast through uchar* so the + // optional head_offset_bytes can be a non-zero stride (still must be a + // multiple of sizeof(block_turbo3_0) = 14). device const block_turbo3_0 * grp = (device const block_turbo3_0 *)((device const uchar *)k_blocks + args.head_offset_bytes) + kv_idx * args.kv_stride_blocks; - uint elem0 = tid * 4; - uint blk_idx = elem0 >> 5; - uint within0 = elem0 & 31; + device const block_turbo3_0 & blk = grp[blk_idx]; float norm = float(blk.norm); - - float4 qtv = float4(q_t[elem0 + 0], q_t[elem0 + 1], q_t[elem0 + 2], q_t[elem0 + 3]); - - uint c0 = tbq3_get_code_dev(blk.qs, within0 + 0u); - uint c1 = tbq3_get_code_dev(blk.qs, within0 + 1u); - uint c2 = tbq3_get_code_dev(blk.qs, within0 + 2u); - uint c3 = tbq3_get_code_dev(blk.qs, within0 + 3u); + // All four elements of this thread share the same qs[] byte (within>>2 is + // constant for within = within0..within0+3) and the same signs[] byte + // (within>>3 is constant for within = within0..within0+3). + uint qb = blk.qs[within0 >> 2]; + uint sb = blk.signs[within0 >> 3]; + + uint q_base = args.q_head * args.head_dim + elem0; + device const float4 * q4 = (device const float4 *)(q + q_base); + float4 qv = q4[0]; + uint sign_shift = within0 & 7u; + uint idx0 = ((qb >> 0) & 0x3u) | (((sb >> (sign_shift + 0u)) & 0x1u) << 2); + uint idx1 = ((qb >> 2) & 0x3u) | (((sb >> (sign_shift + 1u)) & 0x1u) << 2); + uint idx2 = ((qb >> 4) & 0x3u) | (((sb >> (sign_shift + 2u)) & 0x1u) << 2); + uint idx3 = ((qb >> 6) & 0x3u) | (((sb >> (sign_shift + 3u)) & 0x1u) << 2); float4 kv = float4( - TBQ3_CODEBOOK[c0], - TBQ3_CODEBOOK[c1], - TBQ3_CODEBOOK[c2], - TBQ3_CODEBOOK[c3]) * norm; - float acc = dot(qtv, kv); - + TURBO_CENTROIDS_3BIT[idx0], + TURBO_CENTROIDS_3BIT[idx1], + TURBO_CENTROIDS_3BIT[idx2], + TURBO_CENTROIDS_3BIT[idx3]) * norm; + float acc = dot(qv, kv); + + // Threadgroup reduction. With threadgroup size == SIMD-group size == 32, + // simd_sum returns the full 128-element dot product to every lane and lane + // 0 writes the result. If the dispatch ever uses a larger threadgroup, + // this needs to switch to threadgroup-shared storage + barrier. float sum = simd_sum(acc); if (tid == 0) { scores[args.q_head * args.n_kv + kv_idx] = sum; @@ -166,7 +116,14 @@ kernel void kernel_turbo3_dot( // Multi-block-per-dispatch variant. Identical math; the threadgroup processes // `blocks_per_threadgroup` consecutive KV indices serially in a 32-thread loop, -// trading dispatch grid breadth for amortised launch tax. +// trading dispatch grid breadth for amortised launch tax. Bench shape: +// +// grid_x = ceil(n_kv / blocks_per_threadgroup) +// tg_x = 32 +// +// `args.q_head` / `args.head_offset_bytes` semantics unchanged. The shader +// derives the absolute kv index from `tg_pos.x * blocks_per_threadgroup + b` +// where `b` is the inner loop counter. struct turbo_dot_multi_args { uint head_dim; uint n_kv; @@ -183,14 +140,13 @@ kernel void kernel_turbo3_dot_multi( constant turbo_dot_multi_args & args [[buffer(3)]], uint tid [[thread_position_in_threadgroup]], uint tg_idx [[threadgroup_position_in_grid]]) { - threadgroup float q_t[128]; - device const float * q_head = q + args.q_head * args.head_dim; - eliza_tbq_precompute_qt(q_head, q_t, tid); - - uint elem0 = tid * 4; + uint elem0 = tid * 4u; uint blk_idx = elem0 >> 5; - uint within0 = elem0 & 31; - float4 qtv = float4(q_t[elem0 + 0], q_t[elem0 + 1], q_t[elem0 + 2], q_t[elem0 + 3]); + uint within0 = elem0 & 31u; + + uint q_base = args.q_head * args.head_dim + elem0; + device const float4 * q4 = (device const float4 *)(q + q_base); + float4 qv = q4[0]; uint kv_base = tg_idx * args.blocks_per_threadgroup; for (uint b = 0; b < args.blocks_per_threadgroup; ++b) { @@ -202,17 +158,20 @@ kernel void kernel_turbo3_dot_multi( + kv_idx * args.kv_stride_blocks; device const block_turbo3_0 & blk = grp[blk_idx]; float norm = float(blk.norm); - - uint c0 = tbq3_get_code_dev(blk.qs, within0 + 0u); - uint c1 = tbq3_get_code_dev(blk.qs, within0 + 1u); - uint c2 = tbq3_get_code_dev(blk.qs, within0 + 2u); - uint c3 = tbq3_get_code_dev(blk.qs, within0 + 3u); + uint qb = blk.qs[within0 >> 2]; + uint sb = blk.signs[within0 >> 3]; + + uint sign_shift = within0 & 7u; + uint idx0 = ((qb >> 0) & 0x3u) | (((sb >> (sign_shift + 0u)) & 0x1u) << 2); + uint idx1 = ((qb >> 2) & 0x3u) | (((sb >> (sign_shift + 1u)) & 0x1u) << 2); + uint idx2 = ((qb >> 4) & 0x3u) | (((sb >> (sign_shift + 2u)) & 0x1u) << 2); + uint idx3 = ((qb >> 6) & 0x3u) | (((sb >> (sign_shift + 3u)) & 0x1u) << 2); float4 kv = float4( - TBQ3_CODEBOOK[c0], - TBQ3_CODEBOOK[c1], - TBQ3_CODEBOOK[c2], - TBQ3_CODEBOOK[c3]) * norm; - float acc = dot(qtv, kv); + TURBO_CENTROIDS_3BIT[idx0], + TURBO_CENTROIDS_3BIT[idx1], + TURBO_CENTROIDS_3BIT[idx2], + TURBO_CENTROIDS_3BIT[idx3]) * norm; + float acc = dot(qv, kv); float sum = simd_sum(acc); if (tid == 0) { diff --git a/ggml/src/ggml-metal/eliza-shipped/turbo4.metal b/ggml/src/ggml-metal/eliza-shipped/turbo4.metal index 7554efeeb..c0886d420 100644 --- a/ggml/src/ggml-metal/eliza-shipped/turbo4.metal +++ b/ggml/src/ggml-metal/eliza-shipped/turbo4.metal @@ -1,6 +1,6 @@ -// # ELIZA-KERNEL-PATCH-V1 — copied verbatim from packages/inference/metal/turbo4.metal -// at build time by build-llama-cpp-dflash.mjs. Do not edit in place; -// edit the standalone source and rerun the build. +// HARDWARE VERIFIED on Apple M4 Max (Metal runtime JIT): standalone fixture +// harness plus built-fork GGML_OP_ATTN_SCORE_TBQ graph dispatch. +// Source-level verified against fork block_tbq4_0 in ggml-common.h. // // turbo4 KV cache dequant + Q·K dot product (Metal Shading Language). // @@ -8,24 +8,17 @@ // half norm // block RMS after TurboQuant preconditioning // uchar qs[16] // 4-bit indices packed like q4_0 // -// Element decode (matches CPU reference dequantize_row_tbq4_0): +// Element decode (matches reference / Python ground truth): // elem 0..31 within a 32-element block: // qb = qs[elem & 15] // idx = elem < 16 ? (qb & 0xF) : (qb >> 4) -// rotated = TURBO_CENTROIDS_4BIT[idx] * norm -// then: y = tbq_uncondition_block(rotated) -// = k_tbq_signs .* H32(rotated) +// k = TURBO_CENTROIDS_4BIT[idx] * norm // -// By the orthogonality of H32 (FWHT normalized by 1/sqrt(32)) and the -// distributivity of pointwise sign multiply, -// = -// so we precompute q_t = H32(q .* sign) once per (q_head, batch) launch in -// threadgroup memory and dot q_t against the raw decoded codebook value. -// This is bit-exactly equivalent to the CPU dequant + dot. +// Four 32-element blocks form one 128-element attention row. Graph pre-rotates +// Q, so the shader accumulates directly against the stored rotated codes. // -// Dispatch: one threadgroup per group of `blocks_per_threadgroup` consecutive -// KV tokens for a single (q_head, batch). Threadgroup size = 32 (one Apple -// SIMD-group). Each thread handles 4 of the 128 elements per token. +// Dispatch: one threadgroup per (n_kv, n_head). Threadgroup size MUST equal +// 32 (one Apple SIMD-group). Each thread handles 4 of the 128 elements. #include using namespace metal; @@ -42,58 +35,6 @@ constant float TURBO_CENTROIDS_4BIT[16] = { 1.2557391f, 1.6175243f, 2.0685055f, 2.7321365f, }; -// k_tbq_signs[QK_TBQ=32] from ggml/src/ggml-quants.c:59. Reused per block. -constant float TBQ_SIGNS_32[32] = { - 1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, - 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, - -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, - 1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -}; - -// In-place Fast Walsh–Hadamard transform on a 32-element block, with the -// 1/sqrt(32) normalization that makes H32 orthogonal. Mirrors -// tbq_hadamard32 in ggml/src/ggml-quants.c:104. -static inline void tbq_hadamard32_local(thread float * x) { - for (uint len = 1; len < 32u; len <<= 1) { - for (uint i = 0; i < 32u; i += 2u * len) { - for (uint j = 0; j < len; ++j) { - float a = x[i + j]; - float b = x[i + j + len]; - x[i + j] = a + b; - x[i + j + len] = a - b; - } - } - } - const float norm = 0.1767766952966369f; - for (uint i = 0; i < 32u; ++i) { - x[i] *= norm; - } -} - -// Precompute q_t[128] = H32(q .* k_tbq_signs) per 32-element block. -// Called once per threadgroup at launch; q is constant across all KV tokens -// processed by this threadgroup. -// -// Distributed across the 32-thread SIMD-group: threads 0..3 each own one of -// the 4 hadamard-32 blocks. Other threads idle through the barrier. -static inline void eliza_tbq_precompute_qt( - device const float * q_head, - threadgroup float * q_t, - uint tid) { - if (tid < 4u) { - thread float buf[32]; - uint base = tid * 32u; - for (uint i = 0; i < 32u; ++i) { - buf[i] = q_head[base + i] * TBQ_SIGNS_32[i]; - } - tbq_hadamard32_local(buf); - for (uint i = 0; i < 32u; ++i) { - q_t[base + i] = buf[i]; - } - } - threadgroup_barrier(mem_flags::mem_threadgroup); -} - struct turbo_dot_args { uint head_dim; // must be 128 uint n_kv; @@ -111,10 +52,6 @@ kernel void kernel_turbo4_dot( uint kv_idx [[threadgroup_position_in_grid]]) { if (kv_idx >= args.n_kv) return; - threadgroup float q_t[128]; - device const float * q_head = q + args.q_head * args.head_dim; - eliza_tbq_precompute_qt(q_head, q_t, tid); - device const block_turbo4_0 * grp = (device const block_turbo4_0 *)((device const uchar *)k_blocks + args.head_offset_bytes) + kv_idx * args.kv_stride_blocks; @@ -123,9 +60,10 @@ kernel void kernel_turbo4_dot( uint within0 = elem0 & 31; device const block_turbo4_0 & blk = grp[blk_idx]; float norm = float(blk.norm); + uint q_base = args.q_head * args.head_dim + elem0; - float4 qtv = float4(q_t[elem0 + 0], q_t[elem0 + 1], q_t[elem0 + 2], q_t[elem0 + 3]); - + device const float4 * q4 = (device const float4 *)(q + q_base); + float4 qv = q4[0]; uint qb0 = blk.qs[(within0 + 0u) & 15u]; uint qb1 = blk.qs[(within0 + 1u) & 15u]; uint qb2 = blk.qs[(within0 + 2u) & 15u]; @@ -140,7 +78,7 @@ kernel void kernel_turbo4_dot( TURBO_CENTROIDS_4BIT[idx1], TURBO_CENTROIDS_4BIT[idx2], TURBO_CENTROIDS_4BIT[idx3]) * norm; - float acc = dot(qtv, kv); + float acc = dot(qv, kv); float sum = simd_sum(acc); if (tid == 0) { @@ -167,14 +105,12 @@ kernel void kernel_turbo4_dot_multi( constant turbo_dot_multi_args & args [[buffer(3)]], uint tid [[thread_position_in_threadgroup]], uint tg_idx [[threadgroup_position_in_grid]]) { - threadgroup float q_t[128]; - device const float * q_head = q + args.q_head * args.head_dim; - eliza_tbq_precompute_qt(q_head, q_t, tid); - uint elem0 = tid * 4; uint blk_idx = elem0 >> 5; uint within0 = elem0 & 31; - float4 qtv = float4(q_t[elem0 + 0], q_t[elem0 + 1], q_t[elem0 + 2], q_t[elem0 + 3]); + uint q_base = args.q_head * args.head_dim + elem0; + device const float4 * q4 = (device const float4 *)(q + q_base); + float4 qv = q4[0]; uint kv_base = tg_idx * args.blocks_per_threadgroup; for (uint b = 0; b < args.blocks_per_threadgroup; ++b) { @@ -201,7 +137,7 @@ kernel void kernel_turbo4_dot_multi( TURBO_CENTROIDS_4BIT[idx1], TURBO_CENTROIDS_4BIT[idx2], TURBO_CENTROIDS_4BIT[idx3]) * norm; - float acc = dot(qtv, kv); + float acc = dot(qv, kv); float sum = simd_sum(acc); if (tid == 0) { diff --git a/tools/kokoro/CMakeLists.txt b/tools/kokoro/CMakeLists.txt index 7c4be7e4d..bae822186 100644 --- a/tools/kokoro/CMakeLists.txt +++ b/tools/kokoro/CMakeLists.txt @@ -45,10 +45,16 @@ endif() target_compile_features(kokoro_lib PUBLIC cxx_std_17) -# Standalone CLI harness — required by J2 verification (tools/voice-kokoro/). -add_executable(kokoro-tts tools/kokoro-tts.cpp) -target_link_libraries(kokoro-tts PRIVATE kokoro_lib) -install(TARGETS kokoro-tts RUNTIME) +# Standalone CLI harness — host-only (required by J2 verification, +# tools/voice-kokoro/). Skipped on iOS: there an executable is implicitly a +# MACOSX_BUNDLE app, so `install(TARGETS ... RUNTIME)` fails configure with +# "no BUNDLE DESTINATION". The fused mobile slice only needs kokoro_lib folded +# into libelizainference, never the CLI. +if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS") + add_executable(kokoro-tts tools/kokoro-tts.cpp) + target_link_libraries(kokoro-tts PRIVATE kokoro_lib) + install(TARGETS kokoro-tts RUNTIME) +endif() # Server-mount handler: compiled into kokoro_lib only when the server target # exists. The handler is guarded by `#ifdef LLAMA_BUILD_KOKORO` and pulls in diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt index 374576154..a5b52ac28 100644 --- a/tools/omnivoice/CMakeLists.txt +++ b/tools/omnivoice/CMakeLists.txt @@ -297,6 +297,46 @@ if(TARGET mtmd) "LINKER:-reexport_library,$") endif() add_dependencies(elizainference omnivoice-version) + + # Static variant for the mobile / iOS XCFramework slice. The mtp build + # (build-llama-cpp-mtp.mjs) builds BUILD_SHARED_LIBS=OFF and links the slice + # statically into LlamaCpp.xcframework — it builds `elizainference_static` + # BY NAME and collects libelizainference.a alongside the separately-built + # libllama.a / libmtmd.a / libkokoro_lib.a. A static archive carries only the + # FFI + core OBJECTS; the dependency archives' objects merge at the app's + # final link, so we record usage requirements (include dirs, ELIZA_ENABLE_* + # defines, the `if(TARGET kokoro_lib)` Kokoro fold) but do NOT reexport a + # library (that is a SHARED-only link option). + add_library(elizainference_static STATIC + ${OMNIVOICE_CORE_SOURCES} + ${OMNIVOICE_FFI_SOURCES}) + target_compile_features(elizainference_static PUBLIC cxx_std_17) + target_compile_definitions(elizainference_static + PRIVATE OMNIVOICE_BUILD ${OMNIVOICE_COMPILE_DEFINITIONS}) + if(ELIZA_ENABLE_VISION) + target_compile_definitions(elizainference_static PRIVATE ELIZA_ENABLE_VISION) + endif() + target_include_directories(elizainference_static PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_BINARY_DIR} + ${OMNIVOICE_LLAMA_PRIVATE_INCLUDE_DIRS}) + target_include_directories(elizainference_static PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../mtmd + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${CMAKE_CURRENT_SOURCE_DIR}/../../common) + target_link_libraries(elizainference_static PUBLIC llama mtmd) + target_link_libraries(elizainference_static PRIVATE eliza_voice_classifiers llama-common) + if(TARGET kokoro_lib) + target_link_libraries(elizainference_static PRIVATE kokoro_lib) + target_include_directories(elizainference_static PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include) + target_compile_definitions(elizainference_static PRIVATE ELIZA_ENABLE_KOKORO) + endif() + set_target_properties(elizainference_static PROPERTIES + OUTPUT_NAME elizainference + POSITION_INDEPENDENT_CODE ON) + add_dependencies(elizainference_static omnivoice-version) endif() # llama-server wiring — mount the OmniVoice POST /v1/audio/speech route diff --git a/tools/omnivoice/include/eliza-inference-ffi.h b/tools/omnivoice/include/eliza-inference-ffi.h index 075660a60..492626a99 100644 --- a/tools/omnivoice/include/eliza-inference-ffi.h +++ b/tools/omnivoice/include/eliza-inference-ffi.h @@ -853,7 +853,10 @@ int eliza_inference_llm_kv_quant_supported(void); * `cache_type_k` / `cache_type_v` (ABI v8): KV-cache quantization type names * (e.g. "f16", "q8_0", "qjl1_256", "q4_polar"). NULL leaves the llama.cpp * default (f16). Mapped to ggml_type and applied to cparams.type_k/type_v. - * Mirrors desktop-llama-adapter.ts's GGML_KV_CACHE_TYPES pass-through. */ + * Mirrors desktop-llama-adapter.ts's GGML_KV_CACHE_TYPES pass-through. + * + * `context_size` (ABI v9): runtime context window in tokens. <=0 falls back + * to ELIZA_LLM_N_CTX or the native default. */ typedef struct { int32_t max_tokens; float temperature; @@ -870,6 +873,7 @@ typedef struct { int32_t n_gpu_layers; /* -1 = default (all), 0 = CPU (ABI v8) */ const char * cache_type_k; /* KV K-cache quant name; NULL = f16 (ABI v8) */ const char * cache_type_v; /* KV V-cache quant name; NULL = f16 (ABI v8) */ + int32_t context_size; /* Runtime context tokens; <=0 = env/default (ABI v9) */ } eliza_llm_stream_config_t; /* Opaque streaming-LLM session. One per active generation. */ diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp index dc74dd8eb..2821dcd22 100644 --- a/tools/omnivoice/src/eliza-inference-ffi.cpp +++ b/tools/omnivoice/src/eliza-inference-ffi.cpp @@ -22,13 +22,16 @@ // ABI guard: the TS loader (ffi-llm-streaming-abi.ts) marshals // eliza_llm_stream_config_t by hand-written field offsets, so any reorder / // insert / type change on the C side silently corrupts every streaming-LLM -// call. Pin the on-the-wire layout (documented "sizeof config = 80" since v8): -// 6×int32 + 5×ptr + 4-byte fields packed to 80 bytes on a 64-bit ABI. Adding a -// field is an ABI bump — update this assert AND the TS marshaller together. +// call. Pin the on-the-wire layout. ABI v9 appended `context_size` (int32 at +// offset 80), so the packed size on a 64-bit ABI is 88 bytes (8×int32 + 5×ptr, +// pointer-aligned). The TS marshaller (ffi-bindings.ts) already allocs 88 and +// writes context_size at offset 80; this assert had simply not been bumped to +// match. Adding a field is an ABI bump — update this assert AND the TS +// marshaller together. static_assert( - sizeof(eliza_llm_stream_config_t) == 80, + sizeof(eliza_llm_stream_config_t) == 88, "eliza_llm_stream_config_t layout changed — bump ABI + update the TS " - "marshaller in ffi-llm-streaming-abi.ts, then update this assert."); + "marshaller in ffi-bindings.ts, then update this assert."); /* common/ — the same-file MTP speculative-decode engine wired into the * streaming-LLM text path (ABI v8) reuses the DRAFT_MTP implementation in @@ -2922,7 +2925,9 @@ EliLlmStream * eliza_inference_llm_stream_open( * batch / threads / flash-attn / KV-quant). */ llama_context_params cparams = llama_context_default_params(); const int n_ctx_train = llama_model_n_ctx_train(model); - int n_ctx = eliza_int_env_or_default("ELIZA_LLM_N_CTX", 8192); + int n_ctx = cfg->context_size > 0 + ? cfg->context_size + : eliza_int_env_or_default("ELIZA_LLM_N_CTX", 8192); if (n_ctx_train > 0 && n_ctx > n_ctx_train) n_ctx = n_ctx_train; cparams.n_ctx = (uint32_t) n_ctx; cparams.n_batch = (uint32_t) eliza_int_env_or_default("ELIZA_LLM_N_BATCH", 512);