-
Notifications
You must be signed in to change notification settings - Fork 47
feat(audio_tts): add Kokoro-only Text To Speech node (RR-411) #662
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,169 @@ | ||
| # ============================================================================= | ||
| # MIT License | ||
| # Copyright (c) 2026 Aparavi Software AG | ||
| # ============================================================================= | ||
|
|
||
| import base64 | ||
| import os | ||
| import tempfile | ||
| import wave | ||
| from typing import Any, Dict, Optional | ||
|
|
||
| import numpy as np | ||
|
|
||
| from rocketlib import IGlobalBase, OPEN_MODE | ||
| from ai.common.config import Config | ||
| from ai.common.models.base import ModelClient, get_model_server_address | ||
|
|
||
|
|
||
| _KOKORO_REPO_ID = 'hexgrad/Kokoro-82M' | ||
| _KOKORO_LOADER_TYPE = 'kokoro' | ||
| _KOKORO_SAMPLE_RATE = 24000 | ||
| _INT16_MAX = 32767 | ||
| _WAV_MIME = 'audio/wav' | ||
|
|
||
|
|
||
| class IGlobal(IGlobalBase): | ||
| """Kokoro-only TTS node global state. | ||
|
|
||
| Holds either a local ``KPipeline`` (when no model server is configured) or a | ||
| ``ModelClient`` bound to a remote Kokoro loader. The local path needs the | ||
| ``kokoro``/``soundfile`` wheels and the ``en_core_web_sm`` spaCy model; the | ||
| remote path only needs the base client libraries. | ||
| """ | ||
|
|
||
| _voice: str | ||
| _lang: str | ||
| _pipeline: Optional[Any] = None | ||
| _remote_client: Optional[Any] = None | ||
|
|
||
| def beginGlobal(self): | ||
| """Initialise local pipeline or remote client from the node configuration. | ||
|
|
||
| No-op when the endpoint is opened in ``CONFIG`` mode (the UI only needs | ||
| the schema). Otherwise validates that a voice is configured, then either | ||
| connects to the model server — skipping the heavy local dependency | ||
| install — or installs the local requirements and constructs a | ||
| ``KPipeline``. | ||
| """ | ||
| if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG: | ||
| return | ||
|
|
||
| cfg = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig) | ||
| voice = str(cfg.get('kokoro_voice') or '').strip() | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| if not voice: | ||
| raise Exception('Kokoro: choose a voice from the list') | ||
| self._voice = voice | ||
| self._lang = voice[0] | ||
|
|
||
| addr = get_model_server_address() | ||
| if addr: | ||
| host, port = addr | ||
| self._remote_client = ModelClient(port, host) | ||
| self._remote_client.load_model( | ||
| _KOKORO_REPO_ID, | ||
| _KOKORO_LOADER_TYPE, | ||
| {'lang_code': self._lang, 'repo_id': _KOKORO_REPO_ID}, | ||
| ) | ||
| else: | ||
| from depends import depends # type: ignore | ||
|
|
||
| depends(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')) | ||
|
|
||
| self._ensure_spacy_en_model() | ||
| from kokoro import KPipeline | ||
|
|
||
| self._pipeline = KPipeline(lang_code=self._lang) | ||
|
|
||
| @staticmethod | ||
| def _ensure_spacy_en_model() -> None: | ||
| """Install ``en_core_web_sm`` matching the installed spaCy version (GitHub wheel).""" | ||
| try: | ||
| import en_core_web_sm # noqa: F401 | ||
| return | ||
| except ImportError: | ||
| pass | ||
| try: | ||
| import spacy | ||
| except ImportError: | ||
| return | ||
| import subprocess | ||
| import sys | ||
|
|
||
| major, minor = spacy.__version__.split('.')[:2] | ||
| model_ver = f'{major}.{minor}.0' | ||
| url = f'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-{model_ver}/en_core_web_sm-{model_ver}-py3-none-any.whl' | ||
| subprocess.run( | ||
| [sys.executable, '-m', 'pip', 'install', url], | ||
| check=True, | ||
| stdout=subprocess.DEVNULL, | ||
| stderr=subprocess.DEVNULL, | ||
| ) | ||
|
|
||
| def synthesize(self, text: str) -> Dict[str, Any]: | ||
| """Synthesise ``text`` to a temporary WAV file and return its path. | ||
|
|
||
| Writes a freshly-allocated file under the system temp dir and returns | ||
| ``{'path': <abs path>, 'mime_type': 'audio/wav'}``. The caller owns the | ||
| file and is responsible for deleting it once the bytes have been | ||
| streamed. On any synthesis error the temp file is removed before the | ||
| exception propagates so there are no orphans on disk. | ||
| """ | ||
| fd, out_path = tempfile.mkstemp(prefix='tts_', suffix='.wav') | ||
| os.close(fd) | ||
| try: | ||
| if self._remote_client is not None: | ||
| body = self._remote_client.send_command( | ||
| 'inference', | ||
| { | ||
| 'inputs': [{'text': text, 'voice': self._voice, 'speed': 1}], | ||
| 'output_fields': ['wav_base64'], | ||
| }, | ||
| ) | ||
| rows = body.get('result') or [] | ||
| if not rows: | ||
| raise ValueError('Kokoro model server returned no result') | ||
| wav_bytes = base64.b64decode(rows[0]['wav_base64']) | ||
| with open(out_path, 'wb') as f: | ||
| f.write(wav_bytes) | ||
| else: | ||
| chunks: list[np.ndarray] = [] | ||
| for _gs, _ps, audio in self._pipeline(text, voice=self._voice, speed=1): | ||
| if audio is None: | ||
| continue | ||
| if hasattr(audio, 'detach'): | ||
| arr = audio.detach().cpu().numpy().astype(np.float32) | ||
| else: | ||
| arr = np.asarray(audio, dtype=np.float32) | ||
| if arr.size == 0: | ||
| continue | ||
| if arr.ndim > 1: | ||
| arr = arr.reshape(-1) | ||
| chunks.append(arr) | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| if not chunks: | ||
| raise ValueError('Kokoro returned no audio samples') | ||
| audio_arr = np.concatenate(chunks) if len(chunks) > 1 else chunks[0] | ||
| audio_arr = np.clip(audio_arr, -1.0, 1.0) | ||
| with wave.open(out_path, 'wb') as wavf: | ||
| wavf.setnchannels(1) | ||
| wavf.setsampwidth(2) | ||
| wavf.setframerate(_KOKORO_SAMPLE_RATE) | ||
| wavf.writeframes((audio_arr * _INT16_MAX).astype(np.int16).tobytes()) | ||
| return {'path': out_path, 'mime_type': _WAV_MIME} | ||
| except Exception: | ||
| try: | ||
| os.remove(out_path) | ||
| except OSError: | ||
| pass | ||
| raise | ||
|
Comment on lines
+153
to
+158
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Consider The Proposed fix+import contextlib
...
except Exception:
- try:
- os.remove(out_path)
- except OSError:
- pass
+ with contextlib.suppress(OSError):
+ os.remove(out_path)
raise🧰 Tools🪛 Ruff (0.15.9)[warning] 101-104: Use Replace (SIM105) 🤖 Prompt for AI Agents |
||
|
|
||
| def endGlobal(self): | ||
| """Release the local pipeline and disconnect the remote client, if any.""" | ||
| self._pipeline = None | ||
| client = self._remote_client | ||
| if client is not None: | ||
| try: | ||
| client.disconnect() | ||
| except Exception: | ||
| pass | ||
| self._remote_client = None | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| # ============================================================================= | ||
| # MIT License | ||
| # Copyright (c) 2026 Aparavi Software AG | ||
| # ============================================================================= | ||
|
|
||
| import os | ||
| from typing import Optional | ||
|
|
||
| from rocketlib import IInstanceBase, AVI_ACTION, warning | ||
| from .IGlobal import IGlobal | ||
|
|
||
|
|
||
| class IInstance(IInstanceBase): | ||
| IGlobal: IGlobal | ||
|
|
||
| def writeDocuments(self, documents): | ||
| docs = documents if isinstance(documents, list) else [documents] | ||
| text = '\n'.join(doc.page_content for doc in docs if doc.page_content and doc.type not in ('Image', 'Audio', 'Video')) | ||
| self.writeText(text) | ||
|
|
||
| def writeQuestions(self, question): | ||
| text = ' '.join(q.text for q in question.questions) if question.questions else '' | ||
| self.writeText(text) | ||
|
|
||
| def writeAnswers(self, answer): | ||
| text = answer.getText() if hasattr(answer, 'getText') else str(answer) | ||
| self.writeText(text) | ||
|
|
||
| def writeText(self, text: str): | ||
| value = (text or '').strip() | ||
| if not value: | ||
| return | ||
|
|
||
| temp_path: Optional[str] = None | ||
| try: | ||
| payload = self.IGlobal.synthesize(value) | ||
| temp_path = payload['path'] | ||
| with open(temp_path, 'rb') as fin: | ||
| raw = fin.read() | ||
| mime = payload['mime_type'] | ||
| self.instance.writeAudio(AVI_ACTION.BEGIN, mime) | ||
| self.instance.writeAudio(AVI_ACTION.WRITE, mime, raw) | ||
| self.instance.writeAudio(AVI_ACTION.END, mime) | ||
|
Comment on lines
+41
to
+43
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
set -euo pipefail
python - <<'PY'
import ast, pathlib
base = pathlib.Path('packages/server/engine-lib/rocketlib-python/lib/rocketlib/filters.py')
tree = ast.parse(base.read_text())
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name == 'writeAudio':
args = [a.arg for a in node.args.args]
print('IInstanceBase.writeAudio signature args:', args)
print('defaults count:', len(node.args.defaults))
break
target = pathlib.Path('nodes/src/nodes/audio_tts/IInstance.py')
t = ast.parse(target.read_text())
for node in ast.walk(t):
if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute) and node.func.attr == 'writeAudio':
print(f'Line {node.lineno}: positional args passed = {len(node.args)}')
PYRepository: rocketride-org/rocketride-server Length of output: 283 🏁 Script executed: cat -n nodes/src/nodes/audio_tts/IInstance.py | sed -n '40,55p'Repository: rocketride-org/rocketride-server Length of output: 772 🏁 Script executed: cat -n packages/server/engine-lib/rocketlib-python/lib/rocketlib/filters.py | grep -A 15 'def writeAudio'Repository: rocketride-org/rocketride-server Length of output: 1512 Fix Lines 47 and 49 call Fix- self.instance.writeAudio(AVI_ACTION.BEGIN, mime)
+ self.instance.writeAudio(AVI_ACTION.BEGIN, mime, b'')
self.instance.writeAudio(AVI_ACTION.WRITE, mime, raw)
- self.instance.writeAudio(AVI_ACTION.END, mime)
+ self.instance.writeAudio(AVI_ACTION.END, mime, b'')🤖 Prompt for AI Agents |
||
| except Exception as e: | ||
| warning(f'TTS synthesis failed: {e}') | ||
| raise | ||
| finally: | ||
| if temp_path: | ||
| try: | ||
| os.remove(temp_path) | ||
| except OSError: | ||
| pass | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| # Audio TTS Node (`audio_tts`) — Kokoro (RR-411 phase 1) | ||
|
|
||
| Text-to-speech node using **Kokoro-82M** only. Additional engines (Piper, Bark, cloud) land in follow-up PRs. | ||
|
|
||
| ## Behavior (aligned with reference branch for `engine: kokoro`) | ||
|
|
||
| - **Input:** `text` lane | ||
| - **Output:** `audio` lane — WAV bytes via `writeAudio` (BEGIN / WRITE / END) with MIME `audio/wav` | ||
| - **Local:** `kokoro.KPipeline`, spaCy `en_core_web_sm` via `ensure_spacy_en_model()` | ||
| - **`--modelserver`:** `ModelClient` + `KokoroLoader` on the server | ||
|
|
||
| ## Configuration | ||
|
|
||
| - Profile **`kokoro`** — `kokoro_voice` dropdown in `services.json` | ||
| - Language code is the **first character** of the voice id (`af_*` → `a`, etc.), same as reference | ||
|
|
||
| ## Dependencies | ||
|
|
||
| See `requirements.txt`: `numpy`, `kokoro`, `soundfile`. | ||
|
|
||
| ## Troubleshooting (`Exception: 1` / wasabi) | ||
|
|
||
| If misaki/spaCy initialization fails, see the full multi-engine README on the reference branch; this node uses the same `spacy_en_model` helper as reference. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| # ============================================================================= | ||
| # MIT License | ||
| # Copyright (c) 2026 Aparavi Software AG | ||
| # | ||
| # Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| # of this software and associated documentation files (the "Software"), to deal | ||
| # in the Software without restriction, including without limitation the rights | ||
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| # copies of the Software, and to permit persons to whom the Software is | ||
| # furnished to do so, subject to the following conditions: | ||
| # | ||
| # The above copyright notice and this permission notice shall be included in | ||
| # all copies or substantial portions of the Software. | ||
| # | ||
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
| # THE SOFTWARE. | ||
| # ============================================================================= | ||
|
|
||
| from .IGlobal import IGlobal | ||
| from .IInstance import IInstance | ||
|
|
||
| __all__ = ['IGlobal', 'IInstance'] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| numpy | ||
| # Kokoro-82M: official inference package (not transformers AutoModel) | ||
| kokoro>=0.9.4 | ||
| soundfile>=0.12.0 | ||
| # en_core_web_sm is installed dynamically via spacy_en_model.ensure_spacy_en_model() |
Uh oh!
There was an error while loading. Please reload this page.