Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions nodes/src/nodes/audio_tts/IGlobal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# =============================================================================
# MIT License
# Copyright (c) 2026 Aparavi Software AG
# =============================================================================

import base64
import os
import tempfile
import wave
from typing import Any, Dict, Optional

import numpy as np

from rocketlib import IGlobalBase, OPEN_MODE
from ai.common.config import Config
from ai.common.models.base import ModelClient, get_model_server_address


_KOKORO_REPO_ID = 'hexgrad/Kokoro-82M'
_KOKORO_LOADER_TYPE = 'kokoro'
_KOKORO_SAMPLE_RATE = 24000
_INT16_MAX = 32767
_WAV_MIME = 'audio/wav'


class IGlobal(IGlobalBase):
"""Kokoro-only TTS node global state.

Holds either a local ``KPipeline`` (when no model server is configured) or a
``ModelClient`` bound to a remote Kokoro loader. The local path needs the
``kokoro``/``soundfile`` wheels and the ``en_core_web_sm`` spaCy model; the
remote path only needs the base client libraries.
"""

_voice: str
_lang: str
_pipeline: Optional[Any] = None
_remote_client: Optional[Any] = None

def beginGlobal(self):
Comment thread
coderabbitai[bot] marked this conversation as resolved.
"""Initialise local pipeline or remote client from the node configuration.

No-op when the endpoint is opened in ``CONFIG`` mode (the UI only needs
the schema). Otherwise validates that a voice is configured, then either
connects to the model server — skipping the heavy local dependency
install — or installs the local requirements and constructs a
``KPipeline``.
"""
if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
return

cfg = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
voice = str(cfg.get('kokoro_voice') or '').strip()
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if not voice:
raise Exception('Kokoro: choose a voice from the list')
self._voice = voice
self._lang = voice[0]

addr = get_model_server_address()
if addr:
host, port = addr
self._remote_client = ModelClient(port, host)
self._remote_client.load_model(
_KOKORO_REPO_ID,
_KOKORO_LOADER_TYPE,
{'lang_code': self._lang, 'repo_id': _KOKORO_REPO_ID},
)
else:
from depends import depends # type: ignore

depends(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt'))

self._ensure_spacy_en_model()
from kokoro import KPipeline

self._pipeline = KPipeline(lang_code=self._lang)

@staticmethod
def _ensure_spacy_en_model() -> None:
"""Install ``en_core_web_sm`` matching the installed spaCy version (GitHub wheel)."""
try:
import en_core_web_sm # noqa: F401
return
except ImportError:
pass
try:
import spacy
except ImportError:
return
import subprocess
import sys

major, minor = spacy.__version__.split('.')[:2]
model_ver = f'{major}.{minor}.0'
url = f'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-{model_ver}/en_core_web_sm-{model_ver}-py3-none-any.whl'
subprocess.run(
[sys.executable, '-m', 'pip', 'install', url],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)

def synthesize(self, text: str) -> Dict[str, Any]:
"""Synthesise ``text`` to a temporary WAV file and return its path.

Writes a freshly-allocated file under the system temp dir and returns
``{'path': <abs path>, 'mime_type': 'audio/wav'}``. The caller owns the
file and is responsible for deleting it once the bytes have been
streamed. On any synthesis error the temp file is removed before the
exception propagates so there are no orphans on disk.
"""
fd, out_path = tempfile.mkstemp(prefix='tts_', suffix='.wav')
os.close(fd)
try:
if self._remote_client is not None:
body = self._remote_client.send_command(
'inference',
{
'inputs': [{'text': text, 'voice': self._voice, 'speed': 1}],
'output_fields': ['wav_base64'],
},
)
rows = body.get('result') or []
if not rows:
raise ValueError('Kokoro model server returned no result')
wav_bytes = base64.b64decode(rows[0]['wav_base64'])
with open(out_path, 'wb') as f:
f.write(wav_bytes)
else:
chunks: list[np.ndarray] = []
for _gs, _ps, audio in self._pipeline(text, voice=self._voice, speed=1):
if audio is None:
continue
if hasattr(audio, 'detach'):
arr = audio.detach().cpu().numpy().astype(np.float32)
else:
arr = np.asarray(audio, dtype=np.float32)
if arr.size == 0:
continue
if arr.ndim > 1:
arr = arr.reshape(-1)
chunks.append(arr)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if not chunks:
raise ValueError('Kokoro returned no audio samples')
audio_arr = np.concatenate(chunks) if len(chunks) > 1 else chunks[0]
audio_arr = np.clip(audio_arr, -1.0, 1.0)
with wave.open(out_path, 'wb') as wavf:
wavf.setnchannels(1)
wavf.setsampwidth(2)
wavf.setframerate(_KOKORO_SAMPLE_RATE)
wavf.writeframes((audio_arr * _INT16_MAX).astype(np.int16).tobytes())
return {'path': out_path, 'mime_type': _WAV_MIME}
except Exception:
try:
os.remove(out_path)
except OSError:
pass
raise
Comment on lines +153 to +158
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Consider contextlib.suppress for cleaner exception handling.

The try-except-pass pattern can be simplified using contextlib.suppress(OSError).

Proposed fix
+import contextlib
...
         except Exception:
-            try:
-                os.remove(out_path)
-            except OSError:
-                pass
+            with contextlib.suppress(OSError):
+                os.remove(out_path)
             raise
🧰 Tools
🪛 Ruff (0.15.9)

[warning] 101-104: Use contextlib.suppress(OSError) instead of try-except-pass

Replace try-except-pass with with contextlib.suppress(OSError): ...

(SIM105)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nodes/src/nodes/audio_tts/IGlobal.py` around lines 100 - 105, Replace the
try/except/pass used to remove a temporary file inside the except Exception
handler with contextlib.suppress for clarity: import contextlib at top and
change the inner block that currently attempts to os.remove(out_path) inside the
except Exception in IGlobal.py to use with contextlib.suppress(OSError):
os.remove(out_path), then re-raise the original exception as before.


def endGlobal(self):
"""Release the local pipeline and disconnect the remote client, if any."""
self._pipeline = None
client = self._remote_client
if client is not None:
try:
client.disconnect()
except Exception:
pass
self._remote_client = None
52 changes: 52 additions & 0 deletions nodes/src/nodes/audio_tts/IInstance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# =============================================================================
# MIT License
# Copyright (c) 2026 Aparavi Software AG
# =============================================================================

import os
from typing import Optional

from rocketlib import IInstanceBase, AVI_ACTION, warning
from .IGlobal import IGlobal


class IInstance(IInstanceBase):
IGlobal: IGlobal

def writeDocuments(self, documents):
docs = documents if isinstance(documents, list) else [documents]
text = '\n'.join(doc.page_content for doc in docs if doc.page_content and doc.type not in ('Image', 'Audio', 'Video'))
self.writeText(text)

def writeQuestions(self, question):
text = ' '.join(q.text for q in question.questions) if question.questions else ''
self.writeText(text)

def writeAnswers(self, answer):
text = answer.getText() if hasattr(answer, 'getText') else str(answer)
self.writeText(text)

def writeText(self, text: str):
value = (text or '').strip()
if not value:
return

temp_path: Optional[str] = None
try:
payload = self.IGlobal.synthesize(value)
temp_path = payload['path']
with open(temp_path, 'rb') as fin:
raw = fin.read()
mime = payload['mime_type']
self.instance.writeAudio(AVI_ACTION.BEGIN, mime)
self.instance.writeAudio(AVI_ACTION.WRITE, mime, raw)
self.instance.writeAudio(AVI_ACTION.END, mime)
Comment on lines +41 to +43
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

python - <<'PY'
import ast, pathlib

base = pathlib.Path('packages/server/engine-lib/rocketlib-python/lib/rocketlib/filters.py')
tree = ast.parse(base.read_text())
for node in ast.walk(tree):
    if isinstance(node, ast.FunctionDef) and node.name == 'writeAudio':
        args = [a.arg for a in node.args.args]
        print('IInstanceBase.writeAudio signature args:', args)
        print('defaults count:', len(node.args.defaults))
        break

target = pathlib.Path('nodes/src/nodes/audio_tts/IInstance.py')
t = ast.parse(target.read_text())
for node in ast.walk(t):
    if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute) and node.func.attr == 'writeAudio':
        print(f'Line {node.lineno}: positional args passed = {len(node.args)}')
PY

Repository: rocketride-org/rocketride-server

Length of output: 283


🏁 Script executed:

cat -n nodes/src/nodes/audio_tts/IInstance.py | sed -n '40,55p'

Repository: rocketride-org/rocketride-server

Length of output: 772


🏁 Script executed:

cat -n packages/server/engine-lib/rocketlib-python/lib/rocketlib/filters.py | grep -A 15 'def writeAudio'

Repository: rocketride-org/rocketride-server

Length of output: 1512


Fix writeAudio calls on BEGIN and END actions to include empty buffer.

Lines 47 and 49 call writeAudio(action, mimeType) with 2 arguments, but the method signature requires 3: (action, mimeType, buffer). This will fail at runtime with a TypeError for missing the required buffer parameter. BEGIN and END are action markers without a data payload, so pass an empty bytes object.

Fix
-            self.instance.writeAudio(AVI_ACTION.BEGIN, mime)
+            self.instance.writeAudio(AVI_ACTION.BEGIN, mime, b'')
             self.instance.writeAudio(AVI_ACTION.WRITE, mime, raw)
-            self.instance.writeAudio(AVI_ACTION.END, mime)
+            self.instance.writeAudio(AVI_ACTION.END, mime, b'')
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@nodes/src/nodes/audio_tts/IInstance.py` around lines 47 - 49, The calls to
writeAudio in IInstance (writeAudio(AVI_ACTION.BEGIN, mime) and
writeAudio(AVI_ACTION.END, mime)) are missing the required third parameter
(buffer); update those calls to pass an empty bytes object as the buffer (e.g.,
b'') so they match the signature writeAudio(action, mimeType, buffer) and avoid
a TypeError at runtime when using AVI_ACTION.BEGIN and AVI_ACTION.END.

except Exception as e:
warning(f'TTS synthesis failed: {e}')
raise
finally:
if temp_path:
try:
os.remove(temp_path)
except OSError:
pass
23 changes: 23 additions & 0 deletions nodes/src/nodes/audio_tts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Audio TTS Node (`audio_tts`) — Kokoro (RR-411 phase 1)

Text-to-speech node using **Kokoro-82M** only. Additional engines (Piper, Bark, cloud) land in follow-up PRs.

## Behavior (aligned with reference branch for `engine: kokoro`)

- **Input:** `text` lane
- **Output:** `audio` lane — WAV bytes via `writeAudio` (BEGIN / WRITE / END) with MIME `audio/wav`
- **Local:** `kokoro.KPipeline`, spaCy `en_core_web_sm` via `ensure_spacy_en_model()`
- **`--modelserver`:** `ModelClient` + `KokoroLoader` on the server

## Configuration

- Profile **`kokoro`** — `kokoro_voice` dropdown in `services.json`
- Language code is the **first character** of the voice id (`af_*` → `a`, etc.), same as reference

## Dependencies

See `requirements.txt`: `numpy`, `kokoro`, `soundfile`.

## Troubleshooting (`Exception: 1` / wasabi)

If misaki/spaCy initialization fails, see the full multi-engine README on the reference branch; this node uses the same `spacy_en_model` helper as reference.
27 changes: 27 additions & 0 deletions nodes/src/nodes/audio_tts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# =============================================================================
# MIT License
# Copyright (c) 2026 Aparavi Software AG
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# =============================================================================

from .IGlobal import IGlobal
from .IInstance import IInstance

__all__ = ['IGlobal', 'IInstance']
5 changes: 5 additions & 0 deletions nodes/src/nodes/audio_tts/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
numpy
# Kokoro-82M: official inference package (not transformers AutoModel)
kokoro>=0.9.4
soundfile>=0.12.0
# en_core_web_sm is installed dynamically via spacy_en_model.ensure_spacy_en_model()
Loading
Loading