rocketride-org · dsapandora · Apr 13, 2026 · Apr 13, 2026 · Apr 17, 2026 · Apr 17, 2026
@@ -0,0 +1,169 @@
+# =============================================================================
+# MIT License
+# Copyright (c) 2026 Aparavi Software AG
+# =============================================================================
+
+import base64
+import os
+import tempfile
+import wave
+from typing import Any, Dict, Optional
+
+import numpy as np
+
+from rocketlib import IGlobalBase, OPEN_MODE
+from ai.common.config import Config
+from ai.common.models.base import ModelClient, get_model_server_address
+
+
+_KOKORO_REPO_ID = 'hexgrad/Kokoro-82M'
+_KOKORO_LOADER_TYPE = 'kokoro'
+_KOKORO_SAMPLE_RATE = 24000
+_INT16_MAX = 32767
+_WAV_MIME = 'audio/wav'
+
+
+class IGlobal(IGlobalBase):
+    """Kokoro-only TTS node global state.
+
+    Holds either a local ``KPipeline`` (when no model server is configured) or a
+    ``ModelClient`` bound to a remote Kokoro loader. The local path needs the
+    ``kokoro``/``soundfile`` wheels and the ``en_core_web_sm`` spaCy model; the
+    remote path only needs the base client libraries.
+    """
+
+    _voice: str
+    _lang: str
+    _pipeline: Optional[Any] = None
+    _remote_client: Optional[Any] = None
+
+    def beginGlobal(self):
+        """Initialise local pipeline or remote client from the node configuration.
+
+        No-op when the endpoint is opened in ``CONFIG`` mode (the UI only needs
+        the schema). Otherwise validates that a voice is configured, then either
+        connects to the model server — skipping the heavy local dependency
+        install — or installs the local requirements and constructs a
+        ``KPipeline``.
+        """
+        if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
+            return
+
+        cfg = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
+        voice = str(cfg.get('kokoro_voice') or '').strip()
+        if not voice:
+            raise Exception('Kokoro: choose a voice from the list')
+        self._voice = voice
+        self._lang = voice[0]
+
+        addr = get_model_server_address()
+        if addr:
+            host, port = addr
+            self._remote_client = ModelClient(port, host)
+            self._remote_client.load_model(
+                _KOKORO_REPO_ID,
+                _KOKORO_LOADER_TYPE,
+                {'lang_code': self._lang, 'repo_id': _KOKORO_REPO_ID},
+            )
+        else:
+            from depends import depends  # type: ignore
+
+            depends(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt'))
+
+            self._ensure_spacy_en_model()
+            from kokoro import KPipeline
+
+            self._pipeline = KPipeline(lang_code=self._lang)
+
+    @staticmethod
+    def _ensure_spacy_en_model() -> None:
+        """Install ``en_core_web_sm`` matching the installed spaCy version (GitHub wheel)."""
+        try:
+            import en_core_web_sm  # noqa: F401
+            return
+        except ImportError:
+            pass
+        try:
+            import spacy
+        except ImportError:
+            return
+        import subprocess
+        import sys
+
+        major, minor = spacy.__version__.split('.')[:2]
+        model_ver = f'{major}.{minor}.0'
+        url = f'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-{model_ver}/en_core_web_sm-{model_ver}-py3-none-any.whl'
+        subprocess.run(
+            [sys.executable, '-m', 'pip', 'install', url],
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+
+    def synthesize(self, text: str) -> Dict[str, Any]:
+        """Synthesise ``text`` to a temporary WAV file and return its path.
+
+        Writes a freshly-allocated file under the system temp dir and returns
+        ``{'path': <abs path>, 'mime_type': 'audio/wav'}``. The caller owns the
+        file and is responsible for deleting it once the bytes have been
+        streamed. On any synthesis error the temp file is removed before the
+        exception propagates so there are no orphans on disk.
+        """
+        fd, out_path = tempfile.mkstemp(prefix='tts_', suffix='.wav')
+        os.close(fd)
+        try:
+            if self._remote_client is not None:
+                body = self._remote_client.send_command(
+                    'inference',
+                    {
+                        'inputs': [{'text': text, 'voice': self._voice, 'speed': 1}],
+                        'output_fields': ['wav_base64'],
+                    },
+                )
+                rows = body.get('result') or []
+                if not rows:
+                    raise ValueError('Kokoro model server returned no result')
+                wav_bytes = base64.b64decode(rows[0]['wav_base64'])
+                with open(out_path, 'wb') as f:
+                    f.write(wav_bytes)
+            else:
+                chunks: list[np.ndarray] = []
+                for _gs, _ps, audio in self._pipeline(text, voice=self._voice, speed=1):
+                    if audio is None:
+                        continue
+                    if hasattr(audio, 'detach'):
+                        arr = audio.detach().cpu().numpy().astype(np.float32)
+                    else:
+                        arr = np.asarray(audio, dtype=np.float32)
+                    if arr.size == 0:
+                        continue
+                    if arr.ndim > 1:
+                        arr = arr.reshape(-1)
+                    chunks.append(arr)
+                if not chunks:
+                    raise ValueError('Kokoro returned no audio samples')
+                audio_arr = np.concatenate(chunks) if len(chunks) > 1 else chunks[0]
+                audio_arr = np.clip(audio_arr, -1.0, 1.0)
+                with wave.open(out_path, 'wb') as wavf:
+                    wavf.setnchannels(1)
+                    wavf.setsampwidth(2)
+                    wavf.setframerate(_KOKORO_SAMPLE_RATE)
+                    wavf.writeframes((audio_arr * _INT16_MAX).astype(np.int16).tobytes())
+            return {'path': out_path, 'mime_type': _WAV_MIME}
+        except Exception:
+            try:
+                os.remove(out_path)
+            except OSError:
+                pass
+            raise
+
+    def endGlobal(self):
+        """Release the local pipeline and disconnect the remote client, if any."""
+        self._pipeline = None
+        client = self._remote_client
+        if client is not None:
+            try:
+                client.disconnect()
+            except Exception:
+                pass
+            self._remote_client = None
@@ -0,0 +1,52 @@
+# =============================================================================
+# MIT License
+# Copyright (c) 2026 Aparavi Software AG
+# =============================================================================
+
+import os
+from typing import Optional
+
+from rocketlib import IInstanceBase, AVI_ACTION, warning
+from .IGlobal import IGlobal
+
+
+class IInstance(IInstanceBase):
+    IGlobal: IGlobal
+
+    def writeDocuments(self, documents):
+        docs = documents if isinstance(documents, list) else [documents]
+        text = '\n'.join(doc.page_content for doc in docs if doc.page_content and doc.type not in ('Image', 'Audio', 'Video'))
+        self.writeText(text)
+
+    def writeQuestions(self, question):
+        text = ' '.join(q.text for q in question.questions) if question.questions else ''
+        self.writeText(text)
+
+    def writeAnswers(self, answer):
+        text = answer.getText() if hasattr(answer, 'getText') else str(answer)
+        self.writeText(text)
+
+    def writeText(self, text: str):
+        value = (text or '').strip()
+        if not value:
+            return
+
+        temp_path: Optional[str] = None
+        try:
+            payload = self.IGlobal.synthesize(value)
+            temp_path = payload['path']
+            with open(temp_path, 'rb') as fin:
+                raw = fin.read()
+            mime = payload['mime_type']
+            self.instance.writeAudio(AVI_ACTION.BEGIN, mime)
+            self.instance.writeAudio(AVI_ACTION.WRITE, mime, raw)
+            self.instance.writeAudio(AVI_ACTION.END, mime)
+        except Exception as e:
+            warning(f'TTS synthesis failed: {e}')
+            raise
+        finally:
+            if temp_path:
+                try:
+                    os.remove(temp_path)
+                except OSError:
+                    pass
@@ -0,0 +1,23 @@
+# Audio TTS Node (`audio_tts`) — Kokoro (RR-411 phase 1)
+
+Text-to-speech node using **Kokoro-82M** only. Additional engines (Piper, Bark, cloud) land in follow-up PRs.
+
+## Behavior (aligned with reference branch for `engine: kokoro`)
+
+- **Input:** `text` lane
+- **Output:** `audio` lane — WAV bytes via `writeAudio` (BEGIN / WRITE / END) with MIME `audio/wav`
+- **Local:** `kokoro.KPipeline`, spaCy `en_core_web_sm` via `ensure_spacy_en_model()`
+- **`--modelserver`:** `ModelClient` + `KokoroLoader` on the server
+
+## Configuration
+
+- Profile **`kokoro`** — `kokoro_voice` dropdown in `services.json`
+- Language code is the **first character** of the voice id (`af_*` → `a`, etc.), same as reference
+
+## Dependencies
+
+See `requirements.txt`: `numpy`, `kokoro`, `soundfile`.
+
+## Troubleshooting (`Exception: 1` / wasabi)
+
+If misaki/spaCy initialization fails, see the full multi-engine README on the reference branch; this node uses the same `spacy_en_model` helper as reference.
@@ -0,0 +1,27 @@
+# =============================================================================
+# MIT License
+# Copyright (c) 2026 Aparavi Software AG
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+# =============================================================================
+
+from .IGlobal import IGlobal
+from .IInstance import IInstance
+
+__all__ = ['IGlobal', 'IInstance']
@@ -0,0 +1,5 @@
+numpy
+# Kokoro-82M: official inference package (not transformers AutoModel)
+kokoro>=0.9.4
+soundfile>=0.12.0
+# en_core_web_sm is installed dynamically via spacy_en_model.ensure_spacy_en_model()