diff --git a/README.md b/README.md index 4f915844..f14ac250 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,9 @@ For full architectural details, see [project/docs/system_design.md](project/docs | Component | Description | |-----------|-------------| +| **AudioIn** | Audio capture from a selectable input device (including virtual cables) | | **Mic** | Microphone audio capture via sounddevice | +| **RemoteAudioIn** | Audio capture from a remote bridge server over WebSocket | | **Camera** | Webcam video capture with device enumeration | | **VideoPlayer** | Video file playback (looping) | | **TextInput** | Text input from the node UI | @@ -295,7 +297,9 @@ For full architectural details, see [project/docs/system_design.md](project/docs | Component | Description | |-----------|-------------| +| **AudioOut** | Audio playback to a selectable output device (including virtual cables) | | **Speaker** | Audio playback via sounddevice | +| **RemoteAudioOut** | Audio playback to a remote bridge server over WebSocket | | **VideoStream** | JPEG video stream to node UI | | **TextDisplay** | Text display in node UI | | **OSCFace** | VRChat facial animation via OSC (with emotion detection) | diff --git a/backend/src/lib/audio/__init__.py b/backend/src/lib/audio/__init__.py index 5c0fa23d..2666eb28 100644 --- a/backend/src/lib/audio/__init__.py +++ b/backend/src/lib/audio/__init__.py @@ -1,5 +1,9 @@ +from src.lib.audio.audio_in import AudioIn as AudioIn +from src.lib.audio.audio_out import AudioOut as AudioOut from src.lib.audio.mic import Mic as Mic from src.lib.audio.mic_browser import MicBrowser as MicBrowser +from src.lib.audio.remote_audio_in import RemoteAudioIn as RemoteAudioIn +from src.lib.audio.remote_audio_out import RemoteAudioOut as RemoteAudioOut from src.lib.audio.speaker import Speaker as Speaker from src.lib.audio.speaker_browser import SpeakerBrowser as SpeakerBrowser from src.lib.audio.vad import VAD as VAD diff --git a/backend/src/lib/audio/audio_in.py 
# Settings for the AudioIn capture component.
class AudioInConfig(BaseModel):
    # Declares `device` as a field with dynamic options; the actual choices are
    # returned by AudioIn.get_options().
    model_config = ConfigDict(json_schema_extra={"options": {"device": {}}})

    # Device index or name as text. Blank is coerced to None, i.e. no explicit
    # device is passed to sounddevice.
    device: str = ""
    sample_rate: int = 48000
    channels: int = 2
    # Length of each captured frame in milliseconds.
    frame_ms: int = 20


# Audio source that reads int16 PCM blocks from a selectable input device.
class AudioIn(ThreadedComponent[tuple[()], MicOutputs]):
    tags = Tag(io={"source"}, functionality={"audio"})
    description = "Captures audio from any system **input device**. The node exposes the same `audio` output as `Mic`, but adds a dynamic device dropdown so you can listen to virtual cables or other capture devices."

    def __init__(self, config: AudioInConfig = AudioInConfig()) -> None:
        """Cache the stream parameters derived from *config*."""
        super().__init__()
        self.config = config
        self._sample_rate = self.config.sample_rate
        self._channels = self.config.channels
        # Clamp to at least one sample: a tiny sample_rate * frame_ms product
        # would otherwise truncate to 0 and make run() read empty blocks in a
        # tight loop. This mirrors the bridge server's _frame_samples().
        self._frame_samples = max(
            1, int(self._sample_rate * self.config.frame_ms / 1000)
        )
        self._device = coerce_sounddevice_device(self.config.device)

    @classmethod
    def get_options(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Return the current input-device choices for the `device` field."""
        return {"config": {"device": list_audio_input_devices()}}

    def run(self, inputs: tuple[()], outputs: MicOutputs) -> None:
        """Read fixed-size blocks until `stop_event` is set and emit each one.

        Every block is copied before it is wrapped in an `AudioFrame`, so the
        emitted frame does not alias the array returned by the stream.
        """
        with sd.InputStream(
            samplerate=self._sample_rate,
            channels=self._channels,
            dtype="int16",
            blocksize=self._frame_samples,
            device=self._device,
            latency="low",
        ) as stream:
            while not self.stop_event.is_set():
                # The second element (overflow flag) is intentionally ignored.
                data, _ = stream.read(self._frame_samples)
                outputs.audio.send(
                    AudioFrame.new(
                        data=data.copy(),
                        sample_rate=self._sample_rate,
                        channels=self._channels,
                    )
                )
# Settings for the AudioOut playback component.
class AudioOutConfig(BaseModel):
    # Declares `device` as a field with dynamic options; the actual choices are
    # returned by AudioOut.get_options().
    model_config = ConfigDict(json_schema_extra={"options": {"device": {}}})

    # Device index or name as text. Blank is coerced to None, i.e. no explicit
    # device is passed to sounddevice.
    device: str = ""
    sample_rate: int = 48000
    channels: int = 1


# Audio sink that writes int16 PCM to a selectable output device.
class AudioOut(ThreadedComponent[SpeakerInputs, tuple[()]]):
    tags = Tag(io={"sink"}, functionality={"audio"})
    description = "Plays `AudioFrame` data to any system **output device**. The node exposes the same `audio` input as `Speaker`, but adds a dynamic device dropdown so you can target virtual cables or specific playback devices."

    def __init__(self, config: AudioOutConfig = AudioOutConfig()) -> None:
        """Cache the stream parameters derived from *config*."""
        super().__init__()
        self.config = config
        self._sample_rate = self.config.sample_rate
        self._channels = self.config.channels
        self._device = coerce_sounddevice_device(self.config.device)

    @classmethod
    def get_options(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Return the current output-device choices for the `device` field."""
        return {"config": {"device": list_audio_output_devices()}}

    def run(self, inputs: SpeakerInputs, outputs: tuple[()]) -> None:
        """Play incoming frames until a `None` frame arrives or stop is requested."""
        with sd.OutputStream(
            samplerate=self._sample_rate,
            channels=self._channels,
            dtype="int16",
            device=self._device,
            latency="low",
        ) as stream:
            for frame in inputs.audio:
                # Honour stop_event as well as the None sentinel, matching
                # RemoteAudioOut.run; otherwise a stopped component keeps
                # playing whatever is still arriving on the input.
                if frame is None or self.stop_event.is_set():
                    break

                # Ask the frame for PCM16 at this stream's rate/channel count.
                pcm_bytes = frame.get(
                    sample_rate=self._sample_rate,
                    num_channels=self._channels,
                    data_format=AudioDataFormat.PCM16,
                )

                # Reshape the flat sample buffer to (frames, channels).
                stream.write(
                    np.frombuffer(pcm_bytes, dtype=np.int16).reshape(-1, self._channels)
                )
def _hostapi_names() -> list[str]:
    """Return the display name of every host API, in sounddevice's order.

    Returns an empty list when the host APIs cannot be queried, so callers can
    fall back to a generic label instead of failing.
    """
    try:
        hostapis = sd.query_hostapis()
    except Exception:  # best effort: enumeration must not break option listing
        return []
    return [
        str(api.get("name", f"Host API {index}")) for index, api in enumerate(hostapis)
    ]


def _default_device_index(direction: AudioDeviceDirection) -> int | None:
    """Return the default device index for *direction*, or ``None`` if unusable."""
    try:
        input_default, output_default = sd.default.device
    except Exception:
        return None

    raw = input_default if direction == "input" else output_default
    if raw is None:
        return None

    try:
        index = int(raw)
    except (TypeError, ValueError):
        return None

    # Negative values are treated as "no default configured".
    return None if index < 0 else index


def _format_device_label(
    index: int,
    device: dict[str, Any],
    hostapis: list[str],
) -> str:
    """Build the dropdown label ``"<name> [<host api>] (#<index>, <rate> Hz)"``.

    Args:
        index: Position of the device in the device list.
        device: Device info mapping; missing keys fall back to placeholders.
        hostapis: Host API names indexed by the device's ``hostapi`` value.
    """
    raw_hostapi = device.get("hostapi")
    try:
        # Test for None explicitly: index 0 is a valid host API, and the
        # previous `... or -1` idiom mislabelled it as "Unknown Host API".
        hostapi_index = -1 if raw_hostapi is None else int(raw_hostapi)
    except (TypeError, ValueError):
        hostapi_index = -1

    hostapi_name = (
        hostapis[hostapi_index]
        if 0 <= hostapi_index < len(hostapis)
        else "Unknown Host API"
    )
    name = str(device.get("name", f"Device {index}"))
    default_sample_rate = int(float(device.get("default_samplerate", 0) or 0))
    return f"{name} [{hostapi_name}] (#{index}, {default_sample_rate} Hz)"


def list_audio_devices(
    direction: AudioDeviceDirection,
    *,
    include_default: bool = True,
) -> list[AudioDeviceOption]:
    """List selectable devices for *direction* as ``{"value", "label"}`` options.

    Args:
        direction: ``"input"`` or ``"output"``.
        include_default: When true, prepend an entry with an empty value that
            stands for the system default device.

    Returns:
        One option per device with at least one channel in the requested
        direction; an empty list when devices cannot be queried.
    """
    try:
        devices = sd.query_devices()
    except Exception:
        return []

    max_channels_key = (
        "max_input_channels" if direction == "input" else "max_output_channels"
    )
    hostapis = _hostapi_names()

    options: list[AudioDeviceOption] = []

    if include_default:
        default_index = _default_device_index(direction)
        default_label = "System Default"
        if default_index is not None and default_index < len(devices):
            default_label = (
                "System Default: "
                f"{_format_device_label(default_index, devices[default_index], hostapis)}"
            )
        options.append({"value": "", "label": default_label})

    for index, device in enumerate(devices):
        max_channels = int(device.get(max_channels_key, 0) or 0)
        if max_channels <= 0:
            # Device cannot be used in this direction.
            continue
        options.append(
            {
                "value": str(index),
                "label": _format_device_label(index, device, hostapis),
            }
        )

    return options


def list_audio_input_devices(
    *, include_default: bool = True
) -> list[AudioDeviceOption]:
    """Return the options from `list_audio_devices` for capture devices."""
    return list_audio_devices("input", include_default=include_default)


def list_audio_output_devices(
    *,
    include_default: bool = True,
) -> list[AudioDeviceOption]:
    """Return the options from `list_audio_devices` for playback devices."""
    return list_audio_devices("output", include_default=include_default)


def coerce_sounddevice_device(value: str | None) -> int | str | None:
    """Convert a stored device setting into a sounddevice ``device`` argument.

    Returns:
        ``None`` for ``None`` or blank input, an ``int`` for text that is an
        integer index, otherwise the stripped text (a device name).
    """
    if value is None:
        return None

    stripped = value.strip()
    if not stripped:
        return None

    if stripped.lstrip("-").isdigit():
        try:
            return int(stripped)
        except ValueError:
            # Text such as "--5" or "²" passes the isdigit() screen but is not
            # an integer literal; treat it as a device name instead of crashing.
            pass

    return stripped
# Connection and format settings for RemoteAudioIn.
class RemoteAudioInConfig(BaseModel):
    server_url: str = "ws://127.0.0.1:8765/ws/audio"
    sample_rate: int = 48000
    channels: int = 2
    frame_ms: int = 20
    # Upper bound, in seconds, for the exponential reconnect backoff.
    max_reconnect_delay: float = 10.0


# Audio source fed by binary WebSocket messages from a remote bridge.
class RemoteAudioIn(ThreadedComponent[tuple[()], MicOutputs]):
    tags = Tag(io={"source"}, functionality={"audio"})
    description = "Receives `AudioFrame` data from a remote audio bridge over **WebSocket**. Use it when the game runs on another machine and you want OpenNeuro to hear the remote system's game audio."

    def __init__(self, config: RemoteAudioInConfig = RemoteAudioInConfig()) -> None:
        """Store *config*; no connection is opened until `run`."""
        super().__init__()
        self.config = config
        self._ws: ClientConnection | None = None
        self._response_adapter: TypeAdapter[RemoteAudioResponse] = TypeAdapter(
            RemoteAudioResponse
        )

    def stop(self) -> None:
        """Request shutdown and close the socket so a pending receive ends."""
        super().stop()
        self._close_ws()

    def _close_ws(self) -> None:
        """Close the current connection, ignoring close errors, and forget it."""
        connection = self._ws
        if connection is not None:
            try:
                connection.close()
            except Exception:
                pass
            finally:
                self._ws = None

    def _connect(self) -> ClientConnection:
        """Open a connection and complete the `audio_in` handshake.

        Raises:
            RuntimeError: If the reply is not text or reports an error.
        """
        connection = connect(
            self.config.server_url,
            max_size=8 * 1024 * 1024,
            ping_interval=20,
            ping_timeout=20,
        )
        # Publish the socket before the handshake so stop() can close it.
        self._ws = connection

        cfg = self.config
        greeting = RemoteAudioHello(
            role="audio_in",
            sample_rate=cfg.sample_rate,
            channels=cfg.channels,
            frame_ms=cfg.frame_ms,
        )
        connection.send(greeting.model_dump_json())

        reply = connection.recv()
        if not isinstance(reply, str):
            raise RuntimeError("RemoteAudioIn expected a text handshake response")

        parsed = self._response_adapter.validate_json(reply)
        if not parsed.ok:
            raise RuntimeError(parsed.error)
        return connection

    def _forward_frames(self, connection: ClientConnection, outputs: MicOutputs) -> None:
        """Emit one `AudioFrame` per binary message until stop is requested."""
        while not self.stop_event.is_set():
            message = connection.recv()
            if isinstance(message, str):
                raise RuntimeError(
                    f"RemoteAudioIn expected binary audio, got text: {message}"
                )
            frame = AudioFrame.new(
                data=message,
                sample_rate=self.config.sample_rate,
                channels=self.config.channels,
            )
            outputs.audio.send(frame)

    def run(self, inputs: tuple[()], outputs: MicOutputs) -> None:
        """Stream frames from the bridge, reconnecting with doubling backoff."""
        backoff = 1.0

        while not self.stop_event.is_set():
            try:
                connection = self._connect()
                backoff = 1.0  # a successful handshake resets the backoff
                self._forward_frames(connection, outputs)
            except Exception as exc:
                if self.stop_event.is_set():
                    break
                print(f"[RemoteAudioIn] {exc} (reconnecting in {backoff:.1f}s)")
                # wait() returns True when stop is requested during the pause.
                if self.stop_event.wait(backoff):
                    break
                backoff = min(backoff * 2, self.config.max_reconnect_delay)
            finally:
                self._close_ws()
# Connection and format settings for RemoteAudioOut.
class RemoteAudioOutConfig(BaseModel):
    server_url: str = "ws://127.0.0.1:8765/ws/audio"
    sample_rate: int = 48000
    channels: int = 1
    frame_ms: int = 20
    # Upper bound, in seconds, for the exponential reconnect backoff.
    max_reconnect_delay: float = 10.0


# Audio sink that forwards PCM16 frames to a remote bridge over WebSocket.
class RemoteAudioOut(ThreadedComponent[SpeakerInputs, tuple[()]]):
    tags = Tag(io={"sink"}, functionality={"audio"})
    description = "Sends `AudioFrame` data to a remote audio bridge over **WebSocket**. Use it when the game runs on another machine and OpenNeuro needs to speak into that machine's virtual microphone."

    def __init__(self, config: RemoteAudioOutConfig = RemoteAudioOutConfig()) -> None:
        """Store *config*; the connection is opened lazily by `run`."""
        super().__init__()
        self.config = config
        self._ws: ClientConnection | None = None
        self._response_adapter: TypeAdapter[RemoteAudioResponse] = TypeAdapter(
            RemoteAudioResponse
        )

    def stop(self) -> None:
        """Request shutdown and close the socket."""
        super().stop()
        self._close_ws()

    def _close_ws(self) -> None:
        """Close the current connection, ignoring close errors, and forget it."""
        ws = self._ws
        if ws is None:
            return
        try:
            ws.close()
        except Exception:
            pass
        finally:
            self._ws = None

    def _connect(self) -> ClientConnection:
        """Open a connection and complete the `audio_out` handshake.

        Raises:
            RuntimeError: If the reply is not text or reports an error.
        """
        ws = connect(
            self.config.server_url,
            max_size=8 * 1024 * 1024,
            ping_interval=20,
            ping_timeout=20,
        )
        # Publish the socket before the handshake so stop() can close it.
        self._ws = ws

        hello = RemoteAudioHello(
            role="audio_out",
            sample_rate=self.config.sample_rate,
            channels=self.config.channels,
            frame_ms=self.config.frame_ms,
        )
        ws.send(hello.model_dump_json())

        ack_raw = ws.recv()
        if not isinstance(ack_raw, str):
            raise RuntimeError("RemoteAudioOut expected a text handshake response")

        response = self._response_adapter.validate_json(ack_raw)
        if not response.ok:
            raise RuntimeError(response.error)
        return ws

    def run(self, inputs: SpeakerInputs, outputs: tuple[()]) -> None:
        """Send each incoming frame to the bridge as PCM16 bytes.

        The connection is (re)established on demand. While a retry is pending,
        incoming frames are dropped rather than queued. The loop ends on a
        `None` frame or when stop is requested.
        """
        reconnect_delay = 1.0
        next_retry_at = 0.0

        for frame in inputs.audio:
            if frame is None or self.stop_event.is_set():
                break

            # Work on a local reference: stop() may clear self._ws from another
            # thread, and the previous `assert self._ws is not None` both
            # crashed run() in that case and vanished under `python -O`.
            ws = self._ws
            if ws is None:
                now = time.monotonic()
                if now < next_retry_at:
                    continue  # still backing off; this frame is dropped
                try:
                    ws = self._connect()
                    reconnect_delay = 1.0
                except Exception as exc:
                    print(
                        f"[RemoteAudioOut] {exc} (retrying in {reconnect_delay:.1f}s)"
                    )
                    next_retry_at = now + reconnect_delay
                    reconnect_delay = min(
                        reconnect_delay * 2, self.config.max_reconnect_delay
                    )
                    self._close_ws()
                    continue

            try:
                pcm_bytes = frame.get(
                    sample_rate=self.config.sample_rate,
                    num_channels=self.config.channels,
                    data_format=AudioDataFormat.PCM16,
                )
                ws.send(pcm_bytes)
            except Exception as exc:
                print(f"[RemoteAudioOut] {exc} (connection dropped)")
                next_retry_at = time.monotonic() + reconnect_delay
                reconnect_delay = min(
                    reconnect_delay * 2, self.config.max_reconnect_delay
                )
                self._close_ws()

        self._close_ws()
# Which side of the bridge a client attaches to.
RemoteAudioRole = Literal["audio_in", "audio_out"]


# First message a client sends: the role it wants and the PCM format it uses.
class RemoteAudioHello(BaseModel):
    role: RemoteAudioRole
    sample_rate: int = Field(gt=0)
    channels: int = Field(gt=0)
    frame_ms: int = Field(gt=0)


# Successful handshake reply; repeats the requested format and names the device.
class RemoteAudioAck(BaseModel):
    ok: Literal[True] = True
    role: RemoteAudioRole
    sample_rate: int
    channels: int
    frame_ms: int
    device: str


# Failed handshake reply carrying a human-readable reason.
class RemoteAudioError(BaseModel):
    ok: Literal[False] = False
    error: str


# Either reply; the literal `ok` field tells the two shapes apart.
RemoteAudioResponse = RemoteAudioError | RemoteAudioAck


def build_remote_audio_ack(hello: RemoteAudioHello, *, device: str) -> str:
    """Serialize an acknowledgement for *hello* that names *device*, as JSON."""
    ack = RemoteAudioAck(
        role=hello.role,
        sample_rate=hello.sample_rate,
        channels=hello.channels,
        frame_ms=hello.frame_ms,
        device=device,
    )
    return ack.model_dump_json()


def build_remote_audio_error(message: str) -> str:
    """Serialize an error reply carrying *message*, as JSON."""
    failure = RemoteAudioError(error=message)
    return failure.model_dump_json()
class _PCMByteBuffer:
    """Lock-protected FIFO of PCM byte chunks with a soft size cap.

    `append` evicts the oldest chunks once the total exceeds the cap, and
    `pop` always returns exactly the requested number of bytes, zero-padding
    when the buffer runs dry.
    """

    def __init__(self, max_size_bytes: int) -> None:
        self._max_size_bytes = max_size_bytes
        self._buffer: deque[bytes] = deque()
        self._size_bytes = 0
        self._lock = threading.Lock()

    def append(self, chunk: bytes) -> None:
        """Queue *chunk*, evicting the oldest chunks while over the cap.

        The chunk just appended is never evicted. Previously a single chunk
        larger than the cap removed itself and left the buffer empty, so long
        frames sent as one message were silently discarded.
        """
        with self._lock:
            self._buffer.append(chunk)
            self._size_bytes += len(chunk)
            # Whole-chunk eviction leaves each remaining chunk's bytes intact.
            while self._size_bytes > self._max_size_bytes and len(self._buffer) > 1:
                removed = self._buffer.popleft()
                self._size_bytes -= len(removed)

    def pop(self, size: int) -> bytes:
        """Remove and return exactly *size* bytes, padding with zero bytes."""
        with self._lock:
            parts: list[bytes] = []
            remaining = size

            while remaining > 0 and self._buffer:
                head = self._buffer[0]
                if len(head) <= remaining:
                    # Consume the whole head chunk.
                    parts.append(self._buffer.popleft())
                    self._size_bytes -= len(head)
                    remaining -= len(head)
                    continue

                # Split the head chunk and leave its tail queued.
                parts.append(head[:remaining])
                self._buffer[0] = head[remaining:]
                self._size_bytes -= remaining
                remaining = 0

            chunk = b"".join(parts)
            if len(chunk) < size:
                chunk += b"\x00" * (size - len(chunk))
            return chunk


def _frame_samples(hello: RemoteAudioHello) -> int:
    """Return the samples per channel in one frame of *hello* (at least 1)."""
    return max(1, int(hello.sample_rate * hello.frame_ms / 1000))


def _buffer_size_bytes(
    hello: RemoteAudioHello,
    *,
    max_buffer_ms: int,
) -> int:
    """Return the byte size of *max_buffer_ms* of 16-bit audio in *hello*'s format.

    The result is never smaller than one sample per channel (2 bytes each).
    """
    return max(
        hello.channels * 2,
        int(hello.sample_rate * hello.channels * 2 * max_buffer_ms / 1000),
    )
async def _stream_audio_in(
    websocket: WebSocket,
    hello: RemoteAudioHello,
    config: RemoteAudioBridgeConfig,
) -> None:
    """Capture from the configured input device and send each block to the client.

    The input stream is opened first, then the handshake is acknowledged, then
    queued PCM blocks are forwarded as binary messages. The loop only ends when
    an exception propagates (for example when sending fails).
    """
    loop = asyncio.get_running_loop()
    pending: asyncio.Queue[bytes] = asyncio.Queue(maxsize=16)

    def push_latest(chunk: bytes) -> None:
        # When the queue is full, discard the oldest block to make room.
        if pending.full():
            try:
                pending.get_nowait()
            except asyncio.QueueEmpty:
                pass
        pending.put_nowait(chunk)

    def on_audio(
        indata: np.ndarray,
        frames: int,
        time_info: object,
        status: object,
    ) -> None:
        del frames, time_info
        if status:
            logger.warning("Remote audio input callback status: %s", status)
        payload = indata.copy().tobytes()
        try:
            # Hand the block to the event loop rather than touching the queue here.
            loop.call_soon_threadsafe(push_latest, payload)
        except RuntimeError:
            # Presumably the loop is already closed during shutdown; drop the block.
            pass

    capture = sd.InputStream(
        samplerate=hello.sample_rate,
        channels=hello.channels,
        dtype="int16",
        blocksize=_frame_samples(hello),
        device=coerce_sounddevice_device(config.input_device),
        latency="low",
        callback=on_audio,
    )
    with capture:
        ack = build_remote_audio_ack(hello, device=config.input_device)
        await websocket.send_text(ack)
        while True:
            await websocket.send_bytes(await pending.get())


async def _stream_audio_out(
    websocket: WebSocket,
    hello: RemoteAudioHello,
    config: RemoteAudioBridgeConfig,
) -> None:
    """Play binary messages from the client on the configured output device.

    Received bytes are appended to a bounded buffer; the stream callback drains
    it and receives zero bytes whenever the buffer is empty. The loop only ends
    when receiving raises.
    """
    capacity = _buffer_size_bytes(hello, max_buffer_ms=config.max_output_buffer_ms)
    pcm_buffer = _PCMByteBuffer(capacity)

    def on_audio(
        outdata: np.ndarray,
        frames: int,
        time_info: object,
        status: object,
    ) -> None:
        del time_info
        if status:
            logger.warning("Remote audio output callback status: %s", status)
        # 2 bytes per int16 sample, `channels` samples per frame.
        wanted = frames * hello.channels * 2
        samples = np.frombuffer(pcm_buffer.pop(wanted), dtype=np.int16)
        outdata[:] = samples.reshape(-1, hello.channels)

    playback = sd.OutputStream(
        samplerate=hello.sample_rate,
        channels=hello.channels,
        dtype="int16",
        blocksize=_frame_samples(hello),
        device=coerce_sounddevice_device(config.output_device),
        latency="low",
        callback=on_audio,
    )
    with playback:
        ack = build_remote_audio_ack(hello, device=config.output_device)
        await websocket.send_text(ack)
        while True:
            pcm_buffer.append(await websocket.receive_bytes())
def create_remote_audio_bridge_app(
    config: RemoteAudioBridgeConfig,
) -> FastAPI:
    """Build the bridge application.

    Routes:
        GET /health   -- liveness probe returning ``{"status": "ok"}``.
        GET /devices  -- input and output device options of this machine.
        WS  /ws/audio -- audio stream; the first text message must be a
            `RemoteAudioHello` whose role selects capture or playback.
    """
    app = FastAPI()

    @app.get("/health")
    def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.get("/devices")
    def devices() -> dict[str, list[AudioDeviceOption]]:
        return {
            "input": list_audio_input_devices(),
            "output": list_audio_output_devices(),
        }

    @app.websocket("/ws/audio")
    async def audio_bridge(websocket: WebSocket) -> None:
        await websocket.accept()

        try:
            hello = RemoteAudioHello.model_validate_json(await websocket.receive_text())
            if hello.role == "audio_in":
                await _stream_audio_in(websocket, hello, config)
            else:
                await _stream_audio_out(websocket, hello, config)
        except WebSocketDisconnect:
            # Normal client departure; nothing to report.
            return
        except Exception as exc:
            logger.exception("Remote audio bridge error")
            # Best effort: the socket may already be unusable, so failures
            # while reporting or closing are deliberately ignored.
            try:
                await websocket.send_text(build_remote_audio_error(str(exc)))
            except Exception:
                pass
            try:
                await websocket.close(code=1011)
            except Exception:
                pass

    return app


def _print_devices() -> None:
    """Print the available input and output devices, one per line."""
    # The system-default entry has an empty value. Show a visible placeholder
    # for it; the previous `or ''` fallback was a no-op that printed a bare colon.
    print("Input devices:")
    for option in list_audio_input_devices():
        print(f"  {option['value'] or '<default>'}: {option['label']}")

    print("\nOutput devices:")
    for option in list_audio_output_devices():
        print(f"  {option['value'] or '<default>'}: {option['label']}")
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the bridge server's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the previous behaviour.
    """
    parser = argparse.ArgumentParser(
        description="Bridge remote game audio into OpenNeuro over WebSocket."
    )
    # NOTE(review): the default host binds every interface and the WebSocket
    # endpoint has no authentication -- confirm this is intended outside a LAN.
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8765)
    parser.add_argument("--input-device", default="CABLE-B Output")
    parser.add_argument("--output-device", default="CABLE-A Input")
    parser.add_argument("--max-output-buffer-ms", type=int, default=500)
    parser.add_argument("--list-devices", action="store_true")
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> None:
    """Run the bridge server, or only list devices with ``--list-devices``.

    Args:
        argv: Optional argument list forwarded to `_parse_args`.
    """
    args = _parse_args(argv)
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

    if args.list_devices:
        _print_devices()
        return

    config = RemoteAudioBridgeConfig(
        input_device=args.input_device,
        output_device=args.output_device,
        max_output_buffer_ms=args.max_output_buffer_ms,
    )

    print(
        "[remote-audio-bridge] "
        f"listening on ws://{args.host}:{args.port}/ws/audio "
        f"(game audio input: {config.input_device}, game mic output: {config.output_device})"
    )

    # Imported here so that importing this module does not require uvicorn.
    import uvicorn

    uvicorn.run(
        create_remote_audio_bridge_app(config),
        host=args.host,
        port=args.port,
        log_level="info",
    )


if __name__ == "__main__":
    main()
import SpeakerInputs + + +class _FakeRecv: + def __init__(self, items: list[object | None]) -> None: + self._iter = iter(items) + + def __iter__(self) -> "_FakeRecv": + return self + + def __next__(self) -> object | None: + return next(self._iter) + + +def test_audio_in_and_audio_out_options_and_streams(monkeypatch) -> None: + monkeypatch.setattr( + "src.lib.audio.audio_in.list_audio_input_devices", + lambda: [{"value": "7", "label": "CABLE-B Output"}], + ) + monkeypatch.setattr( + "src.lib.audio.audio_out.list_audio_output_devices", + lambda: [{"value": "3", "label": "CABLE-A Input"}], + ) + + assert AudioIn.get_options({}) == { + "config": {"device": [{"value": "7", "label": "CABLE-B Output"}]} + } + assert AudioOut.get_options({}) == { + "config": {"device": [{"value": "3", "label": "CABLE-A Input"}]} + } + + class _InputStream: + def __enter__(self) -> "_InputStream": + return self + + def __exit__(self, *args: object) -> bool: + return False + + def read(self, frame_samples: int) -> tuple[np.ndarray, None]: + audio_in.stop_event.set() + return np.zeros((frame_samples, 2), dtype=np.int16), None + + monkeypatch.setattr( + "src.lib.audio.audio_in.sd", + types.SimpleNamespace(InputStream=lambda **kwargs: _InputStream()), + ) + + captured_frames: list[AudioFrame] = [] + audio_in = AudioIn( + AudioInConfig(device="7", sample_rate=48000, channels=2, frame_ms=20) + ) + audio_in.run( + (), + MicOutputs( + audio=types.SimpleNamespace( + send=lambda frame: captured_frames.append(frame) + ) + ), + ) + assert audio_in._device == 7 + assert captured_frames and captured_frames[0].channels == 2 + + writes: list[np.ndarray] = [] + + class _OutputStream: + def __enter__(self) -> "_OutputStream": + return self + + def __exit__(self, *args: object) -> bool: + return False + + def write(self, chunk: np.ndarray) -> None: + writes.append(chunk) + + monkeypatch.setattr( + "src.lib.audio.audio_out.sd", + types.SimpleNamespace(OutputStream=lambda **kwargs: _OutputStream()), + ) + + 
audio_out = AudioOut(AudioOutConfig(device="3", sample_rate=48000, channels=1)) + audio_frame = AudioFrame.new( + data=np.zeros((1, 960), dtype=np.float32), + sample_rate=48000, + channels=1, + ) + audio_out.run( + SpeakerInputs(audio=_FakeRecv([audio_frame, None])), + (), + ) + assert audio_out._device == 3 + assert len(writes) == 1 + + +def test_remote_audio_in_and_out_handshake(monkeypatch) -> None: + class _RemoteInWS: + def __init__(self, component: RemoteAudioIn) -> None: + self.component = component + self.sent: list[object] = [] + self._recv_count = 0 + + def send(self, payload: object) -> None: + self.sent.append(payload) + + def recv(self) -> str | bytes: + self._recv_count += 1 + if self._recv_count == 1: + return json.dumps( + { + "ok": True, + "role": "audio_in", + "sample_rate": 48000, + "channels": 2, + "frame_ms": 20, + "device": "CABLE-B Output", + } + ) + if self._recv_count == 2: + return b"\x00\x00" * 960 * 2 + self.component.stop_event.set() + raise RuntimeError("closed") + + def close(self) -> None: + return None + + remote_in: RemoteAudioIn | None = None + remote_in_ws: _RemoteInWS | None = None + + def fake_remote_in_connect(*args: object, **kwargs: object) -> _RemoteInWS: + del args, kwargs + assert remote_in is not None + nonlocal remote_in_ws + remote_in_ws = _RemoteInWS(remote_in) + return remote_in_ws + + monkeypatch.setattr("src.lib.audio.remote_audio_in.connect", fake_remote_in_connect) + + received_frames: list[AudioFrame] = [] + remote_in = RemoteAudioIn( + RemoteAudioInConfig(sample_rate=48000, channels=2, frame_ms=20) + ) + remote_in.run( + (), + MicOutputs( + audio=types.SimpleNamespace( + send=lambda frame: received_frames.append(frame) + ) + ), + ) + + assert remote_in_ws is not None + hello_in = json.loads(remote_in_ws.sent[0]) + assert hello_in["role"] == "audio_in" + assert received_frames and received_frames[0].channels == 2 + + class _RemoteOutWS: + def __init__(self) -> None: + self.sent: list[object] = [] + 
self._recv_count = 0 + + def send(self, payload: object) -> None: + self.sent.append(payload) + + def recv(self) -> str: + self._recv_count += 1 + assert self._recv_count == 1 + return json.dumps( + { + "ok": True, + "role": "audio_out", + "sample_rate": 48000, + "channels": 1, + "frame_ms": 20, + "device": "CABLE-A Input", + } + ) + + def close(self) -> None: + return None + + remote_out_ws = _RemoteOutWS() + monkeypatch.setattr( + "src.lib.audio.remote_audio_out.connect", + lambda *args, **kwargs: remote_out_ws, + ) + + remote_out = RemoteAudioOut( + RemoteAudioOutConfig(sample_rate=48000, channels=1, frame_ms=20) + ) + remote_out.run( + SpeakerInputs( + audio=_FakeRecv( + [ + AudioFrame.new( + data=np.zeros((1, 960), dtype=np.float32), + sample_rate=48000, + channels=1, + ), + None, + ] + ) + ), + (), + ) + + hello_out = json.loads(remote_out_ws.sent[0]) + assert hello_out["role"] == "audio_out" + assert isinstance(remote_out_ws.sent[1], bytes) + + +def test_remote_audio_bridge_http_endpoints(monkeypatch) -> None: + monkeypatch.setattr( + "src.lib.audio.remote_audio_server.list_audio_input_devices", + lambda: [{"value": "", "label": "System Default"}], + ) + monkeypatch.setattr( + "src.lib.audio.remote_audio_server.list_audio_output_devices", + lambda: [{"value": "3", "label": "CABLE-A Input"}], + ) + + client = TestClient( + create_remote_audio_bridge_app( + RemoteAudioBridgeConfig( + input_device="CABLE-B Output", + output_device="CABLE-A Input", + ) + ) + ) + + assert client.get("/health").json() == {"status": "ok"} + assert client.get("/devices").json() == { + "input": [{"value": "", "label": "System Default"}], + "output": [{"value": "3", "label": "CABLE-A Input"}], + } diff --git a/bun.lock b/bun.lock index 054cede7..eb6d5ee3 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "openneuro", diff --git a/docs/user-manual/_meta.ts b/docs/user-manual/_meta.ts index d5cef6cc..ad6710ae 
100644 --- a/docs/user-manual/_meta.ts +++ b/docs/user-manual/_meta.ts @@ -1,3 +1,4 @@ export default { index: "Introduction", + "vrchat-audio-routing": "VRChat Audio Routing", }; diff --git a/docs/user-manual/index.mdx b/docs/user-manual/index.mdx index 99423429..5f31eac9 100644 --- a/docs/user-manual/index.mdx +++ b/docs/user-manual/index.mdx @@ -8,6 +8,9 @@ OpenNeuro provides a graph-based architecture for composing audio processing pipelines. Components communicate through typed channels with automatic format conversion. +For VRChat and VB-Cable A+B routing instructions, see +[VRChat Audio Routing](./vrchat-audio-routing). + ## Quick Start ```bash diff --git a/docs/user-manual/vrchat-audio-routing.mdx b/docs/user-manual/vrchat-audio-routing.mdx new file mode 100644 index 00000000..524153f3 --- /dev/null +++ b/docs/user-manual/vrchat-audio-routing.mdx @@ -0,0 +1,121 @@ +# VRChat Audio Routing + +This guide explains how to use `AudioIn`, `AudioOut`, `RemoteAudioIn`, and +`RemoteAudioOut` with VB-Cable A+B so OpenNeuro can hear VRChat voice chat and +talk back to other players. + +## Cable Mapping + +Use the two virtual cables like this: + +- `CABLE-A` carries **AI speech into VRChat's microphone** +- `CABLE-B` carries **VRChat's game audio into OpenNeuro** + +The recommended signal flow is: + +```text +OpenNeuro AudioOut -> CABLE-A Input -> CABLE-A Output -> VRChat microphone +Windows default output / VRChat output -> CABLE-B Input -> CABLE-B Output -> OpenNeuro AudioIn +``` + +## Important VRChat Note + +VRChat lets you change the **microphone** device, but its **output** follows +the Windows system playback device. That means: + +- Set the VRChat microphone to `CABLE-A Output` +- Set the Windows default playback device to `CABLE-B Input` + +If you still want to hear the game locally, enable **Listen to this device** on +`CABLE-B Output` and route it to your real headphones or speakers. 
+ +## Local Setup + +Use this when OpenNeuro and VRChat are running on the **same machine**. + +### OpenNeuro Components + +Configure the graph like this: + +- `AudioIn` + - `device`: `CABLE-B Output` + - `sample_rate`: `48000` + - `channels`: `2` + - `frame_ms`: `20` +- `AudioOut` + - `device`: `CABLE-A Input` + - `sample_rate`: `48000` + - `channels`: `1` + +### VRChat / Windows + +- In VRChat, set **Microphone** to `CABLE-A Output` +- In Windows, set the **default playback device** to `CABLE-B Input` +- Optional: enable **Listen to this device** on `CABLE-B Output` so you can + monitor game audio through your real headset + +## Remote Setup + +Use this when OpenNeuro and VRChat are running on **different machines**. + +### On the Game Machine + +Install VB-Cable A+B and start the bridge server: + +```bash +cd backend +uv run python -m src.lib.audio.remote_audio_server +``` + +By default the bridge assumes: + +- `--input-device "CABLE-B Output"` +- `--output-device "CABLE-A Input"` + +If your device names differ, list them first: + +```bash +cd backend +uv run python -m src.lib.audio.remote_audio_server --list-devices +``` + +Then launch the server with explicit device names if needed: + +```bash +cd backend +uv run python -m src.lib.audio.remote_audio_server --input-device "CABLE-B Output (VB-Audio Virtual Cable B)" --output-device "CABLE-A Input (VB-Audio Virtual Cable A)" +``` + +Also configure the game machine the same way as the local setup: + +- In VRChat, set **Microphone** to `CABLE-A Output` +- In Windows, set the **default playback device** to `CABLE-B Input` + +### On the OpenNeuro Machine + +Point both remote components at the game machine's bridge: + +- `RemoteAudioIn` + - `server_url`: `ws://<game-machine-ip>:8765/ws/audio` + - `sample_rate`: `48000` + - `channels`: `2` + - `frame_ms`: `20` +- `RemoteAudioOut` + - `server_url`: `ws://<game-machine-ip>:8765/ws/audio` + - `sample_rate`: `48000` + - `channels`: `1` + - `frame_ms`: `20` + +## Recommended Graphs + +For voice-to-voice
VRChat flows: + +- `AudioIn` or `RemoteAudioIn` +- `VAD` +- `ASR` +- `LLM` +- `TTS` +- `AudioOut` or `RemoteAudioOut` + +If your downstream audio stack expects mono, that's okay: `AudioFrame` +conversion inside the pipeline will down-mix and resample automatically.