From 29b50ef80c4351630b5b572134806552ec168238 Mon Sep 17 00:00:00 2001 From: Liav Edry Date: Sat, 7 Mar 2026 19:13:31 +0200 Subject: [PATCH] feat: add local whisper backend for voice transcription Add faster-whisper (CPU, free, no API key) as an alternative to the existing OpenAI transcription backend. Default language set to English to skip auto-detection overhead. Configurable via CCBOT_WHISPER_BACKEND (local/openai/off) and CCBOT_WHISPER_MODEL. Defaults to openai with the small model. Co-Authored-By: Claude Opus 4.6 --- .env.example | 10 +++ pyproject.toml | 3 + src/ccbot/bot.py | 73 ++++++++++++--------- src/ccbot/config.py | 10 +++ src/ccbot/transcribe.py | 114 +++++++++++++++++++++++++++++---- tests/ccbot/test_transcribe.py | 68 ++++++++++++++------ 6 files changed, 214 insertions(+), 64 deletions(-) diff --git a/.env.example b/.env.example index d36e2b0e..038e12f5 100644 --- a/.env.example +++ b/.env.example @@ -12,3 +12,13 @@ CLAUDE_COMMAND=claude # Monitor polling interval in seconds (optional, defaults to 2.0) MONITOR_POLL_INTERVAL=2.0 + +# Voice transcription backend (optional, defaults to "openai") # "openai" = OpenAI API (requires OPENAI_API_KEY) # "local" = faster-whisper on CPU (free, requires ffmpeg + uv pip install -e ".[voice]") # "off" = disable voice messages +CCBOT_WHISPER_BACKEND=openai + +# Whisper model size for local backend (optional, defaults to "small") # Options: tiny, base, small, medium, large-v3 +CCBOT_WHISPER_MODEL=small diff --git a/pyproject.toml b/pyproject.toml index f02ba25c..ad2c360f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,9 @@ dev = [ "pytest-cov>=6.0", "ruff>=0.8.0", ] +voice = [ + "faster-whisper>=1.0.0", +] [build-system] requires = ["hatchling"] diff --git a/src/ccbot/bot.py b/src/ccbot/bot.py index bf6e8009..bf598cba 100644 --- a/src/ccbot/bot.py +++ b/src/ccbot/bot.py @@ -12,8 +12,8 @@ Unbound topics trigger the directory browser to create a new session. 
- Photo handling: photos sent by user are downloaded and forwarded to Claude Code as file paths (photo_handler). - - Voice handling: voice messages are transcribed via OpenAI API and - forwarded as text (voice_handler). + - Voice handling: voice messages are transcribed (local Whisper or OpenAI) + and forwarded as text to Claude Code (voice_handler). - Automatic cleanup: closing a topic kills the associated window (topic_closed_handler). Unsupported content (stickers, etc.) is rejected with a warning (unsupported_content_handler). @@ -134,8 +134,12 @@ from .session_monitor import NewMessage, SessionMonitor from .terminal_parser import extract_bash_output, is_interactive_ui from .tmux_manager import tmux_manager -from .transcribe import close_client as close_transcribe_client -from .transcribe import transcribe_voice +from .transcribe import ( + TranscriptionDisabled, + TranscriptionError, + close_client as close_transcribe_client, + transcribe, +) from .utils import ccbot_dir logger = logging.getLogger(__name__) @@ -551,7 +555,7 @@ async def unsupported_content_handler( logger.debug("Unsupported content from user %d", user.id) await safe_reply( update.message, - "⚠ Only text, photo, and voice messages are supported. Stickers, video, and other media cannot be forwarded to Claude Code.", + "⚠ Only text, photo, and voice messages are supported. 
Stickers and other media cannot be forwarded to Claude Code.", ) @@ -559,8 +563,12 @@ async def unsupported_content_handler( _IMAGES_DIR = ccbot_dir() / "images" _IMAGES_DIR.mkdir(parents=True, exist_ok=True) +# --- Audio directory for incoming voice messages --- +_AUDIO_DIR = ccbot_dir() / "audio" +_AUDIO_DIR.mkdir(parents=True, exist_ok=True) + -async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: +async def photo_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None: """Handle photos sent by the user: download and forward path to Claude Code.""" user = update.effective_user if not user or not is_user_allowed(user.id): @@ -631,8 +639,8 @@ async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N await safe_reply(update.message, "📷 Image sent to Claude Code.") -async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: - """Handle voice messages: transcribe via OpenAI and forward text to Claude Code.""" +async def voice_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None: + """Handle voice messages: download, transcribe, and forward text to Claude Code.""" user = update.effective_user if not user or not is_user_allowed(user.id): if update.message: @@ -642,14 +650,6 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N if not update.message or not update.message.voice: return - if not config.openai_api_key: - await safe_reply( - update.message, - "⚠ Voice transcription requires an OpenAI API key.\n" - "Set `OPENAI_API_KEY` in your `.env` file and restart the bot.", - ) - return - chat = update.message.chat thread_id = _get_thread_id(update) if chat.type in ("group", "supergroup") and thread_id is not None: @@ -681,30 +681,39 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N ) return - # Download voice as in-memory bytes - voice_file = await update.message.voice.get_file() - ogg_data = 
bytes(await voice_file.download_as_bytearray()) + # Download the voice file + voice = update.message.voice + tg_file = await voice.get_file() + filename = f"{int(time.time())}_{voice.file_unique_id}.ogg" + file_path = _AUDIO_DIR / filename + await tg_file.download_to_drive(file_path) # Transcribe try: - text = await transcribe_voice(ogg_data) - except ValueError as e: - await safe_reply(update.message, f"⚠ {e}") + await update.message.chat.send_action(ChatAction.TYPING) + text = await transcribe(file_path) + except TranscriptionDisabled as e: + await safe_reply(update.message, f"🎤 Voice not available: {e}") + return + except TranscriptionError as e: + await safe_reply(update.message, f"❌ Transcription failed: {e}") return - except Exception as e: - logger.error("Voice transcription failed: %s", e) - await safe_reply(update.message, f"⚠ Transcription failed: {e}") + except Exception: + logger.exception("Unexpected transcription error") + await safe_reply(update.message, "❌ Transcription failed unexpectedly.") return + finally: + # Clean up audio file + file_path.unlink(missing_ok=True) - await update.message.chat.send_action(ChatAction.TYPING) - clear_status_msg_info(user.id, thread_id) + # Show transcription to user + await safe_reply(update.message, f'🎤 "{text}"') + # Forward to Claude Code + clear_status_msg_info(user.id, thread_id) success, message = await session_manager.send_to_window(wid, text) if not success: await safe_reply(update.message, f"❌ {message}") - return - - await safe_reply(update.message, f'🎤 "{text}"') # Active bash capture tasks: (user_id, thread_id) → asyncio.Task @@ -1914,9 +1923,9 @@ def create_bot() -> Application: ) # Photos: download and forward file path to Claude Code application.add_handler(MessageHandler(filters.PHOTO, photo_handler)) - # Voice: transcribe via OpenAI and forward text to Claude Code + # Voice messages: transcribe and forward text to Claude Code application.add_handler(MessageHandler(filters.VOICE, voice_handler)) - # 
Catch-all: non-text content (stickers, video, etc.) + # Catch-all: non-text content (stickers, etc.) application.add_handler( MessageHandler( ~filters.COMMAND & ~filters.TEXT & ~filters.StatusUpdate.ALL, diff --git a/src/ccbot/config.py b/src/ccbot/config.py index ca3d6744..8ba5c504 100644 --- a/src/ccbot/config.py +++ b/src/ccbot/config.py @@ -99,6 +99,16 @@ def __init__(self) -> None: "OPENAI_BASE_URL", "https://api.openai.com/v1" ) + # Voice transcription backend: "openai", "local" (faster-whisper), or "off" + self.whisper_backend: str = os.getenv("CCBOT_WHISPER_BACKEND", "openai").lower() + if self.whisper_backend not in ("local", "openai", "off"): + logger.warning( + "Unknown CCBOT_WHISPER_BACKEND=%r, defaulting to 'off'", + self.whisper_backend, + ) + self.whisper_backend = "off" + self.whisper_model: str = os.getenv("CCBOT_WHISPER_MODEL", "small") + # Scrub sensitive vars from os.environ so child processes never inherit them. # Values are already captured in Config attributes above. for var in SENSITIVE_ENV_VARS: diff --git a/src/ccbot/transcribe.py b/src/ccbot/transcribe.py index 8e62a4cd..fde5ddc5 100644 --- a/src/ccbot/transcribe.py +++ b/src/ccbot/transcribe.py @@ -1,12 +1,17 @@ -"""Voice-to-text transcription via OpenAI's audio API. +"""Voice message transcription — converts audio to text. -Provides a single async function to transcribe voice messages using -the gpt-4o-transcribe model. Uses httpx directly (no OpenAI SDK needed). +Three modes (configured via CCBOT_WHISPER_BACKEND): + - "local" — faster-whisper on CPU (free, no API key, requires ffmpeg) + - "openai" — OpenAI API (gpt-4o-transcribe, requires OPENAI_API_KEY) + - "off" — voice messages disabled -Key function: transcribe_voice(ogg_data) -> str +Key function: transcribe() — async, returns transcribed text. 
""" +import asyncio import logging +from pathlib import Path +from typing import Any import httpx @@ -14,6 +19,17 @@ logger = logging.getLogger(__name__) + +class TranscriptionError(Exception): + """Raised when transcription fails.""" + + +class TranscriptionDisabled(Exception): + """Raised when voice transcription is disabled or unavailable.""" + + +# --- OpenAI backend --- + _client: httpx.AsyncClient | None = None @@ -25,15 +41,17 @@ def _get_client() -> httpx.AsyncClient: return _client -async def transcribe_voice(ogg_data: bytes) -> str: - """Transcribe OGG voice data to text via OpenAI API. - - Raises: - httpx.HTTPStatusError: On API errors (401, 429, 5xx, etc.) - ValueError: If the API returns an empty transcription. - """ +async def _transcribe_openai(file_path: Path) -> str: + """Transcribe via OpenAI API (gpt-4o-transcribe).""" + if not config.openai_api_key: + raise TranscriptionDisabled( + "Voice transcription requires OPENAI_API_KEY in .env.\n" + "Or set CCBOT_WHISPER_BACKEND=local for free local transcription " + '(requires: uv pip install -e ".[voice]").' 
+ ) url = f"{config.openai_base_url.rstrip('/')}/audio/transcriptions" client = _get_client() + ogg_data = file_path.read_bytes() response = await client.post( url, headers={"Authorization": f"Bearer {config.openai_api_key}"}, @@ -41,10 +59,9 @@ async def transcribe_voice(ogg_data: bytes) -> str: data={"model": "gpt-4o-transcribe"}, ) response.raise_for_status() - text = response.json().get("text", "").strip() if not text: - raise ValueError("Empty transcription returned by API") + raise TranscriptionError("Empty transcription returned by OpenAI API") return text @@ -54,3 +71,74 @@ async def close_client() -> None: if _client is not None and not _client.is_closed: await _client.aclose() _client = None + + +# --- Local (faster-whisper) backend --- + +_local_model: Any = None + + +def _get_local_model() -> Any: + """Lazy-load the faster-whisper model (downloads on first use).""" + global _local_model + if _local_model is not None: + return _local_model + try: + from faster_whisper import WhisperModel # type: ignore[import-untyped] + except ImportError: + raise TranscriptionDisabled( + "faster-whisper is not installed. " + 'Install with: uv pip install -e ".[voice]"\n' + "Or set CCBOT_WHISPER_BACKEND=openai/off." 
+ ) + logger.info( + "Loading faster-whisper model '%s' (may download on first use)...", + config.whisper_model, + ) + _local_model = WhisperModel(config.whisper_model, device="cpu", compute_type="int8") + logger.info("faster-whisper model loaded successfully") + return _local_model + + +def _transcribe_local_sync(file_path: Path) -> str: + """Synchronous transcription using faster-whisper (CPU-bound).""" + model = _get_local_model() + segments, info = model.transcribe(str(file_path), beam_size=5, language="en") + text = " ".join(segment.text.strip() for segment in segments) + if not text.strip(): + raise TranscriptionError("Transcription produced empty text") + logger.info( + "Transcribed %s: language=%s, duration=%.1fs, text_len=%d", + file_path.name, + info.language, + info.duration, + len(text), + ) + return text.strip() + + +# --- Unified entry point --- + + +async def transcribe(file_path: Path) -> str: + """Transcribe an audio file to text using the configured backend. + + Args: + file_path: Path to the audio file (OGG/Opus from Telegram). + + Returns: + Transcribed text string. + + Raises: + TranscriptionDisabled: Voice is disabled or dependency missing. + TranscriptionError: Transcription failed or produced empty text. 
+ """ + if config.whisper_backend == "off": + raise TranscriptionDisabled("Voice transcription is disabled.") + + if config.whisper_backend == "openai": + return await _transcribe_openai(file_path) + + # local backend + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, _transcribe_local_sync, file_path) diff --git a/tests/ccbot/test_transcribe.py b/tests/ccbot/test_transcribe.py index b1f979b9..bd27d258 100644 --- a/tests/ccbot/test_transcribe.py +++ b/tests/ccbot/test_transcribe.py @@ -18,13 +18,22 @@ def _reset_client(): @pytest.fixture def mock_config(): - """Patch config with test values.""" + """Patch config with test values for OpenAI backend.""" with patch.object(transcribe, "config") as cfg: cfg.openai_api_key = "sk-test-key" cfg.openai_base_url = "https://api.openai.com/v1" + cfg.whisper_backend = "openai" yield cfg +@pytest.fixture +def ogg_file(tmp_path): + """Create a temporary OGG file with fake data.""" + f = tmp_path / "test.ogg" + f.write_bytes(b"fake-ogg-data") + return f + + def _mock_response(*, json_data: dict, status_code: int = 200) -> httpx.Response: """Build a fake httpx.Response.""" request = httpx.Request("POST", "https://api.openai.com/v1/audio/transcriptions") @@ -32,14 +41,14 @@ def _mock_response(*, json_data: dict, status_code: int = 200) -> httpx.Response return resp -class TestTranscribeVoice: +class TestTranscribeOpenAI: @pytest.mark.asyncio - async def test_success(self, mock_config): + async def test_success(self, mock_config, ogg_file): resp = _mock_response(json_data={"text": "Hello world"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ) as mock_post: - result = await transcribe.transcribe_voice(b"fake-ogg-data") + result = await transcribe.transcribe(ogg_file) assert result == "Hello world" mock_post.assert_called_once() @@ -47,66 +56,87 @@ async def test_success(self, mock_config): assert "Bearer sk-test-key" in str(call_kwargs) @pytest.mark.asyncio - async 
def test_empty_transcription_raises(self, mock_config): + async def test_empty_transcription_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"text": ""}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): - with pytest.raises(ValueError, match="Empty transcription"): - await transcribe.transcribe_voice(b"fake-ogg-data") + with pytest.raises( + transcribe.TranscriptionError, match="Empty transcription" + ): + await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_whitespace_only_raises(self, mock_config): + async def test_whitespace_only_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"text": " "}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): - with pytest.raises(ValueError, match="Empty transcription"): - await transcribe.transcribe_voice(b"fake-ogg-data") + with pytest.raises( + transcribe.TranscriptionError, match="Empty transcription" + ): + await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_missing_text_field_raises(self, mock_config): + async def test_missing_text_field_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"result": "something"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): - with pytest.raises(ValueError, match="Empty transcription"): - await transcribe.transcribe_voice(b"fake-ogg-data") + with pytest.raises( + transcribe.TranscriptionError, match="Empty transcription" + ): + await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_api_error_raises(self, mock_config): + async def test_api_error_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"error": "Unauthorized"}, status_code=401) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): with pytest.raises(httpx.HTTPStatusError): - await transcribe.transcribe_voice(b"fake-ogg-data") 
+ await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_custom_base_url(self, mock_config): + async def test_custom_base_url(self, mock_config, ogg_file): mock_config.openai_base_url = "https://proxy.example.com/v1" resp = _mock_response(json_data={"text": "Transcribed"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ) as mock_post: - result = await transcribe.transcribe_voice(b"fake-ogg-data") + result = await transcribe.transcribe(ogg_file) assert result == "Transcribed" url_arg = mock_post.call_args[0][0] assert url_arg == "https://proxy.example.com/v1/audio/transcriptions" @pytest.mark.asyncio - async def test_base_url_trailing_slash_stripped(self, mock_config): + async def test_base_url_trailing_slash_stripped(self, mock_config, ogg_file): mock_config.openai_base_url = "https://proxy.example.com/v1/" resp = _mock_response(json_data={"text": "OK"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ) as mock_post: - await transcribe.transcribe_voice(b"fake-ogg-data") + await transcribe.transcribe(ogg_file) url_arg = mock_post.call_args[0][0] assert url_arg == "https://proxy.example.com/v1/audio/transcriptions" + @pytest.mark.asyncio + async def test_missing_api_key_raises_disabled(self, mock_config, ogg_file): + mock_config.openai_api_key = "" + with pytest.raises(transcribe.TranscriptionDisabled, match="OPENAI_API_KEY"): + await transcribe.transcribe(ogg_file) + + +class TestTranscribeDisabled: + @pytest.mark.asyncio + async def test_off_backend_raises(self, ogg_file): + with patch.object(transcribe, "config") as cfg: + cfg.whisper_backend = "off" + with pytest.raises(transcribe.TranscriptionDisabled, match="disabled"): + await transcribe.transcribe(ogg_file) + class TestCloseClient: @pytest.mark.asyncio