From 29b50ef80c4351630b5b572134806552ec168238 Mon Sep 17 00:00:00 2001 From: Liav Edry Date: Sat, 7 Mar 2026 19:13:31 +0200 Subject: [PATCH] feat: add local whisper backend for voice transcription Add faster-whisper (CPU, free, no API key) as an alternative to the existing OpenAI transcription backend. Default language set to English to skip auto-detection overhead. Configurable via CCBOT_WHISPER_BACKEND (local/openai/off) and CCBOT_WHISPER_MODEL. Defaults to openai with the small model. Co-Authored-By: Claude Opus 4.6 --- .env.example | 10 +++ pyproject.toml | 3 + src/ccbot/bot.py | 73 ++++++++++++--------- src/ccbot/config.py | 10 +++ src/ccbot/transcribe.py | 114 +++++++++++++++++++++++++++++---- tests/ccbot/test_transcribe.py | 68 ++++++++++++++------ 6 files changed, 214 insertions(+), 64 deletions(-) diff --git a/.env.example b/.env.example index d36e2b0e..038e12f5 100644 --- a/.env.example +++ b/.env.example @@ -12,3 +12,13 @@ CLAUDE_COMMAND=claude # Monitor polling interval in seconds (optional, defaults to 2.0) MONITOR_POLL_INTERVAL=2.0 + +# Voice transcription backend (optional, defaults to "openai") # "openai" = OpenAI API (requires OPENAI_API_KEY) # "local" = faster-whisper on CPU (free, requires ffmpeg + uv pip install -e ".[voice]") # "off" = disable voice messages +CCBOT_WHISPER_BACKEND=openai + +# Whisper model size for local backend (optional, defaults to "small") # Options: tiny, base, small, medium, large-v3 +CCBOT_WHISPER_MODEL=small diff --git a/pyproject.toml b/pyproject.toml index f02ba25c..ad2c360f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,9 @@ dev = [ "pytest-cov>=6.0", "ruff>=0.8.0", ] +voice = [ + "faster-whisper>=1.0.0", +] [build-system] requires = ["hatchling"] diff --git a/src/ccbot/bot.py b/src/ccbot/bot.py index bf6e8009..bf598cba 100644 --- a/src/ccbot/bot.py +++ b/src/ccbot/bot.py @@ -12,8 +12,8 @@ Unbound topics trigger the directory browser to create a new session. 
- Photo handling: photos sent by user are downloaded and forwarded to Claude Code as file paths (photo_handler). - - Voice handling: voice messages are transcribed via OpenAI API and - forwarded as text (voice_handler). + - Voice handling: voice messages are transcribed (local Whisper or OpenAI) + and forwarded as text to Claude Code (voice_handler). - Automatic cleanup: closing a topic kills the associated window (topic_closed_handler). Unsupported content (stickers, etc.) is rejected with a warning (unsupported_content_handler). @@ -134,8 +134,12 @@ from .session_monitor import NewMessage, SessionMonitor from .terminal_parser import extract_bash_output, is_interactive_ui from .tmux_manager import tmux_manager -from .transcribe import close_client as close_transcribe_client -from .transcribe import transcribe_voice +from .transcribe import ( + TranscriptionDisabled, + TranscriptionError, + close_client as close_transcribe_client, + transcribe, +) from .utils import ccbot_dir logger = logging.getLogger(__name__) @@ -551,7 +555,7 @@ async def unsupported_content_handler( logger.debug("Unsupported content from user %d", user.id) await safe_reply( update.message, - "⚠ Only text, photo, and voice messages are supported. Stickers, video, and other media cannot be forwarded to Claude Code.", + "⚠ Only text, photo, and voice messages are supported. 
Stickers and other media cannot be forwarded to Claude Code.", ) @@ -559,8 +563,12 @@ async def unsupported_content_handler( _IMAGES_DIR = ccbot_dir() / "images" _IMAGES_DIR.mkdir(parents=True, exist_ok=True) +# --- Audio directory for incoming voice messages --- +_AUDIO_DIR = ccbot_dir() / "audio" +_AUDIO_DIR.mkdir(parents=True, exist_ok=True) + -async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: +async def photo_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None: """Handle photos sent by the user: download and forward path to Claude Code.""" user = update.effective_user if not user or not is_user_allowed(user.id): @@ -631,8 +639,8 @@ async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N await safe_reply(update.message, "📷 Image sent to Claude Code.") -async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: - """Handle voice messages: transcribe via OpenAI and forward text to Claude Code.""" +async def voice_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None: + """Handle voice messages: download, transcribe, and forward text to Claude Code.""" user = update.effective_user if not user or not is_user_allowed(user.id): if update.message: @@ -642,14 +650,6 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N if not update.message or not update.message.voice: return - if not config.openai_api_key: - await safe_reply( - update.message, - "⚠ Voice transcription requires an OpenAI API key.\n" - "Set `OPENAI_API_KEY` in your `.env` file and restart the bot.", - ) - return - chat = update.message.chat thread_id = _get_thread_id(update) if chat.type in ("group", "supergroup") and thread_id is not None: @@ -681,30 +681,39 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N ) return - # Download voice as in-memory bytes - voice_file = await update.message.voice.get_file() - ogg_data = 
bytes(await voice_file.download_as_bytearray()) + # Download the voice file + voice = update.message.voice + tg_file = await voice.get_file() + filename = f"{int(time.time())}_{voice.file_unique_id}.ogg" + file_path = _AUDIO_DIR / filename + await tg_file.download_to_drive(file_path) # Transcribe try: - text = await transcribe_voice(ogg_data) - except ValueError as e: - await safe_reply(update.message, f"⚠ {e}") + await update.message.chat.send_action(ChatAction.TYPING) + text = await transcribe(file_path) + except TranscriptionDisabled as e: + await safe_reply(update.message, f"🎤 Voice not available: {e}") + return + except TranscriptionError as e: + await safe_reply(update.message, f"❌ Transcription failed: {e}") return - except Exception as e: - logger.error("Voice transcription failed: %s", e) - await safe_reply(update.message, f"⚠ Transcription failed: {e}") + except Exception: + logger.exception("Unexpected transcription error") + await safe_reply(update.message, "❌ Transcription failed unexpectedly.") return + finally: + # Clean up audio file + file_path.unlink(missing_ok=True) - await update.message.chat.send_action(ChatAction.TYPING) - clear_status_msg_info(user.id, thread_id) + # Show transcription to user + await safe_reply(update.message, f'🎤 "{text}"') + # Forward to Claude Code + clear_status_msg_info(user.id, thread_id) success, message = await session_manager.send_to_window(wid, text) if not success: await safe_reply(update.message, f"❌ {message}") - return - - await safe_reply(update.message, f'🎤 "{text}"') # Active bash capture tasks: (user_id, thread_id) → asyncio.Task @@ -1914,9 +1923,9 @@ def create_bot() -> Application: ) # Photos: download and forward file path to Claude Code application.add_handler(MessageHandler(filters.PHOTO, photo_handler)) - # Voice: transcribe via OpenAI and forward text to Claude Code + # Voice messages: transcribe and forward text to Claude Code application.add_handler(MessageHandler(filters.VOICE, voice_handler)) - # 
Catch-all: non-text content (stickers, video, etc.) + # Catch-all: non-text content (stickers, etc.) application.add_handler( MessageHandler( ~filters.COMMAND & ~filters.TEXT & ~filters.StatusUpdate.ALL, diff --git a/src/ccbot/config.py b/src/ccbot/config.py index ca3d6744..8ba5c504 100644 --- a/src/ccbot/config.py +++ b/src/ccbot/config.py @@ -99,6 +99,16 @@ def __init__(self) -> None: "OPENAI_BASE_URL", "https://api.openai.com/v1" ) + # Voice transcription backend: "openai", "local" (faster-whisper), or "off" + self.whisper_backend: str = os.getenv("CCBOT_WHISPER_BACKEND", "openai").lower() + if self.whisper_backend not in ("local", "openai", "off"): + logger.warning( + "Unknown CCBOT_WHISPER_BACKEND=%r, defaulting to 'off'", + self.whisper_backend, + ) + self.whisper_backend = "off" + self.whisper_model: str = os.getenv("CCBOT_WHISPER_MODEL", "small") + # Scrub sensitive vars from os.environ so child processes never inherit them. # Values are already captured in Config attributes above. for var in SENSITIVE_ENV_VARS: diff --git a/src/ccbot/transcribe.py b/src/ccbot/transcribe.py index 8e62a4cd..fde5ddc5 100644 --- a/src/ccbot/transcribe.py +++ b/src/ccbot/transcribe.py @@ -1,12 +1,17 @@ -"""Voice-to-text transcription via OpenAI's audio API. +"""Voice message transcription — converts audio to text. -Provides a single async function to transcribe voice messages using -the gpt-4o-transcribe model. Uses httpx directly (no OpenAI SDK needed). +Three modes (configured via CCBOT_WHISPER_BACKEND): + - "local" — faster-whisper on CPU (free, no API key, requires ffmpeg) + - "openai" — OpenAI API (gpt-4o-transcribe, requires OPENAI_API_KEY) + - "off" — voice messages disabled -Key function: transcribe_voice(ogg_data) -> str +Key function: transcribe() — async, returns transcribed text. 
""" +import asyncio import logging +from pathlib import Path +from typing import Any import httpx @@ -14,6 +19,17 @@ logger = logging.getLogger(__name__) + +class TranscriptionError(Exception): + """Raised when transcription fails.""" + + +class TranscriptionDisabled(Exception): + """Raised when voice transcription is disabled or unavailable.""" + + +# --- OpenAI backend --- + _client: httpx.AsyncClient | None = None @@ -25,15 +41,17 @@ def _get_client() -> httpx.AsyncClient: return _client -async def transcribe_voice(ogg_data: bytes) -> str: - """Transcribe OGG voice data to text via OpenAI API. - - Raises: - httpx.HTTPStatusError: On API errors (401, 429, 5xx, etc.) - ValueError: If the API returns an empty transcription. - """ +async def _transcribe_openai(file_path: Path) -> str: + """Transcribe via OpenAI API (gpt-4o-transcribe).""" + if not config.openai_api_key: + raise TranscriptionDisabled( + "Voice transcription requires OPENAI_API_KEY in .env.\n" + "Or set CCBOT_WHISPER_BACKEND=local for free local transcription " + '(requires: uv pip install -e ".[voice]").' 
+ ) url = f"{config.openai_base_url.rstrip('/')}/audio/transcriptions" client = _get_client() + ogg_data = file_path.read_bytes() response = await client.post( url, headers={"Authorization": f"Bearer {config.openai_api_key}"}, @@ -41,10 +59,9 @@ async def transcribe_voice(ogg_data: bytes) -> str: data={"model": "gpt-4o-transcribe"}, ) response.raise_for_status() - text = response.json().get("text", "").strip() if not text: - raise ValueError("Empty transcription returned by API") + raise TranscriptionError("Empty transcription returned by OpenAI API") return text @@ -54,3 +71,74 @@ async def close_client() -> None: if _client is not None and not _client.is_closed: await _client.aclose() _client = None + + +# --- Local (faster-whisper) backend --- + +_local_model: Any = None + + +def _get_local_model() -> Any: + """Lazy-load the faster-whisper model (downloads on first use).""" + global _local_model + if _local_model is not None: + return _local_model + try: + from faster_whisper import WhisperModel # type: ignore[import-untyped] + except ImportError: + raise TranscriptionDisabled( + "faster-whisper is not installed. " + 'Install with: uv pip install -e ".[voice]"\n' + "Or set CCBOT_WHISPER_BACKEND=openai/off." 
+ ) + logger.info( + "Loading faster-whisper model '%s' (may download on first use)...", + config.whisper_model, + ) + _local_model = WhisperModel(config.whisper_model, device="cpu", compute_type="int8") + logger.info("faster-whisper model loaded successfully") + return _local_model + + +def _transcribe_local_sync(file_path: Path) -> str: + """Synchronous transcription using faster-whisper (CPU-bound).""" + model = _get_local_model() + segments, info = model.transcribe(str(file_path), beam_size=5, language="en") + text = " ".join(segment.text.strip() for segment in segments) + if not text.strip(): + raise TranscriptionError("Transcription produced empty text") + logger.info( + "Transcribed %s: language=%s, duration=%.1fs, text_len=%d", + file_path.name, + info.language, + info.duration, + len(text), + ) + return text.strip() + + +# --- Unified entry point --- + + +async def transcribe(file_path: Path) -> str: + """Transcribe an audio file to text using the configured backend. + + Args: + file_path: Path to the audio file (OGG/Opus from Telegram). + + Returns: + Transcribed text string. + + Raises: + TranscriptionDisabled: Voice is disabled or dependency missing. + TranscriptionError: Transcription failed or produced empty text. 
+ """ + if config.whisper_backend == "off": + raise TranscriptionDisabled("Voice transcription is disabled.") + + if config.whisper_backend == "openai": + return await _transcribe_openai(file_path) + + # local backend + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, _transcribe_local_sync, file_path) diff --git a/tests/ccbot/test_transcribe.py b/tests/ccbot/test_transcribe.py index b1f979b9..bd27d258 100644 --- a/tests/ccbot/test_transcribe.py +++ b/tests/ccbot/test_transcribe.py @@ -18,13 +18,22 @@ def _reset_client(): @pytest.fixture def mock_config(): - """Patch config with test values.""" + """Patch config with test values for OpenAI backend.""" with patch.object(transcribe, "config") as cfg: cfg.openai_api_key = "sk-test-key" cfg.openai_base_url = "https://api.openai.com/v1" + cfg.whisper_backend = "openai" yield cfg +@pytest.fixture +def ogg_file(tmp_path): + """Create a temporary OGG file with fake data.""" + f = tmp_path / "test.ogg" + f.write_bytes(b"fake-ogg-data") + return f + + def _mock_response(*, json_data: dict, status_code: int = 200) -> httpx.Response: """Build a fake httpx.Response.""" request = httpx.Request("POST", "https://api.openai.com/v1/audio/transcriptions") @@ -32,14 +41,14 @@ def _mock_response(*, json_data: dict, status_code: int = 200) -> httpx.Response return resp -class TestTranscribeVoice: +class TestTranscribeOpenAI: @pytest.mark.asyncio - async def test_success(self, mock_config): + async def test_success(self, mock_config, ogg_file): resp = _mock_response(json_data={"text": "Hello world"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ) as mock_post: - result = await transcribe.transcribe_voice(b"fake-ogg-data") + result = await transcribe.transcribe(ogg_file) assert result == "Hello world" mock_post.assert_called_once() @@ -47,66 +56,87 @@ async def test_success(self, mock_config): assert "Bearer sk-test-key" in str(call_kwargs) @pytest.mark.asyncio - async 
def test_empty_transcription_raises(self, mock_config): + async def test_empty_transcription_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"text": ""}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): - with pytest.raises(ValueError, match="Empty transcription"): - await transcribe.transcribe_voice(b"fake-ogg-data") + with pytest.raises( + transcribe.TranscriptionError, match="Empty transcription" + ): + await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_whitespace_only_raises(self, mock_config): + async def test_whitespace_only_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"text": " "}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): - with pytest.raises(ValueError, match="Empty transcription"): - await transcribe.transcribe_voice(b"fake-ogg-data") + with pytest.raises( + transcribe.TranscriptionError, match="Empty transcription" + ): + await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_missing_text_field_raises(self, mock_config): + async def test_missing_text_field_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"result": "something"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): - with pytest.raises(ValueError, match="Empty transcription"): - await transcribe.transcribe_voice(b"fake-ogg-data") + with pytest.raises( + transcribe.TranscriptionError, match="Empty transcription" + ): + await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_api_error_raises(self, mock_config): + async def test_api_error_raises(self, mock_config, ogg_file): resp = _mock_response(json_data={"error": "Unauthorized"}, status_code=401) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ): with pytest.raises(httpx.HTTPStatusError): - await transcribe.transcribe_voice(b"fake-ogg-data") 
+ await transcribe.transcribe(ogg_file) @pytest.mark.asyncio - async def test_custom_base_url(self, mock_config): + async def test_custom_base_url(self, mock_config, ogg_file): mock_config.openai_base_url = "https://proxy.example.com/v1" resp = _mock_response(json_data={"text": "Transcribed"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ) as mock_post: - result = await transcribe.transcribe_voice(b"fake-ogg-data") + result = await transcribe.transcribe(ogg_file) assert result == "Transcribed" url_arg = mock_post.call_args[0][0] assert url_arg == "https://proxy.example.com/v1/audio/transcriptions" @pytest.mark.asyncio - async def test_base_url_trailing_slash_stripped(self, mock_config): + async def test_base_url_trailing_slash_stripped(self, mock_config, ogg_file): mock_config.openai_base_url = "https://proxy.example.com/v1/" resp = _mock_response(json_data={"text": "OK"}) with patch.object( httpx.AsyncClient, "post", new_callable=AsyncMock, return_value=resp ) as mock_post: - await transcribe.transcribe_voice(b"fake-ogg-data") + await transcribe.transcribe(ogg_file) url_arg = mock_post.call_args[0][0] assert url_arg == "https://proxy.example.com/v1/audio/transcriptions" + @pytest.mark.asyncio + async def test_missing_api_key_raises_disabled(self, mock_config, ogg_file): + mock_config.openai_api_key = "" + with pytest.raises(transcribe.TranscriptionDisabled, match="OPENAI_API_KEY"): + await transcribe.transcribe(ogg_file) + + +class TestTranscribeDisabled: + @pytest.mark.asyncio + async def test_off_backend_raises(self, ogg_file): + with patch.object(transcribe, "config") as cfg: + cfg.whisper_backend = "off" + with pytest.raises(transcribe.TranscriptionDisabled, match="disabled"): + await transcribe.transcribe(ogg_file) + class TestCloseClient: @pytest.mark.asyncio