Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,13 @@ CLAUDE_COMMAND=claude

# Monitor polling interval in seconds (optional, defaults to 2.0)
MONITOR_POLL_INTERVAL=2.0

# Voice transcription backend (optional, defaults to "openai")
# "openai" = OpenAI API (requires OPENAI_API_KEY)
# "local" = faster-whisper on CPU (free, requires ffmpeg + uv pip install -e ".[voice]")
# "off" = disable voice messages
CCBOT_WHISPER_BACKEND=openai

# Whisper model size for local backend (optional, defaults to "small")
# Options: tiny, base, small, medium, large-v3
CCBOT_WHISPER_MODEL=small
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ dev = [
"pytest-cov>=6.0",
"ruff>=0.8.0",
]
voice = [
"faster-whisper>=1.0.0",
]

[build-system]
requires = ["hatchling"]
Expand Down
73 changes: 41 additions & 32 deletions src/ccbot/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
Unbound topics trigger the directory browser to create a new session.
- Photo handling: photos sent by user are downloaded and forwarded
to Claude Code as file paths (photo_handler).
- Voice handling: voice messages are transcribed via OpenAI API and
forwarded as text (voice_handler).
- Voice handling: voice messages are transcribed (local Whisper or OpenAI)
and forwarded as text to Claude Code (voice_handler).
- Automatic cleanup: closing a topic kills the associated window
(topic_closed_handler). Unsupported content (stickers, etc.)
is rejected with a warning (unsupported_content_handler).
Expand Down Expand Up @@ -134,8 +134,12 @@
from .session_monitor import NewMessage, SessionMonitor
from .terminal_parser import extract_bash_output, is_interactive_ui
from .tmux_manager import tmux_manager
from .transcribe import close_client as close_transcribe_client
from .transcribe import transcribe_voice
from .transcribe import (
TranscriptionDisabled,
TranscriptionError,
close_client as close_transcribe_client,
transcribe,
)
from .utils import ccbot_dir

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -551,16 +555,20 @@ async def unsupported_content_handler(
logger.debug("Unsupported content from user %d", user.id)
await safe_reply(
update.message,
"⚠ Only text, photo, and voice messages are supported. Stickers, video, and other media cannot be forwarded to Claude Code.",
"⚠ Only text, photo, and voice messages are supported. Stickers and other media cannot be forwarded to Claude Code.",
)


# --- Image directory for incoming photos ---
_IMAGES_DIR = ccbot_dir() / "images"
_IMAGES_DIR.mkdir(parents=True, exist_ok=True)

# --- Audio directory for incoming voice messages ---
_AUDIO_DIR = ccbot_dir() / "audio"
_AUDIO_DIR.mkdir(parents=True, exist_ok=True)


async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
async def photo_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle photos sent by the user: download and forward path to Claude Code."""
user = update.effective_user
if not user or not is_user_allowed(user.id):
Expand Down Expand Up @@ -631,8 +639,8 @@ async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N
await safe_reply(update.message, "📷 Image sent to Claude Code.")


async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle voice messages: transcribe via OpenAI and forward text to Claude Code."""
async def voice_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle voice messages: download, transcribe, and forward text to Claude Code."""
user = update.effective_user
if not user or not is_user_allowed(user.id):
if update.message:
Expand All @@ -642,14 +650,6 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N
if not update.message or not update.message.voice:
return

if not config.openai_api_key:
await safe_reply(
update.message,
"⚠ Voice transcription requires an OpenAI API key.\n"
"Set `OPENAI_API_KEY` in your `.env` file and restart the bot.",
)
return

chat = update.message.chat
thread_id = _get_thread_id(update)
if chat.type in ("group", "supergroup") and thread_id is not None:
Expand Down Expand Up @@ -681,30 +681,39 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N
)
return

# Download voice as in-memory bytes
voice_file = await update.message.voice.get_file()
ogg_data = bytes(await voice_file.download_as_bytearray())
# Download the voice file
voice = update.message.voice
tg_file = await voice.get_file()
filename = f"{int(time.time())}_{voice.file_unique_id}.ogg"
file_path = _AUDIO_DIR / filename
await tg_file.download_to_drive(file_path)

# Transcribe
try:
text = await transcribe_voice(ogg_data)
except ValueError as e:
await safe_reply(update.message, f"⚠ {e}")
await update.message.chat.send_action(ChatAction.TYPING)
text = await transcribe(file_path)
except TranscriptionDisabled as e:
await safe_reply(update.message, f"🎤 Voice not available: {e}")
return
except TranscriptionError as e:
await safe_reply(update.message, f"❌ Transcription failed: {e}")
return
except Exception as e:
logger.error("Voice transcription failed: %s", e)
await safe_reply(update.message, f"⚠ Transcription failed: {e}")
except Exception:
logger.exception("Unexpected transcription error")
await safe_reply(update.message, "❌ Transcription failed unexpectedly.")
return
finally:
# Clean up audio file
file_path.unlink(missing_ok=True)

await update.message.chat.send_action(ChatAction.TYPING)
clear_status_msg_info(user.id, thread_id)
# Show transcription to user
await safe_reply(update.message, f'🎤 "{text}"')

# Forward to Claude Code
clear_status_msg_info(user.id, thread_id)
success, message = await session_manager.send_to_window(wid, text)
if not success:
await safe_reply(update.message, f"❌ {message}")
return

await safe_reply(update.message, f'🎤 "{text}"')


# Active bash capture tasks: (user_id, thread_id) → asyncio.Task
Expand Down Expand Up @@ -1914,9 +1923,9 @@ def create_bot() -> Application:
)
# Photos: download and forward file path to Claude Code
application.add_handler(MessageHandler(filters.PHOTO, photo_handler))
# Voice: transcribe via OpenAI and forward text to Claude Code
# Voice messages: transcribe and forward text to Claude Code
application.add_handler(MessageHandler(filters.VOICE, voice_handler))
# Catch-all: non-text content (stickers, video, etc.)
# Catch-all: non-text content (stickers, etc.)
application.add_handler(
MessageHandler(
~filters.COMMAND & ~filters.TEXT & ~filters.StatusUpdate.ALL,
Expand Down
10 changes: 10 additions & 0 deletions src/ccbot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@ def __init__(self) -> None:
"OPENAI_BASE_URL", "https://api.openai.com/v1"
)

# Voice transcription backend: "openai", "local" (faster-whisper), or "off"
self.whisper_backend: str = os.getenv("CCBOT_WHISPER_BACKEND", "openai").lower()
if self.whisper_backend not in ("local", "openai", "off"):
logger.warning(
"Unknown CCBOT_WHISPER_BACKEND=%r, defaulting to 'off'",
self.whisper_backend,
)
self.whisper_backend = "off"
self.whisper_model: str = os.getenv("CCBOT_WHISPER_MODEL", "small")

# Scrub sensitive vars from os.environ so child processes never inherit them.
# Values are already captured in Config attributes above.
for var in SENSITIVE_ENV_VARS:
Expand Down
114 changes: 101 additions & 13 deletions src/ccbot/transcribe.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
"""Voice-to-text transcription via OpenAI's audio API.
"""Voice message transcription — converts audio to text.

Provides a single async function to transcribe voice messages using
the gpt-4o-transcribe model. Uses httpx directly (no OpenAI SDK needed).
Supports two backends (configured via CCBOT_WHISPER_BACKEND):
- "local" — faster-whisper on CPU (free, no API key, requires ffmpeg)
- "openai" — OpenAI API (gpt-4o-transcribe, requires OPENAI_API_KEY)
- "off" — voice messages disabled

Key function: transcribe_voice(ogg_data) -> str
Key function: transcribe() — async, returns transcribed text.
"""

import asyncio
import logging
from pathlib import Path
from typing import Any

import httpx

from .config import config

logger = logging.getLogger(__name__)


class TranscriptionError(Exception):
    """Raised when a transcription attempt runs but fails or produces empty text."""


class TranscriptionDisabled(Exception):
    """Raised when voice transcription is disabled or unavailable (missing key/dependency)."""


# --- OpenAI backend ---

_client: httpx.AsyncClient | None = None


Expand All @@ -25,26 +41,27 @@ def _get_client() -> httpx.AsyncClient:
return _client


async def transcribe_voice(ogg_data: bytes) -> str:
"""Transcribe OGG voice data to text via OpenAI API.

Raises:
httpx.HTTPStatusError: On API errors (401, 429, 5xx, etc.)
ValueError: If the API returns an empty transcription.
"""
async def _transcribe_openai(file_path: Path) -> str:
    """Transcribe an audio file via the OpenAI API (gpt-4o-transcribe).

    Args:
        file_path: Path to the audio file (OGG/Opus from Telegram).

    Returns:
        The transcribed text, stripped of surrounding whitespace.

    Raises:
        TranscriptionDisabled: If OPENAI_API_KEY is not configured.
        TranscriptionError: If the API returns an empty transcription.
        httpx.HTTPStatusError: On API errors (401, 429, 5xx, etc.).
    """
    if not config.openai_api_key:
        raise TranscriptionDisabled(
            "Voice transcription requires OPENAI_API_KEY in .env.\n"
            "Or set CCBOT_WHISPER_BACKEND=local for free local transcription "
            '(requires: uv pip install -e ".[voice]").'
        )
    url = f"{config.openai_base_url.rstrip('/')}/audio/transcriptions"
    client = _get_client()
    audio_bytes = file_path.read_bytes()
    response = await client.post(
        url,
        headers={"Authorization": f"Bearer {config.openai_api_key}"},
        files={"file": ("voice.ogg", audio_bytes, "audio/ogg")},
        data={"model": "gpt-4o-transcribe"},
    )
    response.raise_for_status()

    text = response.json().get("text", "").strip()
    if not text:
        raise TranscriptionError("Empty transcription returned by OpenAI API")
    return text


Expand All @@ -54,3 +71,74 @@ async def close_client() -> None:
if _client is not None and not _client.is_closed:
await _client.aclose()
_client = None


# --- Local (faster-whisper) backend ---

_local_model: Any = None


def _get_local_model() -> Any:
    """Lazy-load and cache the faster-whisper model (downloads weights on first use).

    Returns:
        The module-level cached WhisperModel instance.

    Raises:
        TranscriptionDisabled: If faster-whisper is not installed.
    """
    global _local_model
    if _local_model is not None:
        return _local_model
    try:
        from faster_whisper import WhisperModel  # type: ignore[import-untyped]
    except ImportError as e:
        # Chain the original ImportError so the real cause stays in the traceback.
        raise TranscriptionDisabled(
            "faster-whisper is not installed. "
            'Install with: uv pip install -e ".[voice]"\n'
            "Or set CCBOT_WHISPER_BACKEND=openai/off."
        ) from e
    logger.info(
        "Loading faster-whisper model '%s' (may download on first use)...",
        config.whisper_model,
    )
    # int8 on CPU keeps memory usage low with minimal accuracy loss.
    _local_model = WhisperModel(config.whisper_model, device="cpu", compute_type="int8")
    logger.info("faster-whisper model loaded successfully")
    return _local_model


def _transcribe_local_sync(file_path: Path) -> str:
    """Run faster-whisper on *file_path* and return the transcript (blocking, CPU-bound)."""
    whisper = _get_local_model()
    segments, info = whisper.transcribe(str(file_path), beam_size=5, language="en")
    # Consuming the segments generator is what actually performs the transcription.
    pieces = [seg.text.strip() for seg in segments]
    text = " ".join(pieces)
    if not text.strip():
        raise TranscriptionError("Transcription produced empty text")
    logger.info(
        "Transcribed %s: language=%s, duration=%.1fs, text_len=%d",
        file_path.name,
        info.language,
        info.duration,
        len(text),
    )
    return text.strip()


# --- Unified entry point ---


async def transcribe(file_path: Path) -> str:
    """Transcribe an audio file to text using the configured backend.

    Args:
        file_path: Path to the audio file (OGG/Opus from Telegram).

    Returns:
        Transcribed text string.

    Raises:
        TranscriptionDisabled: Voice is disabled or dependency missing.
        TranscriptionError: Transcription failed or produced empty text.
    """
    backend = config.whisper_backend
    if backend == "off":
        raise TranscriptionDisabled("Voice transcription is disabled.")

    if backend == "openai":
        return await _transcribe_openai(file_path)

    # "local" backend: faster-whisper is CPU-bound, so run it off the event loop.
    return await asyncio.to_thread(_transcribe_local_sync, file_path)
Loading
Loading