Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,13 @@ CLAUDE_COMMAND=claude

# Monitor polling interval in seconds (optional, defaults to 2.0)
MONITOR_POLL_INTERVAL=2.0

# Voice transcription backend (optional, defaults to "openai")
# "openai" = OpenAI API (requires OPENAI_API_KEY)
# "local" = faster-whisper on CPU (free, requires ffmpeg + uv pip install -e ".[voice]")
# "off" = disable voice messages
CCBOT_WHISPER_BACKEND=openai

# Whisper model size for local backend (optional, defaults to "small")
# Options: tiny, base, small, medium, large-v3
CCBOT_WHISPER_MODEL=small
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ dev = [
"pytest-cov>=6.0",
"ruff>=0.8.0",
]
voice = [
"faster-whisper>=1.0.0",
]

[build-system]
requires = ["hatchling"]
Expand Down
73 changes: 41 additions & 32 deletions src/ccbot/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
Unbound topics trigger the directory browser to create a new session.
- Photo handling: photos sent by user are downloaded and forwarded
to Claude Code as file paths (photo_handler).
- Voice handling: voice messages are transcribed via OpenAI API and
forwarded as text (voice_handler).
- Voice handling: voice messages are transcribed (local Whisper or OpenAI)
and forwarded as text to Claude Code (voice_handler).
- Automatic cleanup: closing a topic kills the associated window
(topic_closed_handler). Unsupported content (stickers, etc.)
is rejected with a warning (unsupported_content_handler).
Expand Down Expand Up @@ -134,8 +134,12 @@
from .session_monitor import NewMessage, SessionMonitor
from .terminal_parser import extract_bash_output, is_interactive_ui
from .tmux_manager import tmux_manager
from .transcribe import close_client as close_transcribe_client
from .transcribe import transcribe_voice
from .transcribe import (
TranscriptionDisabled,
TranscriptionError,
close_client as close_transcribe_client,
transcribe,
)
from .utils import ccbot_dir

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -551,16 +555,20 @@ async def unsupported_content_handler(
logger.debug("Unsupported content from user %d", user.id)
await safe_reply(
update.message,
"⚠ Only text, photo, and voice messages are supported. Stickers, video, and other media cannot be forwarded to Claude Code.",
"⚠ Only text, photo, and voice messages are supported. Stickers and other media cannot be forwarded to Claude Code.",
)


# --- Image directory for incoming photos ---
_IMAGES_DIR = ccbot_dir() / "images"
_IMAGES_DIR.mkdir(parents=True, exist_ok=True)

# --- Audio directory for incoming voice messages ---
_AUDIO_DIR = ccbot_dir() / "audio"
_AUDIO_DIR.mkdir(parents=True, exist_ok=True)


async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
async def photo_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle photos sent by the user: download and forward path to Claude Code."""
user = update.effective_user
if not user or not is_user_allowed(user.id):
Expand Down Expand Up @@ -631,8 +639,8 @@ async def photo_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N
await safe_reply(update.message, "📷 Image sent to Claude Code.")


async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle voice messages: transcribe via OpenAI and forward text to Claude Code."""
async def voice_handler(update: Update, _context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle voice messages: download, transcribe, and forward text to Claude Code."""
user = update.effective_user
if not user or not is_user_allowed(user.id):
if update.message:
Expand All @@ -642,14 +650,6 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N
if not update.message or not update.message.voice:
return

if not config.openai_api_key:
await safe_reply(
update.message,
"⚠ Voice transcription requires an OpenAI API key.\n"
"Set `OPENAI_API_KEY` in your `.env` file and restart the bot.",
)
return

chat = update.message.chat
thread_id = _get_thread_id(update)
if chat.type in ("group", "supergroup") and thread_id is not None:
Expand Down Expand Up @@ -681,30 +681,39 @@ async def voice_handler(update: Update, context: ContextTypes.DEFAULT_TYPE) -> N
)
return

# Download voice as in-memory bytes
voice_file = await update.message.voice.get_file()
ogg_data = bytes(await voice_file.download_as_bytearray())
# Download the voice file
voice = update.message.voice
tg_file = await voice.get_file()
filename = f"{int(time.time())}_{voice.file_unique_id}.ogg"
file_path = _AUDIO_DIR / filename
await tg_file.download_to_drive(file_path)

# Transcribe
try:
text = await transcribe_voice(ogg_data)
except ValueError as e:
await safe_reply(update.message, f"⚠ {e}")
await update.message.chat.send_action(ChatAction.TYPING)
text = await transcribe(file_path)
except TranscriptionDisabled as e:
await safe_reply(update.message, f"🎤 Voice not available: {e}")
return
except TranscriptionError as e:
await safe_reply(update.message, f"❌ Transcription failed: {e}")
return
except Exception as e:
logger.error("Voice transcription failed: %s", e)
await safe_reply(update.message, f"⚠ Transcription failed: {e}")
except Exception:
logger.exception("Unexpected transcription error")
await safe_reply(update.message, "❌ Transcription failed unexpectedly.")
return
finally:
# Clean up audio file
file_path.unlink(missing_ok=True)

await update.message.chat.send_action(ChatAction.TYPING)
clear_status_msg_info(user.id, thread_id)
# Show transcription to user
await safe_reply(update.message, f'🎤 "{text}"')

# Forward to Claude Code
clear_status_msg_info(user.id, thread_id)
success, message = await session_manager.send_to_window(wid, text)
if not success:
await safe_reply(update.message, f"❌ {message}")
return

await safe_reply(update.message, f'🎤 "{text}"')


# Active bash capture tasks: (user_id, thread_id) → asyncio.Task
Expand Down Expand Up @@ -1914,9 +1923,9 @@ def create_bot() -> Application:
)
# Photos: download and forward file path to Claude Code
application.add_handler(MessageHandler(filters.PHOTO, photo_handler))
# Voice: transcribe via OpenAI and forward text to Claude Code
# Voice messages: transcribe and forward text to Claude Code
application.add_handler(MessageHandler(filters.VOICE, voice_handler))
# Catch-all: non-text content (stickers, video, etc.)
# Catch-all: non-text content (stickers, etc.)
application.add_handler(
MessageHandler(
~filters.COMMAND & ~filters.TEXT & ~filters.StatusUpdate.ALL,
Expand Down
10 changes: 10 additions & 0 deletions src/ccbot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@ def __init__(self) -> None:
"OPENAI_BASE_URL", "https://api.openai.com/v1"
)

# Voice transcription backend: "openai", "local" (faster-whisper), or "off"
self.whisper_backend: str = os.getenv("CCBOT_WHISPER_BACKEND", "openai").lower()
if self.whisper_backend not in ("local", "openai", "off"):
logger.warning(
"Unknown CCBOT_WHISPER_BACKEND=%r, defaulting to 'off'",
self.whisper_backend,
)
self.whisper_backend = "off"
self.whisper_model: str = os.getenv("CCBOT_WHISPER_MODEL", "small")

# Scrub sensitive vars from os.environ so child processes never inherit them.
# Values are already captured in Config attributes above.
for var in SENSITIVE_ENV_VARS:
Expand Down
114 changes: 101 additions & 13 deletions src/ccbot/transcribe.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
"""Voice-to-text transcription via OpenAI's audio API.
"""Voice message transcription — converts audio to text.

Provides a single async function to transcribe voice messages using
the gpt-4o-transcribe model. Uses httpx directly (no OpenAI SDK needed).
Supports two backends (configured via CCBOT_WHISPER_BACKEND):
- "local" — faster-whisper on CPU (free, no API key, requires ffmpeg)
- "openai" — OpenAI API (gpt-4o-transcribe, requires OPENAI_API_KEY)
- "off" — voice messages disabled

Key function: transcribe_voice(ogg_data) -> str
Key function: transcribe() — async, returns transcribed text.
"""

import asyncio
import logging
from pathlib import Path
from typing import Any

import httpx

from .config import config

logger = logging.getLogger(__name__)


class TranscriptionError(Exception):
    """Raised when a transcription attempt runs but fails or produces empty text."""


class TranscriptionDisabled(Exception):
    """Raised when voice transcription is disabled or unavailable (missing key/dependency)."""


# --- OpenAI backend ---

_client: httpx.AsyncClient | None = None


Expand All @@ -25,26 +41,27 @@ def _get_client() -> httpx.AsyncClient:
return _client


async def transcribe_voice(ogg_data: bytes) -> str:
"""Transcribe OGG voice data to text via OpenAI API.

Raises:
httpx.HTTPStatusError: On API errors (401, 429, 5xx, etc.)
ValueError: If the API returns an empty transcription.
"""
async def _transcribe_openai(file_path: Path) -> str:
    """Transcribe an audio file via the OpenAI API (gpt-4o-transcribe).

    Args:
        file_path: Path to the audio file (OGG/Opus from Telegram).

    Returns:
        The transcribed text, stripped of surrounding whitespace.

    Raises:
        TranscriptionDisabled: If OPENAI_API_KEY is not configured.
        TranscriptionError: If the API returns an empty transcription.
        httpx.HTTPStatusError: On API errors (401, 429, 5xx, etc.).
    """
    if not config.openai_api_key:
        raise TranscriptionDisabled(
            "Voice transcription requires OPENAI_API_KEY in .env.\n"
            "Or set CCBOT_WHISPER_BACKEND=local for free local transcription "
            '(requires: uv pip install -e ".[voice]").'
        )
    url = f"{config.openai_base_url.rstrip('/')}/audio/transcriptions"
    client = _get_client()
    audio_bytes = file_path.read_bytes()
    response = await client.post(
        url,
        headers={"Authorization": f"Bearer {config.openai_api_key}"},
        files={"file": ("voice.ogg", audio_bytes, "audio/ogg")},
        data={"model": "gpt-4o-transcribe"},
    )
    response.raise_for_status()

    text = response.json().get("text", "").strip()
    if not text:
        raise TranscriptionError("Empty transcription returned by OpenAI API")
    return text


Expand All @@ -54,3 +71,74 @@ async def close_client() -> None:
if _client is not None and not _client.is_closed:
await _client.aclose()
_client = None


# --- Local (faster-whisper) backend ---

_local_model: Any = None


def _get_local_model() -> Any:
    """Lazy-load and cache the faster-whisper model (downloads weights on first use).

    Returns:
        The module-level cached WhisperModel instance.

    Raises:
        TranscriptionDisabled: If faster-whisper is not installed.
    """
    global _local_model
    if _local_model is not None:
        return _local_model
    try:
        from faster_whisper import WhisperModel  # type: ignore[import-untyped]
    except ImportError as e:
        # Chain the original ImportError so the real cause stays in the traceback.
        raise TranscriptionDisabled(
            "faster-whisper is not installed. "
            'Install with: uv pip install -e ".[voice]"\n'
            "Or set CCBOT_WHISPER_BACKEND=openai/off."
        ) from e
    logger.info(
        "Loading faster-whisper model '%s' (may download on first use)...",
        config.whisper_model,
    )
    # int8 on CPU keeps memory usage low with minimal accuracy loss.
    _local_model = WhisperModel(config.whisper_model, device="cpu", compute_type="int8")
    logger.info("faster-whisper model loaded successfully")
    return _local_model


def _transcribe_local_sync(file_path: Path) -> str:
    """Run faster-whisper on *file_path* and return the transcript (blocking, CPU-bound)."""
    whisper = _get_local_model()
    segments, info = whisper.transcribe(str(file_path), beam_size=5, language="en")
    # Consuming the segments generator is what actually performs the transcription.
    pieces = [seg.text.strip() for seg in segments]
    text = " ".join(pieces)
    if not text.strip():
        raise TranscriptionError("Transcription produced empty text")
    logger.info(
        "Transcribed %s: language=%s, duration=%.1fs, text_len=%d",
        file_path.name,
        info.language,
        info.duration,
        len(text),
    )
    return text.strip()


# --- Unified entry point ---


async def transcribe(file_path: Path) -> str:
    """Transcribe an audio file to text using the configured backend.

    Args:
        file_path: Path to the audio file (OGG/Opus from Telegram).

    Returns:
        Transcribed text string.

    Raises:
        TranscriptionDisabled: Voice is disabled or dependency missing.
        TranscriptionError: Transcription failed or produced empty text.
    """
    backend = config.whisper_backend
    if backend == "off":
        raise TranscriptionDisabled("Voice transcription is disabled.")

    if backend == "openai":
        return await _transcribe_openai(file_path)

    # "local" backend: faster-whisper is CPU-bound, so run it off the event loop.
    return await asyncio.to_thread(_transcribe_local_sync, file_path)
Loading
Loading