diff --git a/src/bot/features/image_handler.py b/src/bot/features/image_handler.py index f9cf3798..4c909c39 100644 --- a/src/bot/features/image_handler.py +++ b/src/bot/features/image_handler.py @@ -57,7 +57,6 @@ async def process_image( else: prompt = self._create_generic_prompt(caption) - # Convert to base64 for Claude (if supported in future) base64_image = base64.b64encode(image_bytes).decode("utf-8") return ProcessedImage( diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index 1124d006..c66f31e6 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -42,6 +42,13 @@ logger = structlog.get_logger() +_MEDIA_TYPE_MAP = { + "png": "image/png", + "jpeg": "image/jpeg", + "gif": "image/gif", + "webp": "image/webp", +} + # Patterns that look like secrets/credentials in CLI arguments _SECRET_PATTERNS: List[re.Pattern[str]] = [ # API keys / tokens (sk-ant-..., sk-..., ghp_..., gho_..., github_pat_..., xoxb-...) @@ -1353,6 +1360,14 @@ async def agentic_photo( processed_image = await image_handler.process_image( photo, update.message.caption ) + fmt = processed_image.metadata.get("format", "png") + images = [ + { + "data": processed_image.base64_data, + "media_type": _MEDIA_TYPE_MAP.get(fmt, "image/png"), + } + ] + await self._handle_agentic_media_message( update=update, context=context, @@ -1360,6 +1375,7 @@ async def agentic_photo( progress_msg=progress_msg, user_id=user_id, chat=chat, + images=images, ) except Exception as e: @@ -1420,6 +1436,7 @@ async def _handle_agentic_media_message( progress_msg: Any, user_id: int, chat: Any, + images: Optional[List[Dict[str, str]]] = None, ) -> None: """Run a media-derived prompt through Claude and send responses.""" claude_integration = context.bot_data.get("claude_integration") @@ -1456,6 +1473,7 @@ async def _handle_agentic_media_message( session_id=session_id, on_stream=on_stream, force_new=force_new, + images=images, ) finally: heartbeat.cancel() diff --git a/src/claude/facade.py b/src/claude/facade.py index 5c7276eb..b1cafba4 100644 --- a/src/claude/facade.py +++ b/src/claude/facade.py @@ -39,6 +39,7 @@ async def run_command( on_stream: Optional[Callable[[StreamUpdate], None]] = None, force_new: bool = False, interrupt_event: Optional["asyncio.Event"] = None, + images: Optional[List[Dict[str, str]]] = None, ) -> ClaudeResponse: """Run Claude Code command with full integration.""" logger.info( @@ -88,6 +89,7 @@ async def run_command( continue_session=should_continue, stream_callback=on_stream, interrupt_event=interrupt_event, + images=images, ) except Exception as resume_error: # If resume failed (e.g., session expired/missing on Claude's side), @@ -113,6 +115,7 @@ async def run_command( continue_session=False, stream_callback=on_stream, interrupt_event=interrupt_event, + images=images, ) else: raise @@ -157,6 +160,7 @@ async def _execute( continue_session: bool = False, stream_callback: Optional[Callable] = None, interrupt_event: Optional[asyncio.Event] = None, + images: Optional[List[Dict[str, str]]] = None, ) -> ClaudeResponse: """Execute command via SDK.""" return await self.sdk_manager.execute_command( @@ -166,6 +170,7 @@ async def _execute( continue_session=continue_session, stream_callback=stream_callback, interrupt_event=interrupt_event, + images=images, ) async def _find_resumable_session( diff --git a/src/claude/sdk_integration.py b/src/claude/sdk_integration.py index ab9c4046..b594005e 100644 --- a/src/claude/sdk_integration.py +++ b/src/claude/sdk_integration.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Callable, Dict, List, Optional +from typing import Any, AsyncIterator, Callable, Dict, List, Optional import structlog from claude_agent_sdk import ( @@ -155,6 +155,7 @@ async def execute_command( continue_session: bool = False, stream_callback: Optional[Callable[[StreamUpdate], None]] = None, interrupt_event: Optional[asyncio.Event] = None, + images: Optional[List[Dict[str, str]]] = None, ) -> ClaudeResponse: """Execute Claude Code command via SDK.""" start_time = asyncio.get_event_loop().time() @@ -248,7 +249,37 @@ async def _run_client() -> None: client = ClaudeSDKClient(options) try: await client.connect() - await client.query(prompt) + + if images: + content_blocks: List[Dict[str, Any]] = [] + for img in images: + media_type = img.get("media_type", "image/png") + content_blocks.append( + { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": img["data"], + }, + } + ) + content_blocks.append({"type": "text", "text": prompt}) + + multimodal_msg = { + "type": "user", + "message": { + "role": "user", + "content": content_blocks, + }, + } + + async def _multimodal_prompt() -> AsyncIterator[Dict[str, Any]]: + yield multimodal_msg + + await client.query(_multimodal_prompt()) + else: + await client.query(prompt) async for raw_data in client._query.receive_messages(): try: