From a404083ad9428f9af7af0ee7c03a674cd878f4f8 Mon Sep 17 00:00:00 2001 From: whanod Date: Sun, 29 Mar 2026 17:08:48 +0000 Subject: [PATCH] fix: wire image data through to Claude for screenshot/photo support The bot was downloading and base64-encoding images but only passing a text prompt to Claude, never the actual image data. This sends images as multimodal content blocks via the SDK AsyncIterable path so Claude can actually see uploaded screenshots and photos. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bot/features/image_handler.py | 1 - src/bot/orchestrator.py | 18 ++++++++++++++++ src/claude/facade.py | 5 +++++ src/claude/sdk_integration.py | 35 +++++++++++++++++++++++++++++-- 4 files changed, 56 insertions(+), 3 deletions(-) diff --git a/src/bot/features/image_handler.py b/src/bot/features/image_handler.py index f9cf3798..4c909c39 100644 --- a/src/bot/features/image_handler.py +++ b/src/bot/features/image_handler.py @@ -57,7 +57,6 @@ async def process_image( else: prompt = self._create_generic_prompt(caption) - # Convert to base64 for Claude (if supported in future) base64_image = base64.b64encode(image_bytes).decode("utf-8") return ProcessedImage( diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index 1124d006..c66f31e6 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -42,6 +42,13 @@ logger = structlog.get_logger() +_MEDIA_TYPE_MAP = { + "png": "image/png", + "jpeg": "image/jpeg", + "gif": "image/gif", + "webp": "image/webp", +} + # Patterns that look like secrets/credentials in CLI arguments _SECRET_PATTERNS: List[re.Pattern[str]] = [ # API keys / tokens (sk-ant-..., sk-..., ghp_..., gho_..., github_pat_..., xoxb-...) @@ -1353,6 +1360,14 @@ async def agentic_photo( processed_image = await image_handler.process_image( photo, update.message.caption ) + fmt = processed_image.metadata.get("format", "png") + images = [ + { + "data": processed_image.base64_data, + "media_type": _MEDIA_TYPE_MAP.get(fmt, "image/png"), + } + ] + await self._handle_agentic_media_message( update=update, context=context, @@ -1360,6 +1375,7 @@ async def agentic_photo( progress_msg=progress_msg, user_id=user_id, chat=chat, + images=images, ) except Exception as e: @@ -1420,6 +1436,7 @@ async def _handle_agentic_media_message( progress_msg: Any, user_id: int, chat: Any, + images: Optional[List[Dict[str, str]]] = None, ) -> None: """Run a media-derived prompt through Claude and send responses.""" claude_integration = context.bot_data.get("claude_integration") @@ -1456,6 +1473,7 @@ async def _handle_agentic_media_message( session_id=session_id, on_stream=on_stream, force_new=force_new, + images=images, ) finally: heartbeat.cancel() diff --git a/src/claude/facade.py b/src/claude/facade.py index 5c7276eb..b1cafba4 100644 --- a/src/claude/facade.py +++ b/src/claude/facade.py @@ -39,6 +39,7 @@ async def run_command( on_stream: Optional[Callable[[StreamUpdate], None]] = None, force_new: bool = False, interrupt_event: Optional["asyncio.Event"] = None, + images: Optional[List[Dict[str, str]]] = None, ) -> ClaudeResponse: """Run Claude Code command with full integration.""" logger.info( @@ -88,6 +89,7 @@ async def run_command( continue_session=should_continue, stream_callback=on_stream, interrupt_event=interrupt_event, + images=images, ) except Exception as resume_error: # If resume failed (e.g., session expired/missing on Claude's side), @@ -113,6 +115,7 @@ async def run_command( continue_session=False, stream_callback=on_stream, interrupt_event=interrupt_event, + images=images, ) else: raise @@ -157,6 +160,7 @@ async def _execute( continue_session: bool = False, stream_callback: Optional[Callable] = None, interrupt_event: Optional[asyncio.Event] = None, + images: Optional[List[Dict[str, str]]] = None, ) -> ClaudeResponse: """Execute command via SDK.""" return await self.sdk_manager.execute_command( @@ -166,6 +170,7 @@ async def _execute( continue_session=continue_session, stream_callback=stream_callback, interrupt_event=interrupt_event, + images=images, ) async def _find_resumable_session( diff --git a/src/claude/sdk_integration.py b/src/claude/sdk_integration.py index ab9c4046..b594005e 100644 --- a/src/claude/sdk_integration.py +++ b/src/claude/sdk_integration.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Callable, Dict, List, Optional +from typing import Any, AsyncIterator, Callable, Dict, List, Optional import structlog from claude_agent_sdk import ( @@ -155,6 +155,7 @@ async def execute_command( continue_session: bool = False, stream_callback: Optional[Callable[[StreamUpdate], None]] = None, interrupt_event: Optional[asyncio.Event] = None, + images: Optional[List[Dict[str, str]]] = None, ) -> ClaudeResponse: """Execute Claude Code command via SDK.""" start_time = asyncio.get_event_loop().time() @@ -248,7 +249,37 @@ async def _run_client() -> None: client = ClaudeSDKClient(options) try: await client.connect() - await client.query(prompt) + + if images: + content_blocks: List[Dict[str, Any]] = [] + for img in images: + media_type = img.get("media_type", "image/png") + content_blocks.append( + { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": img["data"], + }, + } + ) + content_blocks.append({"type": "text", "text": prompt}) + + multimodal_msg = { + "type": "user", + "message": { + "role": "user", + "content": content_blocks, + }, + } + + async def _multimodal_prompt() -> AsyncIterator[Dict[str, Any]]: + yield multimodal_msg + + await client.query(_multimodal_prompt()) + else: + await client.query(prompt) async for raw_data in client._query.receive_messages(): try: