From 4933ce1438a36ecd6cbe1bd2d86fc3190e1a3c70 Mon Sep 17 00:00:00 2001
From: veyal
Date: Mon, 22 Jun 2026 20:40:25 +0700
Subject: [PATCH] fix: honour vision=false by stripping image_url blocks before API call

The vision flag on ModelConfig was defined but never consulted when
building the message payload sent to LiteLLM. Any conversation with
image attachments forwarded image_url content to the model even when
vision was disabled, causing OpenRouter (and other providers) to return:

  {"error":{"message":"No endpoints found that support image input"}}

- Add strip_images() to helpers/images.py that removes image_url blocks
  from a content list and collapses a single leftover text block back
  to a plain string.
- In LiteLLMChatWrapper._convert_messages, read vision from
  a0_model_conf and call strip_images() when vision is False.
---
 helpers/images.py                    | 13 ++++++++
 models.py                            |  6 +++-
 tests/test_vision_load_image_refs.py | 45 ++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/helpers/images.py b/helpers/images.py
index 4e9a5e9172..021c442e8a 100644
--- a/helpers/images.py
+++ b/helpers/images.py
@@ -29,6 +29,19 @@ def prepare_content(content: Any) -> Any:
     return {key: prepare_content(value) for key, value in content.items()}
 
 
+def strip_images(content: Any) -> Any:
+    """Remove all image_url blocks from message content when vision is disabled."""
+    if isinstance(content, list):
+        filtered = [strip_images(item) for item in content if not (isinstance(item, dict) and item.get("type") == "image_url")]
+        if not filtered:
+            return ""  # image-only message; return empty string rather than []
+        # Collapse single-text-block list back to a plain string
+        if len(filtered) == 1 and isinstance(filtered[0], dict) and filtered[0].get("type") == "text":
+            return filtered[0].get("text", "")
+        return filtered
+    return content
+
+
 def is_local_ref(url: str) -> bool:
     if not url:
         return False
diff --git a/models.py b/models.py
index 8518f0d1e3..9a3af5f6d1 100644
--- a/models.py
+++ b/models.py
@@ -327,9 +327,13 @@ def _convert_messages(self, messages: List[BaseMessage], explicit_caching: bool
             "system": "system",
             "tool": "tool",
         }
+        vision_enabled = self.a0_model_conf.vision if self.a0_model_conf else True
         for m in messages:
             role = role_mapping.get(m.type, m.type)
-            message_dict = {"role": role, "content": images.prepare_content(m.content)}
+            content = images.prepare_content(m.content)
+            if not vision_enabled:
+                content = images.strip_images(content)
+            message_dict = {"role": role, "content": content}
 
             # Handle tool calls for AI messages
             tool_calls = getattr(m, "tool_calls", None)
diff --git a/tests/test_vision_load_image_refs.py b/tests/test_vision_load_image_refs.py
index 2842a5e1e2..65e9467af8 100644
--- a/tests/test_vision_load_image_refs.py
+++ b/tests/test_vision_load_image_refs.py
@@ -52,6 +52,51 @@ def __init__(self, raw_content, preview):
     monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
 
 
+def test_strip_images_passthrough_string():
+    assert images.strip_images("hello") == "hello"
+    assert images.strip_images("") == ""
+
+
+def test_strip_images_image_only_returns_empty_string():
+    content = [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]
+    assert images.strip_images(content) == ""
+
+
+def test_strip_images_multi_image_only_returns_empty_string():
+    content = [
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,xyz"}},
+    ]
+    assert images.strip_images(content) == ""
+
+
+def test_strip_images_text_and_image_collapses_to_string():
+    content = [
+        {"type": "text", "text": "describe this"},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
+    ]
+    assert images.strip_images(content) == "describe this"
+
+
+def test_strip_images_multiple_text_blocks_preserved():
+    content = [
+        {"type": "text", "text": "first"},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
+        {"type": "text", "text": "second"},
+    ]
+    result = images.strip_images(content)
+    assert result == [{"type": "text", "text": "first"}, {"type": "text", "text": "second"}]
+
+
+def test_strip_images_no_images_unchanged():
+    content = [{"type": "text", "text": "plain text"}]
+    assert images.strip_images(content) == "plain text"
+
+
+def test_strip_images_plain_string_content_unchanged():
+    assert images.strip_images("no images here") == "no images here"
+
+
 def test_prepare_content_keeps_missing_local_image_refs_strict():
     missing_path = "/tmp/a0-missing-desktop-screenshot.png"
 