From 4933ce1438a36ecd6cbe1bd2d86fc3190e1a3c70 Mon Sep 17 00:00:00 2001
From: veyal <andrew.salim@binus.ac.id>
Date: Mon, 22 Jun 2026 20:40:25 +0700
Subject: [PATCH] fix: honour vision=false by stripping image_url blocks before
 API call

The vision flag on ModelConfig was defined but never consulted when
building the message payload sent to LiteLLM. Any conversation with
image attachments forwarded image_url content to the model even when
vision was disabled, causing OpenRouter (and other providers) to return:
  {"error":{"message":"No endpoints found that support image input"}}

- Add strip_images() to helpers/images.py that removes image_url blocks
  from a content list, collapses a single leftover text block back to
  a plain string, and returns "" when nothing but images was present.
- In LiteLLMChatWrapper._convert_messages, read vision from
  a0_model_conf and call strip_images() when vision is False.
---
 helpers/images.py                    | 13 ++++++++
 models.py                            |  6 +++-
 tests/test_vision_load_image_refs.py | 45 ++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/helpers/images.py b/helpers/images.py
index 4e9a5e9172..021c442e8a 100644
--- a/helpers/images.py
+++ b/helpers/images.py
@@ -29,6 +29,19 @@ def prepare_content(content: Any) -> Any:
     return {key: prepare_content(value) for key, value in content.items()}
 
 
+def strip_images(content: Any) -> Any:
+    """Drop image_url blocks from message content for models with vision disabled."""
+    if not isinstance(content, list):
+        return content  # strings and any other non-list payload pass through untouched
+    def is_image(block: Any) -> bool:
+        return isinstance(block, dict) and block.get("type") == "image_url"
+    kept = [strip_images(block) for block in content if not is_image(block)]
+    if len(kept) == 1 and isinstance(kept[0], dict) and kept[0].get("type") == "text":
+        return kept[0].get("text", "")  # a lone text block is unwrapped to its string
+    # Image-only (or empty) input yields "" rather than []; otherwise keep the block list.
+    return kept or ""
+
+
 def is_local_ref(url: str) -> bool:
     if not url:
         return False
diff --git a/models.py b/models.py
index 8518f0d1e3..9a3af5f6d1 100644
--- a/models.py
+++ b/models.py
@@ -327,9 +327,13 @@ def _convert_messages(self, messages: List[BaseMessage], explicit_caching: bool
             "system": "system",
             "tool": "tool",
         }
+        vision_enabled = self.a0_model_conf.vision if self.a0_model_conf else True
         for m in messages:
             role = role_mapping.get(m.type, m.type)
-            message_dict = {"role": role, "content": images.prepare_content(m.content)}
+            content = images.prepare_content(m.content)
+            if not vision_enabled:
+                content = images.strip_images(content)
+            message_dict = {"role": role, "content": content}
 
             # Handle tool calls for AI messages
             tool_calls = getattr(m, "tool_calls", None)
diff --git a/tests/test_vision_load_image_refs.py b/tests/test_vision_load_image_refs.py
index 2842a5e1e2..65e9467af8 100644
--- a/tests/test_vision_load_image_refs.py
+++ b/tests/test_vision_load_image_refs.py
@@ -52,6 +52,51 @@ def __init__(self, raw_content, preview):
     monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
 
 
+def test_strip_images_passthrough_string():
+    for plain in ("hello", ""):  # non-list content must come back untouched
+        assert images.strip_images(plain) == plain
+
+
+def test_strip_images_image_only_returns_empty_string():
+    lone_image = {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}
+    assert images.strip_images([lone_image]) == ""  # an empty string, never an empty list
+
+
+def test_strip_images_multi_image_only_returns_empty_string():
+    urls = ("data:image/png;base64,abc", "data:image/png;base64,xyz")
+    # Several images and no text: every block is dropped, leaving "".
+    content = [{"type": "image_url", "image_url": {"url": url}} for url in urls]
+    stripped = images.strip_images(content)
+    assert stripped == ""
+
+
+def test_strip_images_text_and_image_collapses_to_string():
+    prompt = {"type": "text", "text": "describe this"}
+    picture = {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}
+    # Once the image is removed, the lone text block is unwrapped to its string.
+    stripped = images.strip_images([prompt, picture])
+    assert stripped == "describe this"
+
+
+def test_strip_images_multiple_text_blocks_preserved():
+    first = {"type": "text", "text": "first"}
+    second = {"type": "text", "text": "second"}
+    picture = {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}
+    # Two text blocks survive, so the result stays a list in the original order.
+    stripped = images.strip_images([first, picture, second])
+    expected = [{"type": "text", "text": "first"}, {"type": "text", "text": "second"}]
+    assert stripped == expected
+
+
+def test_strip_images_no_images_unchanged():
+    content = [{"type": "text", "text": "plain text"}]
+    assert images.strip_images(content) == "plain text"  # text is kept, but the 1-block list collapses to str
+
+
+def test_strip_images_plain_string_content_unchanged():
+    assert images.strip_images("no images here") == "no images here"  # non-list input is returned as-is
+
+
 def test_prepare_content_keeps_missing_local_image_refs_strict():
     missing_path = "/tmp/a0-missing-desktop-screenshot.png"