From 583df3dd6ad850440f29d187368c5ed1483ad1c4 Mon Sep 17 00:00:00 2001
From: lekt8 <lewistham9x@gmail.com>
Date: Wed, 3 Jun 2026 03:17:40 +0800
Subject: [PATCH] Recognize gemma3/llama4/mistral-small3.1+/multimodal as
 vision models (#1430)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

is_vision_model() classified several genuinely multimodal families as text-only
because their names contain neither "vision" nor "vl": Gemma 3 (4b+), Llama 4,
Mistral Small 3.1/3.2, and *-multimodal models (e.g. phi-4-multimodal). For those
models the attached image was stripped before the request was sent, so the model
never saw it. This was reported as "can't read the image" (issue #1274) and is
common with Ollama tags like gemma3:4b.

Add keywords for those families, plus a generic "multimodal". Per the file's
err-toward-True policy (#124), treating a rare text-only tag (e.g. gemma3:1b) as
vision is a safer failure than dropping a real image. Guard tests confirm that
the text-only siblings (gemma2, plain gemma, llama3.3, mistral-small, phi-3) are
not over-matched.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/chat_helpers.py                  | 10 +++++++++-
 tests/test_vision_model_detection.py | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/chat_helpers.py b/src/chat_helpers.py
index 6bb0611..52c2246 100644
--- a/src/chat_helpers.py
+++ b/src/chat_helpers.py
@@ -36,8 +36,16 @@ _VISION_MODEL_KEYWORDS = (
     "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
     "claude-sonnet", "claude-opus", "claude-haiku", "gemini",
     # open / local
-    "vision", "llava", "bakllava", "moondream", "pixtral", "minicpm",
+    "vision", "multimodal", "llava", "bakllava", "moondream", "pixtral", "minicpm",
     "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl",
+    # multimodal families whose names don't contain "vision"/"vl" but DO accept
+    # images — without these the image is silently dropped for common Ollama tags
+    # like gemma3:4b (issue #1274). Gemma 3 (4b+), Llama 4 (all), and Mistral
+    # Small 3.1/3.2 are vision-capable; per the err-toward-True policy (#124) a
+    # rare text-only tag (e.g. gemma3:1b) being treated as vision is the safer
+    # failure than dropping a real image.
+    "gemma-3", "gemma3", "llama-4", "llama4",
+    "mistral-small-3.1", "mistral-small3.1", "mistral-small-3.2", "mistral-small3.2",
     # zhipu / glm (glm-4.5v, glm-4.6v, glm-5v-turbo, etc.)
     "glm-4.5v", "glm-4.6v", "glm-5v",
 )
diff --git a/tests/test_vision_model_detection.py b/tests/test_vision_model_detection.py
index b0efe68..cbc1f4e 100644
--- a/tests/test_vision_model_detection.py
+++ b/tests/test_vision_model_detection.py
@@ -28,3 +28,21 @@ def test_text_only_models_not_flagged():
 
 def test_none_is_safe():
     assert is_vision_model(None) is False
+
+
+def test_recognizes_multimodal_families_without_vision_in_name():
+    # Regression for issue #1274: these families accept images but have no
+    # "vision"/"vl" in their names, so the attached image used to be stripped.
+    for name in [
+        "gemma3:4b", "gemma3", "gemma-3-27b-it",
+        "llama4:scout", "llama4", "llama-4-maverick",
+        "mistral-small3.1", "mistral-small-3.1", "mistral-small3.2", "mistral-small-3.2",
+        "phi-4-multimodal", "phi4-multimodal",
+    ]:
+        assert is_vision_model(name), f"{name!r} should be detected as vision-capable"
+
+
+def test_new_keywords_do_not_overmatch_text_models():
+    # Guard: text-only siblings of the newly added families must stay unflagged.
+    for name in ("gemma2:9b", "gemma:7b", "llama3.3", "mistral-small", "phi-3-mini"):
+        assert not is_vision_model(name), f"{name!r} should not be flagged as vision"