From 578f56ab92c1eeffbbc5c6c32ee06ad9b7c24169 Mon Sep 17 00:00:00 2001
From: Lucas Daniel <94806303+NoodleLDS@users.noreply.github.com>
Date: Wed, 3 Jun 2026 01:36:50 -0300
Subject: [PATCH] fix(vision): recognize Gemma 4 and Phi-4 as vision-capable
 models (#1704)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemma 4 and Phi-4 multimodal are natively vision-capable but their Ollama
tags ("gemma4:12b", "phi-4", "phi4") did not match any keyword in
_VISION_MODEL_KEYWORDS. The image was silently routed to the VL fallback
path instead of being passed directly to the model — users saw the model
respond to a placeholder like "[VL model unavailable - image not analyzed]"
rather than the actual image.

Adds "gemma-4"/"gemma4" and "phi-4"/"phi4" to the keyword list, following
the existing err-toward-True policy (#124): a text-only variant being
treated as vision is the safer failure than dropping a real image.

Partially addresses #1274: covers the Gemma 4 + Phi-4 case only; the
OpenRouter/free vision fallback path is a separate issue.
---
 src/chat_helpers.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/chat_helpers.py b/src/chat_helpers.py
index 52c2246..1c8d1c9 100644
--- a/src/chat_helpers.py
+++ b/src/chat_helpers.py
@@ -40,12 +40,16 @@ _VISION_MODEL_KEYWORDS = (
     "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl",
     # multimodal families whose names don't contain "vision"/"vl" but DO accept
     # images — without these the image is silently dropped for common Ollama tags
-    # like gemma3:4b (issue #1274). Gemma 3 (4b+), Llama 4 (all), and Mistral
-    # Small 3.1/3.2 are vision-capable; per the err-toward-True policy (#124) a
-    # rare text-only tag (e.g. gemma3:1b) being treated as vision is the safer
-    # failure than dropping a real image.
-    "gemma-3", "gemma3", "llama-4", "llama4",
+    # like gemma3:4b or gemma4:12b (issue #1274). Gemma 3/4 (4b+), Llama 4 (all),
+    # Mistral Small 3.1/3.2, and Phi-4 multimodal are vision-capable; per the
+    # err-toward-True policy (#124) a rare text-only tag being treated as vision is
+    # the safer failure than silently dropping a real image.
+    "gemma-3", "gemma3", "gemma-4", "gemma4",
+    "llama-4", "llama4",
     "mistral-small-3.1", "mistral-small3.1", "mistral-small-3.2", "mistral-small3.2",
+    # Microsoft Phi-4 ships a dedicated multimodal variant ("phi-4-multimodal-instruct")
+    # but users often load it under the bare "phi-4" or "phi4" Ollama tag.
+    "phi-4", "phi4",
     # zhipu / glm (glm-4.5v, glm-4.6v, glm-5v-turbo, etc.)
     "glm-4.5v", "glm-4.6v", "glm-5v",
 )