From 578f56ab92c1eeffbbc5c6c32ee06ad9b7c24169 Mon Sep 17 00:00:00 2001 From: Lucas Daniel <94806303+NoodleLDS@users.noreply.github.com> Date: Wed, 3 Jun 2026 01:36:50 -0300 Subject: [PATCH] fix(vision): recognize Gemma 4 and Phi-4 as vision-capable models (#1704) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemma 4 and Phi-4 multimodal are natively vision-capable but their Ollama tags ("gemma4:12b", "phi-4", "phi4") did not match any keyword in _VISION_MODEL_KEYWORDS. The image was silently routed to the VL fallback path instead of being passed directly to the model — users saw the model respond to a placeholder like "[VL model unavailable - image not analyzed]" rather than the actual image. Adds "gemma-4"/"gemma4" and "phi-4"/"phi4" to the keyword list, following the existing err-toward-True policy (#124): a text-only variant being treated as vision is the safer failure than dropping a real image. Fixes #1274 (partial — covers the Gemma 4 + Phi-4 case; the OpenRouter/free vision fallback path is a separate issue). --- src/chat_helpers.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/chat_helpers.py b/src/chat_helpers.py index 52c2246..1c8d1c9 100644 --- a/src/chat_helpers.py +++ b/src/chat_helpers.py @@ -40,12 +40,16 @@ _VISION_MODEL_KEYWORDS = ( "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl", # multimodal families whose names don't contain "vision"/"vl" but DO accept # images — without these the image is silently dropped for common Ollama tags - # like gemma3:4b (issue #1274). Gemma 3 (4b+), Llama 4 (all), and Mistral - # Small 3.1/3.2 are vision-capable; per the err-toward-True policy (#124) a - # rare text-only tag (e.g. gemma3:1b) being treated as vision is the safer - # failure than dropping a real image. - "gemma-3", "gemma3", "llama-4", "llama4", + # like gemma3:4b or gemma4:12b (issue #1274). Gemma 3/4 (4b+), Llama 4 (all), + # Mistral Small 3.1/3.2, and Phi-4 multimodal are vision-capable; per the + # err-toward-True policy (#124) a rare text-only tag being treated as vision is + # the safer failure than silently dropping a real image. + "gemma-3", "gemma3", "gemma-4", "gemma4", + "llama-4", "llama4", "mistral-small-3.1", "mistral-small3.1", "mistral-small-3.2", "mistral-small3.2", + # Microsoft Phi-4 ships a dedicated multimodal variant ("phi-4-multimodal-instruct") + # but users often load it under the bare "phi-4" or "phi4" Ollama tag. + "phi-4", "phi4", # zhipu / glm (glm-4.5v, glm-4.6v, glm-5v-turbo, etc.) "glm-4.5v", "glm-4.6v", "glm-5v", )