diff --git a/src/chat_helpers.py b/src/chat_helpers.py
index 6bb0611..52c2246 100644
--- a/src/chat_helpers.py
+++ b/src/chat_helpers.py
@@ -36,8 +36,16 @@ _VISION_MODEL_KEYWORDS = (
     "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
     "claude-sonnet", "claude-opus", "claude-haiku", "gemini",
     # open / local
-    "vision", "llava", "bakllava", "moondream", "pixtral", "minicpm",
+    "vision", "multimodal", "llava", "bakllava", "moondream", "pixtral", "minicpm",
     "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl",
+    # multimodal families whose names don't contain "vision"/"vl" but DO accept
+    # images — without these the image is silently dropped for common Ollama tags
+    # like gemma3:4b (issue #1274). Gemma 3 (4b+), Llama 4 (all), and Mistral
+    # Small 3.1/3.2 are vision-capable; per the err-toward-True policy (#124) a
+    # rare text-only tag (e.g. gemma3:1b) being treated as vision is the safer
+    # failure than dropping a real image.
+    "gemma-3", "gemma3", "llama-4", "llama4",
+    "mistral-small-3.1", "mistral-small3.1", "mistral-small-3.2", "mistral-small3.2",
     # zhipu / glm (glm-4.5v, glm-4.6v, glm-5v-turbo, etc.)
     "glm-4.5v", "glm-4.6v", "glm-5v",
 )
diff --git a/tests/test_vision_model_detection.py b/tests/test_vision_model_detection.py
index b0efe68..cbc1f4e 100644
--- a/tests/test_vision_model_detection.py
+++ b/tests/test_vision_model_detection.py
@@ -28,3 +28,21 @@ def test_text_only_models_not_flagged():
 
 def test_none_is_safe():
     assert is_vision_model(None) is False
+
+
+def test_recognizes_multimodal_families_without_vision_in_name():
+    # issue #1274: these are vision-capable but their names don't contain
+    # "vision"/"vl", so they were dropped and the model never saw the image.
+    for name in [
+        "gemma3:4b", "gemma3", "gemma-3-27b-it",
+        "llama4:scout", "llama4", "llama-4-maverick",
+        "mistral-small3.1", "mistral-small-3.2",
+        "phi-4-multimodal", "phi4-multimodal",
+    ]:
+        assert is_vision_model(name), f"{name!r} should be detected as vision-capable"
+
+
+def test_new_keywords_do_not_overmatch_text_models():
+    # The added families must not flag their text-only siblings.
+    for name in ["gemma2:9b", "gemma:7b", "llama3.3", "mistral-small", "phi-3-mini"]:
+        assert not is_vision_model(name), f"{name!r} should not be flagged as vision"