Files
odysseus/tests/test_vision_model_detection.py
lekt8 583df3dd6a Recognize gemma3/llama4/mistral-small3.1+/multimodal as vision models (#1430)
is_vision_model() classified several genuinely multimodal families as text-only
because their names contain neither "vision" nor "vl": Gemma 3 (4b+), Llama 4,
Mistral Small 3.1/3.2, and *-multimodal models (e.g. phi-4-multimodal). For those
the attached image was stripped before the request, so the model never saw it —
a "can't read the image" report (issue #1274), common with Ollama tags like
gemma3:4b.

Add those keywords (plus a generic "multimodal"). Per the file's err-toward-True
policy (#124), a rare text-only tag treated as vision is the safer failure than
dropping a real image. Guard tests confirm the text-only siblings (gemma2, plain
gemma, mistral-small, phi-3) are not over-matched.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 04:17:40 +09:00

49 lines
1.9 KiB
Python

"""Tests for is_vision_model (issue #124).
Local vision models served through Ollama/llama.cpp show up under many
names. If one isn't recognized as vision-capable, the image attachment is
stripped from the request before it reaches the model, so the model silently
never sees the picture.
"""
from src.chat_helpers import is_vision_model
def test_recognizes_local_and_hosted_vision_models():
    """Known vision-capable model names must all be classified as vision."""
    # Names that issue #124 reported as not being recognized.
    missed_by_124 = (
        "moondream", "moondream:latest",
        "llama3.2-vision:11b", "granite3.2-vision",
        "qwen2.5-vl:7b", "qwen2.5vl", "internvl2.5", "cogvlm",
    )
    # Names that were already recognized; they must keep working.
    already_recognized = (
        "llava", "llava:7b", "bakllava", "minicpm-v",
        "gpt-4o", "claude-sonnet-4", "gemini-2.0-flash", "pixtral-12b",
    )
    for name in missed_by_124 + already_recognized:
        detected = is_vision_model(name)
        assert detected, f"{name!r} should be detected as vision-capable"
def test_text_only_models_not_flagged():
    """Text-only model names, including the empty string, are not vision."""
    text_only_names = (
        "qwen2.5:3b",
        "mistral",
        "llama3.1:8b",
        "deepseek-r1",
        "phi3",
        "vicuna",
        "",
    )
    for name in text_only_names:
        flagged = is_vision_model(name)
        assert not flagged, f"{name!r} should not be flagged as vision"
def test_none_is_safe():
    """Passing None must yield exactly False."""
    outcome = is_vision_model(None)
    assert outcome is False
def test_recognizes_multimodal_families_without_vision_in_name():
    """Multimodal families whose names lack "vision"/"vl" are still detected.

    Issue #1274: these models are vision-capable, but because their names
    contain neither "vision" nor "vl" they were dropped and the model never
    saw the image.
    """
    gemma3_names = ("gemma3:4b", "gemma3", "gemma-3-27b-it")
    llama4_names = ("llama4:scout", "llama4", "llama-4-maverick")
    mistral_small_names = ("mistral-small3.1", "mistral-small-3.2")
    multimodal_names = ("phi-4-multimodal", "phi4-multimodal")

    candidates = (
        gemma3_names + llama4_names + mistral_small_names + multimodal_names
    )
    for name in candidates:
        detected = is_vision_model(name)
        assert detected, f"{name!r} should be detected as vision-capable"
def test_new_keywords_do_not_overmatch_text_models():
    """Text-only siblings of the added families must not be flagged."""
    text_only_siblings = (
        "gemma2:9b",
        "gemma:7b",
        "llama3.3",
        "mistral-small",
        "phi-3-mini",
    )
    for name in text_only_siblings:
        flagged = is_vision_model(name)
        assert not flagged, f"{name!r} should not be flagged as vision"