From 91d351158037d9a2a7c472c315ff72929208c9b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20Julius=20St=C3=B8rholt?= <43246021+hakonstoerholt@users.noreply.github.com>
Date: Mon, 1 Jun 2026 06:09:21 +0200
Subject: [PATCH] Recognize local vision models so their images aren't dropped (#185)

An image attachment only got through if the model name was on a short
built-in list. Anything else was treated as text-only and the image was
quietly dropped, so the model never saw it. That left out a lot of the
smaller vision models you can run locally (moondream was the one I hit).

Pulled the check into is_vision_model() in chat_helpers, broadened it to
cover those, and added a test. Models that already worked are unaffected.

Fixes #124.
---
NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which everything between "re.compile(r'(?" and "bool:" had been stripped.
The _VISION_VL_RE pattern and the is_vision_model() signature below are
reconstructed from the surrounding comments, the function body and the new
tests (the pattern is an equivalent form, not necessarily the upstream
spelling); the indentation of context lines is restored by convention.
Confirm both against the upstream commit before applying.

 src/chat_handler.py                  | 14 ++-----------
 src/chat_helpers.py                  | 30 ++++++++++++++++++++++++++++
 tests/test_vision_model_detection.py | 30 ++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_vision_model_detection.py

diff --git a/src/chat_handler.py b/src/chat_handler.py
index ccfcd4c..01daa52 100644
--- a/src/chat_handler.py
+++ b/src/chat_handler.py
@@ -15,7 +15,7 @@ from src.constants import (
     UPLOAD_DIR,
 )
 from core.models import ChatMessage
-from src.chat_helpers import extract_urls
+from src.chat_helpers import extract_urls, is_vision_model
 from src.document_processor import build_user_content, analyze_image_with_vl_result
 from src.youtube_handler import (
     is_youtube_url,
@@ -147,17 +147,7 @@ class ChatHandler:
         # Analyze images — skip if vision disabled, or if main model is vision-capable
         from src.settings import get_setting
         vision_enabled = get_setting("vision_enabled", True)
-        VISION_KEYWORDS = [
-            "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
-            "claude-sonnet", "claude-opus", "claude-haiku",
-            "gemini", "llava", "pixtral", "qwen2-vl", "qwen-vl", "qwen3-vl", "qwen3vl", "minicpm",
-        ]
-        main_model = (sess.model or "").lower()
-        main_is_vision = any(kw in main_model for kw in VISION_KEYWORDS)
-        # Also match models with "vl" in the name (e.g. Qwen3VL, InternVL, any *-VL-*)
-        if not main_is_vision:
-            import re
-            main_is_vision = bool(re.search(r'\dvl|vl\d|[-_]vl[-_.\d]|vl-', main_model))
+        main_is_vision = is_vision_model(sess.model or "")
 
         # Read uploads DB once and index by id (was read twice + linear-scanned per attachment)
         files_by_id: Dict[str, Dict] = {}
diff --git a/src/chat_helpers.py b/src/chat_helpers.py
index fa4aed9..d690796 100644
--- a/src/chat_helpers.py
+++ b/src/chat_helpers.py
@@ -23,6 +23,36 @@ def extract_urls(text: str) -> List[str]:
     return cleaned_urls
 
 
+# Model-name substrings that signal native image input. A missed match here
+# silently drops the image from the chat request (it gets swapped for a text
+# caption), so the model never sees it. Keep this broad, especially for local
+# models (Ollama/llama.cpp) that ship under many names. See issue #124.
+_VISION_MODEL_KEYWORDS = (
+    # hosted
+    "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
+    "claude-sonnet", "claude-opus", "claude-haiku", "gemini",
+    # open / local
+    "vision", "llava", "bakllava", "moondream", "pixtral", "minicpm",
+    "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl",
+)
+# Catches the "*-VL-*" / "*VL*" family not covered by a literal keyword above
+# (e.g. Qwen2.5-VL and various tags): a standalone "vl" token, plus "vlm".
+_VISION_VL_RE = re.compile(r'(?:^|[^a-z])vl(?![a-z])|vlm')
+
+
+def is_vision_model(model_name: str) -> bool:
+    """Best-effort check of whether a model can natively accept images.
+
+    Decides whether image attachments get passed through to the model or
+    swapped for a separate caption. Err toward True, since a false negative
+    drops the image entirely. See issue #124.
+    """
+    m = (model_name or "").lower()
+    if any(kw in m for kw in _VISION_MODEL_KEYWORDS):
+        return True
+    return bool(_VISION_VL_RE.search(m))
+
+
 def validate_message(message: str) -> str:
     """Validate message input."""
     if not message:
diff --git a/tests/test_vision_model_detection.py b/tests/test_vision_model_detection.py
new file mode 100644
index 0000000..b0efe68
--- /dev/null
+++ b/tests/test_vision_model_detection.py
@@ -0,0 +1,30 @@
+"""Tests for is_vision_model (issue #124).
+
+Local vision models served through Ollama/llama.cpp show up under many
+names. If one isn't recognized as vision-capable, the image attachment is
+stripped from the request before it reaches the model, so it silently never
+sees the picture.
+"""
+from src.chat_helpers import is_vision_model
+
+
+def test_recognizes_local_and_hosted_vision_models():
+    for name in [
+        # the ones #124 missed
+        "moondream", "moondream:latest",
+        "llama3.2-vision:11b", "granite3.2-vision",
+        "qwen2.5-vl:7b", "qwen2.5vl", "internvl2.5", "cogvlm",
+        # already worked, keep them working
+        "llava", "llava:7b", "bakllava", "minicpm-v",
+        "gpt-4o", "claude-sonnet-4", "gemini-2.0-flash", "pixtral-12b",
+    ]:
+        assert is_vision_model(name), f"{name!r} should be detected as vision-capable"
+
+
+def test_text_only_models_not_flagged():
+    for name in ["qwen2.5:3b", "mistral", "llama3.1:8b", "deepseek-r1", "phi3", "vicuna", ""]:
+        assert not is_vision_model(name), f"{name!r} should not be flagged as vision"
+
+
+def test_none_is_safe():
+    assert is_vision_model(None) is False