Files
odysseus/src/chat_helpers.py
Lucas Daniel 578f56ab92 fix(vision): recognize Gemma 4 and Phi-4 as vision-capable models (#1704)
Gemma 4 and Phi-4 multimodal are natively vision-capable but their Ollama
tags ("gemma4:12b", "phi-4", "phi4") did not match any keyword in
_VISION_MODEL_KEYWORDS. The image was silently routed to the VL fallback
path instead of being passed directly to the model — users saw the model
respond to a placeholder like "[VL model unavailable - image not analyzed]"
rather than the actual image.

Adds "gemma-4"/"gemma4" and "phi-4"/"phi4" to the keyword list, following
the existing err-toward-True policy (#124): a text-only variant being
treated as vision is the safer failure than dropping a real image.

Fixes #1274 (partial — covers the Gemma 4 + Phi-4 case; the OpenRouter/free
vision fallback path is a separate issue).
2026-06-03 13:36:50 +09:00

307 lines
11 KiB
Python

# src/chat_helpers.py
"""URL extraction, message/upload validation, request parsing."""
import re
import os
import json
import time
import ipaddress
import logging
import httpx
from urllib.parse import urlparse
from fastapi import HTTPException
from fastapi import UploadFile
from typing import List, Optional
logger = logging.getLogger(__name__)
def extract_urls(text: str) -> List[str]:
    """Return every http(s) URL found in ``text``, in order of appearance.

    A URL starts at ``http://`` or ``https://`` and runs until the first
    whitespace character or one of the quote/bracket delimiters excluded by
    the pattern. Any run of trailing sentence punctuation
    (``. , ; : ! ? )``) is then removed, so a link that ends a sentence or
    sits inside parentheses comes back clean.
    """
    candidates = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', text)
    # Drop the punctuation that belongs to the surrounding prose, not the URL.
    return [re.sub(r'[.,;:!?\)]+$', '', candidate) for candidate in candidates]
# Substrings of a lower-cased model name that indicate native image input.
# When nothing matches, the image is swapped for a text caption and silently
# dropped from the chat request, so the model never sees it. The list is kept
# deliberately broad, especially for local models (Ollama/llama.cpp) that ship
# under many names. See issue #124.
_VISION_MODEL_KEYWORDS = (
    # hosted
    "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
    "claude-sonnet", "claude-opus", "claude-haiku", "gemini",
    # open / local
    "vision", "multimodal", "llava", "bakllava", "moondream", "pixtral", "minicpm",
    "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl",
    # Multimodal families whose names contain neither "vision" nor "vl" but
    # which DO accept images. Without these entries the image is silently
    # dropped for common Ollama tags such as gemma3:4b or gemma4:12b
    # (issue #1274). Gemma 3/4 (4b+), Llama 4 (all), Mistral Small 3.1/3.2 and
    # Phi-4 multimodal are vision-capable; under the err-toward-True policy
    # (#124), treating a rare text-only tag as vision is the safer failure
    # than silently dropping a real image.
    "gemma-3", "gemma3", "gemma-4", "gemma4",
    "llama-4", "llama4",
    "mistral-small-3.1", "mistral-small3.1", "mistral-small-3.2", "mistral-small3.2",
    # Microsoft Phi-4 has a dedicated multimodal variant
    # ("phi-4-multimodal-instruct"), but users often load it under the bare
    # "phi-4" or "phi4" Ollama tag.
    "phi-4", "phi4",
    # zhipu / glm (glm-4.5v, glm-4.6v, glm-5v-turbo, etc.)
    "glm-4.5v", "glm-4.6v", "glm-5v",
)

# Fallback for the "*-VL-*" / "*VL*" family that no literal keyword covers
# (e.g. Qwen2.5-VL and assorted tags): matches "vl" when it is not flanked by
# other lowercase letters, or "vlm" anywhere in the name.
_VISION_VL_RE = re.compile(r'(?<![a-z])vl(?![a-z])|vlm')


def is_vision_model(model_name: str) -> bool:
    """Guess from the model's name whether it natively accepts images.

    The answer decides whether image attachments are passed through to the
    model or replaced by a separate caption. A false negative drops the image
    entirely, so the check leans toward True. See issue #124.

    Matching is case-insensitive; a missing or empty name yields False.
    """
    lowered = (model_name or "").lower()
    for keyword in _VISION_MODEL_KEYWORDS:
        if keyword in lowered:
            return True
    # No literal keyword hit: fall back to the standalone "vl" / "vlm" pattern.
    return _VISION_VL_RE.search(lowered) is not None
# Lifetime of a cached probe result, in seconds: it is added to time.time() to
# produce the expiry stored alongside each cache entry.
_PROVIDER_FINGERPRINT_TTL = 60.0
# (host, port) -> (models_list | None, expiry); list = LM Studio, None = not LM Studio.
_lmstudio_models_cache: dict = {}
def _is_local_host(host: Optional[str]) -> bool:
    """Report whether ``host`` is a loopback, LAN or Tailscale machine.

    Public domains are never considered local. A missing or empty host is
    not local either.
    """
    name = (host or "").lower()
    if not name:
        return False
    if name.endswith(".local") or name in ("localhost", "host.docker.internal"):
        return True
    try:
        address = ipaddress.ip_address(name)
    except ValueError:
        # Not an IP literal: a dot-less (single-label) name is treated as
        # local, anything containing a dot as a public domain.
        return "." not in name
    if address.is_loopback or address.is_private or address.is_link_local:
        return True
    # 100.64.0.0/10 is not flagged by the checks above, so test it explicitly.
    return address in ipaddress.ip_network("100.64.0.0/10")
def _probe_lmstudio_models(url: str) -> Optional[list]:
    """Return LM Studio's native ``/api/v1/models`` list for the server at ``url``.

    Only the scheme, host and port of ``url`` are used; the probe always
    targets ``/api/v1/models`` on that authority.

    Returns:
        The list found under the response's ``"models"`` key when it looks
        like LM Studio's (a non-empty list whose first entry is a dict with
        ``"key"`` and ``"architecture"``). None when the endpoint is not
        LM Studio, is unreachable, or ``url`` cannot be parsed.

    Caching:
        A response that was received is cached per ``(host, port)`` for
        ``_PROVIDER_FINGERPRINT_TTL`` seconds, including the "not LM Studio"
        verdict. A failed request (transient error) is not cached, so the
        next call probes again.
    """
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
        port = parsed.port
    except ValueError:
        # Invalid IPv6 literal, or a malformed / out-of-range port: there is
        # nothing sensible to probe, and callers expect None, not a crash.
        return None
    key = (host, port)
    now = time.time()
    cached = _lmstudio_models_cache.get(key)
    if cached is not None and cached[1] > now:
        return cached[0]
    # urlparse strips the brackets from an IPv6 literal; they must be put
    # back or the rebuilt URL ("http://::1:1234/...") is malformed.
    netloc_host = f"[{host}]" if ":" in host else host
    authority = netloc_host if port is None else f"{netloc_host}:{port}"
    probe_url = f"{parsed.scheme or 'http'}://{authority}/api/v1/models"
    try:
        r = httpx.get(probe_url, timeout=1.0)
    except Exception:
        # Transient failure: report "unknown" without poisoning the cache.
        return None
    try:
        data = r.json() if r.is_success else {}
    except Exception:
        data = {}
    # A JSON body that is not an object (e.g. a bare list) is simply not
    # LM Studio's response shape.
    models = data.get("models") if isinstance(data, dict) else None
    valid = (
        isinstance(models, list) and bool(models)
        and isinstance(models[0], dict)
        and "key" in models[0] and "architecture" in models[0]
    )
    models = models if valid else None
    _lmstudio_models_cache[key] = (models, now + _PROVIDER_FINGERPRINT_TTL)
    return models
def lmstudio_supports_vision(url: str, model: str) -> Optional[bool]:
    """Read ``model``'s ``capabilities.vision`` flag from LM Studio.

    Args:
        url: Endpoint URL of the provider; only local/LAN hosts are probed.
        model: Model name, compared case-insensitively (after stripping
            whitespace) against each entry's ``key`` and ``display_name``.

    Returns:
        True/False when LM Studio reports the flag for the matching model.
        None when the name is blank, the URL is invalid or not local, the
        endpoint isn't LM Studio, the model isn't listed, or the matching
        entry doesn't report ``vision`` - so callers can fall back.
    """
    want = (model or "").strip().lower()
    if not want:
        # Checked AFTER stripping: a whitespace-only name would otherwise
        # normalize to "" and match any entry whose key or display_name is
        # missing or empty.
        return None
    try:
        host = urlparse(url).hostname
    except ValueError:
        # e.g. an unbalanced IPv6 bracket; treat as "cannot tell".
        return None
    # Never probe a remote provider; LM Studio is always a local/LAN host.
    if not _is_local_host(host):
        return None
    models = _probe_lmstudio_models(url)
    if not models:
        return None
    for entry in models:
        if not isinstance(entry, dict):
            continue
        names = {
            str(entry.get("key", "")).lower(),
            str(entry.get("display_name", "")).lower(),
        }
        if want not in names:
            continue
        caps = entry.get("capabilities")
        if isinstance(caps, dict) and "vision" in caps:
            return bool(caps.get("vision"))
        # The model was found but exposes no vision flag: stop searching.
        return None
    return None
def model_supports_vision(model_name: str, endpoint_url: str = "") -> bool:
    """Decide whether ``model_name`` accepts image input.

    When ``endpoint_url`` is given, the capability reported by the endpoint
    itself (LM Studio) takes precedence. If no endpoint is given, the lookup
    fails, or the endpoint reports nothing, the decision falls back to
    name-based detection.
    """
    reported = None
    if endpoint_url:
        try:
            reported = lmstudio_supports_vision(endpoint_url, model_name or "")
        except Exception:
            # Best-effort lookup: any failure just means "no answer".
            reported = None
    if reported is None:
        return is_vision_model(model_name)
    return reported
def validate_message(message: str) -> str:
    """Return ``message`` with surrounding whitespace removed, or reject it.

    Raises:
        HTTPException: 400 when the message is missing or empty, blank once
            stripped, or longer than 50000 characters after stripping.
    """
    if not message:
        raise HTTPException(status_code=400, detail="Message is required")

    stripped = message.strip()
    if not stripped:
        raise HTTPException(status_code=400, detail="Message cannot be empty")
    if len(stripped) > 50000:
        raise HTTPException(status_code=400, detail="Message exceeds maximum length")
    return stripped
def validate_file_upload(file: UploadFile) -> UploadFile:
    """Check an uploaded file and hand it back unchanged when acceptable.

    The upload must have a filename, be non-empty, be at most 10 MB
    (10 * 1024 * 1024 bytes) and carry one of the allowed extensions
    (compared case-insensitively). The underlying stream is rewound to the
    start after its size has been measured.

    Raises:
        HTTPException: 400 with error code INVALID_FILE, EMPTY_FILE,
            FILE_TOO_LARGE or UNSUPPORTED_FILE_TYPE; 500 with
            FILE_READ_ERROR when the size cannot be read.
    """
    if not (file and file.filename):
        raise HTTPException(
            status_code=400,
            detail={
                "error": "INVALID_FILE",
                "message": "No file uploaded or invalid filename"
            }
        )

    # Measure the size by seeking to the end, then rewind for later readers.
    try:
        stream = file.file
        stream.seek(0, 2)
        size_in_bytes = stream.tell()
        stream.seek(0)
    except IOError as e:
        logger.error(f"Error reading file size for {file.filename}: {e}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "FILE_READ_ERROR",
                "message": "Error reading uploaded file"
            }
        )

    if size_in_bytes == 0:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "EMPTY_FILE",
                "message": "File is empty"
            }
        )
    if size_in_bytes > 10 * 1024 * 1024:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "FILE_TOO_LARGE",
                "message": "File size exceeds 10MB limit"
            }
        )

    allowed_extensions = {
        '.txt', '.py', '.html', '.md', '.json', '.csv', '.js',
        '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.pdf',
        '.webm', '.wav', '.mp3', '.m4a', '.ogg',
    }
    ext = os.path.splitext(file.filename.lower())[1]
    if ext in allowed_extensions:
        return file
    raise HTTPException(
        status_code=400,
        detail={
            "error": "UNSUPPORTED_FILE_TYPE",
            "message": f"File type '{ext}' not allowed",
            "allowed_types": sorted(allowed_extensions)
        }
    )
def coerce_message_and_session(req_json: dict | None, message: str | None,
session: str | None, session_manager,
allow_empty: bool = False):
"""Extract message and session from request, with validation.
If allow_empty=True (e.g. attachment-only sends), the message-required
check is skipped and an empty/whitespace message is normalized to "".
"""
try:
if message is None or session is None:
if req_json is None:
raise HTTPException(
status_code=400,
detail={
"error": "MISSING_PARAMETERS",
"message": "Missing 'message' and/or 'session' in request"
}
)
message = message or req_json.get("message")
session = session or req_json.get("session")
if allow_empty and (message is None or not str(message).strip()):
message = ""
else:
message = validate_message(message)
if not session:
raise HTTPException(
status_code=400,
detail={
"error": "VALIDATION_ERROR",
"message": "Session ID is required"
}
)
try:
session_manager.get_session(session)
except KeyError:
raise HTTPException(
status_code=404,
detail={
"error": "SESSION_NOT_FOUND",
"message": f"Session '{session}' not found"
}
)
return message, session
except HTTPException:
raise
except json.JSONDecodeError as e:
logger.error(f"JSON decode error: {e}")
raise HTTPException(
status_code=400,
detail={
"error": "INVALID_JSON",
"message": "Invalid JSON in request body"
}
)
except Exception as e:
logger.error(f"Unexpected error in coerce_message_and_session: {e}")
raise HTTPException(
status_code=400,
detail={
"error": "REQUEST_PROCESSING_ERROR",
"message": "Error processing request"
}
)