# src/chat_helpers.py """URL extraction, message/upload validation, request parsing.""" import re import os import json import logging from fastapi import HTTPException from fastapi import UploadFile from typing import List logger = logging.getLogger(__name__) def extract_urls(text: str) -> List[str]: """Extract URLs from text using regex pattern.""" url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+' urls = re.findall(url_pattern, text) cleaned_urls = [] for url in urls: url = re.sub(r'[.,;:!?\)]+$', '', url) cleaned_urls.append(url) return cleaned_urls # Model-name substrings that signal native image input. A missed match here # silently drops the image from the chat request (it gets swapped for a text # caption), so the model never sees it. Keep this broad, especially for local # models (Ollama/llama.cpp) that ship under many names. See issue #124. _VISION_MODEL_KEYWORDS = ( # hosted "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision", "claude-sonnet", "claude-opus", "claude-haiku", "gemini", # open / local "vision", "llava", "bakllava", "moondream", "pixtral", "minicpm", "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl", ) # Catches the "*-VL-*" / "*VL*" family not covered by a literal keyword above # (e.g. Qwen2.5-VL and various tags): a standalone "vl" token, plus "vlm". _VISION_VL_RE = re.compile(r'(? bool: """Best-effort check of whether a model can natively accept images. Decides whether image attachments get passed through to the model or swapped for a separate caption. Err toward True, since a false negative drops the image entirely. See issue #124. """ m = (model_name or "").lower() if any(kw in m for kw in _VISION_MODEL_KEYWORDS): return True return bool(_VISION_VL_RE.search(m)) def validate_message(message: str) -> str: """Validate message input.""" if not message: raise HTTPException(status_code=400, detail="Message is required") message = message.strip() if len(message) == 0: raise HTTPException(status_code=400, detail="Message cannot be empty") if len(message) > 50000: raise HTTPException(status_code=400, detail="Message exceeds maximum length") return message def validate_file_upload(file: UploadFile) -> UploadFile: """Validate uploaded file meets requirements.""" if not file or not file.filename: raise HTTPException( status_code=400, detail={ "error": "INVALID_FILE", "message": "No file uploaded or invalid filename" } ) try: file.file.seek(0, 2) file_size = file.file.tell() file.file.seek(0) if file_size == 0: raise HTTPException( status_code=400, detail={ "error": "EMPTY_FILE", "message": "File is empty" } ) if file_size > 10 * 1024 * 1024: raise HTTPException( status_code=400, detail={ "error": "FILE_TOO_LARGE", "message": "File size exceeds 10MB limit" } ) except IOError as e: logger.error(f"Error reading file size for {file.filename}: {e}") raise HTTPException( status_code=500, detail={ "error": "FILE_READ_ERROR", "message": "Error reading uploaded file" } ) allowed_extensions = {'.txt', '.py', '.html', '.md', '.json', '.csv', '.js', '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.pdf', '.webm', '.wav', '.mp3', '.m4a', '.ogg'} _, ext = os.path.splitext(file.filename.lower()) if ext not in allowed_extensions: raise HTTPException( status_code=400, detail={ "error": "UNSUPPORTED_FILE_TYPE", "message": f"File type '{ext}' not allowed", "allowed_types": sorted(allowed_extensions) } ) return file def coerce_message_and_session(req_json: dict | None, message: str | None, session: str | None, session_manager, allow_empty: bool = False): """Extract message and session from request, with validation. If allow_empty=True (e.g. attachment-only sends), the message-required check is skipped and an empty/whitespace message is normalized to "". """ try: if message is None or session is None: if req_json is None: raise HTTPException( status_code=400, detail={ "error": "MISSING_PARAMETERS", "message": "Missing 'message' and/or 'session' in request" } ) message = message or req_json.get("message") session = session or req_json.get("session") if allow_empty and (message is None or not str(message).strip()): message = "" else: message = validate_message(message) if not session: raise HTTPException( status_code=400, detail={ "error": "VALIDATION_ERROR", "message": "Session ID is required" } ) try: session_manager.get_session(session) except KeyError: raise HTTPException( status_code=404, detail={ "error": "SESSION_NOT_FOUND", "message": f"Session '{session}' not found" } ) return message, session except HTTPException: raise except json.JSONDecodeError as e: logger.error(f"JSON decode error: {e}") raise HTTPException( status_code=400, detail={ "error": "INVALID_JSON", "message": "Invalid JSON in request body" } ) except Exception as e: logger.error(f"Unexpected error in coerce_message_and_session: {e}") raise HTTPException( status_code=400, detail={ "error": "REQUEST_PROCESSING_ERROR", "message": "Error processing request" } )