diff --git a/routes/email_pollers.py b/routes/email_pollers.py index ec8b1e1..a06cbdc 100644 --- a/routes/email_pollers.py +++ b/routes/email_pollers.py @@ -132,7 +132,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None import sqlite3 as _sql3 import requests as _req from src.endpoint_resolver import resolve_endpoint - from src.llm_core import _uses_max_completion_tokens + from src.llm_core import _uses_max_completion_tokens, _restricts_temperature settings = _load_settings() auto_sum = settings.get("email_auto_summarize", False) @@ -355,6 +355,9 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None "temperature": 0.3, "stream": False, } + # Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature. + if _restricts_temperature(model): + payload.pop("temperature", None) try: # Use to_thread so this sync HTTP call doesn't freeze # the entire event loop while the LLM thinks (240s). @@ -806,6 +809,9 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None "temperature": 0.1, "stream": False, } + # Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature. + if _restricts_temperature(model): + payload.pop("temperature", None) # to_thread keeps the event loop responsive during the LLM call resp = await asyncio.to_thread( _req.post, url, json=payload, headers=req_headers, timeout=120 diff --git a/routes/email_routes.py b/routes/email_routes.py index 24f085b..9870cb4 100644 --- a/routes/email_routes.py +++ b/routes/email_routes.py @@ -2419,7 +2419,7 @@ def setup_email_routes(): """Generate a quick AI summary of an email body.""" try: from src.endpoint_resolver import resolve_endpoint - from src.llm_core import _uses_max_completion_tokens + from src.llm_core import _uses_max_completion_tokens, _restricts_temperature import requests as _req body = data.get("body", "") @@ -2476,6 +2476,9 @@ def setup_email_routes(): "temperature": 0.3, "stream": False, } + # Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature. + if _restricts_temperature(model): + payload.pop("temperature", None) resp = await asyncio.to_thread( _req.post, url, json=payload, headers=req_headers, timeout=180 ) diff --git a/routes/gallery_routes.py b/routes/gallery_routes.py index badc389..8ec2176 100644 --- a/routes/gallery_routes.py +++ b/routes/gallery_routes.py @@ -1707,7 +1707,7 @@ def setup_gallery_routes() -> APIRouter: return {"error": "No vision-capable endpoint configured"} # Call vision model — format differs between Anthropic and OpenAI - from src.llm_core import _detect_provider + from src.llm_core import _detect_provider, _restricts_temperature, _uses_max_completion_tokens provider = _detect_provider(chat_url) tag_prompt = ( "Analyze this photo. Return ONLY a comma-separated list of tags. " @@ -1732,6 +1732,7 @@ def setup_gallery_routes() -> APIRouter: }], } else: + _tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model_name) else "max_tokens" payload = { "model": model_name, "messages": [{ @@ -1741,9 +1742,12 @@ def setup_gallery_routes() -> APIRouter: {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}, ], }], - "max_tokens": 200, + _tok_key: 200, "temperature": 0.3, } + # Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature. + if _restricts_temperature(model_name): + payload.pop("temperature", None) h = {"Content-Type": "application/json"} if headers: diff --git a/routes/model_routes.py b/routes/model_routes.py index f04f2f2..b0fd1f6 100644 --- a/routes/model_routes.py +++ b/routes/model_routes.py @@ -251,9 +251,13 @@ def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 1 target_url = build_chat_url(base) h = build_headers(api_key, base) h["Content-Type"] = "application/json" - from src.llm_core import _uses_max_completion_tokens + from src.llm_core import _uses_max_completion_tokens, _restricts_temperature _max_key = "max_completion_tokens" if _uses_max_completion_tokens(model_id) else "max_tokens" - payload = {"model": model_id, "messages": messages, _max_key: 5, "temperature": 0.0} + payload = {"model": model_id, "messages": messages, _max_key: 5} + # Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature, so a + # probe that hardcodes one falsely reports a working endpoint as failing. + if not _restricts_temperature(model_id): + payload["temperature"] = 0.0 if _test_tools: payload["tools"] = _test_tools diff --git a/src/llm_core.py b/src/llm_core.py index a407f97..18ccba7 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -403,6 +403,22 @@ def _uses_max_completion_tokens(model: str) -> bool: m = model.lower() return any(m.startswith(p) or f"/{p}" in m for p in _MAX_COMPLETION_TOKENS_MODELS) +# OpenAI reasoning models (o1, o3, o4, gpt-5 families) only accept the default +# temperature. Sending any explicit value — even 0.0 — returns HTTP 400 +# ("Only the default (1) value is supported"). That otherwise breaks chat when a +# preset sets a non-default temperature, and makes endpoint probing report a +# perfectly good model as failing. For these models we omit the field and let +# the API use its required default. (gpt-4.5 is intentionally excluded — it is +# not a reasoning model and accepts temperature normally.) +_FIXED_TEMPERATURE_MODELS = ("o1", "o3", "o4", "gpt-5") + +def _restricts_temperature(model: str) -> bool: + """Check if a model rejects any non-default temperature.""" + if not model: + return False + m = model.lower() + return any(m.startswith(p) or f"/{p}" in m for p in _FIXED_TEMPERATURE_MODELS) + # Models that support structured thinking — may output without opening tag _THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap") @@ -738,6 +754,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL "messages": messages_copy, "temperature": temperature, } + if _restricts_temperature(model): + payload.pop("temperature", None) if max_tokens and max_tokens > 0: tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens" payload[tok_key] = max_tokens @@ -857,6 +875,8 @@ async def llm_call_async( "messages": messages_copy, "temperature": temperature, } + if _restricts_temperature(model): + payload.pop("temperature", None) if max_tokens and max_tokens > 0: tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens" payload[tok_key] = max_tokens @@ -958,6 +978,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl "temperature": temperature, "stream": True, } + if _restricts_temperature(model): + payload.pop("temperature", None) if provider not in {"openrouter", "groq"}: payload["stream_options"] = {"include_usage": True} if max_tokens and max_tokens > 0: diff --git a/tests/test_llm_core_temperature.py b/tests/test_llm_core_temperature.py new file mode 100644 index 0000000..09abf8a --- /dev/null +++ b/tests/test_llm_core_temperature.py @@ -0,0 +1,68 @@ +"""Regression tests: OpenAI reasoning models reject a non-default temperature. + +o1/o3/o4/gpt-5 only accept the default temperature (1); sending an explicit +value — even 0.0 — returns HTTP 400 "Only the default (1) value is supported". +The OpenAI-compatible payload builders must omit the temperature field for these +models so chat (with a non-default preset) and endpoint probing don't break. +""" +import httpx +import pytest + +from src import llm_core + + +@pytest.mark.parametrize( + "model", + ["o1", "o1-mini", "o3", "o3-mini", "o4-mini", "gpt-5", "gpt-5-mini", + "openrouter/openai/o3-mini", "OpenAI/GPT-5"], +) +def test_reasoning_models_restrict_temperature(model): + assert llm_core._restricts_temperature(model) is True + + +@pytest.mark.parametrize( + "model", + ["gpt-4o", "gpt-4.1", "gpt-3.5-turbo", "gpt-4.5-preview", + "claude-3-5-sonnet", "llama3.1", "", None], +) +def test_normal_models_allow_temperature(model): + assert llm_core._restricts_temperature(model) is False + + +def _capture_openai_payload(monkeypatch, model, temperature): + """Run a synchronous OpenAI-compatible call and return the posted JSON body.""" + llm_core._response_cache.clear() + seen = {} + + def fake_post(url, headers=None, json=None, timeout=None): + seen["json"] = json + request = httpx.Request("POST", url) + return httpx.Response( + 200, + request=request, + json={"choices": [{"message": {"content": "OK"}}]}, + ) + + monkeypatch.setattr(llm_core.httpx, "post", fake_post) + result = llm_core.llm_call( + "https://api.openai.com/v1/chat/completions", + model, + [{"role": "user", "content": "Say OK"}], + temperature=temperature, + max_tokens=5, + ) + assert result == "OK" + return seen["json"] + + +def test_reasoning_model_payload_omits_temperature(monkeypatch): + payload = _capture_openai_payload(monkeypatch, "o3-mini", 0.0) + assert "temperature" not in payload + # Reasoning models also use max_completion_tokens, which must survive. + assert payload["max_completion_tokens"] == 5 + + +def test_normal_model_payload_keeps_temperature(monkeypatch): + payload = _capture_openai_payload(monkeypatch, "gpt-4o", 0.2) + assert payload["temperature"] == 0.2 + assert payload["max_tokens"] == 5