From 7504fedb17e2b0f0aca4360339feb78a6d7266c5 Mon Sep 17 00:00:00 2001 From: Shreyas S Joshi <156504459+BlackPool25@users.noreply.github.com> Date: Tue, 2 Jun 2026 22:11:24 +0530 Subject: [PATCH] fix: surface reasoning_content when content is empty (thinking models) (#1233) Thinking models served via llama.cpp without --reasoning-format none (e.g. Qwen3, DeepSeek-R1) route all tokens into reasoning_content and return content="". Two call paths were silently broken: - llm_call / llm_call_async (non-streaming): hard-keyed data["choices"][0]["message"]["content"] raises KeyError or returns empty string, discarding the entire response. - stream_agent_loop end-of-round fallback: when full_response is empty but round_reasoning has content, the existing code replaced the response with the generic empty-response error message, discarding all reasoning tokens that were correctly accumulated during streaming. Fix: in both non-streaming paths use msg.get("content") or msg.get("reasoning_content") or "". In the streaming fallback, surface round_reasoning as the answer before falling through to the error path. --- src/agent_loop.py | 33 +++- src/llm_core.py | 6 +- ...est_llm_core_reasoning_content_fallback.py | 143 ++++++++++++++++++ 3 files changed, 176 insertions(+), 6 deletions(-) create mode 100644 tests/test_llm_core_reasoning_content_fallback.py diff --git a/src/agent_loop.py b/src/agent_loop.py index 35bc8a5..3fdce70 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1314,6 +1314,30 @@ async def _run_verifier_subagent( return [r.strip() for r in reasons.split(";") if r.strip()] +def _empty_response_fallback( + full_response: str, + round_reasoning: str, + tool_events: list, +) -> tuple: + """Return (final_response, sse_chunk_or_none) for the end-of-loop empty-response guard. + + When a thinking model routes all tokens to reasoning_content (leaving + content=""), full_response is empty but round_reasoning has content. + The reasoning was already streamed as {thinking:true} chunks — do not + re-emit it as a normal delta. Just persist it and yield nothing. + + Returns: + (final_response: str, chunk: str | None) + chunk is the SSE string to yield, or None if nothing should be emitted. + """ + if full_response.strip() or tool_events: + return full_response, None + if round_reasoning.strip(): + return round_reasoning, None + _error_msg = "The model returned an empty response. Please try again or switch to a different model." + return _error_msg, f'data: {json.dumps({"delta": _error_msg})}\n\n' + + async def stream_agent_loop( endpoint_url: str, model: str, @@ -2225,10 +2249,11 @@ async def stream_agent_loop( # If the response is completely empty and no tools were executed, # yield a fallback message so the user is not left hanging. - if not full_response.strip() and not tool_events: - _error_msg = "The model returned an empty response. Please try again or switch to a different model." - yield f'data: {json.dumps({"delta": _error_msg})}\n\n' - full_response = _error_msg + full_response, _fallback_chunk = _empty_response_fallback( + full_response, round_reasoning, tool_events + ) + if _fallback_chunk: + yield _fallback_chunk # --- Final metrics --- total_duration = time.time() - total_start diff --git a/src/llm_core.py b/src/llm_core.py index 5438967..e4d2c51 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -860,7 +860,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL elif provider == "ollama": response = _parse_ollama_response(data) else: - response = data["choices"][0]["message"]["content"] + msg = data["choices"][0]["message"] + response = msg.get("content") or msg.get("reasoning_content") or "" _set_cached_response(cache_key, response) return response except Exception: @@ -997,7 +998,8 @@ async def llm_call_async( elif provider == "ollama": response = _parse_ollama_response(data) else: - response = data["choices"][0]["message"]["content"] + msg = data["choices"][0]["message"] + response = msg.get("content") or msg.get("reasoning_content") or "" _set_cached_response(cache_key, response) return response except Exception: diff --git a/tests/test_llm_core_reasoning_content_fallback.py b/tests/test_llm_core_reasoning_content_fallback.py new file mode 100644 index 0000000..3335a7b --- /dev/null +++ b/tests/test_llm_core_reasoning_content_fallback.py @@ -0,0 +1,143 @@ +"""Regression tests for reasoning_content fallback in non-streaming paths. + +Covers the five cases requested during PR review: + 1. llm_call (sync): content="" + reasoning_content="..." → returns reasoning text + 2. llm_call_async (async): same + 3. Normal content wins over reasoning_content when both present + 4. Streaming agent path: reasoning-only round does NOT emit the generic error + 5. Streaming agent path: reasoning tokens are NOT duplicated as normal answer text +""" +import asyncio +import json + +import httpx +import pytest + +from src import llm_core + + +# --------------------------------------------------------------------------- +# Helpers: fake httpx responses for the non-streaming llm_call* paths +# --------------------------------------------------------------------------- + +def _sync_response(payload: dict) -> httpx.Response: + req = httpx.Request("POST", "http://test/v1/chat/completions") + return httpx.Response(200, request=req, json=payload) + + +def _openai_msg(content, reasoning_content=None): + msg = {"content": content} + if reasoning_content is not None: + msg["reasoning_content"] = reasoning_content + return {"choices": [{"message": msg}]} + + +# --------------------------------------------------------------------------- +# 1. llm_call (sync): empty content → falls back to reasoning_content +# --------------------------------------------------------------------------- + +def test_llm_call_returns_reasoning_content_when_content_empty(monkeypatch): + monkeypatch.setattr( + llm_core.httpx, "post", + lambda *a, **kw: _sync_response(_openai_msg("", "I reasoned through it")), + ) + result = llm_core.llm_call( + "http://test/v1", "qwen3-8b", + [{"role": "user", "content": "think"}], + ) + assert result == "I reasoned through it" + + +# --------------------------------------------------------------------------- +# 2. llm_call_async (async): empty content → falls back to reasoning_content +# --------------------------------------------------------------------------- + +def test_llm_call_async_returns_reasoning_content_when_content_empty(monkeypatch): + class _FakeAsyncClient: + async def post(self, *a, **kw): + req = httpx.Request("POST", "http://test-async/v1/chat/completions") + return httpx.Response(200, request=req, + json=_openai_msg("", "async reasoning text")) + + monkeypatch.setattr(llm_core, "_get_http_client", + lambda: _FakeAsyncClient()) + + result = asyncio.run(llm_core.llm_call_async( + "http://test-async/v1", "qwen3-8b", + [{"role": "user", "content": "think"}], + )) + assert result == "async reasoning text" + + +# --------------------------------------------------------------------------- +# 3. Normal content takes priority over reasoning_content when both present +# --------------------------------------------------------------------------- + +def test_llm_call_content_wins_over_reasoning_content(monkeypatch): + monkeypatch.setattr( + llm_core.httpx, "post", + lambda *a, **kw: _sync_response( + _openai_msg("Normal answer", "some reasoning") + ), + ) + result = llm_core.llm_call( + "http://test/v1", "some-model", + [{"role": "user", "content": "hi"}], + ) + assert result == "Normal answer" + + +# --------------------------------------------------------------------------- +# Streaming agent path tests (4 and 5) +# These import and test _empty_response_fallback — the real production helper +# extracted from stream_agent_loop. If the fallback branch is reverted or +# changed, these tests will fail. +# --------------------------------------------------------------------------- + +import sys +from unittest.mock import MagicMock + +# Mock heavy DB/tool deps before importing agent_loop +for _mod in [ + "sqlalchemy", "sqlalchemy.orm", "sqlalchemy.ext", + "sqlalchemy.ext.declarative", "sqlalchemy.ext.hybrid", + "sqlalchemy.sql", "sqlalchemy.sql.expression", + "src.database", "src.agent_tools", + "core.models", "core.database", +]: + if _mod not in sys.modules: + sys.modules[_mod] = MagicMock() + +from src.agent_loop import _empty_response_fallback # noqa: E402 + + +# --------------------------------------------------------------------------- +# 4. Reasoning-only round: generic error is suppressed +# --------------------------------------------------------------------------- + +def test_stream_agent_reasoning_only_does_not_emit_error(): + final_response, chunk = _empty_response_fallback( + full_response="", + round_reasoning="I reasoned carefully", + tool_events=[], + ) + assert chunk is None, "Must not emit any SSE chunk when reasoning is present" + assert "The model returned an empty response" not in (chunk or "") + assert final_response == "I reasoned carefully" + + +# --------------------------------------------------------------------------- +# 5. Reasoning tokens are NOT re-emitted as a normal answer delta +# --------------------------------------------------------------------------- + +def test_stream_agent_reasoning_not_duplicated_as_normal_delta(): + reasoning_text = "my internal reasoning" + _, chunk = _empty_response_fallback( + full_response="", + round_reasoning=reasoning_text, + tool_events=[], + ) + # chunk must be None — the reasoning was already sent as {thinking:true} + assert chunk is None, ( + f"reasoning text was re-emitted as a normal delta chunk: {chunk!r}" + )