fix: surface reasoning_content when content is empty (thinking models) (#1233)

Thinking models served via llama.cpp without --reasoning-format none (e.g. Qwen3, DeepSeek-R1) route all tokens into reasoning_content and return content="". Two call paths were silently broken: - llm_call / llm_call_async (non-streaming): hard-keyed data["choices"][0]["message"]["content"] raises KeyError or returns empty string, discarding the entire response. - stream_agent_loop end-of-round fallback: when full_response is empty but round_reasoning has content, the existing code replaced the response with the generic empty-response error message, discarding all reasoning tokens that were correctly accumulated during streaming. Fix: in both non-streaming paths use msg.get("content") or msg.get("reasoning_content") or "". In the streaming fallback, surface round_reasoning as the answer before falling through to the error path.
2026-06-02 22:11:24 +05:30
parent 257f7ee7b2
commit 7504fedb17
3 changed files with 176 additions and 6 deletions
--- a/src/agent_loop.py
+++ b/src/agent_loop.py
@@ -1314,6 +1314,30 @@ async def _run_verifier_subagent(
    return [r.strip() for r in reasons.split(";") if r.strip()]
 def _empty_response_fallback(
    full_response: str,
    round_reasoning: str,
    tool_events: list,
 ) -> tuple:
    """Return (final_response, sse_chunk_or_none) for the end-of-loop empty-response guard.
    When a thinking model routes all tokens to reasoning_content (leaving
    content=""), full_response is empty but round_reasoning has content.
    The reasoning was already streamed as {thinking:true} chunks — do not
    re-emit it as a normal delta.  Just persist it and yield nothing.
    Returns:
        (final_response: str, chunk: str | None)
            chunk is the SSE string to yield, or None if nothing should be emitted.
    """
    if full_response.strip() or tool_events:
        return full_response, None
    if round_reasoning.strip():
        return round_reasoning, None
    _error_msg = "The model returned an empty response. Please try again or switch to a different model."
    return _error_msg, f'data: {json.dumps({"delta": _error_msg})}\n\n'
 async def stream_agent_loop(
    endpoint_url: str,
    model: str,
@@ -2225,10 +2249,11 @@ async def stream_agent_loop(
    # If the response is completely empty and no tools were executed,
    # yield a fallback message so the user is not left hanging.
-    if not full_response.strip() and not tool_events:
+    full_response, _fallback_chunk = _empty_response_fallback(
-        _error_msg = "The model returned an empty response. Please try again or switch to a different model."
+        full_response, round_reasoning, tool_events
-        yield f'data: {json.dumps({"delta": _error_msg})}\n\n'
+    )
-        full_response = _error_msg
+    if _fallback_chunk:
        yield _fallback_chunk
    # --- Final metrics ---
    total_duration = time.time() - total_start
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -860,7 +860,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
        elif provider == "ollama":
            response = _parse_ollama_response(data)
        else:
-            response = data["choices"][0]["message"]["content"]
+            msg = data["choices"][0]["message"]
            response = msg.get("content") or msg.get("reasoning_content") or ""
        _set_cached_response(cache_key, response)
        return response
    except Exception:
@@ -997,7 +998,8 @@ async def llm_call_async(
                elif provider == "ollama":
                    response = _parse_ollama_response(data)
                else:
-                    response = data["choices"][0]["message"]["content"]
+                    msg = data["choices"][0]["message"]
                    response = msg.get("content") or msg.get("reasoning_content") or ""
                _set_cached_response(cache_key, response)
                return response
            except Exception:
--- a/tests/test_llm_core_reasoning_content_fallback.py
+++ b/tests/test_llm_core_reasoning_content_fallback.py
@@ -0,0 +1,143 @@
 """Regression tests for reasoning_content fallback in non-streaming paths.
 Covers the five cases requested during PR review:
  1. llm_call (sync): content="" + reasoning_content="..." → returns reasoning text
  2. llm_call_async (async): same
  3. Normal content wins over reasoning_content when both present
  4. Streaming agent path: reasoning-only round does NOT emit the generic error
  5. Streaming agent path: reasoning tokens are NOT duplicated as normal answer text
 """
 import asyncio
 import json
 import httpx
 import pytest
 from src import llm_core
 # ---------------------------------------------------------------------------
 # Helpers: fake httpx responses for the non-streaming llm_call* paths
 # ---------------------------------------------------------------------------
 def _sync_response(payload: dict) -> httpx.Response:
    req = httpx.Request("POST", "http://test/v1/chat/completions")
    return httpx.Response(200, request=req, json=payload)
 def _openai_msg(content, reasoning_content=None):
    msg = {"content": content}
    if reasoning_content is not None:
        msg["reasoning_content"] = reasoning_content
    return {"choices": [{"message": msg}]}
 # ---------------------------------------------------------------------------
 # 1. llm_call (sync): empty content → falls back to reasoning_content
 # ---------------------------------------------------------------------------
 def test_llm_call_returns_reasoning_content_when_content_empty(monkeypatch):
    monkeypatch.setattr(
        llm_core.httpx, "post",
        lambda *a, **kw: _sync_response(_openai_msg("", "I reasoned through it")),
    )
    result = llm_core.llm_call(
        "http://test/v1", "qwen3-8b",
        [{"role": "user", "content": "think"}],
    )
    assert result == "I reasoned through it"
 # ---------------------------------------------------------------------------
 # 2. llm_call_async (async): empty content → falls back to reasoning_content
 # ---------------------------------------------------------------------------
 def test_llm_call_async_returns_reasoning_content_when_content_empty(monkeypatch):
    class _FakeAsyncClient:
        async def post(self, *a, **kw):
            req = httpx.Request("POST", "http://test-async/v1/chat/completions")
            return httpx.Response(200, request=req,
                                  json=_openai_msg("", "async reasoning text"))
    monkeypatch.setattr(llm_core, "_get_http_client",
                        lambda: _FakeAsyncClient())
    result = asyncio.run(llm_core.llm_call_async(
        "http://test-async/v1", "qwen3-8b",
        [{"role": "user", "content": "think"}],
    ))
    assert result == "async reasoning text"
 # ---------------------------------------------------------------------------
 # 3. Normal content takes priority over reasoning_content when both present
 # ---------------------------------------------------------------------------
 def test_llm_call_content_wins_over_reasoning_content(monkeypatch):
    monkeypatch.setattr(
        llm_core.httpx, "post",
        lambda *a, **kw: _sync_response(
            _openai_msg("Normal answer", "some reasoning")
        ),
    )
    result = llm_core.llm_call(
        "http://test/v1", "some-model",
        [{"role": "user", "content": "hi"}],
    )
    assert result == "Normal answer"
 # ---------------------------------------------------------------------------
 # Streaming agent path tests (4 and 5)
 # These import and test _empty_response_fallback — the real production helper
 # extracted from stream_agent_loop.  If the fallback branch is reverted or
 # changed, these tests will fail.
 # ---------------------------------------------------------------------------
 import sys
 from unittest.mock import MagicMock
 # Mock heavy DB/tool deps before importing agent_loop
 for _mod in [
    "sqlalchemy", "sqlalchemy.orm", "sqlalchemy.ext",
    "sqlalchemy.ext.declarative", "sqlalchemy.ext.hybrid",
    "sqlalchemy.sql", "sqlalchemy.sql.expression",
    "src.database", "src.agent_tools",
    "core.models", "core.database",
 ]:
    if _mod not in sys.modules:
        sys.modules[_mod] = MagicMock()
 from src.agent_loop import _empty_response_fallback  # noqa: E402
 # ---------------------------------------------------------------------------
 # 4. Reasoning-only round: generic error is suppressed
 # ---------------------------------------------------------------------------
 def test_stream_agent_reasoning_only_does_not_emit_error():
    final_response, chunk = _empty_response_fallback(
        full_response="",
        round_reasoning="I reasoned carefully",
        tool_events=[],
    )
    assert chunk is None, "Must not emit any SSE chunk when reasoning is present"
    assert "The model returned an empty response" not in (chunk or "")
    assert final_response == "I reasoned carefully"
 # ---------------------------------------------------------------------------
 # 5. Reasoning tokens are NOT re-emitted as a normal answer delta
 # ---------------------------------------------------------------------------
 def test_stream_agent_reasoning_not_duplicated_as_normal_delta():
    reasoning_text = "my internal reasoning"
    _, chunk = _empty_response_fallback(
        full_response="",
        round_reasoning=reasoning_text,
        tool_events=[],
    )
    # chunk must be None — the reasoning was already sent as {thinking:true}
    assert chunk is None, (
        f"reasoning text was re-emitted as a normal delta chunk: {chunk!r}"
    )