fix: surface reasoning_content when content is empty (thinking models) (#1233)
Thinking models served via llama.cpp without --reasoning-format none
(e.g. Qwen3, DeepSeek-R1) route all tokens into reasoning_content and
return content="". Two call paths were silently broken:
- llm_call / llm_call_async (non-streaming): hard-keyed
data["choices"][0]["message"]["content"] raises KeyError or returns
empty string, discarding the entire response.
- stream_agent_loop end-of-round fallback: when full_response is empty
but round_reasoning has content, the existing code replaced the
response with the generic empty-response error message, discarding
all reasoning tokens that were correctly accumulated during streaming.
Fix: in both non-streaming paths use msg.get("content") or
msg.get("reasoning_content") or "". In the streaming fallback, surface
round_reasoning as the answer before falling through to the error path.
This commit is contained in:
@@ -1314,6 +1314,30 @@ async def _run_verifier_subagent(
|
|||||||
return [r.strip() for r in reasons.split(";") if r.strip()]
|
return [r.strip() for r in reasons.split(";") if r.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _empty_response_fallback(
|
||||||
|
full_response: str,
|
||||||
|
round_reasoning: str,
|
||||||
|
tool_events: list,
|
||||||
|
) -> tuple:
|
||||||
|
"""Return (final_response, sse_chunk_or_none) for the end-of-loop empty-response guard.
|
||||||
|
|
||||||
|
When a thinking model routes all tokens to reasoning_content (leaving
|
||||||
|
content=""), full_response is empty but round_reasoning has content.
|
||||||
|
The reasoning was already streamed as {thinking:true} chunks — do not
|
||||||
|
re-emit it as a normal delta. Just persist it and yield nothing.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(final_response: str, chunk: str | None)
|
||||||
|
chunk is the SSE string to yield, or None if nothing should be emitted.
|
||||||
|
"""
|
||||||
|
if full_response.strip() or tool_events:
|
||||||
|
return full_response, None
|
||||||
|
if round_reasoning.strip():
|
||||||
|
return round_reasoning, None
|
||||||
|
_error_msg = "The model returned an empty response. Please try again or switch to a different model."
|
||||||
|
return _error_msg, f'data: {json.dumps({"delta": _error_msg})}\n\n'
|
||||||
|
|
||||||
|
|
||||||
async def stream_agent_loop(
|
async def stream_agent_loop(
|
||||||
endpoint_url: str,
|
endpoint_url: str,
|
||||||
model: str,
|
model: str,
|
||||||
@@ -2225,10 +2249,11 @@ async def stream_agent_loop(
|
|||||||
|
|
||||||
# If the response is completely empty and no tools were executed,
|
# If the response is completely empty and no tools were executed,
|
||||||
# yield a fallback message so the user is not left hanging.
|
# yield a fallback message so the user is not left hanging.
|
||||||
if not full_response.strip() and not tool_events:
|
full_response, _fallback_chunk = _empty_response_fallback(
|
||||||
_error_msg = "The model returned an empty response. Please try again or switch to a different model."
|
full_response, round_reasoning, tool_events
|
||||||
yield f'data: {json.dumps({"delta": _error_msg})}\n\n'
|
)
|
||||||
full_response = _error_msg
|
if _fallback_chunk:
|
||||||
|
yield _fallback_chunk
|
||||||
|
|
||||||
# --- Final metrics ---
|
# --- Final metrics ---
|
||||||
total_duration = time.time() - total_start
|
total_duration = time.time() - total_start
|
||||||
|
|||||||
@@ -860,7 +860,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
|
|||||||
elif provider == "ollama":
|
elif provider == "ollama":
|
||||||
response = _parse_ollama_response(data)
|
response = _parse_ollama_response(data)
|
||||||
else:
|
else:
|
||||||
response = data["choices"][0]["message"]["content"]
|
msg = data["choices"][0]["message"]
|
||||||
|
response = msg.get("content") or msg.get("reasoning_content") or ""
|
||||||
_set_cached_response(cache_key, response)
|
_set_cached_response(cache_key, response)
|
||||||
return response
|
return response
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -997,7 +998,8 @@ async def llm_call_async(
|
|||||||
elif provider == "ollama":
|
elif provider == "ollama":
|
||||||
response = _parse_ollama_response(data)
|
response = _parse_ollama_response(data)
|
||||||
else:
|
else:
|
||||||
response = data["choices"][0]["message"]["content"]
|
msg = data["choices"][0]["message"]
|
||||||
|
response = msg.get("content") or msg.get("reasoning_content") or ""
|
||||||
_set_cached_response(cache_key, response)
|
_set_cached_response(cache_key, response)
|
||||||
return response
|
return response
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
143
tests/test_llm_core_reasoning_content_fallback.py
Normal file
143
tests/test_llm_core_reasoning_content_fallback.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
"""Regression tests for reasoning_content fallback in non-streaming paths.
|
||||||
|
|
||||||
|
Covers the five cases requested during PR review:
|
||||||
|
1. llm_call (sync): content="" + reasoning_content="..." → returns reasoning text
|
||||||
|
2. llm_call_async (async): same
|
||||||
|
3. Normal content wins over reasoning_content when both present
|
||||||
|
4. Streaming agent path: reasoning-only round does NOT emit the generic error
|
||||||
|
5. Streaming agent path: reasoning tokens are NOT duplicated as normal answer text
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src import llm_core
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers: fake httpx responses for the non-streaming llm_call* paths
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _sync_response(payload: dict) -> httpx.Response:
|
||||||
|
req = httpx.Request("POST", "http://test/v1/chat/completions")
|
||||||
|
return httpx.Response(200, request=req, json=payload)
|
||||||
|
|
||||||
|
|
||||||
|
def _openai_msg(content, reasoning_content=None):
|
||||||
|
msg = {"content": content}
|
||||||
|
if reasoning_content is not None:
|
||||||
|
msg["reasoning_content"] = reasoning_content
|
||||||
|
return {"choices": [{"message": msg}]}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 1. llm_call (sync): empty content → falls back to reasoning_content
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_llm_call_returns_reasoning_content_when_content_empty(monkeypatch):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
llm_core.httpx, "post",
|
||||||
|
lambda *a, **kw: _sync_response(_openai_msg("", "I reasoned through it")),
|
||||||
|
)
|
||||||
|
result = llm_core.llm_call(
|
||||||
|
"http://test/v1", "qwen3-8b",
|
||||||
|
[{"role": "user", "content": "think"}],
|
||||||
|
)
|
||||||
|
assert result == "I reasoned through it"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 2. llm_call_async (async): empty content → falls back to reasoning_content
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_llm_call_async_returns_reasoning_content_when_content_empty(monkeypatch):
|
||||||
|
class _FakeAsyncClient:
|
||||||
|
async def post(self, *a, **kw):
|
||||||
|
req = httpx.Request("POST", "http://test-async/v1/chat/completions")
|
||||||
|
return httpx.Response(200, request=req,
|
||||||
|
json=_openai_msg("", "async reasoning text"))
|
||||||
|
|
||||||
|
monkeypatch.setattr(llm_core, "_get_http_client",
|
||||||
|
lambda: _FakeAsyncClient())
|
||||||
|
|
||||||
|
result = asyncio.run(llm_core.llm_call_async(
|
||||||
|
"http://test-async/v1", "qwen3-8b",
|
||||||
|
[{"role": "user", "content": "think"}],
|
||||||
|
))
|
||||||
|
assert result == "async reasoning text"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 3. Normal content takes priority over reasoning_content when both present
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_llm_call_content_wins_over_reasoning_content(monkeypatch):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
llm_core.httpx, "post",
|
||||||
|
lambda *a, **kw: _sync_response(
|
||||||
|
_openai_msg("Normal answer", "some reasoning")
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = llm_core.llm_call(
|
||||||
|
"http://test/v1", "some-model",
|
||||||
|
[{"role": "user", "content": "hi"}],
|
||||||
|
)
|
||||||
|
assert result == "Normal answer"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Streaming agent path tests (4 and 5)
|
||||||
|
# These import and test _empty_response_fallback — the real production helper
|
||||||
|
# extracted from stream_agent_loop. If the fallback branch is reverted or
|
||||||
|
# changed, these tests will fail.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
# Mock heavy DB/tool deps before importing agent_loop
|
||||||
|
for _mod in [
|
||||||
|
"sqlalchemy", "sqlalchemy.orm", "sqlalchemy.ext",
|
||||||
|
"sqlalchemy.ext.declarative", "sqlalchemy.ext.hybrid",
|
||||||
|
"sqlalchemy.sql", "sqlalchemy.sql.expression",
|
||||||
|
"src.database", "src.agent_tools",
|
||||||
|
"core.models", "core.database",
|
||||||
|
]:
|
||||||
|
if _mod not in sys.modules:
|
||||||
|
sys.modules[_mod] = MagicMock()
|
||||||
|
|
||||||
|
from src.agent_loop import _empty_response_fallback # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 4. Reasoning-only round: generic error is suppressed
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_stream_agent_reasoning_only_does_not_emit_error():
|
||||||
|
final_response, chunk = _empty_response_fallback(
|
||||||
|
full_response="",
|
||||||
|
round_reasoning="I reasoned carefully",
|
||||||
|
tool_events=[],
|
||||||
|
)
|
||||||
|
assert chunk is None, "Must not emit any SSE chunk when reasoning is present"
|
||||||
|
assert "The model returned an empty response" not in (chunk or "")
|
||||||
|
assert final_response == "I reasoned carefully"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 5. Reasoning tokens are NOT re-emitted as a normal answer delta
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_stream_agent_reasoning_not_duplicated_as_normal_delta():
|
||||||
|
reasoning_text = "my internal reasoning"
|
||||||
|
_, chunk = _empty_response_fallback(
|
||||||
|
full_response="",
|
||||||
|
round_reasoning=reasoning_text,
|
||||||
|
tool_events=[],
|
||||||
|
)
|
||||||
|
# chunk must be None — the reasoning was already sent as {thinking:true}
|
||||||
|
assert chunk is None, (
|
||||||
|
f"reasoning text was re-emitted as a normal delta chunk: {chunk!r}"
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user