fix(llm): auto-detect <think> in content stream for unregistered thinking models (#2588)

* fix(llm): auto-detect <think> in content stream for unregistered thinking models

_THINKING_MODEL_PATTERNS only covers known model families by name. Qwen3-derived
models with non-standard names (e.g. Qwopus, custom QwQ forks) are not matched,
so their <think>...</think> content streams through as visible chat text instead
of being routed to the thinking display.

When the first content delta opens with <think> and the model was not already
identified as a thinking model, dynamically flag the stream as a thinking model
for the remainder of the response. This enables the existing </think> repair path
(line below) and ensures the frontend receives the full <think>...</think> wrapper
it needs to split thinking from the final answer.

The check is restricted to the very first content delta (_first_content_sent is
False) to avoid misidentifying models that happen to write "<think>" mid-answer.

Fixes #2225
Related: #2420 (covered by separate PR from @AmmarS-Analyst), #2224 (@RaresKeY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(llm): replace inert _thinking_model flag with _in_think_tag state machine

The original auto-detect set _thinking_model=True on the first <think> chunk
but still emitted it as a regular delta and set _first_content_sent=True
immediately, so no subsequent chunk could enter the repair path.

Replace with _in_think_tag bool: enter thinking mode when first content starts
with <think>, route all chunks to the thinking channel until </think> is found,
then the tail becomes the first regular delta. Adds three regression tests.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(llm): replace _first_content_sent guard with _think_open_stripped

Opening-tag stripping used `not _first_content_sent` as the guard, but
_first_content_sent stays False throughout the entire think block (it only
flips when regular content is emitted). So `find(">")` ran on every
reasoning chunk — not just the first — and silently truncated everything
before the first ">" in any reasoning text containing comparisons, arrows,
or code.

Fix: add `_think_open_stripped = False` alongside `_in_think_tag`. Use it
as the strip guard in both the "still inside <think>" path and the
"</think> found in same chunk" split path. Set it True once the opening
tag is consumed so all subsequent chunks reach the thinking channel
unmolested.

Add regression test: 3-chunk stream where the middle chunk contains
"c > d" — confirms "more c " is not dropped.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giuseppe
2026-06-04 20:18:19 +02:00
committed by GitHub
parent 0ead3a4eb2
commit 6d511f6e66
2 changed files with 125 additions and 8 deletions

View File

@@ -1363,6 +1363,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
# can detect thinking-in-progress (some models output </think> but no <think>)
_thinking_model = _supports_thinking(model)
_first_content_sent = False
_in_think_tag = False # True while consuming <think>…</think> content
_think_open_stripped = False # opening <think> tag already removed
def _emit_tool_calls():
"""Build the tool_calls event string if any were accumulated."""
@@ -1444,14 +1446,53 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n'
content = delta.get("content") or ""
if content:
# Some thinking backends start normal content with a
# stray closing tag. Repair only that shape; do not
# wrap every first token for model families like
# MiniMax, which often stream ordinary answers.
if _thinking_model and not _first_content_sent and content.lstrip().lower().startswith("</think"):
content = "<think>" + content
_first_content_sent = True
yield f'data: {json.dumps({"delta": content})}\n\n'
stripped = content.lstrip()
# Auto-detect <think>…</think> in content stream.
# Covers Qwen3-derived models (Qwopus, QwQ forks) whose
# names don't match _THINKING_MODEL_PATTERNS but still
# emit literal <think> markup via llama.cpp --jinja.
if not _first_content_sent and not _thinking_model and not _in_think_tag and stripped.lower().startswith("<think"):
_thinking_model = True
_in_think_tag = True
if _in_think_tag:
close_idx = content.lower().find("</think>")
if close_idx != -1:
# Split: up-to-</think> → thinking, remainder → content
think_part = content[:close_idx]
if not _think_open_stripped:
# Strip the opening <think[...] > from the first chunk.
# Use a dedicated flag — _first_content_sent stays False
# throughout the think block, so it must not be reused.
tag_end = think_part.lower().find(">")
if tag_end != -1:
think_part = think_part[tag_end + 1:]
_think_open_stripped = True
regular_part = content[close_idx + len("</think>"):]
_in_think_tag = False
if think_part:
yield f'data: {json.dumps({"delta": think_part, "thinking": True})}\n\n'
if regular_part:
_first_content_sent = True
yield f'data: {json.dumps({"delta": regular_part})}\n\n'
else:
# Still inside <think>: route to thinking channel
if not _think_open_stripped:
# Strip the opening <think[...] > tag (first chunk only)
tag_end = stripped.lower().find(">")
if tag_end != -1:
content = stripped[tag_end + 1:]
_think_open_stripped = True
if content:
yield f'data: {json.dumps({"delta": content, "thinking": True})}\n\n'
else:
# Some thinking backends start normal content with a
# stray closing tag. Repair only that shape; do not
# wrap every first token for model families like
# MiniMax, which often stream ordinary answers.
if _thinking_model and not _first_content_sent and stripped.lower().startswith("</think"):
content = "<think>" + content
_first_content_sent = True
yield f'data: {json.dumps({"delta": content})}\n\n'
# Native tool calls — accumulate across chunks
for tc in delta.get("tool_calls") or []:
if tc is None:

View File

@@ -96,3 +96,79 @@ def test_reasoning_content_field_still_supported(monkeypatch):
)
assert any(d.get("thinking") and "older field" in d["delta"] for d in deltas), deltas
assert any((not d.get("thinking")) and d["delta"] == "Answer" for d in deltas), deltas
def test_think_tag_in_content_stream_routes_to_thinking_channel(monkeypatch):
    """An unregistered model's inline <think>…</think> is split by channel.

    Regression case: a Qwopus-style model streams its reasoning inside the
    ``content`` field across three chunks. The reasoning has to come back as
    thinking deltas, and the text following ``</think>`` has to come back as
    a normal (non-thinking) delta.
    """
    sse_lines = [
        'data: {"choices":[{"delta":{"content":"<think>step one "}}]}',
        'data: {"choices":[{"delta":{"content":"step two"}}]}',
        'data: {"choices":[{"delta":{"content":"</think>Final answer"}}]}',
        "data: [DONE]",
    ]
    # Model name deliberately not covered by _THINKING_MODEL_PATTERNS.
    events = _run_stream("Qwopus3-9B-custom", sse_lines, monkeypatch)

    reasoning_events = list(filter(lambda e: e.get("thinking"), events))
    answer_events = list(filter(lambda e: not e.get("thinking"), events))

    # Reasoning must exist and must not leak the final answer.
    assert reasoning_events, f"expected thinking deltas, got: {events}"
    assert not any(
        "Final answer" in e["delta"] for e in reasoning_events
    ), reasoning_events

    # The answer must arrive on the regular channel.
    assert answer_events, f"expected regular delta after </think>, got: {events}"
    assert any("Final answer" in e["delta"] for e in answer_events), answer_events
def test_think_tag_and_close_in_same_chunk(monkeypatch):
    """Opening tag, reasoning, closing tag and answer share one content chunk.

    The first thinking delta must carry the reasoning text and the first
    regular delta must carry the answer text.
    """
    sse_lines = [
        'data: {"choices":[{"delta":{"content":"<think>my reasoning</think>my answer"}}]}',
        "data: [DONE]",
    ]
    events = _run_stream("Qwopus3-9B-custom", sse_lines, monkeypatch)

    reasoning_events = list(filter(lambda e: e.get("thinking"), events))
    answer_events = list(filter(lambda e: not e.get("thinking"), events))

    # Emptiness is checked first so the [0] lookup below cannot raise.
    assert reasoning_events, reasoning_events
    assert "my reasoning" in reasoning_events[0]["delta"], reasoning_events
    assert answer_events, answer_events
    assert "my answer" in answer_events[0]["delta"], answer_events
def test_think_tag_gt_in_mid_reasoning_not_truncated(monkeypatch):
    """A '>' inside a later reasoning chunk must not truncate that chunk.

    Regression for reusing ``_first_content_sent`` as the strip guard: that
    flag stays False for the whole think block, so the opening-tag strip ran
    ``find(">")`` on every chunk and dropped the text ahead of the first
    ">". The repro is a three-chunk stream with ">" in the second chunk.
    """
    sse_lines = [
        'data: {"choices":[{"delta":{"content":"<think>reasoning a "}}]}',
        'data: {"choices":[{"delta":{"content":"more c > d "}}]}',
        'data: {"choices":[{"delta":{"content":"</think>answer"}}]}',
        "data: [DONE]",
    ]
    events = _run_stream("Qwopus3-9B-custom", sse_lines, monkeypatch)

    reasoning_events = list(filter(lambda e: e.get("thinking"), events))
    answer_events = list(filter(lambda e: not e.get("thinking"), events))

    # The text before the '>' ("more c ") has to survive intact.
    assert any("more c > d" in e["delta"] for e in reasoning_events), reasoning_events
    assert any("answer" in e["delta"] for e in answer_events), answer_events
def test_registered_thinking_model_stray_close_tag_repair_unchanged(monkeypatch):
    """The stray-``</think>`` repair for registered models must not regress.

    When a registered thinking model opens its content with ``</think>``,
    the first emitted delta is expected to start with a ``<think>`` prefix.
    """
    sse_lines = [
        'data: {"choices":[{"delta":{"content":"</think>Here is my answer"}}]}',
        "data: [DONE]",
    ]
    # "qwq-32b" is a name registered in _THINKING_MODEL_PATTERNS.
    events = _run_stream("qwq-32b", sse_lines, monkeypatch)

    assert events, events
    leading_text = events[0]["delta"]
    assert leading_text.startswith(
        "<think>"
    ), f"expected repair prefix, got: {leading_text!r}"