From 6d511f6e66057c10ff8e6104dfdaaea257932d34 Mon Sep 17 00:00:00 2001 From: Giuseppe Date: Thu, 4 Jun 2026 20:18:19 +0200 Subject: [PATCH] fix(llm): auto-detect think tags in content stream for unregistered thinking models (#2588) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(llm): auto-detect think tags in content stream for unregistered thinking models _THINKING_MODEL_PATTERNS only covers known model families by name. Qwen3-derived models with non-standard names (e.g. Qwopus, custom QwQ forks) are not matched, so their think-tagged reasoning content streams through as visible chat text instead of being routed to the thinking display. When the first content delta opens with an opening think tag and the model was not already identified as a thinking model, dynamically flag the stream as a thinking model for the remainder of the response. This enables the existing repair path (line below) and ensures the frontend receives the full think-tag wrapper it needs to split thinking from the final answer. The check is restricted to the very first content delta (_first_content_sent is False) to avoid misidentifying models that happen to write a literal think tag mid-answer. Fixes #2225 Related: #2420 (covered by separate PR from @AmmarS-Analyst), #2224 (@RaresKeY) Co-Authored-By: Claude Sonnet 4.6 * fix(llm): replace inert _thinking_model flag with _in_think_tag state machine The original auto-detect set _thinking_model=True on the first chunk but still emitted it as a regular delta and set _first_content_sent=True immediately, so no subsequent chunk could enter the repair path. Replace with _in_think_tag bool: enter thinking mode when the first content starts with an opening think tag, route all chunks to the thinking channel until the closing think tag is found, then the tail becomes the first regular delta. Adds three regression tests.
Co-Authored-By: Claude Sonnet 4.6 * fix(llm): replace _first_content_sent guard with _think_open_stripped Opening-tag stripping used `not _first_content_sent` as the guard, but _first_content_sent stays False throughout the entire think block (it only flips when regular content is emitted). So `find(">")` ran on every reasoning chunk — not just the first — and silently truncated everything before the first ">" in any reasoning text containing comparisons, arrows, or code. Fix: add `_think_open_stripped = False` alongside `_in_think_tag`. Use it as the strip guard in both the "still inside the think block" path and the "closing tag found in same chunk" split path. Set it True once the opening tag is consumed so all subsequent chunks reach the thinking channel unmolested. Add regression test: 3-chunk stream where the middle chunk contains "c > d" — confirms "more c " is not dropped. Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- src/llm_core.py | 57 ++++++++++++++++++++---- tests/test_llm_core_reasoning.py | 76 ++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 8 deletions(-) diff --git a/src/llm_core.py b/src/llm_core.py index 1995982..1baf184 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1363,6 +1363,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl # can detect thinking-in-progress (some models output but no ) _thinking_model = _supports_thinking(model) _first_content_sent = False + _in_think_tag = False # True while consuming content + _think_open_stripped = False # opening tag already removed def _emit_tool_calls(): """Build the tool_calls event string if any were accumulated.""" @@ -1444,14 +1446,53 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n' content = delta.get("content") or "" if content: - # Some thinking backends start normal content with a - # stray closing tag.
Repair only that shape; do not - # wrap every first token for model families like - # MiniMax, which often stream ordinary answers. - if _thinking_model and not _first_content_sent and content.lstrip().lower().startswith("… in content stream. + # Covers Qwen3-derived models (Qwopus, QwQ forks) whose + # names don't match _THINKING_MODEL_PATTERNS but still + # emit literal markup via llama.cpp --jinja. + if not _first_content_sent and not _thinking_model and not _in_think_tag and stripped.lower().startswith("") + if close_idx != -1: + # Split: up-to- → thinking, remainder → content + think_part = content[:close_idx] + if not _think_open_stripped: + # Strip the opening from the first chunk. + # Use a dedicated flag — _first_content_sent stays False + # throughout the think block, so it must not be reused. + tag_end = think_part.lower().find(">") + if tag_end != -1: + think_part = think_part[tag_end + 1:] + _think_open_stripped = True + regular_part = content[close_idx + len(""):] + _in_think_tag = False + if think_part: + yield f'data: {json.dumps({"delta": think_part, "thinking": True})}\n\n' + if regular_part: + _first_content_sent = True + yield f'data: {json.dumps({"delta": regular_part})}\n\n' + else: + # Still inside : route to thinking channel + if not _think_open_stripped: + # Strip the opening tag (first chunk only) + tag_end = stripped.lower().find(">") + if tag_end != -1: + content = stripped[tag_end + 1:] + _think_open_stripped = True + if content: + yield f'data: {json.dumps({"delta": content, "thinking": True})}\n\n' + else: + # Some thinking backends start normal content with a + # stray closing tag. Repair only that shape; do not + # wrap every first token for model families like + # MiniMax, which often stream ordinary answers. + if _thinking_model and not _first_content_sent and stripped.lower().startswith("… + # directly in the content field. Reasoning must surface as thinking chunks; + # only the answer after is a normal delta. 
+ deltas = _run_stream( + "Qwopus3-9B-custom", # name not in _THINKING_MODEL_PATTERNS + [ + 'data: {"choices":[{"delta":{"content":"step one "}}]}', + 'data: {"choices":[{"delta":{"content":"step two"}}]}', + 'data: {"choices":[{"delta":{"content":"Final answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + thinking = [d for d in deltas if d.get("thinking")] + regular = [d for d in deltas if not d.get("thinking")] + assert thinking, f"expected thinking deltas, got: {deltas}" + assert all("Final answer" not in d["delta"] for d in thinking), thinking + assert regular, f"expected regular delta after , got: {deltas}" + assert any("Final answer" in d["delta"] for d in regular), regular + + +def test_think_tag_and_close_in_same_chunk(monkeypatch): + # reasoninganswer all arrive in a single content chunk. + deltas = _run_stream( + "Qwopus3-9B-custom", + [ + 'data: {"choices":[{"delta":{"content":"my reasoningmy answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + thinking = [d for d in deltas if d.get("thinking")] + regular = [d for d in deltas if not d.get("thinking")] + assert thinking and "my reasoning" in thinking[0]["delta"], thinking + assert regular and "my answer" in regular[0]["delta"], regular + + +def test_think_tag_gt_in_mid_reasoning_not_truncated(monkeypatch): + # Regression for _first_content_sent misuse: the opening-tag strip ran on every + # chunk (not just the first) because _first_content_sent stays False throughout + # the think block. On chunk 2 it did find(">") over reasoning text and silently + # dropped everything before the first ">". Repro: 3 chunks, ">" in chunk 2. 
+ deltas = _run_stream( + "Qwopus3-9B-custom", + [ + 'data: {"choices":[{"delta":{"content":"reasoning a "}}]}', + 'data: {"choices":[{"delta":{"content":"more c > d "}}]}', + 'data: {"choices":[{"delta":{"content":"answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + thinking = [d for d in deltas if d.get("thinking")] + regular = [d for d in deltas if not d.get("thinking")] + # "more c " must survive — must not be truncated at the '>' + assert any("more c > d" in d["delta"] for d in thinking), thinking + assert any("answer" in d["delta"] for d in regular), regular + + +def test_registered_thinking_model_stray_close_tag_repair_unchanged(monkeypatch): + # The existing repair for registered models must not regress. + # A registered model that starts content with gets prepended. + deltas = _run_stream( + "qwq-32b", # registered in _THINKING_MODEL_PATTERNS + [ + 'data: {"choices":[{"delta":{"content":"Here is my answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + assert deltas, deltas + first = deltas[0]["delta"] + assert first.startswith(""), f"expected repair prefix, got: {first!r}"