diff --git a/src/llm_core.py b/src/llm_core.py index 1995982..1baf184 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1363,6 +1363,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl # can detect thinking-in-progress (some models output but no ) _thinking_model = _supports_thinking(model) _first_content_sent = False + _in_think_tag = False # True while consuming content + _think_open_stripped = False # opening tag already removed def _emit_tool_calls(): """Build the tool_calls event string if any were accumulated.""" @@ -1444,14 +1446,53 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n' content = delta.get("content") or "" if content: - # Some thinking backends start normal content with a - # stray closing tag. Repair only that shape; do not - # wrap every first token for model families like - # MiniMax, which often stream ordinary answers. - if _thinking_model and not _first_content_sent and content.lstrip().lower().startswith("… in content stream. + # Covers Qwen3-derived models (Qwopus, QwQ forks) whose + # names don't match _THINKING_MODEL_PATTERNS but still + # emit literal markup via llama.cpp --jinja. + if not _first_content_sent and not _thinking_model and not _in_think_tag and stripped.lower().startswith("") + if close_idx != -1: + # Split: up-to- → thinking, remainder → content + think_part = content[:close_idx] + if not _think_open_stripped: + # Strip the opening from the first chunk. + # Use a dedicated flag — _first_content_sent stays False + # throughout the think block, so it must not be reused. + tag_end = think_part.lower().find(">") + if tag_end != -1: + think_part = think_part[tag_end + 1:] + _think_open_stripped = True + regular_part = content[close_idx + len(""):] + _in_think_tag = False + if think_part: + yield f'data: {json.dumps({"delta": think_part, "thinking": True})}\n\n' + if regular_part: + _first_content_sent = True + yield f'data: {json.dumps({"delta": regular_part})}\n\n' + else: + # Still inside : route to thinking channel + if not _think_open_stripped: + # Strip the opening tag (first chunk only) + tag_end = stripped.lower().find(">") + if tag_end != -1: + content = stripped[tag_end + 1:] + _think_open_stripped = True + if content: + yield f'data: {json.dumps({"delta": content, "thinking": True})}\n\n' + else: + # Some thinking backends start normal content with a + # stray closing tag. Repair only that shape; do not + # wrap every first token for model families like + # MiniMax, which often stream ordinary answers. + if _thinking_model and not _first_content_sent and stripped.lower().startswith("… + # directly in the content field. Reasoning must surface as thinking chunks; + # only the answer after is a normal delta. + deltas = _run_stream( + "Qwopus3-9B-custom", # name not in _THINKING_MODEL_PATTERNS + [ + 'data: {"choices":[{"delta":{"content":"step one "}}]}', + 'data: {"choices":[{"delta":{"content":"step two"}}]}', + 'data: {"choices":[{"delta":{"content":"Final answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + thinking = [d for d in deltas if d.get("thinking")] + regular = [d for d in deltas if not d.get("thinking")] + assert thinking, f"expected thinking deltas, got: {deltas}" + assert all("Final answer" not in d["delta"] for d in thinking), thinking + assert regular, f"expected regular delta after , got: {deltas}" + assert any("Final answer" in d["delta"] for d in regular), regular + + +def test_think_tag_and_close_in_same_chunk(monkeypatch): + # reasoninganswer all arrive in a single content chunk. + deltas = _run_stream( + "Qwopus3-9B-custom", + [ + 'data: {"choices":[{"delta":{"content":"my reasoningmy answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + thinking = [d for d in deltas if d.get("thinking")] + regular = [d for d in deltas if not d.get("thinking")] + assert thinking and "my reasoning" in thinking[0]["delta"], thinking + assert regular and "my answer" in regular[0]["delta"], regular + + +def test_think_tag_gt_in_mid_reasoning_not_truncated(monkeypatch): + # Regression for _first_content_sent misuse: the opening-tag strip ran on every + # chunk (not just the first) because _first_content_sent stays False throughout + # the think block. On chunk 2 it did find(">") over reasoning text and silently + # dropped everything before the first ">". Repro: 3 chunks, ">" in chunk 2. + deltas = _run_stream( + "Qwopus3-9B-custom", + [ + 'data: {"choices":[{"delta":{"content":"reasoning a "}}]}', + 'data: {"choices":[{"delta":{"content":"more c > d "}}]}', + 'data: {"choices":[{"delta":{"content":"answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + thinking = [d for d in deltas if d.get("thinking")] + regular = [d for d in deltas if not d.get("thinking")] + # "more c " must survive — must not be truncated at the '>' + assert any("more c > d" in d["delta"] for d in thinking), thinking + assert any("answer" in d["delta"] for d in regular), regular + + +def test_registered_thinking_model_stray_close_tag_repair_unchanged(monkeypatch): + # The existing repair for registered models must not regress. + # A registered model that starts content with gets prepended. + deltas = _run_stream( + "qwq-32b", # registered in _THINKING_MODEL_PATTERNS + [ + 'data: {"choices":[{"delta":{"content":"Here is my answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + assert deltas, deltas + first = deltas[0]["delta"] + assert first.startswith(""), f"expected repair prefix, got: {first!r}"