diff --git a/src/llm_core.py b/src/llm_core.py
index 1995982..1baf184 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -1363,6 +1363,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
# can detect thinking-in-progress (some models output but no )
_thinking_model = _supports_thinking(model)
_first_content_sent = False
+ _in_think_tag = False # True while consuming content inside a think-tag block
+ _think_open_stripped = False # opening tag already removed
def _emit_tool_calls():
"""Build the tool_calls event string if any were accumulated."""
@@ -1444,14 +1446,53 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n'
content = delta.get("content") or ""
if content:
- # Some thinking backends start normal content with a
- # stray closing tag. Repair only that shape; do not
- # wrap every first token for model families like
- # MiniMax, which often stream ordinary answers.
- if _thinking_model and not _first_content_sent and content.lstrip().lower().startswith("" + content
- _first_content_sent = True
- yield f'data: {json.dumps({"delta": content})}\n\n'
+ stripped = content.lstrip()
+ # Auto-detect an opening think tag … closing think tag block in the content stream.
+ # Covers Qwen3-derived models (Qwopus, QwQ forks) whose
+ # names don't match _THINKING_MODEL_PATTERNS but still
+ # emit literal think-tag markup via llama.cpp --jinja.
+ if not _first_content_sent and not _thinking_model and not _in_think_tag and stripped.lower().startswith("")
+ if close_idx != -1:
+ # Split: up to the closing tag → thinking, remainder → content
+ think_part = content[:close_idx]
+ if not _think_open_stripped:
+ # Strip the opening tag from the first chunk.
+ # Use a dedicated flag — _first_content_sent stays False
+ # throughout the think block, so it must not be reused.
+ tag_end = think_part.lower().find(">")
+ if tag_end != -1:
+ think_part = think_part[tag_end + 1:]
+ _think_open_stripped = True
+ regular_part = content[close_idx + len(""):]
+ _in_think_tag = False
+ if think_part:
+ yield f'data: {json.dumps({"delta": think_part, "thinking": True})}\n\n'
+ if regular_part:
+ _first_content_sent = True
+ yield f'data: {json.dumps({"delta": regular_part})}\n\n'
+ else:
+ # Still inside the think block: route to thinking channel
+ if not _think_open_stripped:
+ # Strip the opening tag (first chunk only)
+ tag_end = stripped.lower().find(">")
+ if tag_end != -1:
+ content = stripped[tag_end + 1:]
+ _think_open_stripped = True
+ if content:
+ yield f'data: {json.dumps({"delta": content, "thinking": True})}\n\n'
+ else:
+ # Some thinking backends start normal content with a
+ # stray closing tag. Repair only that shape; do not
+ # wrap every first token for model families like
+ # MiniMax, which often stream ordinary answers.
+ if _thinking_model and not _first_content_sent and stripped.lower().startswith("" + content
+ _first_content_sent = True
+ yield f'data: {json.dumps({"delta": content})}\n\n'
# Native tool calls — accumulate across chunks
for tc in delta.get("tool_calls") or []:
if tc is None:
diff --git a/tests/test_llm_core_reasoning.py b/tests/test_llm_core_reasoning.py
index 35dafcc..03ce194 100644
--- a/tests/test_llm_core_reasoning.py
+++ b/tests/test_llm_core_reasoning.py
@@ -96,3 +96,79 @@ def test_reasoning_content_field_still_supported(monkeypatch):
)
assert any(d.get("thinking") and "older field" in d["delta"] for d in deltas), deltas
assert any((not d.get("thinking")) and d["delta"] == "Answer" for d in deltas), deltas
+
+
+def test_think_tag_in_content_stream_routes_to_thinking_channel(monkeypatch):
+    # Regression: unregistered (Qwopus-style) model emits think tags (spelled with
+    # \u003c/\u003e escapes) in the content field. Reasoning must surface as
+    # thinking chunks; only the answer after the closing tag is a normal delta.
+    deltas = _run_stream(
+        "Qwopus3-9B-custom",  # name not in _THINKING_MODEL_PATTERNS
+        [
+            'data: {"choices":[{"delta":{"content":"\u003cthink\u003estep one "}}]}',
+            'data: {"choices":[{"delta":{"content":"step two"}}]}',
+            'data: {"choices":[{"delta":{"content":"\u003c/think\u003eFinal answer"}}]}',
+            "data: [DONE]",
+        ],
+        monkeypatch,
+    )
+    thinking = [d for d in deltas if d.get("thinking")]
+    regular = [d for d in deltas if not d.get("thinking")]
+    assert thinking, f"expected thinking deltas, got: {deltas}"
+    assert all("Final answer" not in d["delta"] for d in thinking), thinking
+    assert regular, f"expected regular delta after the closing tag, got: {deltas}"
+    assert any("Final answer" in d["delta"] for d in regular), regular
+
+
+def test_think_tag_and_close_in_same_chunk(monkeypatch):
+    # Open tag, reasoning, close tag and answer arrive in one chunk (tags as \u003c/\u003e escapes).
+    deltas = _run_stream(
+        "Qwopus3-9B-custom",
+        [
+            'data: {"choices":[{"delta":{"content":"\u003cthink\u003emy reasoning\u003c/think\u003emy answer"}}]}',
+            "data: [DONE]",
+        ],
+        monkeypatch,
+    )
+    thinking = [d for d in deltas if d.get("thinking")]
+    regular = [d for d in deltas if not d.get("thinking")]
+    assert thinking and "my reasoning" in thinking[0]["delta"], thinking
+    assert regular and "my answer" in regular[0]["delta"], regular
+
+
+def test_think_tag_gt_in_mid_reasoning_not_truncated(monkeypatch):
+    # Regression for _first_content_sent misuse: the opening-tag strip ran on every
+    # chunk (not just the first) because _first_content_sent stays False throughout
+    # the think block. On chunk 2 it did find(">") over reasoning text and silently
+    # dropped everything before the first ">". Repro: 3 chunks, ">" in chunk 2.
+    deltas = _run_stream(
+        "Qwopus3-9B-custom",
+        [
+            'data: {"choices":[{"delta":{"content":"\u003cthink\u003ereasoning a "}}]}',
+            'data: {"choices":[{"delta":{"content":"more c > d "}}]}',
+            'data: {"choices":[{"delta":{"content":"\u003c/think\u003eanswer"}}]}',
+            "data: [DONE]",
+        ],
+        monkeypatch,
+    )
+    thinking = [d for d in deltas if d.get("thinking")]
+    regular = [d for d in deltas if not d.get("thinking")]
+    # The whole of chunk 2 must survive; it must not be truncated at the '>'.
+    assert any("more c > d" in d["delta"] for d in thinking), thinking
+    assert any("answer" in d["delta"] for d in regular), regular
+
+
+def test_registered_thinking_model_stray_close_tag_repair_unchanged(monkeypatch):
+    # The existing repair for registered models must not regress: content that
+    # starts with a stray closing think tag gets the opening tag prepended.
+    deltas = _run_stream(
+        "qwq-32b",  # registered in _THINKING_MODEL_PATTERNS
+        [
+            'data: {"choices":[{"delta":{"content":"\u003c/think\u003eHere is my answer"}}]}',
+            "data: [DONE]",
+        ],
+        monkeypatch,
+    )
+    assert deltas, deltas
+    first = deltas[0]["delta"]
+    assert first.startswith("\u003cthink\u003e"), f"expected repair prefix, got: {first!r}"