From 5645cce6d0180529df3bc75b044404514829229f Mon Sep 17 00:00:00 2001 From: nsgds <161509862+nsgds@users.noreply.github.com> Date: Tue, 2 Jun 2026 10:48:17 +0800 Subject: [PATCH] Support vLLM 0.20.2 / NIM reasoning-parser output end-to-end (surface + agent context + render) (#602) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(stream): read 'reasoning' SSE field for vLLM 0.20.2 / NIM vLLM 0.20.2 / NVIDIA NIM emit reasoning-parser output in the `reasoning` delta field; older builds use `reasoning_content`. stream_llm() read only the latter, so reasoning from models like Nemotron-3-Nano (--reasoning-parser) was silently dropped and never rendered. Accept either field. Co-Authored-By: Claude Opus 4.8 (1M context) * fix(agent): keep reasoning_content only on the latest assistant turn The agent loop echoed each round's reasoning back as `reasoning_content` on every assistant turn, assuming vendors ignore it. Nemotron's chat template re-injects ALL prior reasoning_content as <think> blocks, and the loop is trimmed only once (before it starts) — so reasoning accumulated unbounded across rounds, bloating context and feeding the model its own prior reasoning, which reinforced repetition/looping. Strip reasoning_content from earlier assistant turns so only the most recent round carries it (still satisfies DeepSeek's thinking-mode follow-up requirement). Co-Authored-By: Claude Opus 4.8 (1M context) * fix(agent-ui): wrap each round's reasoning in its own <think> block The streamed think-tag wrapper gated on whole-message substring checks (accumulated.includes('<think>')), which only ever wrapped ONE reasoning block per message. A multi-round agent response has a reasoning phase per round, so once round 1 closed its <think>...</think>, rounds 2+ reasoning was emitted unwrapped and leaked into the visible answer. Replace the substring checks with a stateful open/close flag that toggles per think/answer cycle, so each round's reasoning gets its own collapsible block. 
Single-turn chat is unchanged (one open, one close). Co-Authored-By: Claude Opus 4.8 (1M context) * test(stream): reasoning/reasoning_content delta surfaces as thinking chunk Covers @pewdiepie-archdaemon's requested regression: a streamed {reasoning: ...} delta emits a thinking chunk while {content: ...} streams as normal content; plus the older reasoning_content field for backward compat. Mirrors the #591 scenario. Co-Authored-By: Claude Opus 4.8 (1M context) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- src/agent_loop.py | 14 ++++- src/llm_core.py | 4 +- static/js/chat.js | 15 +++-- tests/test_llm_core_reasoning.py | 98 ++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 7 deletions(-) create mode 100644 tests/test_llm_core_reasoning.py diff --git a/src/agent_loop.py b/src/agent_loop.py index 2a945ea..1b62f82 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1101,8 +1101,20 @@ def _append_tool_results( `round_reasoning` (DeepSeek / vLLM reasoning-parser deltas) is echoed back via `reasoning_content` on the assistant message — DeepSeek's API rejects follow-up requests in thinking mode that don't include the - prior reasoning. Other vendors ignore the extra field. + prior reasoning. + + NOTE: it is NOT universally ignored. Nemotron's chat template re-injects + EVERY prior `reasoning_content` as a <think> block, and this agent loop is + trimmed only once (before the loop), so across rounds the reasoning piles + up unbounded — bloating context and feeding the model its own prior + reasoning, which reinforces repetition/looping. So keep reasoning_content + on the MOST RECENT assistant turn only: enough for DeepSeek continuity, + without the per-round accumulation. """ + # Strip reasoning_content from earlier assistant turns; only the newest keeps it. 
+ for _m in messages: + if _m.get("role") == "assistant": + _m.pop("reasoning_content", None) if used_native and native_tool_calls: assistant_msg = {"role": "assistant"} # When the model emitted ONLY tool calls (no prose), content must be diff --git a/src/llm_core.py b/src/llm_core.py index ac8de73..00cff41 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1127,8 +1127,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl delta = j["choices"][0].get("delta") or {} if isinstance(delta, dict): # Text content - # Reasoning tokens (VLLM --reasoning-parser, e.g. Qwen3/DeepSeek-R1) - reasoning = delta.get("reasoning_content") or "" + # Reasoning tokens (VLLM --reasoning-parser, e.g. Qwen3/DeepSeek-R1, Nemotron). vLLM 0.20.2 / NIM emit the field as `reasoning`; older builds use `reasoning_content`. Accept either. + reasoning = delta.get("reasoning_content") or delta.get("reasoning") or "" if reasoning: yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n' content = delta.get("content") or "" diff --git a/static/js/chat.js b/static/js/chat.js index 4a7632c..969cf7e 100644 --- a/static/js/chat.js +++ b/static/js/chat.js @@ -512,6 +512,10 @@ import createResearchSynapse from './researchSynapse.js'; // Declare accumulated outside try block so it's accessible in catch let accumulated = ''; + // Are we currently inside an unclosed <think> block? Toggled per think/answer + // cycle so a multi-round agent response (one reasoning phase PER round) wraps each + // round's reasoning in its own <think> instead of leaking rounds 2+ as text. 
+ let _thinkOpen = false; let holder = null; let finalMeta = null; let finalModelName = null; @@ -1357,12 +1361,15 @@ import createResearchSynapse from './researchSynapse.js'; if (_threadAbove && _threadAbove.classList.contains('agent-thread') && !_threadAbove.classList.contains('has-bottom')) { _threadAbove.classList.add('has-bottom'); } - // VLLM reasoning tokens: wrap in <think> tags for the thinking UI + // VLLM reasoning tokens: wrap in <think> tags for the thinking UI. + // Stateful open/close (not a whole-message substring check) so each round + // of a multi-round agent response gets its own <think> — otherwise + // only round 1 is wrapped and rounds 2+ reasoning leaks into the answer. let _delta = json.delta; if (json.thinking) { - if (!accumulated.includes('<think>')) _delta = '<think>' + _delta; - } else if (accumulated.includes('<think>') && !accumulated.includes('</think>')) { - _delta = '</think>' + _delta; + if (!_thinkOpen) { _delta = '<think>' + _delta; _thinkOpen = true; } + } else if (_thinkOpen) { + _delta = '</think>' + _delta; _thinkOpen = false; } const wasEmpty = !accumulated; accumulated += _delta; diff --git a/tests/test_llm_core_reasoning.py b/tests/test_llm_core_reasoning.py new file mode 100644 index 0000000..35dafcc --- /dev/null +++ b/tests/test_llm_core_reasoning.py @@ -0,0 +1,98 @@ +"""Regression: a streamed `reasoning` delta (vLLM 0.20.2 / NIM / Ollama) must surface +as a thinking chunk, while a `content` delta still streams as normal content. Also +covers the older `reasoning_content` field name for backward compatibility. 
+""" +import asyncio +import json + +from src import llm_core + + +class _FakeResp: + status_code = 200 + + def __init__(self, lines): + self._lines = lines + + async def aiter_lines(self): + for ln in self._lines: + yield ln + + async def aread(self): # only used on non-200; present for safety + return b"" + + +class _FakeStreamCtx: + def __init__(self, lines): + self._lines = lines + + async def __aenter__(self): + return _FakeResp(self._lines) + + async def __aexit__(self, *exc): + return False + + +class _FakeClient: + def __init__(self, lines): + self._lines = lines + + def stream(self, *args, **kwargs): + return _FakeStreamCtx(self._lines) + + +def _run_stream(model, lines, monkeypatch): + """Drive stream_llm against a faked upstream and return parsed SSE payloads.""" + monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines)) + + async def _go(): + out = [] + async for chunk in llm_core.stream_llm( + "http://nim-nano:8000/v1/chat/completions", + model, + [{"role": "user", "content": "hi"}], + ): + out.append(chunk) + return out + + parsed = [] + for chunk in asyncio.run(_go()): + for raw in chunk.splitlines(): + raw = raw.strip() + if raw.startswith("data:"): + payload = raw[5:].strip() + if payload.startswith("{"): + try: + parsed.append(json.loads(payload)) + except json.JSONDecodeError: + pass + return [p for p in parsed if "delta" in p] + + +def test_reasoning_field_emits_thinking_chunk(monkeypatch): + deltas = _run_stream( + "nvidia/nemotron-3-nano", + [ + 'data: {"choices":[{"delta":{"reasoning":"weighing options"}}]}', + 'data: {"choices":[{"delta":{"content":"Hello"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + assert any(d.get("thinking") and "weighing options" in d["delta"] for d in deltas), deltas + assert any((not d.get("thinking")) and d["delta"] == "Hello" for d in deltas), deltas + + +def test_reasoning_content_field_still_supported(monkeypatch): + # Older builds emit `reasoning_content`; it must still surface as 
thinking. + deltas = _run_stream( + "some-thinking-model", + [ + 'data: {"choices":[{"delta":{"reasoning_content":"older field"}}]}', + 'data: {"choices":[{"delta":{"content":"Answer"}}]}', + "data: [DONE]", + ], + monkeypatch, + ) + assert any(d.get("thinking") and "older field" in d["delta"] for d in deltas), deltas + assert any((not d.get("thinking")) and d["delta"] == "Answer" for d in deltas), deltas