From 4a84a895a0f95fbbaf9d2822440b850295454cf1 Mon Sep 17 00:00:00 2001 From: Mahdi Salmanzade Date: Tue, 2 Jun 2026 06:17:41 +0400 Subject: [PATCH] Keep reasoning (thinking) tokens out of the saved chat reply (#856) Streamed deltas flagged thinking:true (reasoning-model traces) were being folded into full_response and persisted as part of the assistant message, so saved replies were polluted with the model's chain-of-thought. Forward those deltas to the client (for a live thinking indicator) but exclude them from the accumulated saved reply, in both chat and research-stream paths. Mirrors the existing rewrite path's handling. --- routes/chat_routes.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/routes/chat_routes.py b/routes/chat_routes.py index d0da480..044d02b 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -708,8 +708,13 @@ def setup_chat_routes( try: data = json.loads(chunk[6:]) if "delta" in data: - full_response += data["delta"] - _stream_set(session, partial=full_response) + # Reasoning tokens arrive flagged thinking:true. + # Forward them so the client can show a thinking + # indicator, but don't fold them into the saved + # reply (mirrors the rewrite path below). + if not data.get("thinking"): + full_response += data["delta"] + _stream_set(session, partial=full_response) yield chunk elif data.get("type") == "usage": last_metrics = data.get("data", {}) @@ -805,8 +810,12 @@ def setup_chat_routes( try: data = json.loads(chunk[6:]) if "delta" in data: - full_response += data["delta"] - _stream_set(session, partial=full_response) + # Reasoning tokens arrive flagged thinking:true. + # Forward them for the live indicator, but keep + # them out of the saved reply (same as chat mode). + if not data.get("thinking"): + full_response += data["delta"] + _stream_set(session, partial=full_response) yield chunk elif data.get("type") == "web_sources": web_sources = data.get("data", [])