From 4a84a895a0f95fbbaf9d2822440b850295454cf1 Mon Sep 17 00:00:00 2001
From: Mahdi Salmanzade <mahdisalmanzadehasl@gmail.com>
Date: Tue, 2 Jun 2026 06:17:41 +0400
Subject: [PATCH] Keep reasoning (thinking) tokens out of the saved chat reply
 (#856)

Streamed deltas flagged thinking:true (reasoning-model traces) were being folded
into full_response and persisted as part of the assistant message, so saved
replies were polluted with the model's chain-of-thought. Keep forwarding those
deltas to the client (for a live thinking indicator), but exclude them from the
accumulated saved reply (and from the partial passed to _stream_set), in both
the chat and research-stream paths. This mirrors the existing rewrite path's
handling.
---
 routes/chat_routes.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/routes/chat_routes.py b/routes/chat_routes.py
index d0da480..044d02b 100644
--- a/routes/chat_routes.py
+++ b/routes/chat_routes.py
@@ -708,8 +708,13 @@ def setup_chat_routes(
                             try:
                                 data = json.loads(chunk[6:])
                                 if "delta" in data:
-                                    full_response += data["delta"]
-                                    _stream_set(session, partial=full_response)
+                                    # Reasoning tokens arrive flagged thinking:true.
+                                    # Forward them so the client can show a thinking
+                                    # indicator, but don't fold them into the saved
+                                    # reply (mirrors the rewrite path below).
+                                    if not data.get("thinking"):
+                                        full_response += data["delta"]
+                                        _stream_set(session, partial=full_response)
                                     yield chunk
                                 elif data.get("type") == "usage":
                                     last_metrics = data.get("data", {})
@@ -805,8 +810,12 @@ def setup_chat_routes(
                             try:
                                 data = json.loads(chunk[6:])
                                 if "delta" in data:
-                                    full_response += data["delta"]
-                                    _stream_set(session, partial=full_response)
+                                    # Reasoning tokens arrive flagged thinking:true.
+                                    # Forward them for the live indicator, but keep
+                                    # them out of the saved reply (same as chat mode).
+                                    if not data.get("thinking"):
+                                        full_response += data["delta"]
+                                        _stream_set(session, partial=full_response)
                                     yield chunk
                                 elif data.get("type") == "web_sources":
                                     web_sources = data.get("data", [])