fix: stop deep research from running the prompt's example queries when the model echoes them (#1666)

2026-06-03 06:23:07 +01:00
parent 26d040d116
commit 8a0b79bc84
2 changed files with 81 additions and 2 deletions
--- a/src/deep_research.py
+++ b/src/deep_research.py
@@ -800,6 +800,17 @@ class DeepResearcher:
        except json.JSONDecodeError:
            pass

+        # Handle truncated arrays — e.g. '["query one", "query two", "query thr'
+        # Repair from the LAST array start so an echoed example array earlier
+        # in the reply is not harvested into the real query set.
+        last_start = text.rfind('[')
+        truncated = last_start != -1 and ']' not in text[last_start:]
+        if truncated:
+            complete_items = re.findall(r'"([^"]*)"', text[last_start:])
+            if complete_items:
+                logger.info(f"Repaired truncated JSON array: recovered {len(complete_items)} items")
+                return complete_items
+
        # Greedy match to capture the full outermost array
        match = re.search(r'\[[\s\S]*\]', text)
        if match:
@@ -810,8 +821,22 @@ class DeepResearcher:
            except json.JSONDecodeError:
                pass

-        # Handle truncated arrays — e.g. '["query one", "query two", "query thr'
-        # Try to find the start of an array and repair it
+        # Multiple complete arrays in one reply (e.g. the model echoes the
+        # prompt's Example: [...] before the real array). The greedy match
+        # above spans them all and fails to parse, so scan non-greedily and
+        # keep the LAST parseable array, which is the model's actual answer.
+        last_parsed = None
+        for m in re.finditer(r'\[[\s\S]*?\]', text):
+            try:
+                parsed = json.loads(m.group())
+                if isinstance(parsed, list):
+                    last_parsed = parsed
+            except json.JSONDecodeError:
+                continue
+        if last_parsed is not None:
+            return [str(item) for item in last_parsed]
+
+        # Last resort: harvest quoted strings from the first array start
        arr_start = text.find('[')
        if arr_start != -1:
            fragment = text[arr_start:]