From 65751186bd6ac4208c6198a863fe777d519dbfba Mon Sep 17 00:00:00 2001
From: Afonso Coutinho <afonso@omelhorsite.pt>
Date: Tue, 2 Jun 2026 17:21:57 +0100
Subject: [PATCH] fix: merging consecutive user messages corrupts multimodal
 (image) content (#1277)

* fix: preserve multimodal content blocks when merging consecutive user messages

* test: consecutive user-message merge keeps multimodal image blocks
---
 src/llm_core.py                         | 38 +++++++++++++++++++++----
 tests/test_sanitize_multimodal_merge.py | 28 ++++++++++++++++++
 2 files changed, 60 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_sanitize_multimodal_merge.py

diff --git a/src/llm_core.py b/src/llm_core.py
index 1e3d12a..5438967 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -576,6 +576,20 @@ def _parse_anthropic_response(data: dict) -> str:
     )
 
 
+def _as_content_blocks(content) -> List[Dict]:
+    """Coerce a message `content` into a list of content blocks.
+
+    A list (multimodal: text + image parts) passes through; a non-empty string
+    becomes a single text block; None/empty yields no blocks. Used when merging
+    consecutive user messages so multimodal content isn't str()-ed away.
+    """
+    if isinstance(content, list):
+        return content
+    if content:
+        return [{"type": "text", "text": str(content)}]
+    return []
+
+
 def _sanitize_llm_messages(messages: List[Dict]) -> List[Dict]:
     """Strip Odysseus-only metadata before sending messages to providers.
 
@@ -689,13 +703,25 @@ def _sanitize_llm_messages(messages: List[Dict]) -> List[Dict]:
         last = merged[-1]
         if last.get("role") == "user" and item.get("role") == "user":
             last_copy = dict(last)
-            last_str = str(last_copy.get("content")) if last_copy.get("content") is not None else ""
-            item_str = str(item.get("content")) if item.get("content") is not None else ""
-            new_content = "\n\n".join(part for part in (last_str, item_str) if part)
-            if new_content:
-                last_copy["content"] = new_content
+            lc = last_copy.get("content")
+            ic = item.get("content")
+            if isinstance(lc, list) or isinstance(ic, list):
+                # Preserve multimodal content blocks (e.g. an image part) by
+                # concatenating the block lists. str()-ing a list turned an
+                # image message into its Python repr and dropped the image.
+                merged_blocks = _as_content_blocks(lc) + _as_content_blocks(ic)
+                if merged_blocks:
+                    last_copy["content"] = merged_blocks
+                else:
+                    last_copy.pop("content", None)
             else:
-                last_copy.pop("content", None)
+                last_str = str(lc) if lc is not None else ""
+                item_str = str(ic) if ic is not None else ""
+                new_content = "\n\n".join(part for part in (last_str, item_str) if part)
+                if new_content:
+                    last_copy["content"] = new_content
+                else:
+                    last_copy.pop("content", None)
             merged[-1] = last_copy
         else:
             merged.append(item)
diff --git a/tests/test_sanitize_multimodal_merge.py b/tests/test_sanitize_multimodal_merge.py
new file mode 100644
index 0000000..1304f9c
--- /dev/null
+++ b/tests/test_sanitize_multimodal_merge.py
@@ -0,0 +1,28 @@
+"""Regression: merging consecutive user messages must not str() multimodal content."""
+
+from src.llm_core import _sanitize_llm_messages
+
+
+def test_multimodal_user_message_keeps_image_block_when_merged():
+    image_msg = {"role": "user", "content": [
+        {"type": "text", "text": "look at this"},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
+    ]}
+    tool_result = {"role": "user", "content": "Tool result: 42"}
+    out = _sanitize_llm_messages([image_msg, tool_result])
+
+    # The two consecutive user messages collapse into one...
+    assert len(out) == 1
+    content = out[0]["content"]
+    # ...and the image block survives (it used to be str()-ed into a repr).
+    assert isinstance(content, list)
+    assert any(b.get("type") == "image_url" for b in content)
+    assert content[-1] == {"type": "text", "text": "Tool result: 42"}
+
+
+def test_string_only_user_merge_unchanged():
+    a = {"role": "user", "content": "first"}
+    b = {"role": "user", "content": "second"}
+    out = _sanitize_llm_messages([a, b])
+    assert len(out) == 1
+    assert out[0]["content"] == "first\n\nsecond"