fix: merging consecutive user messages corrupts multimodal (image) content (#1277)

* fix: preserve multimodal content blocks when merging consecutive user messages * test: consecutive user-message merge keeps multimodal image blocks
2026-06-02 17:21:57 +01:00
parent 83aa35b83e
commit 65751186bd
2 changed files with 60 additions and 6 deletions
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -576,6 +576,20 @@ def _parse_anthropic_response(data: dict) -> str:
    )


+def _as_content_blocks(content) -> List[Dict]:
+    """Coerce a message `content` into a list of content blocks.
+
+    A list (multimodal: text + image parts) passes through; a non-empty string
+    becomes a single text block; None/empty yields no blocks. Used when merging
+    consecutive user messages so multimodal content isn't str()-ed away.
+    """
+    if isinstance(content, list):
+        return content
+    if content:
+        return [{"type": "text", "text": str(content)}]
+    return []
+
+
 def _sanitize_llm_messages(messages: List[Dict]) -> List[Dict]:
    """Strip Odysseus-only metadata before sending messages to providers.

@@ -689,13 +703,25 @@ def _sanitize_llm_messages(messages: List[Dict]) -> List[Dict]:
        last = merged[-1]
        if last.get("role") == "user" and item.get("role") == "user":
            last_copy = dict(last)
-            last_str = str(last_copy.get("content")) if last_copy.get("content") is not None else ""
-            item_str = str(item.get("content")) if item.get("content") is not None else ""
-            new_content = "\n\n".join(part for part in (last_str, item_str) if part)
-            if new_content:
-                last_copy["content"] = new_content
+            lc = last_copy.get("content")
+            ic = item.get("content")
+            if isinstance(lc, list) or isinstance(ic, list):
+                # Preserve multimodal content blocks (e.g. an image part) by
+                # concatenating the block lists. str()-ing a list turned an
+                # image message into its Python repr and dropped the image.
+                merged_blocks = _as_content_blocks(lc) + _as_content_blocks(ic)
+                if merged_blocks:
+                    last_copy["content"] = merged_blocks
+                else:
+                    last_copy.pop("content", None)
            else:
-                last_copy.pop("content", None)
+                last_str = str(lc) if lc is not None else ""
+                item_str = str(ic) if ic is not None else ""
+                new_content = "\n\n".join(part for part in (last_str, item_str) if part)
+                if new_content:
+                    last_copy["content"] = new_content
+                else:
+                    last_copy.pop("content", None)
            merged[-1] = last_copy
        else:
            merged.append(item)
--- a/tests/test_sanitize_multimodal_merge.py
+++ b/tests/test_sanitize_multimodal_merge.py
@@ -0,0 +1,28 @@
+"""Regression: merging consecutive user messages must not str() multimodal content."""
+
+from src.llm_core import _sanitize_llm_messages
+
+
+def test_multimodal_user_message_keeps_image_block_when_merged():
+    image_msg = {"role": "user", "content": [
+        {"type": "text", "text": "look at this"},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
+    ]}
+    tool_result = {"role": "user", "content": "Tool result: 42"}
+    out = _sanitize_llm_messages([image_msg, tool_result])
+
+    # The two consecutive user messages collapse into one...
+    assert len(out) == 1
+    content = out[0]["content"]
+    # ...and the image block survives (it used to be str()-ed into a repr).
+    assert isinstance(content, list)
+    assert any(b.get("type") == "image_url" for b in content)
+    assert content[-1] == {"type": "text", "text": "Tool result: 42"}
+
+
+def test_string_only_user_merge_unchanged():
+    a = {"role": "user", "content": "first"}
+    b = {"role": "user", "content": "second"}
+    out = _sanitize_llm_messages([a, b])
+    assert len(out) == 1
+    assert out[0]["content"] == "first\n\nsecond"