From 65751186bd6ac4208c6198a863fe777d519dbfba Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Tue, 2 Jun 2026 17:21:57 +0100 Subject: [PATCH] fix: merging consecutive user messages corrupts multimodal (image) content (#1277) * fix: preserve multimodal content blocks when merging consecutive user messages * test: consecutive user-message merge keeps multimodal image blocks --- src/llm_core.py | 38 +++++++++++++++++++++---- tests/test_sanitize_multimodal_merge.py | 28 ++++++++++++++++++ 2 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 tests/test_sanitize_multimodal_merge.py diff --git a/src/llm_core.py b/src/llm_core.py index 1e3d12a..5438967 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -576,6 +576,20 @@ def _parse_anthropic_response(data: dict) -> str: ) +def _as_content_blocks(content) -> List[Dict]: + """Coerce a message `content` into a list of content blocks. + + A list (multimodal: text + image parts) passes through; a non-empty string + becomes a single text block; None/empty yields no blocks. Used when merging + consecutive user messages so multimodal content isn't str()-ed away. + """ + if isinstance(content, list): + return content + if content: + return [{"type": "text", "text": str(content)}] + return [] + + def _sanitize_llm_messages(messages: List[Dict]) -> List[Dict]: """Strip Odysseus-only metadata before sending messages to providers. @@ -689,13 +703,25 @@ def _sanitize_llm_messages(messages: List[Dict]) -> List[Dict]: last = merged[-1] if last.get("role") == "user" and item.get("role") == "user": last_copy = dict(last) - last_str = str(last_copy.get("content")) if last_copy.get("content") is not None else "" - item_str = str(item.get("content")) if item.get("content") is not None else "" - new_content = "\n\n".join(part for part in (last_str, item_str) if part) - if new_content: - last_copy["content"] = new_content + lc = last_copy.get("content") + ic = item.get("content") + if isinstance(lc, list) or isinstance(ic, list): + # Preserve multimodal content blocks (e.g. an image part) by + # concatenating the block lists. str()-ing a list turned an + # image message into its Python repr and dropped the image. + merged_blocks = _as_content_blocks(lc) + _as_content_blocks(ic) + if merged_blocks: + last_copy["content"] = merged_blocks + else: + last_copy.pop("content", None) else: - last_copy.pop("content", None) + last_str = str(lc) if lc is not None else "" + item_str = str(ic) if ic is not None else "" + new_content = "\n\n".join(part for part in (last_str, item_str) if part) + if new_content: + last_copy["content"] = new_content + else: + last_copy.pop("content", None) merged[-1] = last_copy else: merged.append(item) diff --git a/tests/test_sanitize_multimodal_merge.py b/tests/test_sanitize_multimodal_merge.py new file mode 100644 index 0000000..1304f9c --- /dev/null +++ b/tests/test_sanitize_multimodal_merge.py @@ -0,0 +1,28 @@ +"""Regression: merging consecutive user messages must not str() multimodal content.""" + +from src.llm_core import _sanitize_llm_messages + + +def test_multimodal_user_message_keeps_image_block_when_merged(): + image_msg = {"role": "user", "content": [ + {"type": "text", "text": "look at this"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}, + ]} + tool_result = {"role": "user", "content": "Tool result: 42"} + out = _sanitize_llm_messages([image_msg, tool_result]) + + # The two consecutive user messages collapse into one... + assert len(out) == 1 + content = out[0]["content"] + # ...and the image block survives (it used to be str()-ed into a repr). + assert isinstance(content, list) + assert any(b.get("type") == "image_url" for b in content) + assert content[-1] == {"type": "text", "text": "Tool result: 42"} + + +def test_string_only_user_merge_unchanged(): + a = {"role": "user", "content": "first"} + b = {"role": "user", "content": "second"} + out = _sanitize_llm_messages([a, b]) + assert len(out) == 1 + assert out[0]["content"] == "first\n\nsecond"