Cap inline attachment context across files (#1498)

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
2026-06-03 00:23:43 -05:00
parent a91321d1d8
commit 8c4ea484a9
2 changed files with 124 additions and 0 deletions
--- a/src/document_processor.py
+++ b/src/document_processor.py
@@ -12,6 +12,9 @@ from src.llm_core import llm_call

 logger = logging.getLogger(__name__)

+MAX_INLINE_ATTACHMENT_CHARS = 24000  # shared inline-text budget per build_user_content call
+MIN_INLINE_ATTACHMENT_SLICE = 500  # with less budget than this left, omit the file instead of slicing it
+

 def _is_text_file(path: str) -> bool:
    """Check if file has text extension."""
@@ -160,6 +163,41 @@ def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
    return text, ""


+def _fit_inline_attachment_text(
+    text: str,
+    remaining: int,
+    display_name: str,
+) -> tuple[str, int]:
+    """Charge one attachment's extracted text to the shared inline budget.
+
+    Per-file caps alone still let a multi-file batch add N capped bodies to one
+    user turn, so every file draws on ``remaining``. Returns the text to inline
+    and the budget left: text that fits passes through; otherwise the budget
+    drops to 0 and the text is cut, or omitted entirely, behind a named marker.
+    """
+    body = text or ""
+    overflow = len(body) - remaining
+    if overflow <= 0:
+        return body, -overflow
+
+    label = os.path.basename(display_name or "attachment")
+    budget = f"{MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline attachment budget"
+    hint = "Ask to inspect this file specifically if more detail is needed.]"
+    if remaining >= MIN_INLINE_ATTACHMENT_SLICE:
+        # Keep the readable prefix and state how much of the file made it in.
+        notice = (
+            f"\n\n[Attachment content truncated: {label}. Only {remaining:,} characters "
+            f"of this attachment fit within the {budget}. {hint}"
+        )
+        return body[:remaining] + notice, 0
+    # Too little room for a useful slice: keep the file visible by name only.
+    notice = (
+        f"\n\n[Attachment omitted from inline context: {label}. "
+        f"The {budget} was already used by earlier attachments. {hint}"
+    )
+    return notice, 0
+
+
 def _process_office_document(path: str, display_name: str) -> str:
    """Extract an Office/EPUB document to Markdown via the optional markitdown dep.

@@ -323,6 +361,7 @@ def build_user_content(
    frontend can switch to the new doc immediately.
    """
    content = [{"type": "text", "text": text}]
+    inline_attachment_remaining = MAX_INLINE_ATTACHMENT_CHARS

    for fid in attachment_ids or []:
        upload_info = (resolved_uploads or {}).get(fid)
@@ -483,6 +522,11 @@ def build_user_content(
            else:
                extracted_text = _process_office_document(path, display_name)

+            extracted_text, inline_attachment_remaining = _fit_inline_attachment_text(
+                extracted_text,
+                inline_attachment_remaining,
+                display_name,
+            )
            if content and content[0]["type"] == "text":
                content[0]["text"] += extracted_text
            else:
--- a/tests/test_document_processor_attachment_budget.py
+++ b/tests/test_document_processor_attachment_budget.py
@@ -0,0 +1,80 @@
+from pathlib import Path
+
+
+class _UploadHandler:  # minimal stand-in for the upload handler passed to build_user_content
+    def __init__(self, uploads):  # uploads: dict mapping attachment id -> upload record
+        self.uploads = uploads
+
+    def resolve_upload(self, fid, owner=None):  # owner is accepted but not checked
+        return self.uploads.get(fid)  # None for an unknown id
+
+    def _inside_upload_dir(self, path):  # every path is treated as inside the upload dir
+        return True
+
+    def is_image_file(self, display_name, mime):  # never classified as an image
+        return False
+
+    def is_audio_file(self, display_name, mime):  # never classified as audio
+        return False
+
+    def is_document_file(self, display_name, mime):  # always classified as a document
+        return True
+
+
+def _text_upload(tmp_path: Path, fid: str, body: str):
+    """Write ``body`` to ``<fid>.txt`` under ``tmp_path`` and return its upload record."""
+    target = tmp_path / f"{fid}.txt"
+    target.write_text(body, encoding="utf-8")
+    record = {"path": str(target), "name": target.name}
+    record["mime"] = "text/plain"
+    # This record is what _UploadHandler.resolve_upload() returns for ``fid``.
+    return record
+
+
+def test_multifile_inline_attachment_budget_keeps_later_files_visible(tmp_path, monkeypatch):
+    """Three ~1k-char files against a 1,200-char budget: b and c get omission markers."""
+    import src.document_processor as dp
+
+    # Shrink both limits so three small files are enough to exhaust the budget.
+    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 1200)
+    monkeypatch.setattr(dp, "MIN_INLINE_ATTACHMENT_SLICE", 200)
+    samples = (("a", "alpha", "A"), ("b", "bravo", "B"), ("c", "charlie", "C"))
+    uploads = {
+        fid: _text_upload(tmp_path, fid, f"{word}\n" + fill * 1000)
+        for fid, word, fill in samples
+    }
+
+    content = dp.build_user_content(
+        "How many files do you see?", ["a", "b", "c"], str(tmp_path),
+        _UploadHandler(uploads), owner="tester",
+    )
+
+    # The first file is inlined; the later ones must still be reported by name.
+    assert "=== File: a.txt ===" in content
+    assert "=== File: c.txt ===" not in content
+    for skipped in ("b.txt", "c.txt"):
+        assert f"Attachment omitted from inline context: {skipped}" in content
+    assert "Ask to inspect this file specifically" in content
+    assert len(content) < 2200
+
+
+def test_inline_attachment_budget_does_not_truncate_small_batches(tmp_path, monkeypatch):
+    """Two tiny files under a 5,000-char budget are inlined with no truncation marker."""
+    import src.document_processor as dp
+
+    # 5,000 characters is far more than the two five-character bodies need.
+    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 5000)
+    uploads = {
+        fid: _text_upload(tmp_path, fid, body)
+        for fid, body in (("a", "alpha"), ("b", "bravo"))
+    }
+
+    content = dp.build_user_content(
+        "Summarize these.", ["a", "b"], str(tmp_path),
+        _UploadHandler(uploads), owner="tester",
+    )
+
+    # Both file headers must be present and nothing may be flagged as cut.
+    for shown in ("a.txt", "b.txt"):
+        assert f"=== File: {shown} ===" in content
+    assert "Attachment content truncated" not in content