Cap inline attachment context across files (#1498)

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
2026-06-03 00:23:43 -05:00
parent a91321d1d8
commit 8c4ea484a9
2 changed files with 124 additions and 0 deletions
--- a/src/document_processor.py
+++ b/src/document_processor.py
@@ -12,6 +12,9 @@ from src.llm_core import llm_call
 logger = logging.getLogger(__name__)
 # Shared budget, in characters, for extracted attachment text appended inline
 # to a single user turn. build_user_content starts each call with this many
 # characters and threads the remainder through _fit_inline_attachment_text.
 # Truncation/omission markers are appended on top of this budget.
 MAX_INLINE_ATTACHMENT_CHARS = 24000
 # When an attachment does not fit and fewer than this many characters of the
 # shared budget remain, the attachment is replaced by an "omitted" notice
 # instead of being cut down to a very small slice.
 MIN_INLINE_ATTACHMENT_SLICE = 500
 def _is_text_file(path: str) -> bool:
    """Check if file has text extension."""
@@ -160,6 +163,41 @@ def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
    return text, ""
 def _fit_inline_attachment_text(
    text: str,
    remaining: int,
    display_name: str,
 ) -> tuple[str, int]:
    """Fit extracted attachment text into the shared inline attachment budget.
    Individual processors already cap single files, but multi-file batches can
    still add N capped bodies to one user turn. Keep the first files readable,
    keep later files visible by name, and mark exactly where inline content was
    reduced so the model does not silently miss attachments.
    """
    text = text or ""
    if len(text) <= remaining:
        return text, remaining - len(text)
    name = os.path.basename(display_name or "attachment")
    if remaining < MIN_INLINE_ATTACHMENT_SLICE:
        return (
            f"\n\n[Attachment omitted from inline context: {name}. "
            f"The {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
            "attachment budget was already used by earlier attachments. Ask "
            "to inspect this file specifically if more detail is needed.]",
            0,
        )
    marker = (
        f"\n\n[Attachment content truncated: {name}. "
        f"Only {remaining:,} characters of this attachment fit within "
        f"the {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
        "attachment budget. Ask to inspect this file specifically if more "
        "detail is needed.]"
    )
    return text[:remaining] + marker, 0
 def _process_office_document(path: str, display_name: str) -> str:
    """Extract an Office/EPUB document to Markdown via the optional markitdown dep.
@@ -323,6 +361,7 @@ def build_user_content(
    frontend can switch to the new doc immediately.
    """
    content = [{"type": "text", "text": text}]
    inline_attachment_remaining = MAX_INLINE_ATTACHMENT_CHARS
    for fid in attachment_ids or []:
        upload_info = (resolved_uploads or {}).get(fid)
@@ -483,6 +522,11 @@ def build_user_content(
            else:
                extracted_text = _process_office_document(path, display_name)
            extracted_text, inline_attachment_remaining = _fit_inline_attachment_text(
                extracted_text,
                inline_attachment_remaining,
                display_name,
            )
            if content and content[0]["type"] == "text":
                content[0]["text"] += extracted_text
            else:
--- a/tests/test_document_processor_attachment_budget.py
+++ b/tests/test_document_processor_attachment_budget.py
@@ -0,0 +1,80 @@
 from pathlib import Path
 class _UploadHandler:
    def __init__(self, uploads):
        self.uploads = uploads
    def resolve_upload(self, fid, owner=None):
        return self.uploads.get(fid)
    def _inside_upload_dir(self, path):
        return True
    def is_image_file(self, display_name, mime):
        return False
    def is_audio_file(self, display_name, mime):
        return False
    def is_document_file(self, display_name, mime):
        return True
 def _text_upload(tmp_path: Path, fid: str, body: str):
    path = tmp_path / f"{fid}.txt"
    path.write_text(body, encoding="utf-8")
    return {
        "path": str(path),
        "name": path.name,
        "mime": "text/plain",
    }
 def test_multifile_inline_attachment_budget_keeps_later_files_visible(tmp_path, monkeypatch):
     """Files that do not fit the shared budget must still be named in the output.

     With the budget shrunk to 1200 characters and three ~1000-character text
     files attached, the first file is expected inline while the second and
     third are expected to appear only as "omitted" notices.
     """
     import src.document_processor as dp

     # Shrink the limits so small fixtures are enough to exhaust the budget.
     for constant, value in (
         ("MAX_INLINE_ATTACHMENT_CHARS", 1200),
         ("MIN_INLINE_ATTACHMENT_SLICE", 200),
     ):
         monkeypatch.setattr(dp, constant, value)

     first = _text_upload(tmp_path, "a", "alpha\n" + ("A" * 1000))
     second = _text_upload(tmp_path, "b", "bravo\n" + ("B" * 1000))
     third = _text_upload(tmp_path, "c", "charlie\n" + ("C" * 1000))
     handler = _UploadHandler({"a": first, "b": second, "c": third})

     content = dp.build_user_content(
         "How many files do you see?",
         ["a", "b", "c"],
         str(tmp_path),
         handler,
         owner="tester",
     )

     # The first file is inlined; the last one must not be.
     assert "=== File: a.txt ===" in content
     assert "=== File: c.txt ===" not in content
     # Both later files stay visible by name, with a hint on how to get more.
     for expected in (
         "Attachment omitted from inline context: b.txt",
         "Attachment omitted from inline context: c.txt",
         "Ask to inspect this file specifically",
     ):
         assert expected in content
     # Overall size stays bounded even after the notices are added.
     assert len(content) < 2200
 def test_inline_attachment_budget_does_not_truncate_small_batches(tmp_path, monkeypatch):
     """A batch that fits comfortably in the budget must pass through uncut.

     Two five-character files are attached under a 5000-character budget; both
     are expected inline and no truncation notice is expected.
     """
     import src.document_processor as dp

     monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 5000)

     handler = _UploadHandler(
         {
             "a": _text_upload(tmp_path, "a", "alpha"),
             "b": _text_upload(tmp_path, "b", "bravo"),
         }
     )
     content = dp.build_user_content(
         "Summarize these.",
         ["a", "b"],
         str(tmp_path),
         handler,
         owner="tester",
     )

     # Both files are present inline.
     for header in ("=== File: a.txt ===", "=== File: b.txt ==="):
         assert header in content
     # Nothing was cut.
     assert "Attachment content truncated" not in content