# Shared character budget for attachment text inlined into a single user turn.
MAX_INLINE_ATTACHMENT_CHARS = 24000
# When less than this much budget is left, an oversized attachment is omitted
# (named only) instead of being cut down to a uselessly small slice.
MIN_INLINE_ATTACHMENT_SLICE = 500


def _fit_inline_attachment_text(
    text: str,
    remaining: int,
    display_name: str,
) -> tuple[str, int]:
    """Fit extracted attachment text into the shared inline attachment budget.

    Individual processors already cap single files, but multi-file batches can
    still add N capped bodies to one user turn. Keep the first files readable,
    keep later files visible by name, and mark exactly where inline content was
    reduced so the model does not silently miss attachments.

    Args:
        text: Extracted attachment text; ``None`` or empty is treated as "".
        remaining: Characters still available in the shared budget. Negative
            values are treated as an exhausted budget.
        display_name: Attachment name shown in the notice. Only its basename
            is used; a missing or directory-like name becomes "attachment".

    Returns:
        ``(inline_text, new_remaining)``. ``inline_text`` is one of:

        * the unchanged text, when it fits (the budget shrinks by its length);
        * the first ``remaining`` characters plus a truncation notice, when at
          least ``MIN_INLINE_ATTACHMENT_SLICE`` characters are left (the
          budget drops to 0);
        * an omission notice only, when less than that is left (the budget is
          returned unchanged, because no attachment text was inlined).

        Notices are appended on top of the budget; only attachment text is
        counted against it.
    """
    text = text or ""
    # An overdrawn budget is simply exhausted: empty text must still pass
    # through untouched, and the slice below must never count from the end.
    remaining = max(remaining, 0)
    if len(text) <= remaining:
        return text, remaining - len(text)

    # basename() is "" for names ending in a path separator, so fall back
    # after stripping the directory rather than before.
    name = os.path.basename(display_name or "") or "attachment"
    if remaining < MIN_INLINE_ATTACHMENT_SLICE:
        omitted_notice = (
            f"\n\n[Attachment omitted from inline context: {name}. "
            f"The {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
            "attachment budget was already used by earlier attachments. Ask "
            "to inspect this file specifically if more detail is needed.]"
        )
        # Nothing from this file was inlined, so hand the leftover budget
        # back: a later small attachment may still fit in full, which costs
        # less context than another omission notice.
        return omitted_notice, remaining

    truncated_notice = (
        f"\n\n[Attachment content truncated: {name}. "
        f"Only {remaining:,} characters of this attachment fit within "
        f"the {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
        "attachment budget. Ask to inspect this file specifically if more "
        "detail is needed.]"
    )
    return text[:remaining] + truncated_notice, 0
from pathlib import Path


class _UploadHandler:
    """In-memory stand-in for the upload handler given to build_user_content.

    Uploads are looked up in a plain dict, every path is accepted as being
    inside the upload directory, and every file is classified as a document
    (never as an image or audio file).
    """

    def __init__(self, uploads):
        # Mapping of upload id -> upload info dict (see _text_upload).
        self.uploads = uploads

    def resolve_upload(self, fid, owner=None):
        """Return the stored info for *fid*, or None when it is unknown."""
        return self.uploads.get(fid)

    def _inside_upload_dir(self, path):
        """Accept every path."""
        return True

    def is_image_file(self, display_name, mime):
        """Never classify a file as an image."""
        return False

    def is_audio_file(self, display_name, mime):
        """Never classify a file as audio."""
        return False

    def is_document_file(self, display_name, mime):
        """Classify every file as a document."""
        return True


def _text_upload(tmp_path: Path, fid: str, body: str):
    """Write *body* to ``<tmp_path>/<fid>.txt`` and return its upload info."""
    target = tmp_path / f"{fid}.txt"
    target.write_text(body, encoding="utf-8")
    return dict(path=str(target), name=target.name, mime="text/plain")


def test_multifile_inline_attachment_budget_keeps_later_files_visible(tmp_path, monkeypatch):
    """Files past the shared budget are named in a notice, not inlined."""
    import src.document_processor as dp

    # Shrink the budget so three ~1000-character files overflow it.
    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 1200)
    monkeypatch.setattr(dp, "MIN_INLINE_ATTACHMENT_SLICE", 200)
    bodies = {
        "a": "alpha\n" + ("A" * 1000),
        "b": "bravo\n" + ("B" * 1000),
        "c": "charlie\n" + ("C" * 1000),
    }
    uploads = {fid: _text_upload(tmp_path, fid, body) for fid, body in bodies.items()}

    handler = _UploadHandler(uploads)
    content = dp.build_user_content(
        "How many files do you see?",
        list(uploads),
        str(tmp_path),
        handler,
        owner="tester",
    )

    assert "=== File: a.txt ===" in content
    assert "=== File: c.txt ===" not in content
    assert "Attachment omitted from inline context: b.txt" in content
    assert "Attachment omitted from inline context: c.txt" in content
    assert "Ask to inspect this file specifically" in content
    assert len(content) < 2200


def test_inline_attachment_budget_does_not_truncate_small_batches(tmp_path, monkeypatch):
    """Batches that fit within the budget are inlined without any notice."""
    import src.document_processor as dp

    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 5000)
    bodies = {"a": "alpha", "b": "bravo"}
    uploads = {fid: _text_upload(tmp_path, fid, body) for fid, body in bodies.items()}

    handler = _UploadHandler(uploads)
    content = dp.build_user_content(
        "Summarize these.",
        list(uploads),
        str(tmp_path),
        handler,
        owner="tester",
    )

    assert "=== File: a.txt ===" in content
    assert "=== File: b.txt ===" in content
    assert "Attachment content truncated" not in content