Cap inline attachment context across files (#1498)
Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,9 @@ from src.llm_core import llm_call
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Shared budget, in characters of extracted text, that all attachments handled
# by one build_user_content call may add inline (consumed file by file in
# _fit_inline_attachment_text).
MAX_INLINE_ATTACHMENT_CHARS = 24000
|
||||
# When fewer than this many characters of the shared budget remain, an
# attachment that does not fit is replaced by an omission notice instead of
# being cut down to a fragment.
MIN_INLINE_ATTACHMENT_SLICE = 500
|
||||
|
||||
|
||||
def _is_text_file(path: str) -> bool:
|
||||
"""Check if file has text extension."""
|
||||
@@ -160,6 +163,41 @@ def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
|
||||
return text, ""
|
||||
|
||||
|
||||
def _fit_inline_attachment_text(
|
||||
text: str,
|
||||
remaining: int,
|
||||
display_name: str,
|
||||
) -> tuple[str, int]:
|
||||
"""Fit extracted attachment text into the shared inline attachment budget.
|
||||
|
||||
Individual processors already cap single files, but multi-file batches can
|
||||
still add N capped bodies to one user turn. Keep the first files readable,
|
||||
keep later files visible by name, and mark exactly where inline content was
|
||||
reduced so the model does not silently miss attachments.
|
||||
"""
|
||||
text = text or ""
|
||||
if len(text) <= remaining:
|
||||
return text, remaining - len(text)
|
||||
|
||||
name = os.path.basename(display_name or "attachment")
|
||||
if remaining < MIN_INLINE_ATTACHMENT_SLICE:
|
||||
return (
|
||||
f"\n\n[Attachment omitted from inline context: {name}. "
|
||||
f"The {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
|
||||
"attachment budget was already used by earlier attachments. Ask "
|
||||
"to inspect this file specifically if more detail is needed.]",
|
||||
0,
|
||||
)
|
||||
marker = (
|
||||
f"\n\n[Attachment content truncated: {name}. "
|
||||
f"Only {remaining:,} characters of this attachment fit within "
|
||||
f"the {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
|
||||
"attachment budget. Ask to inspect this file specifically if more "
|
||||
"detail is needed.]"
|
||||
)
|
||||
return text[:remaining] + marker, 0
|
||||
|
||||
|
||||
def _process_office_document(path: str, display_name: str) -> str:
|
||||
"""Extract an Office/EPUB document to Markdown via the optional markitdown dep.
|
||||
|
||||
@@ -323,6 +361,7 @@ def build_user_content(
|
||||
frontend can switch to the new doc immediately.
|
||||
"""
|
||||
content = [{"type": "text", "text": text}]
|
||||
inline_attachment_remaining = MAX_INLINE_ATTACHMENT_CHARS
|
||||
|
||||
for fid in attachment_ids or []:
|
||||
upload_info = (resolved_uploads or {}).get(fid)
|
||||
@@ -483,6 +522,11 @@ def build_user_content(
|
||||
else:
|
||||
extracted_text = _process_office_document(path, display_name)
|
||||
|
||||
extracted_text, inline_attachment_remaining = _fit_inline_attachment_text(
|
||||
extracted_text,
|
||||
inline_attachment_remaining,
|
||||
display_name,
|
||||
)
|
||||
if content and content[0]["type"] == "text":
|
||||
content[0]["text"] += extracted_text
|
||||
else:
|
||||
|
||||
80
tests/test_document_processor_attachment_budget.py
Normal file
80
tests/test_document_processor_attachment_budget.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class _UploadHandler:
|
||||
def __init__(self, uploads):
|
||||
self.uploads = uploads
|
||||
|
||||
def resolve_upload(self, fid, owner=None):
|
||||
return self.uploads.get(fid)
|
||||
|
||||
def _inside_upload_dir(self, path):
|
||||
return True
|
||||
|
||||
def is_image_file(self, display_name, mime):
|
||||
return False
|
||||
|
||||
def is_audio_file(self, display_name, mime):
|
||||
return False
|
||||
|
||||
def is_document_file(self, display_name, mime):
|
||||
return True
|
||||
|
||||
|
||||
def _text_upload(tmp_path: Path, fid: str, body: str):
|
||||
path = tmp_path / f"{fid}.txt"
|
||||
path.write_text(body, encoding="utf-8")
|
||||
return {
|
||||
"path": str(path),
|
||||
"name": path.name,
|
||||
"mime": "text/plain",
|
||||
}
|
||||
|
||||
|
||||
def test_multifile_inline_attachment_budget_keeps_later_files_visible(tmp_path, monkeypatch):
    """Later files in an over-budget batch are named but not inlined.

    The budget is shrunk so the first ~1000-character file uses most of it and
    leaves less than the minimum slice; the second and third files must then
    be replaced by omission notices instead of their bodies.
    """
    import src.document_processor as dp

    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 1200)
    monkeypatch.setattr(dp, "MIN_INLINE_ATTACHMENT_SLICE", 200)
    uploads = {
        "a": _text_upload(tmp_path, "a", "alpha\n" + ("A" * 1000)),
        "b": _text_upload(tmp_path, "b", "bravo\n" + ("B" * 1000)),
        "c": _text_upload(tmp_path, "c", "charlie\n" + ("C" * 1000)),
    }

    content = dp.build_user_content(
        "How many files do you see?",
        ["a", "b", "c"],
        str(tmp_path),
        _UploadHandler(uploads),
        owner="tester",
    )

    assert "=== File: a.txt ===" in content
    # Both omitted files must lose their inline body, not just the last one.
    assert "=== File: b.txt ===" not in content
    assert "=== File: c.txt ===" not in content
    assert "Attachment omitted from inline context: b.txt" in content
    assert "Attachment omitted from inline context: c.txt" in content
    assert "Ask to inspect this file specifically" in content
    assert len(content) < 2200
|
||||
|
||||
|
||||
def test_inline_attachment_budget_does_not_truncate_small_batches(tmp_path, monkeypatch):
    """A batch that fits the budget is inlined in full with no notices."""
    import src.document_processor as dp

    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 5000)
    uploads = {
        "a": _text_upload(tmp_path, "a", "alpha"),
        "b": _text_upload(tmp_path, "b", "bravo"),
    }

    content = dp.build_user_content(
        "Summarize these.",
        ["a", "b"],
        str(tmp_path),
        _UploadHandler(uploads),
        owner="tester",
    )

    assert "=== File: a.txt ===" in content
    assert "=== File: b.txt ===" in content
    assert "Attachment content truncated" not in content
    # Guard the other reduction path too: a small file must not be omitted.
    assert "Attachment omitted from inline context" not in content
|
||||
Reference in New Issue
Block a user