Cap inline attachment context across files (#1498)
Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,9 @@ from src.llm_core import llm_call
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_INLINE_ATTACHMENT_CHARS = 24000
|
||||||
|
MIN_INLINE_ATTACHMENT_SLICE = 500
|
||||||
|
|
||||||
|
|
||||||
def _is_text_file(path: str) -> bool:
|
def _is_text_file(path: str) -> bool:
|
||||||
"""Check if file has text extension."""
|
"""Check if file has text extension."""
|
||||||
@@ -160,6 +163,41 @@ def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
|
|||||||
return text, ""
|
return text, ""
|
||||||
|
|
||||||
|
|
||||||
|
def _fit_inline_attachment_text(
|
||||||
|
text: str,
|
||||||
|
remaining: int,
|
||||||
|
display_name: str,
|
||||||
|
) -> tuple[str, int]:
|
||||||
|
"""Fit extracted attachment text into the shared inline attachment budget.
|
||||||
|
|
||||||
|
Individual processors already cap single files, but multi-file batches can
|
||||||
|
still add N capped bodies to one user turn. Keep the first files readable,
|
||||||
|
keep later files visible by name, and mark exactly where inline content was
|
||||||
|
reduced so the model does not silently miss attachments.
|
||||||
|
"""
|
||||||
|
text = text or ""
|
||||||
|
if len(text) <= remaining:
|
||||||
|
return text, remaining - len(text)
|
||||||
|
|
||||||
|
name = os.path.basename(display_name or "attachment")
|
||||||
|
if remaining < MIN_INLINE_ATTACHMENT_SLICE:
|
||||||
|
return (
|
||||||
|
f"\n\n[Attachment omitted from inline context: {name}. "
|
||||||
|
f"The {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
|
||||||
|
"attachment budget was already used by earlier attachments. Ask "
|
||||||
|
"to inspect this file specifically if more detail is needed.]",
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
marker = (
|
||||||
|
f"\n\n[Attachment content truncated: {name}. "
|
||||||
|
f"Only {remaining:,} characters of this attachment fit within "
|
||||||
|
f"the {MAX_INLINE_ATTACHMENT_CHARS:,}-character shared inline "
|
||||||
|
"attachment budget. Ask to inspect this file specifically if more "
|
||||||
|
"detail is needed.]"
|
||||||
|
)
|
||||||
|
return text[:remaining] + marker, 0
|
||||||
|
|
||||||
|
|
||||||
def _process_office_document(path: str, display_name: str) -> str:
|
def _process_office_document(path: str, display_name: str) -> str:
|
||||||
"""Extract an Office/EPUB document to Markdown via the optional markitdown dep.
|
"""Extract an Office/EPUB document to Markdown via the optional markitdown dep.
|
||||||
|
|
||||||
@@ -323,6 +361,7 @@ def build_user_content(
|
|||||||
frontend can switch to the new doc immediately.
|
frontend can switch to the new doc immediately.
|
||||||
"""
|
"""
|
||||||
content = [{"type": "text", "text": text}]
|
content = [{"type": "text", "text": text}]
|
||||||
|
inline_attachment_remaining = MAX_INLINE_ATTACHMENT_CHARS
|
||||||
|
|
||||||
for fid in attachment_ids or []:
|
for fid in attachment_ids or []:
|
||||||
upload_info = (resolved_uploads or {}).get(fid)
|
upload_info = (resolved_uploads or {}).get(fid)
|
||||||
@@ -483,6 +522,11 @@ def build_user_content(
|
|||||||
else:
|
else:
|
||||||
extracted_text = _process_office_document(path, display_name)
|
extracted_text = _process_office_document(path, display_name)
|
||||||
|
|
||||||
|
extracted_text, inline_attachment_remaining = _fit_inline_attachment_text(
|
||||||
|
extracted_text,
|
||||||
|
inline_attachment_remaining,
|
||||||
|
display_name,
|
||||||
|
)
|
||||||
if content and content[0]["type"] == "text":
|
if content and content[0]["type"] == "text":
|
||||||
content[0]["text"] += extracted_text
|
content[0]["text"] += extracted_text
|
||||||
else:
|
else:
|
||||||
|
|||||||
80
tests/test_document_processor_attachment_budget.py
Normal file
80
tests/test_document_processor_attachment_budget.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class _UploadHandler:
|
||||||
|
def __init__(self, uploads):
|
||||||
|
self.uploads = uploads
|
||||||
|
|
||||||
|
def resolve_upload(self, fid, owner=None):
|
||||||
|
return self.uploads.get(fid)
|
||||||
|
|
||||||
|
def _inside_upload_dir(self, path):
|
||||||
|
return True
|
||||||
|
|
||||||
|
def is_image_file(self, display_name, mime):
|
||||||
|
return False
|
||||||
|
|
||||||
|
def is_audio_file(self, display_name, mime):
|
||||||
|
return False
|
||||||
|
|
||||||
|
def is_document_file(self, display_name, mime):
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _text_upload(tmp_path: Path, fid: str, body: str):
|
||||||
|
path = tmp_path / f"{fid}.txt"
|
||||||
|
path.write_text(body, encoding="utf-8")
|
||||||
|
return {
|
||||||
|
"path": str(path),
|
||||||
|
"name": path.name,
|
||||||
|
"mime": "text/plain",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_multifile_inline_attachment_budget_keeps_later_files_visible(tmp_path, monkeypatch):
    """Over-budget batches keep the first file and name the rest via markers.

    The shared budget and minimum slice are shrunk so that three ~1000
    character files cannot all be inlined. The first file must appear with its
    file header, while the later files must be announced through the
    "omitted" marker rather than inlined or silently dropped.
    """
    import src.document_processor as dp

    # Shrink the limits so a handful of small files exceeds the budget.
    for constant, value in (
        ("MAX_INLINE_ATTACHMENT_CHARS", 1200),
        ("MIN_INLINE_ATTACHMENT_SLICE", 200),
    ):
        monkeypatch.setattr(dp, constant, value)

    bodies = {
        "a": "alpha\n" + ("A" * 1000),
        "b": "bravo\n" + ("B" * 1000),
        "c": "charlie\n" + ("C" * 1000),
    }
    uploads = {fid: _text_upload(tmp_path, fid, body) for fid, body in bodies.items()}

    content = dp.build_user_content(
        "How many files do you see?",
        list(uploads),
        str(tmp_path),
        _UploadHandler(uploads),
        owner="tester",
    )

    # The first file is inlined with its header; the last one is not.
    assert "=== File: a.txt ===" in content
    assert "=== File: c.txt ===" not in content

    # Later files remain visible by name, with a hint on how to get more.
    for expected in (
        "Attachment omitted from inline context: b.txt",
        "Attachment omitted from inline context: c.txt",
        "Ask to inspect this file specifically",
    ):
        assert expected in content

    # The combined result stays bounded despite ~3000 characters of input.
    assert len(content) < 2200
|
||||||
|
|
||||||
|
|
||||||
|
def test_inline_attachment_budget_does_not_truncate_small_batches(tmp_path, monkeypatch):
    """Batches that fit the shared budget are inlined without any marker.

    With a 5000-character budget and two tiny files, both file headers must be
    present and neither budget marker may appear. Both the "truncated" and the
    "omitted" markers are checked, so a regression that sends small files down
    either reduction path is caught.
    """
    import src.document_processor as dp

    monkeypatch.setattr(dp, "MAX_INLINE_ATTACHMENT_CHARS", 5000)
    uploads = {
        "a": _text_upload(tmp_path, "a", "alpha"),
        "b": _text_upload(tmp_path, "b", "bravo"),
    }

    content = dp.build_user_content(
        "Summarize these.",
        ["a", "b"],
        str(tmp_path),
        _UploadHandler(uploads),
        owner="tester",
    )

    assert "=== File: a.txt ===" in content
    assert "=== File: b.txt ===" in content
    # Neither reduction marker may show up when everything fits.
    assert "Attachment content truncated" not in content
    assert "Attachment omitted from inline context" not in content
|
||||||
Reference in New Issue
Block a user