fix: PDF attach uses lstrip(chars) that eats body text, not the marker helper (#1541)
This commit is contained in:
57
tests/test_build_user_content_pdf_marker.py
Normal file
57
tests/test_build_user_content_pdf_marker.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Regression: build_user_content must strip the '[PDF content]:' wrapper with
the prefix-safe helper, not str.lstrip(chars).

The PDF-attach path at build_user_content used
`_process_pdf(path).lstrip("\\n[PDF content]:")`, which treats the argument as a
set of characters rather than a prefix, so it keeps eating leading body
characters that happen to be in that set (a body that begins
"[Page 1 text]:" lost its "[P"; one that begins "to the board" lost "to t").
The other call sites were switched to `strip_pdf_content_marker`
(str.removeprefix); this one wasn't.
"""
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import src.document_processor as dp
|
||||
import src.pdf_forms as pdf_forms
|
||||
import src.pdf_form_doc as pdf_form_doc
|
||||
|
||||
|
||||
class _FakeUploadHandler:
    """Stand-in upload handler whose predicates return fixed answers.

    Every method ignores its arguments: files are never images or audio,
    always documents, and every path counts as inside the upload directory.
    """

    def is_image_file(self, name, mime):
        """Return False regardless of the file name or MIME type."""
        return False

    def is_audio_file(self, name, mime):
        """Return False regardless of the file name or MIME type."""
        return False

    def is_document_file(self, name, mime):
        """Return True regardless of the file name or MIME type."""
        return True

    def _inside_upload_dir(self, path):
        """Return True for any path."""
        return True
|
||||
|
||||
|
||||
def test_pdf_body_marker_stripped_without_eating_text(monkeypatch, tmp_path):
    """Check that attaching a PDF keeps the leading page text intact.

    `_process_pdf`, `has_form_fields` and `create_plain_pdf_document` are
    replaced with stubs, so only the marker handling inside
    `build_user_content` is exercised. The stubbed PDF text starts with the
    '[PDF content]:' wrapper followed by a '[Page 1 text]:' marker, whose
    leading characters all belong to the character set that the old
    `lstrip("\\n[PDF content]:")` call would strip.
    """
    pdf_path = tmp_path / "doc.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    # Shape _process_pdf actually returns: marker, then a page-text marker, then body.
    raw = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, the agenda is set"
    monkeypatch.setattr(dp, "_process_pdf", lambda path: raw)
    monkeypatch.setattr(pdf_forms, "has_form_fields", lambda path: False)
    monkeypatch.setattr(pdf_form_doc, "create_plain_pdf_document", lambda **kw: "doc-123")

    resolved = {"fid1": {"path": str(pdf_path), "mime": "application/pdf", "name": "doc.pdf"}}
    content = dp.build_user_content(
        text="here is a pdf",
        attachment_ids=["fid1"],
        upload_dir=str(tmp_path),
        upload_handler=_FakeUploadHandler(),
        session_id="s1",
        resolved_uploads=resolved,
    )

    body = content[0]["text"] if isinstance(content, list) else content
    page_marker = "[Page 1 text]:"

    # The leading page text must survive intact.
    assert page_marker in body
    assert "to the board, the agenda is set" in body

    # The old lstrip(chars) corruption ate "[P" (stopping at "a"), leaving
    # "age 1 text]:". That fragment is also a substring of the intact marker,
    # so it is only evidence of corruption when it appears outside one.
    assert "age 1 text" not in body.replace(page_marker, "")
|
||||
Reference in New Issue
Block a user