diff --git a/tests/test_build_user_content_pdf_marker.py b/tests/test_build_user_content_pdf_marker.py new file mode 100644 index 0000000..9cc9166 --- /dev/null +++ b/tests/test_build_user_content_pdf_marker.py @@ -0,0 +1,57 @@ +"""Regression: build_user_content must strip the '[PDF content]:' wrapper with +the prefix-safe helper, not str.lstrip(chars). + +The PDF-attach path at build_user_content used +`_process_pdf(path).lstrip("\\n[PDF content]:")`, which treats the argument as a +set of characters and keeps eating leading body characters (so a page that +begins "Page 1 text]: to the board" lost its "P"/"to"). The other call sites +were switched to `strip_pdf_content_marker` (str.removeprefix); this one wasn't. +""" +import os +import tempfile + +import src.document_processor as dp +import src.pdf_forms as pdf_forms +import src.pdf_form_doc as pdf_form_doc + + +class _FakeUploadHandler: + def is_image_file(self, name, mime): + return False + + def is_audio_file(self, name, mime): + return False + + def is_document_file(self, name, mime): + return True + + def _inside_upload_dir(self, path): + return True + + +def test_pdf_body_marker_stripped_without_eating_text(monkeypatch, tmp_path): + pdf_path = tmp_path / "doc.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + + # Shape _process_pdf actually returns: marker, then a page-text marker, then body. + raw = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, the agenda is set" + monkeypatch.setattr(dp, "_process_pdf", lambda path: raw) + monkeypatch.setattr(pdf_forms, "has_form_fields", lambda path: False) + monkeypatch.setattr(pdf_form_doc, "create_plain_pdf_document", lambda **kw: "doc-123") + + resolved = {"fid1": {"path": str(pdf_path), "mime": "application/pdf", "name": "doc.pdf"}} + content = dp.build_user_content( + text="here is a pdf", + attachment_ids=["fid1"], + upload_dir=str(tmp_path), + upload_handler=_FakeUploadHandler(), + session_id="s1", + resolved_uploads=resolved, + ) + + body = content[0]["text"] if isinstance(content, list) else content + # The leading page text must survive intact. + assert "[Page 1 text]:" in body + assert "to the board, the agenda is set" in body + # The old lstrip(chars) corruption ate "[P" then "to" -> "age 1 text]: the board". + assert "age 1 text" not in body