From 78747b56ca7f5eeb530b6210542784ac4bbd857d Mon Sep 17 00:00:00 2001 From: SurprisedDuck Date: Tue, 2 Jun 2026 13:35:27 +0200 Subject: [PATCH] Documents: strip PDF marker without corrupting text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _process_pdf prepends "\n\n[PDF content]:" to extracted text, and two call sites in document_routes.py stripped it with .lstrip("\n[PDF content]:"). str.lstrip(chars) treats its argument as a *set of characters*, so it keeps eating into the page text that follows the marker — e.g. a body starting with "to the board" loses its leading "to" because 't'/'o' are in the marker's character set. Replace both sites with a shared strip_pdf_content_marker() helper that uses str.removeprefix. --- routes/document_routes.py | 8 ++++---- src/document_processor.py | 16 ++++++++++++++++ tests/test_document_pdf_marker.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 tests/test_document_pdf_marker.py diff --git a/routes/document_routes.py b/routes/document_routes.py index 7d65ed3..71535c0 100644 --- a/routes/document_routes.py +++ b/routes/document_routes.py @@ -145,7 +145,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter: create_form_markdown_document, create_plain_pdf_document, ) - from src.document_processor import _process_pdf + from src.document_processor import _process_pdf, strip_pdf_content_marker import os from src.auth_helpers import require_privilege @@ -184,7 +184,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter: title = os.path.splitext(meta.get("original_name") or meta.get("name") or upload_id)[0] try: - body_text = _process_pdf(pdf_path).lstrip("\n[PDF content]:").strip() + body_text = strip_pdf_content_marker(_process_pdf(pdf_path)) except Exception: body_text = None @@ -402,7 +402,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter: text extraction was wired, plus for scanned/image-only PDFs where the VL model picks up text the basic pypdf path missed.""" import re - from src.document_processor import _process_pdf + from src.document_processor import _process_pdf, strip_pdf_content_marker from src.pdf_form_doc import find_source_upload_id user = get_current_user(request) @@ -423,7 +423,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter: raise HTTPException(404, "Source PDF could not be located") try: - body_text = _process_pdf(pdf_path).lstrip("\n[PDF content]:").strip() + body_text = strip_pdf_content_marker(_process_pdf(pdf_path)) except Exception as e: logger.error(f"extract_pdf_text failed for {pdf_path}: {e}") raise HTTPException(500, f"Extraction failed: {e}") diff --git a/src/document_processor.py b/src/document_processor.py index 3285fa1..486aaba 100644 --- a/src/document_processor.py +++ b/src/document_processor.py @@ -190,6 +190,22 @@ def _process_office_document(path: str, display_name: str) -> str: return f"\n\n[Attached document: {display_name} — {exc}]" +# Marker that _process_pdf prepends to extracted text. +_PDF_CONTENT_MARKER = "\n\n[PDF content]:" + + +def strip_pdf_content_marker(text: str) -> str: + """Remove the leading ``[PDF content]:`` wrapper that ``_process_pdf`` adds. + + Uses ``str.removeprefix`` rather than ``str.lstrip(chars)``: ``lstrip`` + treats its argument as a *set of characters*, so ``lstrip("\\n[PDF content]:")`` + keeps chewing into the page text that follows the marker. For example + ``"\\n\\n[PDF content]:\\n\\n[Page 1 text]:\\nto the board"`` would lose the + leading "to" because 't' and 'o' are in the marker's character set. + """ + return (text or "").removeprefix(_PDF_CONTENT_MARKER).strip() + + def _load_vl_settings() -> dict: """Load admin settings from disk.""" try: diff --git a/tests/test_document_pdf_marker.py b/tests/test_document_pdf_marker.py new file mode 100644 index 0000000..5e90c5d --- /dev/null +++ b/tests/test_document_pdf_marker.py @@ -0,0 +1,30 @@ +"""Regression test: the '[PDF content]:' wrapper must be removed without eating +into the page text that follows it. + +The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the +argument as a *set of characters* and keeps stripping leading characters that +happen to be in that set — corrupting the start of the extracted document. +""" +from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER + + +def test_marker_removed_without_eating_following_text(): + # Shape that _process_pdf actually returns: marker + "\n\n[Page 1 text]:" + body. + raw = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins" + out = strip_pdf_content_marker(raw) + assert out == "[Page 1 text]:\nto the board, content begins" + # The old lstrip approach produced "age 1 text]:..." (ate "[P" then "to"). + assert not out.startswith("age 1 text") + + +def test_marker_constant_matches_processor_output(): + # If _process_pdf's prefix ever changes, this guards the consumer. + assert _PDF_CONTENT_MARKER == "\n\n[PDF content]:" + + +def test_text_without_marker_is_only_stripped(): + assert strip_pdf_content_marker(" plain text ") == "plain text" + + +def test_handles_none(): + assert strip_pdf_content_marker(None) == ""