From 78747b56ca7f5eeb530b6210542784ac4bbd857d Mon Sep 17 00:00:00 2001
From: SurprisedDuck <jannik.theiss@googlemail.com>
Date: Tue, 2 Jun 2026 13:35:27 +0200
Subject: [PATCH] Documents: strip PDF marker without corrupting text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_process_pdf prepends "\n\n[PDF content]:" to extracted text, and two
call sites in document_routes.py stripped it with .lstrip("\n[PDF content]:").
str.lstrip(chars) treats its argument as a *set of characters*, so it keeps
eating into the page text that follows the marker — e.g. the page header
"[Page 1 text]:" comes back as "age 1 text]:" because '\n', '[' and 'P'
are all in the marker's character set. Replace both sites with a shared
strip_pdf_content_marker() helper that uses str.removeprefix.
---
 routes/document_routes.py         |  8 ++++----
 src/document_processor.py         | 16 ++++++++++++++++
 tests/test_document_pdf_marker.py | 30 ++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_document_pdf_marker.py

diff --git a/routes/document_routes.py b/routes/document_routes.py
index 7d65ed3..71535c0 100644
--- a/routes/document_routes.py
+++ b/routes/document_routes.py
@@ -145,7 +145,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
             create_form_markdown_document,
             create_plain_pdf_document,
         )
-        from src.document_processor import _process_pdf
+        from src.document_processor import _process_pdf, strip_pdf_content_marker
         import os
 
         from src.auth_helpers import require_privilege
@@ -184,7 +184,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
 
         title = os.path.splitext(meta.get("original_name") or meta.get("name") or upload_id)[0]
         try:
-            body_text = _process_pdf(pdf_path).lstrip("\n[PDF content]:").strip()
+            body_text = strip_pdf_content_marker(_process_pdf(pdf_path))
         except Exception:
             body_text = None
 
@@ -402,7 +402,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
         text extraction was wired, plus for scanned/image-only PDFs where the
         VL model picks up text the basic pypdf path missed."""
         import re
-        from src.document_processor import _process_pdf
+        from src.document_processor import _process_pdf, strip_pdf_content_marker
         from src.pdf_form_doc import find_source_upload_id
 
         user = get_current_user(request)
@@ -423,7 +423,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
                 raise HTTPException(404, "Source PDF could not be located")
 
             try:
-                body_text = _process_pdf(pdf_path).lstrip("\n[PDF content]:").strip()
+                body_text = strip_pdf_content_marker(_process_pdf(pdf_path))
             except Exception as e:
                 logger.error(f"extract_pdf_text failed for {pdf_path}: {e}")
                 raise HTTPException(500, f"Extraction failed: {e}")
diff --git a/src/document_processor.py b/src/document_processor.py
index 3285fa1..486aaba 100644
--- a/src/document_processor.py
+++ b/src/document_processor.py
@@ -190,6 +190,22 @@ def _process_office_document(path: str, display_name: str) -> str:
         return f"\n\n[Attached document: {display_name} — {exc}]"
 
 
+# Marker that _process_pdf prepends to extracted text.
+_PDF_CONTENT_MARKER = "\n\n[PDF content]:"
+
+
+def strip_pdf_content_marker(text: str) -> str:
+    """Remove the leading ``[PDF content]:`` wrapper that ``_process_pdf`` adds.
+
+    Uses ``str.removeprefix`` rather than ``str.lstrip(chars)``: ``lstrip``
+    treats its argument as a *set of characters*, so ``lstrip("\\n[PDF content]:")``
+    keeps chewing into the page text that follows the marker. For example
+    ``"\\n\\n[PDF content]:\\n\\n[Page 1 text]:\\nto the board"`` would come back as
+    ``"age 1 text]:\\nto the board"`` because '\\n', '[' and 'P' are in that set.
+    """
+    return (text or "").removeprefix(_PDF_CONTENT_MARKER).strip()
+
+
 def _load_vl_settings() -> dict:
     """Load admin settings from disk."""
     try:
diff --git a/tests/test_document_pdf_marker.py b/tests/test_document_pdf_marker.py
new file mode 100644
index 0000000..5e90c5d
--- /dev/null
+++ b/tests/test_document_pdf_marker.py
@@ -0,0 +1,30 @@
+"""Regression test: the '[PDF content]:' wrapper must be removed without eating
+into the page text that follows it.
+
+The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the
+argument as a *set of characters* and keeps stripping leading characters that
+happen to be in that set — corrupting the start of the extracted document.
+"""
+from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER
+
+
+def test_marker_removed_without_eating_following_text():
+    # Shape that _process_pdf actually returns: marker + "\n\n[Page 1 text]:" + body.
+    raw = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins"
+    out = strip_pdf_content_marker(raw)
+    assert out == "[Page 1 text]:\nto the board, content begins"
+    # The old lstrip approach produced "age 1 text]:..." (it ate the leading "[P").
+    assert not out.startswith("age 1 text")
+
+
+def test_marker_constant_matches_processor_output():
+    # Pins the literal only; _process_pdf is not exercised, so keep its prefix in sync.
+    assert _PDF_CONTENT_MARKER == "\n\n[PDF content]:"
+
+
+def test_text_without_marker_is_only_stripped():
+    assert strip_pdf_content_marker("  plain text  ") == "plain text"
+
+
+def test_handles_none():
+    assert strip_pdf_content_marker(None) == ""