Documents: strip PDF marker without corrupting text
_process_pdf prepends "\n\n[PDF content]:" to extracted text, and two
call sites in document_routes.py stripped it with .lstrip("\n[PDF content]:").
str.lstrip(chars) treats its argument as a *set of characters*, so it keeps
eating into the page text that follows the marker — e.g. a body starting
with "to the board" loses its leading "to" because 't'/'o' are in the
marker's character set. Replace both sites with a shared
strip_pdf_content_marker() helper that uses str.removeprefix.
This commit is contained in:
@@ -145,7 +145,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
|||||||
create_form_markdown_document,
|
create_form_markdown_document,
|
||||||
create_plain_pdf_document,
|
create_plain_pdf_document,
|
||||||
)
|
)
|
||||||
from src.document_processor import _process_pdf
|
from src.document_processor import _process_pdf, strip_pdf_content_marker
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from src.auth_helpers import require_privilege
|
from src.auth_helpers import require_privilege
|
||||||
@@ -184,7 +184,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
|||||||
|
|
||||||
title = os.path.splitext(meta.get("original_name") or meta.get("name") or upload_id)[0]
|
title = os.path.splitext(meta.get("original_name") or meta.get("name") or upload_id)[0]
|
||||||
try:
|
try:
|
||||||
body_text = _process_pdf(pdf_path).lstrip("\n[PDF content]:").strip()
|
body_text = strip_pdf_content_marker(_process_pdf(pdf_path))
|
||||||
except Exception:
|
except Exception:
|
||||||
body_text = None
|
body_text = None
|
||||||
|
|
||||||
@@ -402,7 +402,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
|||||||
text extraction was wired, plus for scanned/image-only PDFs where the
|
text extraction was wired, plus for scanned/image-only PDFs where the
|
||||||
VL model picks up text the basic pypdf path missed."""
|
VL model picks up text the basic pypdf path missed."""
|
||||||
import re
|
import re
|
||||||
from src.document_processor import _process_pdf
|
from src.document_processor import _process_pdf, strip_pdf_content_marker
|
||||||
from src.pdf_form_doc import find_source_upload_id
|
from src.pdf_form_doc import find_source_upload_id
|
||||||
|
|
||||||
user = get_current_user(request)
|
user = get_current_user(request)
|
||||||
@@ -423,7 +423,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
|||||||
raise HTTPException(404, "Source PDF could not be located")
|
raise HTTPException(404, "Source PDF could not be located")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
body_text = _process_pdf(pdf_path).lstrip("\n[PDF content]:").strip()
|
body_text = strip_pdf_content_marker(_process_pdf(pdf_path))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"extract_pdf_text failed for {pdf_path}: {e}")
|
logger.error(f"extract_pdf_text failed for {pdf_path}: {e}")
|
||||||
raise HTTPException(500, f"Extraction failed: {e}")
|
raise HTTPException(500, f"Extraction failed: {e}")
|
||||||
|
|||||||
@@ -190,6 +190,22 @@ def _process_office_document(path: str, display_name: str) -> str:
|
|||||||
return f"\n\n[Attached document: {display_name} — {exc}]"
|
return f"\n\n[Attached document: {display_name} — {exc}]"
|
||||||
|
|
||||||
|
|
||||||
|
# Marker that _process_pdf prepends to extracted text.
|
||||||
|
_PDF_CONTENT_MARKER = "\n\n[PDF content]:"
|
||||||
|
|
||||||
|
|
||||||
|
def strip_pdf_content_marker(text: str) -> str:
|
||||||
|
"""Remove the leading ``[PDF content]:`` wrapper that ``_process_pdf`` adds.
|
||||||
|
|
||||||
|
Uses ``str.removeprefix`` rather than ``str.lstrip(chars)``: ``lstrip``
|
||||||
|
treats its argument as a *set of characters*, so ``lstrip("\\n[PDF content]:")``
|
||||||
|
keeps chewing into the page text that follows the marker. For example
|
||||||
|
``"\\n\\n[PDF content]:\\n\\n[Page 1 text]:\\nto the board"`` would lose the
|
||||||
|
leading "to" because 't' and 'o' are in the marker's character set.
|
||||||
|
"""
|
||||||
|
return (text or "").removeprefix(_PDF_CONTENT_MARKER).strip()
|
||||||
|
|
||||||
|
|
||||||
def _load_vl_settings() -> dict:
|
def _load_vl_settings() -> dict:
|
||||||
"""Load admin settings from disk."""
|
"""Load admin settings from disk."""
|
||||||
try:
|
try:
|
||||||
|
|||||||
30
tests/test_document_pdf_marker.py
Normal file
30
tests/test_document_pdf_marker.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
"""Regression test: the '[PDF content]:' wrapper must be removed without eating
|
||||||
|
into the page text that follows it.
|
||||||
|
|
||||||
|
The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the
|
||||||
|
argument as a *set of characters* and keeps stripping leading characters that
|
||||||
|
happen to be in that set — corrupting the start of the extracted document.
|
||||||
|
"""
|
||||||
|
from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER
|
||||||
|
|
||||||
|
|
||||||
|
def test_marker_removed_without_eating_following_text():
|
||||||
|
# Shape that _process_pdf actually returns: marker + "\n\n[Page 1 text]:" + body.
|
||||||
|
raw = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins"
|
||||||
|
out = strip_pdf_content_marker(raw)
|
||||||
|
assert out == "[Page 1 text]:\nto the board, content begins"
|
||||||
|
# The old lstrip approach produced "age 1 text]:..." (ate "[P" then "to").
|
||||||
|
assert not out.startswith("age 1 text")
|
||||||
|
|
||||||
|
|
||||||
|
def test_marker_constant_matches_processor_output():
|
||||||
|
# If _process_pdf's prefix ever changes, this guards the consumer.
|
||||||
|
assert _PDF_CONTENT_MARKER == "\n\n[PDF content]:"
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_without_marker_is_only_stripped():
|
||||||
|
assert strip_pdf_content_marker(" plain text ") == "plain text"
|
||||||
|
|
||||||
|
|
||||||
|
def test_handles_none():
|
||||||
|
assert strip_pdf_content_marker(None) == ""
|
||||||
Reference in New Issue
Block a user