_process_pdf prepends "\n\n[PDF content]:" to extracted text, and two
call sites in document_routes.py stripped it with .lstrip("\n[PDF content]:").
str.lstrip(chars) treats its argument as a *set of characters*, so it keeps
eating into the page text that follows the marker — e.g. a body that directly
follows the marker and starts with "to the board" comes out as "he board",
because 't', 'o' and ' ' are all in the marker's character set. Replace both
sites with a shared strip_pdf_content_marker() helper that uses
str.removeprefix.
31 lines
1.2 KiB
Python
31 lines
1.2 KiB
Python
"""Regression test: the '[PDF content]:' wrapper must be removed without eating
into the page text that follows it.

The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the
argument as a *set of characters* and keeps stripping leading characters that
happen to be in that set — corrupting the start of the extracted document.
"""
|
|
from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER
|
|
|
|
|
|
def test_marker_removed_without_eating_following_text():
    """The marker is removed as one exact prefix; the page text stays intact.

    Also pins the legacy ``str.lstrip`` result on the same input, so the
    regression this test guards against is demonstrated, not just described.
    """
    # Shape that _process_pdf actually returns: marker + "\n\n[Page 1 text]:" + body.
    raw = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins"

    out = strip_pdf_content_marker(raw)
    assert out == "[Page 1 text]:\nto the board, content begins"

    # The old lstrip approach treated the marker as a character set: it ate the
    # marker, the blank lines after it and then "[P" from the page header,
    # stopping at "a" (not in the set) and leaving "age 1 text]:...".
    legacy = raw.lstrip("\n[PDF content]:")
    assert legacy.startswith("age 1 text]:")
    assert not out.startswith("age 1 text")
|
|
|
|
|
|
def test_marker_constant_matches_processor_output():
    """Pin ``_PDF_CONTENT_MARKER`` to the exact prefix literal.

    Intended as a guard for the consumer should _process_pdf's prefix change.
    NOTE(review): this only compares the constant with a literal — confirm
    that _process_pdf builds its prefix from the same constant.
    """
    expected_marker = "\n\n[PDF content]:"
    assert _PDF_CONTENT_MARKER == expected_marker
|
|
|
|
|
|
def test_text_without_marker_is_only_stripped():
    """Marker-free text is returned with only surrounding whitespace removed."""
    unmarked = " plain text "
    result = strip_pdf_content_marker(unmarked)
    assert result == "plain text"
|
|
|
|
|
|
def test_handles_none():
    """``None`` is accepted and yields an empty string."""
    result = strip_pdf_content_marker(None)
    assert result == ""
|