Files
odysseus/tests/test_document_pdf_marker.py
SurprisedDuck 78747b56ca Documents: strip PDF marker without corrupting text
_process_pdf prepends "\n\n[PDF content]:" to extracted text, and two
call sites in document_routes.py stripped it with .lstrip("\n[PDF content]:").
str.lstrip(chars) treats its argument as a *set of characters*, so it keeps
eating into the page text that follows the marker — e.g. a body starting
with "to the board" loses its leading "to" because 't'/'o' are in the
marker's character set. Replace both sites with a shared
strip_pdf_content_marker() helper that uses str.removeprefix.
2026-06-02 20:35:27 +09:00

31 lines
1.2 KiB
Python

"""Regression test: the '[PDF content]:' wrapper must be removed without eating
into the page text that follows it.
The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the
argument as a *set of characters* and keeps stripping leading characters that
happen to be in that set — corrupting the start of the extracted document.
"""
from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER
def test_marker_removed_without_eating_following_text():
    """The marker is dropped while the page header and body stay intact."""
    # Input mirrors the shape attributed to _process_pdf: the marker, a blank
    # line, the "[Page 1 text]:" header, then the body text.
    extracted = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins"
    expected = "[Page 1 text]:\nto the board, content begins"

    cleaned = strip_pdf_content_marker(extracted)

    assert cleaned == expected
    # A character-set lstrip would also consume the "[P" of the page header
    # (both characters occur in the marker), leaving text that begins with
    # "age 1 text". Guard against that regression explicitly.
    corrupted_prefix = "age 1 text"
    assert not cleaned.startswith(corrupted_prefix)
def test_marker_constant_matches_processor_output():
    """Pin the exact value of the shared marker constant.

    This only compares the constant against a literal; it does not call
    _process_pdf. NOTE(review): presumably _process_pdf emits this same
    prefix, so a change there should be mirrored here -- confirm.
    """
    expected_marker = "\n\n[PDF content]:"
    assert _PDF_CONTENT_MARKER == expected_marker
def test_text_without_marker_is_only_stripped():
    """Text lacking the marker comes back with surrounding whitespace trimmed."""
    padded = " plain text "
    cleaned = strip_pdf_content_marker(padded)
    assert cleaned == "plain text"
def test_handles_none():
    """A None input is tolerated and yields an empty string."""
    cleaned = strip_pdf_content_marker(None)
    assert cleaned == ""