# odysseus/tests/test_document_pdf_marker.py

"""Regression test: the '[PDF content]:' wrapper must be removed without eating
into the page text that follows it.

The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the
argument as a *set of characters* and keeps stripping leading characters that
happen to be in that set — corrupting the start of the extracted document.
"""
from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER


def test_marker_removed_without_eating_following_text():
    """Only the marker and adjacent whitespace go; the page header stays intact."""
    # What must survive: the first page header and its body, untouched.
    expected = "[Page 1 text]:\nto the board, content begins"
    # Intended to mirror _process_pdf output: marker, blank line, page header, body.
    wrapped = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins"

    cleaned = strip_pdf_content_marker(wrapped)

    assert cleaned == expected
    # str.lstrip("\n[PDF content]:") removes any leading character found in its
    # argument, so it would also consume the blank line and the "[P" of
    # "[Page", leaving "age 1 text]:...".
    assert not cleaned.startswith("age 1 text")


def test_marker_constant_matches_processor_output():
    """Pin the marker constant to the literal prefix the consumer strips."""
    # Tripwire: changing the constant without updating this expected prefix
    # (and, presumably, the prefix _process_pdf emits) fails here.
    expected_prefix = "\n\n[PDF content]:"
    assert _PDF_CONTENT_MARKER == expected_prefix


def test_text_without_marker_is_only_stripped():
    """Input lacking the marker just loses its surrounding whitespace."""
    padded = "  plain text  "
    result = strip_pdf_content_marker(padded)
    assert result == "plain text"


def test_handles_none():
    """A None input yields the empty string."""
    result = strip_pdf_content_marker(None)
    assert result == ""