Files
odysseus/tests/test_split_chunks_no_duplicate_tail.py
Alexandre Teixeira 8bc16ef245 fix(tests): use non-repeating split chunk fixture
Updates the split_chunks containment regression test to use deterministic non-repeating records instead of a repeating fixture that could produce accidental substring matches. Restores one focused part of the Python CI baseline tracked in #2580.
2026-06-04 18:11:42 +01:00

37 lines
1.4 KiB
Python

"""Regression: split_chunks must not emit a duplicate trailing chunk.
The loop advanced `i = j - overlap` even after `j` reached the end of the text,
so any text longer than (size - overlap) got an extra final chunk duplicating
the last `overlap` characters. That duplicate is indexed and keyword-scored
twice, so retrieve_personal_keyword returns the same tail content twice.
"""
from src.personal_docs import split_chunks
def test_no_duplicate_tail_chunk():
    """1100 chars at size=1000/overlap=200 must split into exactly 1000 + 300.

    A third entry in the list of lengths would be the duplicated tail chunk
    that this regression test exists to catch.
    """
    text = "x" * 1100
    pieces = split_chunks(text, size=1000, overlap=200)
    lengths = list(map(len, pieces))
    assert lengths == [1000, 300]
def test_no_chunk_is_contained_in_another():
    """No chunk may be a substring of any other chunk.

    The fixture is 300 distinct lines (each embeds k, k*k and k*k*k), so the
    text does not repeat and a containment hit points at a genuinely
    duplicated span rather than an accidental match on repeated filler.
    """
    text = "\n".join(
        f"unique-line-{k:04d}-square-{k * k:08d}-cube-{k * k * k:012d}"
        for k in range(300)
    )
    chunks = split_chunks(text, size=1000, overlap=200)
    # Guard against a vacuous pass: with fewer than two chunks the pairwise
    # loop below would never execute a single assertion.
    assert len(chunks) > 1, f"expected multiple chunks, got {len(chunks)}"
    # The buggy version produced a final 200-char chunk fully inside the prior one.
    for a, inner in enumerate(chunks):
        for b, outer in enumerate(chunks):
            if a != b:
                assert inner not in outer, (
                    f"chunk {a} is fully contained in chunk {b}"
                )
def test_overlap_is_preserved_between_chunks():
    """The second chunk must begin `overlap` characters before the first ends.

    Uses a non-repeating fixture so the start offset is actually checked: with
    a homogeneous text such as "x" * 1100, any 300-character chunk would
    compare equal to the expected slice regardless of where it was cut.
    """
    # 275 zero-padded 4-digit counters -> exactly 1100 non-repeating characters.
    text = "".join(f"{k:04d}" for k in range(275))
    chunks = split_chunks(text, size=1000, overlap=200)
    # Second chunk starts 200 chars before the first one ended (offset 800).
    assert chunks == [text[:1000], text[800:1100]]
def test_short_text_single_chunk():
    """Text shorter than `size` comes back as one chunk equal to the input."""
    text = "hello world"
    pieces = split_chunks(text, size=1000, overlap=200)
    assert pieces == [text]