From 82c09dd7682f7226584fb98f7969b927fcf693f5 Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 00:57:54 +0100 Subject: [PATCH] fix: split_chunks emits a duplicate trailing chunk for text over size-overlap (#1573) --- src/personal_docs.py | 5 +++ tests/test_split_chunks_no_duplicate_tail.py | 33 ++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/test_split_chunks_no_duplicate_tail.py diff --git a/src/personal_docs.py b/src/personal_docs.py index 7f4cb8a..b4d935a 100644 --- a/src/personal_docs.py +++ b/src/personal_docs.py @@ -77,6 +77,11 @@ def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config while i < n: j = min(i + size, n) chunks.append(text[i:j]) + if j >= n: + # Reached the end. Without this, the next start (j - overlap) is + # still > i, so the loop appended one extra chunk duplicating the + # last `overlap` chars of the text. + break i = j - overlap if j - overlap > i else j return chunks diff --git a/tests/test_split_chunks_no_duplicate_tail.py b/tests/test_split_chunks_no_duplicate_tail.py new file mode 100644 index 0000000..a7fc32d --- /dev/null +++ b/tests/test_split_chunks_no_duplicate_tail.py @@ -0,0 +1,33 @@ +"""Regression: split_chunks must not emit a duplicate trailing chunk. + +The loop advanced `i = j - overlap` even after `j` reached the end of the text, +so any text longer than (size - overlap) got an extra final chunk duplicating +the last `overlap` characters. That duplicate is indexed and keyword-scored +twice, so retrieve_personal_keyword returns the same tail content twice. 
+"""
+from src.personal_docs import split_chunks
+
+
+def test_no_duplicate_tail_chunk():
+    """A 1100-char text yields one full chunk plus a 300-char tail, nothing more."""
+    assert [len(c) for c in split_chunks("x" * 1100, size=1000, overlap=200)] == [1000, 300]
+
+
+def test_no_chunk_is_contained_in_another():
+    """No chunk may be a substring of a different chunk."""
+    text = "".join(chr(0x4E00 + k) for k in range(2000))
+    chunks = split_chunks(text, size=1000, overlap=200)
+    # Distinct characters: a repeating alphabet would make chunks match by coincidence.
+    for a, inner in enumerate(chunks):
+        for b, outer in enumerate(chunks):
+            assert a == b or inner not in outer
+
+
+def test_overlap_is_preserved_between_chunks():
+    """The second chunk starts 200 chars before the first one ended (offset 800)."""
+    text = "".join(chr(0x4E00 + k) for k in range(1100))
+    assert split_chunks(text, size=1000, overlap=200) == [text[:1000], text[800:]]
+
+
+def test_short_text_single_chunk():
+    assert split_chunks("hello world", size=1000, overlap=200) == ["hello world"]