From a880b1762412d59eeb3de00a1829d8d7aed41077 Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 05:42:05 +0100 Subject: [PATCH] Skip malformed personal keyword index rows Make personal keyword retrieval tolerate corrupted non-dict index entries and missing chunk lists, with regression coverage. --- src/personal_docs.py | 6 ++++-- tests/test_personal_docs_keyword_nondict.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 tests/test_personal_docs_keyword_nondict.py diff --git a/src/personal_docs.py b/src/personal_docs.py index 06875d0..92ba1bc 100644 --- a/src/personal_docs.py +++ b/src/personal_docs.py @@ -134,10 +134,12 @@ def retrieve_personal_keyword(personal_index: List[Dict], query: str, k: int = 5 scored = [] for f in personal_index: - for idx, ch in enumerate(f["chunks"]): + if not isinstance(f, dict): + continue + for idx, ch in enumerate(f.get("chunks") or []): score = len(q & tokenize(ch)) if score > 0: - scored.append((score, f["name"], idx, ch)) + scored.append((score, f.get("name", ""), idx, ch)) scored.sort(key=lambda x: x[0], reverse=True) out = [] diff --git a/tests/test_personal_docs_keyword_nondict.py b/tests/test_personal_docs_keyword_nondict.py new file mode 100644 index 0000000..f46c9f4 --- /dev/null +++ b/tests/test_personal_docs_keyword_nondict.py @@ -0,0 +1,21 @@ +from src.personal_docs import retrieve_personal_keyword + + +def test_retrieve_personal_keyword_skips_non_dict_rows(): + # A corrupted personal index can hold non-dict rows (partial write, bad + # import). The old loop did f["chunks"] which raised TypeError on a str + # row and aborted the whole search; now bad rows are skipped. + index = [ + "bad-row", + None, + ["also", "bad"], + {"name": "report.txt", "chunks": ["hello world from the quarterly report"]}, + ] + out = retrieve_personal_keyword(index, "hello", k=5) + assert out == ["[report.txt :: chunk 1]\nhello world from the quarterly report"] + + +def test_retrieve_personal_keyword_tolerates_missing_chunks_key(): + index = [{"name": "empty.txt"}, {"name": "doc.txt", "chunks": ["alpha beta gamma"]}] + out = retrieve_personal_keyword(index, "beta", k=5) + assert out == ["[doc.txt :: chunk 1]\nalpha beta gamma"]