Skip malformed personal keyword index rows

Make personal keyword retrieval tolerate corrupted non-dict index entries and missing chunk lists, with regression coverage.
This commit is contained in:
Afonso Coutinho
2026-06-03 05:42:05 +01:00
committed by GitHub
parent 61d62a3cb8
commit a880b17624
2 changed files with 25 additions and 2 deletions

View File

@@ -0,0 +1,21 @@
from src.personal_docs import retrieve_personal_keyword
def test_retrieve_personal_keyword_skips_non_dict_rows():
    """Non-dict rows in the index are skipped instead of aborting the search.

    Regression: a corrupted personal index (partial write, bad import) can
    hold rows that are not dicts. The previous loop indexed every row with
    ``f["chunks"]``, which raised TypeError on a str row and aborted the
    whole search.
    """
    valid_row = {
        "name": "report.txt",
        "chunks": ["hello world from the quarterly report"],
    }
    # One garbage row per non-dict shape exercised here: str, None, list.
    corrupted_rows = ["bad-row", None, ["also", "bad"]]
    expected = ["[report.txt :: chunk 1]\nhello world from the quarterly report"]

    hits = retrieve_personal_keyword([*corrupted_rows, valid_row], "hello", k=5)

    assert hits == expected
def test_retrieve_personal_keyword_tolerates_missing_chunks_key():
    """A row lacking the ``chunks`` key does not stop later rows matching."""
    row_without_chunks = {"name": "empty.txt"}
    row_with_chunks = {"name": "doc.txt", "chunks": ["alpha beta gamma"]}

    hits = retrieve_personal_keyword(
        [row_without_chunks, row_with_chunks], "beta", k=5
    )

    assert hits == ["[doc.txt :: chunk 1]\nalpha beta gamma"]