Skip malformed personal keyword index rows
Make personal keyword retrieval tolerate corrupted non-dict index entries and missing chunk lists, with regression coverage.
This commit is contained in:
@@ -134,10 +134,12 @@ def retrieve_personal_keyword(personal_index: List[Dict], query: str, k: int = 5
|
|||||||
|
|
||||||
scored = []
|
scored = []
|
||||||
for f in personal_index:
|
for f in personal_index:
|
||||||
for idx, ch in enumerate(f["chunks"]):
|
if not isinstance(f, dict):
|
||||||
|
continue
|
||||||
|
for idx, ch in enumerate(f.get("chunks") or []):
|
||||||
score = len(q & tokenize(ch))
|
score = len(q & tokenize(ch))
|
||||||
if score > 0:
|
if score > 0:
|
||||||
scored.append((score, f["name"], idx, ch))
|
scored.append((score, f.get("name", ""), idx, ch))
|
||||||
scored.sort(key=lambda x: x[0], reverse=True)
|
scored.sort(key=lambda x: x[0], reverse=True)
|
||||||
|
|
||||||
out = []
|
out = []
|
||||||
|
|||||||
21
tests/test_personal_docs_keyword_nondict.py
Normal file
21
tests/test_personal_docs_keyword_nondict.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from src.personal_docs import retrieve_personal_keyword
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_personal_keyword_skips_non_dict_rows():
    """Non-dict rows in the index are skipped rather than aborting the search."""
    # Regression: a damaged personal index (partial write, bad import) may
    # contain rows that are not dicts. Subscripting such a row with
    # f["chunks"] raised TypeError on a str row and killed the whole lookup;
    # the retrieval loop is now expected to ignore those rows.
    valid_row = {
        "name": "report.txt",
        "chunks": ["hello world from the quarterly report"],
    }
    corrupted_index = ["bad-row", None, ["also", "bad"], valid_row]

    results = retrieve_personal_keyword(corrupted_index, "hello", k=5)

    assert results == ["[report.txt :: chunk 1]\nhello world from the quarterly report"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_personal_keyword_tolerates_missing_chunks_key():
    """A row without a "chunks" key yields no hits and does not break retrieval."""
    row_without_chunks = {"name": "empty.txt"}
    row_with_chunks = {"name": "doc.txt", "chunks": ["alpha beta gamma"]}

    results = retrieve_personal_keyword(
        [row_without_chunks, row_with_chunks], "beta", k=5
    )

    assert results == ["[doc.txt :: chunk 1]\nalpha beta gamma"]
|
||||||
Reference in New Issue
Block a user