diff --git a/src/rag_vector.py b/src/rag_vector.py index 1f414e5..5f2b880 100644 --- a/src/rag_vector.py +++ b/src/rag_vector.py @@ -260,8 +260,11 @@ class VectorRAG: for i, doc in enumerate(all_docs["documents"]): meta = all_docs["metadatas"][i] if owner: - doc_owner = meta.get("owner") - if doc_owner and doc_owner != owner: + # Match the primary path's strict where={"owner": owner} + # filter. The old `if doc_owner and doc_owner != owner` + # let docs with a missing/empty owner fall through, leaking + # owner-less documents into another user's results. + if meta.get("owner") != owner: continue doc_lower = doc.lower() score = sum(1 for w in query_words if w in doc_lower) diff --git a/tests/test_rag_keyword_fallback_owner.py b/tests/test_rag_keyword_fallback_owner.py new file mode 100644 index 0000000..e030ea3 --- /dev/null +++ b/tests/test_rag_keyword_fallback_owner.py @@ -0,0 +1,57 @@ +"""Regression: VectorRAG._keyword_search_fallback must not leak owner-less docs +across users. + +The primary hybrid search filters with ChromaDB ``where={"owner": owner}``, +which returns only documents whose ``owner == owner`` (documents with no owner +are excluded). The keyword fallback used +``if doc_owner and doc_owner != owner: continue``, so a document with a +missing/empty owner fell through the guard and was returned to whichever user +issued the query — a cross-user leak whenever the primary path errored and fell +back to keyword search. 
from src.rag_vector import VectorRAG


class _FakeCollection:
    """In-memory stand-in for the collection object held by ``VectorRAG``.

    Implements only ``count()`` and ``get()``. ``get()`` returns a
    column-oriented mapping with ``ids``, ``documents`` and ``metadatas``
    lists, all in the same order as the tuples passed to the constructor.
    """

    def __init__(self, docs):
        # docs: list of (id, text, metadata) tuples.
        self._docs = docs

    def count(self):
        """Return the number of stored documents."""
        return len(self._docs)

    def get(self, include=None):
        """Return every stored document; ``include`` is accepted but ignored."""
        return {
            "ids": [d[0] for d in self._docs],
            "documents": [d[1] for d in self._docs],
            "metadatas": [d[2] for d in self._docs],
        }


def _store(docs):
    """Build a ``VectorRAG`` whose ``_collection`` is a fake holding *docs*.

    ``VectorRAG.__new__`` is used so ``__init__`` never runs; the only
    attribute populated on the instance is ``_collection``.
    """
    store = VectorRAG.__new__(VectorRAG)
    store._collection = _FakeCollection(docs)
    return store


def test_ownerless_doc_not_leaked_to_user():
    """An owner-filtered fallback search returns only that owner's documents.

    Covers both ways a document can lack a usable owner: the ``owner`` key is
    absent, or it is present but empty. The previous truthiness-based guard
    skipped the comparison in both cases.
    """
    store = _store([
        ("a", "alice secret project", {"owner": "alice"}),
        ("b", "bob secret project", {"owner": "bob"}),
        ("c", "ownerless secret project", {}),  # no owner key
        ("d", "blank-owner secret project", {"owner": ""}),  # empty owner value
    ])
    results = store._keyword_search_fallback("secret project", k=10, owner="alice")
    ids = {r["id"] for r in results}
    assert ids == {"a"}  # only alice's doc
    # The per-id checks below are implied by the equality above; they are kept
    # so a failure names the exact document that leaked.
    assert "b" not in ids  # another user's doc excluded (already was)
    assert "c" not in ids  # owner-less doc must NOT leak (the fix)
    assert "d" not in ids  # empty-owner doc must NOT leak either (the fix)


def test_no_owner_filter_returns_all():
    """With ``owner=None`` the fallback applies no owner filtering at all."""
    store = _store([
        ("a", "shared note", {"owner": "alice"}),
        ("c", "shared note", {}),
        ("d", "shared note", {"owner": ""}),
    ])
    results = store._keyword_search_fallback("shared note", k=10, owner=None)
    ids = {r["id"] for r in results}
    assert ids == {"a", "c", "d"}  # no owner requested → no filtering