fix: RAG keyword fallback leaked owner-less documents across users (#1722)
VectorRAG.search() filters with ChromaDB where={"owner": owner}, returning only
documents whose owner equals the requesting user. The keyword fallback
(_keyword_search_fallback, used when the primary query raises) guarded with
`if doc_owner and doc_owner != owner: continue`, so a document with a
missing/empty owner fell through and was returned to whichever user issued the
query — a cross-user information leak on the fallback path.
Match the primary path's strict filter: skip any doc whose owner != the
requested owner, including owner-less docs.
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -260,8 +260,11 @@ class VectorRAG:
|
|||||||
for i, doc in enumerate(all_docs["documents"]):
|
for i, doc in enumerate(all_docs["documents"]):
|
||||||
meta = all_docs["metadatas"][i]
|
meta = all_docs["metadatas"][i]
|
||||||
if owner:
|
if owner:
|
||||||
doc_owner = meta.get("owner")
|
# Match the primary path's strict where={"owner": owner}
|
||||||
if doc_owner and doc_owner != owner:
|
# filter. The old `if doc_owner and doc_owner != owner`
|
||||||
|
# let docs with a missing/empty owner fall through, leaking
|
||||||
|
# owner-less documents into another user's results.
|
||||||
|
if meta.get("owner") != owner:
|
||||||
continue
|
continue
|
||||||
doc_lower = doc.lower()
|
doc_lower = doc.lower()
|
||||||
score = sum(1 for w in query_words if w in doc_lower)
|
score = sum(1 for w in query_words if w in doc_lower)
|
||||||
|
|||||||
57
tests/test_rag_keyword_fallback_owner.py
Normal file
57
tests/test_rag_keyword_fallback_owner.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
"""Regression: VectorRAG._keyword_search_fallback must not leak owner-less docs
|
||||||
|
across users.
|
||||||
|
|
||||||
|
The primary hybrid search filters with ChromaDB ``where={"owner": owner}``,
|
||||||
|
which returns only documents whose ``owner == owner`` (documents with no owner
|
||||||
|
are excluded). The keyword fallback used
|
||||||
|
``if doc_owner and doc_owner != owner: continue``, so a document with a
|
||||||
|
missing/empty owner fell through the guard and was returned to whichever user
|
||||||
|
issued the query — a cross-user leak whenever the primary path errored and fell
|
||||||
|
back to keyword search.
|
||||||
|
"""
|
||||||
|
from src.rag_vector import VectorRAG
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeCollection:
|
||||||
|
def __init__(self, docs):
|
||||||
|
# docs: list of (id, text, metadata)
|
||||||
|
self._docs = docs
|
||||||
|
|
||||||
|
def count(self):
|
||||||
|
return len(self._docs)
|
||||||
|
|
||||||
|
def get(self, include=None):
|
||||||
|
return {
|
||||||
|
"ids": [d[0] for d in self._docs],
|
||||||
|
"documents": [d[1] for d in self._docs],
|
||||||
|
"metadatas": [d[2] for d in self._docs],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _store(docs):
|
||||||
|
store = VectorRAG.__new__(VectorRAG)
|
||||||
|
store._collection = _FakeCollection(docs)
|
||||||
|
return store
|
||||||
|
|
||||||
|
|
||||||
|
def test_ownerless_doc_not_leaked_to_user():
|
||||||
|
store = _store([
|
||||||
|
("a", "alice secret project", {"owner": "alice"}),
|
||||||
|
("b", "bob secret project", {"owner": "bob"}),
|
||||||
|
("c", "ownerless secret project", {}), # no owner key
|
||||||
|
])
|
||||||
|
results = store._keyword_search_fallback("secret project", k=10, owner="alice")
|
||||||
|
ids = {r["id"] for r in results}
|
||||||
|
assert ids == {"a"} # only alice's doc
|
||||||
|
assert "b" not in ids # another user's doc excluded (already was)
|
||||||
|
assert "c" not in ids # owner-less doc must NOT leak (the fix)
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_owner_filter_returns_all():
|
||||||
|
store = _store([
|
||||||
|
("a", "shared note", {"owner": "alice"}),
|
||||||
|
("c", "shared note", {}),
|
||||||
|
])
|
||||||
|
results = store._keyword_search_fallback("shared note", k=10, owner=None)
|
||||||
|
ids = {r["id"] for r in results}
|
||||||
|
assert ids == {"a", "c"} # no owner requested → no filtering
|
||||||
Reference in New Issue
Block a user