Owner-scope RAG doc ids so identical chunks across users don't collide (#1738, #1760)

_generate_doc_id hashed only text. add_document / add_documents_batch
early-return when the id exists, so the second owner indexing a
byte-identical chunk hit the first owner's id, was silently dropped,
and never stored under their owner — their owner-filtered search then
quietly omitted it. Hash owner + text; empty owner reproduces the
legacy id, so the unowned/base index keeps existing ids and isn't
re-churned. Same-owner identical chunks still dedupe.

Caught by #1738 and #1760 (independent reports of the same bug).
This commit is contained in:
pewdiepie-archdaemon
2026-06-03 11:36:31 +09:00
parent 8e2b9baf19
commit ed7956cbd3

View File

@@ -27,8 +27,14 @@ KEYWORD_WEIGHT = 0.3
COLLECTION_NAME = "odysseus_rag"
def _generate_doc_id(text: str) -> str:
    """Return a deterministic document id derived only from the chunk text.

    The id is the literal prefix ``doc_`` followed by the first 16 hex
    characters of the SHA-256 digest of the UTF-8 encoded text, so the
    same text always maps to the same id.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    short_digest = hasher.hexdigest()[:16]
    return "doc_" + short_digest
def _generate_doc_id(text: str, owner: str = "") -> str:
    """Return a deterministic, owner-scoped document id for a text chunk.

    The id is ``doc_`` plus the first 16 hex characters of a SHA-256
    digest. With a non-empty ``owner`` the digest covers the owner, a NUL
    separator and the text, so byte-identical chunks indexed by different
    owners receive different ids, while identical chunks from the same
    owner still map to one id. With an empty ``owner`` the digest covers
    the text alone, which yields exactly the text-only id.

    Args:
        text: Chunk content to identify.
        owner: Owner identifier; a falsy value selects the text-only id.

    Returns:
        The 20-character id string (``doc_`` + 16 hex digits).
    """
    if not owner:
        # Unowned chunk: hash the bare text so the id matches the
        # text-only scheme.
        payload = text
    else:
        # NUL-delimit owner and text so the owner prefix cannot blend into
        # the start of the text.
        payload = f"{owner}\x00{text}"
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    short_digest = hasher.hexdigest()[:16]
    return "doc_" + short_digest
class VectorRAG:
@@ -104,7 +110,7 @@ class VectorRAG:
return False
try:
doc_id = _generate_doc_id(text)
doc_id = _generate_doc_id(text, metadata.get("owner") or "")
# Check if already exists
existing = self._collection.get(ids=[doc_id])
if existing["ids"]:
@@ -140,7 +146,7 @@ class VectorRAG:
new_metas = []
new_ids = []
for t, m in valid:
doc_id = _generate_doc_id(t)
doc_id = _generate_doc_id(t, m.get("owner") or "")
existing = self._collection.get(ids=[doc_id])
if not existing["ids"]:
new_texts.append(t)