From ed7956cbd3bb880c2eaa610709e16af882251ff5 Mon Sep 17 00:00:00 2001
From: pewdiepie-archdaemon
Date: Wed, 3 Jun 2026 11:36:31 +0900
Subject: [PATCH] Owner-scope RAG doc ids so identical chunks across users don't collide (#1738, #1760)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_generate_doc_id hashed only text. add_document / add_documents_batch
early-return when the id exists, so the second owner indexing a
byte-identical chunk hit the first owner's id, was silently dropped, and
never stored under their owner — their owner-filtered search then
quietly omitted it.

Hash owner + text; empty owner reproduces the legacy id, so the
unowned/base index keeps existing ids and isn't re-churned. Same-owner
identical chunks still dedupe.

Caught by #1738 and #1760 (independent reports of the same bug).
---
 src/rag_vector.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/rag_vector.py b/src/rag_vector.py
index f1df5b8..7faad3d 100644
--- a/src/rag_vector.py
+++ b/src/rag_vector.py
@@ -27,8 +27,14 @@ KEYWORD_WEIGHT = 0.3
 COLLECTION_NAME = "odysseus_rag"
 
 
-def _generate_doc_id(text: str) -> str:
-    return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
+def _generate_doc_id(text: str, owner: str = "") -> str:
+    # Owner-scope the id so two owners can index byte-identical chunks
+    # without the second one's add early-returning on the first's id and
+    # being silently dropped from their owner-filtered search results.
+    # Empty owner reproduces the legacy text-only id so the unowned/base
+    # index keeps its existing ids and isn't re-churned.
+    key = f"{owner}\x00{text}" if owner else text
+    return f"doc_{hashlib.sha256(key.encode('utf-8')).hexdigest()[:16]}"
 
 
 class VectorRAG:
@@ -104,7 +110,7 @@ class VectorRAG:
             return False
 
         try:
-            doc_id = _generate_doc_id(text)
+            doc_id = _generate_doc_id(text, metadata.get("owner") or "")
             # Check if already exists
             existing = self._collection.get(ids=[doc_id])
             if existing["ids"]:
@@ -140,7 +146,7 @@ class VectorRAG:
             new_metas = []
             new_ids = []
             for t, m in valid:
-                doc_id = _generate_doc_id(t)
+                doc_id = _generate_doc_id(t, m.get("owner") or "")
                 existing = self._collection.get(ids=[doc_id])
                 if not existing["ids"]:
                     new_texts.append(t)