_generate_doc_id hashed only the text. add_document / add_documents_batch early-return when the id already exists, so when a second owner indexed a byte-identical chunk, the chunk resolved to the first owner's id, was silently dropped, and was never stored under the second owner — whose owner-filtered search then quietly omitted it. Fix: hash owner + text. An empty owner reproduces the legacy text-only id, so the unowned/base index keeps its existing ids and isn't re-churned. Identical chunks from the same owner still dedupe. Caught by #1738 and #1760 (independent reports of the same bug).
This commit is contained in:
@@ -27,8 +27,14 @@ KEYWORD_WEIGHT = 0.3
|
||||
COLLECTION_NAME = "odysseus_rag"
|
||||
|
||||
|
||||
def _generate_doc_id(text: str) -> str:
|
||||
return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
|
||||
def _generate_doc_id(text: str, owner: str = "") -> str:
    """Return a deterministic document id of the form ``doc_<16 hex chars>``.

    The id is the first 16 hex digits of the SHA-256 digest of the UTF-8
    encoded hash material:

    * non-empty ``owner``: the owner, a NUL character, then the text, so
      byte-identical text indexed by different owners gets different ids
      and the second owner's add is not mistaken for an existing document;
    * empty ``owner``: the text alone, which reproduces the legacy
      text-only id so unowned documents keep the ids they already have.

    Args:
        text: The chunk content to identify.
        owner: Owner the chunk is indexed under; empty means unowned.

    Returns:
        The id string, ``"doc_"`` followed by 16 hexadecimal characters.
    """
    if owner:
        # NUL separates the two fields inside the hashed material.
        material = f"{owner}\x00{text}"
    else:
        material = text
    digest = hashlib.sha256(material.encode('utf-8')).hexdigest()
    return "doc_" + digest[:16]
|
||||
|
||||
|
||||
class VectorRAG:
|
||||
@@ -104,7 +110,7 @@ class VectorRAG:
|
||||
return False
|
||||
|
||||
try:
|
||||
doc_id = _generate_doc_id(text)
|
||||
doc_id = _generate_doc_id(text, metadata.get("owner") or "")
|
||||
# Check if already exists
|
||||
existing = self._collection.get(ids=[doc_id])
|
||||
if existing["ids"]:
|
||||
@@ -140,7 +146,7 @@ class VectorRAG:
|
||||
new_metas = []
|
||||
new_ids = []
|
||||
for t, m in valid:
|
||||
doc_id = _generate_doc_id(t)
|
||||
doc_id = _generate_doc_id(t, m.get("owner") or "")
|
||||
existing = self._collection.get(ids=[doc_id])
|
||||
if not existing["ids"]:
|
||||
new_texts.append(t)
|
||||
|
||||
Reference in New Issue
Block a user