_generate_doc_id hashed only the chunk text, so byte-identical chunks indexed by different owners received the same id. add_document / add_documents_batch early-return when the id already exists, so the second owner indexing a byte-identical chunk hit the first owner's id, was silently dropped, and was never stored under their owner — their owner-filtered search then quietly omitted it. Fix: hash owner + text. An empty owner reproduces the legacy text-only id, so the unowned/base index keeps its existing ids and isn't re-churned. Identical chunks from the same owner still dedupe. Caught by #1738 and #1760 (independent reports of the same bug).
This commit is contained in:
@@ -27,8 +27,14 @@ KEYWORD_WEIGHT = 0.3
|
|||||||
COLLECTION_NAME = "odysseus_rag"
|
COLLECTION_NAME = "odysseus_rag"
|
||||||
|
|
||||||
|
|
||||||
def _generate_doc_id(text: str) -> str:
|
def _generate_doc_id(text: str, owner: str = "") -> str:
|
||||||
return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
|
# Owner-scope the id so two owners can index byte-identical chunks
|
||||||
|
# without the second one's add early-returning on the first's id and
|
||||||
|
# being silently dropped from their owner-filtered search results.
|
||||||
|
# Empty owner reproduces the legacy text-only id so the unowned/base
|
||||||
|
# index keeps its existing ids and isn't re-churned.
|
||||||
|
key = f"{owner}\x00{text}" if owner else text
|
||||||
|
return f"doc_{hashlib.sha256(key.encode('utf-8')).hexdigest()[:16]}"
|
||||||
|
|
||||||
|
|
||||||
class VectorRAG:
|
class VectorRAG:
|
||||||
@@ -104,7 +110,7 @@ class VectorRAG:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
doc_id = _generate_doc_id(text)
|
doc_id = _generate_doc_id(text, metadata.get("owner") or "")
|
||||||
# Check if already exists
|
# Check if already exists
|
||||||
existing = self._collection.get(ids=[doc_id])
|
existing = self._collection.get(ids=[doc_id])
|
||||||
if existing["ids"]:
|
if existing["ids"]:
|
||||||
@@ -140,7 +146,7 @@ class VectorRAG:
|
|||||||
new_metas = []
|
new_metas = []
|
||||||
new_ids = []
|
new_ids = []
|
||||||
for t, m in valid:
|
for t, m in valid:
|
||||||
doc_id = _generate_doc_id(t)
|
doc_id = _generate_doc_id(t, m.get("owner") or "")
|
||||||
existing = self._collection.get(ids=[doc_id])
|
existing = self._collection.get(ids=[doc_id])
|
||||||
if not existing["ids"]:
|
if not existing["ids"]:
|
||||||
new_texts.append(t)
|
new_texts.append(t)
|
||||||
|
|||||||
Reference in New Issue
Block a user