diff --git a/src/rag_vector.py b/src/rag_vector.py index f1df5b8..7faad3d 100644 --- a/src/rag_vector.py +++ b/src/rag_vector.py @@ -27,8 +27,14 @@ KEYWORD_WEIGHT = 0.3 COLLECTION_NAME = "odysseus_rag" -def _generate_doc_id(text: str) -> str: - return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}" +def _generate_doc_id(text: str, owner: str = "") -> str: + # Owner-scope the id so two owners can index byte-identical chunks + # without the second one's add early-returning on the first's id and + # being silently dropped from their owner-filtered search results. + # Empty owner reproduces the legacy text-only id so the unowned/base + # index keeps its existing ids and isn't re-churned. + key = f"{owner}\x00{text}" if owner else text + return f"doc_{hashlib.sha256(key.encode('utf-8')).hexdigest()[:16]}" class VectorRAG: @@ -104,7 +110,7 @@ class VectorRAG: return False try: - doc_id = _generate_doc_id(text) + doc_id = _generate_doc_id(text, metadata.get("owner") or "") # Check if already exists existing = self._collection.get(ids=[doc_id]) if existing["ids"]: @@ -140,7 +146,7 @@ class VectorRAG: new_metas = [] new_ids = [] for t, m in valid: - doc_id = _generate_doc_id(t) + doc_id = _generate_doc_id(t, m.get("owner") or "") existing = self._collection.get(ids=[doc_id]) if not existing["ids"]: new_texts.append(t)