fix(rag): use a stable hash for document IDs so dedup survives restarts (#1098)

add_document() and add_documents_batch() derive the persistent ChromaDB document id from Python's built-in hash(): doc_id = f"doc_{hash(text) % 10**16}". However, str hashing is randomized per process (hash randomization is on by default unless PYTHONHASHSEED is set to a fixed value), so the same document text gets a different doc_id on every restart. The dedup check right after — self._collection.get(ids=[doc_id]) — therefore misses on restart, and identical documents are re-embedded and re-added as duplicates each time the app restarts, bloating the vector store and skewing retrieval. This change derives the id from a stable hashlib.sha256 digest of the text via a shared _generate_doc_id() helper, used by both add paths so they agree. tests/test_rag_vector_id_stability.py runs _generate_doc_id in subprocesses under PYTHONHASHSEED=0/1/random and asserts that the id is identical across all of them (and differs for different text). The test fails before this change.
2026-06-02 20:42:23 +07:00
parent ff93a6c63b
commit dc8a882f1f
2 changed files with 35 additions and 2 deletions
--- a/src/rag_vector.py
+++ b/src/rag_vector.py
@@ -7,6 +7,7 @@ configurable embedding endpoint via EMBEDDING_URL env var.
 """

 import os
+import hashlib
 import re
 import logging
 import numpy as np
@@ -26,6 +27,10 @@ KEYWORD_WEIGHT = 0.3
 COLLECTION_NAME = "odysseus_rag"


+def _generate_doc_id(text: str) -> str:
+    return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
+
+
 class VectorRAG:
    """RAG system using ChromaDB vector storage with hybrid search."""

@@ -99,7 +104,7 @@ class VectorRAG:
            return False

        try:
-            doc_id = f"doc_{hash(text) % 10**16}"
+            doc_id = _generate_doc_id(text)
            # Check if already exists
            existing = self._collection.get(ids=[doc_id])
            if existing["ids"]:
@@ -135,7 +140,7 @@ class VectorRAG:
            new_metas = []
            new_ids = []
            for t, m in valid:
-                doc_id = f"doc_{hash(t) % 10**16}"
+                doc_id = _generate_doc_id(t)
                existing = self._collection.get(ids=[doc_id])
                if not existing["ids"]:
                    new_texts.append(t)