From dc8a882f1f88e4936b81d182a9b5e8a48559947d Mon Sep 17 00:00:00 2001
From: Tatlatat <hungdotmn@gmail.com>
Date: Tue, 2 Jun 2026 20:42:23 +0700
Subject: [PATCH] fix(rag): use a stable hash for document IDs so dedup
 survives restarts (#1098)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

add_document() and add_documents_batch() derive the persistent ChromaDB
document id from Python's built-in hash():

    doc_id = f"doc_{hash(text) % 10**16}"

str hashing is randomized per process (hash randomization is on by default
unless PYTHONHASHSEED pins a fixed seed), so the same document text gets a
different doc_id on every restart. The dedup check right after —
self._collection.get(ids=[doc_id]) — therefore misses on restart, and
identical documents are re-embedded and re-added as duplicates each time
the app restarts, bloating the vector store and skewing retrieval.

Derive the id from a stable hashlib.sha256 of the text via a shared
_generate_doc_id() helper, used by both add paths so they agree.

tests/test_rag_vector_id_stability.py runs _generate_doc_id in subprocesses
under PYTHONHASHSEED=0/1/random and asserts the id is identical across all
of them (and differs for different text). Fails before this change.
---
 src/rag_vector.py                     |  9 +++++++--
 tests/test_rag_vector_id_stability.py | 28 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_rag_vector_id_stability.py

diff --git a/src/rag_vector.py b/src/rag_vector.py
index fcb27c1..f1df5b8 100644
--- a/src/rag_vector.py
+++ b/src/rag_vector.py
@@ -7,6 +7,7 @@ configurable embedding endpoint via EMBEDDING_URL env var.
 """
 
 import os
+import hashlib
 import re
 import logging
 import numpy as np
@@ -26,6 +27,10 @@ KEYWORD_WEIGHT = 0.3
 COLLECTION_NAME = "odysseus_rag"
 
 
+def _generate_doc_id(text: str) -> str:  # "doc_" + first 16 hex digits of SHA-256 over the UTF-8 text
+    return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
+
+
 class VectorRAG:
     """RAG system using ChromaDB vector storage with hybrid search."""
 
@@ -99,7 +104,7 @@ class VectorRAG:
             return False
 
         try:
-            doc_id = f"doc_{hash(text) % 10**16}"
+            doc_id = _generate_doc_id(text)
             # Check if already exists
             existing = self._collection.get(ids=[doc_id])
             if existing["ids"]:
@@ -135,7 +140,7 @@ class VectorRAG:
             new_metas = []
             new_ids = []
             for t, m in valid:
-                doc_id = f"doc_{hash(t) % 10**16}"
+                doc_id = _generate_doc_id(t)
                 existing = self._collection.get(ids=[doc_id])
                 if not existing["ids"]:
                     new_texts.append(t)
diff --git a/tests/test_rag_vector_id_stability.py b/tests/test_rag_vector_id_stability.py
new file mode 100644
index 0000000..c9d2656
--- /dev/null
+++ b/tests/test_rag_vector_id_stability.py
@@ -0,0 +1,28 @@
+import os
+import subprocess
+import pytest
+
+def test_rag_id_stability_across_processes():
+    """_generate_doc_id must not depend on the per-process str hash seed."""
+    import sys  # local import: interpreter path for the child processes
+
+    def doc_id(text, seed):
+        """Return _generate_doc_id(text) computed in a child with PYTHONHASHSEED=seed."""
+        code = (
+            "from src.rag_vector import _generate_doc_id; "
+            f"print(_generate_doc_id({text!r}))"
+        )
+        env = {**os.environ, "PYTHONHASHSEED": seed}
+        # sys.executable is the interpreter running pytest, so no ./venv path is assumed.
+        return subprocess.check_output([sys.executable, "-c", code], env=env).decode().strip()
+
+    id0 = doc_id("test_text_hash", "0")
+    id1 = doc_id("test_text_hash", "1")
+    id_rand = doc_id("test_text_hash", "random")
+
+    # Same text must map to the same id regardless of the hash seed.
+    assert id0 == id1
+    assert id0 == id_rand
+
+    # Different text must map to a different id.
+    assert id0 != doc_id("different_text_hash", "0")