From dc8a882f1f88e4936b81d182a9b5e8a48559947d Mon Sep 17 00:00:00 2001 From: Tatlatat Date: Tue, 2 Jun 2026 20:42:23 +0700 Subject: [PATCH] fix(rag): use a stable hash for document IDs so dedup survives restarts (#1098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add_document() and add_documents_batch() derive the persistent ChromaDB document id from Python's built-in hash(): doc_id = f"doc_{hash(text) % 10**16}" str hashing is randomized per process (PYTHONHASHSEED is on by default), so the same document text gets a different doc_id on every restart. The dedup check right after — self._collection.get(ids=[doc_id]) — therefore misses on restart, and identical documents are re-embedded and re-added as duplicates each time the app restarts, bloating the vector store and skewing retrieval. Derive the id from a stable hashlib.sha256 of the text via a shared _generate_doc_id() helper, used by both add paths so they agree. tests/test_rag_vector_id_stability.py runs _generate_doc_id in subprocesses under PYTHONHASHSEED=0/1/random and asserts the id is identical across all of them (and differs for different text). Fails before this change. --- src/rag_vector.py | 9 +++++++-- tests/test_rag_vector_id_stability.py | 28 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 tests/test_rag_vector_id_stability.py diff --git a/src/rag_vector.py b/src/rag_vector.py index fcb27c1..f1df5b8 100644 --- a/src/rag_vector.py +++ b/src/rag_vector.py @@ -7,6 +7,7 @@ configurable embedding endpoint via EMBEDDING_URL env var. """ import os +import hashlib import re import logging import numpy as np @@ -26,6 +27,10 @@ KEYWORD_WEIGHT = 0.3 COLLECTION_NAME = "odysseus_rag" +def _generate_doc_id(text: str) -> str: + return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}" + + class VectorRAG: """RAG system using ChromaDB vector storage with hybrid search.""" @@ -99,7 +104,7 @@ class VectorRAG: return False try: - doc_id = f"doc_{hash(text) % 10**16}" + doc_id = _generate_doc_id(text) # Check if already exists existing = self._collection.get(ids=[doc_id]) if existing["ids"]: @@ -135,7 +140,7 @@ class VectorRAG: new_metas = [] new_ids = [] for t, m in valid: - doc_id = f"doc_{hash(t) % 10**16}" + doc_id = _generate_doc_id(t) existing = self._collection.get(ids=[doc_id]) if not existing["ids"]: new_texts.append(t) diff --git a/tests/test_rag_vector_id_stability.py b/tests/test_rag_vector_id_stability.py new file mode 100644 index 0000000..c9d2656 --- /dev/null +++ b/tests/test_rag_vector_id_stability.py @@ -0,0 +1,28 @@ +import os +import subprocess +import pytest + +def test_rag_id_stability_across_processes(): + # Run helper in subprocesses with different PYTHONHASHSEED values to ensure cross-process stability + cmd = ["./venv/bin/python", "-c", "from src.rag_vector import _generate_doc_id; print(_generate_doc_id('test_text_hash'))"] + + env0 = os.environ.copy() + env0["PYTHONHASHSEED"] = "0" + id0 = subprocess.check_output(cmd, env=env0).decode().strip() + + env1 = os.environ.copy() + env1["PYTHONHASHSEED"] = "1" + id1 = subprocess.check_output(cmd, env=env1).decode().strip() + + env_rand = os.environ.copy() + env_rand["PYTHONHASHSEED"] = "random" + id_rand = subprocess.check_output(cmd, env=env_rand).decode().strip() + + # Assert they are all equal (deterministic across seeds and processes) + assert id0 == id1 + assert id0 == id_rand + + # Assert different inputs produce different IDs + cmd_diff = ["./venv/bin/python", "-c", "from src.rag_vector import _generate_doc_id; print(_generate_doc_id('different_text_hash'))"] + id_diff = subprocess.check_output(cmd_diff, env=env0).decode().strip() + assert id0 != id_diff