fix(rag): use a stable hash for document IDs so dedup survives restarts (#1098)
add_document() and add_documents_batch() derive the persistent ChromaDB
document id from Python's built-in hash():
doc_id = f"doc_{hash(text) % 10**16}"
str hashing is randomized per process (hash randomization is on by default
unless PYTHONHASHSEED is set to a fixed value), so the same document text
gets a different doc_id on every restart. The dedup
check right after — self._collection.get(ids=[doc_id]) — therefore misses
on restart, and identical documents are re-embedded and re-added as
duplicates each time the app restarts, bloating the vector store and
skewing retrieval.
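Derive the id from a stable hashlib.sha256 of the text via a shared
_generate_doc_id() helper, used by both add paths so they agree.
Note: entries already stored under the old hash()-based ids are not
migrated; because the dedup check matches on id, a document added before
this change will be added once more under its new, stable id.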
tests/test_rag_vector_id_stability.py runs _generate_doc_id in subprocesses
under PYTHONHASHSEED=0/1/random and asserts the id is identical across all
of them (and differs for different text). Fails before this change.
This commit is contained in:
@@ -7,6 +7,7 @@ configurable embedding endpoint via EMBEDDING_URL env var.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import re
|
||||
import logging
|
||||
import numpy as np
|
||||
@@ -26,6 +27,10 @@ KEYWORD_WEIGHT = 0.3
|
||||
COLLECTION_NAME = "odysseus_rag"
|
||||
|
||||
|
||||
def _generate_doc_id(text: str) -> str:
|
||||
return f"doc_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
|
||||
|
||||
|
||||
class VectorRAG:
|
||||
"""RAG system using ChromaDB vector storage with hybrid search."""
|
||||
|
||||
@@ -99,7 +104,7 @@ class VectorRAG:
|
||||
return False
|
||||
|
||||
try:
|
||||
doc_id = f"doc_{hash(text) % 10**16}"
|
||||
doc_id = _generate_doc_id(text)
|
||||
# Check if already exists
|
||||
existing = self._collection.get(ids=[doc_id])
|
||||
if existing["ids"]:
|
||||
@@ -135,7 +140,7 @@ class VectorRAG:
|
||||
new_metas = []
|
||||
new_ids = []
|
||||
for t, m in valid:
|
||||
doc_id = f"doc_{hash(t) % 10**16}"
|
||||
doc_id = _generate_doc_id(t)
|
||||
existing = self._collection.get(ids=[doc_id])
|
||||
if not existing["ids"]:
|
||||
new_texts.append(t)
|
||||
|
||||
28
tests/test_rag_vector_id_stability.py
Normal file
28
tests/test_rag_vector_id_stability.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import os
|
||||
import subprocess
|
||||
import pytest
|
||||
|
||||
def test_rag_id_stability_across_processes():
    """Check that ``_generate_doc_id`` does not depend on ``PYTHONHASHSEED``.

    The helper is run in fresh interpreter processes with
    ``PYTHONHASHSEED`` set to ``0``, ``1`` and ``random``. The printed id
    must be identical in all three, and a different input text must
    produce a different id.
    """
    # Local import: use the interpreter that is running the test suite
    # instead of a hard-coded "./venv/bin/python", which only exists when a
    # venv of that name sits in the current working directory.
    import sys

    # `python -c` puts the working directory on sys.path, so the child must
    # start in the directory that contains the `src` package.
    # NOTE(review): assumes this file lives in <repo>/tests/ — confirm.
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def generate_in_subprocess(text, hash_seed):
        """Return the id printed by a child interpreter for *text*."""
        env = os.environ.copy()
        env["PYTHONHASHSEED"] = hash_seed
        code = (
            "from src.rag_vector import _generate_doc_id; "
            f"print(_generate_doc_id({text!r}))"
        )
        output = subprocess.check_output(
            [sys.executable, "-c", code], env=env, cwd=repo_root
        )
        return output.decode().strip()

    id0 = generate_in_subprocess("test_text_hash", "0")
    id1 = generate_in_subprocess("test_text_hash", "1")
    id_rand = generate_in_subprocess("test_text_hash", "random")

    # Assert they are all equal (deterministic across seeds and processes)
    assert id0 == id1
    assert id0 == id_rand

    # Assert different inputs produce different IDs
    id_diff = generate_in_subprocess("different_text_hash", "0")
    assert id0 != id_diff
|
||||
Reference in New Issue
Block a user