add_document() and add_documents_batch() derive the persistent ChromaDB
document id from Python's built-in hash():
doc_id = f"doc_{hash(text) % 10**16}"
str hashing is randomized per process (hash randomization is on by default unless PYTHONHASHSEED is set to a fixed value), so
the same document text gets a different doc_id on every restart. The dedup
check right after — self._collection.get(ids=[doc_id]) — therefore misses
on restart, and identical documents are re-embedded and re-added as
duplicates each time the app restarts, bloating the vector store and
skewing retrieval.
Derive the id from a stable hashlib.sha256 of the text via a shared
_generate_doc_id() helper, used by both add paths so they agree.
tests/test_rag_vector_id_stability.py runs _generate_doc_id in subprocesses
under PYTHONHASHSEED=0/1/random and asserts the id is identical across all
of them (and differs for different text). Fails before this change.
29 lines
1.1 KiB
Python
29 lines
1.1 KiB
Python
import os
|
|
import subprocess
|
|
import pytest
|
|
|
|
def test_rag_id_stability_across_processes():
    """Check that ``_generate_doc_id`` is stable across interpreter processes.

    The helper is run in fresh subprocesses with ``PYTHONHASHSEED`` set to
    ``0``, ``1`` and ``random``. All three must print the same id for the
    same text, and a different text must print a different id.
    """
    # Local import: spawn the interpreter that is running the tests instead of
    # a hard-coded "./venv/bin/python", which only exists for one checkout
    # layout and is resolved against the current working directory.
    import sys

    # The text travels through argv, so it needs no quoting inside the snippet.
    snippet = (
        "import sys; "
        "from src.rag_vector import _generate_doc_id; "
        "print(_generate_doc_id(sys.argv[1]))"
    )

    def doc_id_in_subprocess(text, hash_seed):
        """Return the id a child interpreter prints for *text*."""
        env = os.environ.copy()
        env["PYTHONHASHSEED"] = hash_seed
        output = subprocess.check_output(
            [sys.executable, "-c", snippet, text], env=env
        )
        return output.decode().strip()

    id0 = doc_id_in_subprocess("test_text_hash", "0")
    id1 = doc_id_in_subprocess("test_text_hash", "1")
    id_rand = doc_id_in_subprocess("test_text_hash", "random")

    # Same text must map to the same id regardless of hash seed or process.
    assert id0 == id1
    assert id0 == id_rand

    # Different text must map to a different id.
    id_diff = doc_id_in_subprocess("different_text_hash", "0")
    assert id0 != id_diff
|