Fix RAG remove_directory wiping the entire shared collection (#1660) (#1734)

Removing one RAG directory destroyed the whole shared ChromaDB collection
(all owners + base index) instead of just that directory's chunks. Shared
root cause: PersonalDocsManager.remove_directory called rebuild_index()
(delete_collection + recreate) then re-indexed only the remaining tracked
dirs (ownerless, never personal_dir). The targeted VectorRAG.remove_directory
that should have been used was itself broken (where={"source":{"$contains":dir}}
selects nothing on scalar metadata and would over-delete siblings), and the
dead do_manage_rag path fired a second unconditional rebuild.

- VectorRAG.remove_directory: select chunks in Python by a path-boundary match
  on the stored absolute `source` (dir or dir+os.sep), abspath-normalized.
  Keys on `source` (always written), never `owner` -- no migration.
- PersonalDocsManager.remove_directory: call the targeted remove instead of
  rebuild_index() + partial reindex.
- do_manage_rag (dead code): drop the second rebuild_index() (hygiene).
- rag_server.py add path: abspath so indexed `source` matches the remove.

No schema change. Prevents future wipes (does not recover already-wiped
vectors). Adds hermetic regression tests at three layers.

Fixes #1660

Co-authored-by: Ethan <23321960+0xLeathery@users.noreply.github.com>
This commit is contained in:
Ethan
2026-06-03 14:29:51 +10:00
committed by GitHub
parent 9964e9f3fb
commit 0e538ecd29
5 changed files with 199 additions and 21 deletions

View File

@@ -105,7 +105,9 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
directory = _dir.strip() if isinstance(_dir, str) else "" directory = _dir.strip() if isinstance(_dir, str) else ""
if not directory: if not directory:
return [TextContent(type="text", text="Error: add_directory needs a directory path")] return [TextContent(type="text", text="Error: add_directory needs a directory path")]
directory = os.path.expanduser(directory) # Store an absolute path so indexed `source` metadata is absolute and
# remove_directory (which abspath-normalizes) can match it later (#1660).
directory = os.path.abspath(os.path.expanduser(directory))
if not os.path.isdir(directory): if not os.path.isdir(directory):
return [TextContent(type="text", text=f"Error: Directory not found: {directory}")] return [TextContent(type="text", text=f"Error: Directory not found: {directory}")]
if not _rag_manager: if not _rag_manager:

View File

@@ -1232,9 +1232,11 @@ async def do_manage_rag(content: str, session_id: Optional[str] = None) -> Dict:
try: try:
if hasattr(_personal_docs_manager, 'remove_directory'): if hasattr(_personal_docs_manager, 'remove_directory'):
# Performs a targeted per-directory delete (#1660). The previous
# unconditional _rag_manager.rebuild_index() here wiped the whole
# collection on every remove (even for untracked dirs) and has
# been removed.
_personal_docs_manager.remove_directory(directory) _personal_docs_manager.remove_directory(directory)
if _rag_manager and hasattr(_rag_manager, 'rebuild_index'):
_rag_manager.rebuild_index()
return {"action": "remove_directory", "directory": directory, return {"action": "remove_directory", "directory": directory,
"results": f"Directory '{directory}' removed from RAG index"} "results": f"Directory '{directory}' removed from RAG index"}
except Exception as e: except Exception as e:

View File

@@ -306,18 +306,17 @@ class PersonalDocsManager:
# Refresh the index to exclude the removed directory # Refresh the index to exclude the removed directory
self.refresh_index() self.refresh_index()
# If RAG manager is available, we should rebuild the index # Targeted delete of just this directory's chunks. This previously
# This is a simple approach - in production you might want more sophisticated removal # called rag_manager.rebuild_index(), which delete+recreates the
# entire shared collection (every owner + the base index) and then
# re-indexed only the remaining tracked dirs — ownerless and never
# personal_dir — a catastrophic wipe (#1660). remove_directory now
# removes exactly this directory's chunks and leaves the rest intact.
if self.rag_manager: if self.rag_manager:
try: try:
logger.info("Rebuilding RAG index after directory removal") self.rag_manager.remove_directory(directory)
self.rag_manager.rebuild_index()
# Re-index remaining directories
for dir_path in self.indexed_directories:
if os.path.exists(dir_path):
self.rag_manager.index_personal_documents(dir_path)
except Exception as e: except Exception as e:
logger.error(f"Failed to rebuild RAG index: {e}") logger.error(f"Failed to remove directory from RAG index: {e}")
else: else:
logger.info(f"Directory not in index: {directory}") logger.info(f"Directory not in index: {directory}")

View File

@@ -380,20 +380,36 @@ class VectorRAG:
return {'success': False, 'indexed_count': indexed, 'failed_count': failed, 'message': str(e)} return {'success': False, 'indexed_count': indexed, 'failed_count': failed, 'message': str(e)}
def remove_directory(self, directory: str) -> Dict[str, Any]: def remove_directory(self, directory: str) -> Dict[str, Any]:
"""Remove all chunks from a directory. O(1) per chunk via ChromaDB.""" """Remove all chunks under ``directory`` (recursively), and nothing else.
Selection is a Python-side path-boundary match on each chunk's stored
``source`` full path, NOT a Chroma metadata ``where`` filter. No Chroma
metadata operator selects a scalar string by path prefix (``$contains``
targets document content / list membership, not a ``source`` substring),
and a plain substring would over-delete siblings — removing ``/docs``
must not touch ``/docs2`` or ``/docs_personal``. We therefore match
``source == directory`` or ``source`` startswith ``directory + os.sep``,
the same boundary rule add_directory uses for exclusions. ``directory``
is abspath-normalized so it matches the absolute ``source`` that indexing
always stores, regardless of how the caller passed it in.
"""
if not self.healthy: if not self.healthy:
return {"success": False, "message": "Collection not initialized"} return {"success": False, "message": "Collection not initialized"}
directory = os.path.abspath(directory)
try: try:
# Use ChromaDB where filter to find all docs from this directory results = self._collection.get(include=["metadatas"])
results = self._collection.get( ids = [
where={"source": {"$contains": directory}} if "/" in directory else {"directory": directory}, results["ids"][i]
include=["metadatas"], for i, m in enumerate(results["metadatas"])
) if isinstance(m, dict)
if not results['ids']: and isinstance(m.get("source"), str)
and (m["source"] == directory or m["source"].startswith(directory + os.sep))
]
if not ids:
return {"success": True, "removed_count": 0, "message": "No docs found"} return {"success": True, "removed_count": 0, "message": "No docs found"}
self._collection.delete(ids=results['ids']) self._collection.delete(ids=ids)
n = len(results['ids']) n = len(ids)
logger.info(f"Removed {n} chunks from {directory}") logger.info(f"Removed {n} chunks from {directory}")
return {"success": True, "removed_count": n, "message": f"Removed {n} chunks"} return {"success": True, "removed_count": n, "message": f"Removed {n} chunks"}
except Exception as e: except Exception as e:

View File

@@ -0,0 +1,159 @@
"""Regression guard for #1660 — removing one RAG directory must delete only that
directory's chunks, never wipe the whole shared collection.
Two compounding defects were fixed:
1. PersonalDocsManager.remove_directory called rag_manager.rebuild_index(),
which delete+recreates the entire shared "odysseus_rag" collection (all
owners + the base index), then re-indexed only the remaining tracked dirs
(ownerless, never personal_dir). Now it does a targeted per-directory delete.
2. VectorRAG.remove_directory selected via where={"source": {"$contains": dir}},
which no Chroma metadata operator supports as a path-prefix match (and a
substring would over-delete siblings). Now it filters stored absolute
`source` paths in Python with a path boundary (dir or dir + os.sep).
These tests are hermetic — no chromadb; VectorRAG is exercised against a fake
collection, PersonalDocsManager against a fake rag manager.
"""
import os
os.environ.setdefault("DATABASE_URL", "sqlite:///:memory:")
import pytest
import src.rag_vector as rag_vector
import src.personal_docs as personal_docs
import src.ai_interaction as ai
# --------------------------------------------------------------------------- #
# VectorRAG.remove_directory selection correctness (edit C)
# --------------------------------------------------------------------------- #
class _FakeCollection:
def __init__(self, rows):
self._ids = [r[0] for r in rows]
self._metas = [r[1] for r in rows]
def get(self, include=None):
return {"ids": list(self._ids), "metadatas": list(self._metas)}
def delete(self, ids=None):
drop = set(ids or [])
kept = [(i, m) for i, m in zip(self._ids, self._metas) if i not in drop]
self._ids = [i for i, _ in kept]
self._metas = [m for _, m in kept]
def _make_vectorrag(rows):
rag = rag_vector.VectorRAG.__new__(rag_vector.VectorRAG) # skip Chroma connect
rag._collection = _FakeCollection(rows)
rag._healthy = True
return rag
def test_vectorrag_remove_is_path_bounded():
rows = [
("a", {"source": "/a/docs/f1.md"}),
("b", {"source": "/a/docs/sub/f2.md"}), # nested -> must be removed
("c", {"source": "/a/docs2/f3.md"}), # sibling prefix -> must survive
("d", {"source": "/a/docs_personal/f4.md"}), # sibling prefix -> must survive
("e", {"filename": "no-source.md"}), # sourceless dict -> must not crash/survive
]
rag = _make_vectorrag(rows)
res = rag.remove_directory("/a/docs")
assert res["success"] is True
assert res["removed_count"] == 2
remaining = set(rag._collection.get()["ids"])
assert remaining == {"c", "d", "e"}, remaining
def test_vectorrag_remove_no_match_is_noop():
rag = _make_vectorrag([("a", {"source": "/a/docs/f1.md"})])
res = rag.remove_directory("/nowhere")
assert res["success"] is True
assert res["removed_count"] == 0
assert set(rag._collection.get()["ids"]) == {"a"}
# --------------------------------------------------------------------------- #
# PersonalDocsManager.remove_directory must delete-targeted, not wipe (edit A)
# --------------------------------------------------------------------------- #
class _FakeRag:
"""Records calls and simulates a chunk store keyed by id -> metadata."""
def __init__(self, store):
self.store = store
self.rebuild_called = False
def rebuild_index(self):
# The catastrophic op — mimic delete_collection wiping everything.
self.rebuild_called = True
self.store.clear()
return True
def index_personal_documents(self, directory, owner=None):
return {"indexed_count": 0} # old recovery path re-adds nothing here
def remove_directory(self, directory):
directory = os.path.abspath(directory)
doomed = [
i for i, m in self.store.items()
if isinstance(m.get("source"), str)
and (m["source"] == directory or m["source"].startswith(directory + os.sep))
]
for i in doomed:
del self.store[i]
return {"success": True, "removed_count": len(doomed)}
def test_personal_docs_remove_is_targeted(tmp_path):
personal = os.path.abspath(str(tmp_path / "personal"))
target = os.path.abspath(str(tmp_path / "target"))
other = os.path.abspath(str(tmp_path / "other"))
store = {
"p": {"source": os.path.join(personal, "note.md"), "owner": "alice"},
"t": {"source": os.path.join(target, "doc.md"), "owner": "alice"},
"o": {"source": os.path.join(other, "doc.md"), "owner": "bob"},
}
fake = _FakeRag(store)
mgr = personal_docs.PersonalDocsManager(str(tmp_path), rag_manager=fake)
mgr.indexed_directories = [target, other] # personal_dir intentionally NOT tracked
mgr.remove_directory(target)
assert fake.rebuild_called is False, "must not wipe the whole collection"
assert "t" not in store, "target directory's chunk should be removed"
assert "p" in store, "base personal index must survive"
assert "o" in store, "another owner's chunk must survive"
# --------------------------------------------------------------------------- #
# do_manage_rag remove path must not fire a whole-collection rebuild (edit B)
# --------------------------------------------------------------------------- #
async def test_do_manage_rag_remove_does_not_rebuild(monkeypatch):
calls = {"rebuild": 0}
class _Rag:
def rebuild_index(self):
calls["rebuild"] += 1
def remove_directory(self, directory):
pass
class _PDocs:
def remove_directory(self, directory):
pass
monkeypatch.setattr(ai, "_rag_manager", _Rag())
monkeypatch.setattr(ai, "_personal_docs_manager", _PDocs())
# Untracked path: the old code still fired an unconditional rebuild_index().
result = await ai.do_manage_rag("remove_directory\n/abs/untracked/dir")
assert calls["rebuild"] == 0, "remove must not rebuild (whole-collection wipe)"
assert "error" not in result, result