Removing one RAG directory destroyed the whole shared ChromaDB collection
(all owners + base index) instead of just that directory's chunks. Shared
root cause: PersonalDocsManager.remove_directory called rebuild_index()
(delete_collection + recreate) then re-indexed only the remaining tracked
dirs (ownerless, never personal_dir). The targeted VectorRAG.remove_directory
that should have been used was itself broken (where={"source":{"$contains":dir}}
selects nothing on scalar metadata and would over-delete siblings), and the
dead do_manage_rag path fired a second unconditional rebuild.
- VectorRAG.remove_directory: select chunks in Python by a path-boundary match
on the stored absolute `source` (dir or dir+os.sep), abspath-normalized.
Keys on `source` (always written), never `owner` -- no migration.
- PersonalDocsManager.remove_directory: call the targeted remove instead of
rebuild_index() + partial reindex.
- do_manage_rag (dead code): drop the second rebuild_index() (hygiene).
- rag_server.py add path: abspath so indexed `source` matches the remove.
No schema change. Prevents future wipes (does not recover already-wiped
vectors). Adds hermetic regression tests at three layers.
Fixes #1660
Co-authored-by: Ethan <23321960+0xLeathery@users.noreply.github.com>
This commit is contained in:
@@ -105,7 +105,9 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
|
||||
directory = _dir.strip() if isinstance(_dir, str) else ""
|
||||
if not directory:
|
||||
return [TextContent(type="text", text="Error: add_directory needs a directory path")]
|
||||
directory = os.path.expanduser(directory)
|
||||
# Store an absolute path so indexed `source` metadata is absolute and
|
||||
# remove_directory (which abspath-normalizes) can match it later (#1660).
|
||||
directory = os.path.abspath(os.path.expanduser(directory))
|
||||
if not os.path.isdir(directory):
|
||||
return [TextContent(type="text", text=f"Error: Directory not found: {directory}")]
|
||||
if not _rag_manager:
|
||||
|
||||
@@ -1232,9 +1232,11 @@ async def do_manage_rag(content: str, session_id: Optional[str] = None) -> Dict:
|
||||
|
||||
try:
|
||||
if hasattr(_personal_docs_manager, 'remove_directory'):
|
||||
# Performs a targeted per-directory delete (#1660). The previous
|
||||
# unconditional _rag_manager.rebuild_index() here wiped the whole
|
||||
# collection on every remove (even for untracked dirs) and has
|
||||
# been removed.
|
||||
_personal_docs_manager.remove_directory(directory)
|
||||
if _rag_manager and hasattr(_rag_manager, 'rebuild_index'):
|
||||
_rag_manager.rebuild_index()
|
||||
return {"action": "remove_directory", "directory": directory,
|
||||
"results": f"Directory '{directory}' removed from RAG index"}
|
||||
except Exception as e:
|
||||
|
||||
@@ -306,18 +306,17 @@ class PersonalDocsManager:
|
||||
# Refresh the index to exclude the removed directory
|
||||
self.refresh_index()
|
||||
|
||||
# If RAG manager is available, we should rebuild the index
|
||||
# This is a simple approach - in production you might want more sophisticated removal
|
||||
# Targeted delete of just this directory's chunks. This previously
|
||||
# called rag_manager.rebuild_index(), which delete+recreates the
|
||||
# entire shared collection (every owner + the base index) and then
|
||||
# re-indexed only the remaining tracked dirs — ownerless and never
|
||||
# personal_dir — a catastrophic wipe (#1660). remove_directory now
|
||||
# removes exactly this directory's chunks and leaves the rest intact.
|
||||
if self.rag_manager:
|
||||
try:
|
||||
logger.info("Rebuilding RAG index after directory removal")
|
||||
self.rag_manager.rebuild_index()
|
||||
# Re-index remaining directories
|
||||
for dir_path in self.indexed_directories:
|
||||
if os.path.exists(dir_path):
|
||||
self.rag_manager.index_personal_documents(dir_path)
|
||||
self.rag_manager.remove_directory(directory)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to rebuild RAG index: {e}")
|
||||
logger.error(f"Failed to remove directory from RAG index: {e}")
|
||||
else:
|
||||
logger.info(f"Directory not in index: {directory}")
|
||||
|
||||
|
||||
@@ -380,20 +380,36 @@ class VectorRAG:
|
||||
return {'success': False, 'indexed_count': indexed, 'failed_count': failed, 'message': str(e)}
|
||||
|
||||
def remove_directory(self, directory: str) -> Dict[str, Any]:
|
||||
"""Remove all chunks from a directory. O(1) per chunk via ChromaDB."""
|
||||
"""Remove all chunks under ``directory`` (recursively), and nothing else.
|
||||
|
||||
Selection is a Python-side path-boundary match on each chunk's stored
|
||||
``source`` full path, NOT a Chroma metadata ``where`` filter. No Chroma
|
||||
metadata operator selects a scalar string by path prefix (``$contains``
|
||||
targets document content / list membership, not a ``source`` substring),
|
||||
and a plain substring would over-delete siblings — removing ``/docs``
|
||||
must not touch ``/docs2`` or ``/docs_personal``. We therefore match
|
||||
``source == directory`` or ``source`` startswith ``directory + os.sep``,
|
||||
the same boundary rule add_directory uses for exclusions. ``directory``
|
||||
is abspath-normalized so it matches the absolute ``source`` that indexing
|
||||
always stores, regardless of how the caller passed it in.
|
||||
"""
|
||||
if not self.healthy:
|
||||
return {"success": False, "message": "Collection not initialized"}
|
||||
directory = os.path.abspath(directory)
|
||||
try:
|
||||
# Use ChromaDB where filter to find all docs from this directory
|
||||
results = self._collection.get(
|
||||
where={"source": {"$contains": directory}} if "/" in directory else {"directory": directory},
|
||||
include=["metadatas"],
|
||||
)
|
||||
if not results['ids']:
|
||||
results = self._collection.get(include=["metadatas"])
|
||||
ids = [
|
||||
results["ids"][i]
|
||||
for i, m in enumerate(results["metadatas"])
|
||||
if isinstance(m, dict)
|
||||
and isinstance(m.get("source"), str)
|
||||
and (m["source"] == directory or m["source"].startswith(directory + os.sep))
|
||||
]
|
||||
if not ids:
|
||||
return {"success": True, "removed_count": 0, "message": "No docs found"}
|
||||
|
||||
self._collection.delete(ids=results['ids'])
|
||||
n = len(results['ids'])
|
||||
self._collection.delete(ids=ids)
|
||||
n = len(ids)
|
||||
logger.info(f"Removed {n} chunks from {directory}")
|
||||
return {"success": True, "removed_count": n, "message": f"Removed {n} chunks"}
|
||||
except Exception as e:
|
||||
|
||||
159
tests/test_rag_remove_directory_scope.py
Normal file
159
tests/test_rag_remove_directory_scope.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""Regression guard for #1660 — removing one RAG directory must delete only that
|
||||
directory's chunks, never wipe the whole shared collection.
|
||||
|
||||
Two compounding defects were fixed:
|
||||
1. PersonalDocsManager.remove_directory called rag_manager.rebuild_index(),
|
||||
which delete+recreates the entire shared "odysseus_rag" collection (all
|
||||
owners + the base index), then re-indexed only the remaining tracked dirs
|
||||
(ownerless, never personal_dir). Now it does a targeted per-directory delete.
|
||||
2. VectorRAG.remove_directory selected via where={"source": {"$contains": dir}},
|
||||
which no Chroma metadata operator supports as a path-prefix match (and a
|
||||
substring would over-delete siblings). Now it filters stored absolute
|
||||
`source` paths in Python with a path boundary (dir or dir + os.sep).
|
||||
|
||||
These tests are hermetic — no chromadb; VectorRAG is exercised against a fake
|
||||
collection, PersonalDocsManager against a fake rag manager.
|
||||
"""
|
||||
import os
|
||||
|
||||
os.environ.setdefault("DATABASE_URL", "sqlite:///:memory:")
|
||||
|
||||
import pytest
|
||||
|
||||
import src.rag_vector as rag_vector
|
||||
import src.personal_docs as personal_docs
|
||||
import src.ai_interaction as ai
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# VectorRAG.remove_directory selection correctness (edit C)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class _FakeCollection:
|
||||
def __init__(self, rows):
|
||||
self._ids = [r[0] for r in rows]
|
||||
self._metas = [r[1] for r in rows]
|
||||
|
||||
def get(self, include=None):
|
||||
return {"ids": list(self._ids), "metadatas": list(self._metas)}
|
||||
|
||||
def delete(self, ids=None):
|
||||
drop = set(ids or [])
|
||||
kept = [(i, m) for i, m in zip(self._ids, self._metas) if i not in drop]
|
||||
self._ids = [i for i, _ in kept]
|
||||
self._metas = [m for _, m in kept]
|
||||
|
||||
|
||||
def _make_vectorrag(rows):
|
||||
rag = rag_vector.VectorRAG.__new__(rag_vector.VectorRAG) # skip Chroma connect
|
||||
rag._collection = _FakeCollection(rows)
|
||||
rag._healthy = True
|
||||
return rag
|
||||
|
||||
|
||||
def test_vectorrag_remove_is_path_bounded():
|
||||
rows = [
|
||||
("a", {"source": "/a/docs/f1.md"}),
|
||||
("b", {"source": "/a/docs/sub/f2.md"}), # nested -> must be removed
|
||||
("c", {"source": "/a/docs2/f3.md"}), # sibling prefix -> must survive
|
||||
("d", {"source": "/a/docs_personal/f4.md"}), # sibling prefix -> must survive
|
||||
("e", {"filename": "no-source.md"}), # sourceless dict -> must not crash/survive
|
||||
]
|
||||
rag = _make_vectorrag(rows)
|
||||
res = rag.remove_directory("/a/docs")
|
||||
assert res["success"] is True
|
||||
assert res["removed_count"] == 2
|
||||
remaining = set(rag._collection.get()["ids"])
|
||||
assert remaining == {"c", "d", "e"}, remaining
|
||||
|
||||
|
||||
def test_vectorrag_remove_no_match_is_noop():
|
||||
rag = _make_vectorrag([("a", {"source": "/a/docs/f1.md"})])
|
||||
res = rag.remove_directory("/nowhere")
|
||||
assert res["success"] is True
|
||||
assert res["removed_count"] == 0
|
||||
assert set(rag._collection.get()["ids"]) == {"a"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# PersonalDocsManager.remove_directory must delete-targeted, not wipe (edit A)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class _FakeRag:
|
||||
"""Records calls and simulates a chunk store keyed by id -> metadata."""
|
||||
|
||||
def __init__(self, store):
|
||||
self.store = store
|
||||
self.rebuild_called = False
|
||||
|
||||
def rebuild_index(self):
|
||||
# The catastrophic op — mimic delete_collection wiping everything.
|
||||
self.rebuild_called = True
|
||||
self.store.clear()
|
||||
return True
|
||||
|
||||
def index_personal_documents(self, directory, owner=None):
|
||||
return {"indexed_count": 0} # old recovery path re-adds nothing here
|
||||
|
||||
def remove_directory(self, directory):
|
||||
directory = os.path.abspath(directory)
|
||||
doomed = [
|
||||
i for i, m in self.store.items()
|
||||
if isinstance(m.get("source"), str)
|
||||
and (m["source"] == directory or m["source"].startswith(directory + os.sep))
|
||||
]
|
||||
for i in doomed:
|
||||
del self.store[i]
|
||||
return {"success": True, "removed_count": len(doomed)}
|
||||
|
||||
|
||||
def test_personal_docs_remove_is_targeted(tmp_path):
|
||||
personal = os.path.abspath(str(tmp_path / "personal"))
|
||||
target = os.path.abspath(str(tmp_path / "target"))
|
||||
other = os.path.abspath(str(tmp_path / "other"))
|
||||
store = {
|
||||
"p": {"source": os.path.join(personal, "note.md"), "owner": "alice"},
|
||||
"t": {"source": os.path.join(target, "doc.md"), "owner": "alice"},
|
||||
"o": {"source": os.path.join(other, "doc.md"), "owner": "bob"},
|
||||
}
|
||||
fake = _FakeRag(store)
|
||||
mgr = personal_docs.PersonalDocsManager(str(tmp_path), rag_manager=fake)
|
||||
mgr.indexed_directories = [target, other] # personal_dir intentionally NOT tracked
|
||||
|
||||
mgr.remove_directory(target)
|
||||
|
||||
assert fake.rebuild_called is False, "must not wipe the whole collection"
|
||||
assert "t" not in store, "target directory's chunk should be removed"
|
||||
assert "p" in store, "base personal index must survive"
|
||||
assert "o" in store, "another owner's chunk must survive"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# do_manage_rag remove path must not fire a whole-collection rebuild (edit B)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
async def test_do_manage_rag_remove_does_not_rebuild(monkeypatch):
|
||||
calls = {"rebuild": 0}
|
||||
|
||||
class _Rag:
|
||||
def rebuild_index(self):
|
||||
calls["rebuild"] += 1
|
||||
|
||||
def remove_directory(self, directory):
|
||||
pass
|
||||
|
||||
class _PDocs:
|
||||
def remove_directory(self, directory):
|
||||
pass
|
||||
|
||||
monkeypatch.setattr(ai, "_rag_manager", _Rag())
|
||||
monkeypatch.setattr(ai, "_personal_docs_manager", _PDocs())
|
||||
|
||||
# Untracked path: the old code still fired an unconditional rebuild_index().
|
||||
result = await ai.do_manage_rag("remove_directory\n/abs/untracked/dir")
|
||||
|
||||
assert calls["rebuild"] == 0, "remove must not rebuild (whole-collection wipe)"
|
||||
assert "error" not in result, result
|
||||
Reference in New Issue
Block a user