From 0e538ecd2939c18f9f86a9c5e8ff059d4ce17414 Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 3 Jun 2026 14:29:51 +1000 Subject: [PATCH] Fix RAG remove_directory wiping the entire shared collection (#1660) (#1734) Removing one RAG directory destroyed the whole shared ChromaDB collection (all owners + base index) instead of just that directory's chunks. Shared root cause: PersonalDocsManager.remove_directory called rebuild_index() (delete_collection + recreate) then re-indexed only the remaining tracked dirs (ownerless, never personal_dir). The targeted VectorRAG.remove_directory that should have been used was itself broken (where={"source":{"$contains":dir}} selects nothing on scalar metadata and would over-delete siblings), and the dead do_manage_rag path fired a second unconditional rebuild. - VectorRAG.remove_directory: select chunks in Python by a path-boundary match on the stored absolute `source` (dir or dir+os.sep), abspath-normalized. Keys on `source` (always written), never `owner` -- no migration. - PersonalDocsManager.remove_directory: call the targeted remove instead of rebuild_index() + partial reindex. - do_manage_rag (dead code): drop the second rebuild_index() (hygiene). - rag_server.py add path: abspath so indexed `source` matches the remove. No schema change. Prevents future wipes (does not recover already-wiped vectors). Adds hermetic regression tests at three layers. Fixes #1660 Co-authored-by: Ethan <23321960+0xLeathery@users.noreply.github.com> --- mcp_servers/rag_server.py | 4 +- src/ai_interaction.py | 6 +- src/personal_docs.py | 17 ++- src/rag_vector.py | 34 +++-- tests/test_rag_remove_directory_scope.py | 159 +++++++++++++++++++++++ 5 files changed, 199 insertions(+), 21 deletions(-) create mode 100644 tests/test_rag_remove_directory_scope.py diff --git a/mcp_servers/rag_server.py b/mcp_servers/rag_server.py index 1dfd464..71aa1b6 100644 --- a/mcp_servers/rag_server.py +++ b/mcp_servers/rag_server.py @@ -105,7 +105,9 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: directory = _dir.strip() if isinstance(_dir, str) else "" if not directory: return [TextContent(type="text", text="Error: add_directory needs a directory path")] - directory = os.path.expanduser(directory) + # Store an absolute path so indexed `source` metadata is absolute and + # remove_directory (which abspath-normalizes) can match it later (#1660). + directory = os.path.abspath(os.path.expanduser(directory)) if not os.path.isdir(directory): return [TextContent(type="text", text=f"Error: Directory not found: {directory}")] if not _rag_manager: diff --git a/src/ai_interaction.py b/src/ai_interaction.py index d9df250..5d36507 100644 --- a/src/ai_interaction.py +++ b/src/ai_interaction.py @@ -1232,9 +1232,11 @@ async def do_manage_rag(content: str, session_id: Optional[str] = None) -> Dict: try: if hasattr(_personal_docs_manager, 'remove_directory'): + # Performs a targeted per-directory delete (#1660). The previous + # unconditional _rag_manager.rebuild_index() here wiped the whole + # collection on every remove (even for untracked dirs) and has + # been removed. _personal_docs_manager.remove_directory(directory) - if _rag_manager and hasattr(_rag_manager, 'rebuild_index'): - _rag_manager.rebuild_index() return {"action": "remove_directory", "directory": directory, "results": f"Directory '{directory}' removed from RAG index"} except Exception as e: diff --git a/src/personal_docs.py b/src/personal_docs.py index b4d935a..06875d0 100644 --- a/src/personal_docs.py +++ b/src/personal_docs.py @@ -306,18 +306,17 @@ class PersonalDocsManager: # Refresh the index to exclude the removed directory self.refresh_index() - # If RAG manager is available, we should rebuild the index - # This is a simple approach - in production you might want more sophisticated removal + # Targeted delete of just this directory's chunks. This previously + # called rag_manager.rebuild_index(), which delete+recreates the + # entire shared collection (every owner + the base index) and then + # re-indexed only the remaining tracked dirs — ownerless and never + # personal_dir — a catastrophic wipe (#1660). remove_directory now + # removes exactly this directory's chunks and leaves the rest intact. if self.rag_manager: try: - logger.info("Rebuilding RAG index after directory removal") - self.rag_manager.rebuild_index() - # Re-index remaining directories - for dir_path in self.indexed_directories: - if os.path.exists(dir_path): - self.rag_manager.index_personal_documents(dir_path) + self.rag_manager.remove_directory(directory) except Exception as e: - logger.error(f"Failed to rebuild RAG index: {e}") + logger.error(f"Failed to remove directory from RAG index: {e}") else: logger.info(f"Directory not in index: {directory}") diff --git a/src/rag_vector.py b/src/rag_vector.py index 7faad3d..1f414e5 100644 --- a/src/rag_vector.py +++ b/src/rag_vector.py @@ -380,20 +380,36 @@ class VectorRAG: return {'success': False, 'indexed_count': indexed, 'failed_count': failed, 'message': str(e)} def remove_directory(self, directory: str) -> Dict[str, Any]: - """Remove all chunks from a directory. O(1) per chunk via ChromaDB.""" + """Remove all chunks under ``directory`` (recursively), and nothing else. + + Selection is a Python-side path-boundary match on each chunk's stored + ``source`` full path, NOT a Chroma metadata ``where`` filter. No Chroma + metadata operator selects a scalar string by path prefix (``$contains`` + targets document content / list membership, not a ``source`` substring), + and a plain substring would over-delete siblings — removing ``/docs`` + must not touch ``/docs2`` or ``/docs_personal``. We therefore match + ``source == directory`` or ``source`` startswith ``directory + os.sep``, + the same boundary rule add_directory uses for exclusions. ``directory`` + is abspath-normalized so it matches the absolute ``source`` that indexing + always stores, regardless of how the caller passed it in. + """ if not self.healthy: return {"success": False, "message": "Collection not initialized"} + directory = os.path.abspath(directory) try: - # Use ChromaDB where filter to find all docs from this directory - results = self._collection.get( - where={"source": {"$contains": directory}} if "/" in directory else {"directory": directory}, - include=["metadatas"], - ) - if not results['ids']: + results = self._collection.get(include=["metadatas"]) + ids = [ + results["ids"][i] + for i, m in enumerate(results["metadatas"]) + if isinstance(m, dict) + and isinstance(m.get("source"), str) + and (m["source"] == directory or m["source"].startswith(directory + os.sep)) + ] + if not ids: return {"success": True, "removed_count": 0, "message": "No docs found"} - self._collection.delete(ids=results['ids']) - n = len(results['ids']) + self._collection.delete(ids=ids) + n = len(ids) logger.info(f"Removed {n} chunks from {directory}") return {"success": True, "removed_count": n, "message": f"Removed {n} chunks"} except Exception as e: diff --git a/tests/test_rag_remove_directory_scope.py b/tests/test_rag_remove_directory_scope.py new file mode 100644 index 0000000..c2e5b4e --- /dev/null +++ b/tests/test_rag_remove_directory_scope.py @@ -0,0 +1,159 @@ +"""Regression guard for #1660 — removing one RAG directory must delete only that +directory's chunks, never wipe the whole shared collection. + +Two compounding defects were fixed: + 1. PersonalDocsManager.remove_directory called rag_manager.rebuild_index(), + which delete+recreates the entire shared "odysseus_rag" collection (all + owners + the base index), then re-indexed only the remaining tracked dirs + (ownerless, never personal_dir). Now it does a targeted per-directory delete. + 2. VectorRAG.remove_directory selected via where={"source": {"$contains": dir}}, + which no Chroma metadata operator supports as a path-prefix match (and a + substring would over-delete siblings). Now it filters stored absolute + `source` paths in Python with a path boundary (dir or dir + os.sep). + +These tests are hermetic — no chromadb; VectorRAG is exercised against a fake +collection, PersonalDocsManager against a fake rag manager. +""" +import os + +os.environ.setdefault("DATABASE_URL", "sqlite:///:memory:") + +import pytest + +import src.rag_vector as rag_vector +import src.personal_docs as personal_docs +import src.ai_interaction as ai + + +# --------------------------------------------------------------------------- # +# VectorRAG.remove_directory selection correctness (edit C) +# --------------------------------------------------------------------------- # + + +class _FakeCollection: + def __init__(self, rows): + self._ids = [r[0] for r in rows] + self._metas = [r[1] for r in rows] + + def get(self, include=None): + return {"ids": list(self._ids), "metadatas": list(self._metas)} + + def delete(self, ids=None): + drop = set(ids or []) + kept = [(i, m) for i, m in zip(self._ids, self._metas) if i not in drop] + self._ids = [i for i, _ in kept] + self._metas = [m for _, m in kept] + + +def _make_vectorrag(rows): + rag = rag_vector.VectorRAG.__new__(rag_vector.VectorRAG) # skip Chroma connect + rag._collection = _FakeCollection(rows) + rag._healthy = True + return rag + + +def test_vectorrag_remove_is_path_bounded(): + rows = [ + ("a", {"source": "/a/docs/f1.md"}), + ("b", {"source": "/a/docs/sub/f2.md"}), # nested -> must be removed + ("c", {"source": "/a/docs2/f3.md"}), # sibling prefix -> must survive + ("d", {"source": "/a/docs_personal/f4.md"}), # sibling prefix -> must survive + ("e", {"filename": "no-source.md"}), # sourceless dict -> must not crash/survive + ] + rag = _make_vectorrag(rows) + res = rag.remove_directory("/a/docs") + assert res["success"] is True + assert res["removed_count"] == 2 + remaining = set(rag._collection.get()["ids"]) + assert remaining == {"c", "d", "e"}, remaining + + +def test_vectorrag_remove_no_match_is_noop(): + rag = _make_vectorrag([("a", {"source": "/a/docs/f1.md"})]) + res = rag.remove_directory("/nowhere") + assert res["success"] is True + assert res["removed_count"] == 0 + assert set(rag._collection.get()["ids"]) == {"a"} + + +# --------------------------------------------------------------------------- # +# PersonalDocsManager.remove_directory must delete-targeted, not wipe (edit A) +# --------------------------------------------------------------------------- # + + +class _FakeRag: + """Records calls and simulates a chunk store keyed by id -> metadata.""" + + def __init__(self, store): + self.store = store + self.rebuild_called = False + + def rebuild_index(self): + # The catastrophic op — mimic delete_collection wiping everything. + self.rebuild_called = True + self.store.clear() + return True + + def index_personal_documents(self, directory, owner=None): + return {"indexed_count": 0} # old recovery path re-adds nothing here + + def remove_directory(self, directory): + directory = os.path.abspath(directory) + doomed = [ + i for i, m in self.store.items() + if isinstance(m.get("source"), str) + and (m["source"] == directory or m["source"].startswith(directory + os.sep)) + ] + for i in doomed: + del self.store[i] + return {"success": True, "removed_count": len(doomed)} + + +def test_personal_docs_remove_is_targeted(tmp_path): + personal = os.path.abspath(str(tmp_path / "personal")) + target = os.path.abspath(str(tmp_path / "target")) + other = os.path.abspath(str(tmp_path / "other")) + store = { + "p": {"source": os.path.join(personal, "note.md"), "owner": "alice"}, + "t": {"source": os.path.join(target, "doc.md"), "owner": "alice"}, + "o": {"source": os.path.join(other, "doc.md"), "owner": "bob"}, + } + fake = _FakeRag(store) + mgr = personal_docs.PersonalDocsManager(str(tmp_path), rag_manager=fake) + mgr.indexed_directories = [target, other] # personal_dir intentionally NOT tracked + + mgr.remove_directory(target) + + assert fake.rebuild_called is False, "must not wipe the whole collection" + assert "t" not in store, "target directory's chunk should be removed" + assert "p" in store, "base personal index must survive" + assert "o" in store, "another owner's chunk must survive" + + +# --------------------------------------------------------------------------- # +# do_manage_rag remove path must not fire a whole-collection rebuild (edit B) +# --------------------------------------------------------------------------- # + + +async def test_do_manage_rag_remove_does_not_rebuild(monkeypatch): + calls = {"rebuild": 0} + + class _Rag: + def rebuild_index(self): + calls["rebuild"] += 1 + + def remove_directory(self, directory): + pass + + class _PDocs: + def remove_directory(self, directory): + pass + + monkeypatch.setattr(ai, "_rag_manager", _Rag()) + monkeypatch.setattr(ai, "_personal_docs_manager", _PDocs()) + + # Untracked path: the old code still fired an unconditional rebuild_index(). + result = await ai.do_manage_rag("remove_directory\n/abs/untracked/dir") + + assert calls["rebuild"] == 0, "remove must not rebuild (whole-collection wipe)" + assert "error" not in result, result