Fix RAG remove_directory wiping the entire shared collection (#1660) (#1734)

Removing one RAG directory destroyed the whole shared ChromaDB collection (all owners + base index) instead of just that directory's chunks. Shared root cause: PersonalDocsManager.remove_directory called rebuild_index() (delete_collection + recreate) then re-indexed only the remaining tracked dirs (ownerless, never personal_dir). The targeted VectorRAG.remove_directory that should have been used was itself broken (where={"source":{"$contains":dir}} selects nothing on scalar metadata and would over-delete siblings), and the dead do_manage_rag path fired a second unconditional rebuild. - VectorRAG.remove_directory: select chunks in Python by a path-boundary match on the stored absolute `source` (dir or dir+os.sep), abspath-normalized. Keys on `source` (always written), never `owner` -- no migration. - PersonalDocsManager.remove_directory: call the targeted remove instead of rebuild_index() + partial reindex. - do_manage_rag (dead code): drop the second rebuild_index() (hygiene). - rag_server.py add path: abspath so indexed `source` matches the remove. No schema change. Prevents future wipes (does not recover already-wiped vectors). Adds hermetic regression tests at three layers. Fixes #1660 Co-authored-by: Ethan <23321960+0xLeathery@users.noreply.github.com>
2026-06-03 14:29:51 +10:00
parent 9964e9f3fb
commit 0e538ecd29
5 changed files with 199 additions and 21 deletions
--- a/mcp_servers/rag_server.py
+++ b/mcp_servers/rag_server.py
@@ -105,7 +105,9 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
        directory = _dir.strip() if isinstance(_dir, str) else ""
        if not directory:
            return [TextContent(type="text", text="Error: add_directory needs a directory path")]
-        directory = os.path.expanduser(directory)
+        # Store an absolute path so indexed `source` metadata is absolute and
+        # remove_directory (which abspath-normalizes) can match it later (#1660).
+        directory = os.path.abspath(os.path.expanduser(directory))
        if not os.path.isdir(directory):
            return [TextContent(type="text", text=f"Error: Directory not found: {directory}")]
        if not _rag_manager:
--- a/src/ai_interaction.py
+++ b/src/ai_interaction.py
@@ -1232,9 +1232,11 @@ async def do_manage_rag(content: str, session_id: Optional[str] = None) -> Dict:

        try:
            if hasattr(_personal_docs_manager, 'remove_directory'):
+                # Performs a targeted per-directory delete (#1660). The previous
+                # unconditional _rag_manager.rebuild_index() here wiped the whole
+                # collection on every remove (even for untracked dirs) and has
+                # been removed.
                _personal_docs_manager.remove_directory(directory)
-            if _rag_manager and hasattr(_rag_manager, 'rebuild_index'):
-                _rag_manager.rebuild_index()
            return {"action": "remove_directory", "directory": directory,
                    "results": f"Directory '{directory}' removed from RAG index"}
        except Exception as e:
--- a/src/personal_docs.py
+++ b/src/personal_docs.py
@@ -306,18 +306,17 @@ class PersonalDocsManager:
            # Refresh the index to exclude the removed directory
            self.refresh_index()
            
-            # If RAG manager is available, we should rebuild the index
-            # This is a simple approach - in production you might want more sophisticated removal
+            # Targeted delete of just this directory's chunks. This previously
+            # called rag_manager.rebuild_index(), which delete+recreates the
+            # entire shared collection (every owner + the base index) and then
+            # re-indexed only the remaining tracked dirs — ownerless and never
+            # personal_dir — a catastrophic wipe (#1660). remove_directory now
+            # removes exactly this directory's chunks and leaves the rest intact.
            if self.rag_manager:
                try:
-                    logger.info("Rebuilding RAG index after directory removal")
-                    self.rag_manager.rebuild_index()
-                    # Re-index remaining directories
-                    for dir_path in self.indexed_directories:
-                        if os.path.exists(dir_path):
-                            self.rag_manager.index_personal_documents(dir_path)
+                    self.rag_manager.remove_directory(directory)
                except Exception as e:
-                    logger.error(f"Failed to rebuild RAG index: {e}")
+                    logger.error(f"Failed to remove directory from RAG index: {e}")
        else:
            logger.info(f"Directory not in index: {directory}")

--- a/src/rag_vector.py
+++ b/src/rag_vector.py
@@ -380,20 +380,36 @@ class VectorRAG:
            return {'success': False, 'indexed_count': indexed, 'failed_count': failed, 'message': str(e)}

    def remove_directory(self, directory: str) -> Dict[str, Any]:
-        """Remove all chunks from a directory. O(1) per chunk via ChromaDB."""
+        """Remove all chunks under ``directory`` (recursively), and nothing else.
+
+        Selection is a Python-side path-boundary match on each chunk's stored
+        ``source`` full path, NOT a Chroma metadata ``where`` filter. No Chroma
+        metadata operator selects a scalar string by path prefix (``$contains``
+        targets document content / list membership, not a ``source`` substring),
+        and a plain substring would over-delete siblings — removing ``/docs``
+        must not touch ``/docs2`` or ``/docs_personal``. We therefore match
+        ``source == directory`` or ``source`` startswith ``directory + os.sep``,
+        the same boundary rule add_directory uses for exclusions. ``directory``
+        is abspath-normalized so it matches the absolute ``source`` that indexing
+        always stores, regardless of how the caller passed it in.
+        """
        if not self.healthy:
            return {"success": False, "message": "Collection not initialized"}
+        directory = os.path.abspath(directory)
        try:
-            # Use ChromaDB where filter to find all docs from this directory
-            results = self._collection.get(
-                where={"source": {"$contains": directory}} if "/" in directory else {"directory": directory},
-                include=["metadatas"],
-            )
-            if not results['ids']:
+            results = self._collection.get(include=["metadatas"])
+            ids = [
+                results["ids"][i]
+                for i, m in enumerate(results["metadatas"])
+                if isinstance(m, dict)
+                and isinstance(m.get("source"), str)
+                and (m["source"] == directory or m["source"].startswith(directory + os.sep))
+            ]
+            if not ids:
                return {"success": True, "removed_count": 0, "message": "No docs found"}

-            self._collection.delete(ids=results['ids'])
-            n = len(results['ids'])
+            self._collection.delete(ids=ids)
+            n = len(ids)
            logger.info(f"Removed {n} chunks from {directory}")
            return {"success": True, "removed_count": n, "message": f"Removed {n} chunks"}
        except Exception as e:
--- a/tests/test_rag_remove_directory_scope.py
+++ b/tests/test_rag_remove_directory_scope.py
@@ -0,0 +1,159 @@
+"""Regression guard for #1660 — removing one RAG directory must delete only that
+directory's chunks, never wipe the whole shared collection.
+
+Two compounding defects were fixed:
+  1. PersonalDocsManager.remove_directory called rag_manager.rebuild_index(),
+     which delete+recreates the entire shared "odysseus_rag" collection (all
+     owners + the base index), then re-indexed only the remaining tracked dirs
+     (ownerless, never personal_dir). Now it does a targeted per-directory delete.
+  2. VectorRAG.remove_directory selected via where={"source": {"$contains": dir}},
+     which no Chroma metadata operator supports as a path-prefix match (and a
+     substring would over-delete siblings). Now it filters stored absolute
+     `source` paths in Python with a path boundary (dir or dir + os.sep).
+
+These tests are hermetic — no chromadb; VectorRAG is exercised against a fake
+collection, PersonalDocsManager against a fake rag manager.
+"""
+import os
+
+os.environ.setdefault("DATABASE_URL", "sqlite:///:memory:")
+
+import pytest
+
+import src.rag_vector as rag_vector
+import src.personal_docs as personal_docs
+import src.ai_interaction as ai
+
+
+# --------------------------------------------------------------------------- #
+# VectorRAG.remove_directory selection correctness (edit C)
+# --------------------------------------------------------------------------- #
+
+
+class _FakeCollection:
+    def __init__(self, rows):
+        self._ids = [r[0] for r in rows]
+        self._metas = [r[1] for r in rows]
+
+    def get(self, include=None):
+        return {"ids": list(self._ids), "metadatas": list(self._metas)}
+
+    def delete(self, ids=None):
+        drop = set(ids or [])
+        kept = [(i, m) for i, m in zip(self._ids, self._metas) if i not in drop]
+        self._ids = [i for i, _ in kept]
+        self._metas = [m for _, m in kept]
+
+
+def _make_vectorrag(rows):
+    rag = rag_vector.VectorRAG.__new__(rag_vector.VectorRAG)  # skip Chroma connect
+    rag._collection = _FakeCollection(rows)
+    rag._healthy = True
+    return rag
+
+
+def test_vectorrag_remove_is_path_bounded():
+    rows = [
+        ("a", {"source": "/a/docs/f1.md"}),
+        ("b", {"source": "/a/docs/sub/f2.md"}),   # nested -> must be removed
+        ("c", {"source": "/a/docs2/f3.md"}),       # sibling prefix -> must survive
+        ("d", {"source": "/a/docs_personal/f4.md"}),  # sibling prefix -> must survive
+        ("e", {"filename": "no-source.md"}),       # sourceless dict -> must not crash/survive
+    ]
+    rag = _make_vectorrag(rows)
+    res = rag.remove_directory("/a/docs")
+    assert res["success"] is True
+    assert res["removed_count"] == 2
+    remaining = set(rag._collection.get()["ids"])
+    assert remaining == {"c", "d", "e"}, remaining
+
+
+def test_vectorrag_remove_no_match_is_noop():
+    rag = _make_vectorrag([("a", {"source": "/a/docs/f1.md"})])
+    res = rag.remove_directory("/nowhere")
+    assert res["success"] is True
+    assert res["removed_count"] == 0
+    assert set(rag._collection.get()["ids"]) == {"a"}
+
+
+# --------------------------------------------------------------------------- #
+# PersonalDocsManager.remove_directory must delete-targeted, not wipe (edit A)
+# --------------------------------------------------------------------------- #
+
+
+class _FakeRag:
+    """Records calls and simulates a chunk store keyed by id -> metadata."""
+
+    def __init__(self, store):
+        self.store = store
+        self.rebuild_called = False
+
+    def rebuild_index(self):
+        # The catastrophic op — mimic delete_collection wiping everything.
+        self.rebuild_called = True
+        self.store.clear()
+        return True
+
+    def index_personal_documents(self, directory, owner=None):
+        return {"indexed_count": 0}  # old recovery path re-adds nothing here
+
+    def remove_directory(self, directory):
+        directory = os.path.abspath(directory)
+        doomed = [
+            i for i, m in self.store.items()
+            if isinstance(m.get("source"), str)
+            and (m["source"] == directory or m["source"].startswith(directory + os.sep))
+        ]
+        for i in doomed:
+            del self.store[i]
+        return {"success": True, "removed_count": len(doomed)}
+
+
+def test_personal_docs_remove_is_targeted(tmp_path):
+    personal = os.path.abspath(str(tmp_path / "personal"))
+    target = os.path.abspath(str(tmp_path / "target"))
+    other = os.path.abspath(str(tmp_path / "other"))
+    store = {
+        "p": {"source": os.path.join(personal, "note.md"), "owner": "alice"},
+        "t": {"source": os.path.join(target, "doc.md"), "owner": "alice"},
+        "o": {"source": os.path.join(other, "doc.md"), "owner": "bob"},
+    }
+    fake = _FakeRag(store)
+    mgr = personal_docs.PersonalDocsManager(str(tmp_path), rag_manager=fake)
+    mgr.indexed_directories = [target, other]  # personal_dir intentionally NOT tracked
+
+    mgr.remove_directory(target)
+
+    assert fake.rebuild_called is False, "must not wipe the whole collection"
+    assert "t" not in store, "target directory's chunk should be removed"
+    assert "p" in store, "base personal index must survive"
+    assert "o" in store, "another owner's chunk must survive"
+
+
+# --------------------------------------------------------------------------- #
+# do_manage_rag remove path must not fire a whole-collection rebuild (edit B)
+# --------------------------------------------------------------------------- #
+
+
+async def test_do_manage_rag_remove_does_not_rebuild(monkeypatch):
+    calls = {"rebuild": 0}
+
+    class _Rag:
+        def rebuild_index(self):
+            calls["rebuild"] += 1
+
+        def remove_directory(self, directory):
+            pass
+
+    class _PDocs:
+        def remove_directory(self, directory):
+            pass
+
+    monkeypatch.setattr(ai, "_rag_manager", _Rag())
+    monkeypatch.setattr(ai, "_personal_docs_manager", _PDocs())
+
+    # Untracked path: the old code still fired an unconditional rebuild_index().
+    result = await ai.do_manage_rag("remove_directory\n/abs/untracked/dir")
+
+    assert calls["rebuild"] == 0, "remove must not rebuild (whole-collection wipe)"
+    assert "error" not in result, result