diff --git a/services/memory/memory_extractor.py b/services/memory/memory_extractor.py index 32412e6..44a9f1f 100644 --- a/services/memory/memory_extractor.py +++ b/services/memory/memory_extractor.py @@ -345,8 +345,17 @@ async def extract_and_store( logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}") existing_id = None if existing_id: - logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}") - continue + # The vector store is a single shared collection with no + # owner metadata, so find_similar can return ANOTHER + # tenant's memory. Only treat it as a duplicate when the + # match is this user's own (or a legacy unowned) memory — + # otherwise the user's freshly-extracted fact would be + # silently dropped. Mirror the owner predicate used by the + # text dedup below; cross-tenant/stale matches fall through. + _match = next((e for e in existing if e.get("id") == existing_id), None) + if _match is not None and (_match.get("owner") == _owner or _match.get("owner") is None): + logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}") + continue # Text dedup fallback: exact match + fuzzy similarity user_existing = [e for e in existing if e.get("owner") == _owner or e.get("owner") is None] if _owner else existing diff --git a/tests/test_memory_extractor_vector_cross_tenant.py b/tests/test_memory_extractor_vector_cross_tenant.py new file mode 100644 index 0000000..6b1d243 --- /dev/null +++ b/tests/test_memory_extractor_vector_cross_tenant.py @@ -0,0 +1,111 @@ +"""Regression: auto-memory vector dedup must not drop a user's fact because it +matches ANOTHER tenant's memory. + +`extract_and_store` dedups each extracted fact against the vector store first. +The vector store (`memory_vector`) is a single shared ChromaDB collection with +no owner in its metadata, so `find_similar` can return a memory_id belonging to +a different user. The old code `continue`d (skipped storing) on any vector hit +without checking ownership, so user B's freshly-extracted fact was silently +dropped when it was merely semantically similar to user A's memory. The text +dedup fallback right below is already owner-scoped; the vector path must be too. +""" +import asyncio +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parents[1] + + +def _load_extractor(): + # Load services/memory/memory_extractor.py directly by path so we don't + # trigger services/__init__ (which imports the search stack and its heavy + # optional deps). The module's only module-level imports are stdlib; its + # src.llm_core / src.event_bus imports are lazy and stubbed/guarded. + path = ROOT / "services" / "memory" / "memory_extractor.py" + spec = importlib.util.spec_from_file_location("memory_extractor_under_test", path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def _install_llm_stub(facts_json): + mod = types.ModuleType("src.llm_core") + + async def llm_call_async(*a, **k): + return facts_json + + mod.llm_call_async = llm_call_async + src_pkg = sys.modules.get("src") or types.ModuleType("src") + sys.modules["src"] = src_pkg + sys.modules["src.llm_core"] = mod + + +class FakeSession: + def __init__(self, owner): + self.owner = owner + + def get_context_messages(self): + return [ + {"role": "user", "content": "Tell me where I live."}, + {"role": "assistant", "content": "Noted."}, + ] + + +class FakeMemoryManager: + def __init__(self, rows): + self.rows = list(rows) + self._n = 0 + + def load_all(self): + return list(self.rows) + + def load(self, owner=None): + return [r for r in self.rows if r.get("owner") == owner] + + def find_duplicates(self, text, subset): + t = text.strip().lower() + return [r for r in subset if r.get("text", "").strip().lower() == t] + + def add_entry(self, text, source="auto", category="fact", owner=None): + self._n += 1 + entry = {"id": f"new-{self._n}", "text": text, "owner": owner, + "source": source, "category": category} + self.rows.append(entry) + return entry + + +class FakeVector: + """Healthy vector store whose find_similar always matches user A's memory.""" + def __init__(self, match_id): + self.healthy = True + self._match_id = match_id + + def find_similar(self, text, threshold=0.92): + return self._match_id + + +def test_vector_match_from_other_tenant_does_not_drop_users_fact(monkeypatch): + # User A already owns a semantically-similar memory. + mm = FakeMemoryManager([ + {"id": "a1", "text": "I live in Lisbon", "owner": "userA"}, + ]) + # The vector store reports user B's new fact as a near-duplicate of a1. + vec = FakeVector(match_id="a1") + _install_llm_stub('["My home is in Lisbon"]') + + memory_extractor = _load_extractor() + + asyncio.run(memory_extractor.extract_and_store( + FakeSession(owner="userB"), mm, vec, + endpoint_url="http://x", model="m", + )) + + b_texts = {r["text"] for r in mm.load(owner="userB")} + assert "My home is in Lisbon" in b_texts, ( + "User B's own extracted fact was dropped because the shared vector " + "store matched user A's memory (cross-tenant dedup)." + )