From 28b296a712675b7f3732ca4486e99b28fcae68d7 Mon Sep 17 00:00:00 2001 From: afonsopc Date: Wed, 3 Jun 2026 15:10:10 +0100 Subject: [PATCH 1/3] Fix auto-memory vector dedup dropping a user's fact on cross-tenant match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extract_and_store dedups each extracted fact against the vector store before the (owner-scoped) text fallback. The vector store is a single shared ChromaDB collection storing only {"source": "memory"} — no owner — and find_similar queries it with no owner filter, so it can return a memory_id belonging to a different tenant. The old code continue'd (skipped storing) on any vector hit without checking ownership, so when ChromaDB is healthy (the common path) a user's freshly-extracted fact was silently dropped because it was merely semantically similar to another user's memory — the text fallback that IS owner-scoped never ran. Gate the skip on the matched memory being this user's own (or legacy unowned), mirroring the text dedup predicate; cross-tenant or stale matches fall through. Same bug class as #1743. --- services/memory/memory_extractor.py | 13 +- ...st_memory_extractor_vector_cross_tenant.py | 111 ++++++++++++++++++ 2 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 tests/test_memory_extractor_vector_cross_tenant.py diff --git a/services/memory/memory_extractor.py b/services/memory/memory_extractor.py index 32412e6..44a9f1f 100644 --- a/services/memory/memory_extractor.py +++ b/services/memory/memory_extractor.py @@ -345,8 +345,17 @@ async def extract_and_store( logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}") existing_id = None if existing_id: - logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}") - continue + # The vector store is a single shared collection with no + # owner metadata, so find_similar can return ANOTHER + # tenant's memory. Only treat it as a duplicate when the + # match is this user's own (or a legacy unowned) memory — + # otherwise the user's freshly-extracted fact would be + # silently dropped. Mirror the owner predicate used by the + # text dedup below; cross-tenant/stale matches fall through. + _match = next((e for e in existing if e.get("id") == existing_id), None) + if _match is not None and (_match.get("owner") == _owner or _match.get("owner") is None): + logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}") + continue # Text dedup fallback: exact match + fuzzy similarity user_existing = [e for e in existing if e.get("owner") == _owner or e.get("owner") is None] if _owner else existing diff --git a/tests/test_memory_extractor_vector_cross_tenant.py b/tests/test_memory_extractor_vector_cross_tenant.py new file mode 100644 index 0000000..6b1d243 --- /dev/null +++ b/tests/test_memory_extractor_vector_cross_tenant.py @@ -0,0 +1,111 @@ +"""Regression: auto-memory vector dedup must not drop a user's fact because it +matches ANOTHER tenant's memory. + +`extract_and_store` dedups each extracted fact against the vector store first. +The vector store (`memory_vector`) is a single shared ChromaDB collection with +no owner in its metadata, so `find_similar` can return a memory_id belonging to +a different user. The old code `continue`d (skipped storing) on any vector hit +without checking ownership, so user B's freshly-extracted fact was silently +dropped when it was merely semantically similar to user A's memory. The text +dedup fallback right below is already owner-scoped; the vector path must be too. +""" +import asyncio +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parents[1] + + +def _load_extractor(): + # Load services/memory/memory_extractor.py directly by path so we don't + # trigger services/__init__ (which imports the search stack and its heavy + # optional deps). The module's only module-level imports are stdlib; its + # src.llm_core / src.event_bus imports are lazy and stubbed/guarded. + path = ROOT / "services" / "memory" / "memory_extractor.py" + spec = importlib.util.spec_from_file_location("memory_extractor_under_test", path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def _install_llm_stub(facts_json): + mod = types.ModuleType("src.llm_core") + + async def llm_call_async(*a, **k): + return facts_json + + mod.llm_call_async = llm_call_async + src_pkg = sys.modules.get("src") or types.ModuleType("src") + sys.modules["src"] = src_pkg + sys.modules["src.llm_core"] = mod + + +class FakeSession: + def __init__(self, owner): + self.owner = owner + + def get_context_messages(self): + return [ + {"role": "user", "content": "Tell me where I live."}, + {"role": "assistant", "content": "Noted."}, + ] + + +class FakeMemoryManager: + def __init__(self, rows): + self.rows = list(rows) + self._n = 0 + + def load_all(self): + return list(self.rows) + + def load(self, owner=None): + return [r for r in self.rows if r.get("owner") == owner] + + def find_duplicates(self, text, subset): + t = text.strip().lower() + return [r for r in subset if r.get("text", "").strip().lower() == t] + + def add_entry(self, text, source="auto", category="fact", owner=None): + self._n += 1 + entry = {"id": f"new-{self._n}", "text": text, "owner": owner, + "source": source, "category": category} + self.rows.append(entry) + return entry + + +class FakeVector: + """Healthy vector store whose find_similar always matches user A's memory.""" + def __init__(self, match_id): + self.healthy = True + self._match_id = match_id + + def find_similar(self, text, threshold=0.92): + return self._match_id + + +def test_vector_match_from_other_tenant_does_not_drop_users_fact(monkeypatch): + # User A already owns a semantically-similar memory. + mm = FakeMemoryManager([ + {"id": "a1", "text": "I live in Lisbon", "owner": "userA"}, + ]) + # The vector store reports user B's new fact as a near-duplicate of a1. + vec = FakeVector(match_id="a1") + _install_llm_stub('["My home is in Lisbon"]') + + memory_extractor = _load_extractor() + + asyncio.run(memory_extractor.extract_and_store( + FakeSession(owner="userB"), mm, vec, + endpoint_url="http://x", model="m", + )) + + b_texts = {r["text"] for r in mm.load(owner="userB")} + assert "My home is in Lisbon" in b_texts, ( + "User B's own extracted fact was dropped because the shared vector " + "store matched user A's memory (cross-tenant dedup)." + ) From 1801ba9a0d3c13135748c19ba04e9b8c510d79cf Mon Sep 17 00:00:00 2001 From: afonsopc Date: Thu, 4 Jun 2026 19:26:28 +0100 Subject: [PATCH 2/3] Update degraded-vector dedup test for owner-scoped vector match --- .../test_memory_extractor_vector_degraded.py | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/test_memory_extractor_vector_degraded.py b/tests/test_memory_extractor_vector_degraded.py index 94ea594..1b3bd24 100644 --- a/tests/test_memory_extractor_vector_degraded.py +++ b/tests/test_memory_extractor_vector_degraded.py @@ -86,8 +86,12 @@ def test_extraction_persists_facts_when_vector_store_fails_at_runtime(monkeypatc def test_healthy_vector_store_still_dedups_normally(monkeypatch): - """Control: when find_similar reports a match, that fact is skipped — the - try/except added around it must not swallow a legitimate dedup hit.""" + """Control: a vector hit on the user's OWN memory is honored (deduped) and + add is not called. The vector store is a shared collection with no owner + metadata, so a hit is only treated as a duplicate when the matched id + resolves to this user's own (or legacy unowned) memory — otherwise the + fact would be a cross-tenant false drop. Here the match is alice's own + memory, so the dedup must still fire.""" async def _fake_llm(url, model, messages, **kwargs): return '[{"text": "Alice lives in Lisbon", "category": "fact"}]' @@ -95,19 +99,27 @@ def test_healthy_vector_store_still_dedups_normally(monkeypatch): monkeypatch.setattr(src.llm_core, "llm_call_async", _fake_llm) monkeypatch.setattr(src.event_bus, "fire_event", lambda *a, **k: None) - class _DedupVectorStore: - healthy = True - - def find_similar(self, text, threshold=0.72): - return "existing-id" # claim it already exists - - def add(self, memory_id, text): # pragma: no cover - should not run - raise AssertionError("add should not be called for a deduped fact") - with tempfile.TemporaryDirectory() as data_dir: mgr = MemoryManager(data_dir) + # Seed alice's own memory (persisted so load_all sees it) and point + # find_similar at its real id. + seeded = mgr.add_entry("Alice's home city is Lisbon", source="auto", + category="fact", owner="alice") + mgr.save([seeded]) + + class _DedupVectorStore: + healthy = True + + def find_similar(self, text, threshold=0.72): + return seeded["id"] # matches alice's own seeded memory + + def add(self, memory_id, text): # pragma: no cover - should not run + raise AssertionError("add should not be called for a deduped fact") + _run(extract_and_store( _FakeSession(), mgr, _DedupVectorStore(), endpoint_url="http://x", model="m", headers=None, )) - assert mgr.load(owner="alice") == [] + # The new fact was deduped against alice's own memory, so only the + # seeded entry remains (no duplicate added). + assert [e["text"] for e in mgr.load(owner="alice")] == ["Alice's home city is Lisbon"] From 9be2862e4ee2bfb601e3891893dcd9b4d35ea664 Mon Sep 17 00:00:00 2001 From: afonsopc Date: Fri, 5 Jun 2026 00:04:15 +0100 Subject: [PATCH 3/3] Stub llm_core via monkeypatch.setitem so the cross-tenant test does not leak its fake into later test modules --- tests/test_memory_extractor_vector_cross_tenant.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_memory_extractor_vector_cross_tenant.py b/tests/test_memory_extractor_vector_cross_tenant.py index 6b1d243..49702c1 100644 --- a/tests/test_memory_extractor_vector_cross_tenant.py +++ b/tests/test_memory_extractor_vector_cross_tenant.py @@ -32,16 +32,20 @@ def _load_extractor(): return mod -def _install_llm_stub(facts_json): +def _install_llm_stub(monkeypatch, facts_json): mod = types.ModuleType("src.llm_core") async def llm_call_async(*a, **k): return facts_json mod.llm_call_async = llm_call_async + # Use monkeypatch.setitem so sys.modules is restored at teardown. A raw + # assignment here permanently replaced the real src.llm_core with this + # stripped stub, leaking "My home is in Lisbon" (and hiding _detect_provider) + # into every later-collected test that imports the real module. src_pkg = sys.modules.get("src") or types.ModuleType("src") - sys.modules["src"] = src_pkg - sys.modules["src.llm_core"] = mod + monkeypatch.setitem(sys.modules, "src", src_pkg) + monkeypatch.setitem(sys.modules, "src.llm_core", mod) class FakeSession: @@ -95,7 +99,7 @@ def test_vector_match_from_other_tenant_does_not_drop_users_fact(monkeypatch): ]) # The vector store reports user B's new fact as a near-duplicate of a1. vec = FakeVector(match_id="a1") - _install_llm_stub('["My home is in Lisbon"]') + _install_llm_stub(monkeypatch, '["My home is in Lisbon"]') memory_extractor = _load_extractor()