Fix auto-memory vector dedup dropping a user's fact on cross-tenant match
extract_and_store dedups each extracted fact against the vector store
before the (owner-scoped) text fallback. The vector store is a single
shared ChromaDB collection storing only {"source": "memory"} — no
owner — and find_similar queries it with no owner filter, so it can
return a memory_id belonging to a different tenant. The old code
continue'd (skipped storing) on any vector hit without checking
ownership, so when ChromaDB is healthy (the common path) a user's
freshly-extracted fact was silently dropped because it was merely
semantically similar to another user's memory — the text fallback that
IS owner-scoped never ran. Gate the skip on the matched memory being
this user's own (or legacy unowned), mirroring the text dedup predicate;
cross-tenant or stale matches fall through. Same bug class as #1743.
This commit is contained in:
@@ -345,8 +345,17 @@ async def extract_and_store(
|
||||
logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}")
|
||||
existing_id = None
|
||||
if existing_id:
|
||||
logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
|
||||
continue
|
||||
# The vector store is a single shared collection with no
|
||||
# owner metadata, so find_similar can return ANOTHER
|
||||
# tenant's memory. Only treat it as a duplicate when the
|
||||
# match is this user's own (or a legacy unowned) memory —
|
||||
# otherwise the user's freshly-extracted fact would be
|
||||
# silently dropped. Mirror the owner predicate used by the
|
||||
# text dedup below; cross-tenant/stale matches fall through.
|
||||
_match = next((e for e in existing if e.get("id") == existing_id), None)
|
||||
if _match is not None and (_match.get("owner") == _owner or _match.get("owner") is None):
|
||||
logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
|
||||
continue
|
||||
|
||||
# Text dedup fallback: exact match + fuzzy similarity
|
||||
user_existing = [e for e in existing if e.get("owner") == _owner or e.get("owner") is None] if _owner else existing
|
||||
|
||||
111
tests/test_memory_extractor_vector_cross_tenant.py
Normal file
111
tests/test_memory_extractor_vector_cross_tenant.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""Regression: auto-memory vector dedup must not drop a user's fact because it
|
||||
matches ANOTHER tenant's memory.
|
||||
|
||||
`extract_and_store` dedups each extracted fact against the vector store first.
|
||||
The vector store (`memory_vector`) is a single shared ChromaDB collection with
|
||||
no owner in its metadata, so `find_similar` can return a memory_id belonging to
|
||||
a different user. The old code `continue`d (skipped storing) on any vector hit
|
||||
without checking ownership, so user B's freshly-extracted fact was silently
|
||||
dropped when it was merely semantically similar to user A's memory. The text
|
||||
dedup fallback right below is already owner-scoped; the vector path must be too.
|
||||
"""
|
||||
import asyncio
|
||||
import importlib.util
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def _load_extractor():
|
||||
# Load services/memory/memory_extractor.py directly by path so we don't
|
||||
# trigger services/__init__ (which imports the search stack and its heavy
|
||||
# optional deps). The module's only module-level imports are stdlib; its
|
||||
# src.llm_core / src.event_bus imports are lazy and stubbed/guarded.
|
||||
path = ROOT / "services" / "memory" / "memory_extractor.py"
|
||||
spec = importlib.util.spec_from_file_location("memory_extractor_under_test", path)
|
||||
mod = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(mod)
|
||||
return mod
|
||||
|
||||
|
||||
def _install_llm_stub(facts_json):
|
||||
mod = types.ModuleType("src.llm_core")
|
||||
|
||||
async def llm_call_async(*a, **k):
|
||||
return facts_json
|
||||
|
||||
mod.llm_call_async = llm_call_async
|
||||
src_pkg = sys.modules.get("src") or types.ModuleType("src")
|
||||
sys.modules["src"] = src_pkg
|
||||
sys.modules["src.llm_core"] = mod
|
||||
|
||||
|
||||
class FakeSession:
|
||||
def __init__(self, owner):
|
||||
self.owner = owner
|
||||
|
||||
def get_context_messages(self):
|
||||
return [
|
||||
{"role": "user", "content": "Tell me where I live."},
|
||||
{"role": "assistant", "content": "Noted."},
|
||||
]
|
||||
|
||||
|
||||
class FakeMemoryManager:
|
||||
def __init__(self, rows):
|
||||
self.rows = list(rows)
|
||||
self._n = 0
|
||||
|
||||
def load_all(self):
|
||||
return list(self.rows)
|
||||
|
||||
def load(self, owner=None):
|
||||
return [r for r in self.rows if r.get("owner") == owner]
|
||||
|
||||
def find_duplicates(self, text, subset):
|
||||
t = text.strip().lower()
|
||||
return [r for r in subset if r.get("text", "").strip().lower() == t]
|
||||
|
||||
def add_entry(self, text, source="auto", category="fact", owner=None):
|
||||
self._n += 1
|
||||
entry = {"id": f"new-{self._n}", "text": text, "owner": owner,
|
||||
"source": source, "category": category}
|
||||
self.rows.append(entry)
|
||||
return entry
|
||||
|
||||
|
||||
class FakeVector:
|
||||
"""Healthy vector store whose find_similar always matches user A's memory."""
|
||||
def __init__(self, match_id):
|
||||
self.healthy = True
|
||||
self._match_id = match_id
|
||||
|
||||
def find_similar(self, text, threshold=0.92):
|
||||
return self._match_id
|
||||
|
||||
|
||||
def test_vector_match_from_other_tenant_does_not_drop_users_fact(monkeypatch):
|
||||
# User A already owns a semantically-similar memory.
|
||||
mm = FakeMemoryManager([
|
||||
{"id": "a1", "text": "I live in Lisbon", "owner": "userA"},
|
||||
])
|
||||
# The vector store reports user B's new fact as a near-duplicate of a1.
|
||||
vec = FakeVector(match_id="a1")
|
||||
_install_llm_stub('["My home is in Lisbon"]')
|
||||
|
||||
memory_extractor = _load_extractor()
|
||||
|
||||
asyncio.run(memory_extractor.extract_and_store(
|
||||
FakeSession(owner="userB"), mm, vec,
|
||||
endpoint_url="http://x", model="m",
|
||||
))
|
||||
|
||||
b_texts = {r["text"] for r in mm.load(owner="userB")}
|
||||
assert "My home is in Lisbon" in b_texts, (
|
||||
"User B's own extracted fact was dropped because the shared vector "
|
||||
"store matched user A's memory (cross-tenant dedup)."
|
||||
)
|
||||
Reference in New Issue
Block a user