fix: data integrity — deep-research result parsing + memory-extraction durability (#808)

Two independent data-integrity bugs:

- services/research/service.py: ResearchService.research() (the public deep-research
  API, re-exported from services/__init__) treated the handler return value as a
  dict (result.get("sources"/"summary"/...)), but call_research_service() returns a
  formatted markdown STRING -> AttributeError: 'str' object has no attribute 'get' on EVERY
  successful call, making the API unusable for any non-error result. Now uses the
  string report as the summary and parses sources from the "### Sources" markdown
  section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.

- services/memory/memory_extractor.py: extract_and_store guarded the vector-store
  find_similar/add calls only with the .healthy flag set ONCE at init. If the
  embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint
  down), those calls raised, the exception escaped the dedup loop, skipped
  memory_manager.save(), and was swallowed by the outer try/except -> EVERY
  validated fact from the session was silently lost (the function docstring
  promises "never raised"). Now falls back to the existing text/fuzzy dedup so
  facts are still saved when the vector index is unavailable at runtime.

Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
This commit is contained in:
David Anderson
2026-06-01 19:27:31 -07:00
committed by GitHub
parent 0e31c38be0
commit 610968f91e
4 changed files with 346 additions and 17 deletions

View File

@@ -0,0 +1,113 @@
"""Regression: auto memory extraction must survive a runtime vector-store
failure.
The vector index reports `.healthy` only at init time. If the embedding
backend dies later (OOM, model evicted, remote endpoint down), the per-fact
`find_similar` / `add` calls raise. Before the fix these exceptions escaped the
dedup loop, jumped past `memory_manager.save(...)`, and were swallowed by the
function's outer try/except — so EVERY validated fact from the session was
silently lost (the feature promises "Errors are logged, never raised", but it
also quietly dropped all the data).
After the fix a degraded vector store falls through to the text/fuzzy dedup
path (which the code already maintains "when vector index is unavailable") and
the facts still land in the JSON store.
"""
import asyncio
import tempfile
import src.llm_core
import src.event_bus
from src.memory import MemoryManager
from services.memory.memory_extractor import extract_and_store
class _FakeSession:
    """Bare-bones session stand-in with a two-message history.

    Exposes the ``owner`` / ``session_id`` attributes and the
    ``get_context_messages`` accessor, with one user turn and one assistant
    turn so that extraction has something to work on.
    """

    owner = "alice"
    session_id = "sess-1"

    def get_context_messages(self):
        """Return a freshly built user/assistant exchange."""
        user_turn = {"role": "user", "content": "Hi, a few things about me."}
        assistant_turn = {"role": "assistant", "content": "Noted."}
        return [user_turn, assistant_turn]
class _BrokenVectorStore:
    """Vector store that advertises ``healthy`` yet fails on every real call.

    Models a backend that passed its init-time health check and then went
    away: both ``find_similar`` and ``add`` raise ``RuntimeError``.
    """

    healthy = True

    @staticmethod
    def _unavailable():
        # Single place that produces the simulated backend outage.
        raise RuntimeError("embedding backend unavailable")

    def find_similar(self, text, threshold=0.72):
        """Always raise, as if the embedding backend were unreachable."""
        self._unavailable()

    def add(self, memory_id, text):
        """Always raise, as if the embedding backend were unreachable."""
        self._unavailable()
def _run(coro):
    """Run *coro* to completion on a private event loop and return its result.

    A fresh loop is created for every call so tests do not share loop state.
    The loop is always closed afterwards, including when the coroutine
    raises, so no unclosed-loop ``ResourceWarning`` or leaked selector is
    left behind. Any exception raised by *coro* propagates to the caller.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
def test_extraction_persists_facts_when_vector_store_fails_at_runtime(monkeypatch):
    """Both extracted facts must reach the store even though every
    vector-store call raises during extraction."""
    llm_reply = (
        '[{"text": "Alice lives in Lisbon", "category": "fact"}, '
        '{"text": "Alice prefers tea over coffee", "category": "preference"}]'
    )

    async def _canned_llm(url, model, messages, **kwargs):
        return llm_reply

    def _ignore_event(*a, **k):
        return None

    monkeypatch.setattr(src.llm_core, "llm_call_async", _canned_llm)
    # fire_event touches an async event loop / disk — neutralize it.
    monkeypatch.setattr(src.event_bus, "fire_event", _ignore_event)

    with tempfile.TemporaryDirectory() as data_dir:
        manager = MemoryManager(data_dir)
        extraction = extract_and_store(
            _FakeSession(),
            manager,
            _BrokenVectorStore(),
            endpoint_url="http://x",
            model="m",
            headers=None,
        )
        _run(extraction)
        # Read back while the temporary directory still exists.
        saved_texts = {entry["text"] for entry in manager.load(owner="alice")}

    # The bug lost ALL of them (save() was never reached); both must survive.
    assert "Alice lives in Lisbon" in saved_texts
    assert "Alice prefers tea over coffee" in saved_texts
def test_healthy_vector_store_still_dedups_normally(monkeypatch):
    """Control case: a working vector store that reports a match must cause
    the fact to be skipped — the try/except wrapped around ``find_similar``
    must not swallow a legitimate dedup hit."""

    class _DedupVectorStore:
        """Healthy store that claims every text already exists."""

        healthy = True

        def find_similar(self, text, threshold=0.72):
            return "existing-id"  # claim it already exists

        def add(self, memory_id, text):  # pragma: no cover - should not run
            raise AssertionError("add should not be called for a deduped fact")

    async def _canned_llm(url, model, messages, **kwargs):
        return '[{"text": "Alice lives in Lisbon", "category": "fact"}]'

    def _ignore_event(*a, **k):
        return None

    monkeypatch.setattr(src.llm_core, "llm_call_async", _canned_llm)
    monkeypatch.setattr(src.event_bus, "fire_event", _ignore_event)

    with tempfile.TemporaryDirectory() as data_dir:
        manager = MemoryManager(data_dir)
        extraction = extract_and_store(
            _FakeSession(),
            manager,
            _DedupVectorStore(),
            endpoint_url="http://x",
            model="m",
            headers=None,
        )
        _run(extraction)
        # Nothing may have been stored: the only fact was a duplicate.
        assert manager.load(owner="alice") == []

View File

@@ -0,0 +1,153 @@
"""Tests for ResearchService — correct handling of the handler's string report.
ResearchHandler.call_research_service returns a *formatted markdown string*,
not a dict. ResearchService.research() must consume that contract without
raising (the previous code called ``.get()`` on the string and blew up on
every successful research call).
"""
import asyncio
import pytest
from services.research.service import (
ResearchService,
ResearchResult,
ResearchSource,
)
# A faithful slice of what ResearchHandler._format_research_report emits.
# Two details are deliberate fixtures for the tests in this file:
#   * the findings paragraph contains an inline markdown link that sits
#     OUTSIDE the "### Sources" section, so it must not be reported as a
#     source;
#   * the "Surface Codes Paper" entry is listed twice under "### Sources",
#     so URL de-duplication must collapse it to a single source.
SAMPLE_REPORT = """---
## Research Summary
**Duration:** 12.3s | **Rounds:** 3 | **Queries:** 5 | **URLs Analyzed:** 7
---
# Findings
Quantum error correction saw major advances in 2024. See [an inline note](https://inline.example/not-a-source) here.
### Sources
- [Surface Codes Paper](https://example.com/surface-codes)
- [Lab Announcement](https://example.com/lab)
- [Surface Codes Paper](https://example.com/surface-codes)
---
**The AI has analyzed all research findings above.**
"""
def _run(coro):
    """Run *coro* to completion on a private event loop and return its result.

    A fresh loop is created for every call so tests do not share loop state.
    The loop is always closed afterwards, including when the coroutine
    raises, so no unclosed-loop ``ResourceWarning`` or leaked selector is
    left behind. Any exception raised by *coro* propagates to the caller.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
class _StubHandler:
    """Test double for ResearchHandler that hands back a canned report.

    The value given at construction is returned unchanged by
    ``call_research_service``; the arguments of the most recent call are
    kept in ``called_with`` (``None`` until the first call).
    """

    def __init__(self, report):
        self.called_with = None
        self._report = report

    async def call_research_service(self, topic, llm_endpoint, llm_model,
                                    max_time=300, progress_callback=None):
        """Record ``(topic, llm_endpoint, llm_model, max_time)`` and return
        the canned report; ``progress_callback`` is accepted but unused."""
        recorded = (topic, llm_endpoint, llm_model, max_time)
        self.called_with = recorded
        return self._report
class TestResearchOnStringReport:
    """ResearchService.research() fed a markdown string by the handler."""

    def _service(self, report):
        """Build a ResearchService whose handler returns *report* verbatim."""
        service = ResearchService()
        service.handler = _StubHandler(report)
        return service

    def _research(self, report):
        """Run the standard "quantum" research call against *report*."""
        service = self._service(report)
        return _run(service.research("quantum", "http://llm", "model"))

    def test_does_not_raise_on_string_report(self):
        outcome = self._research(SAMPLE_REPORT)
        assert isinstance(outcome, ResearchResult)

    def test_summary_is_the_report(self):
        outcome = self._research(SAMPLE_REPORT)
        assert "Quantum error correction" in outcome.summary
        assert outcome.query == "quantum"

    def test_sources_parsed_and_deduped(self):
        outcome = self._research(SAMPLE_REPORT)
        found_urls = [source.url for source in outcome.sources]
        # The duplicated "Surface Codes Paper" entry appears only once.
        assert found_urls == [
            "https://example.com/surface-codes",
            "https://example.com/lab",
        ]
        assert all(isinstance(source, ResearchSource) for source in outcome.sources)

    def test_inline_links_outside_sources_section_ignored(self):
        outcome = self._research(SAMPLE_REPORT)
        found_urls = [source.url for source in outcome.sources]
        assert "https://inline.example/not-a-source" not in found_urls

    def test_duration_recorded(self):
        outcome = self._research(SAMPLE_REPORT)
        assert outcome.duration_seconds >= 0.0

    def test_empty_report_yields_no_sources(self):
        outcome = self._research("")
        assert outcome.sources == []
        assert outcome.summary == ""
class TestParseSources:
    """Direct checks of the ResearchService._parse_sources helper."""

    def test_returns_empty_for_empty_input(self):
        parsed = ResearchService._parse_sources("")
        assert parsed == []

    def test_handles_titleless_link(self):
        markdown = "### Sources\n\n- [](https://example.com/x)\n"
        parsed = ResearchService._parse_sources(markdown)
        assert len(parsed) == 1
        only_source = parsed[0]
        assert only_source.url == "https://example.com/x"
        assert only_source.title == ""

    def test_section_ends_at_next_heading(self):
        # The link under "### Notes" lies outside the Sources section.
        markdown = (
            "### Sources\n\n"
            "- [A](https://a.example)\n\n"
            "### Notes\n\n"
            "- [B](https://b.example)\n"
        )
        parsed = ResearchService._parse_sources(markdown)
        found_urls = [source.url for source in parsed]
        assert found_urls == ["https://a.example"]
class TestDictBackCompat:
    """A handler that returns a dict (legacy shape) must still work."""

    def test_dict_result_still_parsed(self):
        legacy_source = {
            "url": "https://x.example",
            "title": "X",
            "snippet": "s",
            "relevance": 0.9,
        }
        legacy_payload = {
            "summary": "done",
            "sources": [legacy_source],
            "sections": ["intro"],
            "tokens_used": 42,
        }

        class _DictHandler:
            """Handler double that answers with the legacy dict payload."""

            async def call_research_service(self, *a, **k):
                return legacy_payload

        service = ResearchService()
        service.handler = _DictHandler()
        outcome = _run(service.research("q", "http://llm", "model"))

        assert outcome.summary == "done"
        assert outcome.tokens_used == 42
        assert outcome.sections == ["intro"]
        first_source = outcome.sources[0]
        assert first_source.url == "https://x.example"
        assert first_source.relevance == 0.9