diff --git a/services/research/research_handler.py b/services/research/research_handler.py
index 77863b8..0a49c72 100644
--- a/services/research/research_handler.py
+++ b/services/research/research_handler.py
@@ -14,6 +14,8 @@ import time
 from pathlib import Path
 from typing import Optional, Dict
 
+from src.research_utils import is_low_quality
+
 logger = logging.getLogger(__name__)
 
 RESEARCH_DATA_DIR = Path("data/deep_research")
@@ -179,13 +181,15 @@ class ResearchHandler:
 
     @staticmethod
     def _extract_sources(findings: list) -> list:
-        """Extract deduplicated [{url, title}] from findings."""
+        """Extract deduplicated [{url, title}] from findings, filtering low-quality ones."""
         seen = set()
         sources = []
         for f in findings:
             url = f.get("url", "")
             title = f.get("title", "") or url
-            if url and url not in seen:
+            # Trailing `or ""` keeps this a str even when both fields are explicitly None.
+            summary = f.get("summary") or f.get("evidence") or ""
+            if url and url not in seen and not is_low_quality(summary):
                 seen.add(url)
                 sources.append({"url": url, "title": title})
         return sources
@@ -346,7 +350,9 @@ class ResearchHandler:
             for f in findings:
                 url = f.get("url", "")
                 title = f.get("title", "") or url
-                if url and url not in seen_urls:
+                # Trailing `or ""` keeps this a str even when both fields are explicitly None.
+                summary = f.get("summary") or f.get("evidence") or ""
+                if url and url not in seen_urls and not is_low_quality(summary):
                     seen_urls.add(url)
                     source_lines.append(f"- [{title}]({url})")
             if source_lines:
diff --git a/tests/test_services_research_low_quality_sources.py b/tests/test_services_research_low_quality_sources.py
new file mode 100644
index 0000000..2217f4b
--- /dev/null
+++ b/tests/test_services_research_low_quality_sources.py
@@ -0,0 +1,97 @@
+"""services/research _extract_sources must gate low-quality findings.
+
+The src/research_handler.py copy filters findings whose summary is junk
+boilerplate (via research_utils.is_low_quality) before listing them as
+cited sources. The services/research copy diverged and had no gate, so
+"the page does not contain relevant information" URLs showed up as
+sources, and a junk finding seen first suppressed the good title for the
+same URL. services/research/service.py imports this handler, so it is the
+live path.
+"""
+
+import importlib.util
+import sys
+import types
+from pathlib import Path
+
+import pytest
+
+# Resolve the handler relative to this file so the tests do not depend on
+# the directory pytest is launched from.
+HANDLER_PATH = (
+    Path(__file__).resolve().parents[1] / "services" / "research" / "research_handler.py"
+)
+
+
+@pytest.fixture
+def handler_cls(monkeypatch):
+    """Load services.research.research_handler from its file path so the
+    heavy services/__init__.py (httpx etc.) is never imported."""
+    pkg = types.ModuleType("services")
+    pkg.__path__ = []
+    sub = types.ModuleType("services.research")
+    sub.__path__ = []
+    monkeypatch.setitem(sys.modules, "services", pkg)
+    monkeypatch.setitem(sys.modules, "services.research", sub)
+    name = "services.research.research_handler"
+    monkeypatch.delitem(sys.modules, name, raising=False)
+    spec = importlib.util.spec_from_file_location(name, str(HANDLER_PATH))
+    mod = importlib.util.module_from_spec(spec)
+    monkeypatch.setitem(sys.modules, name, mod)
+    spec.loader.exec_module(mod)
+    return mod.ResearchHandler
+
+
+JUNK = "The page does not contain relevant information"
+
+
+def test_low_quality_summary_is_not_a_source(handler_cls):
+    out = handler_cls._extract_sources([{"url": "http://a", "title": "T", "summary": JUNK}])
+    assert out == []
+
+
+def test_good_summary_is_kept(handler_cls):
+    out = handler_cls._extract_sources(
+        [{"url": "http://a", "title": "T", "summary": "Detailed statistics about the topic"}]
+    )
+    assert out == [{"url": "http://a", "title": "T"}]
+
+
+def test_junk_first_no_longer_suppresses_the_good_finding(handler_cls):
+    out = handler_cls._extract_sources(
+        [
+            {"url": "http://a", "title": "Bad", "summary": JUNK},
+            {"url": "http://a", "title": "Good", "summary": "Real data about the topic"},
+        ]
+    )
+    assert out == [{"url": "http://a", "title": "Good"}]
+
+
+def test_evidence_is_checked_when_summary_missing(handler_cls):
+    out = handler_cls._extract_sources(
+        [{"url": "http://a", "title": "T", "evidence": "Concrete evidence text"}]
+    )
+    assert out == [{"url": "http://a", "title": "T"}]
+
+
+def test_none_summary_and_evidence_are_tolerated(handler_cls):
+    # Explicit None values must be coerced to "" before the quality check.
+    out = handler_cls._extract_sources(
+        [{"url": "http://a", "title": "T", "summary": None, "evidence": None}]
+    )
+    assert isinstance(out, list)
+
+
+def test_report_sources_section_gates_junk(handler_cls):
+    h = object.__new__(handler_cls)
+    report = h._format_research_report(
+        "q",
+        "full report",
+        {},
+        1.0,
+        findings=[
+            {"url": "http://junk", "title": "Junk", "summary": JUNK},
+            {"url": "http://good", "title": "Good", "summary": "Useful content here"},
+        ],
+    )
+    assert "http://good" in report
+    assert "- [Junk](http://junk)" not in report