fix: stop services/research listing junk no-content pages as cited sources (#1669)

2026-06-03 06:22:58 +01:00
parent eae8797e08
commit 7f80d33210
2 changed files with 92 additions and 3 deletions
--- a/services/research/research_handler.py
+++ b/services/research/research_handler.py
@@ -14,6 +14,8 @@ import time
 from pathlib import Path
 from typing import Optional, Dict

+from src.research_utils import is_low_quality
+
 logger = logging.getLogger(__name__)

 RESEARCH_DATA_DIR = Path("data/deep_research")
@@ -179,13 +181,14 @@ class ResearchHandler:

    @staticmethod
    def _extract_sources(findings: list) -> list:
-        """Extract deduplicated [{url, title}] from findings."""
+        """Extract deduplicated [{url, title}], skipping findings whose summary is low-quality."""
        seen = set()
        sources = []
        for f in findings:
            url = f.get("url", "")
            title = f.get("title", "") or url
-            if url and url not in seen:
+            summary = f.get("summary", "") or f.get("evidence", "")
+            if url and url not in seen and not is_low_quality(summary):
                seen.add(url)
                sources.append({"url": url, "title": title})
        return sources
@@ -346,7 +349,8 @@ class ResearchHandler:
            for f in findings:
                url = f.get("url", "")
                title = f.get("title", "") or url
-                if url and url not in seen_urls:
+                summary = f.get("summary", "") or f.get("evidence", "")
+                if url and url not in seen_urls and not is_low_quality(summary):
                    seen_urls.add(url)
                    source_lines.append(f"- [{title}]({url})")
            if source_lines:
--- a/tests/test_services_research_low_quality_sources.py
+++ b/tests/test_services_research_low_quality_sources.py
@@ -0,0 +1,85 @@
+"""services/research _extract_sources must gate low-quality findings.
+
+The src/research_handler.py copy filters findings whose summary is junk
+boilerplate (via research_utils.is_low_quality) before listing them as
+cited sources. The services/research copy diverged and had no gate, so
+"the page does not contain relevant information" URLs showed up as
+sources, and a junk finding seen first suppressed the good title for the
+same URL. services/research/service.py imports this handler, so it is the
+live path.
+"""
+
+import importlib.util
+import sys
+import types
+
+import pytest
+
+
+@pytest.fixture
+def handler_cls(monkeypatch):
+    """Load services.research.research_handler by file path (cwd-relative: run from
+    the repo root) so the heavy services/__init__.py (httpx etc.) is never imported."""
+    pkg = types.ModuleType("services")
+    pkg.__path__ = []
+    sub = types.ModuleType("services.research")
+    sub.__path__ = []
+    monkeypatch.setitem(sys.modules, "services", pkg)
+    monkeypatch.setitem(sys.modules, "services.research", sub)
+    name = "services.research.research_handler"
+    monkeypatch.delitem(sys.modules, name, raising=False)
+    spec = importlib.util.spec_from_file_location(
+        name, "services/research/research_handler.py"
+    )
+    mod = importlib.util.module_from_spec(spec)
+    monkeypatch.setitem(sys.modules, name, mod)
+    spec.loader.exec_module(mod)
+    return mod.ResearchHandler
+
+
+JUNK = "The page does not contain relevant information"
+
+
+def test_low_quality_summary_is_not_a_source(handler_cls):
+    out = handler_cls._extract_sources([{"url": "http://a", "title": "T", "summary": JUNK}])
+    assert out == []
+
+
+def test_good_summary_is_kept(handler_cls):
+    out = handler_cls._extract_sources(
+        [{"url": "http://a", "title": "T", "summary": "Detailed statistics about the topic"}]
+    )
+    assert out == [{"url": "http://a", "title": "T"}]
+
+
+def test_junk_first_no_longer_suppresses_the_good_finding(handler_cls):
+    out = handler_cls._extract_sources(
+        [
+            {"url": "http://a", "title": "Bad", "summary": JUNK},
+            {"url": "http://a", "title": "Good", "summary": "Real data about the topic"},
+        ]
+    )
+    assert out == [{"url": "http://a", "title": "Good"}]
+
+
+def test_evidence_is_checked_when_summary_missing(handler_cls):
+    out = handler_cls._extract_sources(
+        [{"url": "http://a", "title": "T", "evidence": "Concrete evidence text"}]
+    )
+    assert out == [{"url": "http://a", "title": "T"}]
+
+
+def test_report_sources_section_gates_junk(handler_cls):
+    h = object.__new__(handler_cls)
+    report = h._format_research_report(
+        "q",
+        "full report",
+        {},
+        1.0,
+        findings=[
+            {"url": "http://junk", "title": "Junk", "summary": JUNK},
+            {"url": "http://good", "title": "Good", "summary": "Useful content here"},
+        ],
+    )
+    assert "http://good" in report
+    assert "- [Junk](http://junk)" not in report