fix: services research lists junk no-content pages as cited sources (#1669)

This commit is contained in:
Afonso Coutinho
2026-06-03 06:22:58 +01:00
committed by GitHub
parent eae8797e08
commit 7f80d33210
2 changed files with 92 additions and 3 deletions

View File

@@ -14,6 +14,8 @@ import time
from pathlib import Path
from typing import Optional, Dict
from src.research_utils import is_low_quality
logger = logging.getLogger(__name__)
RESEARCH_DATA_DIR = Path("data/deep_research")
@@ -179,13 +181,14 @@ class ResearchHandler:
@staticmethod
def _extract_sources(findings: list) -> list:
    """Extract deduplicated [{url, title}] from findings, filtering low-quality ones.

    Args:
        findings: Finding dicts; each may carry "url", "title", "summary"
            and "evidence" keys.

    Returns:
        One {"url": ..., "title": ...} dict per distinct URL, in the order
        the URLs were first accepted. Findings with no URL, or whose
        summary/evidence text is flagged by is_low_quality(), are skipped.
        The title falls back to the URL when it is missing or empty.
    """
    seen = set()
    sources = []
    for f in findings:
        url = f.get("url", "")
        if not url or url in seen:
            continue
        # Fall back from summary to evidence, and normalise to a string so a
        # finding that stores None under both keys is never passed on as None.
        summary = f.get("summary") or f.get("evidence") or ""
        if is_low_quality(summary):
            # Deliberately not added to `seen`: a later finding for the same
            # URL with usable content can still be cited.
            continue
        seen.add(url)
        sources.append({"url": url, "title": f.get("title", "") or url})
    return sources
@@ -346,7 +349,8 @@ class ResearchHandler:
for f in findings:
url = f.get("url", "")
title = f.get("title", "") or url
if url and url not in seen_urls:
summary = f.get("summary", "") or f.get("evidence", "")
if url and url not in seen_urls and not is_low_quality(summary):
seen_urls.add(url)
source_lines.append(f"- [{title}]({url})")
if source_lines: