fix: services research lists junk no-content pages as cited sources (#1669)
This commit is contained in:
@@ -14,6 +14,8 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict
|
||||
|
||||
from src.research_utils import is_low_quality
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
RESEARCH_DATA_DIR = Path("data/deep_research")
|
||||
@@ -179,13 +181,14 @@ class ResearchHandler:
|
||||
|
||||
@staticmethod
def _extract_sources(findings: list) -> list:
    """Extract deduplicated [{url, title}] from findings, filtering low-quality ones.

    Args:
        findings: Iterable of dict-like findings. Each is read via ``.get``
            for the keys ``"url"``, ``"title"``, ``"summary"`` and
            ``"evidence"``.

    Returns:
        A list of ``{"url": ..., "title": ...}`` dicts in first-seen order,
        one per distinct URL. The title falls back to the URL when it is
        missing or empty.
    """
    seen = set()
    sources = []
    for f in findings:
        url = f.get("url", "")
        # Findings without a URL, or whose URL is already listed, add nothing.
        if not url or url in seen:
            continue

        # Quality is judged on the summary text, falling back to the evidence
        # text when the summary is missing or empty.
        summary = f.get("summary", "") or f.get("evidence", "")
        if is_low_quality(summary):
            # Deliberately NOT added to ``seen``: a later finding for the same
            # URL whose text passes the quality check can still be cited.
            continue

        seen.add(url)
        sources.append({"url": url, "title": f.get("title", "") or url})
    return sources
|
||||
@@ -346,7 +349,8 @@ class ResearchHandler:
|
||||
for f in findings:
|
||||
url = f.get("url", "")
|
||||
title = f.get("title", "") or url
|
||||
if url and url not in seen_urls:
|
||||
summary = f.get("summary", "") or f.get("evidence", "")
|
||||
if url and url not in seen_urls and not is_low_quality(summary):
|
||||
seen_urls.add(url)
|
||||
source_lines.append(f"- [{title}]({url})")
|
||||
if source_lines:
|
||||
|
||||
Reference in New Issue
Block a user