* fix: drop over-broad 'cookie'/'copyright' low-quality markers * fix: detect cookie/copyright boilerplate via phrases, not bare words * test: keep research findings that merely mention cookies or copyright
64 lines
2.2 KiB
Python
64 lines
2.2 KiB
Python
# src/research_utils.py
|
|
"""Shared utilities for the deep research system.
|
|
|
|
Centralizes text cleaning, quality filtering, and other logic
|
|
used across deep_research.py, research_handler.py, and visual_report.py.
|
|
"""
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Thinking / reasoning block stripping
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def strip_thinking(text):
|
|
"""Strip thinking / reasoning patterns from LLM output.
|
|
|
|
Delegates to `src.text_helpers.strip_think` (single source of truth).
|
|
Kept as an alias here so existing `from src.research_utils import strip_thinking`
|
|
callers don't break. Preserves None passthrough — many callers pass an
|
|
`Optional[str]` LLM result and expect None back when the call failed.
|
|
"""
|
|
if text is None:
|
|
return None
|
|
from src.text_helpers import strip_think
|
|
return strip_think(text, prose=False, prompt_echo=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Source quality filtering
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Markers indicating extracted content is boilerplate, error text, or empty.
|
|
# If any marker is found (case-insensitive), the content is filtered out.
|
|
LOW_QUALITY_MARKERS = [
|
|
"insufficient to",
|
|
"content is insufficient",
|
|
"no substantive data",
|
|
"does not contain",
|
|
"not relevant to",
|
|
"no relevant information",
|
|
"unable to extract",
|
|
"completely unrelated",
|
|
"boilerplate",
|
|
"footer text",
|
|
# Phrases (not bare "cookie"/"copyright") so we still catch boilerplate
|
|
# like consent banners and footers without discarding legitimate findings
|
|
# that merely discuss cookies or copyright as their subject.
|
|
"cookie consent",
|
|
"cookie banner",
|
|
"cookie notice",
|
|
"copyright notice",
|
|
"copyright footer",
|
|
"all rights reserved",
|
|
]
|
|
|
|
|
|
def is_low_quality(summary: str) -> bool:
|
|
"""Check if a finding summary indicates useless or irrelevant content."""
|
|
try:
|
|
if not summary:
|
|
return True
|
|
low = summary.lower()
|
|
return any(marker in low for marker in LOW_QUALITY_MARKERS)
|
|
except Exception:
|
|
return False # fail open
|