fix: stop deep research from discarding valid sources that mention cookies/copyright (#481)

* fix: drop over-broad 'cookie'/'copyright' low-quality markers

* fix: detect cookie/copyright boilerplate via phrases, not bare words

* test: keep research findings that merely mention cookies or copyright
This commit is contained in:
Afonso Coutinho
2026-06-01 14:26:37 +01:00
committed by GitHub
parent 07d92556a3
commit c38932e6c6
2 changed files with 25 additions and 2 deletions

View File

@@ -39,9 +39,16 @@ LOW_QUALITY_MARKERS = [
"unable to extract",
"completely unrelated",
"boilerplate",
"cookie",
"footer text",
"copyright",
# Phrases (not bare "cookie"/"copyright") so we still catch boilerplate
# like consent banners and footers without discarding legitimate findings
# that merely discuss cookies or copyright as their subject.
"cookie consent",
"cookie banner",
"cookie notice",
"copyright notice",
"copyright footer",
"all rights reserved",
]