fix: deep research discards valid sources mentioning cookies/copyright (#481)
* fix: drop over-broad 'cookie'/'copyright' low-quality markers
* fix: detect cookie/copyright boilerplate via phrases, not bare words
* test: keep research findings that merely mention cookies or copyright
This commit is contained in:
@@ -39,9 +39,16 @@ LOW_QUALITY_MARKERS = [
|
||||
"unable to extract",
|
||||
"completely unrelated",
|
||||
"boilerplate",
|
||||
"cookie",
|
||||
"footer text",
|
||||
"copyright",
|
||||
# Phrases (not bare "cookie"/"copyright") so we still catch boilerplate
|
||||
# like consent banners and footers without discarding legitimate findings
|
||||
# that merely discuss cookies or copyright as their subject.
|
||||
"cookie consent",
|
||||
"cookie banner",
|
||||
"cookie notice",
|
||||
"copyright notice",
|
||||
"copyright footer",
|
||||
"all rights reserved",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -79,3 +79,19 @@ class TestIsLowQuality:
|
||||
|
||||
def test_copyright_marker(self):
|
||||
assert is_low_quality("Just a copyright notice at the bottom.") is True
|
||||
|
||||
# Regression: bare "cookie"/"copyright" used to be substring markers, so
|
||||
# legitimate findings that merely discuss them as their subject were
|
||||
# discarded. They must now be kept.
|
||||
def test_keeps_finding_about_copyright_law(self):
|
||||
assert is_low_quality("This article explains the new EU copyright directive reforms.") is False
|
||||
|
||||
def test_keeps_finding_about_cookies(self):
|
||||
assert is_low_quality("A technical guide to how tracking cookies and session cookies work.") is False
|
||||
|
||||
def test_keeps_recipe_mentioning_cookies(self):
|
||||
assert is_low_quality("Recipe: the best chocolate chip cookies you will ever bake.") is False
|
||||
|
||||
# Boilerplate is still caught via phrases.
|
||||
def test_cookie_consent_banner_still_filtered(self):
|
||||
assert is_low_quality("The page is just a cookie consent banner.") is True
|
||||
|
||||
Reference in New Issue
Block a user