diff --git a/src/research_utils.py b/src/research_utils.py
index ec9cffa..9961848 100644
--- a/src/research_utils.py
+++ b/src/research_utils.py
@@ -39,9 +39,16 @@ LOW_QUALITY_MARKERS = [
     "unable to extract",
     "completely unrelated",
     "boilerplate",
-    "cookie",
     "footer text",
-    "copyright",
+    # Phrases (not bare "cookie"/"copyright") so we still catch boilerplate
+    # like consent banners and footers without discarding legitimate findings
+    # that merely discuss cookies or copyright as their subject.
+    "cookie consent",
+    "cookie banner",
+    "cookie notice",
+    "copyright notice",
+    "copyright footer",
+    "all rights reserved",
 ]
 
 
diff --git a/tests/test_research_utils.py b/tests/test_research_utils.py
index 12e4df6..52001d0 100644
--- a/tests/test_research_utils.py
+++ b/tests/test_research_utils.py
@@ -79,3 +79,19 @@ class TestIsLowQuality:
 
     def test_copyright_marker(self):
         assert is_low_quality("Just a copyright notice at the bottom.") is True
+
+    # Regression: bare "cookie"/"copyright" used to be substring markers, so
+    # legitimate findings that merely discuss them as their subject were
+    # discarded. They must now be kept.
+    def test_keeps_finding_about_copyright_law(self):
+        assert is_low_quality("This article explains the new EU copyright directive reforms.") is False
+
+    def test_keeps_finding_about_cookies(self):
+        assert is_low_quality("A technical guide to how tracking cookies and session cookies work.") is False
+
+    def test_keeps_recipe_mentioning_cookies(self):
+        assert is_low_quality("Recipe: the best chocolate chip cookies you will ever bake.") is False
+
+    # Boilerplate is still caught via phrases.
+    def test_cookie_consent_banner_still_filtered(self):
+        assert is_low_quality("The page is just a cookie consent banner.") is True