diff --git a/services/search/ranking.py b/services/search/ranking.py index 17facba..23ea691 100644 --- a/services/search/ranking.py +++ b/services/search/ranking.py @@ -13,6 +13,11 @@ _SPORTS_HINTS = { "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb", "fifa", "world cup", "championship", "quarterfinal", "eliminates", } +# Word-boundary match so "sport" does not fire inside "transport"/"passport" +# and a domain like "transport.gov" is not mistaken for a sports site. +_SPORTS_HINT_RE = re.compile( + r"\b(?:" + "|".join(re.escape(h) for h in _SPORTS_HINTS) + r")\b" +) _LOW_VALUE_NEWS_DOMAINS = { "facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com", "www.yahoo.com", "msn.com", "www.msn.com", @@ -39,7 +44,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]: query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)] query_lc = query.lower() is_news_query = any(term in _NEWS_HINTS for term in query_terms) - is_sports_query = any(hint in query_lc for hint in _SPORTS_HINTS) + is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc)) def title_score(title: str) -> float: if not title: @@ -98,7 +103,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]: adjustment += 0.4 if netloc in _LOW_VALUE_NEWS_DOMAINS: adjustment -= 0.8 - if not is_sports_query and any(hint in text or hint in netloc for hint in _SPORTS_HINTS): + if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)): adjustment -= 1.5 # A country/news query should not rank a page whose title/snippet barely # mentions the country above actual news pages for that country. 
diff --git a/src/search/ranking.py b/src/search/ranking.py index 17605b2..771a11a 100644 --- a/src/search/ranking.py +++ b/src/search/ranking.py @@ -50,6 +50,11 @@ _SPORTS_HINTS = { "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb", "fifa", "world cup", "championship", "quarterfinal", "eliminates", } +# Word-boundary match so "sport" does not fire inside "transport"/"passport" +# and a domain like "transport.gov" is not mistaken for a sports site. +_SPORTS_HINT_RE = re.compile( + r"\b(?:" + "|".join(re.escape(h) for h in _SPORTS_HINTS) + r")\b" +) _LOW_VALUE_NEWS_DOMAINS = { "facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com", "www.yahoo.com", "msn.com", "www.msn.com", @@ -76,7 +81,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]: query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)] query_lc = query.lower() is_news_query = any(term in _NEWS_HINTS for term in query_terms) - is_sports_query = any(hint in query_lc for hint in _SPORTS_HINTS) + is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc)) def title_score(title: str) -> float: if not title: @@ -117,7 +122,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]: adjustment += 0.4 if netloc in _LOW_VALUE_NEWS_DOMAINS: adjustment -= 0.8 - if not is_sports_query and any(hint in text or hint in netloc for hint in _SPORTS_HINTS): + if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)): adjustment -= 1.5 # A country/news query should not rank a page whose title/snippet barely # mentions the country above actual news pages for that country. diff --git a/tests/test_search_ranking_sports_substring.py b/tests/test_search_ranking_sports_substring.py new file mode 100644 index 0000000..0a16761 --- /dev/null +++ b/tests/test_search_ranking_sports_substring.py @@ -0,0 +1,52 @@ +"""Regression: the sports-hint match must be word-boundary, not substring. 
+ +`_SPORTS_HINTS` contains "sport", which is a substring of "transport", +"passport", "sportswear", and of domains like "transport.gov". The old code +used `hint in text` / `hint in netloc`, so for any non-sports news query a +legitimate result mentioning "transport"/"passport" took the -1.5 sports +penalty and was pushed down the ranking. The query classifier had the same +flaw (a "passport" query was treated as a sports query). Both now use the +word-boundary `_SPORTS_HINT_RE`. + +The same ranking module exists in two live copies: `services/search/ranking.py` +(the /api/search HTTP path) and `src/search/ranking.py` (the agent's +`web_search` tool path via `src/search/core.py`). Both are fixed and both are +covered here. +""" +import pytest + +import services.search.ranking as services_ranking +import src.search.ranking as src_ranking + +MODULES = [services_ranking, src_ranking] +MODULE_IDS = ["services", "src"] + + +@pytest.mark.parametrize("ranking", MODULES, ids=MODULE_IDS) +def test_sports_regex_ignores_substring_false_positives(ranking): + for word in ("transport", "passport", "sportswear", "transportation"): + assert ranking._SPORTS_HINT_RE.search(word) is None, word + + +@pytest.mark.parametrize("ranking", MODULES, ids=MODULE_IDS) +def test_sports_regex_still_matches_real_terms(ranking): + for word in ("sport", "sports", "world cup", "the nba finals", "soccer match"): + assert ranking._SPORTS_HINT_RE.search(word) is not None, word + + +@pytest.mark.parametrize("ranking", MODULES, ids=MODULE_IDS) +def test_transport_news_result_outranks_one_with_standalone_sport(ranking): + # Non-sports news query (contains "latest"/"news"); subject term "transport". + query = "latest transport news" + results = [ + # B first in input; identical except B carries a standalone "sport" word. 
+ {"title": "City transport plan", "snippet": "the transport plan details and sport", + "url": "https://example.org/b", "age": "1 day"}, + {"title": "City transport plan", "snippet": "the transport plan details", + "url": "https://example.org/a", "age": "1 day"}, + ] + ranked = ranking.rank_search_results(query, results) + # With word-boundary matching only B (standalone "sport") is penalized, so the + # plain transport result rises to the top. Pre-fix the query itself counted as + # sports ("sport" in "transport"), so neither was penalized and B stayed on top. + assert ranked[0]["url"] == "https://example.org/a"