fix: sports-hint ranking penalty fires on 'transport'/'passport' substrings (#1473)
* fix: sports-hint ranking penalty fires on 'transport'/'passport' substrings
* Apply word-boundary sports-hint fix to src/search/ranking.py as well
This commit is contained in:
@@ -13,6 +13,11 @@ _SPORTS_HINTS = {
|
|||||||
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
||||||
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
|
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
|
||||||
}
|
}
|
||||||
|
# Word-boundary match so "sport" does not fire inside "transport"/"passport"
|
||||||
|
# and a domain like "transport.gov" is not mistaken for a sports site.
|
||||||
|
_SPORTS_HINT_RE = re.compile(
|
||||||
|
r"\b(?:" + "|".join(re.escape(h) for h in _SPORTS_HINTS) + r")\b"
|
||||||
|
)
|
||||||
_LOW_VALUE_NEWS_DOMAINS = {
|
_LOW_VALUE_NEWS_DOMAINS = {
|
||||||
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
|
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
|
||||||
"www.yahoo.com", "msn.com", "www.msn.com",
|
"www.yahoo.com", "msn.com", "www.msn.com",
|
||||||
@@ -39,7 +44,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
|
|||||||
query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
|
query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
|
||||||
query_lc = query.lower()
|
query_lc = query.lower()
|
||||||
is_news_query = any(term in _NEWS_HINTS for term in query_terms)
|
is_news_query = any(term in _NEWS_HINTS for term in query_terms)
|
||||||
is_sports_query = any(hint in query_lc for hint in _SPORTS_HINTS)
|
is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc))
|
||||||
|
|
||||||
def title_score(title: str) -> float:
|
def title_score(title: str) -> float:
|
||||||
if not title:
|
if not title:
|
||||||
@@ -98,7 +103,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
|
|||||||
adjustment += 0.4
|
adjustment += 0.4
|
||||||
if netloc in _LOW_VALUE_NEWS_DOMAINS:
|
if netloc in _LOW_VALUE_NEWS_DOMAINS:
|
||||||
adjustment -= 0.8
|
adjustment -= 0.8
|
||||||
if not is_sports_query and any(hint in text or hint in netloc for hint in _SPORTS_HINTS):
|
if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)):
|
||||||
adjustment -= 1.5
|
adjustment -= 1.5
|
||||||
# A country/news query should not rank a page whose title/snippet barely
|
# A country/news query should not rank a page whose title/snippet barely
|
||||||
# mentions the country above actual news pages for that country.
|
# mentions the country above actual news pages for that country.
|
||||||
|
|||||||
@@ -50,6 +50,11 @@ _SPORTS_HINTS = {
|
|||||||
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
||||||
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
|
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
|
||||||
}
|
}
|
||||||
|
# Word-boundary match so "sport" does not fire inside "transport"/"passport"
|
||||||
|
# and a domain like "transport.gov" is not mistaken for a sports site.
|
||||||
|
_SPORTS_HINT_RE = re.compile(
|
||||||
|
r"\b(?:" + "|".join(re.escape(h) for h in _SPORTS_HINTS) + r")\b"
|
||||||
|
)
|
||||||
_LOW_VALUE_NEWS_DOMAINS = {
|
_LOW_VALUE_NEWS_DOMAINS = {
|
||||||
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
|
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
|
||||||
"www.yahoo.com", "msn.com", "www.msn.com",
|
"www.yahoo.com", "msn.com", "www.msn.com",
|
||||||
@@ -76,7 +81,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
|
|||||||
query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
|
query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
|
||||||
query_lc = query.lower()
|
query_lc = query.lower()
|
||||||
is_news_query = any(term in _NEWS_HINTS for term in query_terms)
|
is_news_query = any(term in _NEWS_HINTS for term in query_terms)
|
||||||
is_sports_query = any(hint in query_lc for hint in _SPORTS_HINTS)
|
is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc))
|
||||||
|
|
||||||
def title_score(title: str) -> float:
|
def title_score(title: str) -> float:
|
||||||
if not title:
|
if not title:
|
||||||
@@ -117,7 +122,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
|
|||||||
adjustment += 0.4
|
adjustment += 0.4
|
||||||
if netloc in _LOW_VALUE_NEWS_DOMAINS:
|
if netloc in _LOW_VALUE_NEWS_DOMAINS:
|
||||||
adjustment -= 0.8
|
adjustment -= 0.8
|
||||||
if not is_sports_query and any(hint in text or hint in netloc for hint in _SPORTS_HINTS):
|
if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)):
|
||||||
adjustment -= 1.5
|
adjustment -= 1.5
|
||||||
# A country/news query should not rank a page whose title/snippet barely
|
# A country/news query should not rank a page whose title/snippet barely
|
||||||
# mentions the country above actual news pages for that country.
|
# mentions the country above actual news pages for that country.
|
||||||
|
|||||||
52
tests/test_search_ranking_sports_substring.py
Normal file
52
tests/test_search_ranking_sports_substring.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""Regression: the sports-hint match must be word-boundary, not substring.
|
||||||
|
|
||||||
|
`_SPORTS_HINTS` contains "sport", which is a substring of "transport",
|
||||||
|
"passport", "sportswear", and of domains like "transport.gov". The old code
|
||||||
|
used `hint in text` / `hint in netloc`, so for any non-sports news query a
|
||||||
|
legitimate result mentioning "transport"/"passport" took the -1.5 sports
|
||||||
|
penalty and was pushed down the ranking. The query classifier had the same
|
||||||
|
flaw (a "passport" query was treated as a sports query). Both now use the
|
||||||
|
word-boundary `_SPORTS_HINT_RE`.
|
||||||
|
|
||||||
|
The same ranking module exists in two live copies: `services/search/ranking.py`
|
||||||
|
(the /api/search HTTP path) and `src/search/ranking.py` (the agent's
|
||||||
|
`web_search` tool path via `src/search/core.py`). Both are fixed and both are
|
||||||
|
covered here.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import services.search.ranking as services_ranking
|
||||||
|
import src.search.ranking as src_ranking
|
||||||
|
|
||||||
|
MODULES = [services_ranking, src_ranking]
|
||||||
|
MODULE_IDS = ["services", "src"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("ranking", MODULES, ids=MODULE_IDS)
|
||||||
|
def test_sports_regex_ignores_substring_false_positives(ranking):
|
||||||
|
for word in ("transport", "passport", "sportswear", "transportation"):
|
||||||
|
assert ranking._SPORTS_HINT_RE.search(word) is None, word
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("ranking", MODULES, ids=MODULE_IDS)
|
||||||
|
def test_sports_regex_still_matches_real_terms(ranking):
|
||||||
|
for word in ("sport", "sports", "world cup", "the nba finals", "soccer match"):
|
||||||
|
assert ranking._SPORTS_HINT_RE.search(word) is not None, word
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("ranking", MODULES, ids=MODULE_IDS)
|
||||||
|
def test_transport_news_result_outranks_one_with_standalone_sport(ranking):
|
||||||
|
# Non-sports news query (contains "latest"/"news"); subject term "transport".
|
||||||
|
query = "latest transport news"
|
||||||
|
results = [
|
||||||
|
# B first in input; identical except B carries a standalone "sport" word.
|
||||||
|
{"title": "City transport plan", "snippet": "the transport plan details and sport",
|
||||||
|
"url": "https://example.org/b", "age": "1 day"},
|
||||||
|
{"title": "City transport plan", "snippet": "the transport plan details",
|
||||||
|
"url": "https://example.org/a", "age": "1 day"},
|
||||||
|
]
|
||||||
|
ranked = ranking.rank_search_results(query, results)
|
||||||
|
# With word-boundary matching only B (standalone "sport") is penalized, so the
|
||||||
|
# plain transport result rises to the top. Pre-fix the query itself was
|
||||||
|
# misclassified as a sports query (via "transport"), so neither result was penalized and input order was preserved, leaving B on top.
|
||||||
|
assert ranked[0]["url"] == "https://example.org/a"
|
||||||
Reference in New Issue
Block a user