diff --git a/services/search/ranking.py b/services/search/ranking.py index 771a11a..66ffbf5 100644 --- a/services/search/ranking.py +++ b/services/search/ranking.py @@ -76,6 +76,19 @@ def _domain(url: str) -> str: return "" +def _has_word(text: str, term: str) -> bool: + """True if ``term`` appears in ``text`` as a whole word. + + Query terms are matched on word boundaries so a short term doesn't match + inside an unrelated word: "us" must not match "business"/"music", "port" + must not match "transport"/"support". This mirrors the tokenization used to + build ``query_terms`` (``\\b\\w+\\b``). #1473 converted the title and sports + checks to word boundaries; the snippet and subject-term checks below use + the same helper so the whole file stays consistent. + """ + return re.search(rf"\b{re.escape(term)}\b", text) is not None + + def rank_search_results(query: str, results: List[dict]) -> List[dict]: """Rank search results by title relevance, snippet quality, domain authority, and recency.""" query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)] @@ -87,14 +100,14 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]: if not title: return 0.0 title_lc = title.lower() - matches = sum(1 for term in query_terms if re.search(rf"\b{re.escape(term)}\b", title_lc)) + matches = sum(1 for term in query_terms if _has_word(title_lc, term)) return matches / len(query_terms) if query_terms else 0.0 def snippet_score(snippet: str) -> float: if not snippet: return 0.0 length_factor = min(len(snippet), 200) / 200 - term_hits = sum(1 for term in query_terms if term in snippet.lower()) + term_hits = sum(1 for term in query_terms if _has_word(snippet.lower(), term)) term_factor = term_hits / len(query_terms) if query_terms else 0.0 return (length_factor + term_factor) / 2 @@ -127,7 +140,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]: # A country/news query should not rank a page whose title/snippet barely # mentions the 
"""Regression: snippet and subject-term matching must be word-boundary.

#1473 converted the title and sports-hint matches in ranking.py to word
boundaries, but left two raw substring tests behind:

  - snippet_score: ``term in snippet.lower()`` — query term "port" hits
    "transport"/"support", inflating a result's relevance.
  - news_quality_adjustment: ``t in text or t in netloc`` for the subject term —
    query "us" substring-matches "business"/"music", so an off-topic page
    wrongly escapes the off-topic penalty for a country/subject news query.

Both now go through ``_has_word`` (the same ``\\b...\\b`` pattern title_score
uses), so a short term no longer matches inside an unrelated word.

``rank_search_results`` is exercised on both the services module (the
/api/search path) and the src re-export shim (the agent web_search path).
"""
import pytest

import services.search.ranking as services_ranking
import src.search.ranking as src_ranking

# Every ranking-level test runs against both import paths so the shim cannot
# silently drift from the services implementation.
RANK_MODULES = [services_ranking, src_ranking]
RANK_IDS = ["services", "src"]


# --- _has_word helper (defined in the services module) ---------------------

def test_has_word_rejects_substring_false_positives():
    """A term embedded inside a longer word must not count as a match."""
    assert services_ranking._has_word("business and music", "us") is False
    assert services_ranking._has_word("transport and support", "port") is False
    assert services_ranking._has_word("passport office", "sport") is False


def test_has_word_matches_standalone_terms():
    """A term that stands alone as a word must still match."""
    assert services_ranking._has_word("the us economy", "us") is True
    assert services_ranking._has_word("port forwarding guide", "port") is True


# --- snippet_score: substring term must not inflate relevance ---------------

@pytest.mark.parametrize("ranking", RANK_MODULES, ids=RANK_IDS)
def test_snippet_substring_does_not_outrank_a_true_nonmatch(ranking):
    """A substring-only snippet hit must not lift a result over a non-match."""
    # Non-news query so only snippet relevance differs (no news adjustment).
    # Both snippets have the same length, so the length factor is equal too.
    query = "port forwarding"
    results = [
        # C first: a genuine non-match (no query word at all).
        {"title": "Networking notes", "snippet": "weather updates today",
         "url": "https://example.org/c", "age": "1 day"},
        # B: contains "port" only inside "transport"/"support" (substring).
        {"title": "Networking notes", "snippet": "transport and support",
         "url": "https://example.org/b", "age": "1 day"},
    ]
    ranked = ranking.rank_search_results(query, results)
    # Pre-fix B got a spurious term hit and outranked C; post-fix they have the
    # same (zero) snippet term match, so input order stands and C stays first.
    assert ranked[0]["url"] == "https://example.org/c"


# --- subject-term off-topic penalty: substring must not suppress it ---------

@pytest.mark.parametrize("ranking", RANK_MODULES, ids=RANK_IDS)
def test_offtopic_subject_substring_is_still_penalized(ranking):
    """An off-topic page must not ride a substring match past an on-topic one."""
    # News query with subject term "us". B mentions "us" only inside
    # "business"; A mentions "us" as a standalone word. The snippets are padded
    # past the 200-char length cap and are otherwise identical, so the length
    # factor is equal on both sides.
    #
    # NOTE: this does not isolate the off-topic penalty on its own. With
    # whole-word matching, snippet_score also counts the standalone "us" for A
    # but not the "us" inside "business" for B, so A gains on snippet term
    # matching as well as by B taking the penalty. Both effects push the same
    # way; the test guards the combined behaviour, not the penalty alone.
    filler = (
        "regional market report covered many provincial topics and figures in "
        "detail over the period with extra commentary and analysis written for "
        "readers wanting more depth on the matter at hand and well into the "
        "following week ahead"
    )
    query = "us news"
    results = [
        # B first: off-topic, "us" only as a substring of "business".
        {"title": "Daily roundup", "snippet": "business economy and policy. " + filler,
         "url": "https://example.org/b", "age": "1 day"},
        # A: on-topic, standalone "us".
        {"title": "Daily roundup", "snippet": "us economy and policy. " + filler,
         "url": "https://example.org/a", "age": "1 day"},
    ]
    ranked = ranking.rank_search_results(query, results)
    # Pre-fix, substring matching treated both snippets alike (each "contained"
    # "us"), B escaped the off-topic penalty, and the tie kept input order with
    # B on top. Post-fix B no longer matches "us" anywhere, so A rises.
    assert ranked[0]["url"] == "https://example.org/a"