Word-boundary match for snippet and subject-term ranking (#1473 follow-up) (#2556)

#1473 converted the title and sports-hint matches in services/search/ranking.py to word boundaries but left two raw substring tests: - snippet_score: 'term in snippet.lower()' — query term 'port' hits 'transport'/'support', inflating a result's relevance. - news_quality_adjustment: 't in text or t in netloc' for the subject term — query 'us' substring-matches 'business'/'music', so an off-topic page wrongly escapes the off-topic penalty on a country/subject news query. Add a _has_word helper (the same \b...\b pattern title_score already used) and route all three word checks (title, snippet, subject) through it, so the file stays consistent and a future partial fix can't reintroduce the same bug class. Pure ranking refinement: scores change only for spurious substring matches; no API or schema change. (cherry picked from commit 22bd23f044f191bb30e43f6b68386552817f4cc3) Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
2026-06-05 02:04:31 -05:00
parent 5271d529d6
commit cfb2d17a2d
2 changed files with 103 additions and 3 deletions
--- a/services/search/ranking.py
+++ b/services/search/ranking.py
@@ -76,6 +76,19 @@ def _domain(url: str) -> str:
        return ""


+def _has_word(text: str, term: str) -> bool:
+    """True if ``term`` appears in ``text`` as a whole word.
+
+    Query terms are matched on word boundaries so a short term doesn't match
+    inside an unrelated word: "us" must not match "business"/"music", "port"
+    must not match "transport"/"support". This mirrors the ``query_terms``
+    tokenization (``\\b\\w+\\b``). Matching is case-sensitive and terms are
+    lowercased, so pass lowercased ``text``. #1473 converted the title and
+    sports checks; the snippet and subject-term checks now use this helper too.
+    """
+    return re.search(rf"\b{re.escape(term)}\b", text) is not None
+
+
 def rank_search_results(query: str, results: List[dict]) -> List[dict]:
    """Rank search results by title relevance, snippet quality, domain authority, and recency."""
    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
@@ -87,14 +100,14 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
        if not title:
            return 0.0
        title_lc = title.lower()
-        matches = sum(1 for term in query_terms if re.search(rf"\b{re.escape(term)}\b", title_lc))
+        matches = sum(1 for term in query_terms if _has_word(title_lc, term))
        return matches / len(query_terms) if query_terms else 0.0

    def snippet_score(snippet: str) -> float:
        if not snippet:
            return 0.0
        length_factor = min(len(snippet), 200) / 200
-        term_hits = sum(1 for term in query_terms if term in snippet.lower())
+        term_hits = sum(1 for term in query_terms if _has_word(snippet.lower(), term))
        term_factor = term_hits / len(query_terms) if query_terms else 0.0
        return (length_factor + term_factor) / 2

@@ -127,7 +140,7 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
        # A country/news query should not rank a page whose title/snippet barely
        # mentions the country above actual news pages for that country.
        subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]
-        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
+        if subject_terms and not any(_has_word(text, t) or _has_word(netloc, t) for t in subject_terms):
            adjustment -= 1.0
        return adjustment

--- a/tests/test_search_ranking_subject_substring.py
+++ b/tests/test_search_ranking_subject_substring.py
@@ -0,0 +1,87 @@
+"""Regression: snippet and subject-term matching must be word-boundary.
+
+#1473 converted the title and sports-hint matches in ranking.py to word
+boundaries, but left two raw substring tests behind:
+
+  - snippet_score: ``term in snippet.lower()`` — query term "port" hits
+    "transport"/"support", inflating a result's relevance.
+  - news_quality_adjustment: ``t in text or t in netloc`` for the subject term —
+    query "us" substring-matches "business"/"music", so an off-topic page
+    wrongly escapes the off-topic penalty for a country/subject news query.
+
+Both now go through ``_has_word`` (the same ``\\b...\\b`` pattern title_score
+uses), so a short term no longer matches inside an unrelated word.
+
+``rank_search_results`` is exercised on both the services module (the
+/api/search path) and the src re-export shim (the agent web_search path).
+"""
+import pytest
+
+import services.search.ranking as services_ranking
+import src.search.ranking as src_ranking
+
+RANK_MODULES = [services_ranking, src_ranking]
+RANK_IDS = ["services", "src"]
+
+
+# --- _has_word helper (defined in the services module) ---------------------
+
+def test_has_word_rejects_substring_false_positives():
+    assert services_ranking._has_word("business and music", "us") is False
+    assert services_ranking._has_word("transport and support", "port") is False
+    assert services_ranking._has_word("passport office", "sport") is False
+
+
+def test_has_word_matches_standalone_terms():
+    assert services_ranking._has_word("the us economy", "us") is True
+    assert services_ranking._has_word("port forwarding guide", "port") is True
+
+
+# --- snippet_score: substring term must not inflate relevance ---------------
+
+@pytest.mark.parametrize("ranking", RANK_MODULES, ids=RANK_IDS)
+def test_snippet_substring_does_not_outrank_a_true_nonmatch(ranking):
+    # Non-news query so only snippet relevance differs (no news adjustment).
+    query = "port forwarding"
+    results = [
+        # C first: a genuine non-match (no query word at all).
+        {"title": "Networking notes", "snippet": "weather updates today",
+         "url": "https://example.org/c", "age": "1 day"},
+        # B: contains "port" only inside "transport"/"support" (substring).
+        {"title": "Networking notes", "snippet": "transport and support",
+         "url": "https://example.org/b", "age": "1 day"},
+    ]
+    ranked = ranking.rank_search_results(query, results)
+    # Pre-fix B got a spurious term hit and outranked C; post-fix they have the
+    # same (zero) snippet term match, so input order stands and C stays first.
+    assert ranked[0]["url"] == "https://example.org/c"
+
+
+# --- subject-term off-topic penalty: substring must not suppress it ---------
+
+@pytest.mark.parametrize("ranking", RANK_MODULES, ids=RANK_IDS)
+def test_offtopic_subject_substring_is_still_penalized(ranking):
+    # News query with subject term "us". B mentions "us" only inside
+    # "business"; A mentions "us" as a standalone word. The snippets are padded
+    # past the 200-char length cap, so length scoring is equal. NOTE: post-fix
+    # A also gains a whole-word snippet term hit that B lacks, so this ordering
+    # does not isolate the off-topic penalty from the snippet_score change.
+    filler = (
+        "regional market report covered many provincial topics and figures in "
+        "detail over the period with extra commentary and analysis written for "
+        "readers wanting more depth on the matter at hand and well into the "
+        "following week ahead"
+    )
+    query = "us news"
+    results = [
+        # B first: off-topic, "us" only as a substring of "business".
+        {"title": "Daily roundup", "snippet": "business economy and policy. " + filler,
+         "url": "https://example.org/b", "age": "1 day"},
+        # A: on-topic, standalone "us".
+        {"title": "Daily roundup", "snippet": "us economy and policy. " + filler,
+         "url": "https://example.org/a", "age": "1 day"},
+    ]
+    ranked = ranking.rank_search_results(query, results)
+    # Pre-fix B escaped the off-topic penalty (substring "us") so the tie kept
+    # input order (B on top); post-fix B takes the -1.0 penalty and A rises.
+    assert ranked[0]["url"] == "https://example.org/a"