#1473 converted the title and sports-hint matches in services/search/ranking.py to word boundaries but left two raw substring tests:

- snippet_score: 'term in snippet.lower()' — query term 'port' hits 'transport'/'support', inflating a result's relevance.
- news_quality_adjustment: 't in text or t in netloc' for the subject term — query 'us' substring-matches 'business'/'music', so an off-topic page wrongly escapes the off-topic penalty on a country/subject news query.

Add a _has_word helper (the same \b...\b pattern title_score already used) and route all three word checks (title, snippet, subject) through it, so the file stays consistent and a future partial fix can't reintroduce the same bug class.

Pure ranking refinement: scores change only for spurious substring matches; no API or schema change.

(cherry picked from commit 22bd23f044f191bb30e43f6b68386552817f4cc3)

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
87
tests/test_search_ranking_subject_substring.py
Normal file
87
tests/test_search_ranking_subject_substring.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""Regression: snippet and subject-term matching must be word-boundary.

#1473 converted the title and sports-hint matches in ranking.py to word
boundaries, but left two raw substring tests behind:

- snippet_score: ``term in snippet.lower()`` — query term "port" hits
  "transport"/"support", inflating a result's relevance.
- news_quality_adjustment: ``t in text or t in netloc`` for the subject term —
  query "us" substring-matches "business"/"music", so an off-topic page
  wrongly escapes the off-topic penalty for a country/subject news query.

Both now go through ``_has_word`` (the same ``\\b...\\b`` pattern title_score
uses), so a short term no longer matches inside an unrelated word.

``rank_search_results`` is exercised on both the services module (the
/api/search path) and the src re-export shim (the agent web_search path).
"""
|
||||
import pytest
|
||||
|
||||
import services.search.ranking as services_ranking
|
||||
import src.search.ranking as src_ranking
|
||||
|
||||
# Each ranking entry point under test, paired with the pytest id used to label
# it in parametrized runs. Keeping the pairs together guarantees the id list
# and the module list stay in the same order.
_RANK_TARGETS = (
    ("services", services_ranking),
    ("src", src_ranking),
)
RANK_MODULES = [module for _, module in _RANK_TARGETS]
RANK_IDS = [label for label, _ in _RANK_TARGETS]
|
||||
|
||||
|
||||
# --- _has_word helper (defined in the services module) ---------------------
|
||||
|
||||
def test_has_word_rejects_substring_false_positives():
    """``_has_word`` must return False when the term only occurs inside a longer word."""
    # (text, term) pairs where the term is embedded in another word:
    # "us" in business/music, "port" in transport/support, "sport" in passport.
    embedded_cases = (
        ("business and music", "us"),
        ("transport and support", "port"),
        ("passport office", "sport"),
    )
    for text, term in embedded_cases:
        assert services_ranking._has_word(text, term) is False
|
||||
|
||||
|
||||
def test_has_word_matches_standalone_terms():
    """``_has_word`` must return True when the term appears as its own word."""
    standalone_cases = (
        ("the us economy", "us"),
        ("port forwarding guide", "port"),
    )
    for text, term in standalone_cases:
        assert services_ranking._has_word(text, term) is True
|
||||
|
||||
|
||||
# --- snippet_score: substring term must not inflate relevance ---------------
|
||||
|
||||
@pytest.mark.parametrize("ranking", RANK_MODULES, ids=RANK_IDS)
def test_snippet_substring_does_not_outrank_a_true_nonmatch(ranking):
    """A snippet containing a query term only as a substring must not gain rank.

    The query is deliberately a non-news query so that only snippet relevance
    can differ between the two results (no news adjustment is involved).
    """
    # C: a genuine non-match; its snippet contains no query word at all.
    true_nonmatch = {
        "title": "Networking notes",
        "snippet": "weather updates today",
        "url": "https://example.org/c",
        "age": "1 day",
    }
    # B: contains "port" only inside "transport"/"support" (substring).
    substring_only = {
        "title": "Networking notes",
        "snippet": "transport and support",
        "url": "https://example.org/b",
        "age": "1 day",
    }

    # C is passed first so that a tie leaves it on top.
    ranked = ranking.rank_search_results(
        "port forwarding", [true_nonmatch, substring_only]
    )

    # Pre-fix, B got a spurious term hit and outranked C. Post-fix, both have
    # the same (zero) snippet term match, so input order stands and C stays
    # first.
    top_url = ranked[0]["url"]
    assert top_url == "https://example.org/c"
|
||||
|
||||
|
||||
# --- subject-term off-topic penalty: substring must not suppress it ---------
|
||||
|
||||
@pytest.mark.parametrize("ranking", RANK_MODULES, ids=RANK_IDS)
def test_offtopic_subject_substring_is_still_penalized(ranking):
    """An off-topic result must not dodge the penalty via a substring match.

    The news query has subject term "us". One result mentions "us" only inside
    "business"; the other mentions "us" as a standalone word. The snippets are
    otherwise identical and padded past the 200-char length cap, so both have
    equal base scores and the ONLY thing that can differ is the off-topic
    penalty. This isolates the bug from incidental length/term scoring.
    """
    # Shared padding appended to both snippets.
    filler = (
        "regional market report covered many provincial topics and figures in "
        "detail over the period with extra commentary and analysis written for "
        "readers wanting more depth on the matter at hand and well into the "
        "following week ahead"
    )

    # B: off-topic, "us" only as a substring of "business".
    off_topic = {
        "title": "Daily roundup",
        "snippet": "business economy and policy. " + filler,
        "url": "https://example.org/b",
        "age": "1 day",
    }
    # A: on-topic, standalone "us".
    on_topic = {
        "title": "Daily roundup",
        "snippet": "us economy and policy. " + filler,
        "url": "https://example.org/a",
        "age": "1 day",
    }

    # B is passed first so that a tie would leave the off-topic page on top.
    ranked = ranking.rank_search_results("us news", [off_topic, on_topic])

    # Pre-fix, B escaped the off-topic penalty (substring "us"), so the tie kept
    # input order (B on top). Post-fix, B takes the -1.0 penalty and A rises.
    top_url = ranked[0]["url"]
    assert top_url == "https://example.org/a"
|
||||
Reference in New Issue
Block a user