fix(search): apply recency UTC fix to live ranking module

2026-06-03 12:49:32 +01:00
parent 0deeba58ba
commit a75dd4a231
4 changed files with 85 additions and 169 deletions
--- a/services/search/ranking.py
+++ b/services/search/ranking.py
@@ -2,12 +2,49 @@

 import re
 import logging
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import List, Optional
 from urllib.parse import urlparse

 logger = logging.getLogger(__name__)

+_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
+
+
+def _utcnow_naive() -> datetime:
+    """Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
+    and avoids ``datetime.utcnow()``, which is deprecated since Python 3.12 (#1116)."""
+    return datetime.now(timezone.utc).replace(tzinfo=None)
+
+
+def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
+    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.
+
+    The age is measured against UTC, not local time. The previous code used
+    ``datetime.now()`` (local) against UTC-style published dates, so the age was
+    skewed by the host's UTC offset; it was also a latent crash once neighbouring
+    code moves to timezone-aware datetimes (#1116). ``now`` is injectable for tests.
+    """
+    if not age_str:
+        return 0.0
+    dt = None
+    for fmt in _AGE_FORMATS:
+        try:
+            dt = datetime.strptime(age_str, fmt)
+            break
+        except Exception:
+            dt = None
+    if not dt:
+        return 0.0
+    now = now or _utcnow_naive()
+    days_old = (now - dt).days
+    if days_old <= 7:
+        return 1.0
+    if days_old >= 30:
+        return 0.0
+    return (30 - days_old) / 23
+
+
 _NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
 _SPORTS_HINTS = {
    "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
@@ -73,24 +110,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
            return 0.7
        return 0.4

-    def recency_score(age_str: Optional[str]) -> float:
-        if not age_str:
-            return 0.0
-        for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
-            try:
-                dt = datetime.strptime(age_str, fmt)
-                break
-            except Exception:
-                dt = None
-        if not dt:
-            return 0.0
-        days_old = (datetime.now() - dt).days
-        if days_old <= 7:
-            return 1.0
-        if days_old >= 30:
-            return 0.0
-        return (30 - days_old) / 23
-
    def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
        if not is_news_query:
            return 0.0
--- a/src/search/ranking.py
+++ b/src/search/ranking.py
@@ -1,151 +1,13 @@
-"""Search result ranking based on relevance, source quality, and recency."""
+"""Compatibility re-export shim for the live ranking module.

-import re
-import logging
-from datetime import datetime, timezone
-from typing import List, Optional
-from urllib.parse import urlparse
+The real implementation lives in :mod:`services.search.ranking`, which is what
+the search runtime (services/search/core.py) imports. This module used to hold a
+parallel copy; it now re-exports so the two cannot drift out of sync again.
+"""

-logger = logging.getLogger(__name__)
-
-_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
-
-
-def _utcnow_naive() -> datetime:
-    """Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
-    and is safe on Python 3.14 where ``datetime.utcnow()`` is removed (#1116)."""
-    return datetime.now(timezone.utc).replace(tzinfo=None)
-
-
-def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
-    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.
-
-    The age is measured against UTC, not local time. The previous code used
-    ``datetime.now()`` (local) against UTC-style published dates, so the age was
-    skewed by the host's UTC offset; it was also a latent crash once neighbouring
-    code moves to timezone-aware datetimes (#1116). ``now`` is injectable for tests.
-    """
-    if not age_str:
-        return 0.0
-    dt = None
-    for fmt in _AGE_FORMATS:
-        try:
-            dt = datetime.strptime(age_str, fmt)
-            break
-        except Exception:
-            dt = None
-    if not dt:
-        return 0.0
-    now = now or _utcnow_naive()
-    days_old = (now - dt).days
-    if days_old <= 7:
-        return 1.0
-    if days_old >= 30:
-        return 0.0
-    return (30 - days_old) / 23
-
-
-_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
-_SPORTS_HINTS = {
-    "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
-    "fifa", "world cup", "championship", "quarterfinal", "eliminates",
-}
-# Word-boundary match so "sport" does not fire inside "transport"/"passport"
-# and a domain like "transport.gov" is not mistaken for a sports site.
-_SPORTS_HINT_RE = re.compile(
-    r"\b(?:" + "|".join(re.escape(h) for h in _SPORTS_HINTS) + r")\b"
+from services.search.ranking import (  # noqa: F401
+    _AGE_FORMATS,
+    _utcnow_naive,
+    rank_search_results,
+    recency_score,
 )
-_LOW_VALUE_NEWS_DOMAINS = {
-    "facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
-    "www.yahoo.com", "msn.com", "www.msn.com",
-}
-_TRUSTED_NEWS_DOMAINS = {
-    "apnews.com", "www.apnews.com", "reuters.com", "www.reuters.com",
-    "bbc.com", "www.bbc.com", "cbc.ca", "www.cbc.ca",
-    "ctvnews.ca", "www.ctvnews.ca", "globalnews.ca", "www.globalnews.ca",
-    "theguardian.com",
-    "www.theguardian.com", "euronews.com", "www.euronews.com",
-    "dw.com", "www.dw.com", "government.se", "www.government.se",
-}
-
-
-def _domain(url: str) -> str:
-    try:
-        return urlparse(url).netloc.lower()
-    except Exception:
-        return ""
-
-
-def rank_search_results(query: str, results: List[dict]) -> List[dict]:
-    """Rank search results by title relevance, snippet quality, domain authority, and recency."""
-    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
-    query_lc = query.lower()
-    is_news_query = any(term in _NEWS_HINTS for term in query_terms)
-    is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc))
-
-    def title_score(title: str) -> float:
-        if not title:
-            return 0.0
-        title_lc = title.lower()
-        matches = sum(1 for term in query_terms if re.search(rf"\b{re.escape(term)}\b", title_lc))
-        return matches / len(query_terms) if query_terms else 0.0
-
-    def snippet_score(snippet: str) -> float:
-        if not snippet:
-            return 0.0
-        length_factor = min(len(snippet), 200) / 200
-        term_hits = sum(1 for term in query_terms if term in snippet.lower())
-        term_factor = term_hits / len(query_terms) if query_terms else 0.0
-        return (length_factor + term_factor) / 2
-
-    def domain_score(url: str) -> float:
-        netloc = _domain(url)
-        if not netloc:
-            return 0.0
-        if netloc in _TRUSTED_NEWS_DOMAINS:
-            return 1.0
-        if netloc.endswith(".edu") or netloc.endswith(".gov"):
-            return 1.0
-        if netloc.endswith(".org"):
-            return 0.7
-        return 0.4
-
-    def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
-        if not is_news_query:
-            return 0.0
-        text = f"{title} {snippet}".lower()
-        netloc = _domain(url)
-        adjustment = 0.0
-        if netloc in _TRUSTED_NEWS_DOMAINS:
-            adjustment += 1.2
-        if any(term in text for term in ("latest news", "breaking news", "daily coverage", "news from")):
-            adjustment += 0.4
-        if netloc in _LOW_VALUE_NEWS_DOMAINS:
-            adjustment -= 0.8
-        if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)):
-            adjustment -= 1.5
-        # A country/news query should not rank a page whose title/snippet barely
-        # mentions the country above actual news pages for that country.
-        subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]
-        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
-            adjustment -= 1.0
-        return adjustment
-
-    ranked = []
-    for result in results:
-        title = result.get("title", "")
-        snippet = result.get("snippet", "")
-        url = result.get("url", "")
-        age = result.get("age", None)
-
-        score = (
-            2.0 * title_score(title)
-            + 1.0 * snippet_score(snippet)
-            + 1.5 * domain_score(url)
-            + 1.0 * recency_score(age)
-            + news_quality_adjustment(title, snippet, url)
-        )
-        ranked.append((score, result))
-
-    ranked.sort(key=lambda x: x[0], reverse=True)
-    return [r for _, r in ranked]
--- a/tests/test_search_ranking.py
+++ b/tests/test_search_ranking.py
@@ -1,4 +1,4 @@
-from src.search.ranking import rank_search_results
+from services.search.ranking import rank_search_results


 def test_news_queries_prefer_news_sources_over_sports_and_social_results():
--- a/tests/test_search_ranking_recency.py
+++ b/tests/test_search_ranking_recency.py
@@ -8,7 +8,8 @@ module-level, time-injectable function.

 from datetime import datetime, timezone

-from src.search.ranking import recency_score, _utcnow_naive
+import services.search.ranking as live_ranking
+from services.search.ranking import recency_score, _utcnow_naive, rank_search_results


 def test_fresh_result_scores_one():
@@ -37,3 +38,37 @@ def test_default_now_is_naive_utc():
    assert now.tzinfo is None
    reference = datetime.now(timezone.utc).replace(tzinfo=None)
    assert abs((now - reference).total_seconds()) < 5
+
+
+def test_supported_timestamp_formats_parse():
+    # All three formats the current implementation supports resolve to the same
+    # ~4-day-old age, so each scores a full 1.0.
+    now = datetime(2026, 1, 5, 12, 0, 0)
+    assert recency_score("2026-01-01", now=now) == 1.0
+    assert recency_score("2026-01-01T08:30:00", now=now) == 1.0
+    assert recency_score("2026-01-01 08:30:00", now=now) == 1.0
+
+
+def test_shim_reexports_live_objects():
+    # src.search.ranking is a compatibility shim; it must expose the *same*
+    # objects as the live services module so the two cannot diverge.
+    import src.search.ranking as shim
+
+    assert shim.recency_score is live_ranking.recency_score
+    assert shim.rank_search_results is live_ranking.rank_search_results
+    assert shim._utcnow_naive is live_ranking._utcnow_naive
+
+
+def test_live_rank_path_prefers_newer_result(monkeypatch):
+    # Pin "now" so age scoring is deterministic. The two results score the same
+    # on everything but age (same title, snippet, and domain), isolating recency.
+    monkeypatch.setattr(live_ranking, "_utcnow_naive", lambda: datetime(2026, 1, 31))
+    results = [
+        {"title": "Report", "url": "https://example.org/a", "snippet": "x", "age": "2026-01-01"},
+        {"title": "Report", "url": "https://example.org/b", "snippet": "x", "age": "2026-01-29"},
+    ]
+
+    ranked = rank_search_results("report", results)
+
+    assert ranked[0]["url"] == "https://example.org/b"
+    assert ranked[1]["url"] == "https://example.org/a"