From 975fd42e322499d1a3d43aebb5f049ceccb17899 Mon Sep 17 00:00:00 2001
From: lekt8
Date: Tue, 2 Jun 2026 23:18:15 +0800
Subject: [PATCH] fix: rank recency by UTC, not local time (#1116) (#1234)

src/search/ranking.py computed result age as `(datetime.now() - dt).days`,
where `dt` is parsed from a UTC-style published date with no timezone. Using
local `datetime.now()` skewed the age by the host's UTC offset
(off-by-up-to-a-day near boundaries), and was a latent crash: once neighbouring
code becomes timezone-aware the naive/aware subtraction raises TypeError (the
landmine called out in #1116).

Recency is now measured against naive UTC. The scoring is also lifted out of
the rank_search_results closure into a module-level, time-injectable
`recency_score` so it's unit-testable, and `_utcnow_naive()` avoids
`datetime.utcnow()` (removed in Python 3.14).

Covered by tests/test_search_ranking_recency.py (5 cases); the existing
tests/test_search_ranking.py still passes.

Co-authored-by: Claude Opus 4.8 (1M context)
---
 src/search/ranking.py                | 57 ++++++++++++++++++----------
 tests/test_search_ranking_recency.py | 39 +++++++++++++++++++
 2 files changed, 77 insertions(+), 19 deletions(-)
 create mode 100644 tests/test_search_ranking_recency.py

diff --git a/src/search/ranking.py b/src/search/ranking.py
index 17facba..17605b2 100644
--- a/src/search/ranking.py
+++ b/src/search/ranking.py
@@ -2,12 +2,49 @@
 
 import re
 import logging
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import List, Optional
 from urllib.parse import urlparse
 
 logger = logging.getLogger(__name__)
 
+_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
+
+
+def _utcnow_naive() -> datetime:
+    """Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
+    and is safe on Python 3.14 where ``datetime.utcnow()`` is removed (#1116)."""
+    return datetime.now(timezone.utc).replace(tzinfo=None)
+
+
+def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
+    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.
+
+    The age is measured against UTC, not local time. The previous code used
+    ``datetime.now()`` (local) against UTC-style published dates, so the age was
+    skewed by the host's UTC offset; it was also a latent crash once neighbouring
+    code moves to timezone-aware datetimes (#1116). ``now`` is injectable for tests.
+    """
+    if not age_str:
+        return 0.0
+    dt = None
+    for fmt in _AGE_FORMATS:
+        try:
+            dt = datetime.strptime(age_str, fmt)
+            break
+        except Exception:
+            dt = None
+    if not dt:
+        return 0.0
+    now = now or _utcnow_naive()
+    days_old = (now - dt).days
+    if days_old <= 7:
+        return 1.0
+    if days_old >= 30:
+        return 0.0
+    return (30 - days_old) / 23
+
+
 _NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
 _SPORTS_HINTS = {
     "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
@@ -68,24 +105,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
             return 0.7
         return 0.4
 
-    def recency_score(age_str: Optional[str]) -> float:
-        if not age_str:
-            return 0.0
-        for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
-            try:
-                dt = datetime.strptime(age_str, fmt)
-                break
-            except Exception:
-                dt = None
-        if not dt:
-            return 0.0
-        days_old = (datetime.now() - dt).days
-        if days_old <= 7:
-            return 1.0
-        if days_old >= 30:
-            return 0.0
-        return (30 - days_old) / 23
-
     def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
         if not is_news_query:
             return 0.0
diff --git a/tests/test_search_ranking_recency.py b/tests/test_search_ranking_recency.py
new file mode 100644
index 0000000..64e59d4
--- /dev/null
+++ b/tests/test_search_ranking_recency.py
@@ -0,0 +1,39 @@
+"""Issue #1116 (latent ranking bug) — recency scoring uses UTC, not local time.
+
+`recency_score` measured age with `datetime.now()` (local) against UTC-style
+published dates, skewing the age by the host's UTC offset and risking a TypeError
+once neighbouring code becomes timezone-aware. It now uses naive UTC and is a
+module-level, time-injectable function.
+"""
+
+from datetime import datetime, timezone
+
+from src.search.ranking import recency_score, _utcnow_naive
+
+
+def test_fresh_result_scores_one():
+    assert recency_score("2026-01-01", now=datetime(2026, 1, 5)) == 1.0  # 4 days old
+
+
+def test_old_result_scores_zero():
+    assert recency_score("2026-01-01", now=datetime(2026, 3, 1)) == 0.0  # >30 days
+
+
+def test_mid_range_decays_linearly():
+    score = recency_score("2026-01-01", now=datetime(2026, 1, 20))  # 19 days old
+    assert score == (30 - 19) / 23
+
+
+def test_empty_or_unparseable_scores_zero():
+    assert recency_score("", now=datetime(2026, 1, 1)) == 0.0
+    assert recency_score(None, now=datetime(2026, 1, 1)) == 0.0
+    assert recency_score("not-a-date", now=datetime(2026, 1, 1)) == 0.0
+
+
+def test_default_now_is_naive_utc():
+    # Naive (no tzinfo) so it subtracts cleanly from the naive parsed dates,
+    # and UTC-based (3.14-safe, no datetime.utcnow()).
+    now = _utcnow_naive()
+    assert now.tzinfo is None
+    reference = datetime.now(timezone.utc).replace(tzinfo=None)
+    assert abs((now - reference).total_seconds()) < 5