fix: rank recency by UTC, not local time (#1116) (#1234)

src/search/ranking.py computed result age as `(datetime.now() - dt).days`, where
`dt` is parsed from a UTC-style published date with no timezone. Using local
`datetime.now()` skewed the age by the host's UTC offset (off-by-up-to-a-day near
boundaries), and was a latent crash: once neighbouring code becomes timezone-aware
the naive/aware subtraction raises TypeError (the landmine called out in #1116).

Recency is now measured against naive UTC. The scoring is also lifted out of the
rank_search_results closure into a module-level, time-injectable `recency_score`
so it's unit-testable, and `_utcnow_naive()` avoids `datetime.utcnow()` (deprecated
since Python 3.12 and scheduled for removal in a future release).

Covered by tests/test_search_ranking_recency.py (5 cases); the existing
tests/test_search_ranking.py still passes.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
lekt8
2026-06-02 23:18:15 +08:00
committed by GitHub
parent 8c376d2b0e
commit 975fd42e32
2 changed files with 77 additions and 19 deletions

View File

@@ -2,12 +2,49 @@
import re import re
import logging import logging
from datetime import datetime from datetime import datetime, timezone
from typing import List, Optional from typing import List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Date layouts accepted for a result's published date, tried in order.
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")


def _utcnow_naive() -> datetime:
    """Return the current UTC time as a naive datetime (``tzinfo`` stripped).

    Naive on purpose: the published dates parsed by ``recency_score`` carry no
    timezone, and naive/aware datetimes cannot be subtracted. Built from
    ``datetime.now(timezone.utc)`` rather than the deprecated
    ``datetime.utcnow()`` (#1116).
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
    """Score how recent a result is, from 1.0 (fresh) down to 0.0 (stale).

    Args:
        age_str: Published date in one of ``_AGE_FORMATS``. ``None``, an empty
            string, or a value that matches none of the formats scores 0.0.
        now: Reference time, injectable for tests. Defaults to the current
            UTC time. A timezone-aware value is converted to UTC and stripped
            of its ``tzinfo`` so it can be subtracted from the naive parsed
            date instead of raising ``TypeError``.

    Returns:
        1.0 when the result is at most 7 days old (this includes dates in the
        future), 0.0 when it is 30 or more days old, and a linear
        interpolation ``(30 - days_old) / 23`` in between.
    """
    if not age_str:
        return 0.0

    published = None
    for fmt in _AGE_FORMATS:
        try:
            published = datetime.strptime(age_str, fmt)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: non-string input.
            # Either way this format does not apply, so try the next one.
            continue
        break
    if published is None:
        return 0.0

    if now is None:
        now = _utcnow_naive()
    elif now.utcoffset() is not None:
        # Aware reference time: express it as naive UTC so the subtraction
        # below neither raises nor is skewed by the caller's offset.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)

    days_old = (now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"} _NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
_SPORTS_HINTS = { _SPORTS_HINTS = {
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb", "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
@@ -68,24 +105,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
return 0.7 return 0.7
return 0.4 return 0.4
# Removed side of the diff: the previous recency_score, which lived inside
# rank_search_results (see the hunk header above).
def recency_score(age_str: Optional[str]) -> float:
# A missing or empty date earns no recency credit.
if not age_str:
return 0.0
# Try each accepted date layout in turn; the first one that parses wins.
for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
try:
dt = datetime.strptime(age_str, fmt)
break
except Exception:
dt = None
# No layout matched.
if not dt:
return 0.0
# Age is measured against local wall-clock time here.
days_old = (datetime.now() - dt).days
# Full score up to 7 days old, none from 30 days on, linear in between.
if days_old <= 7:
return 1.0
if days_old >= 30:
return 0.0
return (30 - days_old) / 23
def news_quality_adjustment(title: str, snippet: str, url: str) -> float: def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
if not is_news_query: if not is_news_query:
return 0.0 return 0.0

View File

@@ -0,0 +1,39 @@
"""Issue #1116 (latent ranking bug) — recency scoring uses UTC, not local time.
`recency_score` measured age with `datetime.now()` (local) against UTC-style
published dates, skewing the age by the host's UTC offset and risking a TypeError
once neighbouring code becomes timezone-aware. It now uses naive UTC and is a
module-level, time-injectable function.
"""
from datetime import datetime, timezone
from src.search.ranking import recency_score, _utcnow_naive
def test_fresh_result_scores_one():
    """A result published four days before ``now`` gets the full score."""
    reference = datetime(2026, 1, 5)
    score = recency_score("2026-01-01", now=reference)
    assert score == 1.0
def test_old_result_scores_zero():
    """A result published more than 30 days before ``now`` scores nothing."""
    reference = datetime(2026, 3, 1)
    score = recency_score("2026-01-01", now=reference)
    assert score == 0.0
def test_mid_range_decays_linearly():
    """A 19-day-old result scores ``(30 - 19) / 23``."""
    days_old = 19  # 2026-01-01 -> 2026-01-20
    expected = (30 - days_old) / 23
    assert recency_score("2026-01-01", now=datetime(2026, 1, 20)) == expected
def test_empty_or_unparseable_scores_zero():
    """Empty, missing, and unparseable dates all score 0.0."""
    reference = datetime(2026, 1, 1)
    for bad_value in ("", None, "not-a-date"):
        assert recency_score(bad_value, now=reference) == 0.0
def test_default_now_is_naive_utc():
    """``_utcnow_naive`` returns a tz-less value within seconds of real UTC."""
    observed = _utcnow_naive()
    # No tzinfo, so it subtracts cleanly from the naive parsed dates.
    assert observed.tzinfo is None
    # Compare against an aware UTC reading with its tzinfo dropped.
    expected = datetime.now(timezone.utc).replace(tzinfo=None)
    drift_seconds = abs((observed - expected).total_seconds())
    assert drift_seconds < 5