fix: rank recency by UTC, not local time (#1116) (#1234)

src/search/ranking.py computed result age as `(datetime.now() - dt).days`, where
`dt` is parsed from a UTC-style published date with no timezone. Using local
`datetime.now()` skewed the age by the host's UTC offset (off-by-up-to-a-day near
boundaries), and was a latent crash: once neighbouring code becomes timezone-aware
the naive/aware subtraction raises TypeError (the landmine called out in #1116).

Recency is now measured against naive UTC. The scoring is also lifted out of the
rank_search_results closure into a module-level, time-injectable `recency_score`
so it's unit-testable, and `_utcnow_naive()` avoids `datetime.utcnow()` (removed in
Python 3.14).

Covered by tests/test_search_ranking_recency.py (5 cases); the existing
tests/test_search_ranking.py still passes.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
lekt8
2026-06-02 23:18:15 +08:00
committed by GitHub
parent 8c376d2b0e
commit 975fd42e32
2 changed files with 77 additions and 19 deletions

View File

@@ -2,12 +2,49 @@
import re
import logging
from datetime import datetime
from datetime import datetime, timezone
from typing import List, Optional
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
def _utcnow_naive() -> datetime:
"""Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
and is safe on Python 3.14 where ``datetime.utcnow()`` is removed (#1116)."""
return datetime.now(timezone.utc).replace(tzinfo=None)
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
"""Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.
The age is measured against UTC, not local time. The previous code used
``datetime.now()`` (local) against UTC-style published dates, so the age was
skewed by the host's UTC offset; it was also a latent crash once neighbouring
code moves to timezone-aware datetimes (#1116). ``now`` is injectable for tests.
"""
if not age_str:
return 0.0
dt = None
for fmt in _AGE_FORMATS:
try:
dt = datetime.strptime(age_str, fmt)
break
except Exception:
dt = None
if not dt:
return 0.0
now = now or _utcnow_naive()
days_old = (now - dt).days
if days_old <= 7:
return 1.0
if days_old >= 30:
return 0.0
return (30 - days_old) / 23
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
_SPORTS_HINTS = {
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
@@ -68,24 +105,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
return 0.7
return 0.4
def recency_score(age_str: Optional[str]) -> float:
if not age_str:
return 0.0
for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
try:
dt = datetime.strptime(age_str, fmt)
break
except Exception:
dt = None
if not dt:
return 0.0
days_old = (datetime.now() - dt).days
if days_old <= 7:
return 1.0
if days_old >= 30:
return 0.0
return (30 - days_old) / 23
def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
if not is_news_query:
return 0.0