src/search/ranking.py computed result age as `(datetime.now() - dt).days`, where `dt` is parsed from a UTC-style published date with no timezone. Using local `datetime.now()` skewed the age by the host's UTC offset (off by up to a day near day boundaries), and was a latent crash: once neighbouring code becomes timezone-aware, the naive/aware subtraction raises TypeError (the landmine called out in #1116). Recency is now measured against naive UTC. The scoring is also lifted out of the rank_search_results closure into a module-level, time-injectable `recency_score` so it's unit-testable, and `_utcnow_naive()` avoids `datetime.utcnow()` (deprecated since Python 3.12 and slated for removal in a future release). Covered by tests/test_search_ranking_recency.py (5 cases); the existing tests/test_search_ranking.py still passes. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,12 +2,49 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow_naive() -> datetime:
|
||||||
|
"""Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
|
||||||
|
and is safe on Python 3.14 where ``datetime.utcnow()`` is removed (#1116)."""
|
||||||
|
return datetime.now(timezone.utc).replace(tzinfo=None)
|
||||||
|
|
||||||
|
|
||||||
|
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
|
||||||
|
"""Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.
|
||||||
|
|
||||||
|
The age is measured against UTC, not local time. The previous code used
|
||||||
|
``datetime.now()`` (local) against UTC-style published dates, so the age was
|
||||||
|
skewed by the host's UTC offset; it was also a latent crash once neighbouring
|
||||||
|
code moves to timezone-aware datetimes (#1116). ``now`` is injectable for tests.
|
||||||
|
"""
|
||||||
|
if not age_str:
|
||||||
|
return 0.0
|
||||||
|
dt = None
|
||||||
|
for fmt in _AGE_FORMATS:
|
||||||
|
try:
|
||||||
|
dt = datetime.strptime(age_str, fmt)
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
dt = None
|
||||||
|
if not dt:
|
||||||
|
return 0.0
|
||||||
|
now = now or _utcnow_naive()
|
||||||
|
days_old = (now - dt).days
|
||||||
|
if days_old <= 7:
|
||||||
|
return 1.0
|
||||||
|
if days_old >= 30:
|
||||||
|
return 0.0
|
||||||
|
return (30 - days_old) / 23
|
||||||
|
|
||||||
|
|
||||||
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
|
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
|
||||||
_SPORTS_HINTS = {
|
_SPORTS_HINTS = {
|
||||||
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
||||||
@@ -68,24 +105,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
|
|||||||
return 0.7
|
return 0.7
|
||||||
return 0.4
|
return 0.4
|
||||||
|
|
||||||
def recency_score(age_str: Optional[str]) -> float:
|
|
||||||
if not age_str:
|
|
||||||
return 0.0
|
|
||||||
for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
|
|
||||||
try:
|
|
||||||
dt = datetime.strptime(age_str, fmt)
|
|
||||||
break
|
|
||||||
except Exception:
|
|
||||||
dt = None
|
|
||||||
if not dt:
|
|
||||||
return 0.0
|
|
||||||
days_old = (datetime.now() - dt).days
|
|
||||||
if days_old <= 7:
|
|
||||||
return 1.0
|
|
||||||
if days_old >= 30:
|
|
||||||
return 0.0
|
|
||||||
return (30 - days_old) / 23
|
|
||||||
|
|
||||||
def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
|
def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
|
||||||
if not is_news_query:
|
if not is_news_query:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|||||||
39
tests/test_search_ranking_recency.py
Normal file
39
tests/test_search_ranking_recency.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""Issue #1116 (latent ranking bug) — recency scoring uses UTC, not local time.
|
||||||
|
|
||||||
|
`recency_score` measured age with `datetime.now()` (local) against UTC-style
|
||||||
|
published dates, skewing the age by the host's UTC offset and risking a TypeError
|
||||||
|
once neighbouring code becomes timezone-aware. It now uses naive UTC and is a
|
||||||
|
module-level, time-injectable function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from src.search.ranking import recency_score, _utcnow_naive
|
||||||
|
|
||||||
|
|
||||||
|
def test_fresh_result_scores_one():
|
||||||
|
assert recency_score("2026-01-01", now=datetime(2026, 1, 5)) == 1.0 # 4 days old
|
||||||
|
|
||||||
|
|
||||||
|
def test_old_result_scores_zero():
|
||||||
|
assert recency_score("2026-01-01", now=datetime(2026, 3, 1)) == 0.0 # >30 days
|
||||||
|
|
||||||
|
|
||||||
|
def test_mid_range_decays_linearly():
|
||||||
|
score = recency_score("2026-01-01", now=datetime(2026, 1, 20)) # 19 days old
|
||||||
|
assert score == (30 - 19) / 23
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_or_unparseable_scores_zero():
|
||||||
|
assert recency_score("", now=datetime(2026, 1, 1)) == 0.0
|
||||||
|
assert recency_score(None, now=datetime(2026, 1, 1)) == 0.0
|
||||||
|
assert recency_score("not-a-date", now=datetime(2026, 1, 1)) == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_now_is_naive_utc():
|
||||||
|
# Naive (no tzinfo) so it subtracts cleanly from the naive parsed dates,
|
||||||
|
# and UTC-based (3.14-safe, no datetime.utcnow()).
|
||||||
|
now = _utcnow_naive()
|
||||||
|
assert now.tzinfo is None
|
||||||
|
reference = datetime.now(timezone.utc).replace(tzinfo=None)
|
||||||
|
assert abs((now - reference).total_seconds()) < 5
|
||||||
Reference in New Issue
Block a user