Odysseus v1.0
This commit is contained in:
127
src/search/ranking.py
Normal file
127
src/search/ranking.py
Normal file
@@ -0,0 +1,127 @@
|
||||
"""Search result ranking based on relevance, source quality, and recency."""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Query words that mark a search as a news lookup.
_NEWS_HINTS = {
    "breaking",
    "headlines",
    "idag",
    "latest",
    "news",
    "nyheter",
    "today",
}

# Words and phrases that indicate sports content.
_SPORTS_HINTS = {
    "championship",
    "eliminates",
    "fifa",
    "football",
    "hockey",
    "mlb",
    "nba",
    "nfl",
    "quarterfinal",
    "soccer",
    "sport",
    "sports",
    "world cup",
}

# Hosts that are penalised when ranking results for a news query.
_LOW_VALUE_NEWS_DOMAINS = {
    "facebook.com",
    "www.facebook.com",
    "msn.com",
    "www.msn.com",
    "sports.yahoo.com",
    "yahoo.com",
    "www.yahoo.com",
}

# Hosts that receive the top domain score and a boost on news queries.
# Both the bare and the "www." form of each host are listed explicitly.
_TRUSTED_NEWS_DOMAINS = {
    "apnews.com",
    "www.apnews.com",
    "bbc.com",
    "www.bbc.com",
    "cbc.ca",
    "www.cbc.ca",
    "ctvnews.ca",
    "www.ctvnews.ca",
    "dw.com",
    "www.dw.com",
    "euronews.com",
    "www.euronews.com",
    "globalnews.ca",
    "www.globalnews.ca",
    "government.se",
    "www.government.se",
    "reuters.com",
    "www.reuters.com",
    "theguardian.com",
    "www.theguardian.com",
}
|
||||
|
||||
|
||||
def _domain(url: str) -> str:
    """Return the lower-cased host name of *url*, or ``""`` if there is none.

    Only the host is returned: any ``user:password@`` prefix and ``:port``
    suffix are dropped so the result can be compared directly against host
    sets and TLD suffixes.

    Args:
        url: An absolute URL. URLs without a ``//`` authority part (for
            example a bare ``"example.com/path"``) have no host.

    Returns:
        The host name in lower case, or an empty string when *url* is not a
        string, cannot be parsed, or carries no host.
    """
    # urlparse() answers non-str input with bytes (or an AttributeError);
    # keep the declared ``str`` return type by rejecting it up front.
    if not isinstance(url, str):
        return ""
    try:
        # ``hostname`` is already lower-cased and excludes credentials/port.
        host = urlparse(url).hostname
    except ValueError:
        # Raised for malformed authorities such as an unbalanced "[".
        return ""
    return host or ""
|
||||
|
||||
|
||||
def rank_search_results(query: str, results: List[dict]) -> List[dict]:
    """Rank search results by title relevance, snippet quality, domain authority, and recency.

    Each result is scored as a weighted sum::

        2.0 * title + 1.0 * snippet + 1.5 * domain + 1.0 * recency + news adjustment

    Every component except the news adjustment lies in ``[0, 1]``. The news
    adjustment is only non-zero when the query contains one of
    ``_NEWS_HINTS`` and may be negative.

    Args:
        query: The user's search query.
        results: Result dicts. The keys ``title``, ``snippet``, ``url`` and
            ``age`` are read; missing or ``None`` values are treated as
            empty. ``age`` is expected to be a date/timestamp string.

    Returns:
        A new list holding the same dicts ordered from highest to lowest
        score. Results with equal scores keep their input order.
    """
    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
    query_lc = query.lower()

    # Compile the per-query patterns once instead of once per result.
    term_patterns = [re.compile(rf"\b{re.escape(term)}\b") for term in query_terms]
    sports_patterns = [re.compile(rf"\b{re.escape(hint)}\b") for hint in _SPORTS_HINTS]

    def mentions_sports(text: str) -> bool:
        """Return True if *text* (lower case) contains a sports hint as a whole word/phrase."""
        # Whole-word matching keeps "transport" or "passport" from counting
        # as "sport".
        return any(pattern.search(text) for pattern in sports_patterns)

    is_news_query = any(term in _NEWS_HINTS for term in query_terms)
    is_sports_query = mentions_sports(query_lc)

    def title_score(title: str) -> float:
        """Fraction of query terms that occur in the title as whole words."""
        if not title or not query_terms:
            return 0.0
        title_lc = title.lower()
        matches = sum(1 for pattern in term_patterns if pattern.search(title_lc))
        return matches / len(query_terms)

    def snippet_score(snippet: str) -> float:
        """Average of a length factor (capped at 200 chars) and the share of query terms found."""
        if not snippet:
            return 0.0
        length_factor = min(len(snippet), 200) / 200
        term_factor = 0.0
        if query_terms:
            snippet_lc = snippet.lower()
            # Plain substring matching, unlike the whole-word title match.
            term_hits = sum(1 for term in query_terms if term in snippet_lc)
            term_factor = term_hits / len(query_terms)
        return (length_factor + term_factor) / 2

    def domain_score(url: str) -> float:
        """Score the result's host: trusted/.edu/.gov 1.0, .org 0.7, other 0.4, none 0.0."""
        netloc = _domain(url)
        if not netloc:
            return 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS or netloc.endswith((".edu", ".gov")):
            return 1.0
        if netloc.endswith(".org"):
            return 0.7
        return 0.4

    def recency_score(age_str: Optional[str]) -> float:
        """Map a date string to 1.0 (<= 7 days old) falling linearly to 0.0 (>= 30 days)."""
        if not age_str or not isinstance(age_str, str):
            return 0.0
        dt = None
        for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
            try:
                dt = datetime.strptime(age_str, fmt)
                break
            except ValueError:
                continue
        if dt is None:
            # Fall back to ISO-8601 so timestamps with fractional seconds, a
            # UTC offset, or a trailing "Z" are not silently scored as 0.
            iso = age_str.strip()
            if iso.endswith(("Z", "z")):
                iso = iso[:-1] + "+00:00"
            try:
                dt = datetime.fromisoformat(iso)
            except ValueError:
                return 0.0
        # Use "now" in the parsed value's timezone (naive when dt is naive)
        # so aware and naive datetimes are never mixed.
        days_old = (datetime.now(dt.tzinfo) - dt).days
        if days_old <= 7:
            return 1.0
        if days_old >= 30:
            return 0.0
        return (30 - days_old) / 23

    def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
        """Return a bonus/penalty applied only when the query is a news query."""
        if not is_news_query:
            return 0.0
        text = f"{title} {snippet}".lower()
        netloc = _domain(url)
        adjustment = 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS:
            adjustment += 1.2
        if any(phrase in text for phrase in ("latest news", "breaking news", "daily coverage", "news from")):
            adjustment += 0.4
        if netloc in _LOW_VALUE_NEWS_DOMAINS:
            adjustment -= 0.8
        # Hosts often run words together, so the host is still checked by
        # substring while the title/snippet text is matched on whole words.
        if not is_sports_query and (
            mentions_sports(text) or any(hint in netloc for hint in _SPORTS_HINTS)
        ):
            adjustment -= 1.5
        # A country/news query should not rank a page whose title/snippet barely
        # mentions the country above actual news pages for that country.
        subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]
        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
            adjustment -= 1.0
        return adjustment

    scored = []
    for result in results:
        # "or" also covers keys that are present but set to None, which would
        # otherwise leak the text "None" into matching or break host parsing.
        title = result.get("title") or ""
        snippet = result.get("snippet") or ""
        url = result.get("url") or ""
        age = result.get("age")

        score = (
            2.0 * title_score(title)
            + 1.0 * snippet_score(snippet)
            + 1.5 * domain_score(url)
            + 1.0 * recency_score(age)
            + news_quality_adjustment(title, snippet, url)
        )
        scored.append((score, result))

    # list.sort is stable, also with reverse=True, so ties keep input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [result for _, result in scored]
|
||||
Reference in New Issue
Block a user