Search: align service provider guards
Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
@@ -76,6 +76,43 @@ def _get_result_count() -> int:
|
||||
return 5
|
||||
|
||||
|
||||
# Canonical SafeSearch levels: "strict" (default), "moderate", "off".
|
||||
# Each provider has its own knob name and value space -- see _safesearch_for(...).
|
||||
_SAFESEARCH_LEVELS = ("strict", "moderate", "off")
|
||||
|
||||
|
||||
def _get_safesearch_level() -> str:
    """Return the configured SafeSearch level as a canonical value.

    Reads ``search_safesearch`` from ``_get_search_settings()`` and maps it
    onto one of ``_SAFESEARCH_LEVELS``. Matching ignores case and surrounding
    whitespace, and a small set of aliases (including the numeric 0/1/2
    scale) is accepted.

    Returns:
        "strict", "moderate" or "off". A missing, empty or unrecognised
        setting yields "strict", so a bad value never relaxes filtering.
    """
    configured = _get_search_settings().get("search_safesearch")
    if configured is None:
        return "strict"

    # The stored value is not guaranteed to be a string: coerce it instead of
    # crashing on ``.strip()``, and test for None explicitly so that a numeric
    # 0 is read as the "0" alias rather than being mistaken for "unset".
    raw = str(configured).strip().lower()
    if raw in _SAFESEARCH_LEVELS:
        return raw

    aliases = {
        "on": "strict", "high": "strict", "2": "strict",
        "medium": "moderate", "1": "moderate", "default": "moderate",
        "none": "off", "disabled": "off", "0": "off",
    }
    # Anything unknown (including the empty string) falls back to "strict".
    return aliases.get(raw, "strict")
|
||||
|
||||
|
||||
def _safesearch_for(provider: str) -> Optional[str]:
    """Map the configured SafeSearch level onto a provider's own value.

    Args:
        provider: One of "searxng", "brave", "duckduckgo_lib",
            "duckduckgo_html", "google_pse" or "serper".

    Returns:
        The provider-specific string for the current canonical level, or
        ``None`` when the provider is not recognised or (for "google_pse"
        and "serper") when the level is "off".
    """
    level = _get_safesearch_level()

    def _active_unless_off(lvl):
        # Google PSE and Serper share the same rule: no value when "off".
        return None if lvl == "off" else "active"

    translators = {
        "searxng": {"strict": "2", "moderate": "1", "off": "0"}.__getitem__,
        # Brave uses the canonical names as-is.
        "brave": lambda lvl: lvl,
        "duckduckgo_lib": {"strict": "on", "moderate": "moderate", "off": "off"}.__getitem__,
        "duckduckgo_html": {"strict": "1", "moderate": "-1", "off": "-2"}.__getitem__,
        "google_pse": _active_unless_off,
        "serper": _active_unless_off,
    }

    translate = translators.get(provider)
    if translate is None:
        return None
    return translate(level)
|
||||
|
||||
|
||||
# ── SearXNG ──
|
||||
|
||||
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
|
||||
@@ -105,7 +142,12 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
|
||||
# languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
|
||||
# "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
|
||||
# → Chinese math forums). The news path already did this; general didn't.
|
||||
params = {"q": query, "format": "json", "language": "en"}
|
||||
params = {
|
||||
"q": query,
|
||||
"format": "json",
|
||||
"language": "en",
|
||||
"safesearch": _safesearch_for("searxng"),
|
||||
}
|
||||
q_lc = query.lower()
|
||||
is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
|
||||
if is_news and categories == "general":
|
||||
@@ -154,6 +196,7 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
|
||||
"format": "json",
|
||||
"language": "en",
|
||||
"categories": "general",
|
||||
"safesearch": _safesearch_for("searxng"),
|
||||
}
|
||||
if _GENERAL_ENGINES:
|
||||
fallback["engines"] = _GENERAL_ENGINES
|
||||
@@ -204,7 +247,7 @@ def searxng_search(query, max_results=10):
|
||||
try:
|
||||
response = httpx.get(
|
||||
f"{instance}/search",
|
||||
params={"q": query},
|
||||
params={"q": query, "safesearch": _safesearch_for("searxng")},
|
||||
headers=req_headers,
|
||||
timeout=10,
|
||||
)
|
||||
@@ -249,7 +292,11 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
||||
return []
|
||||
|
||||
headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
|
||||
params = {"q": enhanced_query, "count": count}
|
||||
params = {
|
||||
"q": enhanced_query,
|
||||
"count": count,
|
||||
"safesearch": _safesearch_for("brave"),
|
||||
}
|
||||
if time_filter:
|
||||
time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
|
||||
if time_filter in time_map:
|
||||
@@ -298,32 +345,40 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
||||
|
||||
# ── DuckDuckGo (free, no key) ──
|
||||
|
||||
def _is_duckduckgo_host(host: str) -> bool:
|
||||
"""True only for duckduckgo.com and its subdomains."""
|
||||
host = (host or "").lower()
|
||||
return host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
|
||||
|
||||
|
||||
def _resolve_ddg_redirect(raw: str) -> str:
    """Unwrap a DuckDuckGo ``/l/?uddg=`` redirect link.

    Args:
        raw: The href as found in the result markup. It may be absolute,
            protocol-relative (``//host/...``) or root-relative (``/...``).

    Returns:
        The first ``uddg`` query value when the link points at the ``/l``
        path on a DuckDuckGo host. Otherwise the input made absolute:
        ``https:`` is prefixed to protocol-relative links and root-relative
        links are joined onto ``https://html.duckduckgo.com``. A falsy
        input is returned unchanged.
    """
    if not raw:
        return raw

    if raw.startswith("//"):
        absolute = "https:" + raw
    elif raw.startswith("/"):
        absolute = urljoin("https://html.duckduckgo.com", raw)
    else:
        absolute = raw

    try:
        parts = urlparse(absolute)
        # Only unwrap links that are really served by DuckDuckGo.
        if _is_duckduckgo_host(parts.hostname) and parts.path.rstrip("/") == "/l":
            targets = parse_qs(parts.query).get("uddg")
            if targets:
                return targets[0]
    except Exception:
        # Best effort: an unparseable link is handed back as normalized above.
        pass
    return absolute
|
||||
|
||||
|
||||
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
||||
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
||||
def _resolve_url(raw: str) -> str:
|
||||
"""Resolve DuckDuckGo redirect URL to the actual destination URL."""
|
||||
if not raw:
|
||||
return raw
|
||||
resolved = raw
|
||||
if resolved.startswith("//"):
|
||||
resolved = "https:" + resolved
|
||||
elif resolved.startswith("/"):
|
||||
resolved = urljoin("https://html.duckduckgo.com", resolved)
|
||||
try:
|
||||
parsed = urlparse(resolved)
|
||||
if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
|
||||
qs = parse_qs(parsed.query)
|
||||
if "uddg" in qs:
|
||||
return qs["uddg"][0]
|
||||
except Exception:
|
||||
pass
|
||||
return resolved
|
||||
|
||||
def _html_fallback() -> List[dict]:
|
||||
try:
|
||||
response = httpx.get(
|
||||
"https://html.duckduckgo.com/html/",
|
||||
params={"q": query},
|
||||
params={"q": query, "kp": _safesearch_for("duckduckgo_html")},
|
||||
headers={"User-Agent": "Mozilla/5.0"},
|
||||
timeout=REQUEST_TIMEOUT,
|
||||
)
|
||||
@@ -334,7 +389,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
||||
link = result.select_one(".result__a")
|
||||
if not link:
|
||||
continue
|
||||
url = _resolve_url(link.get("href", ""))
|
||||
url = _resolve_ddg_redirect(link.get("href", ""))
|
||||
if not url:
|
||||
continue
|
||||
snippet_el = result.select_one(".result__snippet")
|
||||
@@ -362,7 +417,12 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
||||
|
||||
try:
|
||||
ddgs = DDGS()
|
||||
raw = ddgs.text(query, max_results=count, timelimit=timelimit)
|
||||
raw = ddgs.text(
|
||||
query,
|
||||
max_results=count,
|
||||
timelimit=timelimit,
|
||||
safesearch=_safesearch_for("duckduckgo_lib"),
|
||||
)
|
||||
results = []
|
||||
for item in raw:
|
||||
url = item.get("href", "")
|
||||
@@ -404,6 +464,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
|
||||
"q": query,
|
||||
"num": min(count, 10), # Google PSE max is 10 per request
|
||||
}
|
||||
safe = _safesearch_for("google_pse")
|
||||
if safe:
|
||||
params["safe"] = safe
|
||||
if time_filter:
|
||||
# dateRestrict: d[number], w[number], m[number], y[number]
|
||||
time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
|
||||
@@ -508,6 +571,9 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None
|
||||
"q": query,
|
||||
"num": count,
|
||||
}
|
||||
safe = _safesearch_for("serper")
|
||||
if safe:
|
||||
payload["safe"] = safe
|
||||
if time_filter:
|
||||
time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
|
||||
if time_filter in time_map:
|
||||
|
||||
101
tests/test_service_search_provider_guards.py
Normal file
101
tests/test_service_search_provider_guards.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Regression tests for the services.search provider copy.
|
||||
|
||||
The UI search routes import services.search, while agent/deep-research paths
|
||||
still import src.search. Keep the service-side copy aligned with the safer
|
||||
provider guards already present in src.search.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from services.search import providers
|
||||
|
||||
|
||||
def test_service_safesearch_values_match_provider_contract(monkeypatch):
    """Each provider gets the expected value for the "strict" and "off" levels."""
    expected_by_level = {
        "strict": {
            "searxng": "2",
            "brave": "strict",
            "duckduckgo_lib": "on",
            "duckduckgo_html": "1",
            "google_pse": "active",
            "serper": "active",
        },
        "off": {
            "searxng": "0",
            "brave": "off",
            "duckduckgo_lib": "off",
            "duckduckgo_html": "-2",
            "google_pse": None,
            "serper": None,
        },
    }

    for level, expected in expected_by_level.items():
        monkeypatch.setattr(
            providers,
            "_get_search_settings",
            lambda level=level: {"search_safesearch": level},
        )
        for provider, value in expected.items():
            actual = providers._safesearch_for(provider)
            if value is None:
                assert actual is None
            else:
                assert actual == value
|
||||
|
||||
|
||||
def test_service_searxng_json_sends_safesearch(monkeypatch):
    """A "moderate" setting reaches the SearXNG JSON request as safesearch "1"."""
    captured = {}

    class _StubResponse:
        def raise_for_status(self):
            return None

        def json(self):
            hit = {"title": "Result", "url": "https://example.com", "content": "Snippet"}
            return {"results": [hit]}

    def fake_get(url, **kwargs):
        # Record what the provider tried to send instead of hitting the network.
        captured.update(url=url, params=kwargs["params"])
        return _StubResponse()

    monkeypatch.setattr(providers.httpx, "get", fake_get)
    monkeypatch.setattr(providers, "_get_search_instance", lambda: "http://searx.test")
    monkeypatch.setattr(
        providers,
        "_get_search_settings",
        lambda: {"search_safesearch": "moderate"},
    )

    results = providers.searxng_search_api("odysseus", count=1)

    assert results
    assert captured["url"] == "http://searx.test/search"
    assert captured["params"]["safesearch"] == "1"
|
||||
|
||||
|
||||
def test_service_ddg_redirect_ignores_lookalike_hosts():
    """Redirect links are unwrapped for duckduckgo.com but not for lookalikes."""
    lookalikes = ["duckduckgo.com.evil.com", "notduckduckgo.com"]
    for lookalike in lookalikes:
        spoofed = f"https://{lookalike}/l/?uddg=https%3A%2F%2Fexample.com"
        # A spoofed host must come back untouched, not unwrapped.
        assert providers._resolve_ddg_redirect(spoofed) == spoofed

    genuine = "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com"
    assert providers._resolve_ddg_redirect(genuine) == "https://example.com"
|
||||
|
||||
|
||||
def test_service_ddg_html_fallback_sends_safesearch(monkeypatch):
    """The HTML fallback sends ``kp`` and leaves lookalike redirect links alone."""
    captured = {}
    html = """
    <html><body>
    <div class="result">
    <a class="result__a" href="https://notduckduckgo.com/l/?uddg=https%3A%2F%2Fevil.example">
    Lookalike
    </a>
    <a class="result__snippet">Snippet</a>
    </div>
    </body></html>
    """

    class _StubResponse:
        text = html

        def raise_for_status(self):
            return None

    def fake_get(url, **kwargs):
        captured["params"] = kwargs["params"]
        return _StubResponse()

    # A None entry in sys.modules makes ``import duckduckgo_search`` raise
    # ImportError; presumably that is what routes the call to the HTML path.
    monkeypatch.setitem(sys.modules, "duckduckgo_search", None)
    monkeypatch.setattr(providers.httpx, "get", fake_get)
    monkeypatch.setattr(
        providers,
        "_get_search_settings",
        lambda: {"search_safesearch": "off"},
    )

    results = providers.duckduckgo_search("odysseus", count=1)

    assert captured["params"]["kp"] == "-2"
    assert results[0]["url"].startswith("https://notduckduckgo.com/")
|
||||
Reference in New Issue
Block a user