Search: align service provider guards

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
ghreprimand
2026-06-02 06:52:13 -05:00
committed by GitHub
parent 6c15dc7d33
commit eddb9ce6db
2 changed files with 191 additions and 24 deletions

View File

@@ -76,6 +76,43 @@ def _get_result_count() -> int:
return 5
# Canonical SafeSearch levels: "strict" (default), "moderate", "off".
# Each provider has its own knob name and value space -- see _safesearch_for(...).
_SAFESEARCH_LEVELS = ("strict", "moderate", "off")
def _get_safesearch_level() -> str:
"""Return configured SafeSearch level normalized to a canonical value."""
settings = _get_search_settings()
raw = (settings.get("search_safesearch") or "strict").strip().lower()
if raw in _SAFESEARCH_LEVELS:
return raw
aliases = {
"on": "strict", "high": "strict", "2": "strict",
"medium": "moderate", "1": "moderate", "default": "moderate",
"none": "off", "disabled": "off", "0": "off",
}
return aliases.get(raw, "strict")
def _safesearch_for(provider: str) -> Optional[str]:
"""Translate the canonical SafeSearch level into provider-specific values."""
level = _get_safesearch_level()
if provider == "searxng":
return {"strict": "2", "moderate": "1", "off": "0"}[level]
if provider == "brave":
return level
if provider == "duckduckgo_lib":
return {"strict": "on", "moderate": "moderate", "off": "off"}[level]
if provider == "duckduckgo_html":
return {"strict": "1", "moderate": "-1", "off": "-2"}[level]
if provider == "google_pse":
return None if level == "off" else "active"
if provider == "serper":
return None if level == "off" else "active"
return None
# ── SearXNG ──
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
@@ -105,7 +142,12 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
# languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
# "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
# → Chinese math forums). The news path already did this; general didn't.
params = {"q": query, "format": "json", "language": "en"}
params = {
"q": query,
"format": "json",
"language": "en",
"safesearch": _safesearch_for("searxng"),
}
q_lc = query.lower()
is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
if is_news and categories == "general":
@@ -154,6 +196,7 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
"format": "json",
"language": "en",
"categories": "general",
"safesearch": _safesearch_for("searxng"),
}
if _GENERAL_ENGINES:
fallback["engines"] = _GENERAL_ENGINES
@@ -204,7 +247,7 @@ def searxng_search(query, max_results=10):
try:
response = httpx.get(
f"{instance}/search",
params={"q": query},
params={"q": query, "safesearch": _safesearch_for("searxng")},
headers=req_headers,
timeout=10,
)
@@ -249,7 +292,11 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
return []
headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
params = {"q": enhanced_query, "count": count}
params = {
"q": enhanced_query,
"count": count,
"safesearch": _safesearch_for("brave"),
}
if time_filter:
time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
if time_filter in time_map:
@@ -298,32 +345,40 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
# ── DuckDuckGo (free, no key) ──
def _is_duckduckgo_host(host: str) -> bool:
"""True only for duckduckgo.com and its subdomains."""
host = (host or "").lower()
return host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
def _resolve_ddg_redirect(raw: str) -> str:
"""Resolve a DuckDuckGo /l/?uddg= redirect URL to its destination."""
if not raw:
return raw
resolved = raw
if resolved.startswith("//"):
resolved = "https:" + resolved
elif resolved.startswith("/"):
resolved = urljoin("https://html.duckduckgo.com", resolved)
try:
parsed = urlparse(resolved)
if _is_duckduckgo_host(parsed.hostname) and parsed.path.rstrip("/") == "/l":
qs = parse_qs(parsed.query)
if "uddg" in qs:
return qs["uddg"][0]
except Exception:
pass
return resolved
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
def _resolve_url(raw: str) -> str:
"""Resolve DuckDuckGo redirect URL to the actual destination URL."""
if not raw:
return raw
resolved = raw
if resolved.startswith("//"):
resolved = "https:" + resolved
elif resolved.startswith("/"):
resolved = urljoin("https://html.duckduckgo.com", resolved)
try:
parsed = urlparse(resolved)
if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
qs = parse_qs(parsed.query)
if "uddg" in qs:
return qs["uddg"][0]
except Exception:
pass
return resolved
def _html_fallback() -> List[dict]:
try:
response = httpx.get(
"https://html.duckduckgo.com/html/",
params={"q": query},
params={"q": query, "kp": _safesearch_for("duckduckgo_html")},
headers={"User-Agent": "Mozilla/5.0"},
timeout=REQUEST_TIMEOUT,
)
@@ -334,7 +389,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
link = result.select_one(".result__a")
if not link:
continue
url = _resolve_url(link.get("href", ""))
url = _resolve_ddg_redirect(link.get("href", ""))
if not url:
continue
snippet_el = result.select_one(".result__snippet")
@@ -362,7 +417,12 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
try:
ddgs = DDGS()
raw = ddgs.text(query, max_results=count, timelimit=timelimit)
raw = ddgs.text(
query,
max_results=count,
timelimit=timelimit,
safesearch=_safesearch_for("duckduckgo_lib"),
)
results = []
for item in raw:
url = item.get("href", "")
@@ -404,6 +464,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
"q": query,
"num": min(count, 10), # Google PSE max is 10 per request
}
safe = _safesearch_for("google_pse")
if safe:
params["safe"] = safe
if time_filter:
# dateRestrict: d[number], w[number], m[number], y[number]
time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
@@ -508,6 +571,9 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None
"q": query,
"num": count,
}
safe = _safesearch_for("serper")
if safe:
payload["safe"] = safe
if time_filter:
time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
if time_filter in time_map:

View File

@@ -0,0 +1,101 @@
"""Regression tests for the services.search provider copy.
The UI search routes import services.search, while agent/deep-research paths
still import src.search. Keep the service-side copy aligned with the safer
provider guards already present in src.search.
"""
import sys
from services.search import providers
def test_service_safesearch_values_match_provider_contract(monkeypatch):
monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "strict"})
assert providers._safesearch_for("searxng") == "2"
assert providers._safesearch_for("brave") == "strict"
assert providers._safesearch_for("duckduckgo_lib") == "on"
assert providers._safesearch_for("duckduckgo_html") == "1"
assert providers._safesearch_for("google_pse") == "active"
assert providers._safesearch_for("serper") == "active"
monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "off"})
assert providers._safesearch_for("searxng") == "0"
assert providers._safesearch_for("brave") == "off"
assert providers._safesearch_for("duckduckgo_lib") == "off"
assert providers._safesearch_for("duckduckgo_html") == "-2"
assert providers._safesearch_for("google_pse") is None
assert providers._safesearch_for("serper") is None
def test_service_searxng_json_sends_safesearch(monkeypatch):
seen = {}
class _Response:
def raise_for_status(self):
return None
def json(self):
return {
"results": [
{"title": "Result", "url": "https://example.com", "content": "Snippet"}
]
}
def fake_get(url, **kwargs):
seen["url"] = url
seen["params"] = kwargs["params"]
return _Response()
monkeypatch.setattr(providers, "_get_search_instance", lambda: "http://searx.test")
monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "moderate"})
monkeypatch.setattr(providers.httpx, "get", fake_get)
results = providers.searxng_search_api("odysseus", count=1)
assert results
assert seen["url"] == "http://searx.test/search"
assert seen["params"]["safesearch"] == "1"
def test_service_ddg_redirect_ignores_lookalike_hosts():
for host in ("duckduckgo.com.evil.com", "notduckduckgo.com"):
url = f"https://{host}/l/?uddg=https%3A%2F%2Fexample.com"
assert providers._resolve_ddg_redirect(url) == url
assert providers._resolve_ddg_redirect(
"https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com"
) == "https://example.com"
def test_service_ddg_html_fallback_sends_safesearch(monkeypatch):
seen = {}
html = """
<html><body>
<div class="result">
<a class="result__a" href="https://notduckduckgo.com/l/?uddg=https%3A%2F%2Fevil.example">
Lookalike
</a>
<a class="result__snippet">Snippet</a>
</div>
</body></html>
"""
class _Response:
text = html
def raise_for_status(self):
return None
def fake_get(url, **kwargs):
seen["params"] = kwargs["params"]
return _Response()
monkeypatch.setitem(sys.modules, "duckduckgo_search", None)
monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "off"})
monkeypatch.setattr(providers.httpx, "get", fake_get)
results = providers.duckduckgo_search("odysseus", count=1)
assert seen["params"]["kp"] == "-2"
assert results[0]["url"].startswith("https://notduckduckgo.com/")