diff --git a/services/search/providers.py b/services/search/providers.py index b7cdce6..1450740 100644 --- a/services/search/providers.py +++ b/services/search/providers.py @@ -76,6 +76,43 @@ def _get_result_count() -> int: return 5 +# Canonical SafeSearch levels: "strict" (default), "moderate", "off". +# Each provider has its own knob name and value space -- see _safesearch_for(...). +_SAFESEARCH_LEVELS = ("strict", "moderate", "off") + + +def _get_safesearch_level() -> str: + """Return configured SafeSearch level normalized to a canonical value.""" + settings = _get_search_settings() + raw = (settings.get("search_safesearch") or "strict").strip().lower() + if raw in _SAFESEARCH_LEVELS: + return raw + aliases = { + "on": "strict", "high": "strict", "2": "strict", + "medium": "moderate", "1": "moderate", "default": "moderate", + "none": "off", "disabled": "off", "0": "off", + } + return aliases.get(raw, "strict") + + +def _safesearch_for(provider: str) -> Optional[str]: + """Translate the canonical SafeSearch level into provider-specific values.""" + level = _get_safesearch_level() + if provider == "searxng": + return {"strict": "2", "moderate": "1", "off": "0"}[level] + if provider == "brave": + return level + if provider == "duckduckgo_lib": + return {"strict": "on", "moderate": "moderate", "off": "off"}[level] + if provider == "duckduckgo_html": + return {"strict": "1", "moderate": "-1", "off": "-2"}[level] + if provider == "google_pse": + return None if level == "off" else "active" + if provider == "serper": + return None if level == "off" else "active" + return None + + # ── SearXNG ── _NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag") @@ -105,7 +142,12 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general", # languages and brand-ambiguous terms bleed in foreign SEO pages (e.g. # "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus" # → Chinese math forums). The news path already did this; general didn't. - params = {"q": query, "format": "json", "language": "en"} + params = { + "q": query, + "format": "json", + "language": "en", + "safesearch": _safesearch_for("searxng"), + } q_lc = query.lower() is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS) if is_news and categories == "general": @@ -154,6 +196,7 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general", "format": "json", "language": "en", "categories": "general", + "safesearch": _safesearch_for("searxng"), } if _GENERAL_ENGINES: fallback["engines"] = _GENERAL_ENGINES @@ -204,7 +247,7 @@ def searxng_search(query, max_results=10): try: response = httpx.get( f"{instance}/search", - params={"q": query}, + params={"q": query, "safesearch": _safesearch_for("searxng")}, headers=req_headers, timeout=10, ) @@ -249,7 +292,11 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None return [] headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"} - params = {"q": enhanced_query, "count": count} + params = { + "q": enhanced_query, + "count": count, + "safesearch": _safesearch_for("brave"), + } if time_filter: time_map = {"day": "day", "week": "week", "month": "month", "year": "year"} if time_filter in time_map: @@ -298,32 +345,40 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None # ── DuckDuckGo (free, no key) ── +def _is_duckduckgo_host(host: str) -> bool: + """True only for duckduckgo.com and its subdomains.""" + host = (host or "").lower() + return host == "duckduckgo.com" or host.endswith(".duckduckgo.com") + + +def _resolve_ddg_redirect(raw: str) -> str: + """Resolve a DuckDuckGo /l/?uddg= redirect URL to its destination.""" + if not raw: + return raw + resolved = raw + if resolved.startswith("//"): + resolved = "https:" + resolved + elif resolved.startswith("/"): + resolved = urljoin("https://html.duckduckgo.com", resolved) + try: + parsed = urlparse(resolved) + if _is_duckduckgo_host(parsed.hostname) and parsed.path.rstrip("/") == "/l": + qs = parse_qs(parsed.query) + if "uddg" in qs: + return qs["uddg"][0] + except Exception: + pass + return resolved + + def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]: """Search using DuckDuckGo via the duckduckgo-search library. No API key needed.""" - def _resolve_url(raw: str) -> str: - """Resolve DuckDuckGo redirect URL to the actual destination URL.""" - if not raw: - return raw - resolved = raw - if resolved.startswith("//"): - resolved = "https:" + resolved - elif resolved.startswith("/"): - resolved = urljoin("https://html.duckduckgo.com", resolved) - try: - parsed = urlparse(resolved) - if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l": - qs = parse_qs(parsed.query) - if "uddg" in qs: - return qs["uddg"][0] - except Exception: - pass - return resolved def _html_fallback() -> List[dict]: try: response = httpx.get( "https://html.duckduckgo.com/html/", - params={"q": query}, + params={"q": query, "kp": _safesearch_for("duckduckgo_html")}, headers={"User-Agent": "Mozilla/5.0"}, timeout=REQUEST_TIMEOUT, ) @@ -334,7 +389,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = link = result.select_one(".result__a") if not link: continue - url = _resolve_url(link.get("href", "")) + url = _resolve_ddg_redirect(link.get("href", "")) if not url: continue snippet_el = result.select_one(".result__snippet") @@ -362,7 +417,12 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = try: ddgs = DDGS() - raw = ddgs.text(query, max_results=count, timelimit=timelimit) + raw = ddgs.text( + query, + max_results=count, + timelimit=timelimit, + safesearch=_safesearch_for("duckduckgo_lib"), + ) results = [] for item in raw: url = item.get("href", "") @@ -404,6 +464,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] = "q": query, "num": min(count, 10), # Google PSE max is 10 per request } + safe = _safesearch_for("google_pse") + if safe: + params["safe"] = safe if time_filter: # dateRestrict: d[number], w[number], m[number], y[number] time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"} @@ -508,6 +571,9 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None "q": query, "num": count, } + safe = _safesearch_for("serper") + if safe: + payload["safe"] = safe if time_filter: time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"} if time_filter in time_map: diff --git a/tests/test_service_search_provider_guards.py b/tests/test_service_search_provider_guards.py new file mode 100644 index 0000000..8e81b1a --- /dev/null +++ b/tests/test_service_search_provider_guards.py @@ -0,0 +1,101 @@ +"""Regression tests for the services.search provider copy. + +The UI search routes import services.search, while agent/deep-research paths +still import src.search. Keep the service-side copy aligned with the safer +provider guards already present in src.search. +""" + +import sys + +from services.search import providers + + +def test_service_safesearch_values_match_provider_contract(monkeypatch): + monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "strict"}) + assert providers._safesearch_for("searxng") == "2" + assert providers._safesearch_for("brave") == "strict" + assert providers._safesearch_for("duckduckgo_lib") == "on" + assert providers._safesearch_for("duckduckgo_html") == "1" + assert providers._safesearch_for("google_pse") == "active" + assert providers._safesearch_for("serper") == "active" + + monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "off"}) + assert providers._safesearch_for("searxng") == "0" + assert providers._safesearch_for("brave") == "off" + assert providers._safesearch_for("duckduckgo_lib") == "off" + assert providers._safesearch_for("duckduckgo_html") == "-2" + assert providers._safesearch_for("google_pse") is None + assert providers._safesearch_for("serper") is None + + +def test_service_searxng_json_sends_safesearch(monkeypatch): + seen = {} + + class _Response: + def raise_for_status(self): + return None + + def json(self): + return { + "results": [ + {"title": "Result", "url": "https://example.com", "content": "Snippet"} + ] + } + + def fake_get(url, **kwargs): + seen["url"] = url + seen["params"] = kwargs["params"] + return _Response() + + monkeypatch.setattr(providers, "_get_search_instance", lambda: "http://searx.test") + monkeypatch.setattr(providers, "_get_search_settings", lambda: {"search_safesearch": "moderate"}) + monkeypatch.setattr(providers.httpx, "get", fake_get) + + results = providers.searxng_search_api("odysseus", count=1) + + assert results + assert seen["url"] == "http://searx.test/search" + assert seen["params"]["safesearch"] == "1" + + +def test_service_ddg_redirect_ignores_lookalike_hosts(): + for host in ("duckduckgo.com.evil.com", "notduckduckgo.com"): + url = f"https://{host}/l/?uddg=https%3A%2F%2Fexample.com" + assert providers._resolve_ddg_redirect(url) == url + + assert providers._resolve_ddg_redirect( + "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com" + ) == "https://example.com" + + +def test_service_ddg_html_fallback_sends_safesearch(monkeypatch): + seen = {} + html = """ +
+