diff --git a/src/search/providers.py b/src/search/providers.py index ee16a50..4ec493b 100644 --- a/src/search/providers.py +++ b/src/search/providers.py @@ -76,6 +76,56 @@ def _get_result_count() -> int: return 5 +# Canonical SafeSearch levels: "strict" (default), "moderate", "off". +# Each provider has its own knob name and value space — see _safesearch_for(...). +_SAFESEARCH_LEVELS = ("strict", "moderate", "off") + + +def _get_safesearch_level() -> str: + """Return the configured SafeSearch level, normalized to one of + _SAFESEARCH_LEVELS. Defaults to 'strict' to avoid adult / spammy URLs + bleeding into research and web_search results.""" + settings = _get_search_settings() + raw = (settings.get("search_safesearch") or "strict").strip().lower() + if raw in _SAFESEARCH_LEVELS: + return raw + # Accept a few common aliases so a manually-edited config doesn't + # silently lose SafeSearch — fall back to strict on anything unknown. + aliases = { + "on": "strict", "high": "strict", "2": "strict", + "medium": "moderate", "1": "moderate", "default": "moderate", + "none": "off", "disabled": "off", "0": "off", + } + return aliases.get(raw, "strict") + + +def _safesearch_for(provider: str) -> Optional[str]: + """Translate the canonical level into the per-provider param value. + Returns None when SafeSearch should be omitted entirely for a provider + (some APIs default to filtered and treat missing-param as "off").""" + level = _get_safesearch_level() + if provider == "searxng": + # SearXNG: integer 0/1/2 + return {"strict": "2", "moderate": "1", "off": "0"}[level] + if provider == "brave": + # Brave: strict / moderate / off + return level + if provider == "duckduckgo_lib": + # duckduckgo-search library: on / moderate / off + return {"strict": "on", "moderate": "moderate", "off": "off"}[level] + if provider == "duckduckgo_html": + # DDG HTML endpoint kp: 1 strict / -1 moderate / -2 off + return {"strict": "1", "moderate": "-1", "off": "-2"}[level] + if provider == "google_pse": + # Google PSE: 'active' filters explicit; 'off' disables. Treat + # moderate the same as active — Google PSE has no middle tier. + return None if level == "off" else "active" + if provider == "serper": + # Serper proxies Google's `safe` param. + return None if level == "off" else "active" + return None + + # ── SearXNG ── _NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag") @@ -106,7 +156,8 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general", # Pin English for ALL searches — without it SearXNG mixes languages and # brand-ambiguous terms bleed in foreign SEO pages (Honda "Odyssey" JP, # Japanese "Trojan" malware blogs, Chinese math forums for "Polyphemus"). - params = {"q": query, "format": "json", "language": "en"} + params = {"q": query, "format": "json", "language": "en", + "safesearch": _safesearch_for("searxng")} q_lc = query.lower() is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS) if is_news and categories == "general": @@ -155,6 +206,7 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general", "format": "json", "language": "en", "categories": "general", + "safesearch": _safesearch_for("searxng"), } if _GENERAL_ENGINES: fallback["engines"] = _GENERAL_ENGINES @@ -205,7 +257,7 @@ def searxng_search(query, max_results=10): try: response = httpx.get( f"{instance}/search", - params={"q": query}, + params={"q": query, "safesearch": _safesearch_for("searxng")}, headers=req_headers, timeout=10, ) @@ -250,7 +302,8 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None return [] headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"} - params = {"q": enhanced_query, "count": count} + params = {"q": enhanced_query, "count": count, + "safesearch": _safesearch_for("brave")} if time_filter: time_map = {"day": "day", "week": "week", "month": "month", "year": "year"} if time_filter in time_map: @@ -326,7 +379,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = try: response = httpx.get( "https://html.duckduckgo.com/html/", - params={"q": query}, + params={"q": query, "kp": _safesearch_for("duckduckgo_html")}, headers={"User-Agent": "Mozilla/5.0"}, timeout=REQUEST_TIMEOUT, ) @@ -365,7 +418,8 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = try: ddgs = DDGS() - raw = ddgs.text(query, max_results=count, timelimit=timelimit) + raw = ddgs.text(query, max_results=count, timelimit=timelimit, + safesearch=_safesearch_for("duckduckgo_lib")) results = [] for item in raw: url = item.get("href", "") @@ -407,6 +461,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] = "q": query, "num": min(count, 10), # Google PSE max is 10 per request } + safe = _safesearch_for("google_pse") + if safe: + params["safe"] = safe if time_filter: # dateRestrict: d[number], w[number], m[number], y[number] time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"} @@ -511,6 +568,9 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None "q": query, "num": count, } + safe = _safesearch_for("serper") + if safe: + payload["safe"] = safe if time_filter: time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"} if time_filter in time_map: diff --git a/src/settings.py b/src/settings.py index 1172389..b64010e 100644 --- a/src/settings.py +++ b/src/settings.py @@ -55,6 +55,26 @@ DEFAULT_SETTINGS = { "search_fallback_chain": ["duckduckgo"], "search_url": "", "search_result_count": 5, + # SafeSearch level applied to every provider that exposes one. + # "strict" — block adult / explicit results (default; matches what users + # expect from a research tool and avoids unrelated NSFW URLs + # bleeding in via provider "related" / spam recommendations) + # "moderate" — provider-default behavior (filter explicit but allow + # suggestive content) + # "off" — disable filtering entirely (advanced users only) + # + # Providers that honor this setting (translated to each provider's native + # param in src/search/providers.py:_safesearch_for): + # SearXNG safesearch=0/1/2 (JSON API, HTML scrape, news fallback) + # Brave Search safesearch=off/moderate/strict + # DuckDuckGo safesearch=off/moderate/on (library + HTML kp param) + # Google PSE safe=active (omitted for "off"; PSE has no middle tier) + # Serper.dev safe=active (omitted for "off"; proxies Google's `safe`) + # Providers NOT touched: Tavily (no SafeSearch knob; filters at index time) + # and any custom backend reached via search_url — they keep whatever the + # backend itself decides, so operators stay in control of self-hosted / + # niche search instances. + "search_safesearch": "strict", "brave_api_key": "", "google_pse_key": "", "google_pse_cx": "",