Apply SafeSearch by default across search providers (#763)

#718 reported Deep Research drifting into adult / spam URLs several
rounds into a benign session ("research about https://bhagathgoud.com/
and what he doing currently"). The reporter's log showed Japanese
adult sites being crawled even though the model was emitting normal
queries like "Bhagath Goud LinkedIn" and "site:bhagathgoud.com".

The model wasn't generating those URLs. Every provider call site
constructed its params dict without a SafeSearch parameter, so the
underlying HTTP backend (the duckduckgo-search library / DDG's HTML
endpoint in this case) was free to surface "related search" /
trending / spam recommendations that have nothing to do with the
user's query. Per provider:

- SearXNG: instance-dependent; many self-hosted instances default
  to safesearch=0.
- Brave API: defaults to "off" for new API keys.
- duckduckgo-search lib: defaults to "moderate", which still lets
  related-search recommendations and HTTP-backend fallback URLs
  surface trending non-English spam topics.
- DDG HTML fallback (html.duckduckgo.com): no `kp` param, treated
  as off.
- Google PSE: omitted `safe` is equivalent to off.
- Serper: omitted `safe` proxies to Google with safe off.

Since the bad URLs entered through the provider layer, not the
model, the provider params are the right place to gate this.

Changes:

- src/settings.py: new `search_safesearch` setting with default
  "strict". Documented values ("strict" | "moderate" | "off") plus
  a few aliases ("on", "high", "0/1/2", "disabled", ...) so a
  hand-edited config doesn't silently fall through to off.
- src/search/providers.py:
  - Add `_get_safesearch_level()` (canonical, normalizing) and
    `_safesearch_for(provider)` (per-provider param translation).
  - Thread the per-provider value into every params dict:
    SearXNG JSON, SearXNG language/engines fallbacks, SearXNG HTML,
    Brave, DDG library, DDG HTML fallback, Google PSE, Serper.
  - Tavily is left untouched — its API has no SafeSearch knob and
    its index already filters explicit content at ingest time.

Behavior change for existing installs: default is now "strict", so
explicit results get filtered on every provider that exposes a
SafeSearch knob, without any user action (Tavily and custom
`search_url` backends are left as-is). Users who deliberately want
unfiltered results can set `search_safesearch` to "off" in Settings.
No new dependencies, no schema migrations.

Closes #718.
This commit is contained in:
tanmayraut45
2026-06-02 08:04:32 +05:30
committed by GitHub
parent eff762cdd9
commit 1cc2e90ac0
2 changed files with 85 additions and 5 deletions

View File

@@ -76,6 +76,56 @@ def _get_result_count() -> int:
return 5
# Canonical SafeSearch levels: "strict" (default), "moderate", "off".
# Each provider has its own knob name and value space — see _safesearch_for(...).
_SAFESEARCH_LEVELS = ("strict", "moderate", "off")


def _get_safesearch_level() -> str:
    """Return the configured SafeSearch level as one of _SAFESEARCH_LEVELS.

    Reads ``search_safesearch`` from the search settings and normalizes it
    (surrounding whitespace removed, case-insensitive). Canonical values are
    returned unchanged, known aliases are mapped to their canonical level,
    and anything else falls back to "strict" so that adult / spammy URLs do
    not bleed into research and web_search results because of a typo.

    Returns:
        "strict", "moderate" or "off".
    """
    settings = _get_search_settings()
    # Falsy values (missing key, None, "", and also a bare numeric 0 / False)
    # keep the fail-closed default: only an explicit string such as "off",
    # "none", "disabled" or "0" turns filtering off.
    raw = settings.get("search_safesearch") or "strict"
    # A hand-edited config may hold a bare number or boolean (1 instead of
    # "1"); coerce to text first so normalizing cannot raise AttributeError
    # and take down every search call.
    level = str(raw).strip().lower()
    if level in _SAFESEARCH_LEVELS:
        return level
    # Accept a few common aliases so a manually-edited config doesn't
    # silently lose SafeSearch — fall back to strict on anything unknown.
    aliases = {
        "on": "strict", "high": "strict", "2": "strict",
        "medium": "moderate", "1": "moderate", "default": "moderate",
        "none": "off", "disabled": "off", "0": "off",
    }
    return aliases.get(level, "strict")
def _safesearch_for(provider: str) -> Optional[str]:
    """Map the canonical SafeSearch level onto one provider's native value.

    ``provider`` is one of "searxng", "brave", "duckduckgo_lib",
    "duckduckgo_html", "google_pse" or "serper".

    Returns the string to send as that provider's SafeSearch parameter, or
    None when the parameter should be left out of the request. None is
    returned for "google_pse" / "serper" when the level is "off", and for
    any provider name not listed above.
    """
    level = _get_safesearch_level()

    # Brave's value space is already strict / moderate / off.
    if provider == "brave":
        return level

    # Google PSE and Serper (which proxies Google's `safe` param) only know
    # 'active'; there is no middle tier, so moderate is treated as active
    # and "off" is expressed by omitting the parameter.
    if provider in ("google_pse", "serper"):
        return "active" if level != "off" else None

    translations = {
        # SearXNG: integer 0/1/2
        "searxng": {"strict": "2", "moderate": "1", "off": "0"},
        # duckduckgo-search library: on / moderate / off
        "duckduckgo_lib": {"strict": "on", "moderate": "moderate", "off": "off"},
        # DDG HTML endpoint kp: 1 strict / -1 moderate / -2 off
        "duckduckgo_html": {"strict": "1", "moderate": "-1", "off": "-2"},
    }
    table = translations.get(provider)
    if table is None:
        return None
    return table[level]
# ── SearXNG ──
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
@@ -106,7 +156,8 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
# Pin English for ALL searches — without it SearXNG mixes languages and
# brand-ambiguous terms bleed in foreign SEO pages (Honda "Odyssey" JP,
# Japanese "Trojan" malware blogs, Chinese math forums for "Polyphemus").
params = {"q": query, "format": "json", "language": "en"}
params = {"q": query, "format": "json", "language": "en",
"safesearch": _safesearch_for("searxng")}
q_lc = query.lower()
is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
if is_news and categories == "general":
@@ -155,6 +206,7 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
"format": "json",
"language": "en",
"categories": "general",
"safesearch": _safesearch_for("searxng"),
}
if _GENERAL_ENGINES:
fallback["engines"] = _GENERAL_ENGINES
@@ -205,7 +257,7 @@ def searxng_search(query, max_results=10):
try:
response = httpx.get(
f"{instance}/search",
params={"q": query},
params={"q": query, "safesearch": _safesearch_for("searxng")},
headers=req_headers,
timeout=10,
)
@@ -250,7 +302,8 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
return []
headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
params = {"q": enhanced_query, "count": count}
params = {"q": enhanced_query, "count": count,
"safesearch": _safesearch_for("brave")}
if time_filter:
time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
if time_filter in time_map:
@@ -326,7 +379,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
try:
response = httpx.get(
"https://html.duckduckgo.com/html/",
params={"q": query},
params={"q": query, "kp": _safesearch_for("duckduckgo_html")},
headers={"User-Agent": "Mozilla/5.0"},
timeout=REQUEST_TIMEOUT,
)
@@ -365,7 +418,8 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
try:
ddgs = DDGS()
raw = ddgs.text(query, max_results=count, timelimit=timelimit)
raw = ddgs.text(query, max_results=count, timelimit=timelimit,
safesearch=_safesearch_for("duckduckgo_lib"))
results = []
for item in raw:
url = item.get("href", "")
@@ -407,6 +461,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
"q": query,
"num": min(count, 10), # Google PSE max is 10 per request
}
safe = _safesearch_for("google_pse")
if safe:
params["safe"] = safe
if time_filter:
# dateRestrict: d[number], w[number], m[number], y[number]
time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
@@ -511,6 +568,9 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None
"q": query,
"num": count,
}
safe = _safesearch_for("serper")
if safe:
payload["safe"] = safe
if time_filter:
time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
if time_filter in time_map:

View File

@@ -55,6 +55,26 @@ DEFAULT_SETTINGS = {
"search_fallback_chain": ["duckduckgo"],
"search_url": "",
"search_result_count": 5,
# SafeSearch level applied to every provider that exposes one.
# "strict" — block adult / explicit results (default; matches what users
# expect from a research tool and avoids unrelated NSFW URLs
# bleeding in via provider "related" / spam recommendations)
# "moderate" — filter explicit results but allow suggestive content
# (mapped to each provider's middle tier; Google PSE and Serper have
# none, so they filter as for "strict")
# "off" — disable filtering entirely (advanced users only)
#
# Providers that honor this setting (translated to each provider's native
# param in src/search/providers.py:_safesearch_for):
# SearXNG safesearch=0/1/2 (JSON API, HTML scrape, news fallback)
# Brave Search safesearch=off/moderate/strict
# DuckDuckGo safesearch=off/moderate/on (library); kp=-2/-1/1 (HTML fallback)
# Google PSE safe=active (omitted for "off"; PSE has no middle tier)
# Serper.dev safe=active (omitted for "off"; proxies Google's `safe`)
# Providers NOT touched: Tavily (no SafeSearch knob; filters at index time)
# and any custom backend reached via search_url — they keep whatever the
# backend itself decides, so operators stay in control of self-hosted /
# niche search instances.
"search_safesearch": "strict",
"brave_api_key": "",
"google_pse_key": "",
"google_pse_cx": "",