Match host, not substring, when resolving DuckDuckGo redirects (#886)

_resolve_ddg_redirect (the DuckDuckGo /l/?uddg= redirect resolver used on every
HTML-fallback result href) gated on `"duckduckgo.com" in parsed.hostname`. That
substring test also matches look-alike hosts like `duckduckgo.com.evil.com` and
`notduckduckgo.com`, so a result link on such a host would be silently rewritten
to its embedded `uddg` target. Same substring-vs-hostname pitfall fixed for
provider detection in 54ecfa3.

Match the host properly: exactly `duckduckgo.com` or a `.duckduckgo.com`
subdomain. Genuine redirects (`//duckduckgo.com/l/...`, and relative `/l/...`
hrefs resolved against `html.duckduckgo.com`) keep working.

The resolver was a closure inside duckduckgo_search; lifted it (plus the new
_is_duckduckgo_host helper) to module scope so it can be unit-tested directly.

Adds tests/test_ddg_redirect_resolution.py (red on the look-alike case before
this change, green after).
This commit is contained in:
mist
2026-06-02 06:25:56 +03:00
committed by GitHub
parent f691537472
commit fca8d68aba
2 changed files with 67 additions and 22 deletions

View File

@@ -352,29 +352,37 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
# ── DuckDuckGo (free, no key) ──
def _is_duckduckgo_host(host: str) -> bool:
"""True only for duckduckgo.com and its subdomains — not substring look-alikes
such as ``duckduckgo.com.evil.com`` or ``notduckduckgo.com``."""
host = (host or "").lower()
return host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
def _resolve_ddg_redirect(raw: str) -> str:
"""Resolve a DuckDuckGo /l/?uddg= redirect URL to its destination."""
if not raw:
return raw
# Handle protocol-relative URLs
resolved = raw
if resolved.startswith("//"):
resolved = "https:" + resolved
elif resolved.startswith("/"):
resolved = urljoin("https://html.duckduckgo.com", resolved)
# Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
try:
parsed = urlparse(resolved)
if _is_duckduckgo_host(parsed.hostname) and parsed.path.rstrip("/") == "/l":
qs = parse_qs(parsed.query)
if "uddg" in qs:
return qs["uddg"][0]
except Exception:
pass
return resolved
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
def _resolve_url(raw: str) -> str:
"""Resolve DuckDuckGo redirect URL to the actual destination URL."""
if not raw:
return raw
# Handle protocol-relative URLs
resolved = raw
if resolved.startswith("//"):
resolved = "https:" + resolved
elif resolved.startswith("/"):
resolved = urljoin("https://html.duckduckgo.com", resolved)
# Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
try:
parsed = urlparse(resolved)
if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
qs = parse_qs(parsed.query)
if "uddg" in qs:
return qs["uddg"][0]
except Exception:
pass
return resolved
def _html_fallback() -> List[dict]:
try:
response = httpx.get(
@@ -390,7 +398,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
link = result.select_one(".result__a")
if not link:
continue
url = _resolve_url(link.get("href", ""))
url = _resolve_ddg_redirect(link.get("href", ""))
if not url:
continue
snippet_el = result.select_one(".result__snippet")

View File

@@ -0,0 +1,37 @@
"""Resolving DuckDuckGo /l/?uddg= redirects must match the host, not a substring.
`_resolve_ddg_redirect` only extracts the embedded `uddg` destination when the
redirect link is actually on DuckDuckGo. The host check used
`"duckduckgo.com" in parsed.hostname`, which also matches look-alike hosts such
as `duckduckgo.com.evil.com` or `notduckduckgo.com` — so a result link on one of
those would be silently rewritten to its embedded `uddg` target. Same
substring-vs-hostname pitfall fixed for provider detection in 54ecfa3.
"""
from src.search.providers import _resolve_ddg_redirect, _is_duckduckgo_host
def test_resolves_genuine_ddg_redirects():
# protocol-relative DDG redirect
assert _resolve_ddg_redirect(
"//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com"
) == "https://example.com"
# relative href -> resolved against html.duckduckgo.com (a real DDG subdomain)
assert _resolve_ddg_redirect(
"/l/?uddg=https%3A%2F%2Fexample.com"
) == "https://example.com"
def test_ignores_lookalike_hosts():
for host in ("duckduckgo.com.evil.com", "notduckduckgo.com"):
url = f"https://{host}/l/?uddg=https%3A%2F%2Fexample.com"
# Must be returned unchanged — it is NOT a DuckDuckGo redirect.
assert _resolve_ddg_redirect(url) == url
def test_host_matcher():
assert _is_duckduckgo_host("duckduckgo.com")
assert _is_duckduckgo_host("html.duckduckgo.com")
assert _is_duckduckgo_host("lite.duckduckgo.com")
assert not _is_duckduckgo_host("duckduckgo.com.evil.com")
assert not _is_duckduckgo_host("notduckduckgo.com")
assert not _is_duckduckgo_host("")