Match host, not substring, when resolving DuckDuckGo redirects (#886)
_resolve_ddg_redirect (the DuckDuckGo /l/?uddg= redirect resolver used on every
HTML-fallback result href) gated on `"duckduckgo.com" in parsed.hostname`. That
substring test also matches look-alike hosts like `duckduckgo.com.evil.com` and
`notduckduckgo.com`, so a result link on such a host would be silently rewritten
to its embedded `uddg` target. Same substring-vs-hostname pitfall fixed for
provider detection in 54ecfa3.
Match the host properly: exactly `duckduckgo.com` or a `.duckduckgo.com`
subdomain. Genuine redirects (`//duckduckgo.com/l/...`, and relative `/l/...`
hrefs resolved against `html.duckduckgo.com`) keep working.
The resolver was a closure inside duckduckgo_search; lifted it (plus the new
_is_duckduckgo_host helper) to module scope so it can be unit-tested directly.
Adds tests/test_ddg_redirect_resolution.py (red on the look-alike case before
this change, green after).
This commit is contained in:
@@ -352,29 +352,37 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
||||
|
||||
# ── DuckDuckGo (free, no key) ──
|
||||
|
||||
def _is_duckduckgo_host(host: str) -> bool:
|
||||
"""True only for duckduckgo.com and its subdomains — not substring look-alikes
|
||||
such as ``duckduckgo.com.evil.com`` or ``notduckduckgo.com``."""
|
||||
host = (host or "").lower()
|
||||
return host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
|
||||
|
||||
|
||||
def _resolve_ddg_redirect(raw: str) -> str:
|
||||
"""Resolve a DuckDuckGo /l/?uddg= redirect URL to its destination."""
|
||||
if not raw:
|
||||
return raw
|
||||
# Handle protocol-relative URLs
|
||||
resolved = raw
|
||||
if resolved.startswith("//"):
|
||||
resolved = "https:" + resolved
|
||||
elif resolved.startswith("/"):
|
||||
resolved = urljoin("https://html.duckduckgo.com", resolved)
|
||||
# Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
|
||||
try:
|
||||
parsed = urlparse(resolved)
|
||||
if _is_duckduckgo_host(parsed.hostname) and parsed.path.rstrip("/") == "/l":
|
||||
qs = parse_qs(parsed.query)
|
||||
if "uddg" in qs:
|
||||
return qs["uddg"][0]
|
||||
except Exception:
|
||||
pass
|
||||
return resolved
|
||||
|
||||
|
||||
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
||||
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
||||
def _resolve_url(raw: str) -> str:
|
||||
"""Resolve DuckDuckGo redirect URL to the actual destination URL."""
|
||||
if not raw:
|
||||
return raw
|
||||
# Handle protocol-relative URLs
|
||||
resolved = raw
|
||||
if resolved.startswith("//"):
|
||||
resolved = "https:" + resolved
|
||||
elif resolved.startswith("/"):
|
||||
resolved = urljoin("https://html.duckduckgo.com", resolved)
|
||||
# Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
|
||||
try:
|
||||
parsed = urlparse(resolved)
|
||||
if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
|
||||
qs = parse_qs(parsed.query)
|
||||
if "uddg" in qs:
|
||||
return qs["uddg"][0]
|
||||
except Exception:
|
||||
pass
|
||||
return resolved
|
||||
|
||||
def _html_fallback() -> List[dict]:
|
||||
try:
|
||||
response = httpx.get(
|
||||
@@ -390,7 +398,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
||||
link = result.select_one(".result__a")
|
||||
if not link:
|
||||
continue
|
||||
url = _resolve_url(link.get("href", ""))
|
||||
url = _resolve_ddg_redirect(link.get("href", ""))
|
||||
if not url:
|
||||
continue
|
||||
snippet_el = result.select_one(".result__snippet")
|
||||
|
||||
37
tests/test_ddg_redirect_resolution.py
Normal file
37
tests/test_ddg_redirect_resolution.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Resolving DuckDuckGo /l/?uddg= redirects must match the host, not a substring.
|
||||
|
||||
`_resolve_ddg_redirect` only extracts the embedded `uddg` destination when the
|
||||
redirect link is actually on DuckDuckGo. The host check used
|
||||
`"duckduckgo.com" in parsed.hostname`, which also matches look-alike hosts such
|
||||
as `duckduckgo.com.evil.com` or `notduckduckgo.com` — so a result link on one of
|
||||
those would be silently rewritten to its embedded `uddg` target. Same
|
||||
substring-vs-hostname pitfall fixed for provider detection in 54ecfa3.
|
||||
"""
|
||||
from src.search.providers import _resolve_ddg_redirect, _is_duckduckgo_host
|
||||
|
||||
|
||||
def test_resolves_genuine_ddg_redirects():
|
||||
# protocol-relative DDG redirect
|
||||
assert _resolve_ddg_redirect(
|
||||
"//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com"
|
||||
) == "https://example.com"
|
||||
# relative href -> resolved against html.duckduckgo.com (a real DDG subdomain)
|
||||
assert _resolve_ddg_redirect(
|
||||
"/l/?uddg=https%3A%2F%2Fexample.com"
|
||||
) == "https://example.com"
|
||||
|
||||
|
||||
def test_ignores_lookalike_hosts():
|
||||
for host in ("duckduckgo.com.evil.com", "notduckduckgo.com"):
|
||||
url = f"https://{host}/l/?uddg=https%3A%2F%2Fexample.com"
|
||||
# Must be returned unchanged — it is NOT a DuckDuckGo redirect.
|
||||
assert _resolve_ddg_redirect(url) == url
|
||||
|
||||
|
||||
def test_host_matcher():
|
||||
assert _is_duckduckgo_host("duckduckgo.com")
|
||||
assert _is_duckduckgo_host("html.duckduckgo.com")
|
||||
assert _is_duckduckgo_host("lite.duckduckgo.com")
|
||||
assert not _is_duckduckgo_host("duckduckgo.com.evil.com")
|
||||
assert not _is_duckduckgo_host("notduckduckgo.com")
|
||||
assert not _is_duckduckgo_host("")
|
||||
Reference in New Issue
Block a user