From fca8d68abaaa5358ff5b45f595a32020a1cee558 Mon Sep 17 00:00:00 2001 From: mist Date: Tue, 2 Jun 2026 06:25:56 +0300 Subject: [PATCH] Match host, not substring, when resolving DuckDuckGo redirects (#886) _resolve_ddg_redirect (the DuckDuckGo /l/?uddg= redirect resolver used on every HTML-fallback result href) gated on `"duckduckgo.com" in parsed.hostname`. That substring test also matches look-alike hosts like `duckduckgo.com.evil.com` and `notduckduckgo.com`, so a result link on such a host would be silently rewritten to its embedded `uddg` target. Same substring-vs-hostname pitfall fixed for provider detection in 54ecfa3. Match the host properly: exactly `duckduckgo.com` or a `.duckduckgo.com` subdomain. Genuine redirects (`//duckduckgo.com/l/...`, and relative `/l/...` hrefs resolved against `html.duckduckgo.com`) keep working. The resolver was a closure inside duckduckgo_search; lifted it (plus the new _is_duckduckgo_host helper) to module scope so it can be unit-tested directly. Adds tests/test_ddg_redirect_resolution.py (red on the look-alike case before this change, green after). --- src/search/providers.py | 52 +++++++++++++++------------ tests/test_ddg_redirect_resolution.py | 37 +++++++++++++++++++ 2 files changed, 67 insertions(+), 22 deletions(-) create mode 100644 tests/test_ddg_redirect_resolution.py diff --git a/src/search/providers.py b/src/search/providers.py index 4ec493b..5d8b2e6 100644 --- a/src/search/providers.py +++ b/src/search/providers.py @@ -352,29 +352,37 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None # ── DuckDuckGo (free, no key) ── +def _is_duckduckgo_host(host: str) -> bool: + """True only for duckduckgo.com and its subdomains — not substring look-alikes + such as ``duckduckgo.com.evil.com`` or ``notduckduckgo.com``.""" + host = (host or "").lower() + return host == "duckduckgo.com" or host.endswith(".duckduckgo.com") + + +def _resolve_ddg_redirect(raw: str) -> str: + """Resolve a DuckDuckGo /l/?uddg= redirect URL to its destination.""" + if not raw: + return raw + # Handle protocol-relative URLs + resolved = raw + if resolved.startswith("//"): + resolved = "https:" + resolved + elif resolved.startswith("/"): + resolved = urljoin("https://html.duckduckgo.com", resolved) + # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect + try: + parsed = urlparse(resolved) + if _is_duckduckgo_host(parsed.hostname) and parsed.path.rstrip("/") == "/l": + qs = parse_qs(parsed.query) + if "uddg" in qs: + return qs["uddg"][0] + except Exception: + pass + return resolved + + def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]: """Search using DuckDuckGo via the duckduckgo-search library. No API key needed.""" - def _resolve_url(raw: str) -> str: - """Resolve DuckDuckGo redirect URL to the actual destination URL.""" - if not raw: - return raw - # Handle protocol-relative URLs - resolved = raw - if resolved.startswith("//"): - resolved = "https:" + resolved - elif resolved.startswith("/"): - resolved = urljoin("https://html.duckduckgo.com", resolved) - # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect - try: - parsed = urlparse(resolved) - if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l": - qs = parse_qs(parsed.query) - if "uddg" in qs: - return qs["uddg"][0] - except Exception: - pass - return resolved - def _html_fallback() -> List[dict]: try: response = httpx.get( @@ -390,7 +398,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = link = result.select_one(".result__a") if not link: continue - url = _resolve_url(link.get("href", "")) + url = _resolve_ddg_redirect(link.get("href", "")) if not url: continue snippet_el = result.select_one(".result__snippet") diff --git a/tests/test_ddg_redirect_resolution.py b/tests/test_ddg_redirect_resolution.py new file mode 100644 index 0000000..80ee9f4 --- /dev/null +++ b/tests/test_ddg_redirect_resolution.py @@ -0,0 +1,37 @@ +"""Resolving DuckDuckGo /l/?uddg= redirects must match the host, not a substring. + +`_resolve_ddg_redirect` only extracts the embedded `uddg` destination when the +redirect link is actually on DuckDuckGo. The host check used +`"duckduckgo.com" in parsed.hostname`, which also matches look-alike hosts such +as `duckduckgo.com.evil.com` or `notduckduckgo.com` — so a result link on one of +those would be silently rewritten to its embedded `uddg` target. Same +substring-vs-hostname pitfall fixed for provider detection in 54ecfa3. +""" +from src.search.providers import _resolve_ddg_redirect, _is_duckduckgo_host + + +def test_resolves_genuine_ddg_redirects(): + # protocol-relative DDG redirect + assert _resolve_ddg_redirect( + "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com" + ) == "https://example.com" + # relative href -> resolved against html.duckduckgo.com (a real DDG subdomain) + assert _resolve_ddg_redirect( + "/l/?uddg=https%3A%2F%2Fexample.com" + ) == "https://example.com" + + +def test_ignores_lookalike_hosts(): + for host in ("duckduckgo.com.evil.com", "notduckduckgo.com"): + url = f"https://{host}/l/?uddg=https%3A%2F%2Fexample.com" + # Must be returned unchanged — it is NOT a DuckDuckGo redirect. + assert _resolve_ddg_redirect(url) == url + + +def test_host_matcher(): + assert _is_duckduckgo_host("duckduckgo.com") + assert _is_duckduckgo_host("html.duckduckgo.com") + assert _is_duckduckgo_host("lite.duckduckgo.com") + assert not _is_duckduckgo_host("duckduckgo.com.evil.com") + assert not _is_duckduckgo_host("notduckduckgo.com") + assert not _is_duckduckgo_host("")