From c0466274ed8254ac0787eaa72c3c47c605ce977b Mon Sep 17 00:00:00 2001 From: BSG-Walter Date: Mon, 1 Jun 2026 19:42:01 -0300 Subject: [PATCH] fix: resolve DuckDuckGo redirect URLs in HTML fallback search The DuckDuckGo HTML fallback returns redirect URLs (//duckduckgo.com/l/?uddg=...) instead of actual page URLs. This caused fetch_webpage_content() to reject them instantly because _public_http_url() requires an http/https scheme, making search results unfetchable in deep research mode. Added _resolve_url() to: - Convert protocol-relative URLs to absolute (https:) - Convert path-relative URLs to absolute - Extract the real URL from DuckDuckGo's /l/?uddg= redirect parameters --- services/search/providers.py | 22 +++++++++++++++++++++- src/search/providers.py | 24 +++++++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/services/search/providers.py b/services/search/providers.py index c760b5a..b7cdce6 100644 --- a/services/search/providers.py +++ b/services/search/providers.py @@ -4,6 +4,7 @@ import json import logging import os from typing import List, Optional +from urllib.parse import urljoin, urlparse, parse_qs import httpx from bs4 import BeautifulSoup @@ -299,6 +300,25 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]: """Search using DuckDuckGo via the duckduckgo-search library. No API key needed.""" + def _resolve_url(raw: str) -> str: + """Resolve DuckDuckGo redirect URL to the actual destination URL.""" + if not raw: + return raw + resolved = raw + if resolved.startswith("//"): + resolved = "https:" + resolved + elif resolved.startswith("/"): + resolved = urljoin("https://html.duckduckgo.com", resolved) + try: + parsed = urlparse(resolved) + if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l": + qs = parse_qs(parsed.query) + if "uddg" in qs: + return qs["uddg"][0] + except Exception: + pass + return resolved + def _html_fallback() -> List[dict]: try: response = httpx.get( @@ -314,7 +334,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = link = result.select_one(".result__a") if not link: continue - url = link.get("href", "") + url = _resolve_url(link.get("href", "")) if not url: continue snippet_el = result.select_one(".result__snippet") diff --git a/src/search/providers.py b/src/search/providers.py index f60a024..ee16a50 100644 --- a/src/search/providers.py +++ b/src/search/providers.py @@ -4,6 +4,7 @@ import json import logging import os from typing import List, Optional +from urllib.parse import urljoin, urlparse, parse_qs import httpx from bs4 import BeautifulSoup @@ -300,6 +301,27 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]: """Search using DuckDuckGo via the duckduckgo-search library. No API key needed.""" + def _resolve_url(raw: str) -> str: + """Resolve DuckDuckGo redirect URL to the actual destination URL.""" + if not raw: + return raw + # Handle protocol-relative URLs + resolved = raw + if resolved.startswith("//"): + resolved = "https:" + resolved + elif resolved.startswith("/"): + resolved = urljoin("https://html.duckduckgo.com", resolved) + # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect + try: + parsed = urlparse(resolved) + if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l": + qs = parse_qs(parsed.query) + if "uddg" in qs: + return qs["uddg"][0] + except Exception: + pass + return resolved + def _html_fallback() -> List[dict]: try: response = httpx.get( @@ -315,7 +337,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = link = result.select_one(".result__a") if not link: continue - url = link.get("href", "") + url = _resolve_url(link.get("href", "")) if not url: continue snippet_el = result.select_one(".result__snippet")