From c0466274ed8254ac0787eaa72c3c47c605ce977b Mon Sep 17 00:00:00 2001
From: BSG-Walter <walter.gonzalez@live.com.ar>
Date: Mon, 1 Jun 2026 19:42:01 -0300
Subject: [PATCH] fix: resolve DuckDuckGo redirect URLs in HTML fallback search

The DuckDuckGo HTML fallback returns redirect URLs (//duckduckgo.com/l/?uddg=...)
instead of actual page URLs. This caused fetch_webpage_content() to reject them
instantly because _public_http_url() requires an http/https scheme, making search
results unfetchable in deep research mode.

Added _resolve_url(), applied to each result link in _html_fallback(), to:
- Convert protocol-relative URLs (leading "//") to absolute by prefixing https:
- Convert root-relative URLs (leading "/") to absolute against
  https://html.duckduckgo.com
- Extract the real URL from the uddg query parameter of DuckDuckGo's /l/
  redirect links
---
 services/search/providers.py | 22 +++++++++++++++++++++-
 src/search/providers.py      | 24 +++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/services/search/providers.py b/services/search/providers.py
index c760b5a..b7cdce6 100644
--- a/services/search/providers.py
+++ b/services/search/providers.py
@@ -4,6 +4,7 @@ import json
 import logging
 import os
 from typing import List, Optional
+from urllib.parse import urljoin, urlparse, parse_qs
 
 import httpx
 from bs4 import BeautifulSoup
@@ -299,6 +300,25 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
 
 def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
     """Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
+    def _resolve_url(raw: str) -> str:
+        """Resolve DuckDuckGo redirect URL to the actual destination URL."""
+        if not raw:
+            return raw
+        resolved = raw
+        if resolved.startswith("//"):
+            resolved = "https:" + resolved
+        elif resolved.startswith("/"):
+            resolved = urljoin("https://html.duckduckgo.com", resolved)
+        try:
+            parsed = urlparse(resolved)
+            if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
+                qs = parse_qs(parsed.query)
+                if "uddg" in qs:
+                    return qs["uddg"][0]
+        except Exception:
+            pass
+        return resolved
+
     def _html_fallback() -> List[dict]:
         try:
             response = httpx.get(
@@ -314,7 +334,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
                 link = result.select_one(".result__a")
                 if not link:
                     continue
-                url = link.get("href", "")
+                url = _resolve_url(link.get("href", ""))
                 if not url:
                     continue
                 snippet_el = result.select_one(".result__snippet")
diff --git a/src/search/providers.py b/src/search/providers.py
index f60a024..ee16a50 100644
--- a/src/search/providers.py
+++ b/src/search/providers.py
@@ -4,6 +4,7 @@ import json
 import logging
 import os
 from typing import List, Optional
+from urllib.parse import urljoin, urlparse, parse_qs
 
 import httpx
 from bs4 import BeautifulSoup
@@ -300,6 +301,27 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
 
 def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
     """Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
+    def _resolve_url(raw: str) -> str:
+        """Resolve DuckDuckGo redirect URL to the actual destination URL."""
+        if not raw:
+            return raw
+        # Handle protocol-relative URLs
+        resolved = raw
+        if resolved.startswith("//"):
+            resolved = "https:" + resolved
+        elif resolved.startswith("/"):
+            resolved = urljoin("https://html.duckduckgo.com", resolved)
+        # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
+        try:
+            parsed = urlparse(resolved)
+            if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
+                qs = parse_qs(parsed.query)
+                if "uddg" in qs:
+                    return qs["uddg"][0]
+        except Exception:
+            pass
+        return resolved
+
     def _html_fallback() -> List[dict]:
         try:
             response = httpx.get(
@@ -315,7 +337,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
                 link = result.select_one(".result__a")
                 if not link:
                     continue
-                url = link.get("href", "")
+                url = _resolve_url(link.get("href", ""))
                 if not url:
                     continue
                 snippet_el = result.select_one(".result__snippet")