diff --git a/services/search/providers.py b/services/search/providers.py
index aacbee3..f2d4a58 100644
--- a/services/search/providers.py
+++ b/services/search/providers.py
@@ -492,7 +492,6 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
         if response.status_code == 429:
             raise RateLimitError("Google PSE rate limit hit")
         response.raise_for_status()
-        data = response.json()
     except httpx.RequestError as e:
         error_logger.error(f"Google PSE search failed: {e}")
         return []
@@ -500,6 +499,12 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
         error_logger.error(str(e))
         return []
 
+    try:
+        data = response.json()
+    except ValueError as e:  # also covers UnicodeDecodeError, not just JSONDecodeError
+        error_logger.error(f"Google PSE returned invalid JSON: {e}")
+        return []
+
     results = []
     for item in data.get("items", [])[:count]:
         url = item.get("link", "")
@@ -544,7 +549,6 @@ def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None
         if response.status_code == 429:
             raise RateLimitError("Tavily rate limit hit")
         response.raise_for_status()
-        data = response.json()
     except httpx.RequestError as e:
         error_logger.error(f"Tavily search failed: {e}")
         return []
@@ -552,6 +556,12 @@ def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None
         error_logger.error(str(e))
         return []
 
+    try:
+        data = response.json()
+    except ValueError as e:  # also covers UnicodeDecodeError, not just JSONDecodeError
+        error_logger.error(f"Tavily returned invalid JSON: {e}")
+        return []
+
     results = []
     for item in data.get("results", [])[:count]:
         url = item.get("url", "")
@@ -599,7 +609,6 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None
         if response.status_code == 429:
             raise RateLimitError("Serper rate limit hit")
         response.raise_for_status()
-        data = response.json()
     except httpx.RequestError as e:
         error_logger.error(f"Serper search failed: {e}")
         return []
@@ -607,6 +616,12 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None
         error_logger.error(str(e))
         return []
 
+    try:
+        data = response.json()
+    except ValueError as e:  # also covers UnicodeDecodeError, not just JSONDecodeError
+        error_logger.error(f"Serper returned invalid JSON: {e}")
+        return []
+
     results = []
     for item in data.get("organic", [])[:count]:
         url = item.get("link", "")
diff --git a/tests/test_search_provider_json.py b/tests/test_search_provider_json.py
new file mode 100644
index 0000000..61c730f
--- /dev/null
+++ b/tests/test_search_provider_json.py
@@ -0,0 +1,78 @@
+"""Search providers must not raise on a non-JSON response body (issue #1129).
+
+`brave_search` already wraps `response.json()` in its own try/except that catches
+`json.JSONDecodeError` and returns []. The Tavily, Serper, and Google PSE
+providers parsed JSON inside the network try block, which only caught
+`httpx.RequestError`/`RateLimitError` — so a provider returning a non-JSON body
+(an HTML error page, a truncated/empty body, a gateway error) raised an
+UNCAUGHT `json.JSONDecodeError` that aborted the search in the background. These
+pin that all four providers degrade to [] on malformed JSON, matching brave.
+"""
+
+import json
+
+import pytest
+
+from services.search import providers
+
+
+class _BadJSONResponse:
+    """A 200 response whose body is not valid JSON (e.g. an HTML error page)."""
+    status_code = 200
+
+    def raise_for_status(self):
+        return None
+
+    def json(self):
+        raise json.JSONDecodeError("Expecting value", "down", 0)
+
+
+@pytest.fixture(autouse=True)
+def _offline(monkeypatch):
+    # Keep everything offline + deterministic: no settings/DB, keys via env, and
+    # both httpx verbs return a body that fails to decode.
+    monkeypatch.setattr(providers, "_get_search_settings", lambda: {}, raising=False)
+    monkeypatch.setattr(providers, "_safesearch_for", lambda *_a, **_k: None, raising=False)
+    monkeypatch.setenv("DATA_BRAVE_API_KEY", "k")
+    monkeypatch.setenv("TAVILY_API_KEY", "k")
+    monkeypatch.setenv("SERPER_API_KEY", "k")
+    monkeypatch.setenv("GOOGLE_API_KEY", "k")
+    monkeypatch.setenv("GOOGLE_PSE_CX", "cx")
+    monkeypatch.setattr(providers.httpx, "post", lambda *a, **k: _BadJSONResponse())
+    monkeypatch.setattr(providers.httpx, "get", lambda *a, **k: _BadJSONResponse())
+
+
+def test_tavily_malformed_json_returns_empty():
+    assert providers.tavily_search("hello") == []
+
+
+def test_serper_malformed_json_returns_empty():
+    assert providers.serper_search("hello") == []
+
+
+def test_google_pse_malformed_json_returns_empty():
+    assert providers.google_pse_search("hello") == []
+
+
+def test_brave_malformed_json_returns_empty():
+    # Already correct on main — guards against regressing the reference behaviour.
+    assert providers.brave_search("hello") == []
+
+
+class _UndecodableResponse(_BadJSONResponse):
+    """A 200 response whose raw bytes cannot be decoded as text at all."""
+
+    def json(self):
+        # json.loads on undecodable bytes raises UnicodeDecodeError, which is a
+        # ValueError but NOT a json.JSONDecodeError.
+        return json.loads(b"\x80not-json")
+
+
+@pytest.mark.parametrize(
+    "search_name", ["tavily_search", "serper_search", "google_pse_search"]
+)
+def test_undecodable_body_returns_empty(monkeypatch, search_name):
+    # Override the autouse fixture's stub with the undecodable variant.
+    monkeypatch.setattr(providers.httpx, "post", lambda *a, **k: _UndecodableResponse())
+    monkeypatch.setattr(providers.httpx, "get", lambda *a, **k: _UndecodableResponse())
+    assert getattr(providers, search_name)("hello") == []