From 77313170c67f21cb58b347b1612934b9c27fb9dd Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 00:36:01 +0100 Subject: [PATCH] fix: search query helpers crash on a non-string query (#1604) --- services/search/query.py | 10 +++++++ tests/test_search_query_nonstring.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 tests/test_search_query_nonstring.py diff --git a/services/search/query.py b/services/search/query.py index eb54ee1..03596ed 100644 --- a/services/search/query.py +++ b/services/search/query.py @@ -13,6 +13,8 @@ logger = logging.getLogger(__name__) # ---------------------------------------------------------------------- def _detect_question_type(query: str) -> Optional[str]: """Return the leading question word if present (who, what, when, where, why, how).""" + if not isinstance(query, str): + return None q = query.strip().lower() for word in ("who", "what", "when", "where", "why", "how"): # Require a whole-word match: a bare prefix mis-flags ordinary queries @@ -45,12 +47,16 @@ def _extract_entities(query: str) -> Dict[str, List[str]]: def _split_multi_part(query: str) -> List[str]: """Split a query into sub-queries on common conjunctions.""" + if not isinstance(query, str): + return [] parts = re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I) return [p.strip() for p in parts if p.strip()] def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]: """Detect a 'site:example.com' token. Returns (query_without_token, site_or_None).""" + if not isinstance(query, str): + return "", None match = re.search(r"\bsite:([^\s]+)", query, flags=re.I) if match: site = match.group(1) @@ -71,6 +77,8 @@ def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) -> def enhance_query(original_query: str) -> Tuple[str, Optional[str]]: """Process the original query: site filter, question type boosts, entity extraction.""" + if not isinstance(original_query, str): + original_query = "" query_without_site, site = _extract_site_filter(original_query) sub_queries = _split_multi_part(query_without_site) @@ -120,6 +128,8 @@ def build_enhanced_query(query: str, time_filter: str = None) -> str: def _is_news_query(query: str) -> bool: """Lightweight heuristic to decide if a query is news-oriented.""" news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"} + if not isinstance(query, str): + return False tokens = set(re.findall(r"\b\w+\b", query.lower())) return bool(tokens & news_terms) diff --git a/tests/test_search_query_nonstring.py b/tests/test_search_query_nonstring.py new file mode 100644 index 0000000..f8c7672 --- /dev/null +++ b/tests/test_search_query_nonstring.py @@ -0,0 +1,40 @@ +"""Regression: search query helpers must tolerate a non-string query. + +These helpers did `query.strip()`, `query.lower()`, `re.split(..., query)`, +`re.search(..., query)` directly, so a None / non-string query (e.g. from a +caller that didn't coerce) raised TypeError/AttributeError. They now return a +safe default for non-strings. +""" +import importlib.machinery +import importlib.util +from pathlib import Path + +_PATH = Path(__file__).resolve().parents[1] / "services" / "search" / "query.py" + + +def _load(): + # Load the module file directly so the package __init__ (which imports + # httpx) isn't required. + loader = importlib.machinery.SourceFileLoader("odysseus_search_query", str(_PATH)) + spec = importlib.util.spec_from_loader(loader.name, loader) + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) + return module + + +def test_helpers_handle_none(): + q = _load() + assert q._detect_question_type(None) is None + assert q._split_multi_part(None) == [] + assert q._extract_site_filter(None) == ("", None) + assert q._is_news_query(None) is False + # entry points coerce and do not raise + assert isinstance(q.enhance_query(None)[0], str) + assert isinstance(q.build_enhanced_query(123), str) + + +def test_valid_query_still_works(): + q = _load() + assert q._detect_question_type("who is bob") == "who" + assert q._is_news_query("latest news today") is True + assert q._extract_site_filter("cats site:x.com")[1] == "x.com"