fix: search query helpers crash on a non-string query (#1604)

This commit is contained in:
Afonso Coutinho
2026-06-03 00:36:01 +01:00
committed by GitHub
parent 0d25dfb5f4
commit 77313170c6
2 changed files with 50 additions and 0 deletions

View File

@@ -13,6 +13,8 @@ logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------
def _detect_question_type(query: str) -> Optional[str]:
"""Return the leading question word if present (who, what, when, where, why, how)."""
if not isinstance(query, str):
return None
q = query.strip().lower()
for word in ("who", "what", "when", "where", "why", "how"):
# Require a whole-word match: a bare prefix mis-flags ordinary queries
@@ -45,12 +47,16 @@ def _extract_entities(query: str) -> Dict[str, List[str]]:
def _split_multi_part(query: str) -> List[str]:
"""Split a query into sub-queries on common conjunctions."""
if not isinstance(query, str):
return []
parts = re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I)
return [p.strip() for p in parts if p.strip()]
def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]:
"""Detect a 'site:example.com' token. Returns (query_without_token, site_or_None)."""
if not isinstance(query, str):
return "", None
match = re.search(r"\bsite:([^\s]+)", query, flags=re.I)
if match:
site = match.group(1)
@@ -71,6 +77,8 @@ def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) ->
def enhance_query(original_query: str) -> Tuple[str, Optional[str]]:
"""Process the original query: site filter, question type boosts, entity extraction."""
if not isinstance(original_query, str):
original_query = ""
query_without_site, site = _extract_site_filter(original_query)
sub_queries = _split_multi_part(query_without_site)
@@ -120,6 +128,8 @@ def build_enhanced_query(query: str, time_filter: str = None) -> str:
def _is_news_query(query: str) -> bool:
"""Lightweight heuristic to decide if a query is news-oriented."""
news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"}
if not isinstance(query, str):
return False
tokens = set(re.findall(r"\b\w+\b", query.lower()))
return bool(tokens & news_terms)

View File

@@ -0,0 +1,40 @@
"""Regression: search query helpers must tolerate a non-string query.
These helpers did `query.strip()`, `query.lower()`, `re.split(..., query)`,
`re.search(..., query)` directly, so a None / non-string query (e.g. from a
caller that didn't coerce) raised TypeError/AttributeError. They now return a
safe default for non-strings.
"""
import importlib.machinery
import importlib.util
from pathlib import Path
_PATH = Path(__file__).resolve().parents[1] / "services" / "search" / "query.py"
def _load():
    """Import the query module straight from its file path and return it.

    Loading by path means the package ``__init__`` (which imports httpx)
    does not have to be importable for these tests to run.
    """
    module_name = "odysseus_search_query"
    file_loader = importlib.machinery.SourceFileLoader(module_name, str(_PATH))
    module_spec = importlib.util.spec_from_loader(file_loader.name, file_loader)
    query_module = importlib.util.module_from_spec(module_spec)
    # Execute the source so the module's functions are actually defined.
    file_loader.exec_module(query_module)
    return query_module
def test_helpers_handle_none():
    """Each helper returns its safe default for a non-string query."""
    query_mod = _load()

    # Private helpers: None maps to the documented "empty" result.
    assert query_mod._detect_question_type(None) is None
    assert query_mod._split_multi_part(None) == []
    assert query_mod._extract_site_filter(None) == ("", None)
    assert query_mod._is_news_query(None) is False

    # Public entry points must coerce the input rather than raise.
    enhanced = query_mod.enhance_query(None)
    assert isinstance(enhanced[0], str)
    built = query_mod.build_enhanced_query(123)
    assert isinstance(built, str)
def test_valid_query_still_works():
    """The non-string guards leave ordinary string queries unaffected."""
    query_mod = _load()

    question_word = query_mod._detect_question_type("who is bob")
    assert question_word == "who"

    looks_like_news = query_mod._is_news_query("latest news today")
    assert looks_like_news is True

    site_result = query_mod._extract_site_filter("cats site:x.com")
    assert site_result[1] == "x.com"