Handle non-string src search queries (#1646)
This commit is contained in:
@@ -13,6 +13,8 @@ logger = logging.getLogger(__name__)
|
||||
# ----------------------------------------------------------------------
|
||||
def _detect_question_type(query: str) -> Optional[str]:
|
||||
"""Return the leading question word if present (who, what, when, where, why, how)."""
|
||||
if not isinstance(query, str):
|
||||
return None
|
||||
q = query.strip().lower()
|
||||
for word in ("who", "what", "when", "where", "why", "how"):
|
||||
# Require a whole-word match: a bare prefix mis-flags ordinary queries
|
||||
@@ -45,12 +47,16 @@ def _extract_entities(query: str) -> Dict[str, List[str]]:
|
||||
|
||||
def _split_multi_part(query: str) -> List[str]:
|
||||
"""Split a query into sub-queries on common conjunctions."""
|
||||
if not isinstance(query, str):
|
||||
return []
|
||||
parts = re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I)
|
||||
return [p.strip() for p in parts if p.strip()]
|
||||
|
||||
|
||||
def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]:
|
||||
"""Detect a 'site:example.com' token. Returns (query_without_token, site_or_None)."""
|
||||
if not isinstance(query, str):
|
||||
return "", None
|
||||
match = re.search(r"\bsite:([^\s]+)", query, flags=re.I)
|
||||
if match:
|
||||
site = match.group(1)
|
||||
@@ -71,6 +77,8 @@ def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) ->
|
||||
|
||||
def enhance_query(original_query: str) -> Tuple[str, Optional[str]]:
|
||||
"""Process the original query: site filter, question type boosts, entity extraction."""
|
||||
if not isinstance(original_query, str):
|
||||
original_query = ""
|
||||
query_without_site, site = _extract_site_filter(original_query)
|
||||
sub_queries = _split_multi_part(query_without_site)
|
||||
|
||||
@@ -119,6 +127,8 @@ def build_enhanced_query(query: str, time_filter: str = None) -> str:
|
||||
# ----------------------------------------------------------------------
|
||||
def _is_news_query(query: str) -> bool:
|
||||
"""Lightweight heuristic to decide if a query is news-oriented."""
|
||||
if not isinstance(query, str):
|
||||
return False
|
||||
news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"}
|
||||
tokens = set(re.findall(r"\b\w+\b", query.lower()))
|
||||
return bool(tokens & news_terms)
|
||||
|
||||
33
tests/test_src_search_query_nonstring.py
Normal file
33
tests/test_src_search_query_nonstring.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import importlib.machinery
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
_PATH = Path(__file__).resolve().parents[1] / "src" / "search" / "query.py"
|
||||
|
||||
|
||||
def _load():
    """Import the query module directly from its file path and return it.

    The module is executed from ``_PATH`` under a private module name via a
    ``SourceFileLoader`` rather than through a regular package import.
    """
    module_name = "odysseus_src_search_query"
    source_loader = importlib.machinery.SourceFileLoader(module_name, str(_PATH))
    module_spec = importlib.util.spec_from_loader(source_loader.name, source_loader)
    loaded = importlib.util.module_from_spec(module_spec)
    # Run the module body so its functions become attributes of ``loaded``.
    source_loader.exec_module(loaded)
    return loaded
|
||||
|
||||
|
||||
def test_src_search_helpers_handle_non_string_queries():
    """Query helpers must accept non-string input without raising."""
    query_mod = _load()

    # Private helpers fall back to their neutral "nothing found" value.
    assert query_mod._detect_question_type(None) is None
    assert query_mod._split_multi_part(None) == []
    assert query_mod._extract_site_filter(None) == ("", None)
    assert query_mod._is_news_query(None) is False

    # Public entry points must still hand back a string query.
    enhanced = query_mod.enhance_query(None)
    assert isinstance(enhanced[0], str)
    built = query_mod.build_enhanced_query(123)
    assert isinstance(built, str)
|
||||
|
||||
|
||||
def test_src_search_valid_query_still_works():
    """Regression check: ordinary string queries behave as before."""
    query_mod = _load()

    question_word = query_mod._detect_question_type("who is bob")
    assert question_word == "who"

    assert query_mod._is_news_query("latest news today") is True

    # Index 1 of the returned tuple is the extracted site.
    site_result = query_mod._extract_site_filter("cats site:x.com")
    assert site_result[1] == "x.com"
|
||||
Reference in New Issue
Block a user