Handle non-string src search queries (#1646)

This commit is contained in:
red person
2026-06-03 08:11:02 +03:00
committed by GitHub
parent ade755b184
commit b409b20940
2 changed files with 43 additions and 0 deletions

View File

@@ -13,6 +13,8 @@ logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------
def _detect_question_type(query: str) -> Optional[str]:
"""Return the leading question word if present (who, what, when, where, why, how)."""
if not isinstance(query, str):
return None
q = query.strip().lower()
for word in ("who", "what", "when", "where", "why", "how"):
# Require a whole-word match: a bare prefix mis-flags ordinary queries
@@ -45,12 +47,16 @@ def _extract_entities(query: str) -> Dict[str, List[str]]:
def _split_multi_part(query: str) -> List[str]:
    """Break a query into trimmed sub-queries.

    The query is cut at the words "and" / "or" (case-insensitive, and only
    when surrounded by whitespace) and at every semicolon. Fragments that
    are empty after trimming are discarded.

    Returns an empty list when ``query`` is not a string.
    """
    if not isinstance(query, str):
        return []

    fragments = re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I)

    sub_queries = []
    for fragment in fragments:
        trimmed = fragment.strip()
        if trimmed:
            sub_queries.append(trimmed)
    return sub_queries
def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]:
"""Detect a 'site:example.com' token. Returns (query_without_token, site_or_None)."""
if not isinstance(query, str):
return "", None
match = re.search(r"\bsite:([^\s]+)", query, flags=re.I)
if match:
site = match.group(1)
@@ -71,6 +77,8 @@ def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) ->
def enhance_query(original_query: str) -> Tuple[str, Optional[str]]:
"""Process the original query: site filter, question type boosts, entity extraction."""
if not isinstance(original_query, str):
original_query = ""
query_without_site, site = _extract_site_filter(original_query)
sub_queries = _split_multi_part(query_without_site)
@@ -119,6 +127,8 @@ def build_enhanced_query(query: str, time_filter: str = None) -> str:
# ----------------------------------------------------------------------
def _is_news_query(query: str) -> bool:
    """Report whether the query contains at least one news keyword.

    The query is lower-cased and split into ``\\w+`` word tokens; the result
    is True when any token is one of the known news terms. Non-string input
    is treated as "not news" and yields False.
    """
    if not isinstance(query, str):
        return False

    # NOTE: the tokenizer below never yields a token containing an
    # apostrophe, so the "today's" entry cannot match on its own; queries
    # such as "today's headlines" still match through the "today" token.
    news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"}

    words = re.findall(r"\b\w+\b", query.lower())
    return any(word in news_terms for word in words)

View File

@@ -0,0 +1,33 @@
import importlib.machinery
import importlib.util
from pathlib import Path
# Absolute path of the module under test: <directory one level above this
# file's directory>/src/search/query.py.
_PATH = Path(__file__).resolve().parents[1].joinpath("src", "search", "query.py")
def _load():
    """Execute the source file at ``_PATH`` and return it as a new module.

    The module is built directly from the file path under the name
    "odysseus_src_search_query". Every call executes the file again and
    returns a new module object; this function does not add it to
    ``sys.modules``.
    """
    source_loader = importlib.machinery.SourceFileLoader(
        "odysseus_src_search_query", str(_PATH)
    )
    module_spec = importlib.util.spec_from_loader(source_loader.name, source_loader)
    loaded = importlib.util.module_from_spec(module_spec)
    source_loader.exec_module(loaded)
    return loaded
def test_src_search_helpers_handle_non_string_queries():
    """Non-string queries must be tolerated by the search helpers, not raise."""
    mod = _load()

    # Private helpers fall back to their "nothing found" results.
    assert mod._detect_question_type(None) is None
    assert mod._split_multi_part(None) == []
    assert mod._extract_site_filter(None) == ("", None)
    assert mod._is_news_query(None) is False

    # The query builders must still hand back strings.
    enhanced = mod.enhance_query(None)[0]
    assert isinstance(enhanced, str)
    built = mod.build_enhanced_query(123)
    assert isinstance(built, str)
def test_src_search_valid_query_still_works():
    """Ordinary string queries must keep producing their usual results."""
    mod = _load()

    question_word = mod._detect_question_type("who is bob")
    assert question_word == "who"

    assert mod._is_news_query("latest news today") is True

    # Index 1 of the result is the detected site.
    site = mod._extract_site_filter("cats site:x.com")[1]
    assert site == "x.com"