diff --git a/services/search/query.py b/services/search/query.py
index 22f0c11..eb54ee1 100644
--- a/services/search/query.py
+++ b/services/search/query.py
@@ -15,7 +15,12 @@ def _detect_question_type(query: str) -> Optional[str]:
     """Return the leading question word if present (who, what, when, where, why, how)."""
     q = query.strip().lower()
     for word in ("who", "what", "when", "where", "why", "how"):
-        if q.startswith(word):
+        # Require a whole-word match: a bare prefix mis-flags ordinary queries
+        # like "whatsapp pricing" (-> what) or "however ..." (-> how), which
+        # then get spurious boost terms OR-appended in enhance_query. Any
+        # non-alphanumeric character may end the word, so "what's new" and
+        # "why?" still count as questions.
+        if q.startswith(word) and not q[len(word) : len(word) + 1].isalnum():
             return word
     return None
 
diff --git a/src/search/query.py b/src/search/query.py
index 22f0c11..eb54ee1 100644
--- a/src/search/query.py
+++ b/src/search/query.py
@@ -15,7 +15,12 @@ def _detect_question_type(query: str) -> Optional[str]:
     """Return the leading question word if present (who, what, when, where, why, how)."""
     q = query.strip().lower()
     for word in ("who", "what", "when", "where", "why", "how"):
-        if q.startswith(word):
+        # Require a whole-word match: a bare prefix mis-flags ordinary queries
+        # like "whatsapp pricing" (-> what) or "however ..." (-> how), which
+        # then get spurious boost terms OR-appended in enhance_query. Any
+        # non-alphanumeric character may end the word, so "what's new" and
+        # "why?" still count as questions.
+        if q.startswith(word) and not q[len(word) : len(word) + 1].isalnum():
             return word
     return None
 
diff --git a/tests/test_question_type_detection.py b/tests/test_question_type_detection.py
new file mode 100644
index 0000000..3540c5e
--- /dev/null
+++ b/tests/test_question_type_detection.py
@@ -0,0 +1,25 @@
+"""Tests for question-word detection in research query enhancement."""
+
+from src.search.query import _detect_question_type
+
+
+def test_whole_word_questions_detected():
+    assert _detect_question_type("what is topological data analysis") == "what"
+    assert _detect_question_type("how do transformers work") == "how"
+    assert _detect_question_type("why") == "why"
+
+
+def test_prefix_lookalikes_not_misclassified():
+    # Regression: a bare prefix used to flag these as questions and append
+    # spurious boost terms in enhance_query.
+    assert _detect_question_type("whatsapp pricing") is None
+    assert _detect_question_type("however we proceed") is None
+    assert _detect_question_type("whole foods stock") is None
+    assert _detect_question_type("howard stern show") is None
+
+
+def test_punctuation_after_question_word_detected():
+    # Any non-alphanumeric character ends the word, not only a space.
+    assert _detect_question_type("what's new in python") == "what"
+    assert _detect_question_type("why?") == "why"
+    assert _detect_question_type("how\tdoes it work") == "how"