fix: research query misclassifies 'whatsapp'/'however' as questions (#1247)

* fix: detect question words as whole words, not prefixes

* fix: same question-word prefix bug in the services search copy

* test: question-word detection rejects prefix lookalikes
This commit is contained in:
Afonso Coutinho
2026-06-02 17:10:06 +01:00
committed by GitHub
parent 311f226d44
commit f62d6ea3d7
3 changed files with 26 additions and 2 deletions

View File

@@ -15,7 +15,10 @@ def _detect_question_type(query: str) -> Optional[str]:
"""Return the leading question word if present (who, what, when, where, why, how).""" """Return the leading question word if present (who, what, when, where, why, how)."""
q = query.strip().lower() q = query.strip().lower()
for word in ("who", "what", "when", "where", "why", "how"): for word in ("who", "what", "when", "where", "why", "how"):
if q.startswith(word): # Require a whole-word match: a bare prefix mis-flags ordinary queries
# like "whatsapp pricing" (-> what) or "however ..." (-> how), which
# then get spurious boost terms OR-appended in enhance_query.
if q == word or q.startswith(word + " "):
return word return word
return None return None

View File

@@ -15,7 +15,10 @@ def _detect_question_type(query: str) -> Optional[str]:
"""Return the leading question word if present (who, what, when, where, why, how).""" """Return the leading question word if present (who, what, when, where, why, how)."""
q = query.strip().lower() q = query.strip().lower()
for word in ("who", "what", "when", "where", "why", "how"): for word in ("who", "what", "when", "where", "why", "how"):
if q.startswith(word): # Require a whole-word match: a bare prefix mis-flags ordinary queries
# like "whatsapp pricing" (-> what) or "however ..." (-> how), which
# then get spurious boost terms OR-appended in enhance_query.
if q == word or q.startswith(word + " "):
return word return word
return None return None

View File

@@ -0,0 +1,18 @@
"""Tests for question-word detection in research query enhancement."""
from src.search.query import _detect_question_type
def test_whole_word_questions_detected():
    """Queries whose first word is exactly a question word report that word."""
    expected_by_query = {
        "what is topological data analysis": "what",
        "how do transformers work": "how",
        # A bare question word with nothing after it still counts.
        "why": "why",
    }
    for query, expected in expected_by_query.items():
        # The query is the assert message so a failure names the offending case.
        assert _detect_question_type(query) == expected, query
def test_prefix_lookalikes_not_misclassified():
    """Words that only start with a question word are not reported as questions."""
    # Regression: a bare prefix used to flag these as questions and append
    # spurious boost terms in enhance_query.
    lookalike_queries = (
        "whatsapp pricing",
        "however we proceed",
        "whole foods stock",
        "howard stern show",
    )
    for query in lookalike_queries:
        # The query is the assert message so a failure names the offending case.
        assert _detect_question_type(query) is None, query