From 48d3b7ababe8f245d74f59d3696fbe80a68bb860 Mon Sep 17 00:00:00 2001 From: Afonso Coutinho <116525378+afonsopc@users.noreply.github.com> Date: Tue, 2 Jun 2026 03:42:04 +0100 Subject: [PATCH] fix: topic analysis false-matches keywords as substrings (e.g. 'ai' in 'email') (#687) * fix: match topic keywords on word boundaries, not substrings * fix: apply word-boundary matching to topic example snippets too * test: topic keywords match whole words, not substrings --- src/topic_analyzer.py | 4 ++-- tests/test_topic_analyzer.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 tests/test_topic_analyzer.py diff --git a/src/topic_analyzer.py b/src/topic_analyzer.py index 73666ae..e8563df 100644 --- a/src/topic_analyzer.py +++ b/src/topic_analyzer.py @@ -60,11 +60,11 @@ def analyze_topics(session_manager, owner: str = None) -> Dict[str, Any]: for topic, keywords in TOPIC_KEYWORDS.items(): for kw in keywords: - if kw in content: + if re.search(rf"\b{re.escape(kw)}\b", content): topic_counts[topic] += 1 sentences = re.split(r'[.!?]', str(content_raw)) for sentence in sentences: - if kw in sentence.lower(): + if re.search(rf"\b{re.escape(kw)}\b", sentence.lower()): topic_matches[topic].append({ "session_id": session_id, "session_name": session_name, diff --git a/tests/test_topic_analyzer.py b/tests/test_topic_analyzer.py new file mode 100644 index 0000000..ba55db0 --- /dev/null +++ b/tests/test_topic_analyzer.py @@ -0,0 +1,30 @@ +"""Tests for topic keyword matching (src/topic_analyzer.py).""" +from types import SimpleNamespace + +from src.topic_analyzer import analyze_topics + + +def _sm(*messages): + history = [{"role": "user", "content": c} for c in messages] + return SimpleNamespace(sessions={"s1": {"owner": None, "name": "S", "history": history}}) + + +def _freq(result): + return {t["topic"]: t["frequency"] for t in result["topics"]} + + +def test_substring_does_not_false_match_technology(): + # Regression: "ai" matched inside "email"/"again"/"rain"/"wait", flagging + # Technology for messages with no technical content at all. + result = analyze_topics(_sm("Can you send me an email again about the rain? I will wait.")) + assert "Technology" not in _freq(result) + + +def test_real_keywords_still_match(): + result = analyze_topics(_sm("I wrote some Python code to test the algorithm.")) + assert _freq(result).get("Technology", 0) >= 1 + + +def test_multiword_keyword_matches(): + result = analyze_topics(_sm("Can you explain how to set this up?")) + assert "Learning" in _freq(result)