fix: topic analysis false-matches keywords as substrings (e.g. 'ai' in 'email') (#687)
* fix: match topic keywords on word boundaries, not substrings
* fix: apply word-boundary matching to topic example snippets too
* test: topic keywords match whole words, not substrings
This commit is contained in:
@@ -60,11 +60,11 @@ def analyze_topics(session_manager, owner: str = None) -> Dict[str, Any]:
|
||||
|
||||
for topic, keywords in TOPIC_KEYWORDS.items():
|
||||
for kw in keywords:
|
||||
if kw in content:
|
||||
if re.search(rf"\b{re.escape(kw)}\b", content):
|
||||
topic_counts[topic] += 1
|
||||
sentences = re.split(r'[.!?]', str(content_raw))
|
||||
for sentence in sentences:
|
||||
if kw in sentence.lower():
|
||||
if re.search(rf"\b{re.escape(kw)}\b", sentence.lower()):
|
||||
topic_matches[topic].append({
|
||||
"session_id": session_id,
|
||||
"session_name": session_name,
|
||||
|
||||
30
tests/test_topic_analyzer.py
Normal file
30
tests/test_topic_analyzer.py
Normal file
@@ -0,0 +1,30 @@
|
||||
"""Tests for topic keyword matching (src/topic_analyzer.py)."""
|
||||
from types import SimpleNamespace
|
||||
|
||||
from src.topic_analyzer import analyze_topics
|
||||
|
||||
|
||||
def _sm(*messages: str) -> SimpleNamespace:
    """Build a minimal stand-in session manager for ``analyze_topics``.

    The returned object exposes a single attribute, ``sessions``, holding one
    session keyed ``"s1"`` (owner ``None``, name ``"S"``) whose history has
    one ``"user"``-role entry per string in *messages*, in order.
    """
    history = [{"role": "user", "content": c} for c in messages]
    return SimpleNamespace(
        sessions={"s1": {"owner": None, "name": "S", "history": history}}
    )
|
||||
|
||||
|
||||
def _freq(result: dict) -> dict:
    """Return a ``{topic name: frequency}`` mapping from an analysis result.

    Reads the ``"topic"`` and ``"frequency"`` keys of every entry in
    ``result["topics"]``.
    """
    return {t["topic"]: t["frequency"] for t in result["topics"]}
|
||||
|
||||
|
||||
def test_substring_does_not_false_match_technology():
    """Keywords embedded inside longer words must not count as a topic hit."""
    # Regression: "ai" matched inside "email"/"again"/"rain"/"wait", flagging
    # Technology for messages with no technical content at all.
    result = analyze_topics(
        _sm("Can you send me an email again about the rain? I will wait.")
    )
    assert "Technology" not in _freq(result)
|
||||
|
||||
|
||||
def test_real_keywords_still_match():
    """Whole-word keywords must still be counted after the boundary fix."""
    result = analyze_topics(
        _sm("I wrote some Python code to test the algorithm.")
    )
    # .get(..., 0) so a missing topic fails the assertion rather than raising.
    assert _freq(result).get("Technology", 0) >= 1
|
||||
|
||||
|
||||
def test_multiword_keyword_matches():
    """Keywords made of several words must still match as a phrase."""
    # NOTE(review): presumably this relies on a multi-word Learning keyword
    # such as "how to"; the keyword table is not visible here, so confirm.
    result = analyze_topics(_sm("Can you explain how to set this up?"))
    assert "Learning" in _freq(result)
|
||||
Reference in New Issue
Block a user