Odysseus v1.0
This commit is contained in:
85
src/topic_analyzer.py
Normal file
85
src/topic_analyzer.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Topic analysis for conversations — a single shared implementation, deduplicated from app.py.
Used by /api/conversations/topics and by the fallback path of /api/memory/extract.
|
||||
"""
|
||||
|
||||
import re
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Topic label -> trigger keywords.
# analyze_topics() lowercases each message and tests every keyword with a plain
# substring check, so entries are written in lowercase and may be multi-word
# phrases (e.g. "machine learning", "how to").
TOPIC_KEYWORDS: Dict[str, List[str]] = {
    "Technology": [
        "ai",
        "machine learning",
        "python",
        "code",
        "programming",
        "computer",
        "software",
        "hardware",
        "algorithm",
    ],
    "Science": [
        "science",
        "physics",
        "chemistry",
        "biology",
        "math",
        "mathematics",
        "research",
        "experiment",
    ],
    "Work": [
        "work",
        "job",
        "career",
        "project",
        "task",
        "deadline",
        "meeting",
        "colleague",
        "manager",
    ],
    "Personal": [
        "personal",
        "family",
        "friend",
        "relationship",
        "health",
        "wellness",
        "exercise",
        "diet",
    ],
    "Learning": [
        "learn",
        "study",
        "education",
        "course",
        "tutorial",
        "guide",
        "how to",
        "explain",
    ],
    "Creativity": [
        "write",
        "story",
        "create",
        "design",
        "art",
        "music",
        "draw",
        "paint",
    ],
    "Planning": [
        "plan",
        "schedule",
        "organize",
        "arrange",
        "coordinate",
        "timeline",
        "calendar",
    ],
    "Troubleshooting": [
        "error",
        "bug",
        "fix",
        "problem",
        "issue",
        "debug",
        "troubleshoot",
    ],
}
|
||||
|
||||
|
||||
# Sentence delimiter used to cut a message into example snippets.
_SENTENCE_BOUNDARY = re.compile(r"[.!?]")


def analyze_topics(
    session_manager,
    owner: Optional[str] = None,
    *,
    topic_keywords: Optional[Dict[str, List[str]]] = None,
) -> Dict[str, Any]:
    """
    Scan non-archived sessions and return topic frequency data.

    Every message with non-empty content is lowercased and tested against each
    keyword with a plain substring check. Each keyword found in a message adds
    one to its topic's frequency, and the first sentence of that message that
    contains the keyword is recorded as an example snippet.

    NOTE: matching is substring-based, so a short keyword such as "ai" also
    matches inside longer words (e.g. "said"). This is kept as-is so existing
    frequency numbers do not change.

    Args:
        session_manager: Object exposing a ``sessions`` mapping of
            session_id -> session data. Session data is read with ``.get()``
            for the keys "archived", "owner", "name" and "history".
        owner: If set, only include sessions belonging to that user. Sessions
            without an owner are excluded in that case.
        topic_keywords: Optional mapping of topic label -> keywords to use
            instead of the module-level ``TOPIC_KEYWORDS``. Keywords are
            compared against lowercased text, so they should be lowercase.

    Returns:
        Dict with a "topics" list and a "total_topics" count. Each topic entry
        has "topic", "frequency", "examples" (at most 5 de-duplicated matches)
        and "session_count", and the list is sorted by descending frequency.
        Topics with no hits are omitted.
    """
    if topic_keywords is None:
        topic_keywords = TOPIC_KEYWORDS

    topic_counts: Dict[str, int] = {topic: 0 for topic in topic_keywords}
    topic_matches: Dict[str, list] = {topic: [] for topic in topic_keywords}

    for session_id, session_data in session_manager.sessions.items():
        if session_data.get("archived", False):
            continue
        # SECURITY: strict ownership — the previous predicate let any
        # null-owner session feed into another user's topic analysis.
        if owner:
            sess_owner = session_data.get("owner") or getattr(session_data, "owner", None)
            if sess_owner != owner:
                continue

        # Per-session value, so compute it once; str() keeps non-string
        # session ids (e.g. ints) from breaking the slice.
        session_name = session_data.get("name", f"Session {str(session_id)[:6]}")

        # `or []` also covers a "history" key that is present but None.
        for msg in session_data.get("history") or []:
            is_mapping = isinstance(msg, dict)
            content_raw = msg.get("content") if is_mapping else getattr(msg, "content", None)
            if not content_raw:
                continue

            raw_text = str(content_raw)
            content = raw_text.lower()
            role = msg.get("role") if is_mapping else getattr(msg, "role", "")

            # (sentence, lowercased sentence) pairs, built lazily on the first
            # keyword hit so the message is split at most once.
            sentences = None

            for topic, keywords in topic_keywords.items():
                for kw in keywords:
                    if kw not in content:
                        continue
                    topic_counts[topic] += 1
                    if sentences is None:
                        sentences = [
                            (part, part.lower())
                            for part in _SENTENCE_BOUNDARY.split(raw_text)
                        ]
                    for sentence, lowered in sentences:
                        if kw in lowered:
                            topic_matches[topic].append({
                                "session_id": session_id,
                                "session_name": session_name,
                                "role": role,
                                "snippet": sentence.strip(),
                                "keyword": kw,
                            })
                            # Only the first matching sentence is kept per
                            # keyword per message.
                            break

    results = []
    for topic, count in topic_counts.items():
        if count == 0:
            continue
        unique, seen = [], set()
        for match in topic_matches[topic]:
            # Tuple key: unlike a "id-snippet" string it cannot collide when
            # the id or the snippet itself contains the separator.
            key = (match["session_id"], match["snippet"][:50])
            if key not in seen:
                seen.add(key)
                unique.append(match)
        results.append({
            "topic": topic,
            "frequency": count,
            "examples": unique[:5],
            "session_count": len({match["session_id"] for match in unique}),
        })

    # Stable sort: topics with equal frequency keep keyword-table order.
    results.sort(key=lambda entry: entry["frequency"], reverse=True)
    return {"topics": results, "total_topics": len(results)}
|
||||
Reference in New Issue
Block a user