analyze_topics() iterates session_manager.sessions and reads
session_data.get("history", []) directly. But SessionManager.load_sessions
seeds sessions metadata-only with empty history — messages are loaded
lazily, only when get_session(session_id) is called. So analyze_topics saw
empty history for every session that hadn't been individually opened this
process lifetime and reported total_topics: 0, even when the database held
plenty of matching messages.
Hydrate each candidate session via session_manager.get_session(session_id)
(the existing lazy-load path) before reading its history. Hydration runs
after the owner/archived filters so skipped sessions aren't loaded. The
function falls back to the raw cached history when the manager has no
get_session (e.g. test stubs).
tests/test_topic_analyzer.py: new test_topic_analyzer_hydrates_sessions
seeds a real SQLite DB with a session + message, runs the real
SessionManager (asserting cached history starts empty), then asserts
analyze_topics finds the topic. Fails before this change. The existing
keyword tests now pass an explicit owner to satisfy the owner-required
early return.
105 lines
4.7 KiB
Python
105 lines
4.7 KiB
Python
"""
|
|
Topic analysis for conversations — deduplicated from app.py.
Used by /api/conversations/topics and /api/memory/extract fallback.
|
|
"""
|
|
|
|
import re
from typing import Any, Dict, List, Optional
|
|
|
|
# Topic label -> lowercase keywords/phrases that signal the topic.
# analyze_topics() matches these as whole words against lowercased message
# text, so entries must stay lowercase.
TOPIC_KEYWORDS: Dict[str, List[str]] = {
    "Technology": [
        "ai", "machine learning", "python", "code", "programming",
        "computer", "software", "hardware", "algorithm",
    ],
    "Science": [
        "science", "physics", "chemistry", "biology", "math",
        "mathematics", "research", "experiment",
    ],
    "Work": [
        "work", "job", "career", "project", "task",
        "deadline", "meeting", "colleague", "manager",
    ],
    "Personal": [
        "personal", "family", "friend", "relationship", "health",
        "wellness", "exercise", "diet",
    ],
    "Learning": [
        "learn", "study", "education", "course", "tutorial",
        "guide", "how to", "explain",
    ],
    "Creativity": [
        "write", "story", "create", "design", "art",
        "music", "draw", "paint",
    ],
    "Planning": [
        "plan", "schedule", "organize", "arrange", "coordinate",
        "timeline", "calendar",
    ],
    "Troubleshooting": [
        "error", "bug", "fix", "problem", "issue",
        "debug", "troubleshoot",
    ],
}
|
|
|
|
|
|
def _extract_history(session):
    """Return the message list carried by *session*, or None if it has none.

    Accepts either an object exposing a ``history`` attribute or a mapping
    with a ``"history"`` key, so callers need not know which shape the
    session manager hands back.
    """
    if session is None:
        return None
    history = getattr(session, "history", None)
    if history is None and isinstance(session, dict):
        history = session.get("history")
    return history


def analyze_topics(
    session_manager,
    owner: Optional[str] = None,
    *,
    topic_keywords: Optional[Dict[str, List[str]]] = None,
) -> Dict[str, Any]:
    """
    Scan non-archived sessions and return topic frequency data.

    Only sessions whose owner equals `owner` are included.

    When `owner` is None or empty the helper returns an empty result. The
    unauthenticated-loopback path in `app.py` produces a None owner, and
    silently aggregating topic frequencies in that case is a cross-tenant
    data leak. Callers that want a system-wide aggregate must pass an
    explicit `owner` string (e.g. a documented "admin" pseudo-owner) or
    the route must reject the request with 401.

    Args:
        session_manager: Object with a ``sessions`` mapping of
            session_id -> dict-like session data. If it also exposes a
            callable ``get_session(session_id)``, that is used to hydrate
            each candidate session so lazily loaded history is seen.
        owner: User whose sessions are analysed.
        topic_keywords: Optional topic -> keywords mapping; defaults to the
            module-level ``TOPIC_KEYWORDS``. Keywords are matched
            case-insensitively as whole words/phrases.

    Returns:
        Dict with "topics" (list of ``{"topic", "frequency", "examples",
        "session_count"}`` dicts sorted by descending frequency) and
        "total_topics" (number of topics with at least one hit).
        ``frequency`` counts one hit per (message, keyword) pair;
        ``examples`` holds at most five de-duplicated snippets.
    """
    if not owner:
        return {"topics": [], "total_topics": 0}

    if topic_keywords is None:
        topic_keywords = TOPIC_KEYWORDS

    # Compile every keyword once per call instead of once per message.
    # (?<!\w)...(?!\w) is equivalent to \b...\b for keywords that start and
    # end with a word character (all built-in ones) and also behaves
    # sensibly for custom keywords that do not.
    compiled = {
        topic: [
            (kw, re.compile(rf"(?<!\w){re.escape(kw.lower())}(?!\w)"))
            for kw in keywords
            if kw
        ]
        for topic, keywords in topic_keywords.items()
    }
    topic_counts: Dict[str, int] = {topic: 0 for topic in compiled}
    topic_matches: Dict[str, list] = {topic: [] for topic in compiled}

    load_session = getattr(session_manager, "get_session", None)
    if not callable(load_session):
        load_session = None  # e.g. test stubs that only provide `sessions`

    # Iterate over a snapshot: hydrating a session goes through the manager's
    # lazy-load path, and if that inserts into the cache while we iterate the
    # live dict, Python raises "dictionary changed size during iteration".
    for session_id, session_data in list(session_manager.sessions.items()):
        if session_data.get("archived", False):
            continue
        # Strict ownership: any session whose owner does not match the
        # caller is excluded. Ownerless sessions are never included (an
        # ownerless caller was already rejected by the early return above).
        sess_owner = session_data.get("owner") or getattr(session_data, "owner", None)
        if sess_owner != owner:
            continue

        # Hydrate only after the filters so skipped sessions are not loaded.
        history = None
        if load_session is not None:
            history = _extract_history(load_session(session_id))
        if history is None:
            # No loader, or the loader returned nothing usable: fall back to
            # whatever history the cached entry already holds.
            history = session_data.get("history")

        # str() keeps the fallback name safe for non-string session ids.
        session_name = session_data.get("name") or f"Session {str(session_id)[:6]}"

        for msg in history or []:
            content_raw = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)
            if not content_raw:
                continue

            text = str(content_raw)
            content = text.lower()
            role = msg.get("role") if isinstance(msg, dict) else getattr(msg, "role", "")
            sentences = None  # split lazily, at most once per message

            for topic, keyword_patterns in compiled.items():
                for kw, pattern in keyword_patterns:
                    if not pattern.search(content):
                        continue
                    topic_counts[topic] += 1
                    if sentences is None:
                        sentences = re.split(r"[.!?]", text)
                    # Record the first sentence containing the keyword as
                    # the example snippet for this (message, keyword) hit.
                    for sentence in sentences:
                        if pattern.search(sentence.lower()):
                            topic_matches[topic].append({
                                "session_id": session_id,
                                "session_name": session_name,
                                "role": role,
                                "snippet": sentence.strip(),
                                "keyword": kw,
                            })
                            break

    results = []
    for topic, count in topic_counts.items():
        if count == 0:
            continue
        # De-duplicate examples on (session, first 50 chars of the snippet).
        unique, seen = [], set()
        for match in topic_matches[topic]:
            key = (match["session_id"], match["snippet"][:50])
            if key not in seen:
                seen.add(key)
                unique.append(match)
        results.append({
            "topic": topic,
            "frequency": count,
            "examples": unique[:5],
            "session_count": len({m["session_id"] for m in unique}),
        })

    results.sort(key=lambda item: item["frequency"], reverse=True)
    return {"topics": results, "total_topics": len(results)}
|