From 7f97ab3032a52bf044fe4cfd5612f5f8c31dc1ac Mon Sep 17 00:00:00 2001 From: Tatlatat Date: Tue, 2 Jun 2026 18:44:27 +0700 Subject: [PATCH] Topics: hydrate session history before analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit analyze_topics() iterates session_manager.sessions and reads session_data.get("history", []) directly. But SessionManager.load_sessions seeds sessions metadata-only with empty history — messages are loaded lazily, only when get_session(session_id) is called. So analyze_topics saw empty history for every session that hadn't been individually opened during this process's lifetime and reported total_topics: 0, even when the database held plenty of matching messages. Hydrate each candidate session via session_manager.get_session(session_id) (the existing lazy-load path) before reading its history, after the owner/archived filters so skipped sessions aren't loaded. Fall back to the raw cached history when the manager has no get_session (test stubs). tests/test_topic_analyzer.py: new test_topic_analyzer_hydrates_sessions seeds a real SQLite DB with a session + message, runs the real SessionManager (asserting cached history starts empty), then asserts analyze_topics finds the topic. The test fails before this change. The existing keyword tests now pass an explicit owner to satisfy the owner-required early return. 
--- src/topic_analyzer.py | 10 ++++- tests/test_topic_analyzer.py | 76 +++++++++++++++++++++++++++++++++--- 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/src/topic_analyzer.py b/src/topic_analyzer.py index e8563df..4509baf 100644 --- a/src/topic_analyzer.py +++ b/src/topic_analyzer.py @@ -49,7 +49,15 @@ def analyze_topics(session_manager, owner: str = None) -> Dict[str, Any]: if sess_owner != owner: continue - for msg in session_data.get("history", []): + # Hydrate session to load history from DB if needed + if hasattr(session_manager, "get_session"): + hydrated_session = session_manager.get_session(session_id) + history = hydrated_session.history + else: + hydrated_session = session_data + history = session_data.get("history", []) + + for msg in history: content_raw = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None) if not content_raw: continue diff --git a/tests/test_topic_analyzer.py b/tests/test_topic_analyzer.py index ba55db0..6101526 100644 --- a/tests/test_topic_analyzer.py +++ b/tests/test_topic_analyzer.py @@ -1,12 +1,17 @@ """Tests for topic keyword matching (src/topic_analyzer.py).""" from types import SimpleNamespace - +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from core.database import Base, Session as DbSession, ChatMessage as DbChatMessage +from core.session_manager import SessionManager from src.topic_analyzer import analyze_topics +from datetime import datetime def _sm(*messages): history = [{"role": "user", "content": c} for c in messages] - return SimpleNamespace(sessions={"s1": {"owner": None, "name": "S", "history": history}}) + return SimpleNamespace(sessions={"s1": {"owner": "alice", "name": "S", "history": history}}) def _freq(result): @@ -16,15 +21,76 @@ def _freq(result): def test_substring_does_not_false_match_technology(): # Regression: "ai" matched inside "email"/"again"/"rain"/"wait", flagging # Technology for messages with no technical 
content at all. - result = analyze_topics(_sm("Can you send me an email again about the rain? I will wait.")) + result = analyze_topics(_sm("Can you send me an email again about the rain? I will wait."), owner="alice") assert "Technology" not in _freq(result) def test_real_keywords_still_match(): - result = analyze_topics(_sm("I wrote some Python code to test the algorithm.")) + result = analyze_topics(_sm("I wrote some Python code to test the algorithm."), owner="alice") assert _freq(result).get("Technology", 0) >= 1 def test_multiword_keyword_matches(): - result = analyze_topics(_sm("Can you explain how to set this up?")) + result = analyze_topics(_sm("Can you explain how to set this up?"), owner="alice") assert "Learning" in _freq(result) + + +def test_topic_analyzer_hydrates_sessions(monkeypatch): + # 1. Create clean in-memory database + engine = create_engine("sqlite:///:memory:") + Base.metadata.create_all(bind=engine) + + # 2. Create test session factory + TestSessionLocal = sessionmaker(bind=engine) + + # 3. Populate test database with a session and a message about Python + db = TestSessionLocal() + session_id = "session-1" + + s = DbSession( + id=session_id, + name="Python chat", + endpoint_url="http://localhost:8000", + model="gpt-4", + owner="alice", + message_count=1, + created_at=datetime.utcnow(), + updated_at=datetime.utcnow() + ) + m = DbChatMessage( + id="msg-1", + session_id=session_id, + role="user", + content="I love writing python code.", + timestamp=datetime.utcnow() + ) + + db.add(s) + db.add(m) + db.commit() + db.close() + + # 4. Patch SessionLocal to use our in-memory DB + import core.session_manager + import core.database + monkeypatch.setattr(core.session_manager, "SessionLocal", TestSessionLocal) + monkeypatch.setattr(core.database, "SessionLocal", TestSessionLocal) + + # 5. 
Initialize the real SessionManager and load metadata (seeds sessions with empty history) + sm = SessionManager() + + # Verify that the session is in sm.sessions, and its history is currently empty + assert session_id in sm.sessions + assert len(sm.sessions[session_id].history) == 0 + + # 6. Execute the topic analysis + res = analyze_topics(sm, owner="alice") + + # 7. Assertions + # There should be 1 topic found (Technology, since "python" / "code" are keywords) + assert res["total_topics"] > 0 + + # Check that the topic is Technology + tech_topic = next((t for t in res["topics"] if t["topic"] == "Technology"), None) + assert tech_topic is not None + assert tech_topic["frequency"] >= 1