Odysseus v1.0

2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
--- a/src/chat_processor.py
+++ b/src/chat_processor.py
@@ -0,0 +1,320 @@
+# src/chat_processor.py
+import logging
+import math
+import re
+import time
+from collections import Counter
+from typing import List, Dict, Any, Optional, Tuple
+from src.chat_helpers import extract_urls
+from src.youtube_handler import is_youtube_url
+from src.search import comprehensive_web_search, fetch_webpage_content
+from src.prompt_security import UNTRUSTED_CONTEXT_POLICY, untrusted_context_message
+
+logger = logging.getLogger(__name__)
+
+# ── Stopwords & tokenizer ──
+
+_STOPWORDS = frozenset(
+    "a an the is am are was were be been being have has had do does did "
+    "will would shall should can could may might must need ought dare "
+    "i me my mine we us our ours you your yours he him his she her hers "
+    "it its they them their theirs this that these those "
+    "and but or nor not no so if then else than too also very "
+    "in on at to for of by with from up out about into over after "
+    "what when where which who whom how why all each every some any "
+    "just very really actually like well also still already even "
+    "oh ok okay yes yeah hey hi hello thanks thank please sorry "
+    "much more most own other another such only same here there "
+    "because while during before until since through between both "
+    "few many several some none nothing something anything everything "
+    "get got make made go going went been come came take took "
+    "know think want let say tell give see look find way thing "
+    "don doesn didn won wouldn couldn shouldn wasn weren isn aren haven hasn "
+    "don't doesn't didn't won't wouldn't couldn't shouldn't "
+    "it's i'm i've i'll i'd you're you've you'll he's she's we're we've they're they've "
+    "that's there's here's what's who's how's let's can't".split()
+)
+
+def _content_tokens(text: str) -> list:
+    """Extract meaningful content words: no stopwords, min 3 chars, lowercase."""
+    words = re.findall(r'[a-z0-9]+(?:[-_][a-z0-9]+)*', text.lower())
+    return [w for w in words if len(w) >= 3 and w not in _STOPWORDS]
+
+
+class ChatProcessor:
+    def __init__(self, memory_manager, personal_docs_manager, memory_vector=None, skills_manager=None):
+        self.memory_manager = memory_manager
+        self.personal_docs_manager = personal_docs_manager
+        self.memory_vector = memory_vector
+        self.skills_manager = skills_manager
+
+    # Minimum similarity score for RAG results to be injected
+    RAG_SIMILARITY_THRESHOLD = 0.35
+
+    def _hybrid_retrieve(self, message: str, mem_entries: list, k: int = 5) -> list:
+        """Retrieve memories relevant to the message.
+
+        Uses BM25-style keyword scoring + optional vector similarity.
+        Recency is a tiebreaker only, never the primary signal.
+        """
+        if not mem_entries or not message.strip():
+            return []
+
+        now = time.time()
+        query_tokens = _content_tokens(message)
+
+        # If the query has no meaningful tokens, skip keyword retrieval entirely
+        if not query_tokens:
+            # Fall back to vector-only if available
+            if not (self.memory_vector and self.memory_vector.healthy):
+                return []
+
+        # ── Build IDF from the memory corpus ──
+        N = len(mem_entries)
+        doc_freq = Counter()  # token -> how many memories contain it
+        mem_token_cache = {}  # mem_id -> set of content tokens
+        for mem in mem_entries:
+            toks = set(_content_tokens(mem["text"]))
+            mem_token_cache[mem["id"]] = toks
+            for t in toks:
+                doc_freq[t] += 1
+
+        def _bm25_score(query_toks, mem_id):
+            """BM25-inspired score between query and a memory."""
+            mem_toks = mem_token_cache.get(mem_id, set())
+            if not mem_toks or not query_toks:
+                return 0.0
+            score = 0.0
+            mem_len = len(mem_toks)
+            avg_len = max(sum(len(v) for v in mem_token_cache.values()) / N, 1)
+            k1, b = 1.5, 0.75
+            for qt in query_toks:
+                if qt not in mem_toks:
+                    continue
+                df = doc_freq.get(qt, 0)
+                idf = math.log((N - df + 0.5) / (df + 0.5) + 1)
+                tf = 1  # binary presence (memory entries are short)
+                tf_norm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * mem_len / avg_len))
+                score += idf * tf_norm
+            return score
+
+        # ── Score all candidates ──
+        has_vector = self.memory_vector and self.memory_vector.healthy
+        vector_scores = {}
+
+        if has_vector:
+            results = self.memory_vector.search(message, k=min(k * 3, 20))
+            mem_by_id = {m["id"]: m for m in mem_entries}
+            for r in results:
+                if r["memory_id"] in mem_by_id:
+                    vector_scores[r["memory_id"]] = max(r["score"], 0.0)
+
+        scored = []
+        for mem in mem_entries:
+            mid = mem["id"]
+            vs = vector_scores.get(mid, 0.0)
+            kw = _bm25_score(query_tokens, mid)
+
+            # Normalize BM25 to roughly 0-1 range (cap at a reasonable max)
+            kw_norm = min(kw / 6.0, 1.0) if kw > 0 else 0.0
+
+            # Category-aware boost for identity/contact queries
+            category = mem.get("category", "fact")
+            msg_lower = message.lower()
+            mem_lower = mem["text"].lower()
+            cat_boost = 1.0
+            if any(w in msg_lower for w in ["name", "who am i", "my name"]):
+                if category == "identity" or any(w in mem_lower for w in ["name is", "i am", "called"]):
+                    cat_boost = 1.4
+            elif any(w in msg_lower for w in ["phone", "email", "address", "contact"]):
+                if category == "contact" or "@" in mem_lower:
+                    cat_boost = 1.3
+            elif any(w in msg_lower for w in ["like", "prefer", "favorite"]):
+                if category == "preference":
+                    cat_boost = 1.2
+
+            kw_norm = min(kw_norm * cat_boost, 1.0)
+
+            # Recency — tiebreaker only (max 5% contribution)
+            ts = mem.get("timestamp", 0)
+            days_old = max((now - ts) / 86400, 0)
+            recency = 1.0 / (1.0 + days_old * 0.05)
+
+            # Gate: need real relevance, not just recency
+            if has_vector:
+                if vs < 0.20 and kw_norm < 0.08:
+                    continue
+                final = (0.55 * vs) + (0.40 * kw_norm) + (0.05 * recency)
+            else:
+                if kw_norm < 0.08:
+                    continue
+                final = (0.95 * kw_norm) + (0.05 * recency)
+
+            if final > 0.12:
+                scored.append((final, mem))
+
+        scored.sort(key=lambda x: x[0], reverse=True)
+        return [mem for _, mem in scored[:k]]
+
+    def build_context_preface(
+        self,
+        message: str,
+        session: Any,
+        use_web: bool = False,
+        use_rag: bool = True,
+        use_memory: bool = True,
+        time_filter: Optional[str] = None,
+        preset_system_prompt: Optional[str] = None,
+        owner: Optional[str] = None,
+        character_name: Optional[str] = None,
+        agent_mode: bool = False,
+        incognito: bool = False,
+        use_skills: bool = True,
+    ) -> Tuple[List[Dict[str, str]], List[Dict[str, Any]], List[Dict[str, str]]]:
+        """Build the context preface for LLM calls.
+
+        Returns:
+            Tuple of (preface messages, rag_sources list)
+        """
+        preface = []
+        rag_sources = []
+
+        # Add preset system prompt if specified
+        if preset_system_prompt:
+            preface.append({
+                "role": "system",
+                "content": preset_system_prompt
+            })
+        preface.append({
+            "role": "system",
+            "content": UNTRUSTED_CONTEXT_POLICY,
+        })
+
+        # Memory: pinned (always included) + extended (RAG-retrieved when relevant)
+        self._last_used_memories = []  # track what was injected
+        if use_memory:
+            mem_entries = self.memory_manager.load(owner=owner)
+
+            pinned = [m for m in mem_entries if m.get("pinned")]
+            extended = [m for m in mem_entries if not m.get("pinned")]
+
+            _used_ids: list = []
+            if pinned:
+                pinned_text = "\n- ".join([m["text"] for m in pinned])
+                preface.append(untrusted_context_message(
+                    "saved memory: pinned user facts",
+                    f"Core facts about the user:\n- {pinned_text}",
+                ))
+                for m in pinned:
+                    self._last_used_memories.append({"text": m["text"], "category": m.get("category", "fact"), "type": "pinned"})
+                    if m.get("id"):
+                        _used_ids.append(m["id"])
+
+            if extended:
+                relevant = self._hybrid_retrieve(message, extended, k=3)
+                if relevant:
+                    ext_text = "\n".join([f"- {m['text']}" for m in relevant])
+                    preface.append(untrusted_context_message(
+                        "saved memory: retrieved context",
+                        (
+                            "Memory context. Do not reference unless the user asks "
+                            f"about these topics.\n{ext_text}"
+                        ),
+                    ))
+                    for m in relevant:
+                        self._last_used_memories.append({"text": m["text"], "category": m.get("category", "fact"), "type": "recalled"})
+                        if m.get("id"):
+                            _used_ids.append(m["id"])
+
+            # Bump usage counters for the memories that were actually injected.
+            if _used_ids and hasattr(self.memory_manager, "increment_uses"):
+                try:
+                    self.memory_manager.increment_uses(_used_ids)
+                except Exception as _e:
+                    logger.warning("Failed to increment memory uses: %s", _e)
+
+            # (skills index injection moved out — see below; only fires in
+            # agent mode so chat mode and incognito stay clean.)
+
+        # RAG: search if enabled and rag_manager available, inject only above threshold
+        if use_rag:
+            try:
+                rag_manager = getattr(self.personal_docs_manager, 'rag_manager', None)
+                if rag_manager:
+                    results = rag_manager.search(message, k=5, owner=owner)
+                    # Filter by similarity threshold
+                    relevant = [r for r in results if r.get("similarity", 0) >= self.RAG_SIMILARITY_THRESHOLD]
+                    if relevant:
+                        logger.info(f"RAG: {len(relevant)}/{len(results)} results above threshold {self.RAG_SIMILARITY_THRESHOLD}")
+                        rag_sources = [
+                            {
+                                "filename": r["metadata"].get("filename", r["metadata"].get("source", "unknown")),
+                                "snippet": r["document"][:200],
+                                "similarity": round(r.get("similarity", 0), 3)
+                            }
+                            for r in relevant
+                        ]
+                        rag_content = "Relevant documents:\n\n" + "\n\n---\n\n".join(
+                            f"[{s['filename']}]\n{r['document']}" for s, r in zip(rag_sources, relevant)
+                        )
+                        if len(rag_content) > 10000:
+                            rag_content = rag_content[:10000] + "\n[Truncated]"
+                        preface.append(untrusted_context_message("retrieved documents", rag_content))
+            except Exception as e:
+                logger.warning(f"RAG retrieval failed: {e}")
+
+        # Add web search if enabled
+        web_sources = []
+        if use_web:
+            try:
+                web_context, web_sources = comprehensive_web_search(
+                    message, time_filter=time_filter, return_sources=True
+                )
+                preface.append(untrusted_context_message("web search results", web_context))
+            except Exception as e:
+                logger.error(f"Web search failed: {e}")
+                preface.append({"role": "system", "content": "Web search encountered an error and could not retrieve results."})
+
+        # Process non-YouTube URLs in message (YouTube handled by preprocess_message)
+        # Skip auto-fetch for long pastes (the user already pasted the content —
+        # fetching every embedded link buries the actual question under
+        # hundreds of KB of duplicate page HTML and confuses the model) or for
+        # link-heavy pastes (>3 URLs typically means it's a boilerplate-laden
+        # blog post, not a "summarize this URL" request).
+        urls = extract_urls(message)
+        non_yt_urls = [u for u in urls if not is_youtube_url(u)]
+        skip_url_fetch = len(message) > 2000 or len(non_yt_urls) > 3
+        if not skip_url_fetch:
+            for url in non_yt_urls:
+                result = fetch_webpage_content(url)
+                if result.get('success'):
+                    content = result.get('content', '')[:10000]
+                    preface.append(untrusted_context_message(
+                        f"web page: {url}",
+                        f"Content from {url}:\n\n{content}",
+                    ))
+
+        # Skills index — progressive disclosure. Only injected when the
+        # model has the `manage_skills` tool available (agent_mode), and
+        # never in incognito mode (the user has explicitly opted out of
+        # context retention this turn). In plain chat mode the model can't
+        # call the tool anyway, so the index would be noise.
+        if agent_mode and not incognito and use_skills and self.skills_manager:
+            try:
+                idx = self.skills_manager.index_for(owner=owner)
+            except Exception as e:
+                logger.debug(f"Skills index unavailable: {e}")
+                idx = []
+            if idx:
+                by_cat: Dict[str, list] = {}
+                for s in idx:
+                    by_cat.setdefault(s.get("category") or "general", []).append(s)
+                lines = ["[Available skills — call manage_skills(action='view', name='...') to load one when relevant]"]
+                for cat in sorted(by_cat):
+                    lines.append(f"  {cat}:")
+                    for s in sorted(by_cat[cat], key=lambda x: x["name"]):
+                        desc = s.get("description") or ""
+                        lines.append(f"    - {s['name']}: {desc}" if desc else f"    - {s['name']}")
+                preface.append(untrusted_context_message("available skills index", "\n".join(lines)))
+
+        return preface, rag_sources, web_sources