Odysseus v1.0

2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
--- a/src/memory_vector.py
+++ b/src/memory_vector.py
@@ -0,0 +1,175 @@
+"""
+memory_vector.py
+
+ChromaDB-backed vector store for memory entries.
+Shares the EmbeddingClient with RAG to save memory.
+Stores pre-computed embeddings (ChromaDB does not manage embedding).
+"""
+
+import logging
+from typing import List, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryVectorStore:
+    """Vector index over memory entries for semantic retrieval."""
+
+    COLLECTION_NAME = "odysseus_memories"
+
+    def __init__(self, data_dir: str, embedding_model=None):
+        self._model = embedding_model
+        self._collection = None
+        self._healthy = False
+
+        self._initialize()
+
+    def _initialize(self):
+        try:
+            from src.chroma_client import get_chroma_client
+
+            if self._model is None:
+                from src.embeddings import get_embedding_client
+                self._model = get_embedding_client()
+                if self._model is None:
+                    raise RuntimeError("No embedding backend available")
+                logger.info(f"MemoryVectorStore using embeddings: {self._model.url}")
+
+            client = get_chroma_client()
+            self._collection = client.get_or_create_collection(
+                name=self.COLLECTION_NAME,
+                metadata={"hnsw:space": "cosine"},
+            )
+
+            self._healthy = True
+            count = self._collection.count()
+            logger.info(f"MemoryVectorStore ready (entries={count})")
+
+        except Exception as e:
+            logger.error(f"MemoryVectorStore init failed: {e}")
+
+    @property
+    def healthy(self) -> bool:
+        return self._healthy
+
+    def _embed(self, texts: List[str]) -> List[List[float]]:
+        vecs = self._model.encode(texts, normalize_embeddings=True)
+        return vecs.tolist()
+
+    def count(self) -> int:
+        """Return the number of stored vectors."""
+        if not self._healthy:
+            return 0
+        return self._collection.count()
+
+    def add(self, memory_id: str, text: str):
+        """Add a single memory entry to the vector index."""
+        if not self._healthy:
+            return
+        # Skip if already exists
+        existing = self._collection.get(ids=[memory_id])
+        if existing["ids"]:
+            return
+        embeddings = self._embed([text])
+        self._collection.add(
+            ids=[memory_id],
+            embeddings=embeddings,
+            documents=[text],
+            metadatas=[{"source": "memory"}],
+        )
+
+    def remove(self, memory_id: str):
+        """Remove a memory entry. O(1) — no rebuild needed."""
+        if not self._healthy:
+            return
+        try:
+            self._collection.delete(ids=[memory_id])
+        except Exception as e:
+            logger.warning(f"memory remove {memory_id}: {e}")
+
+    def search(self, query: str, k: int = 8) -> List[Dict]:
+        """Search for the most relevant memory IDs by semantic similarity.
+        Returns list of {"memory_id": str, "score": float}.
+
+        ChromaDB cosine distance = 1 - cosine_similarity.
+        We convert back: similarity = 1.0 - distance.
+        """
+        if not self._healthy or self._collection.count() == 0:
+            return []
+
+        embeddings = self._embed([query])
+        actual_k = min(k, self._collection.count())
+        results = self._collection.query(
+            query_embeddings=embeddings,
+            n_results=actual_k,
+        )
+
+        out = []
+        for idx, mid in enumerate(results["ids"][0]):
+            distance = results["distances"][0][idx]
+            out.append({
+                "memory_id": mid,
+                "score": round(1.0 - distance, 4),
+            })
+        return out
+
+    def find_similar(self, text: str, threshold: float = 0.92) -> Optional[str]:
+        """Check if a near-duplicate exists. Returns memory_id if found, else None."""
+        if not self._healthy or self._collection.count() == 0:
+            return None
+
+        embeddings = self._embed([text])
+        results = self._collection.query(
+            query_embeddings=embeddings,
+            n_results=1,
+        )
+
+        if results["ids"][0]:
+            distance = results["distances"][0][0]
+            similarity = 1.0 - distance
+            if similarity >= threshold:
+                return results["ids"][0][0]
+        return None
+
+    def rebuild(self, memories: List[Dict]):
+        """Rebuild the entire index from a list of memory entries.
+        Each entry must have 'id' and 'text' keys."""
+        if not self._healthy:
+            return
+
+        from src.chroma_client import get_chroma_client
+
+        # Delete and recreate collection for a clean rebuild
+        client = get_chroma_client()
+        try:
+            client.delete_collection(self.COLLECTION_NAME)
+        except Exception:
+            pass
+        self._collection = client.get_or_create_collection(
+            name=self.COLLECTION_NAME,
+            metadata={"hnsw:space": "cosine"},
+        )
+
+        texts = []
+        ids = []
+        for mem in memories:
+            text = mem.get("text", "").strip()
+            mid = mem.get("id", "")
+            if text and mid:
+                texts.append(text)
+                ids.append(mid)
+
+        if texts:
+            # Batch in chunks of 100 to avoid oversized requests
+            for i in range(0, len(texts), 100):
+                batch_texts = texts[i:i + 100]
+                batch_ids = ids[i:i + 100]
+                embeddings = self._embed(batch_texts)
+                self._collection.add(
+                    ids=batch_ids,
+                    embeddings=embeddings,
+                    documents=batch_texts,
+                    metadatas=[{"source": "memory"}] * len(batch_ids),
+                )
+
+        logger.info(f"MemoryVectorStore rebuilt with {len(ids)} entries")