143 lines
4.2 KiB
Python
143 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
migrate_faiss_to_chroma.py

One-time migration of existing FAISS data to ChromaDB.

Migrates:
- Memory vectors: data/memory_vectors/ -> odysseus_memories collection
- RAG vectors: data/rag/ -> odysseus_rag collection

Vectors are not copied out of the FAISS index files: the stored texts are
re-embedded through the embedding endpoint and the fresh vectors are written
to ChromaDB.

Usage:
    python scripts/migrate_faiss_to_chroma.py

Requires: faiss-cpu, chromadb-client, and the embedding endpoint to be running.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import logging
|
|
|
|
# Make the repository root importable so the ``src`` package resolves when this
# script is run directly; the root is the parent of this file's directory.
_script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(_script_dir))

# Timestamped, level-tagged log lines for the whole migration run.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("migrate")
|
|
|
|
|
|
def migrate_memories():
    """Migrate stored memories into the ``odysseus_memories`` ChromaDB collection.

    Reads the indexed memory ids from ``<DATA_DIR>/memory_vectors/ids.json`` and
    the memory records from ``<DATA_DIR>/memory.json``, re-embeds the text of
    each indexed memory with the project's embedding client, and adds the ids,
    vectors, texts and metadata to the collection in batches of 100.

    Ids that have no record, have no usable text, or repeat an earlier id are
    skipped. Nothing is returned; every outcome (including a missing index or
    a missing embedding client) is reported through ``logger``.
    """
    from src.chroma_client import get_chroma_client
    from src.embeddings import get_embedding_client
    from src.constants import DATA_DIR

    batch_size = 100

    ids_path = os.path.join(DATA_DIR, "memory_vectors", "ids.json")
    memory_path = os.path.join(DATA_DIR, "memory.json")

    if not os.path.exists(ids_path):
        logger.info("No memory FAISS index found, skipping memory migration")
        return

    # Context managers so the file handles are closed deterministically.
    with open(ids_path, encoding="utf-8") as fh:
        ids = json.load(fh)
    if not ids:
        logger.info("Memory FAISS index is empty, skipping")
        return

    # Load memory texts, keyed by memory id.
    memories = {}
    if os.path.exists(memory_path):
        with open(memory_path, encoding="utf-8") as fh:
            for mem in json.load(fh):
                memories[mem.get("id", "")] = mem

    embed = get_embedding_client()
    if not embed:
        logger.error("No embedding client available")
        return

    client = get_chroma_client()
    collection = client.get_or_create_collection(
        name="odysseus_memories",
        metadata={"hnsw:space": "cosine"},
    )

    entry_ids, texts, metas = [], [], []
    seen = set()
    for mid in ids:
        # A repeated id would be sent twice in the same add() call, which
        # ChromaDB is expected to reject; keep the first occurrence only.
        if mid in seen:
            continue
        mem = memories.get(mid)
        if not mem:
            continue
        # The stored text may be missing or null; only real strings are usable.
        text = mem.get("text")
        if not isinstance(text, str):
            continue
        text = text.strip()
        if not text:
            continue
        # A null category falls back to the same default as a missing one.
        category = mem.get("category", "fact")
        if category is None:
            category = "fact"
        seen.add(mid)
        entry_ids.append(mid)
        texts.append(text)
        metas.append({"source": "memory", "category": category})

    if not texts:
        logger.info("No memory entries to migrate")
        return

    vecs = embed.encode(texts, normalize_embeddings=True).tolist()
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        collection.add(
            ids=entry_ids[start:stop],
            embeddings=vecs[start:stop],
            documents=texts[start:stop],
            metadatas=metas[start:stop],
        )
    logger.info("Migrated %d memories to ChromaDB", len(texts))
|
|
|
|
|
|
def migrate_rag():
    """Migrate RAG documents from the DocStore JSON into ``odysseus_rag``.

    Loads ``rag/docs.json`` (expected keys: ``ids``, ``documents`` and
    optionally ``metadatas``), re-embeds the documents with the project's
    embedding client in batches of 100, and adds each batch to the ChromaDB
    collection.

    The file is looked up under ``DATA_DIR`` first, matching
    ``migrate_memories``, and then at the working-directory-relative
    ``data/rag/docs.json`` for backward compatibility. Nothing is returned;
    every outcome is reported through ``logger``.
    """
    from src.chroma_client import get_chroma_client
    from src.embeddings import get_embedding_client
    from src.constants import DATA_DIR

    batch_size = 100

    # Prefer the configured data directory so the script works from any cwd;
    # keep the historical relative path as a fallback.
    candidates = (
        os.path.join(DATA_DIR, "rag", "docs.json"),
        os.path.join("data", "rag", "docs.json"),
    )
    docs_path = next((p for p in candidates if os.path.exists(p)), None)
    if docs_path is None:
        logger.info("No RAG DocStore found, skipping RAG migration")
        return

    # Context manager so the file handle is closed deterministically.
    with open(docs_path, encoding="utf-8") as fh:
        data = json.load(fh)
    ids = data.get("ids") or []
    documents = data.get("documents") or []
    metadatas = data.get("metadatas") or []

    if not ids:
        logger.info("RAG DocStore is empty, skipping")
        return

    # The three lists are sliced in parallel below, so validate their lengths
    # up front rather than failing after some batches have been written.
    if len(documents) != len(ids):
        logger.error(
            "RAG DocStore is inconsistent: %d ids but %d documents",
            len(ids), len(documents),
        )
        return
    if metadatas and len(metadatas) != len(ids):
        logger.error(
            "RAG DocStore is inconsistent: %d ids but %d metadatas",
            len(ids), len(metadatas),
        )
        return

    embed = get_embedding_client()
    if not embed:
        logger.error("No embedding client available")
        return

    client = get_chroma_client()
    collection = client.get_or_create_collection(
        name="odysseus_rag",
        metadata={"hnsw:space": "cosine"},
    )

    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        batch_docs = documents[start:stop]
        vecs = embed.encode(batch_docs, normalize_embeddings=True).tolist()
        collection.add(
            ids=ids[start:stop],
            embeddings=vecs,
            documents=batch_docs,
            # No stored metadata at all: omit it instead of sending an empty
            # list whose length would not match the ids.
            metadatas=metadatas[start:stop] if metadatas else None,
        )

    logger.info("Migrated %d RAG chunks to ChromaDB", len(ids))
|
|
|
|
|
|
def _run_migration():
    """Call ``load_dotenv()``, then run the memory and RAG migrations in order."""
    from dotenv import load_dotenv

    load_dotenv()

    logger.info("Starting FAISS -> ChromaDB migration")
    migrate_memories()
    migrate_rag()
    logger.info("Migration complete")


if __name__ == "__main__":
    _run_migration()
|