#!/usr/bin/env python3
"""
migrate_faiss_to_chroma.py

One-time migration of existing FAISS data to ChromaDB.

Migrates:
  - Memory vectors: data/memory_vectors/ -> odysseus_memories collection
  - RAG vectors:    data/rag/            -> odysseus_rag collection

Usage:
    python scripts/migrate_faiss_to_chroma.py

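Requires: chromadb-client, python-dotenv, and the embedding endpoint to be running.
(faiss-cpu is not imported by this script itself; vectors are re-computed from the stored texts.)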
"""

import json
import os
import sys
import logging

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("migrate")


def migrate_memories(batch_size: int = 100) -> None:
    """Migrate memory entries listed in the FAISS id index into ChromaDB.

    Reads ``<DATA_DIR>/memory_vectors/ids.json`` for the ids to migrate and
    ``<DATA_DIR>/memory.json`` for the memory records, re-embeds each
    record's text with the embedding client, and writes ids, embeddings,
    texts and metadata to the ``odysseus_memories`` collection.

    Ids that have no matching record, or whose record has no usable text,
    are skipped. Entries are written with ``upsert`` so the migration can
    be re-run after a partial failure.

    Args:
        batch_size: Number of entries embedded and written per request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Project imports stay local so they run only after the entry point has
    # had a chance to load the environment.
    from src.chroma_client import get_chroma_client
    from src.embeddings import get_embedding_client
    from src.constants import DATA_DIR

    ids_path = os.path.join(DATA_DIR, "memory_vectors", "ids.json")
    memory_path = os.path.join(DATA_DIR, "memory.json")

    if not os.path.exists(ids_path):
        logger.info("No memory FAISS index found, skipping memory migration")
        return

    with open(ids_path, encoding="utf-8") as fh:
        ids = json.load(fh)
    if not ids:
        logger.info("Memory FAISS index is empty, skipping")
        return

    # Index the memory records by id so each indexed id can be resolved.
    memories = {}
    if os.path.exists(memory_path):
        with open(memory_path, encoding="utf-8") as fh:
            for mem in json.load(fh):
                memories[mem.get("id", "")] = mem

    embed = get_embedding_client()
    if not embed:
        logger.error("No embedding client available")
        return

    client = get_chroma_client()
    collection = client.get_or_create_collection(
        name="odysseus_memories",
        metadata={"hnsw:space": "cosine"},
    )

    # dict.fromkeys drops repeated ids while keeping first-seen order, so a
    # single request never carries the same id twice.
    unique_ids = list(dict.fromkeys(ids))

    entry_ids, texts, metas = [], [], []
    for mid in unique_ids:
        mem = memories.get(mid)
        if not mem:
            continue
        raw_text = mem.get("text")
        # A null / non-string "text" value is treated like missing text.
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            continue
        category = mem.get("category")
        if category is None:
            # Covers both a missing key and an explicit null.
            category = "fact"
        entry_ids.append(mid)
        texts.append(text)
        metas.append({"source": "memory", "category": category})

    if not texts:
        logger.info("No memory entries to migrate")
        return

    skipped = len(unique_ids) - len(texts)
    if skipped:
        logger.info("Skipped %d indexed ids without usable text", skipped)

    # Embed and write batch by batch so only one batch of vectors is held
    # in memory at a time.
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        chunk_texts = texts[start:stop]
        vecs = embed.encode(chunk_texts, normalize_embeddings=True).tolist()
        collection.upsert(
            ids=entry_ids[start:stop],
            embeddings=vecs,
            documents=chunk_texts,
            metadatas=metas[start:stop],
        )
    logger.info("Migrated %d memories to ChromaDB", len(texts))


def migrate_rag(batch_size: int = 100) -> None:
    """Migrate RAG chunks from the JSON DocStore into ChromaDB.

    Reads ``<DATA_DIR>/rag/docs.json``, which is expected to hold parallel
    ``ids``, ``documents`` and (optionally) ``metadatas`` lists, re-embeds
    the documents batch by batch, and writes them to the ``odysseus_rag``
    collection. Entries are written with ``upsert`` so the migration can be
    re-run after a partial failure.

    Nothing is written when the lists do not line up; the problem is logged
    instead.

    Args:
        batch_size: Number of chunks embedded and written per request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    from src.chroma_client import get_chroma_client
    from src.embeddings import get_embedding_client
    from src.constants import DATA_DIR

    # Resolve against DATA_DIR (as the memory migration does) rather than a
    # path relative to the current working directory.
    docs_path = os.path.join(DATA_DIR, "rag", "docs.json")
    if not os.path.exists(docs_path):
        logger.info("No RAG DocStore found, skipping RAG migration")
        return

    with open(docs_path, encoding="utf-8") as fh:
        data = json.load(fh)
    ids = data.get("ids", [])
    documents = data.get("documents", [])
    metadatas = data.get("metadatas", [])

    if not ids:
        logger.info("RAG DocStore is empty, skipping")
        return

    # The lists are sliced in lockstep below, so they must be the same length.
    if len(documents) != len(ids):
        logger.error(
            "RAG DocStore is inconsistent: %d ids but %d documents",
            len(ids),
            len(documents),
        )
        return
    if metadatas and len(metadatas) != len(ids):
        logger.error(
            "RAG DocStore is inconsistent: %d ids but %d metadatas",
            len(ids),
            len(metadatas),
        )
        return

    embed = get_embedding_client()
    if not embed:
        logger.error("No embedding client available")
        return

    client = get_chroma_client()
    collection = client.get_or_create_collection(
        name="odysseus_rag",
        metadata={"hnsw:space": "cosine"},
    )

    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        batch_docs = documents[start:stop]
        vecs = embed.encode(batch_docs, normalize_embeddings=True).tolist()
        collection.upsert(
            ids=ids[start:stop],
            embeddings=vecs,
            documents=batch_docs,
            # No stored metadata at all: omit the argument instead of
            # passing an empty list that cannot match the ids.
            metadatas=metadatas[start:stop] if metadatas else None,
        )

    logger.info("Migrated %d RAG chunks to ChromaDB", len(ids))


if __name__ == "__main__":
    # Script entry point: load variables from .env first, then run each
    # migration step in order. The project modules are imported inside the
    # step functions, i.e. only after the environment has been loaded.
    import dotenv

    dotenv.load_dotenv()

    logger.info("Starting FAISS -> ChromaDB migration")
    for run_step in (migrate_memories, migrate_rag):
        run_step()
    logger.info("Migration complete")