Skip invalid FAISS migration JSON (#1547)
This commit is contained in:
@@ -26,6 +26,39 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(mess
|
|||||||
logger = logging.getLogger("migrate")
|
logger = logging.getLogger("migrate")
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json(path, default):
|
||||||
|
try:
|
||||||
|
with open(path, encoding="utf-8") as f:
|
||||||
|
return json.load(f)
|
||||||
|
except (OSError, json.JSONDecodeError):
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _memory_map(rows):
|
||||||
|
memories = {}
|
||||||
|
if not isinstance(rows, list):
|
||||||
|
return memories
|
||||||
|
for row in rows:
|
||||||
|
if not isinstance(row, dict):
|
||||||
|
continue
|
||||||
|
memory_id = row.get("id", "")
|
||||||
|
if memory_id:
|
||||||
|
memories[memory_id] = row
|
||||||
|
return memories
|
||||||
|
|
||||||
|
|
||||||
|
def _rag_docstore(data):
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return [], [], []
|
||||||
|
ids = data.get("ids", [])
|
||||||
|
documents = data.get("documents", [])
|
||||||
|
metadatas = data.get("metadatas", [])
|
||||||
|
if not isinstance(ids, list) or not isinstance(documents, list) or not isinstance(metadatas, list):
|
||||||
|
return [], [], []
|
||||||
|
count = min(len(ids), len(documents), len(metadatas))
|
||||||
|
return ids[:count], documents[:count], metadatas[:count]
|
||||||
|
|
||||||
|
|
||||||
def migrate_memories():
|
def migrate_memories():
|
||||||
"""Migrate memory vectors from FAISS to ChromaDB."""
|
"""Migrate memory vectors from FAISS to ChromaDB."""
|
||||||
from src.chroma_client import get_chroma_client
|
from src.chroma_client import get_chroma_client
|
||||||
@@ -39,7 +72,9 @@ def migrate_memories():
|
|||||||
logger.info("No memory FAISS index found, skipping memory migration")
|
logger.info("No memory FAISS index found, skipping memory migration")
|
||||||
return
|
return
|
||||||
|
|
||||||
ids = json.loads(open(ids_path, encoding="utf-8").read())
|
ids = _load_json(ids_path, [])
|
||||||
|
if not isinstance(ids, list):
|
||||||
|
ids = []
|
||||||
if not ids:
|
if not ids:
|
||||||
logger.info("Memory FAISS index is empty, skipping")
|
logger.info("Memory FAISS index is empty, skipping")
|
||||||
return
|
return
|
||||||
@@ -47,8 +82,7 @@ def migrate_memories():
|
|||||||
# Load memory texts
|
# Load memory texts
|
||||||
memories = {}
|
memories = {}
|
||||||
if os.path.exists(memory_path):
|
if os.path.exists(memory_path):
|
||||||
for mem in json.loads(open(memory_path, encoding="utf-8").read()):
|
memories = _memory_map(_load_json(memory_path, []))
|
||||||
memories[mem.get("id", "")] = mem
|
|
||||||
|
|
||||||
embed = get_embedding_client()
|
embed = get_embedding_client()
|
||||||
if not embed:
|
if not embed:
|
||||||
@@ -97,10 +131,7 @@ def migrate_rag():
|
|||||||
logger.info("No RAG DocStore found, skipping RAG migration")
|
logger.info("No RAG DocStore found, skipping RAG migration")
|
||||||
return
|
return
|
||||||
|
|
||||||
data = json.loads(open(docs_path, encoding="utf-8").read())
|
ids, documents, metadatas = _rag_docstore(_load_json(docs_path, {}))
|
||||||
ids = data.get("ids", [])
|
|
||||||
documents = data.get("documents", [])
|
|
||||||
metadatas = data.get("metadatas", [])
|
|
||||||
|
|
||||||
if not ids:
|
if not ids:
|
||||||
logger.info("RAG DocStore is empty, skipping")
|
logger.info("RAG DocStore is empty, skipping")
|
||||||
|
|||||||
36
tests/test_migrate_faiss_to_chroma.py
Normal file
36
tests/test_migrate_faiss_to_chroma.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import importlib.util
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
ROOT = Path(__file__).resolve().parents[1]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_module():
    """Load ``scripts/migrate_faiss_to_chroma.py`` from disk and return it as a module."""
    script_path = ROOT / "scripts" / "migrate_faiss_to_chroma.py"
    module_spec = importlib.util.spec_from_file_location(
        "migrate_faiss_to_chroma", script_path
    )
    loaded = importlib.util.module_from_spec(module_spec)
    # Execute the script's top-level code so its helpers become attributes.
    module_spec.loader.exec_module(loaded)
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
def test_memory_map_skips_invalid_rows():
    """Only dict rows carrying a non-empty id end up in the memory map."""
    rows = [
        {"id": "m1", "text": "hello"},
        "bad-row",
        None,
        {"text": "missing id"},
    ]

    result = _load_module()._memory_map(rows)

    assert result == {"m1": {"id": "m1", "text": "hello"}}
|
||||||
|
|
||||||
|
|
||||||
|
def test_rag_docstore_requires_matching_lists():
    """Non-dict input or non-list fields yield empties; uneven lists are trimmed."""
    rag_docstore = _load_module()._rag_docstore
    nothing = ([], [], [])

    # A top-level value that is not a dict is rejected outright.
    assert rag_docstore([]) == nothing

    # One field of the wrong type invalidates the whole docstore.
    wrong_type = {"ids": ["a"], "documents": ["doc"], "metadatas": "bad"}
    assert rag_docstore(wrong_type) == nothing

    # Lists of different lengths are cut down to the shortest one.
    uneven = {
        "ids": ["a", "b"],
        "documents": ["doc"],
        "metadatas": [{"source": "x"}, {"source": "y"}],
    }
    assert rag_docstore(uneven) == (["a"], ["doc"], [{"source": "x"}])
|
||||||
Reference in New Issue
Block a user