Odysseus v1.0
This commit is contained in:
114
scripts/index_documents.py
Normal file
114
scripts/index_documents.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
index_documents.py
|
||||
|
||||
A standalone script to index documents from the personal_docs directory
|
||||
into the vector database using RAGManager. This script scans for text files,
|
||||
processes them with proper chunking, and adds them to the vector database
|
||||
with progress reporting and final statistics.
|
||||
|
||||
Features:
|
||||
1. Imports RAGManager from src.rag_manager
|
||||
2. Scans the data/personal_docs directory (recursively) for .txt, .md, .json files
|
||||
3. Reads each file, chunks it (1000 chars with 200 overlap), and adds to vector database
|
||||
4. Shows progress during processing and final statistics
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
# Configure logging for the script
|
||||
# Root logger setup: timestamped records at INFO and above, written to
# stdout instead of logging's default stderr.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
def main() -> int:
    """Index the documents under ``data/personal_docs`` via ``RAGManager``.

    The function imports and constructs ``RAGManager``, lists the ``.txt``,
    ``.md`` and ``.json`` files found recursively under the documents
    directory, hands the directory to ``RAGManager.index_personal_documents``
    and finally logs the reported outcome together with whatever
    ``RAGManager.get_stats`` returns.

    Returns:
        int: Process exit status. ``0`` when indexing reported success or
        there was nothing to index; ``1`` when RAGManager could not be
        imported or constructed, the documents directory is missing, or
        indexing failed.
    """
    # Imported lazily so that a missing package is reported through the
    # logger rather than as an import-time traceback.
    try:
        from src.rag_manager import RAGManager
    except ImportError as e:
        logger.error(f"Failed to import RAGManager: {e}")
        logger.error(
            "Make sure the 'src' package containing rag_manager.py is importable "
            "(e.g. run from the project root with PYTHONPATH=.)"
        )
        return 1
    logger.info("Successfully imported RAGManager")

    # Construction may fail too; report it instead of crashing with a raw
    # traceback.
    try:
        rag_manager = RAGManager()
    except Exception:
        logger.exception("Failed to initialize RAGManager")
        return 1

    # Directory to scan (relative to the current working directory).
    docs_directory = "data/personal_docs"
    directory_path = Path(docs_directory)

    # is_dir() rather than exists(): a regular file at this path is not usable.
    if not directory_path.is_dir():
        logger.error(f"Directory '{docs_directory}' not found!")
        logger.info(f"Please create the directory and add your documents: mkdir -p {docs_directory}")
        return 1

    supported_extensions = {'.txt', '.md', '.json'}
    logger.info(f"Scanning '{docs_directory}' for {', '.join(sorted(supported_extensions))} files...")

    # One recursive walk; directories that merely look like documents
    # (e.g. a folder named "notes.md") are skipped. Sorted for a stable listing.
    files_to_index = sorted(
        path
        for path in directory_path.rglob("*")
        if path.suffix in supported_extensions and path.is_file()
    )

    if not files_to_index:
        logger.warning(f"No supported files found in '{docs_directory}' directory.")
        logger.info("Add .txt, .md, or .json files to the directory and run this script again.")
        return 0

    logger.info(f"Found {len(files_to_index)} files to index:")
    for file_path in files_to_index:
        logger.info(f" - {file_path}")

    logger.info("\nStarting document indexing process...")

    # Only the indexing call itself is guarded here, so that later reporting
    # problems are not mislabelled as indexing failures.
    try:
        result = rag_manager.index_personal_documents(docs_directory)
    except Exception:
        logger.exception("Failed to index documents")
        return 1

    # NOTE(review): result is assumed to be a dict with the keys "success",
    # "indexed_count", "failed_count" and "message", as the lookups below
    # imply; .get() keeps a missing key from raising.
    succeeded = bool(result.get("success"))

    logger.info("\n" + "=" * 50)
    if succeeded:
        logger.info("✅ Document indexing completed successfully!")
        logger.info(f" Indexed {result.get('indexed_count', 0)} document chunks")
        failed_count = result.get("failed_count", 0)
        if failed_count > 0:
            logger.warning(f" Failed to process {failed_count} files")
    else:
        logger.error("❌ Document indexing failed!")
        if "message" in result:
            logger.error(f" Error: {result['message']}")

    logger.info("\n" + "-" * 30)
    logger.info("Database Statistics:")

    # Statistics are informational only: a failure here is logged but does
    # not change the exit status.
    try:
        stats = rag_manager.get_stats()
    except Exception:
        logger.exception(" Failed to retrieve statistics")
    else:
        if "error" not in stats:
            for key, value in stats.items():
                logger.info(f" {key}: {value}")
        else:
            logger.error(f" Failed to retrieve statistics: {stats['error']}")

    logger.info("=" * 50)
    return 0 if succeeded else 1


if __name__ == "__main__":
    # Propagate main()'s status so shells and schedulers can detect failure.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user