""" index_documents.py A standalone script to index documents from the personal_docs directory into the vector database using RAGManager. This script scans for text files, processes them with proper chunking, and adds them to the vector database with progress reporting and final statistics. Features: 1. Imports RAGManager from rag_manager 2. Scans personal_docs directory for .txt, .md, .json files 3. Reads each file, chunks it (1000 chars with 200 overlap), and adds to vector database 4. Shows progress during processing and final statistics """ import os import logging import sys from pathlib import Path from typing import List, Tuple # Configure logging for the script logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(sys.stdout) ] ) logger = logging.getLogger(__name__) def main(): """Main function to index documents from personal_docs directory.""" # Import RAGManager try: from src.rag_manager import RAGManager logger.info("Successfully imported RAGManager") except ImportError as e: logger.error(f"Failed to import RAGManager: {e}") logger.error("Make sure rag_manager.py is in the same directory and accessible") return # Initialize RAGManager rag_manager = RAGManager() # Directory to scan docs_directory = "data/personal_docs" directory_path = Path(docs_directory) # Check if directory exists if not directory_path.exists(): logger.error(f"Directory '{docs_directory}' not found!") logger.info(f"Please create the directory and add your documents: mkdir {docs_directory}") return # Supported file extensions supported_extensions = {'.txt', '.md', '.json'} logger.info(f"Scanning '{docs_directory}' for {', '.join(sorted(supported_extensions))} files...") # Find all supported files files_to_index = [] for ext in supported_extensions: files_to_index.extend(directory_path.rglob(f"*{ext}")) # Sort files for consistent processing files_to_index.sort() if not files_to_index: logger.warning(f"No supported files found in '{docs_directory}' directory.") logger.info("Add .txt, .md, or .json files to the directory and run this script again.") return logger.info(f"Found {len(files_to_index)} files to index:") for file_path in files_to_index: logger.info(f" - {file_path}") # Index the documents logger.info("\nStarting document indexing process...") try: result = rag_manager.index_personal_documents(docs_directory) # Display results logger.info("\n" + "="*50) if result["success"]: logger.info("✅ Document indexing completed successfully!") logger.info(f" Indexed {result['indexed_count']} document chunks") if result.get("failed_count", 0) > 0: logger.warning(f" Failed to process {result['failed_count']} files") else: logger.error("❌ Document indexing failed!") if "message" in result: logger.error(f" Error: {result['message']}") # Show final statistics logger.info("\n" + "-"*30) logger.info("Database Statistics:") stats = rag_manager.get_stats() if "error" not in stats: for key, value in stats.items(): logger.info(f" {key}: {value}") else: logger.error(f" Failed to retrieve statistics: {stats['error']}") logger.info("="*50) except Exception as e: logger.error(f"Failed to index documents: {e}") return if __name__ == "__main__": main()