115 lines
3.8 KiB
Python
115 lines
3.8 KiB
Python
"""
|
|
index_documents.py

A standalone script that indexes documents from the data/personal_docs
directory into the vector database using RAGManager. The script scans for
text files, hands the directory to RAGManager for chunking and insertion into
the vector database, and reports progress and final statistics.

Features:
1. Imports RAGManager from src.rag_manager
2. Scans the data/personal_docs directory (recursively) for .txt, .md, .json files
3. Delegates reading each file, chunking it (1000 chars with 200 overlap), and
   adding it to the vector database to RAGManager.index_personal_documents
4. Shows progress during processing and final statistics
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
|
|
# Script-wide logging: timestamped records at INFO and above, written to
# stdout (rather than the default stderr) so progress output can be piped.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
|
|
|
|
def _find_supported_files(directory_path, supported_extensions):
    """Return the sorted regular files under *directory_path* with a supported suffix.

    ``Path.rglob`` matches on name only, so a directory called e.g.
    ``notes.md`` would otherwise be reported as a document; such entries are
    filtered out with ``is_file()``.
    """
    matches = []
    for ext in supported_extensions:
        matches.extend(
            candidate
            for candidate in directory_path.rglob(f"*{ext}")
            if candidate.is_file()
        )
    # Sort for a stable, reproducible listing regardless of set iteration order.
    matches.sort()
    return matches


def main():
    """Index the documents under ``data/personal_docs`` via ``RAGManager``.

    Steps, in order:

    1. Import ``RAGManager`` from ``src.rag_manager``; log and return if the
       import fails.
    2. Instantiate ``RAGManager`` and verify that the documents directory
       exists and is a directory.
    3. Recursively list the ``.txt``, ``.md`` and ``.json`` files found, so
       the user can see what is about to be indexed.
    4. Call ``RAGManager.index_personal_documents`` with the directory, log
       the outcome it reports, then log ``RAGManager.get_stats()``.

    Returns:
        None. Every outcome, including failures, is reported through the
        module-level ``logger``.
    """
    # Imported lazily so a missing/broken module yields a readable log message
    # instead of an import-time traceback.
    try:
        from src.rag_manager import RAGManager
        logger.info("Successfully imported RAGManager")
    except ImportError as e:
        logger.error(f"Failed to import RAGManager: {e}")
        logger.error(
            "Make sure the 'src' package containing rag_manager.py is importable "
            "(e.g. run this script from the project root)"
        )
        return

    # Initialize RAGManager
    rag_manager = RAGManager()

    # Directory to scan
    docs_directory = "data/personal_docs"
    directory_path = Path(docs_directory)

    # is_dir() is False both when the path is missing and when it is a
    # regular file, either of which makes scanning impossible.
    if not directory_path.is_dir():
        logger.error(f"Directory '{docs_directory}' not found!")
        # -p because the path is nested: plain mkdir fails if 'data' is missing.
        logger.info(
            f"Please create the directory and add your documents: mkdir -p {docs_directory}"
        )
        return

    # Supported file extensions
    supported_extensions = {'.txt', '.md', '.json'}
    logger.info(
        f"Scanning '{docs_directory}' for {', '.join(sorted(supported_extensions))} files..."
    )

    # This listing is informational only; the actual indexing below is handed
    # the directory path, not this list.
    files_to_index = _find_supported_files(directory_path, supported_extensions)

    if not files_to_index:
        logger.warning(f"No supported files found in '{docs_directory}' directory.")
        logger.info("Add .txt, .md, or .json files to the directory and run this script again.")
        return

    logger.info(f"Found {len(files_to_index)} files to index:")
    for file_path in files_to_index:
        logger.info(f"  - {file_path}")

    # Index the documents
    logger.info("\nStarting document indexing process...")

    try:
        result = rag_manager.index_personal_documents(docs_directory)

        # Display results
        logger.info("\n" + "=" * 50)
        if result["success"]:
            logger.info("✅ Document indexing completed successfully!")
            logger.info(f"   Indexed {result['indexed_count']} document chunks")
            if result.get("failed_count", 0) > 0:
                logger.warning(f"   Failed to process {result['failed_count']} files")
        else:
            logger.error("❌ Document indexing failed!")
            if "message" in result:
                logger.error(f"   Error: {result['message']}")

        # Show final statistics
        logger.info("\n" + "-" * 30)
        logger.info("Database Statistics:")

        stats = rag_manager.get_stats()
        if "error" not in stats:
            for key, value in stats.items():
                logger.info(f"  {key}: {value}")
        else:
            logger.error(f"  Failed to retrieve statistics: {stats['error']}")

        logger.info("=" * 50)

    except Exception as e:
        # Top-level boundary of the script: keep it running to a clean exit,
        # but record the traceback so the failure can be diagnosed.
        logger.exception(f"Failed to index documents: {e}")
|
|
|
|
# Run the indexer only when this file is executed as a script, so importing
# the module (e.g. from tests or another tool) has no side effects.
if __name__ == "__main__":
    main()
|