Add optional markitdown extraction for Office/EPUB documents (#766)
Office documents were dropped server-side: .docx fell through to "[Attached document file]", .xlsx/.pptx weren't recognized at all, and the personal-docs RAG index only covered txt/md/json/pdf.

Wire the optional markitdown dependency (MIT, Microsoft) into both the chat-attachment path (build_user_content) and the RAG indexer (personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown. It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py): without it those formats show an "install to extract" banner and the MIT core is unaffected. pypdf stays the default PDF path.

- src/markitdown_runtime.py: optional-dep loader + convert_to_markdown
- upload_handler: recognize Office/EPUB extensions + MIME types
- document_processor: extract Office docs in the chat else-branch
- personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch)
- requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5
- tests: markitdown_runtime + office index coverage

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
610968f91e
commit
f58fbc8b85
@@ -128,7 +128,8 @@ class UploadHandler:
|
||||
def is_document_file(self, filename: str, content_type: str = None) -> bool:
|
||||
"""Check if a file is a document based on extension or content type."""
|
||||
document_extensions = {
|
||||
'.pdf', '.docx', '.txt', '.py', '.js', '.html', '.htm',
|
||||
'.pdf', '.docx', '.xlsx', '.pptx', '.xls', '.epub',
|
||||
'.txt', '.py', '.js', '.html', '.htm',
|
||||
'.css', '.json', '.md', '.csv', '.log', '.xml', '.yml',
|
||||
'.yaml', '.sql', '.sh', '.bash', '.c', '.cpp', '.h',
|
||||
'.java', '.go', '.rs', '.php', '.rb', '.ts', '.jsx', '.tsx'
|
||||
@@ -136,6 +137,10 @@ class UploadHandler:
|
||||
document_mime_types = {
|
||||
'application/pdf',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/vnd.ms-excel',
|
||||
'application/epub+zip',
|
||||
'text/plain'
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user