Add optional markitdown extraction for Office/EPUB documents (#766)
Office documents were dropped server-side: .docx fell through to "[Attached document file]", .xlsx/.pptx weren't recognized at all, and the personal-docs RAG index only covered txt/md/json/pdf. Wire the optional markitdown dependency (MIT, Microsoft) into both the chat-attachment path (build_user_content) and the RAG indexer (personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown. It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py): without it those formats show an "install to extract" banner and the MIT core is unaffected. pypdf stays the default PDF path. - src/markitdown_runtime.py: optional-dep loader + convert_to_markdown - upload_handler: recognize Office/EPUB extensions + MIME types - document_processor: extract Office docs in the chat else-branch - personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch) - requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5 - tests: markitdown_runtime + office index coverage Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
610968f91e
commit
f58fbc8b85
@@ -152,6 +152,44 @@ def _process_pdf(path: str) -> str:
|
||||
return f"\n\n[PDF processing failed: {str(e)}]"
|
||||
|
||||
|
||||
def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
|
||||
"""Cap inline document text so a huge file can't blow the model's context."""
|
||||
text = (text or "").strip()
|
||||
if len(text) > limit:
|
||||
return text[:limit], "\n[…truncated for inline context.]"
|
||||
return text, ""
|
||||
|
||||
|
||||
def _process_office_document(path: str, display_name: str) -> str:
|
||||
"""Extract an Office/EPUB document to Markdown via the optional markitdown dep.
|
||||
|
||||
Falls back to a friendly banner when markitdown is unavailable or finds no
|
||||
text, so a missing optional dependency never breaks the chat path.
|
||||
"""
|
||||
from src.markitdown_runtime import (
|
||||
is_markitdown_format,
|
||||
convert_to_markdown,
|
||||
load_markitdown,
|
||||
)
|
||||
|
||||
if not is_markitdown_format(path):
|
||||
return "\n\n[Attached document file]"
|
||||
|
||||
markdown = convert_to_markdown(path)
|
||||
if markdown and markdown.strip():
|
||||
title = os.path.splitext(os.path.basename(path))[0]
|
||||
body, marker = _truncate_inline(markdown)
|
||||
return f"\n\n[Document content — {title}]:\n{body}{marker}"
|
||||
|
||||
# No content: tell the user whether to install the optional dep or whether
|
||||
# the document simply had no extractable text.
|
||||
try:
|
||||
load_markitdown()
|
||||
return f"\n\n[Attached document: {display_name} — no extractable text found.]"
|
||||
except RuntimeError as exc:
|
||||
return f"\n\n[Attached document: {display_name} — {exc}]"
|
||||
|
||||
|
||||
def _load_vl_settings() -> dict:
|
||||
"""Load admin settings from disk."""
|
||||
try:
|
||||
@@ -429,7 +467,7 @@ def build_user_content(
|
||||
elif mime.startswith("text/") or _is_text_file(path):
|
||||
extracted_text = _process_text_file(path)
|
||||
else:
|
||||
extracted_text = "\n\n[Attached document file]"
|
||||
extracted_text = _process_office_document(path, display_name)
|
||||
|
||||
if content and content[0]["type"] == "text":
|
||||
content[0]["text"] += extracted_text
|
||||
|
||||
Reference in New Issue
Block a user