Office documents were dropped server-side: .docx fell through to "[Attached document file]", .xlsx/.pptx weren't recognized at all, and the personal-docs RAG index only covered txt/md/json/pdf. Wire the optional markitdown dependency (MIT, Microsoft) into both the chat-attachment path (build_user_content) and the RAG indexer (personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown. It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py): without it those formats show an "install to extract" banner and the MIT core is unaffected. pypdf stays the default PDF path. - src/markitdown_runtime.py: optional-dep loader + convert_to_markdown - upload_handler: recognize Office/EPUB extensions + MIME types - document_processor: extract Office docs in the chat else-branch - personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch) - requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5 - tests: markitdown_runtime + office index coverage Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
61 lines
2.2 KiB
Python
61 lines
2.2 KiB
Python
"""Helpers for the optional markitdown document-extraction dependency.
|
|
|
|
markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is
|
|
more token-efficient and model-legible than a raw text dump. It is **optional**:
|
|
install with `pip install -r requirements-optional.txt`. When absent, callers
|
|
degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT
|
|
core never hard-depends on it. Mirrors the optional-dependency pattern in
|
|
`src/pdf_runtime.py`.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
MARKITDOWN_MISSING = (
|
|
"Office/EPUB document extraction requires markitdown. Install optional "
|
|
"dependencies with `pip install -r requirements-optional.txt`."
|
|
)
|
|
|
|
# Formats routed through markitdown. PDFs stay on pypdf (src/document_processor
|
|
# and src/personal_docs); plain text/code/csv/json/markdown/html stay on the
|
|
# cheaper built-in text path. These are the formats currently dropped entirely.
|
|
MARKITDOWN_EXTS = frozenset({".docx", ".pptx", ".xlsx", ".xls", ".epub"})
|
|
|
|
|
|
def is_markitdown_format(path: str) -> bool:
|
|
"""True if the file extension is one we route through markitdown."""
|
|
return os.path.splitext(path)[1].lower() in MARKITDOWN_EXTS
|
|
|
|
|
|
def load_markitdown():
|
|
"""Return the MarkItDown class, or raise a user-facing setup hint."""
|
|
try:
|
|
from markitdown import MarkItDown # optional dependency
|
|
except ImportError as exc:
|
|
raise RuntimeError(MARKITDOWN_MISSING) from exc
|
|
return MarkItDown
|
|
|
|
|
|
def convert_to_markdown(path: str) -> str | None:
|
|
"""Convert a document to Markdown text via markitdown.
|
|
|
|
Returns the extracted Markdown, or ``None`` if markitdown is unavailable or
|
|
the conversion fails — callers degrade gracefully rather than erroring.
|
|
"""
|
|
try:
|
|
markitdown_cls = load_markitdown()
|
|
except RuntimeError:
|
|
logger.warning("markitdown not installed; cannot extract %s", path)
|
|
return None
|
|
try:
|
|
result = markitdown_cls().convert(path)
|
|
text = getattr(result, "text_content", None)
|
|
if text is None:
|
|
text = getattr(result, "markdown", None)
|
|
return text
|
|
except Exception as e:
|
|
logger.warning("markitdown failed to convert %s: %s", path, e)
|
|
return None
|