Add optional markitdown extraction for Office/EPUB documents (#766)

Office documents were dropped server-side: .docx fell through to
"[Attached document file]", .xlsx/.pptx weren't recognized at all, and
the personal-docs RAG index only covered txt/md/json/pdf.

Wire the optional markitdown dependency (MIT, Microsoft) into both the
chat-attachment path (build_user_content) and the RAG indexer
(personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown.
It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py):
without it those formats show an "install to extract" banner and the
MIT core is unaffected. pypdf stays the default PDF path.

- src/markitdown_runtime.py: optional-dep loader + convert_to_markdown
- upload_handler: recognize Office/EPUB extensions + MIME types
- document_processor: extract Office docs in the chat else-branch
- personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch)
- requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5
- tests: markitdown_runtime + office index coverage

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Marius Oppedal Ringsby
2026-06-02 04:28:52 +02:00
committed by GitHub
parent 610968f91e
commit f58fbc8b85
8 changed files with 241 additions and 4 deletions

View File

@@ -0,0 +1,75 @@
import builtins
import pytest
from src.markitdown_runtime import (
MARKITDOWN_MISSING,
MARKITDOWN_EXTS,
is_markitdown_format,
load_markitdown,
convert_to_markdown,
)
def _block_markitdown_import(monkeypatch):
    """Simulate a missing ``markitdown`` package.

    Replaces ``builtins.__import__`` (installed through
    ``monkeypatch.setattr``) with a wrapper that raises ``ImportError`` when
    the requested module name is exactly ``"markitdown"`` and forwards every
    other import, with its arguments untouched, to the original hook.
    """
    original_import = builtins.__import__

    def _guarded_import(name, *args, **kwargs):
        # Everything except the blocked module goes through the real hook.
        if name != "markitdown":
            return original_import(name, *args, **kwargs)
        raise ImportError("No module named markitdown")

    monkeypatch.setattr(builtins, "__import__", _guarded_import)
def test_missing_dependency_error_is_user_actionable(monkeypatch):
    """With the import blocked, ``load_markitdown`` raises a helpful RuntimeError.

    The error text must equal ``MARKITDOWN_MISSING`` and must mention
    ``requirements-optional.txt``.
    """
    _block_markitdown_import(monkeypatch)

    with pytest.raises(RuntimeError) as raised:
        load_markitdown()

    text = str(raised.value)
    # Exact match against the shared constant first, then the actionable hint.
    assert text == MARKITDOWN_MISSING
    assert "requirements-optional.txt" in text
def test_convert_returns_none_when_dependency_missing(monkeypatch):
    """With the import blocked, ``convert_to_markdown`` returns ``None``."""
    _block_markitdown_import(monkeypatch)

    outcome = convert_to_markdown("whatever.docx")

    assert outcome is None
def test_convert_returns_none_on_conversion_failure(monkeypatch):
    """A converter whose ``convert`` raises makes ``convert_to_markdown`` return ``None``."""

    class _ExplodingConverter:
        """Stand-in converter that always fails with ``ValueError``."""

        def convert(self, path):
            raise ValueError("bad file")

    def _fake_loader():
        # The loader hands back the class itself, not an instance.
        return _ExplodingConverter

    monkeypatch.setattr("src.markitdown_runtime.load_markitdown", _fake_loader)

    outcome = convert_to_markdown("anything.docx")
    assert outcome is None
def test_is_markitdown_format():
    """``is_markitdown_format`` accepts Office names and rejects PDF/Markdown names."""
    accepted = (
        "report.docx",
        "/path/to/Sheet.XLSX",  # case-insensitive
    )
    rejected = (
        "notes.pdf",  # PDFs stay on pypdf
        "readme.md",  # text stays on the text path
    )

    for filename in accepted:
        assert is_markitdown_format(filename)
    for filename in rejected:
        assert not is_markitdown_format(filename)
def test_markitdown_exts_cover_dropped_office_formats():
    """Each of .docx, .pptx, .xlsx and .xls is a member of ``MARKITDOWN_EXTS``."""
    required_exts = (".docx", ".pptx", ".xlsx", ".xls")
    assert all(ext in MARKITDOWN_EXTS for ext in required_exts)
def test_convert_extracts_real_docx(tmp_path):
    """End-to-end: a generated .docx converts to Markdown that keeps its heading.

    Uses ``pytest.importorskip`` for both ``markitdown`` and ``docx``, so the
    test only runs when those packages can be imported.
    """
    pytest.importorskip("markitdown")
    docx = pytest.importorskip("docx")

    docx_file = tmp_path / "report.docx"

    # Build a minimal document: one level-1 heading and one paragraph.
    document = docx.Document()
    document.add_heading("Quarterly Report", level=1)
    document.add_paragraph("Revenue grew across all regions.")
    document.save(str(docx_file))

    markdown = convert_to_markdown(str(docx_file))

    assert markdown and "Quarterly Report" in markdown
    # docx heading styles become Markdown headings
    assert "#" in markdown