Add optional markitdown extraction for Office/EPUB documents (#766)

Office documents were dropped server-side: .docx fell through to "[Attached document file]", .xlsx/.pptx weren't recognized at all, and the personal-docs RAG index only covered txt/md/json/pdf. Wire the optional markitdown dependency (MIT, Microsoft) into both the chat-attachment path (build_user_content) and the RAG indexer (personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown. It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py): without it those formats show an "install to extract" banner and the MIT core is unaffected. pypdf stays the default PDF path.

- src/markitdown_runtime.py: optional-dep loader + convert_to_markdown
- upload_handler: recognize Office/EPUB extensions + MIME types
- document_processor: extract Office docs in the chat else-branch
- personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch)
- requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5
- tests: markitdown_runtime + office index coverage

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 04:28:52 +02:00
parent 610968f91e
commit f58fbc8b85
8 changed files with 241 additions and 4 deletions
--- a/tests/test_markitdown_runtime.py
+++ b/tests/test_markitdown_runtime.py
@@ -0,0 +1,75 @@
+import builtins
+
+import pytest
+
+from src.markitdown_runtime import (
+    MARKITDOWN_MISSING,
+    MARKITDOWN_EXTS,
+    is_markitdown_format,
+    load_markitdown,
+    convert_to_markdown,
+)
+
+
+def _block_markitdown_import(monkeypatch):  # helper: make `import markitdown` raise ImportError
+    real_import = builtins.__import__  # keep the genuine importer so every other import still works
+
+    def fake_import(name, *args, **kwargs):
+        if name == "markitdown":  # exact name match only; dotted names such as "markitdown.x" pass through
+            raise ImportError("No module named markitdown")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", fake_import)  # install the filtering importer via the monkeypatch argument
+
+
+def test_missing_dependency_error_is_user_actionable(monkeypatch):  # load_markitdown() must raise the install hint
+    _block_markitdown_import(monkeypatch)  # simulate markitdown not being importable
+
+    with pytest.raises(RuntimeError) as exc:
+        load_markitdown()
+
+    message = str(exc.value)
+    assert message == MARKITDOWN_MISSING  # the error text is exactly the shared constant
+    assert "requirements-optional.txt" in message  # and it mentions requirements-optional.txt
+
+
+def test_convert_returns_none_when_dependency_missing(monkeypatch):  # expects None rather than a raised error
+    _block_markitdown_import(monkeypatch)  # make `import markitdown` raise ImportError
+    assert convert_to_markdown("whatever.docx") is None  # this test never creates that file
+
+
+def test_convert_returns_none_on_conversion_failure(monkeypatch):  # a raising converter must yield None
+    class Boom:  # stand-in converter whose convert() always fails
+        def convert(self, path):
+            raise ValueError("bad file")
+
+    monkeypatch.setattr("src.markitdown_runtime.load_markitdown", lambda: Boom)  # NOTE(review): returns the class, not an instance; presumably convert_to_markdown instantiates it (confirm)
+    assert convert_to_markdown("anything.docx") is None
+
+
+def test_is_markitdown_format():  # checks which file names is_markitdown_format accepts and rejects
+    assert is_markitdown_format("report.docx")  # bare file name with an Office extension
+    assert is_markitdown_format("/path/to/Sheet.XLSX")  # case-insensitive
+    assert not is_markitdown_format("notes.pdf")  # PDFs stay on pypdf
+    assert not is_markitdown_format("readme.md")  # text stays on the text path
+
+
+def test_markitdown_exts_cover_dropped_office_formats():  # every listed extension must be in MARKITDOWN_EXTS
+    for ext in (".docx", ".pptx", ".xlsx", ".xls"):  # lowercase, with the leading dot
+        assert ext in MARKITDOWN_EXTS
+
+
+def test_convert_extracts_real_docx(tmp_path):
+    """End-to-end: a .docx round-trips to Markdown with a heading (needs markitdown)."""
+    pytest.importorskip("markitdown")  # skip, rather than fail, when the optional dependency is absent
+    Document = pytest.importorskip("docx").Document  # also skip without the `docx` module (presumably python-docx) used to build the fixture
+
+    doc = Document()
+    doc.add_heading("Quarterly Report", level=1)
+    doc.add_paragraph("Revenue grew across all regions.")
+    path = tmp_path / "report.docx"
+    doc.save(str(path))  # write the fixture into the per-test temporary directory
+
+    md = convert_to_markdown(str(path))
+    assert md and "Quarterly Report" in md  # output is non-empty and contains the heading text
+    assert "#" in md  # docx heading styles become Markdown headings