Add optional markitdown extraction for Office/EPUB documents (#766)

Office documents were dropped server-side: .docx fell through to "[Attached document file]", .xlsx/.pptx weren't recognized at all, and the personal-docs RAG index only covered txt/md/json/pdf. Wire the optional markitdown dependency (MIT, Microsoft) into both the chat-attachment path (build_user_content) and the RAG indexer (personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown. It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py): without it those formats show an "install to extract" banner and the MIT core is unaffected. pypdf stays the default PDF path.

- src/markitdown_runtime.py: optional-dep loader + convert_to_markdown
- upload_handler: recognize Office/EPUB extensions + MIME types
- document_processor: extract Office docs in the chat else-branch
- personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch)
- requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5
- tests: markitdown_runtime + office index coverage

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 04:28:52 +02:00
parent 610968f91e
commit f58fbc8b85
8 changed files with 241 additions and 4 deletions
--- a/src/document_processor.py
+++ b/src/document_processor.py
@@ -152,6 +152,44 @@ def _process_pdf(path: str) -> str:
        return f"\n\n[PDF processing failed: {str(e)}]"


+def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
+    """Clip document text to ``limit`` characters for inline use in a prompt.
+
+    The input is stripped of surrounding whitespace first; a falsy input
+    (``None`` or ``""``) is treated as empty text.
+
+    Returns:
+        ``(body, marker)`` where ``body`` is the stripped, possibly clipped
+        text and ``marker`` is a truncation notice, or ``""`` when nothing
+        was cut.
+    """
+    cleaned = (text or "").strip()
+    if len(cleaned) <= limit:
+        return cleaned, ""
+    return cleaned[:limit], "\n[…truncated for inline context.]"
+
+
+def _process_office_document(path: str, display_name: str) -> str:
+    """Extract an Office/EPUB document to Markdown via the optional markitdown dep.
+
+    Never raises for a missing optional dependency or a failed conversion;
+    every outcome is reported as a bracketed banner so the chat path keeps
+    working.
+
+    Args:
+        path: Location of the attached file; its extension decides whether
+            the file is routed through markitdown at all.
+        display_name: Name shown to the user in the fallback banners.
+
+    Returns:
+        Text to append to the user message, always starting with a blank line:
+        the extracted (possibly truncated) Markdown under a header, the
+        generic "[Attached document file]" banner for non-markitdown formats,
+        or a banner saying why no content is available (dependency missing,
+        conversion failed, or document empty).
+    """
+    from src.markitdown_runtime import (
+        is_markitdown_format,
+        convert_to_markdown,
+        load_markitdown,
+    )
+
+    if not is_markitdown_format(path):
+        return "\n\n[Attached document file]"
+
+    markdown = convert_to_markdown(path)
+    if markdown and markdown.strip():
+        # NOTE(review): the header title is derived from ``path`` while the
+        # banners below use ``display_name`` — confirm the two cannot diverge
+        # (e.g. uploads stored under generated names).
+        title = os.path.splitext(os.path.basename(path))[0]
+        body, marker = _truncate_inline(markdown)
+        return f"\n\n[Document content — {title}]:\n{body}{marker}"
+
+    # No content. Work out why, so the banner tells the user what to do next.
+    try:
+        load_markitdown()
+    except RuntimeError as exc:
+        # Optional dependency missing: surface the install hint verbatim.
+        return f"\n\n[Attached document: {display_name} — {exc}]"
+
+    if markdown is None:
+        # markitdown is importable, yet convert_to_markdown returned None:
+        # the conversion itself failed (it logs the cause). Reporting this as
+        # "no extractable text" would hide a corrupt or unsupported file.
+        return f"\n\n[Attached document: {display_name} — could not be converted.]"
+
+    return f"\n\n[Attached document: {display_name} — no extractable text found.]"
+
+
 def _load_vl_settings() -> dict:
    """Load admin settings from disk."""
    try:
@@ -429,7 +467,7 @@ def build_user_content(
            elif mime.startswith("text/") or _is_text_file(path):
                extracted_text = _process_text_file(path)
            else:
-                extracted_text = "\n\n[Attached document file]"
+                extracted_text = _process_office_document(path, display_name)

            if content and content[0]["type"] == "text":
                content[0]["text"] += extracted_text
--- a/src/markitdown_runtime.py
+++ b/src/markitdown_runtime.py
@@ -0,0 +1,60 @@
+"""Helpers for the optional markitdown document-extraction dependency.
+
+markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is
+more token-efficient and model-legible than a raw text dump. It is **optional**:
+install with `pip install -r requirements-optional.txt`. When absent, callers
+degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT
+core never hard-depends on it. Mirrors the optional-dependency pattern in
+`src/pdf_runtime.py`.
+"""
+
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+# User-facing setup hint; load_markitdown() raises it (wrapped in a
+# RuntimeError) when the markitdown import fails.
+MARKITDOWN_MISSING = (
+    "Office/EPUB document extraction requires markitdown. Install optional "
+    "dependencies with `pip install -r requirements-optional.txt`."
+)
+
+# Formats routed through markitdown. PDFs stay on pypdf (src/document_processor
+# and src/personal_docs); plain text/code/csv/json/markdown/html stay on the
+# cheaper built-in text path. Without markitdown these formats yield no
+# extracted content. Entries are lowercase with a leading dot because
+# is_markitdown_format() compares them against a lowercased os.path.splitext
+# suffix.
+MARKITDOWN_EXTS = frozenset({".docx", ".pptx", ".xlsx", ".xls", ".epub"})
+
+
+def is_markitdown_format(path: str) -> bool:
+    """Report whether ``path`` has an extension handled by markitdown.
+
+    The comparison is case-insensitive and looks only at the final suffix
+    returned by ``os.path.splitext``; the file itself is never opened.
+    """
+    _, suffix = os.path.splitext(path)
+    return suffix.lower() in MARKITDOWN_EXTS
+
+
+def load_markitdown():
+    """Import the optional markitdown package and return its MarkItDown class.
+
+    Raises:
+        RuntimeError: If the import fails; the message is the user-facing
+            install hint and the original ImportError is chained as the cause.
+    """
+    try:
+        from markitdown import MarkItDown as markitdown_cls  # optional dependency
+    except ImportError as missing:
+        raise RuntimeError(MARKITDOWN_MISSING) from missing
+    else:
+        return markitdown_cls
+
+
+def convert_to_markdown(path: str) -> str | None:
+    """Run markitdown on ``path`` and hand back the resulting Markdown.
+
+    Returns:
+        The converter result's ``text_content`` if it is not ``None``,
+        otherwise its ``markdown`` attribute. ``None`` is returned when
+        markitdown cannot be imported, when the conversion raises, or when
+        neither attribute is set; both failure modes are logged as warnings
+        instead of propagating, so callers can degrade gracefully.
+    """
+    try:
+        converter_cls = load_markitdown()
+    except RuntimeError:
+        logger.warning("markitdown not installed; cannot extract %s", path)
+        return None
+
+    try:
+        outcome = converter_cls().convert(path)
+        # Prefer ``text_content``; fall back to ``markdown`` only when the
+        # former is missing or None.
+        for attr_name in ("text_content", "markdown"):
+            extracted = getattr(outcome, attr_name, None)
+            if extracted is not None:
+                return extracted
+        return None
+    except Exception as e:
+        # Best-effort boundary: any converter error becomes a logged warning.
+        logger.warning("markitdown failed to convert %s: %s", path, e)
+        return None
--- a/src/personal_docs.py
+++ b/src/personal_docs.py
@@ -6,6 +6,8 @@ import logging
 from typing import List, Dict, Set, Any, Tuple
 from dataclasses import dataclass

+from src.markitdown_runtime import MARKITDOWN_EXTS
+
 logger = logging.getLogger(__name__)


@@ -24,12 +26,24 @@ def extract_pdf_text(file_path: str) -> str:
        return ""


+def extract_office_text(file_path: str) -> str:
+    """Return the Markdown text of an Office/EPUB file, or "" if unavailable.
+
+    Delegates to ``convert_to_markdown`` (imported lazily) and maps its
+    ``None``/empty result to the empty string, so the caller always receives
+    a ``str``.
+    """
+    from src.markitdown_runtime import convert_to_markdown
+
+    markdown = convert_to_markdown(file_path)
+    return markdown if markdown else ""
+
+
@dataclass
 class PersonalDocsConfig:
    """Configuration for personal documents management."""
    CHUNK_SIZE: int = 1000
    CHUNK_OVERLAP: int = 200
-    DEFAULT_EXTENSIONS: Tuple[str, ...] = (".txt", ".md", ".json", ".pdf")
+    DEFAULT_EXTENSIONS: Tuple[str, ...] = (
+        ".txt", ".md", ".json", ".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".epub",
+    )
    DEFAULT_K: int = 5
    STOP_WORDS: Set[str] = None
    
@@ -86,7 +100,12 @@ def load_personal_index(
                continue
            size = os.path.getsize(p)
            ext = os.path.splitext(name)[1].lower()
-            text = extract_pdf_text(p) if ext == ".pdf" else read_text_file(p)
+            if ext == ".pdf":
+                text = extract_pdf_text(p)
+            elif ext in MARKITDOWN_EXTS:
+                text = extract_office_text(p)
+            else:
+                text = read_text_file(p)
            chunks = split_chunks(text)
            display = os.path.relpath(p, personal_dir)
            files.append({"name": display, "path": p, "size": size, "chunks": chunks})
--- a/src/upload_handler.py
+++ b/src/upload_handler.py
@@ -128,7 +128,8 @@ class UploadHandler:
    def is_document_file(self, filename: str, content_type: str = None) -> bool:
        """Check if a file is a document based on extension or content type."""
        document_extensions = {
-            '.pdf', '.docx', '.txt', '.py', '.js', '.html', '.htm', 
+            '.pdf', '.docx', '.xlsx', '.pptx', '.xls', '.epub',
+            '.txt', '.py', '.js', '.html', '.htm',
            '.css', '.json', '.md', '.csv', '.log', '.xml', '.yml', 
            '.yaml', '.sql', '.sh', '.bash', '.c', '.cpp', '.h', 
            '.java', '.go', '.rs', '.php', '.rb', '.ts', '.jsx', '.tsx'
@@ -136,6 +137,10 @@ class UploadHandler:
        document_mime_types = {
            'application/pdf', 
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+            'application/vnd.ms-excel',
+            'application/epub+zip',
            'text/plain'
        }