"""Helpers for the optional markitdown document-extraction dependency. markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is more token-efficient and model-legible than a raw text dump. It is **optional**: install with `pip install -r requirements-optional.txt`. When absent, callers degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT core never hard-depends on it. Mirrors the optional-dependency pattern in `src/pdf_runtime.py`. """ import logging import os logger = logging.getLogger(__name__) MARKITDOWN_MISSING = ( "Office/EPUB document extraction requires markitdown. Install optional " "dependencies with `pip install -r requirements-optional.txt`." ) # Formats routed through markitdown. PDFs stay on pypdf (src/document_processor # and src/personal_docs); plain text/code/csv/json/markdown/html stay on the # cheaper built-in text path. These are the formats currently dropped entirely. MARKITDOWN_EXTS = frozenset({".docx", ".pptx", ".xlsx", ".xls", ".epub"}) def is_markitdown_format(path: str) -> bool: """True if the file extension is one we route through markitdown.""" return os.path.splitext(path)[1].lower() in MARKITDOWN_EXTS def load_markitdown(): """Return the MarkItDown class, or raise a user-facing setup hint.""" try: from markitdown import MarkItDown # optional dependency except ImportError as exc: raise RuntimeError(MARKITDOWN_MISSING) from exc return MarkItDown def convert_to_markdown(path: str) -> str | None: """Convert a document to Markdown text via markitdown. Returns the extracted Markdown, or ``None`` if markitdown is unavailable or the conversion fails — callers degrade gracefully rather than erroring. """ try: markitdown_cls = load_markitdown() except RuntimeError: logger.warning("markitdown not installed; cannot extract %s", path) return None try: result = markitdown_cls().convert(path) text = getattr(result, "text_content", None) if text is None: text = getattr(result, "markdown", None) return text except Exception as e: logger.warning("markitdown failed to convert %s: %s", path, e) return None