diff --git a/ACKNOWLEDGMENTS.md b/ACKNOWLEDGMENTS.md index c4079e6..7ab8b95 100644 --- a/ACKNOWLEDGMENTS.md +++ b/ACKNOWLEDGMENTS.md @@ -118,6 +118,7 @@ Core (`requirements.txt`) and optional (`requirements-optional.txt`): | croniter | MIT | | pytest / pytest-asyncio | MIT / Apache-2.0 | | duckduckgo-search (optional) | MIT | +| markitdown (optional — Office/EPUB text extraction) | MIT | | **PyMuPDF** *(optional — form-filling only)* | **AGPL-3.0** — see note below | ## Companion services (interoperated with, not bundled) @@ -152,6 +153,9 @@ concerns from earlier are resolved: deployment (Artifex also sells a commercial PyMuPDF license that lifts this). - **`caldav`** (Python lib) is **dual-licensed GPL-3.0-or-later OR Apache-2.0**. Odysseus uses it under **Apache-2.0**, which is permissive and MIT-compatible. +- **`markitdown`** (Microsoft) is **MIT** and used only as an *optional* dependency for Office/EPUB text + extraction (`src/markitdown_runtime.py`), lazy-imported with graceful fallback — the MIT core runs without + it. The cloud `az-doc-intel` extra is deliberately **not** installed, keeping extraction fully local. --- diff --git a/requirements-optional.txt b/requirements-optional.txt index d4900fe..eeb57c1 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -23,3 +23,14 @@ duckduckgo-search # network-served app — see ACKNOWLEDGMENTS.md. The MIT core (PDF *text* # extraction via pypdf) works without it; this only unlocks form-filling. PyMuPDF + +# Office / EPUB document text extraction (chat attachments + the personal-docs +# RAG index). markitdown (MIT, Microsoft) converts .docx/.xlsx/.pptx/.xls/.epub +# to Markdown — more token-efficient and model-legible than a raw dump. Optional +# and lazy-imported via src/markitdown_runtime.py; without it those formats fall +# back to a friendly "install to extract" banner and the core stays pure-MIT. +# Extras pull mammoth/lxml/python-pptx/pandas/openpyxl/xlrd; the base also pulls +# magika (onnxruntime), already a core dep via fastembed. We avoid the +# [all]/Azure/audio extras (cloud + heavy). Pinned to a release >30 days old per +# the dependency-age discussion in issue #485. +markitdown[docx,pptx,xlsx,xls]==0.1.5 diff --git a/src/document_processor.py b/src/document_processor.py index dfcc1e5..3285fa1 100644 --- a/src/document_processor.py +++ b/src/document_processor.py @@ -152,6 +152,44 @@ def _process_pdf(path: str) -> str: return f"\n\n[PDF processing failed: {str(e)}]" +def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]: + """Cap inline document text so a huge file can't blow the model's context.""" + text = (text or "").strip() + if len(text) > limit: + return text[:limit], "\n[…truncated for inline context.]" + return text, "" + + +def _process_office_document(path: str, display_name: str) -> str: + """Extract an Office/EPUB document to Markdown via the optional markitdown dep. + + Falls back to a friendly banner when markitdown is unavailable or finds no + text, so a missing optional dependency never breaks the chat path. + """ + from src.markitdown_runtime import ( + is_markitdown_format, + convert_to_markdown, + load_markitdown, + ) + + if not is_markitdown_format(path): + return "\n\n[Attached document file]" + + markdown = convert_to_markdown(path) + if markdown and markdown.strip(): + title = os.path.splitext(os.path.basename(path))[0] + body, marker = _truncate_inline(markdown) + return f"\n\n[Document content — {title}]:\n{body}{marker}" + + # No content: tell the user whether to install the optional dep or whether + # the document simply had no extractable text. + try: + load_markitdown() + return f"\n\n[Attached document: {display_name} — no extractable text found.]" + except RuntimeError as exc: + return f"\n\n[Attached document: {display_name} — {exc}]" + + def _load_vl_settings() -> dict: """Load admin settings from disk.""" try: @@ -429,7 +467,7 @@ def build_user_content( elif mime.startswith("text/") or _is_text_file(path): extracted_text = _process_text_file(path) else: - extracted_text = "\n\n[Attached document file]" + extracted_text = _process_office_document(path, display_name) if content and content[0]["type"] == "text": content[0]["text"] += extracted_text diff --git a/src/markitdown_runtime.py b/src/markitdown_runtime.py new file mode 100644 index 0000000..f8338d3 --- /dev/null +++ b/src/markitdown_runtime.py @@ -0,0 +1,60 @@ +"""Helpers for the optional markitdown document-extraction dependency. + +markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is +more token-efficient and model-legible than a raw text dump. It is **optional**: +install with `pip install -r requirements-optional.txt`. When absent, callers +degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT +core never hard-depends on it. Mirrors the optional-dependency pattern in +`src/pdf_runtime.py`. +""" + +import logging +import os + +logger = logging.getLogger(__name__) + +MARKITDOWN_MISSING = ( + "Office/EPUB document extraction requires markitdown. Install optional " + "dependencies with `pip install -r requirements-optional.txt`." +) + +# Formats routed through markitdown. PDFs stay on pypdf (src/document_processor +# and src/personal_docs); plain text/code/csv/json/markdown/html stay on the +# cheaper built-in text path. These are the formats currently dropped entirely. +MARKITDOWN_EXTS = frozenset({".docx", ".pptx", ".xlsx", ".xls", ".epub"}) + + +def is_markitdown_format(path: str) -> bool: + """True if the file extension is one we route through markitdown.""" + return os.path.splitext(path)[1].lower() in MARKITDOWN_EXTS + + +def load_markitdown(): + """Return the MarkItDown class, or raise a user-facing setup hint.""" + try: + from markitdown import MarkItDown # optional dependency + except ImportError as exc: + raise RuntimeError(MARKITDOWN_MISSING) from exc + return MarkItDown + + +def convert_to_markdown(path: str) -> str | None: + """Convert a document to Markdown text via markitdown. + + Returns the extracted Markdown, or ``None`` if markitdown is unavailable or + the conversion fails — callers degrade gracefully rather than erroring. + """ + try: + markitdown_cls = load_markitdown() + except RuntimeError: + logger.warning("markitdown not installed; cannot extract %s", path) + return None + try: + result = markitdown_cls().convert(path) + text = getattr(result, "text_content", None) + if text is None: + text = getattr(result, "markdown", None) + return text + except Exception as e: + logger.warning("markitdown failed to convert %s: %s", path, e) + return None diff --git a/src/personal_docs.py b/src/personal_docs.py index 80eb4cb..c7106d2 100644 --- a/src/personal_docs.py +++ b/src/personal_docs.py @@ -6,6 +6,8 @@ import logging from typing import List, Dict, Set, Any, Tuple from dataclasses import dataclass +from src.markitdown_runtime import MARKITDOWN_EXTS + logger = logging.getLogger(__name__) @@ -24,12 +26,24 @@ def extract_pdf_text(file_path: str) -> str: return "" +def extract_office_text(file_path: str) -> str: + """Extract text from an Office/EPUB doc via the optional markitdown dep. + + Returns "" when markitdown is missing or extraction fails, mirroring + extract_pdf_text — the indexer then simply skips the file's content. + """ + from src.markitdown_runtime import convert_to_markdown + return convert_to_markdown(file_path) or "" + + @dataclass class PersonalDocsConfig: """Configuration for personal documents management.""" CHUNK_SIZE: int = 1000 CHUNK_OVERLAP: int = 200 - DEFAULT_EXTENSIONS: Tuple[str, ...] = (".txt", ".md", ".json", ".pdf") + DEFAULT_EXTENSIONS: Tuple[str, ...] = ( + ".txt", ".md", ".json", ".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".epub", + ) DEFAULT_K: int = 5 STOP_WORDS: Set[str] = None @@ -86,7 +100,12 @@ def load_personal_index( continue size = os.path.getsize(p) ext = os.path.splitext(name)[1].lower() - text = extract_pdf_text(p) if ext == ".pdf" else read_text_file(p) + if ext == ".pdf": + text = extract_pdf_text(p) + elif ext in MARKITDOWN_EXTS: + text = extract_office_text(p) + else: + text = read_text_file(p) chunks = split_chunks(text) display = os.path.relpath(p, personal_dir) files.append({"name": display, "path": p, "size": size, "chunks": chunks}) diff --git a/src/upload_handler.py b/src/upload_handler.py index b7f7f0b..ea80a37 100644 --- a/src/upload_handler.py +++ b/src/upload_handler.py @@ -128,7 +128,8 @@ class UploadHandler: def is_document_file(self, filename: str, content_type: str = None) -> bool: """Check if a file is a document based on extension or content type.""" document_extensions = { - '.pdf', '.docx', '.txt', '.py', '.js', '.html', '.htm', + '.pdf', '.docx', '.xlsx', '.pptx', '.xls', '.epub', + '.txt', '.py', '.js', '.html', '.htm', '.css', '.json', '.md', '.csv', '.log', '.xml', '.yml', '.yaml', '.sql', '.sh', '.bash', '.c', '.cpp', '.h', '.java', '.go', '.rs', '.php', '.rb', '.ts', '.jsx', '.tsx' @@ -136,6 +137,10 @@ class UploadHandler: document_mime_types = { 'application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'application/vnd.ms-excel', + 'application/epub+zip', 'text/plain' } diff --git a/tests/test_markitdown_runtime.py b/tests/test_markitdown_runtime.py new file mode 100644 index 0000000..8f72037 --- /dev/null +++ b/tests/test_markitdown_runtime.py @@ -0,0 +1,75 @@ +import builtins + +import pytest + +from src.markitdown_runtime import ( + MARKITDOWN_MISSING, + MARKITDOWN_EXTS, + is_markitdown_format, + load_markitdown, + convert_to_markdown, +) + + +def _block_markitdown_import(monkeypatch): + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name == "markitdown": + raise ImportError("No module named markitdown") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + + +def test_missing_dependency_error_is_user_actionable(monkeypatch): + _block_markitdown_import(monkeypatch) + + with pytest.raises(RuntimeError) as exc: + load_markitdown() + + message = str(exc.value) + assert message == MARKITDOWN_MISSING + assert "requirements-optional.txt" in message + + +def test_convert_returns_none_when_dependency_missing(monkeypatch): + _block_markitdown_import(monkeypatch) + assert convert_to_markdown("whatever.docx") is None + + +def test_convert_returns_none_on_conversion_failure(monkeypatch): + class Boom: + def convert(self, path): + raise ValueError("bad file") + + monkeypatch.setattr("src.markitdown_runtime.load_markitdown", lambda: Boom) + assert convert_to_markdown("anything.docx") is None + + +def test_is_markitdown_format(): + assert is_markitdown_format("report.docx") + assert is_markitdown_format("/path/to/Sheet.XLSX") # case-insensitive + assert not is_markitdown_format("notes.pdf") # PDFs stay on pypdf + assert not is_markitdown_format("readme.md") # text stays on the text path + + +def test_markitdown_exts_cover_dropped_office_formats(): + for ext in (".docx", ".pptx", ".xlsx", ".xls"): + assert ext in MARKITDOWN_EXTS + + +def test_convert_extracts_real_docx(tmp_path): + """End-to-end: a .docx round-trips to Markdown with a heading (needs markitdown).""" + pytest.importorskip("markitdown") + Document = pytest.importorskip("docx").Document + + doc = Document() + doc.add_heading("Quarterly Report", level=1) + doc.add_paragraph("Revenue grew across all regions.") + path = tmp_path / "report.docx" + doc.save(str(path)) + + md = convert_to_markdown(str(path)) + assert md and "Quarterly Report" in md + assert "#" in md # docx heading styles become Markdown headings diff --git a/tests/test_personal_docs_office_index.py b/tests/test_personal_docs_office_index.py new file mode 100644 index 0000000..6f42260 --- /dev/null +++ b/tests/test_personal_docs_office_index.py @@ -0,0 +1,25 @@ +from pathlib import Path + +from src import personal_docs + + +def test_personal_index_includes_office_uploads(tmp_path, monkeypatch): + docx_path = tmp_path / "report.docx" + docx_path.write_bytes(b"PK fake docx bytes") + + monkeypatch.setattr( + personal_docs, + "extract_office_text", + lambda path: "# Report\n\nreadable office text" if Path(path) == docx_path else "", + ) + + files = personal_docs.load_personal_index(str(tmp_path)) + + assert [item["name"] for item in files] == ["report.docx"] + assert files[0]["path"] == str(docx_path) + assert files[0]["chunks"] == ["# Report\n\nreadable office text"] + + +def test_personal_index_default_extensions_advertise_office_support(): + for ext in (".docx", ".pptx", ".xlsx", ".xls"): + assert ext in personal_docs.config.DEFAULT_EXTENSIONS