Add optional markitdown extraction for Office/EPUB documents (#766)
Office documents were dropped server-side: .docx fell through to "[Attached document file]", .xlsx/.pptx weren't recognized at all, and the personal-docs RAG index only covered txt/md/json/pdf. Wire the optional markitdown dependency (MIT, Microsoft) into both the chat-attachment path (build_user_content) and the RAG indexer (personal_docs), converting .docx/.xlsx/.pptx/.xls/.epub to Markdown. It is lazy-imported with graceful fallback (mirrors src/pdf_runtime.py): without it those formats show an "install to extract" banner and the MIT core is unaffected. pypdf stays the default PDF path. - src/markitdown_runtime.py: optional-dep loader + convert_to_markdown - upload_handler: recognize Office/EPUB extensions + MIME types - document_processor: extract Office docs in the chat else-branch - personal_docs: index Office docs (DEFAULT_EXTENSIONS + dispatch) - requirements-optional.txt + ACKNOWLEDGMENTS.md: pinned markitdown 0.1.5 - tests: markitdown_runtime + office index coverage Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
610968f91e
commit
f58fbc8b85
@@ -118,6 +118,7 @@ Core (`requirements.txt`) and optional (`requirements-optional.txt`):
|
|||||||
| croniter | MIT |
|
| croniter | MIT |
|
||||||
| pytest / pytest-asyncio | MIT / Apache-2.0 |
|
| pytest / pytest-asyncio | MIT / Apache-2.0 |
|
||||||
| duckduckgo-search (optional) | MIT |
|
| duckduckgo-search (optional) | MIT |
|
||||||
|
| markitdown (optional — Office/EPUB text extraction) | MIT |
|
||||||
| **PyMuPDF** *(optional — form-filling only)* | **AGPL-3.0** — see note below |
|
| **PyMuPDF** *(optional — form-filling only)* | **AGPL-3.0** — see note below |
|
||||||
|
|
||||||
## Companion services (interoperated with, not bundled)
|
## Companion services (interoperated with, not bundled)
|
||||||
@@ -152,6 +153,9 @@ concerns from earlier are resolved:
|
|||||||
deployment (Artifex also sells a commercial PyMuPDF license that lifts this).
|
deployment (Artifex also sells a commercial PyMuPDF license that lifts this).
|
||||||
- **`caldav`** (Python lib) is **dual-licensed GPL-3.0-or-later OR Apache-2.0**.
|
- **`caldav`** (Python lib) is **dual-licensed GPL-3.0-or-later OR Apache-2.0**.
|
||||||
Odysseus uses it under **Apache-2.0**, which is permissive and MIT-compatible.
|
Odysseus uses it under **Apache-2.0**, which is permissive and MIT-compatible.
|
||||||
|
- **`markitdown`** (Microsoft) is **MIT** and used only as an *optional* dependency for Office/EPUB text
|
||||||
|
extraction (`src/markitdown_runtime.py`), lazy-imported with graceful fallback — the MIT core runs without
|
||||||
|
it. The cloud `az-doc-intel` extra is deliberately **not** installed, keeping extraction fully local.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -23,3 +23,14 @@ duckduckgo-search
|
|||||||
# network-served app — see ACKNOWLEDGMENTS.md. The MIT core (PDF *text*
|
# network-served app — see ACKNOWLEDGMENTS.md. The MIT core (PDF *text*
|
||||||
# extraction via pypdf) works without it; this only unlocks form-filling.
|
# extraction via pypdf) works without it; this only unlocks form-filling.
|
||||||
PyMuPDF
|
PyMuPDF
|
||||||
|
|
||||||
|
# Office / EPUB document text extraction (chat attachments + the personal-docs
|
||||||
|
# RAG index). markitdown (MIT, Microsoft) converts .docx/.xlsx/.pptx/.xls/.epub
|
||||||
|
# to Markdown — more token-efficient and model-legible than a raw dump. Optional
|
||||||
|
# and lazy-imported via src/markitdown_runtime.py; without it those formats fall
|
||||||
|
# back to a friendly "install to extract" banner and the core stays pure-MIT.
|
||||||
|
# Extras pull mammoth/lxml/python-pptx/pandas/openpyxl/xlrd; the base also pulls
|
||||||
|
# magika (onnxruntime), already a core dep via fastembed. We avoid the
|
||||||
|
# [all]/Azure/audio extras (cloud + heavy). Pinned to a release >30 days old per
|
||||||
|
# the dependency-age discussion in issue #485.
|
||||||
|
markitdown[docx,pptx,xlsx,xls]==0.1.5
|
||||||
|
|||||||
@@ -152,6 +152,44 @@ def _process_pdf(path: str) -> str:
|
|||||||
return f"\n\n[PDF processing failed: {str(e)}]"
|
return f"\n\n[PDF processing failed: {str(e)}]"
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate_inline(text: str, limit: int = 15000) -> tuple[str, str]:
|
||||||
|
"""Cap inline document text so a huge file can't blow the model's context."""
|
||||||
|
text = (text or "").strip()
|
||||||
|
if len(text) > limit:
|
||||||
|
return text[:limit], "\n[…truncated for inline context.]"
|
||||||
|
return text, ""
|
||||||
|
|
||||||
|
|
||||||
|
def _process_office_document(path: str, display_name: str) -> str:
|
||||||
|
"""Extract an Office/EPUB document to Markdown via the optional markitdown dep.
|
||||||
|
|
||||||
|
Falls back to a friendly banner when markitdown is unavailable or finds no
|
||||||
|
text, so a missing optional dependency never breaks the chat path.
|
||||||
|
"""
|
||||||
|
from src.markitdown_runtime import (
|
||||||
|
is_markitdown_format,
|
||||||
|
convert_to_markdown,
|
||||||
|
load_markitdown,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not is_markitdown_format(path):
|
||||||
|
return "\n\n[Attached document file]"
|
||||||
|
|
||||||
|
markdown = convert_to_markdown(path)
|
||||||
|
if markdown and markdown.strip():
|
||||||
|
title = os.path.splitext(os.path.basename(path))[0]
|
||||||
|
body, marker = _truncate_inline(markdown)
|
||||||
|
return f"\n\n[Document content — {title}]:\n{body}{marker}"
|
||||||
|
|
||||||
|
# No content: tell the user whether to install the optional dep or whether
|
||||||
|
# the document simply had no extractable text.
|
||||||
|
try:
|
||||||
|
load_markitdown()
|
||||||
|
return f"\n\n[Attached document: {display_name} — no extractable text found.]"
|
||||||
|
except RuntimeError as exc:
|
||||||
|
return f"\n\n[Attached document: {display_name} — {exc}]"
|
||||||
|
|
||||||
|
|
||||||
def _load_vl_settings() -> dict:
|
def _load_vl_settings() -> dict:
|
||||||
"""Load admin settings from disk."""
|
"""Load admin settings from disk."""
|
||||||
try:
|
try:
|
||||||
@@ -429,7 +467,7 @@ def build_user_content(
|
|||||||
elif mime.startswith("text/") or _is_text_file(path):
|
elif mime.startswith("text/") or _is_text_file(path):
|
||||||
extracted_text = _process_text_file(path)
|
extracted_text = _process_text_file(path)
|
||||||
else:
|
else:
|
||||||
extracted_text = "\n\n[Attached document file]"
|
extracted_text = _process_office_document(path, display_name)
|
||||||
|
|
||||||
if content and content[0]["type"] == "text":
|
if content and content[0]["type"] == "text":
|
||||||
content[0]["text"] += extracted_text
|
content[0]["text"] += extracted_text
|
||||||
|
|||||||
60
src/markitdown_runtime.py
Normal file
60
src/markitdown_runtime.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""Helpers for the optional markitdown document-extraction dependency.
|
||||||
|
|
||||||
|
markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is
|
||||||
|
more token-efficient and model-legible than a raw text dump. It is **optional**:
|
||||||
|
install with `pip install -r requirements-optional.txt`. When absent, callers
|
||||||
|
degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT
|
||||||
|
core never hard-depends on it. Mirrors the optional-dependency pattern in
|
||||||
|
`src/pdf_runtime.py`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MARKITDOWN_MISSING = (
|
||||||
|
"Office/EPUB document extraction requires markitdown. Install optional "
|
||||||
|
"dependencies with `pip install -r requirements-optional.txt`."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Formats routed through markitdown. PDFs stay on pypdf (src/document_processor
|
||||||
|
# and src/personal_docs); plain text/code/csv/json/markdown/html stay on the
|
||||||
|
# cheaper built-in text path. These are the formats currently dropped entirely.
|
||||||
|
MARKITDOWN_EXTS = frozenset({".docx", ".pptx", ".xlsx", ".xls", ".epub"})
|
||||||
|
|
||||||
|
|
||||||
|
def is_markitdown_format(path: str) -> bool:
|
||||||
|
"""True if the file extension is one we route through markitdown."""
|
||||||
|
return os.path.splitext(path)[1].lower() in MARKITDOWN_EXTS
|
||||||
|
|
||||||
|
|
||||||
|
def load_markitdown():
|
||||||
|
"""Return the MarkItDown class, or raise a user-facing setup hint."""
|
||||||
|
try:
|
||||||
|
from markitdown import MarkItDown # optional dependency
|
||||||
|
except ImportError as exc:
|
||||||
|
raise RuntimeError(MARKITDOWN_MISSING) from exc
|
||||||
|
return MarkItDown
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_markdown(path: str) -> str | None:
|
||||||
|
"""Convert a document to Markdown text via markitdown.
|
||||||
|
|
||||||
|
Returns the extracted Markdown, or ``None`` if markitdown is unavailable or
|
||||||
|
the conversion fails — callers degrade gracefully rather than erroring.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
markitdown_cls = load_markitdown()
|
||||||
|
except RuntimeError:
|
||||||
|
logger.warning("markitdown not installed; cannot extract %s", path)
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
result = markitdown_cls().convert(path)
|
||||||
|
text = getattr(result, "text_content", None)
|
||||||
|
if text is None:
|
||||||
|
text = getattr(result, "markdown", None)
|
||||||
|
return text
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("markitdown failed to convert %s: %s", path, e)
|
||||||
|
return None
|
||||||
@@ -6,6 +6,8 @@ import logging
|
|||||||
from typing import List, Dict, Set, Any, Tuple
|
from typing import List, Dict, Set, Any, Tuple
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from src.markitdown_runtime import MARKITDOWN_EXTS
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -24,12 +26,24 @@ def extract_pdf_text(file_path: str) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_office_text(file_path: str) -> str:
|
||||||
|
"""Extract text from an Office/EPUB doc via the optional markitdown dep.
|
||||||
|
|
||||||
|
Returns "" when markitdown is missing or extraction fails, mirroring
|
||||||
|
extract_pdf_text — the indexer then simply skips the file's content.
|
||||||
|
"""
|
||||||
|
from src.markitdown_runtime import convert_to_markdown
|
||||||
|
return convert_to_markdown(file_path) or ""
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class PersonalDocsConfig:
|
class PersonalDocsConfig:
|
||||||
"""Configuration for personal documents management."""
|
"""Configuration for personal documents management."""
|
||||||
CHUNK_SIZE: int = 1000
|
CHUNK_SIZE: int = 1000
|
||||||
CHUNK_OVERLAP: int = 200
|
CHUNK_OVERLAP: int = 200
|
||||||
DEFAULT_EXTENSIONS: Tuple[str, ...] = (".txt", ".md", ".json", ".pdf")
|
DEFAULT_EXTENSIONS: Tuple[str, ...] = (
|
||||||
|
".txt", ".md", ".json", ".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".epub",
|
||||||
|
)
|
||||||
DEFAULT_K: int = 5
|
DEFAULT_K: int = 5
|
||||||
STOP_WORDS: Set[str] = None
|
STOP_WORDS: Set[str] = None
|
||||||
|
|
||||||
@@ -86,7 +100,12 @@ def load_personal_index(
|
|||||||
continue
|
continue
|
||||||
size = os.path.getsize(p)
|
size = os.path.getsize(p)
|
||||||
ext = os.path.splitext(name)[1].lower()
|
ext = os.path.splitext(name)[1].lower()
|
||||||
text = extract_pdf_text(p) if ext == ".pdf" else read_text_file(p)
|
if ext == ".pdf":
|
||||||
|
text = extract_pdf_text(p)
|
||||||
|
elif ext in MARKITDOWN_EXTS:
|
||||||
|
text = extract_office_text(p)
|
||||||
|
else:
|
||||||
|
text = read_text_file(p)
|
||||||
chunks = split_chunks(text)
|
chunks = split_chunks(text)
|
||||||
display = os.path.relpath(p, personal_dir)
|
display = os.path.relpath(p, personal_dir)
|
||||||
files.append({"name": display, "path": p, "size": size, "chunks": chunks})
|
files.append({"name": display, "path": p, "size": size, "chunks": chunks})
|
||||||
|
|||||||
@@ -128,7 +128,8 @@ class UploadHandler:
|
|||||||
def is_document_file(self, filename: str, content_type: str = None) -> bool:
|
def is_document_file(self, filename: str, content_type: str = None) -> bool:
|
||||||
"""Check if a file is a document based on extension or content type."""
|
"""Check if a file is a document based on extension or content type."""
|
||||||
document_extensions = {
|
document_extensions = {
|
||||||
'.pdf', '.docx', '.txt', '.py', '.js', '.html', '.htm',
|
'.pdf', '.docx', '.xlsx', '.pptx', '.xls', '.epub',
|
||||||
|
'.txt', '.py', '.js', '.html', '.htm',
|
||||||
'.css', '.json', '.md', '.csv', '.log', '.xml', '.yml',
|
'.css', '.json', '.md', '.csv', '.log', '.xml', '.yml',
|
||||||
'.yaml', '.sql', '.sh', '.bash', '.c', '.cpp', '.h',
|
'.yaml', '.sql', '.sh', '.bash', '.c', '.cpp', '.h',
|
||||||
'.java', '.go', '.rs', '.php', '.rb', '.ts', '.jsx', '.tsx'
|
'.java', '.go', '.rs', '.php', '.rb', '.ts', '.jsx', '.tsx'
|
||||||
@@ -136,6 +137,10 @@ class UploadHandler:
|
|||||||
document_mime_types = {
|
document_mime_types = {
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||||
|
'application/vnd.ms-excel',
|
||||||
|
'application/epub+zip',
|
||||||
'text/plain'
|
'text/plain'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
75
tests/test_markitdown_runtime.py
Normal file
75
tests/test_markitdown_runtime.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
import builtins
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.markitdown_runtime import (
|
||||||
|
MARKITDOWN_MISSING,
|
||||||
|
MARKITDOWN_EXTS,
|
||||||
|
is_markitdown_format,
|
||||||
|
load_markitdown,
|
||||||
|
convert_to_markdown,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _block_markitdown_import(monkeypatch):
|
||||||
|
real_import = builtins.__import__
|
||||||
|
|
||||||
|
def fake_import(name, *args, **kwargs):
|
||||||
|
if name == "markitdown":
|
||||||
|
raise ImportError("No module named markitdown")
|
||||||
|
return real_import(name, *args, **kwargs)
|
||||||
|
|
||||||
|
monkeypatch.setattr(builtins, "__import__", fake_import)
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_dependency_error_is_user_actionable(monkeypatch):
|
||||||
|
_block_markitdown_import(monkeypatch)
|
||||||
|
|
||||||
|
with pytest.raises(RuntimeError) as exc:
|
||||||
|
load_markitdown()
|
||||||
|
|
||||||
|
message = str(exc.value)
|
||||||
|
assert message == MARKITDOWN_MISSING
|
||||||
|
assert "requirements-optional.txt" in message
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_returns_none_when_dependency_missing(monkeypatch):
|
||||||
|
_block_markitdown_import(monkeypatch)
|
||||||
|
assert convert_to_markdown("whatever.docx") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_returns_none_on_conversion_failure(monkeypatch):
|
||||||
|
class Boom:
|
||||||
|
def convert(self, path):
|
||||||
|
raise ValueError("bad file")
|
||||||
|
|
||||||
|
monkeypatch.setattr("src.markitdown_runtime.load_markitdown", lambda: Boom)
|
||||||
|
assert convert_to_markdown("anything.docx") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_markitdown_format():
|
||||||
|
assert is_markitdown_format("report.docx")
|
||||||
|
assert is_markitdown_format("/path/to/Sheet.XLSX") # case-insensitive
|
||||||
|
assert not is_markitdown_format("notes.pdf") # PDFs stay on pypdf
|
||||||
|
assert not is_markitdown_format("readme.md") # text stays on the text path
|
||||||
|
|
||||||
|
|
||||||
|
def test_markitdown_exts_cover_dropped_office_formats():
|
||||||
|
for ext in (".docx", ".pptx", ".xlsx", ".xls"):
|
||||||
|
assert ext in MARKITDOWN_EXTS
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_extracts_real_docx(tmp_path):
|
||||||
|
"""End-to-end: a .docx round-trips to Markdown with a heading (needs markitdown)."""
|
||||||
|
pytest.importorskip("markitdown")
|
||||||
|
Document = pytest.importorskip("docx").Document
|
||||||
|
|
||||||
|
doc = Document()
|
||||||
|
doc.add_heading("Quarterly Report", level=1)
|
||||||
|
doc.add_paragraph("Revenue grew across all regions.")
|
||||||
|
path = tmp_path / "report.docx"
|
||||||
|
doc.save(str(path))
|
||||||
|
|
||||||
|
md = convert_to_markdown(str(path))
|
||||||
|
assert md and "Quarterly Report" in md
|
||||||
|
assert "#" in md # docx heading styles become Markdown headings
|
||||||
25
tests/test_personal_docs_office_index.py
Normal file
25
tests/test_personal_docs_office_index.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from src import personal_docs
|
||||||
|
|
||||||
|
|
||||||
|
def test_personal_index_includes_office_uploads(tmp_path, monkeypatch):
|
||||||
|
docx_path = tmp_path / "report.docx"
|
||||||
|
docx_path.write_bytes(b"PK fake docx bytes")
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
personal_docs,
|
||||||
|
"extract_office_text",
|
||||||
|
lambda path: "# Report\n\nreadable office text" if Path(path) == docx_path else "",
|
||||||
|
)
|
||||||
|
|
||||||
|
files = personal_docs.load_personal_index(str(tmp_path))
|
||||||
|
|
||||||
|
assert [item["name"] for item in files] == ["report.docx"]
|
||||||
|
assert files[0]["path"] == str(docx_path)
|
||||||
|
assert files[0]["chunks"] == ["# Report\n\nreadable office text"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_personal_index_default_extensions_advertise_office_support():
|
||||||
|
for ext in (".docx", ".pptx", ".xlsx", ".xls"):
|
||||||
|
assert ext in personal_docs.config.DEFAULT_EXTENSIONS
|
||||||
Reference in New Issue
Block a user