# odysseus/routes/document_routes.py

"""Document routes — CRUD for living documents with version history."""

import uuid
import logging
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional

from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, File, Form

from sqlalchemy import func
from core.database import SessionLocal, Document, DocumentVersion
from core.database import Session as DbSession
from src.auth_helpers import get_current_user

logger = logging.getLogger(__name__)


from routes.document_helpers import (
    DocumentCreate, DocumentUpdate, DocumentPatch,
    _doc_to_dict, _version_to_dict,
    _verify_doc_owner, _owner_session_filter,
    _slug, _locate_upload, _derive_title,
    _PDF_RENDER_SCALE,
)

def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
    router = APIRouter(tags=["documents"])

    # ---- POST /api/document ----
    @router.post("/api/document")
    async def create_document(request: Request, req: DocumentCreate) -> Dict[str, Any]:
        """Create a document together with its initial version row (version 1).

        The caller must hold the "can_use_documents" privilege. `req.session_id`
        is optional; when present the session must exist and must not belong to
        a different authenticated user.

        Returns:
            The new document serialised by `_doc_to_dict`.

        Raises:
            HTTPException: 404 for an unknown session, 403 for a foreign
                session, 500 for any other failure (after a rollback).
        """
        from src.auth_helpers import require_privilege
        user = require_privilege(request, "can_use_documents")
        db = SessionLocal()
        try:
            # A document may exist without a session (a "library" doc, e.g. a
            # file imported from the library): session_id is nullable and the
            # row carries its own owner stamp.
            session = None
            if req.session_id:
                session = (
                    db.query(DbSession)
                    .filter(DbSession.id == req.session_id)
                    .first()
                )
                if not session:
                    raise HTTPException(404, "Session not found")
                # Lenient ownership model used across the app (see
                # _owner_filter): reject only when an AUTHENTICATED user targets
                # a session owned by SOMEONE ELSE. In single-user /
                # unconfigured / localhost-bypass mode current_user is None and
                # such sessions are served freely everywhere else.
                foreign_session = session.owner and session.owner != user
                if user and foreign_session:
                    raise HTTPException(403, "Cannot create document in another user's session")

            new_doc_id = str(uuid.uuid4())
            first_version_id = str(uuid.uuid4())

            # When no language was supplied (e.g. cloning a doc whose language
            # was never set) sniff it from the content instead of storing NULL,
            # which made the editor fall back to plain text.
            from src.tool_implementations import _looks_like_email_document
            language = req.language
            if not language:
                from src.tool_implementations import _sniff_doc_language
                language = _sniff_doc_language(req.content)
            # Email detection wins over both the supplied and the sniffed value.
            if _looks_like_email_document(req.content, req.title):
                language = "email"

            # Stamp ownership on the row itself so the doc survives deletion of
            # its session. Unauthenticated requests (single-user / localhost
            # bypass) inherit the session's owner, if there is a session.
            if user:
                owner = user
            elif session:
                owner = session.owner
            else:
                owner = None

            doc = Document(
                id=new_doc_id,
                session_id=req.session_id,
                title=req.title,
                language=language,
                current_content=req.content,
                version_count=1,
                is_active=True,
                owner=owner,
            )
            initial_version = DocumentVersion(
                id=first_version_id,
                document_id=new_doc_id,
                version_number=1,
                content=req.content,
                summary="Initial version",
                source="user",
            )
            db.add_all([doc, initial_version])
            db.commit()
            db.refresh(doc)

            # Best-effort notification: a failing event bus must never fail
            # the request that already committed.
            try:
                from src.event_bus import fire_event
                fire_event("document_created", doc.owner)
            except Exception:
                logger.debug("document_created event dispatch failed", exc_info=True)
            return _doc_to_dict(doc)
        except HTTPException:
            raise
        except Exception as e:
            db.rollback()
            logger.error(f"Failed to create document: {e}")
            raise HTTPException(500, f"Failed to create document: {e}")
        finally:
            db.close()

    # ---- POST /api/documents/import-pdf ----
    @router.post("/api/documents/import-pdf")
    async def import_pdf(
        request: Request,
        file: UploadFile = File(...),
        session_id: Optional[str] = Form(None),
    ) -> Dict[str, Any]:
        """Upload a PDF and create the matching Document.

        Detects AcroForm fields — if any, creates a form-backed markdown doc
        (clickable inputs in the PDF view). Otherwise creates a plain PDF doc
        with a `pdf_source` marker so the viewer renders the pages without
        overlays.

        Args:
            request: Incoming request; used for the current user and client IP.
            file: The uploaded PDF.
            session_id: Optional chat session to attach the document to.

        Returns:
            The created document serialised by `_doc_to_dict`.

        Raises:
            HTTPException: 404 / 403 for an unknown or foreign session; 500
                when the upload handler is missing, saving the upload fails,
                the saved file cannot be located, or no document was created.
        """
        from src.constants import UPLOAD_DIR
        from src.pdf_forms import has_form_fields, extract_fields
        from src.pdf_form_doc import (
            save_field_sidecar,
            create_form_markdown_document,
            create_plain_pdf_document,
        )
        from src.document_processor import _process_pdf
        import os

        user = get_current_user(request)

        # session_id is optional — a library import isn't tied to a chat. When
        # given, validate it; otherwise the PDF becomes a session-less library
        # doc (the doc creators below already handle a missing session).
        if session_id:
            db = SessionLocal()
            try:
                sess = db.query(DbSession).filter(DbSession.id == session_id).first()
                if not sess:
                    raise HTTPException(404, "Session not found")
                if user and sess.owner and sess.owner != user:
                    raise HTTPException(403, "Cannot import into another user's session")
            finally:
                db.close()

        if upload_handler is None:
            raise HTTPException(500, "Upload handler not configured")

        client_ip = request.client.host if request.client else "unknown"
        try:
            meta = upload_handler.save_upload(file, client_ip, owner=user)
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"PDF import save_upload failed: {e}")
            raise HTTPException(500, f"Upload failed: {e}")

        upload_id = meta["id"]
        pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
        if not pdf_path:
            raise HTTPException(500, "Saved PDF could not be located")

        title = os.path.splitext(meta.get("original_name") or meta.get("name") or upload_id)[0]
        try:
            # str.lstrip(chars) removes any leading characters from a SET, not
            # a prefix, so lstrip("\n[PDF content]:") also ate the first
            # letters of the real text ("Document" -> "ument"). Drop leading
            # whitespace, then remove the marker only when it is really there.
            pdf_marker = "[PDF content]:"
            extracted = _process_pdf(pdf_path).lstrip()
            if extracted.startswith(pdf_marker):
                extracted = extracted[len(pdf_marker):]
            body_text = extracted.strip()
        except Exception as e:
            # Text extraction is best-effort: the document is still created,
            # just without body text.
            logger.warning(f"PDF text extraction failed for {pdf_path}: {e}")
            body_text = None

        is_form = False
        try:
            is_form = has_form_fields(pdf_path)
        except Exception as e:
            logger.warning(f"has_form_fields failed for {pdf_path}: {e}")

        if is_form:
            fields = extract_fields(pdf_path)
            save_field_sidecar(pdf_path, fields)
            doc_id = create_form_markdown_document(
                session_id=session_id,
                fields=fields,
                upload_id=upload_id,
                title=title,
                intro_text=body_text,
            )
        else:
            doc_id = create_plain_pdf_document(
                session_id=session_id,
                upload_id=upload_id,
                title=title,
                body_text=body_text,
            )

        if not doc_id:
            raise HTTPException(500, "Failed to create document for PDF")

        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(500, "Created document not found")
            # The PDF doc creators stamp owner from the session only; a
            # session-less library import leaves owner NULL, which the Library's
            # owner filter then hides. Stamp the requesting user so it shows.
            if not doc.owner and user:
                doc.owner = user
                db.commit()
                db.refresh(doc)
            return _doc_to_dict(doc)
        finally:
            db.close()

    # ---- GET /api/documents/library ----
    @router.get("/api/documents/library")
    async def documents_library(
        request: Request,
        search: Optional[str] = Query(None),
        language: Optional[str] = Query(None),
        sort: str = Query("recent"),
        offset: int = Query(0, ge=0),
        limit: int = Query(20, ge=1, le=50),
        archived: bool = Query(False),
    ) -> Dict[str, Any]:
        """Return one page of the caller's document library plus facet data.

        Args:
            request: Incoming request; used to resolve the current user.
            search: Whitespace-separated terms; every term must appear in the
                title or the content (case-insensitive, literal match).
            language: Language facet to filter by; "text" also matches rows
                whose language is NULL.
            sort: "oldest", "edits", "alpha"; anything else means most
                recently updated first.
            offset: Number of rows to skip.
            limit: Page size (1-50).
            archived: When true list ONLY archived docs, otherwise exclude them.

        Returns:
            Dict with "documents" (the page), "total" (matches before
            pagination), "languages" (language -> count) and "session_count".

        Raises:
            HTTPException: 500 when the query fails.
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            from sqlalchemy import or_
            # Archived view shows ONLY archived docs; the default view excludes
            # them (NULL = legacy rows that predate the column = not archived).
            _arch_cond = (Document.archived == True) if archived else or_(
                Document.archived == False, Document.archived.is_(None))
            # Language facet counts (owner-filtered)
            lang_q = (
                db.query(Document.language, func.count(Document.id))
                .outerjoin(DbSession, Document.session_id == DbSession.id)
                .filter(Document.is_active == True).filter(_arch_cond)
            )
            lang_q = _owner_session_filter(lang_q, user)
            lang_rows = lang_q.group_by(Document.language).all()
            languages = {lang or "text": cnt for lang, cnt in lang_rows}

            # Session count (owner-filtered)
            sc_q = (
                db.query(func.count(func.distinct(Document.session_id)))
                .outerjoin(DbSession, Document.session_id == DbSession.id)
                .filter(Document.is_active == True).filter(_arch_cond)
            )
            sc_q = _owner_session_filter(sc_q, user)
            session_count = sc_q.scalar()

            # Base query
            q = (
                db.query(Document, DbSession.name)
                .outerjoin(DbSession, Document.session_id == DbSession.id)
                .filter(Document.is_active == True).filter(_arch_cond)
            )
            q = _owner_session_filter(q, user)

            # Search filter — split on whitespace and require EACH term to
            # match (title OR content). A single `%foo bar%` LIKE only matched
            # the exact adjacent phrase, so any multi-word query with a space
            # silently returned nothing. Per-term AND makes "machine learning"
            # match docs containing both words regardless of position/order.
            if search:
                for tok in search.split():
                    # Escape LIKE metacharacters so user input such as "100%"
                    # or "snake_case" is matched literally instead of acting
                    # as a wildcard. The backslash is escaped first so it
                    # cannot combine with the escapes added afterwards.
                    literal = (
                        tok.replace("\\", "\\\\")
                        .replace("%", "\\%")
                        .replace("_", "\\_")
                    )
                    term = f"%{literal}%"
                    q = q.filter(
                        Document.title.ilike(term, escape="\\")
                        | Document.current_content.ilike(term, escape="\\")
                    )

            # Language filter
            if language:
                if language == "text":
                    q = q.filter((Document.language == None) | (Document.language == "text"))
                else:
                    q = q.filter(Document.language == language)

            # Total before pagination
            total = q.count()

            # Sorting
            if sort == "oldest":
                q = q.order_by(Document.created_at.asc())
            elif sort == "edits":
                q = q.order_by(Document.version_count.desc())
            elif sort == "alpha":
                q = q.order_by(Document.title.asc())
            else:  # recent
                q = q.order_by(Document.updated_at.desc())

            rows = q.offset(offset).limit(limit).all()

            documents = [
                {
                    "id": doc.id,
                    "session_id": doc.session_id,
                    "session_name": session_name,
                    "title": doc.title,
                    "language": doc.language or "text",
                    # Only a preview is shipped; full content is fetched per doc.
                    "preview": (doc.current_content or "")[:500],
                    "version_count": doc.version_count,
                    "created_at": (doc.created_at.isoformat() + "Z") if doc.created_at else None,
                    "updated_at": (doc.updated_at.isoformat() + "Z") if doc.updated_at else None,
                }
                for doc, session_name in rows
            ]

            return {
                "documents": documents,
                "total": total,
                "languages": languages,
                "session_count": session_count,
            }
        except HTTPException:
            # Do not convert a deliberate HTTP error from a helper into a 500.
            raise
        except Exception as e:
            logger.error(f"Failed to fetch document library: {e}")
            raise HTTPException(500, f"Failed to fetch document library: {e}")
        finally:
            db.close()

    # ---- GET /api/documents/{session_id} ----
    @router.get("/api/documents/{session_id}")
    async def list_documents(request: Request, session_id: str) -> List[Dict[str, Any]]:
        """List every document attached to one session, newest first.

        Raises:
            HTTPException: 403 when the request is unauthenticated or the
                session belongs to a different user, 404 when the session
                does not exist.
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            if not user:
                raise HTTPException(403, "Authentication required")

            session = db.query(DbSession).filter(DbSession.id == session_id).first()
            # v2 review HIGH-9: answer with an explicit error when the caller
            # cannot see this session. Returning [] looked identical to
            # "no docs" in the UI and silently masked auth failures.
            if not session:
                raise HTTPException(404, "Session not found")

            # `user` is known to be set here, so only the session's owner
            # stamp decides whether access is denied.
            session_owner = session.owner
            if session_owner and session_owner != user:
                raise HTTPException(403, "Access denied")

            newest_first = (
                db.query(Document)
                .filter(Document.session_id == session_id)
                .order_by(Document.created_at.desc())
                .all()
            )
            return [_doc_to_dict(row) for row in newest_first]
        finally:
            db.close()

    # ---- GET /api/document/{doc_id} ----
    @router.get("/api/document/{doc_id}")
    async def get_document(request: Request, doc_id: str) -> Dict[str, Any]:
        """Fetch a single document by id and return it via `_doc_to_dict`.

        The access decision is delegated to `_verify_doc_owner`.

        Raises:
            HTTPException: 404 when no document has this id.
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            found = db.query(Document).filter(Document.id == doc_id).first()
            if found:
                _verify_doc_owner(db, found, user)
                return _doc_to_dict(found)
            raise HTTPException(404, "Document not found")
        finally:
            db.close()

    # ---- POST /api/document/{doc_id}/archive — soft-archive / restore ----
    @router.post("/api/document/{doc_id}/archive")
    async def archive_document(request: Request, doc_id: str, archived: bool = Query(True)) -> Dict[str, Any]:
        """Set or clear the `archived` flag on a document.

        Args:
            request: Incoming request; used to resolve the current user.
            doc_id: Id of the document to change.
            archived: True archives the document, False restores it.

        Returns:
            Dict with "ok", the document "id" and the stored "archived" value.

        Raises:
            HTTPException: 404 when no document has this id.
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            target = db.query(Document).filter(Document.id == doc_id).first()
            if not target:
                raise HTTPException(404, "Document not found")
            # Access decision is delegated to the shared helper.
            _verify_doc_owner(db, target, user)
            target.archived = bool(archived)
            db.commit()
            return {
                "ok": True,
                "id": doc_id,
                "archived": target.archived,
            }
        finally:
            db.close()

    # ---- POST /api/document/{doc_id}/extract-pdf-text ----
    @router.post("/api/document/{doc_id}/extract-pdf-text")
    async def extract_pdf_text(request: Request, doc_id: str) -> Dict[str, Any]:
        """Re-run pypdf+VL text extraction against the PDF linked to this doc
        and merge the result into the doc's markdown content. Idempotent — the
        existing body (everything below the title heading) is replaced.

        Lets the AI see PDF contents for old docs that were imported before
        text extraction was wired, plus for scanned/image-only PDFs where the
        VL model picks up text the basic pypdf path missed.

        Returns:
            Dict with "ok", "id" and "extracted"; on success also "chars"
            (length of the extracted text), otherwise a "reason".

        Raises:
            HTTPException: 404 when the document or its source PDF is
                missing, 400 when the content has no pdf_source /
                pdf_form_source marker, 500 when extraction fails.
        """
        import re
        from src.constants import UPLOAD_DIR
        from src.document_processor import _process_pdf

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            content = doc.current_content or ""
            m = re.search(r'<!--\s*(?:pdf_source|pdf_form_source)\s+upload_id="([^"]+)"', content)
            if not m:
                raise HTTPException(400, "Document is not a PDF — no pdf_source marker found")
            upload_id = m.group(1)

            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, "Source PDF could not be located")

            try:
                # str.lstrip(chars) strips a SET of characters, not a prefix,
                # so lstrip("\n[PDF content]:") also removed the first letters
                # of the real text ("Document" -> "ument"). Drop leading
                # whitespace, then remove the marker only when it is present.
                pdf_marker = "[PDF content]:"
                extracted = _process_pdf(pdf_path).lstrip()
                if extracted.startswith(pdf_marker):
                    extracted = extracted[len(pdf_marker):]
                body_text = extracted.strip()
            except Exception as e:
                logger.error(f"extract_pdf_text failed for {pdf_path}: {e}")
                raise HTTPException(500, f"Extraction failed: {e}")

            if not body_text:
                return {"ok": True, "id": doc_id, "extracted": False, "reason": "No readable content"}

            # Preserve everything up through the title (front-matter marker +
            # first H1) and replace the rest with the freshly extracted text.
            head_re = re.compile(r'^(<!--[^>]+-->\s*\n+#[^\n]*\n+)', re.MULTILINE)
            head_match = head_re.match(content)
            if head_match:
                head = head_match.group(1)
            else:
                # No recognisable head: rebuild one from the first line of the
                # content (non-empty here, since the marker regex matched)
                # plus a heading made from the document title.
                head = content.splitlines()[0] + "\n\n# " + (doc.title or "PDF") + "\n\n"
            doc.current_content = head + body_text + "\n"
            # Legacy rows may carry a NULL counter; treat that as version 1.
            doc.version_count = (doc.version_count or 1) + 1
            db.add(DocumentVersion(
                id=str(uuid.uuid4()),
                document_id=doc_id,
                version_number=doc.version_count,
                content=doc.current_content,
                summary="PDF text re-extracted (OCR)",
                source="ocr",
            ))
            db.commit()
            return {"ok": True, "id": doc_id, "extracted": True, "chars": len(body_text)}
        finally:
            db.close()

    # ---- POST /api/documents/export-zip — bundle selected docs into a .zip ----
    @router.post("/api/documents/export-zip")
    async def documents_export_zip(request: Request):
        """Zip the selected documents (each as a text file with the right
        extension) — mirrors the gallery's bulk download-zip so multi-export
        is one file instead of a blocked flood of individual downloads.

        Expects a JSON object body of the form {"ids": [<doc id>, ...]}.
        Documents that fail the ownership check are skipped silently.

        Returns:
            A `Response` carrying the archive as "documents.zip".

        Raises:
            HTTPException: 400 when no usable id list was sent, 404 when none
                of the requested documents could be exported.
        """
        user = get_current_user(request)
        try:
            data = await request.json()
        except Exception:
            data = {}
        # A syntactically valid body that is not a JSON object (e.g. a bare
        # list or string) has no .get(); treat it like an empty request
        # instead of crashing with an AttributeError.
        if not isinstance(data, dict):
            data = {}
        ids = data.get("ids") or []
        # Only a non-empty list is meaningful for the IN (...) filter below.
        if not isinstance(ids, list) or not ids:
            raise HTTPException(400, "No documents specified")
        _ext = {
            "javascript": ".js", "python": ".py", "html": ".html", "css": ".css",
            "markdown": ".md", "json": ".json", "yaml": ".yml", "bash": ".sh",
            "sql": ".sql", "rust": ".rs", "go": ".go", "java": ".java", "c": ".c",
            "cpp": ".cpp", "typescript": ".ts", "ruby": ".rb", "php": ".php",
            "text": ".txt", "xml": ".xml", "toml": ".toml", "ini": ".ini",
        }
        db = SessionLocal()
        try:
            import io
            import re
            import zipfile
            from fastapi import Response
            docs = db.query(Document).filter(Document.id.in_(ids)).all()
            buf = io.BytesIO()
            used = set()   # archive member names already taken
            wrote = 0
            with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
                for doc in docs:
                    try:
                        _verify_doc_owner(db, doc, user)
                    except HTTPException:
                        continue   # skip docs the user doesn't own
                    ext = _ext.get(doc.language or "text", ".txt")
                    base = (doc.title or "document").strip() or "document"
                    # Keep word characters, dash, dot and space only; fall
                    # back to the doc id when nothing usable is left.
                    base = re.sub(r"[^\w\-. ]+", "", base)[:60].strip() or doc.id
                    # A title that already contains a dot is assumed to carry
                    # its own extension.
                    name = base if "." in base else base + ext
                    i = 1
                    while name in used:
                        name = f"{base}-{i}" + ("" if "." in base else ext)
                        i += 1
                    used.add(name)
                    zf.writestr(name, doc.current_content or "")
                    wrote += 1
            if not wrote:
                raise HTTPException(404, "No documents found")
            return Response(
                content=buf.getvalue(),
                media_type="application/zip",
                headers={"Content-Disposition": 'attachment; filename="documents.zip"'},
            )
        finally:
            db.close()

    # ---- PUT /api/document/{doc_id} — user manual edit ----
    # Coalesce window: if the last user version was saved within this many
    # seconds, update it in-place (user is still actively editing).
    # Once the gap exceeds this, the next save creates a new version.
    # update_document compares the age of the newest version row against this
    # value with a strict "<", and only when that row's source is "user".
    VERSION_COALESCE_SECONDS = 60

    @router.put("/api/document/{doc_id}")
    async def update_document(request: Request, doc_id: str, req: DocumentUpdate) -> Dict[str, Any]:
        """Store a manual edit of a document's content.

        Identical content is a no-op. If the newest version row was written by
        the user less than VERSION_COALESCE_SECONDS ago, that row is
        overwritten in place; otherwise a new version row is appended and the
        document's version counter is bumped.

        Returns:
            The updated document serialised by `_doc_to_dict`.

        Raises:
            HTTPException: 404 when the document does not exist, 500 for any
                other failure (after a rollback).
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            # Skip if content is identical
            if doc.current_content == req.content:
                return _doc_to_dict(doc)

            # Check if we can coalesce with the latest version
            latest_ver = db.query(DocumentVersion).filter(
                DocumentVersion.document_id == doc_id,
            ).order_by(DocumentVersion.version_number.desc()).first()

            now = datetime.now(timezone.utc)
            coalesced = False
            # A version row without a timestamp cannot be aged, so it is never
            # coalesced (previously this raised AttributeError -> HTTP 500).
            if (
                latest_ver
                and latest_ver.source == "user"
                and latest_ver.created_at is not None
            ):
                ver_time = latest_ver.created_at
                # Naive timestamps are treated as UTC so they can be
                # subtracted from the timezone-aware `now`.
                if ver_time.tzinfo is None:
                    ver_time = ver_time.replace(tzinfo=timezone.utc)
                age = (now - ver_time).total_seconds()
                if age < VERSION_COALESCE_SECONDS:
                    # Update the existing version in-place
                    latest_ver.content = req.content
                    latest_ver.created_at = now
                    if req.summary:
                        latest_ver.summary = req.summary
                    coalesced = True

            if not coalesced:
                # Legacy rows may carry a NULL counter; treat that as version 1
                # (same convention as extract_pdf_text).
                new_ver = (doc.version_count or 1) + 1
                ver = DocumentVersion(
                    id=str(uuid.uuid4()),
                    document_id=doc_id,
                    version_number=new_ver,
                    content=req.content,
                    summary=req.summary or "Manual edit",
                    source="user",
                )
                doc.version_count = new_ver
                db.add(ver)

            doc.current_content = req.content
            db.commit()
            db.refresh(doc)
            return _doc_to_dict(doc)
        except HTTPException:
            raise
        except Exception as e:
            db.rollback()
            logger.error(f"Failed to update document: {e}")
            raise HTTPException(500, f"Failed to update document: {e}")
        finally:
            db.close()

    # ---- PATCH /api/document/{doc_id} — metadata only ----
    @router.patch("/api/document/{doc_id}")
    async def patch_document(request: Request, doc_id: str, req: DocumentPatch) -> Dict[str, Any]:
        """Update document metadata (title, language, session link) only.

        Fields left as None are not touched. An empty-string `session_id`
        unlinks the document from its session. Re-linking to a different
        session is validated the same way document creation validates it.

        Returns:
            The updated document serialised by `_doc_to_dict`.

        Raises:
            HTTPException: 404 when the document or the target session does
                not exist, 403 when the target session belongs to another
                user, 500 for any other failure (after a rollback).
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)
            if req.title is not None:
                doc.title = req.title
            if req.language is not None:
                doc.language = req.language
            if req.session_id is not None:
                if not req.session_id:
                    # Empty string = unlink from session
                    doc.session_id = None
                elif req.session_id != doc.session_id:
                    # Only a real move is validated, so re-sending the current
                    # link alongside other fields keeps working.
                    target = db.query(DbSession).filter(DbSession.id == req.session_id).first()
                    if not target:
                        raise HTTPException(404, "Session not found")
                    # Same lenient ownership rule as create_document: block
                    # only an authenticated user targeting someone else's
                    # session.
                    if user and target.owner and target.owner != user:
                        raise HTTPException(403, "Cannot move document into another user's session")
                    doc.session_id = req.session_id
            db.commit()
            db.refresh(doc)
            return _doc_to_dict(doc)
        except HTTPException:
            raise
        except Exception as e:
            db.rollback()
            raise HTTPException(500, str(e))
        finally:
            db.close()

    # ---- DELETE /api/document/{doc_id} — soft delete ----
    @router.delete("/api/document/{doc_id}")
    async def delete_document(request: Request, doc_id: str) -> Dict[str, str]:
        """Soft-delete a document by clearing its `is_active` flag.

        The row itself is kept; only the flag changes.

        Raises:
            HTTPException: 404 when the document does not exist, 500 for any
                other failure (after a rollback).
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            target = db.query(Document).filter(Document.id == doc_id).first()
            if not target:
                raise HTTPException(404, "Document not found")
            # Access decision is delegated to the shared helper.
            _verify_doc_owner(db, target, user)
            target.is_active = False
            db.commit()
            outcome = {"status": "deleted", "id": doc_id}
            return outcome
        except HTTPException:
            raise
        except Exception as err:
            db.rollback()
            raise HTTPException(500, str(err))
        finally:
            db.close()

    # ---- GET /api/document/{doc_id}/versions ----
    @router.get("/api/document/{doc_id}/versions")
    async def list_versions(request: Request, doc_id: str) -> List[Dict[str, Any]]:
        """List all stored versions of a document, newest version first.

        Returns:
            One dict per version with id, version_number, content, summary,
            source and created_at (ISO string or None).

        Raises:
            HTTPException: 404 when the document does not exist.
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            # Verify ownership before listing versions. A missing document is
            # rejected outright: previously the ownership check was skipped in
            # that case and any version rows still stored under this id were
            # returned to whoever asked.
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)
            versions = db.query(DocumentVersion).filter(
                DocumentVersion.document_id == doc_id
            ).order_by(DocumentVersion.version_number.desc()).all()
            return [{
                "id": v.id,
                "version_number": v.version_number,
                "content": v.content,
                "summary": v.summary,
                "source": v.source,
                "created_at": v.created_at.isoformat() if v.created_at else None,
            } for v in versions]
        finally:
            db.close()

    # ---- GET /api/document/{doc_id}/version/{num} ----
    @router.get("/api/document/{doc_id}/version/{num}")
    async def get_version(request: Request, doc_id: str, num: int) -> Dict[str, Any]:
        """Return one specific version of a document via `_version_to_dict`.

        Args:
            request: Incoming request; used to resolve the current user.
            doc_id: Id of the document.
            num: Version number to fetch.

        Raises:
            HTTPException: 404 when the document or the version does not
                exist.
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            # Verify ownership. A missing document is rejected outright:
            # previously the check was skipped in that case, so version rows
            # still stored under this id were readable without any check.
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)
            ver = db.query(DocumentVersion).filter(
                DocumentVersion.document_id == doc_id,
                DocumentVersion.version_number == num,
            ).first()
            if not ver:
                raise HTTPException(404, "Version not found")
            return _version_to_dict(ver)
        finally:
            db.close()

    # ---- POST /api/document/{doc_id}/restore/{num} ----
    @router.post("/api/document/{doc_id}/restore/{num}")
    async def restore_version(request: Request, doc_id: str, num: int) -> Dict[str, Any]:
        """Restore an earlier version by appending it as a new version.

        History is never rewritten: the content of version `num` is copied
        into a fresh version row, which also becomes the current content.

        Returns:
            The updated document serialised by `_doc_to_dict`.

        Raises:
            HTTPException: 404 when the document or the version does not
                exist, 500 for any other failure (after a rollback).
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            old_ver = db.query(DocumentVersion).filter(
                DocumentVersion.document_id == doc_id,
                DocumentVersion.version_number == num,
            ).first()
            if not old_ver:
                raise HTTPException(404, "Version not found")

            # Legacy rows may carry a NULL counter; treat that as version 1
            # (same convention as extract_pdf_text) instead of failing with a
            # TypeError that surfaces as HTTP 500.
            new_ver_num = (doc.version_count or 1) + 1
            ver = DocumentVersion(
                id=str(uuid.uuid4()),
                document_id=doc_id,
                version_number=new_ver_num,
                content=old_ver.content,
                summary=f"Restored from v{num}",
                source="user",
            )
            doc.current_content = old_ver.content
            doc.version_count = new_ver_num
            db.add(ver)
            db.commit()
            db.refresh(doc)
            return _doc_to_dict(doc)
        except HTTPException:
            raise
        except Exception as e:
            db.rollback()
            raise HTTPException(500, str(e))
        finally:
            db.close()

    # ---- POST /api/documents/tidy — clean up broken/empty documents ----
    @router.post("/api/documents/tidy")
    async def tidy_documents(request: Request) -> Dict[str, Any]:
        """Fix empty titles and remove broken/empty documents (user's docs only).

        Active, non-archived documents are inspected one by one: junk is
        hard-deleted, survivors with an empty or "Untitled" title get a title
        derived from their content. Inactive documents with no content are
        purged as well.

        Returns:
            Dict with "fixed_titles", "deleted" and a human-readable "message".

        Raises:
            HTTPException: 500 when anything fails (after a rollback).
        """
        user = get_current_user(request)
        db = SessionLocal()
        try:
            active_q = (
                db.query(Document)
                .outerjoin(DbSession, Document.session_id == DbSession.id)
                .filter(Document.is_active == True)
                .filter((Document.archived == False) | (Document.archived.is_(None)))
            )
            docs = _owner_session_filter(active_q, user).all()

            # Same junk-detection logic as the scheduled tidy_documents
            # action (src/document_actions.py). Keep these two in sync.
            import re as _re
            from src.document_actions import _JUNK_TITLES

            # Patterns and the filler test do not depend on the document, so
            # they are built once up front.
            heading_prefix_re = _re.compile(r"^#{1,6}\s+", _re.MULTILINE)
            markdown_noise_re = _re.compile(r"[*_`>\-=]+")
            whitespace_run_re = _re.compile(r"\s+")
            header_line_re = _re.compile(r"^(to|from|cc|bcc|subject|reply-to):\s*(.*)$", _re.I)
            placeholder_values = {"", "empty", "(empty)", "-", "—", "none", "n/a", "na", "tbd"}
            email_stub_titles = ("new email", "new mail", "new message")

            def is_blank_header(line):
                """True for a To:/From:/Subject:-style line with no real value."""
                hit = header_line_re.match(line)
                if not hit:
                    return False
                return (hit.group(2) or "").strip().lower() in placeholder_values

            fixed_titles = 0
            to_delete = []
            for doc in docs:
                content = (doc.current_content or "").strip()
                title_raw = (doc.title or "").strip()
                title = title_raw.lower()

                # Strip markdown noise to get a "real" character count.
                plain = heading_prefix_re.sub("", content)
                plain = markdown_noise_re.sub("", plain)
                plain = whitespace_run_re.sub(" ", plain).strip()
                real_len = len(plain)

                # Email-scaffold stubs: "To: \nSubject: \n---\n" style bodies
                # with nothing typed in, i.e. every meaningful line is a
                # header label whose value is blank or a placeholder.
                is_email_stub = False
                if title in email_stub_titles or doc.language == "email":
                    trimmed = (ln.strip() for ln in content.split("\n"))
                    body_lines = [ln for ln in trimmed if ln and ln != "---"]
                    is_email_stub = bool(body_lines) and all(
                        is_blank_header(ln) for ln in body_lines
                    )

                # Hard-delete obviously empty / junk documents. The checks
                # short-circuit in this order.
                is_junk = (
                    not content
                    or content == "# Untitled"
                    or is_email_stub
                    or title in _JUNK_TITLES
                    or real_len < 30
                    or ("\n" not in content and real_len < 50)
                )
                if is_junk:
                    to_delete.append(doc)
                    continue

                # Fix empty or placeholder titles on survivors.
                if title_raw and title_raw != "Untitled":
                    continue
                new_title = _derive_title(content)
                if new_title and new_title != "Untitled":
                    doc.title = new_title
                    fixed_titles += 1

            for doc in to_delete:
                db.delete(doc)

            # Also clean up inactive empty docs from previous soft-deletes.
            inactive_q = (
                db.query(Document)
                .outerjoin(DbSession, Document.session_id == DbSession.id)
                .filter(Document.is_active == False)
                .filter((Document.current_content == None) | (Document.current_content == ""))
            )
            inactive_docs = _owner_session_filter(inactive_q, user).all()
            for doc in inactive_docs:
                db.delete(doc)

            deleted = len(to_delete) + len(inactive_docs)

            db.commit()
            return {
                "fixed_titles": fixed_titles,
                "deleted": deleted,
                "message": f"Fixed {fixed_titles} title{'s' if fixed_titles != 1 else ''}, removed {deleted} empty document{'s' if deleted != 1 else ''}",
            }
        except Exception as e:
            db.rollback()
            logger.error(f"Document tidy failed: {e}")
            raise HTTPException(500, f"Tidy failed: {e}")
        finally:
            db.close()

    # ---- POST /api/documents/ai-tidy — AI-powered cleanup of junk/test documents ----
    @router.post("/api/documents/ai-tidy")
    async def ai_tidy_documents(request: Request) -> Dict[str, Any]:
        """Use AI to judge if documents are junk/test/accidental, then delete them.

        Caches verdicts so previously-reviewed docs are skipped: only active,
        non-archived documents without a ``tidy_verdict`` are sent, at most 30
        per call, each as its title, language and a 300-character preview.

        Returns:
            A dict with ``deleted`` and ``reviewed`` counts and a ``message``.
            When a batch was sent it also carries ``remaining``, the number of
            documents that still have no verdict after this call.

        Raises:
            HTTPException: 500 if no endpoint is configured, if the model reply
                contains no parseable JSON array, or if anything else fails
                (the transaction is rolled back in that last case).
        """
        import json as _json
        import re
        from src.task_endpoint import resolve_task_endpoint
        from src.endpoint_resolver import resolve_endpoint
        from src.llm_core import llm_call_async

        user = get_current_user(request)
        url, model, headers = resolve_task_endpoint()
        if not url or not model:
            # Fall back to default endpoint
            url, model, headers = resolve_endpoint("default")
        if not url or not model:
            raise HTTPException(500, "No endpoint configured for AI tidy")

        db = SessionLocal()
        try:
            q = (
                db.query(Document)
                .outerjoin(DbSession, Document.session_id == DbSession.id)
                .filter(Document.is_active == True)
                .filter((Document.archived == False) | (Document.archived.is_(None)))
            )
            q = _owner_session_filter(q, user)
            docs = q.all()

            # Only review docs that haven't been reviewed yet
            to_review = [d for d in docs if not d.tidy_verdict]
            if not to_review:
                return {"deleted": 0, "reviewed": 0, "message": "All documents already reviewed"}

            # Build a batch prompt — review up to 30 at a time
            batch = to_review[:30]
            doc_list = []
            for i, doc in enumerate(batch):
                preview = (doc.current_content or "")[:300].strip()
                doc_list.append(f"[{i}] title=\"{doc.title}\" lang={doc.language or 'text'} content_preview=\"{preview}\"")

            prompt = (
                "You are a document library cleaner. For each document below, decide if it is JUNK "
                "(test, accidental, placeholder, empty-ish, tool-test, throwaway) or KEEP (real content worth saving).\n\n"
                "Respond with ONLY a JSON array of verdicts, one per document, like: [\"junk\",\"keep\",\"junk\",...]\n"
                "No explanation, no markdown, just the JSON array.\n\n"
                + "\n".join(doc_list)
            )

            response = await llm_call_async(
                url, model,
                [{"role": "system", "content": "You classify documents as junk or keep. Respond only with a JSON array."},
                 {"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=200,
                headers=headers,
                timeout=30,
            )

            # Parse verdicts. An empty reply or broken JSON is reported the
            # same way as a reply with no array in it.
            match = re.search(r'\[.*?\]', response or "", re.DOTALL)
            if not match:
                raise HTTPException(500, "AI returned invalid response")
            try:
                verdicts = _json.loads(match.group())
            except ValueError:
                raise HTTPException(500, "AI returned invalid response")

            deleted = 0
            reviewed = 0
            # zip() stops at the shorter sequence, so documents the model gave
            # no verdict for stay unreviewed and are picked up by a later call.
            for doc, verdict in zip(batch, verdicts):
                if not isinstance(verdict, str):
                    # Malformed entry (null, number, object): leave this doc
                    # unreviewed instead of failing the whole batch.
                    continue
                if verdict.lower().strip() == "junk":
                    doc.tidy_verdict = "junk"
                    db.delete(doc)
                    deleted += 1
                else:
                    # Anything that is not exactly "junk" is kept.
                    doc.tidy_verdict = "keep"
                reviewed += 1

            db.commit()
            return {
                "deleted": deleted,
                "reviewed": reviewed,
                # Count what actually received a verdict, not what was sent.
                "remaining": len(to_review) - reviewed,
                "message": f"Reviewed {reviewed}, removed {deleted} junk document{'s' if deleted != 1 else ''}",
            }
        except HTTPException:
            raise
        except Exception as e:
            db.rollback()
            logger.error(f"AI tidy failed: {e}")
            raise HTTPException(500, f"AI tidy failed: {e}")
        finally:
            db.close()

    # ---- POST /api/document/{doc_id}/export-pdf/preview ----
    @router.post("/api/document/{doc_id}/export-pdf/preview")
    async def export_pdf_preview(doc_id: str, request: Request) -> Dict[str, Any]:
        """Report which field values an export of this document would write.

        Nothing is written here; the frontend shows the result in a
        confirmation modal so wrong values can be corrected before the real
        download is triggered.
        """
        from src.pdf_form_doc import find_source_upload_id, parse_markdown_to_values, load_field_sidecar
        from src.constants import UPLOAD_DIR

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            markdown = doc.current_content or ""
            upload_id = find_source_upload_id(markdown)
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")

            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")

            schema = load_field_sidecar(pdf_path)
            if not schema:
                raise HTTPException(404, "Field schema sidecar missing for source PDF")

            values = parse_markdown_to_values(markdown)
            meta_by_name = {entry["name"]: entry for entry in schema}

            # Pair every markdown value with its schema entry (None if absent).
            matched = [
                (field_name, field_value, meta_by_name.get(field_name))
                for field_name, field_value in values.items()
            ]
            rows = [
                {
                    "name": field_name,
                    "label": meta.get("label") or field_name,
                    "type": meta.get("type"),
                    "options": meta.get("options") or [],
                    "page": meta.get("page"),
                    "value": field_value,
                }
                for field_name, field_value, meta in matched
                if meta
            ]

            # Names present in the markdown but not in the PDF's field schema.
            unknown_names = [
                field_name for field_name in values
                if field_name not in meta_by_name
            ]
            filled_count = len(
                [row for row in rows if row["value"] not in ("", False, None)]
            )
            return {
                "doc_id": doc_id,
                "upload_id": upload_id,
                "fields": rows,
                "unknown_fields": unknown_names,
                "total": len(schema),
                "filled": filled_count,
            }
        finally:
            db.close()

    # ---- GET /api/document/{doc_id}/render-pages ----
    @router.get("/api/document/{doc_id}/render-pages")
    async def render_pages(doc_id: str, request: Request) -> Dict[str, Any]:
        """Describe every page of the source PDF for the interactive view.

        For each page the response gives the pixel size of its rendered image
        (page size multiplied by the shared render scale) and the form fields
        on it, with each field rect scaled into the same pixel space so the
        frontend can position HTML form controls over the image.
        """
        from src.pdf_form_doc import find_source_upload_id, parse_markdown_to_values, load_field_sidecar
        from src.constants import UPLOAD_DIR
        import fitz

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            markdown = doc.current_content or ""
            upload_id = find_source_upload_id(markdown)
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")
            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, f"Source PDF {upload_id} not found")

            schema = load_field_sidecar(pdf_path) or []
            values = parse_markdown_to_values(markdown)

            # Bucket schema entries under their page number.
            fields_by_page: Dict[int, list] = {}
            for entry in schema:
                fields_by_page.setdefault(entry["page"], []).append(entry)

            scale = _PDF_RENDER_SCALE

            def _overlay(entry):
                """Build the frontend description of one field, rect in pixels."""
                left, top, right, bottom = entry["rect"]
                return {
                    "name": entry["name"],
                    "type": entry["type"],
                    "label": entry.get("label") or "",
                    "options": entry.get("options") or [],
                    # A value typed into the markdown wins over the schema's own.
                    "value": values.get(entry["name"], entry.get("value", "")),
                    "rect_px": [
                        int(left * scale), int(top * scale),
                        int(right * scale), int(bottom * scale),
                    ],
                }

            pdf_doc = fitz.open(pdf_path)
            try:
                pages_out = []
                for page_no in range(1, pdf_doc.page_count + 1):
                    page = pdf_doc[page_no - 1]
                    page_w, page_h = page.rect.width, page.rect.height
                    pages_out.append({
                        "page": page_no,
                        "width": int(page_w * scale),
                        "height": int(page_h * scale),
                        "fields": [
                            _overlay(entry)
                            for entry in fields_by_page.get(page_no, [])
                        ],
                    })
                return {"doc_id": doc_id, "scale": scale, "pages": pages_out}
            finally:
                pdf_doc.close()
        finally:
            db.close()

    # ---- GET /api/document/{doc_id}/page/{n}.png ----
    @router.get("/api/document/{doc_id}/page/{page_no}.png")
    async def render_page_png(doc_id: str, page_no: int, request: Request):
        """Serve a single page of the source PDF as a PNG image.

        Field values are not drawn onto the image; the frontend places HTML
        form inputs over it. ``page_no`` is 1-based.
        """
        from fastapi.responses import Response
        from src.pdf_form_doc import find_source_upload_id
        from src.constants import UPLOAD_DIR
        import fitz

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            upload_id = find_source_upload_id(doc.current_content or "")
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")

            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, "Source PDF not found")
        finally:
            # The database is only needed to resolve the path; release it
            # before rendering.
            db.close()

        pdf_doc = fitz.open(pdf_path)
        try:
            if not 1 <= page_no <= pdf_doc.page_count:
                raise HTTPException(404, "Page out of range")
            target_page = pdf_doc[page_no - 1]
            zoom = fitz.Matrix(_PDF_RENDER_SCALE, _PDF_RENDER_SCALE)
            image = target_page.get_pixmap(matrix=zoom, alpha=False)
            return Response(
                content=image.tobytes("png"),
                media_type="image/png",
                headers={"Cache-Control": "public, max-age=3600"},
            )
        finally:
            pdf_doc.close()

    # ---- POST /api/document/{doc_id}/ai-fill-annotations ----
    @router.post("/api/document/{doc_id}/ai-fill-annotations")
    async def ai_fill_annotations(doc_id: str, request: Request) -> Dict[str, Any]:
        """Ask a vision-capable LLM to locate fillable areas on a flat PDF and
        propose annotation values for each, given a free-form user instruction.

        Every page is rendered to PNG and sent to the model in its own call. A
        page whose call fails or whose reply is not a JSON array is logged and
        skipped, so the result may cover only some pages.

        Request body (JSON object): ``{"instruction": "<non-empty string>"}``.

        Returns a list of annotations: [{page, x, y, w, h, value}] where x/y/w/h
        are page-percentages (0–100) — same coordinate system as the freeform
        annotations the frontend already renders.

        Raises:
            HTTPException: 400 if the body is not valid JSON, the instruction is
                missing/blank/not a string, or the document has no source PDF;
                404 if the document or its source PDF is missing; 503 if no
                vision model can be resolved.
        """
        import base64
        import json
        import math
        import fitz
        from src.pdf_form_doc import find_source_upload_id
        from src.constants import UPLOAD_DIR
        from src.document_processor import _resolve_vl_model, _load_vl_settings
        from src.llm_core import llm_call_async

        # Bad client input must surface as 400, never as an unhandled 500.
        is_json = request.headers.get("content-type", "").startswith("application/json")
        try:
            body = await request.json() if is_json else {}
        except ValueError:
            raise HTTPException(400, "Request body is not valid JSON")
        instruction = body.get("instruction") if isinstance(body, dict) else None
        if not isinstance(instruction, str) or not instruction.strip():
            raise HTTPException(400, "instruction is required")
        instruction = instruction.strip()

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)
            upload_id = find_source_upload_id(doc.current_content or "")
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")
            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, "Source PDF not found")
        finally:
            db.close()

        # Resolve VL model (admin-configured or auto-detected vision-capable)
        settings = _load_vl_settings()
        vl_model = settings.get("vision_model", "")
        try:
            url, model_id, headers = _resolve_vl_model(vl_model)
        except Exception as e:
            raise HTTPException(503, f"No vision model available: {e}")

        system_prompt = (
            "You analyze rendered PDF page images and propose values to fill in. "
            "For each blank line, box, underscore, or labeled space on the page that "
            "should be filled given the user's instruction, output one annotation. "
            "Coordinates are percentages (0-100) of the page width/height with the "
            "origin at top-left. Width/height should match the visible blank box. "
            "Return ONLY a JSON array, no prose, no markdown fences. Each entry: "
            '{"x": number, "y": number, "w": number, "h": number, "value": string}. '
            "If a region should not be filled, omit it. If nothing should be filled, "
            "return []."
        )

        def _normalize(item, page_no):
            """Validate one model-proposed entry; return an annotation dict or None."""
            if not isinstance(item, dict):
                return None
            try:
                x = float(item.get("x", 0))
                y = float(item.get("y", 0))
                w = float(item.get("w", 0))
                h = float(item.get("h", 0))
                value = str(item.get("value", "") or "")
            except (TypeError, ValueError, OverflowError):
                return None
            # json.loads accepts NaN/Infinity; such numbers compare False in
            # the size check below and would otherwise end up in the response.
            if not all(math.isfinite(n) for n in (x, y, w, h)):
                return None
            # Reject boxes that are at most 0.5% wide or 0.3% tall.
            if w <= 0.5 or h <= 0.3:
                return None
            # Clamp the origin onto the page, then shrink the box to fit.
            x = max(0.0, min(99.0, x))
            y = max(0.0, min(99.0, y))
            w = max(0.5, min(100.0 - x, w))
            h = max(0.3, min(100.0 - y, h))
            if not value.strip():
                return None
            return {
                "page": page_no,
                "x": round(x, 2),
                "y": round(y, 2),
                "w": round(w, 2),
                "h": round(h, 2),
                "value": value,
            }

        all_annotations = []
        pdf_doc = fitz.open(pdf_path)
        try:
            # The render matrix is the same for every page; build it once.
            mat = fitz.Matrix(_PDF_RENDER_SCALE, _PDF_RENDER_SCALE)
            for page_index in range(pdf_doc.page_count):
                page = pdf_doc[page_index]
                pix = page.get_pixmap(matrix=mat, alpha=False)
                png_bytes = pix.tobytes("png")
                b64 = base64.b64encode(png_bytes).decode("ascii")

                messages = [
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": (
                                    f"User instruction:\n{instruction}\n\n"
                                    f"This is page {page_index + 1} of {pdf_doc.page_count}. "
                                    "Return JSON array of annotations to add to this page."
                                ),
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/png;base64,{b64}"},
                            },
                        ],
                    },
                ]
                try:
                    raw = await llm_call_async(
                        url, model_id, messages,
                        temperature=0.1, max_tokens=2000, headers=headers,
                    )
                except Exception as e:
                    # Best effort: one failing page must not sink the others.
                    logger.error(f"VL call failed on page {page_index + 1}: {e}")
                    continue

                raw = (raw or "").strip()
                if raw.startswith("```"):
                    # Drop a markdown fence the model added despite the prompt.
                    raw = raw.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
                try:
                    parsed = json.loads(raw)
                except Exception:
                    logger.warning(f"AI fill: page {page_index + 1} returned non-JSON: {raw[:200]}")
                    continue
                if not isinstance(parsed, list):
                    continue
                for item in parsed:
                    annotation = _normalize(item, page_index + 1)
                    if annotation is not None:
                        all_annotations.append(annotation)
        finally:
            pdf_doc.close()

        return {"annotations": all_annotations}

    # ---- GET /api/document/{doc_id}/render-pdf ----
    @router.get("/api/document/{doc_id}/render-pdf")
    async def render_pdf(doc_id: str, request: Request):
        """Inline PDF preview filled with the current markdown values.

        Same plumbing as the export route, but signature form fields are not
        stamped, and the file is served inline (Content-Disposition: inline)
        so the browser can embed it in an iframe. Freeform annotations are
        still burned in; if that step fails the error is logged and the
        fields-only PDF is served instead. Cache-busted by the caller via
        query string.

        Raises:
            HTTPException: 404 if the document or its source PDF is missing,
                400 if the document is not linked to a source PDF, 500 if
                filling the form fields fails.
        """
        import base64
        import os
        import tempfile
        from fastapi.responses import FileResponse
        from starlette.background import BackgroundTask
        from src.pdf_form_doc import find_source_upload_id, parse_markdown_to_values, parse_markdown_annotations
        from src.pdf_forms import fill_fields, stamp_annotations
        from src.constants import UPLOAD_DIR
        from core.database import Signature

        # Track temp files for this request so they get unlinked AFTER
        # the response is fully sent (BackgroundTask runs post-send).
        _to_unlink: list[str] = []
        def _cleanup_temps():
            for _p in _to_unlink:
                try:
                    os.unlink(_p)
                except FileNotFoundError:
                    pass
                except Exception as _e:
                    logger.warning(f"Could not unlink temp PDF {_p}: {_e}")

        def _new_temp_pdf() -> str:
            """Create an empty temp .pdf, register it for cleanup, return its path."""
            # The context manager closes the handle right away instead of
            # leaving that to garbage collection; delete=False keeps the file.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as _tmp:
                _path = _tmp.name
            _to_unlink.append(_path)
            return _path

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)
            upload_id = find_source_upload_id(doc.current_content or "")
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")
            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, f"Source PDF {upload_id} not found")

            values = parse_markdown_to_values(doc.current_content or "")
            out_path = _new_temp_pdf()
            try:
                fill_fields(pdf_path, out_path, values)
            except Exception as e:
                logger.error(f"render_pdf fill_fields failed for {doc_id}: {e}")
                # Temp files are removed by the outer handler below.
                raise HTTPException(500, f"PDF render failed: {e}")

            annotations = parse_markdown_annotations(doc.current_content or "")
            if annotations:
                ann_sig_ids = [
                    a["value"][len("signature:"):].strip()
                    for a in annotations
                    if a.get("kind") == "signature"
                    and isinstance(a.get("value"), str)
                    and a["value"].startswith("signature:")
                ]
                ann_signature_pngs: dict[str, bytes] = {}
                if ann_sig_ids:
                    # SECURITY: filter by owner so a caller can't reference
                    # someone else's signature ID from doc markdown and have
                    # it stamped/exported.
                    _sig_q = db.query(Signature).filter(Signature.id.in_(ann_sig_ids))
                    if user:
                        _sig_q = _sig_q.filter(Signature.owner == user)
                    sig_rows = _sig_q.all()
                    for s in sig_rows:
                        try:
                            ann_signature_pngs[s.id] = base64.b64decode(s.data_png)
                        except Exception as e:
                            logger.warning(f"Bad annotation signature data for {s.id}: {e}")
                annotated_path = _new_temp_pdf()
                try:
                    stamp_annotations(out_path, annotated_path, annotations, ann_signature_pngs)
                    out_path = annotated_path
                except Exception as e:
                    # Best effort: fall back to the PDF without annotations.
                    logger.error(f"stamp_annotations (render) failed for {doc_id}: {e}")

            return FileResponse(
                out_path,
                media_type="application/pdf",
                headers={"Content-Disposition": "inline"},
                background=BackgroundTask(_cleanup_temps),
            )
        except Exception:
            # No response was built, so the background task will never run;
            # remove whatever temp files exist before propagating the error.
            _cleanup_temps()
            raise
        finally:
            db.close()

    # ---- GET /api/document/{doc_id}/export-pdf ----
    @router.get("/api/document/{doc_id}/export-pdf")
    async def export_pdf(doc_id: str, request: Request):
        """Stream the filled PDF for download.

        Reads field values and signature selections from the markdown — there
        is no separate confirmation step. Signature fields contain their
        chosen signature ID encoded as `signature:<id>` in the value.

        Pipeline: fill the non-signature form fields, stamp the signature
        fields, then burn in freeform annotations. A failure in either
        stamping step is logged and the output of the previous step is served.

        Raises:
            HTTPException: 404 if the document or its source PDF is missing,
                400 if the document is not linked to a source PDF, 500 if
                filling the form fields fails.
        """
        import base64
        import os
        import tempfile
        from fastapi.responses import FileResponse
        from starlette.background import BackgroundTask
        from src.pdf_form_doc import find_source_upload_id, parse_markdown_to_values, load_field_sidecar, parse_markdown_annotations
        from src.pdf_forms import fill_fields, stamp_signatures, stamp_annotations
        from src.constants import UPLOAD_DIR
        from core.database import Signature

        # Temp files of this request; unlinked after the response is sent
        # (background task) or immediately if the request fails.
        _to_unlink: list[str] = []
        def _cleanup_temps():
            for _p in _to_unlink:
                try:
                    os.unlink(_p)
                except FileNotFoundError:
                    pass
                except Exception as _e:
                    logger.warning(f"Could not unlink temp PDF {_p}: {_e}")

        def _new_temp_pdf() -> str:
            """Create an empty temp .pdf, register it for cleanup, return its path."""
            # The context manager closes the handle right away instead of
            # leaving that to garbage collection; delete=False keeps the file.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as _tmp:
                _path = _tmp.name
            _to_unlink.append(_path)
            return _path

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            upload_id = find_source_upload_id(doc.current_content or "")
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")

            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")

            schema = load_field_sidecar(pdf_path) or []
            sig_field_names = {f["name"] for f in schema if f.get("type") == "signature"}

            all_values = parse_markdown_to_values(doc.current_content or "")
            # Split: signature fields go to stamps, everything else to fill_fields.
            # A signature field whose value is not "signature:<id>" is dropped.
            text_values: dict = {}
            sig_ids: dict[str, str] = {}
            for name, raw in all_values.items():
                if name in sig_field_names and isinstance(raw, str) and raw.startswith("signature:"):
                    sig_ids[name] = raw[len("signature:"):].strip()
                elif name not in sig_field_names:
                    text_values[name] = raw

            stamps: dict = {}
            if sig_ids:
                # SECURITY: filter by owner — same reason as render_pdf.
                _sig_q2 = db.query(Signature).filter(Signature.id.in_(list(sig_ids.values())))
                if user:
                    _sig_q2 = _sig_q2.filter(Signature.owner == user)
                rows = _sig_q2.all()
                by_id = {s.id: s for s in rows}
                for field_name, sid in sig_ids.items():
                    s = by_id.get(sid)
                    if not s:
                        continue
                    try:
                        stamps[field_name] = base64.b64decode(s.data_png)
                    except Exception as e:
                        logger.warning(f"Bad signature data for {sid}: {e}")

            filled_path = _new_temp_pdf()
            try:
                fill_fields(pdf_path, filled_path, text_values)
            except Exception as e:
                logger.error(f"fill_fields failed for doc {doc_id}: {e}")
                # Temp files are removed by the outer handler below.
                raise HTTPException(500, f"PDF fill failed: {e}")

            out_path = filled_path
            if stamps:
                stamped_path = _new_temp_pdf()
                try:
                    stamp_signatures(filled_path, stamped_path, stamps)
                    out_path = stamped_path
                except Exception as e:
                    # Best effort: keep serving the unstamped PDF.
                    logger.error(f"stamp_signatures failed for doc {doc_id}: {e}")

            # Burn freeform annotations (Text/Check/Sign drops) on top.
            annotations = parse_markdown_annotations(doc.current_content or "")
            if annotations:
                # Resolve any signature annotations to their PNG bytes.
                ann_sig_ids = [
                    a["value"][len("signature:"):].strip()
                    for a in annotations
                    if a.get("kind") == "signature"
                    and isinstance(a.get("value"), str)
                    and a["value"].startswith("signature:")
                ]
                ann_signature_pngs: dict[str, bytes] = {}
                if ann_sig_ids:
                    # SECURITY: filter by owner so a caller can't reference
                    # someone else's signature ID from doc markdown and have
                    # it stamped/exported.
                    _sig_q = db.query(Signature).filter(Signature.id.in_(ann_sig_ids))
                    if user:
                        _sig_q = _sig_q.filter(Signature.owner == user)
                    sig_rows = _sig_q.all()
                    for s in sig_rows:
                        try:
                            ann_signature_pngs[s.id] = base64.b64decode(s.data_png)
                        except Exception as e:
                            logger.warning(f"Bad annotation signature data for {s.id}: {e}")
                annotated_path = _new_temp_pdf()
                try:
                    stamp_annotations(out_path, annotated_path, annotations, ann_signature_pngs)
                    out_path = annotated_path
                except Exception as e:
                    # Best effort: keep serving the PDF without annotations.
                    logger.error(f"stamp_annotations failed for doc {doc_id}: {e}")

            download_name = _slug(doc.title or "form") + "_annotated.pdf"
            return FileResponse(
                out_path,
                media_type="application/pdf",
                filename=download_name,
                background=BackgroundTask(_cleanup_temps),
            )
        except Exception:
            # No response was built, so the background task will never run;
            # remove whatever temp files exist before propagating the error.
            _cleanup_temps()
            raise
        finally:
            db.close()

    # ---- POST /api/document/{doc_id}/prepare-signed-reply ----
    @router.post("/api/document/{doc_id}/prepare-signed-reply")
    async def prepare_signed_reply(doc_id: str, request: Request):
        """Bake the current PDF state (form fields + signature stamps +
        annotations) into a flattened PDF, drop it in COMPOSE_UPLOADS_DIR
        and return the reply context (To/Subject/threading headers) so the
        frontend can open a reply draft with this attachment pre-loaded.

        Requires the document to have source_email_* metadata (set when the
        doc was created via /api/email/attachment-as-doc). Otherwise 400.

        Returns a dict with ``ok``, ``attachment`` (token / filename / size
        of the file written to the compose directory) and ``reply``
        (recipient, subject, threading headers and the source email
        coordinates). When the source headers cannot be fetched the reply
        fields fall back to empty strings / the stored message id.

        Raises HTTPException 404 when the document or its source PDF is
        missing, and 400 when the document has no source email or is not
        linked to a source PDF. Ownership is enforced via _verify_doc_owner.
        """
        import base64
        import tempfile
        import shutil
        import uuid as _uuid
        import email as _email_mod
        # parseaddr lives in the email.utils submodule; import it explicitly
        # instead of relying on it being loaded as a side effect elsewhere.
        import email.utils as _email_utils
        from src.pdf_form_doc import (
            find_source_upload_id, parse_markdown_to_values,
            load_field_sidecar, parse_markdown_annotations,
        )
        from src.pdf_forms import fill_fields, stamp_signatures, stamp_annotations
        from src.constants import UPLOAD_DIR
        from core.database import Signature
        # COMPOSE_UPLOADS_DIR lives in email_routes — re-derive here so we
        # don't import from a routes file (cycle-prone). Same env override
        # as email_routes (ODYSSEUS_MAIL_ATTACHMENTS_DIR).
        from pathlib import Path as _Path
        import os as _os
        _DATA_DIR = _Path(__file__).resolve().parent.parent / "data"
        _BASE = _os.environ.get("ODYSSEUS_MAIL_ATTACHMENTS_DIR", str(_DATA_DIR / "mail-attachments"))
        _COMPOSE_DIR = _Path(_BASE) / "_compose"
        _COMPOSE_DIR.mkdir(parents=True, exist_ok=True)

        # Every intermediate PDF is registered here and removed in a
        # ``finally`` below, so a failing pipeline step cannot leak files.
        _to_unlink: list[str] = []

        def _new_temp_pdf() -> str:
            """Create an empty temp .pdf, register it for cleanup, return its path."""
            fd, path = tempfile.mkstemp(suffix=".pdf")
            # Only the path is needed; close the descriptor right away so
            # no handle stays open while other code writes to the file.
            _os.close(fd)
            _to_unlink.append(path)
            return path

        user = get_current_user(request)
        db = SessionLocal()
        try:
            doc = db.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                raise HTTPException(404, "Document not found")
            _verify_doc_owner(db, doc, user)

            if not (doc.source_email_uid and doc.source_email_folder):
                raise HTTPException(400, "Document has no source email — cannot reply")

            # 1) Build the flattened PDF (same pipeline as export_pdf)
            upload_id = find_source_upload_id(doc.current_content or "")
            if not upload_id:
                raise HTTPException(400, "Document is not linked to a source PDF")
            pdf_path = _locate_upload(UPLOAD_DIR, upload_id)
            if not pdf_path:
                raise HTTPException(404, f"Source PDF {upload_id} not found")

            # Split the markdown values into plain field values and
            # references to stored signatures ("signature:<id>").
            schema = load_field_sidecar(pdf_path) or []
            sig_field_names = {f["name"] for f in schema if f.get("type") == "signature"}
            all_values = parse_markdown_to_values(doc.current_content or "")
            text_values: dict = {}
            sig_ids: dict[str, str] = {}
            for name, raw in all_values.items():
                if name in sig_field_names and isinstance(raw, str) and raw.startswith("signature:"):
                    sig_ids[name] = raw[len("signature:"):].strip()
                elif name not in sig_field_names:
                    text_values[name] = raw

            stamps: dict = {}
            if sig_ids:
                # SECURITY: filter by owner — same reason as render_pdf.
                _sig_q2 = db.query(Signature).filter(Signature.id.in_(list(sig_ids.values())))
                if user:
                    _sig_q2 = _sig_q2.filter(Signature.owner == user)
                by_id = {s.id: s for s in _sig_q2.all()}
                for fname, sid in sig_ids.items():
                    s = by_id.get(sid)
                    if not s:
                        # Unknown id, or filtered out by the owner check.
                        continue
                    try:
                        stamps[fname] = base64.b64decode(s.data_png)
                    except Exception as e:
                        # Best-effort: skip the stamp but leave a trace.
                        logger.warning(f"Bad signature data for {s.id}: {e}")

            try:
                filled_path = _new_temp_pdf()
                fill_fields(pdf_path, filled_path, text_values)
                out_path = filled_path
                if stamps:
                    stamped_path = _new_temp_pdf()
                    try:
                        stamp_signatures(filled_path, stamped_path, stamps)
                        out_path = stamped_path
                    except Exception as e:
                        # Fall back to the un-stamped PDF rather than failing.
                        logger.warning(f"stamp_signatures failed for {doc_id}: {e}")

                annotations = parse_markdown_annotations(doc.current_content or "")
                if annotations:
                    ann_sig_ids = [
                        a["value"][len("signature:"):].strip()
                        for a in annotations
                        if a.get("kind") == "signature"
                        and isinstance(a.get("value"), str)
                        and a["value"].startswith("signature:")
                    ]
                    ann_signature_pngs: dict[str, bytes] = {}
                    if ann_sig_ids:
                        # SECURITY: filter by owner so a caller can't reference
                        # someone else's signature ID from doc markdown and have
                        # it stamped/exported.
                        _sig_q = db.query(Signature).filter(Signature.id.in_(ann_sig_ids))
                        if user:
                            _sig_q = _sig_q.filter(Signature.owner == user)
                        for s in _sig_q.all():
                            try:
                                ann_signature_pngs[s.id] = base64.b64decode(s.data_png)
                            except Exception as e:
                                logger.warning(f"Bad annotation signature data for {s.id}: {e}")
                    annotated_path = _new_temp_pdf()
                    try:
                        stamp_annotations(out_path, annotated_path, annotations, ann_signature_pngs)
                        out_path = annotated_path
                    except Exception as e:
                        # Fall back to the PDF without annotations.
                        logger.warning(f"stamp_annotations failed for {doc_id}: {e}")

                # 2) Move/copy into COMPOSE_UPLOADS_DIR with the token format
                #    `<uuid>_<original_name>` that /api/email/send expects.
                filename = _slug(doc.title or "signed") + "_signed.pdf"
                token = f"{_uuid.uuid4().hex}_{filename}"
                dest = _COMPOSE_DIR / token
                shutil.copyfile(out_path, str(dest))
            finally:
                # Unlink the intermediate temp PDFs whether or not the
                # pipeline succeeded; the result (if any) has already been
                # copied into COMPOSE_UPLOADS_DIR.
                for _p in _to_unlink:
                    try:
                        _os.unlink(_p)
                    except FileNotFoundError:
                        pass
                    except Exception as _e:
                        logger.warning(f"Could not unlink temp PDF {_p}: {_e}")

            # 3) Fetch the source email's headers so we can build a clean reply
            #    context (To/Subject/In-Reply-To/References).
            try:
                from routes.email_routes import _imap, _decode_header
            except Exception as e:
                # Without the email helpers we still return the attachment;
                # the reply fields just stay at their fallbacks.
                logger.warning(f"prepare-signed-reply could not import email helpers: {e}")
                _imap = None

                def _decode_header(x):
                    return x or ""

            to_addr = ""
            from_name = ""
            subject = ""
            in_reply_to = doc.source_email_message_id or ""
            references = in_reply_to
            if _imap:
                try:
                    # NOTE(review): if conn is an imaplib connection, fetch()
                    # addresses messages by sequence number; if
                    # source_email_uid is a true IMAP UID this may need
                    # conn.uid("FETCH", ...) — confirm against email_routes.
                    with _imap(doc.source_email_account_id or None) as conn:
                        conn.select(doc.source_email_folder, readonly=True)
                        status, data = conn.fetch(doc.source_email_uid.encode(), "(RFC822.HEADER)")
                    if status == "OK" and data and data[0]:
                        raw_hdr = data[0][1]
                        m = _email_mod.message_from_bytes(raw_hdr)
                        sender = _decode_header(m.get("From", ""))
                        from_name, to_addr = _email_utils.parseaddr(sender)
                        if not to_addr:
                            # Unparseable address: hand the raw header to the
                            # frontend instead of an empty recipient.
                            to_addr = sender
                        subject = _decode_header(m.get("Subject", "") or "")
                        if subject and not subject.lower().startswith("re:"):
                            subject = "Re: " + subject
                        # Threading: reply to the source's Message-ID and
                        # append it to the source's existing References.
                        msg_refs = (m.get("References") or "").strip()
                        msg_in_reply = (m.get("Message-ID") or "").strip() or in_reply_to
                        in_reply_to = msg_in_reply
                        references = (msg_refs + " " + msg_in_reply).strip() if msg_refs else msg_in_reply
                except Exception as e:
                    logger.warning(f"prepare-signed-reply header fetch failed: {e}")

            return {
                "ok": True,
                "attachment": {
                    "token": token,
                    "filename": filename,
                    "size": dest.stat().st_size,
                },
                "reply": {
                    "to": to_addr,
                    "to_name": from_name,
                    "subject": subject,
                    "in_reply_to": in_reply_to,
                    "references": references,
                    "account_id": doc.source_email_account_id or None,
                    "source_uid": doc.source_email_uid,
                    "source_folder": doc.source_email_folder,
                    "source_message_id": doc.source_email_message_id,
                },
            }
        finally:
            db.close()

    return router