odysseus/src/document_actions.py

"""
document_actions.py

Reusable document actions callable from both REST routes and the task scheduler.
"""

import logging
import re

logger = logging.getLogger(__name__)


_JUNK_TITLES = {
    "untitled", "untitled document", "new document", "document",
    "new email", "new mail", "new message", "reply", "fwd", "re:",
    "test", "testing", "asdf", "asd", "foo", "bar", "baz",
    "tmp", "temp", "scratch", "scratchpad", "draft", "delete",
    "remove", "junk", "trash", "xxx", "abc", "qwerty",
}


def _norm_title(t: str) -> str:
    """Normalize a title for grouping: trim, collapse whitespace, lowercase."""
    return re.sub(r"\s+", " ", (t or "").strip()).lower()


def _content_fingerprint(content: str) -> str:
    """A stable fingerprint of document content for duplicate detection.

    Strips bits that differ between otherwise-identical copies — chiefly the
    `upload_id` of a re-imported PDF and the random `id=` of annotations — so
    that N imports of the same file collapse to one fingerprint. Whitespace is
    collapsed and the result lowercased.
    """
    c = content or ""
    c = re.sub(r'upload_id="[^"]*"', "upload_id", c)          # pdf_source re-imports
    c = re.sub(r"\bid=ann-[A-Za-z0-9_-]+", "id=ann", c)        # annotation ids
    c = re.sub(r"\s+", " ", c).strip().lower()
    return c


def _real_len(content: str) -> int:
    """Length of content with markdown noise stripped — a 'completeness' proxy."""
    stripped = re.sub(r"^#{1,6}\s+", "", content or "", flags=re.MULTILINE)
    stripped = re.sub(r"[*_`>\-=]+", "", stripped)
    stripped = re.sub(r"\s+", " ", stripped).strip()
    return len(stripped)


# Attribution line that mail clients put above a quoted reply
# ("On <date>, <someone> wrote:"). Matched against a whitespace-stripped line.
_REPLY_HEADER_RE = re.compile(r"^On .+ wrote:?\s*$")

# How many deletions are spelled out in the summary string.
_MAX_TIDY_EXAMPLES = 5


def _junk_reason(title: str, content: str) -> str:
    """Classify one document as junk; return the reason, or "" to keep it."""
    content = (content or "").strip()
    title = (title or "").strip().lower()

    # Nothing there, or only the editor's placeholder heading.
    if content in ("", "# Untitled"):
        return "empty"

    # If you named it "test" or "asdf" etc, you don't care about it.
    if title in _JUNK_TITLES:
        return f"junk title '{title}'"

    # Body that, once markdown decoration is stripped, is itself a throwaway
    # word (e.g. "# test").
    stripped = re.sub(r"^#{1,6}\s+", "", content, flags=re.MULTILINE)  # headers
    stripped = re.sub(r"[*_`>\-=]+", "", stripped)  # markdown chars
    stripped = re.sub(r"\s+", " ", stripped).strip()
    if stripped.lower() in _JUNK_TITLES:
        return "throwaway content"

    # No length-based deletion: short notes are legitimate content.

    # Emails saved as documents: mostly ">"-quoted lines with (almost) nothing
    # written by the user. Attribution headers count as neither quoted nor
    # original text.
    lines = [ln for ln in content.split("\n") if ln.strip()]
    quoted = 0
    attributions = 0
    original_lines = []
    for ln in lines:
        if ln.lstrip().startswith(">"):
            quoted += 1
        elif _REPLY_HEADER_RE.match(ln.strip()):
            attributions += 1
        else:
            original_lines.append(ln)
    original_text = "\n".join(original_lines).strip()
    quote_ratio = quoted / max(len(lines), 1)
    if (quoted or attributions) and len(original_text) < 50 and quote_ratio > 0.4:
        return "email quote-chain only"

    return ""


def _keeper_rank(doc) -> tuple:
    """Sort key for picking which duplicate to keep (higher is better).

    Ranks by real content length, then by recency. The middle element puts
    documents without any timestamp last and guarantees the timestamp slot is
    only ever compared between two real timestamps, so a missing
    ``updated_at``/``created_at`` cannot raise ``TypeError`` during the sort.
    """
    stamp = doc.updated_at or doc.created_at
    has_stamp = stamp is not None
    return (_real_len(doc.current_content), has_stamp, stamp if has_stamp else 0)


async def run_document_tidy(owner: str) -> str:
    """Remove clearly-junk documents and redundant duplicates for an owner.

    Conservative rules (no length-based deletion — short notes are valid):
    - Empty / whitespace-only / placeholder ("# Untitled")
    - Title is a throwaway name (test, asdf, …) or the content itself is one
    - Email reply-chain with no original content
    - Duplicates: docs sharing the same normalized title AND the same content
      fingerprint (ignoring volatile upload/annotation ids). The most complete
      copy (longest real content, then most recent) is kept; the rest deleted.

    Args:
        owner: Value matched against ``Document.owner``. A falsy owner
            (empty string / ``None``) tidies every document.

    Returns:
        A one-line summary: how many documents were removed out of how many
        scanned, up to five examples with their reasons, a "(+N more)" suffix
        for deletions not covered by the examples, and the number kept.

    Raises:
        TaskNoop: When nothing was deleted, so the scheduler can drop the run
            row entirely. Nothing is committed in that case.
    """
    from core.database import SessionLocal, Document

    db = SessionLocal()
    try:
        query = db.query(Document)
        if owner:
            # Documents now carry their own owner column (robust to a deleted
            # session). Match on it directly; orphaned legacy rows are swept
            # to the admin at boot so they're attributed too.
            query = query.filter(Document.owner == owner)
        docs = query.all()

        deleted_examples = []
        described = 0  # deletions already accounted for by deleted_examples
        deleted = 0
        kept = 0
        survivors = []  # docs that pass the junk rules, considered for dedup

        # --- Junk pass ---
        for doc in docs:
            reason = _junk_reason(doc.title, doc.current_content)
            if not reason:
                survivors.append(doc)
                continue
            if len(deleted_examples) < _MAX_TIDY_EXAMPLES:
                label = (doc.title or "(no title)")[:40]
                deleted_examples.append(f"{label} ({reason})")
                described += 1
            db.delete(doc)
            deleted += 1

        # --- Duplicate pass: group survivors by (normalized title, content
        # fingerprint) and keep only the most complete copy of each group. ---
        groups: dict = {}
        for doc in survivors:
            key = (_norm_title(doc.title), _content_fingerprint(doc.current_content))
            groups.setdefault(key, []).append(doc)

        for members in groups.values():
            kept += 1  # exactly one document survives per group
            if len(members) < 2:
                continue
            # Keep the most complete (longest real content), then most recent.
            members.sort(key=_keeper_rank, reverse=True)
            keeper, *dupes = members
            if len(deleted_examples) < _MAX_TIDY_EXAMPLES:
                label = (keeper.title or "(no title)")[:40]
                deleted_examples.append(f"{label} (+{len(dupes)} duplicate copies)")
                # This single example already stands for every dupe in the
                # group, so none of them belong in the "(+N more)" tail.
                described += len(dupes)
            for dupe in dupes:
                db.delete(dupe)
                deleted += 1

        if deleted == 0:
            # Use sentinel so the scheduler can drop the run row entirely.
            from src.builtin_actions import TaskNoop
            raise TaskNoop(f"scanned {len(docs)} document(s), no junk")

        db.commit()

        preview = "; ".join(deleted_examples)
        undescribed = deleted - described
        extra = f" (+{undescribed} more)" if undescribed > 0 else ""
        return f"Removed {deleted} of {len(docs)}: {preview}{extra} · {kept} kept"
    finally:
        db.close()