Odysseus v1.0
This commit is contained in:
198
routes/document_helpers.py
Normal file
198
routes/document_helpers.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""document_helpers.py — Pydantic models, doc serializers, owner gating, file-locator helpers shared with document_routes.py."""
|
||||
|
||||
"""Document routes — CRUD for living documents with version history."""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from fastapi import HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.database import Document, DocumentVersion
|
||||
from core.database import Session as DbSession
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---- Request schemas ----
|
||||
|
||||
class DocumentCreate(BaseModel):
|
||||
session_id: Optional[str] = None
|
||||
title: str = "Untitled"
|
||||
language: Optional[str] = None
|
||||
content: str = ""
|
||||
|
||||
class DocumentUpdate(BaseModel):
|
||||
content: str
|
||||
summary: Optional[str] = None
|
||||
|
||||
class DocumentPatch(BaseModel):
|
||||
title: Optional[str] = None
|
||||
language: Optional[str] = None
|
||||
session_id: Optional[str] = None # link/unlink document to a session
|
||||
|
||||
|
||||
# ---- Helpers ----
|
||||
|
||||
def _doc_to_dict(doc: Document) -> Dict[str, Any]:
|
||||
return {
|
||||
"id": doc.id,
|
||||
"session_id": doc.session_id,
|
||||
"title": doc.title,
|
||||
"language": doc.language,
|
||||
"current_content": doc.current_content,
|
||||
"version_count": doc.version_count,
|
||||
"is_active": doc.is_active,
|
||||
"archived": bool(getattr(doc, "archived", False)),
|
||||
"created_at": (doc.created_at.isoformat() + "Z") if doc.created_at else None,
|
||||
"updated_at": (doc.updated_at.isoformat() + "Z") if doc.updated_at else None,
|
||||
# Source-email provenance (set when doc was created from an email
|
||||
# attachment) — drives the "Send signed reply" menu item.
|
||||
"source_email_uid": getattr(doc, "source_email_uid", None),
|
||||
"source_email_folder": getattr(doc, "source_email_folder", None),
|
||||
"source_email_account_id": getattr(doc, "source_email_account_id", None),
|
||||
"source_email_message_id": getattr(doc, "source_email_message_id", None),
|
||||
}
|
||||
|
||||
def _version_to_dict(v: DocumentVersion) -> Dict[str, Any]:
|
||||
return {
|
||||
"id": v.id,
|
||||
"document_id": v.document_id,
|
||||
"version_number": v.version_number,
|
||||
"content": v.content,
|
||||
"summary": v.summary,
|
||||
"source": v.source,
|
||||
"created_at": v.created_at.isoformat() if v.created_at else None,
|
||||
}
|
||||
|
||||
|
||||
def _verify_doc_owner(db, doc: Document, user: str):
|
||||
"""Verify `user` owns this document. Raise 404 if not.
|
||||
|
||||
Documents now carry their own `owner` column, so a doc whose session
|
||||
was deleted (session_id → NULL) can still prove ownership and stay
|
||||
openable / cloneable. We trust that column first and only fall back to
|
||||
the session join for any not-yet-backfilled legacy row.
|
||||
"""
|
||||
if user is None:
|
||||
raise HTTPException(403, "Authentication required")
|
||||
if doc.owner is not None:
|
||||
if doc.owner != user:
|
||||
raise HTTPException(404, "Document not found")
|
||||
return
|
||||
# Legacy fallback: derive ownership from the linked session.
|
||||
if not doc.session_id:
|
||||
raise HTTPException(404, "Document not found")
|
||||
session = db.query(DbSession).filter(DbSession.id == doc.session_id).first()
|
||||
if not session or session.owner != user:
|
||||
raise HTTPException(404, "Document not found")
|
||||
|
||||
|
||||
def _owner_session_filter(q, user):
|
||||
"""Restrict a documents query to those owned by `user`.
|
||||
|
||||
Documents now carry their own `owner` column (backfilled at boot from
|
||||
the linked session, or assigned to the admin user for legacy/orphaned
|
||||
docs). We filter on that directly rather than on a session join, so a
|
||||
document whose session was deleted (session_id → NULL) still shows up
|
||||
for its owner instead of silently vanishing from the Library + search.
|
||||
|
||||
The owner backfill runs in init_db before the app serves requests, so
|
||||
by the time this filter is live there are no NULL-owner rows to leak;
|
||||
we therefore match the owner strictly."""
|
||||
if user is None:
|
||||
return q.filter(False)
|
||||
return q.filter(Document.owner == user)
|
||||
|
||||
|
||||
|
||||
def _slug(name: str) -> str:
|
||||
"""Filesystem-friendly version of a document title.
|
||||
|
||||
Whitespace becomes underscores; other unsafe punctuation is dropped.
|
||||
Preserves letters, digits, dot, hyphen, underscore. Idempotent.
|
||||
"""
|
||||
import re as _re
|
||||
s = (name or "").strip()
|
||||
# Drop the trailing extension if the title happens to include one
|
||||
s = _re.sub(r'\.pdf$', '', s, flags=_re.IGNORECASE)
|
||||
s = _re.sub(r'\s+', '_', s)
|
||||
s = _re.sub(r'[^A-Za-z0-9._-]', '', s)
|
||||
s = _re.sub(r'_+', '_', s).strip('_')
|
||||
return s or "form"
|
||||
|
||||
|
||||
# DPI scale for the interactive PDF view. ~150 DPI (2x of 72 PDF user-units).
|
||||
_PDF_RENDER_SCALE = 2.0
|
||||
|
||||
|
||||
def _locate_upload(upload_dir: str, file_id: str):
|
||||
"""Find an upload by its filename ID.
|
||||
|
||||
Lookup order:
|
||||
1. Direct hit at `upload_dir/file_id` (very small deployments).
|
||||
2. The `uploads.json` index that `UploadHandler.save_upload` maintains —
|
||||
maps file_hash → metadata containing the full path. O(1) once loaded.
|
||||
3. Fallback: `os.walk` the date-bucketed tree. Slow on large stores;
|
||||
only triggers for legacy uploads recorded before the index existed.
|
||||
|
||||
`followlinks=False` keeps a stray symlink loop in `data/uploads/` from
|
||||
spinning the walker into infinite recursion.
|
||||
"""
|
||||
import os
|
||||
import json as _json
|
||||
direct = os.path.join(upload_dir, file_id)
|
||||
if os.path.exists(direct):
|
||||
return direct
|
||||
# O(1) via uploads.json
|
||||
try:
|
||||
idx_path = os.path.join(upload_dir, "uploads.json")
|
||||
if os.path.exists(idx_path):
|
||||
with open(idx_path, "r") as f:
|
||||
idx = _json.load(f)
|
||||
for meta in (idx.values() if isinstance(idx, dict) else []):
|
||||
if meta.get("id") == file_id:
|
||||
p = meta.get("path")
|
||||
if p and os.path.exists(p):
|
||||
return p
|
||||
except Exception:
|
||||
pass
|
||||
for root, _dirs, files in os.walk(upload_dir, followlinks=False):
|
||||
if file_id in files:
|
||||
return os.path.join(root, file_id)
|
||||
return None
|
||||
|
||||
|
||||
def _derive_title(content: str) -> str:
|
||||
"""Derive a title from document content."""
|
||||
import re
|
||||
text = content.strip()
|
||||
if not text:
|
||||
return "Untitled"
|
||||
|
||||
# Markdown header
|
||||
md = re.match(r'^#{1,3}\s+(.+)', text, re.MULTILINE)
|
||||
if md:
|
||||
title = md.group(1).strip()
|
||||
if len(title) > 50:
|
||||
title = title[:48] + "…"
|
||||
return title
|
||||
|
||||
# HTML heading
|
||||
html = re.search(r'<h[1-3][^>]*>([^<]+)</h[1-3]>', text, re.IGNORECASE)
|
||||
if html:
|
||||
title = html.group(1).strip()
|
||||
if len(title) > 50:
|
||||
title = title[:48] + "…"
|
||||
return title
|
||||
|
||||
# First non-empty line (if short enough)
|
||||
for line in text.split('\n'):
|
||||
line = line.strip()
|
||||
if line and 2 <= len(line) <= 60:
|
||||
title = re.sub(r'[:#*`]+$', '', line).strip()
|
||||
if title and len(title) > 50:
|
||||
title = title[:48] + "…"
|
||||
return title or "Untitled"
|
||||
|
||||
return "Untitled"
|
||||
Reference in New Issue
Block a user