""" email_helpers.py Lower-level helpers used by both `email_routes.py` (the FastAPI route file) and `email_pollers.py` (the background loops): - auth dependencies (require_owner / require_user / _assert_owns_account) - account config + settings persistence (`_get_email_config`, `_list_email_accounts`) - IMAP connection helpers (`_imap_connect`, `_imap`, folder detection) - message parsing (`_decode_header`, `_extract_html/text`, attachment helpers) - sender context retrieval for the AI-summary / AI-reply pipelines - Pydantic models, shared constants, scheduled-DB bootstrap """ import os import imaplib import smtplib import email as email_mod import email.header import email.utils import json import re import html import logging from email.mime.multipart import MIMEMultipart from email.mime.base import MIMEBase from email import encoders import mimetypes from pathlib import Path from fastapi import Query, HTTPException, Request from pydantic import BaseModel from typing import Optional, List from src.auth_helpers import get_current_user from src.secret_storage import decrypt as _decrypt logger = logging.getLogger(__name__) def _send_smtp_message(cfg: dict, from_addr: str, recipients: list[str], message: str | bytes, timeout: int = 30) -> None: """Send through SMTP using the conventional TLS mode for the configured port. Account settings only store host/port today. Port 465 is implicit TLS (SMTP_SSL); port 587 is plain SMTP upgraded with STARTTLS. Using SSL directly against 587 raises the classic "[SSL: WRONG_VERSION_NUMBER]" error even when credentials are correct. """ host = cfg["smtp_host"] port = int(cfg.get("smtp_port") or 465) user = cfg.get("smtp_user") or "" password = cfg.get("smtp_password") or "" if port == 587: with smtplib.SMTP(host, port, timeout=timeout) as smtp: smtp.starttls() if user and password: smtp.login(user, password) smtp.sendmail(from_addr, recipients, message) return with smtplib.SMTP_SSL(host, port, timeout=timeout) as smtp: if user and password: smtp.login(user, password) smtp.sendmail(from_addr, recipients, message) def _strip_think(text: str) -> str: """Email-flavored think strip — thin wrapper over the central helper. Email AI features get the prose-strip extension because their outputs are short LLM-only generations (replies, summaries, calendar extraction, urgency, classification, writing-style) where untagged reasoning leaks are common. The central helper only runs the prose-strip when an actual `` tag was present in the input, so legit user content is safe. """ if not text: return "" from src.text_helpers import strip_think as _central, _THINK_CLOSED_RE, _THINK_OPEN_RE, _THINK_TAG_RE had_think = bool(_THINK_CLOSED_RE.search(text) or _THINK_OPEN_RE.search(text) or _THINK_TAG_RE.search(text)) return _central(text, prose=had_think, prompt_echo=True) import re as _re_reply # Accept REPLY / SUMMARY / OUTPUT as the opening fence so the same extractor # serves replies and summaries (any fenced final-output block). _REPLY_OPEN_RE = _re_reply.compile(r"<<<\s*(?:REPLY|SUMMARY|OUTPUT)\s*>>>", _re_reply.I) _REPLY_CLOSE_RE = _re_reply.compile(r"<<<\s*END\s*>>>", _re_reply.I) def _extract_reply(text: str) -> str: """Pull the final email reply out of a model response. Positive extraction beats blocklist stripping: the model is asked to fence its reply in <<>> ... <<>> markers, so we keep ONLY that region and ignore whatever reasoning came before/after it. Deterministic, and it can never clip a legit reply that merely opens reflectively. Fallbacks when the markers are absent (older/weaker models): we just run the usual think-strip on the whole text — strictly no worse than before. A second think-strip pass always runs on the extracted body too, in case the model also reasoned *inside* the markers. """ if not text: return "" t = text m = _REPLY_OPEN_RE.search(t) if m: rest = t[m.end():] c = _REPLY_CLOSE_RE.search(rest) t = rest[:c.start()] if c else rest # Drop any stray/duplicate marker tokens, then strip think markup. t = _REPLY_OPEN_RE.sub("", t) t = _REPLY_CLOSE_RE.sub("", t) return _strip_think(t).strip() def _apply_email_style_mechanics(text: str) -> str: """Enforce deterministic writing-style mechanics that models often miss.""" if not text: return "" return ( text.replace("—", "--") .replace("–", "--") .replace("’", "'") .replace("‘", "'") ) def _require_auth(request: Request) -> str: """Defense-in-depth: reject unauthenticated callers even if upstream middleware was bypassed (e.g. localhost-bypass, SSRF from a sibling service). Mirrors core.middleware.require_admin's resolution path. v2 review HIGH-13: previously fell open whenever auth_manager wasn't `is_configured`, exposing IMAP creds and SMTP send to any network caller on a half-configured deploy. Now: anonymous callers in unconfigured mode are only honoured if they're coming from localhost; everyone else gets 401. """ u = get_current_user(request) if u: return u auth_mgr = getattr(request.app.state, "auth_manager", None) if auth_mgr is not None and getattr(auth_mgr, "is_configured", False): raise HTTPException(401, "Not authenticated") # Unconfigured / first-run mode: only allow loopback callers. Public # network traffic must authenticate even before auth is set up. client = getattr(request, "client", None) host = (client.host if client else "") or "" if host in ("127.0.0.1", "::1", "localhost"): return "" raise HTTPException(401, "Not authenticated") def require_owner(request: Request, account_id: str | None = Query(None)) -> str: """FastAPI dependency: authenticate the caller and, if `account_id` is in the query string, assert ownership. Returns the resolved owner ("" in unconfigured single-user mode). Routes whose `account_id` lives in the request body or path must still call `_assert_owns_account(body_id, owner)` explicitly. Use `require_user` (no Query read) for path-param routes.""" owner = _require_auth(request) if account_id: _assert_owns_account(account_id, owner) return owner def require_user(request: Request) -> str: """Auth-only dependency for routes where `account_id` is a path param or absent. Avoids `require_owner`'s Query collision with path params.""" return _require_auth(request) def _assert_owns_account(account_id: str, owner: str) -> None: """Reject requests that name an `account_id` belonging to another user. Previously the account lookup in `_get_email_config` filtered only on `id == account_id`, letting a multi-user deploy enumerate / operate against any other user's IMAP/SMTP mailbox. Call this *before* opening the IMAP connection or reading creds. `owner == ""` is the unconfigured / single-user case — accept any account.""" if not account_id or not owner: return try: from core.database import SessionLocal as _SL, EmailAccount as _EA db = _SL() try: row = db.query(_EA).filter(_EA.id == account_id).first() if row is None: raise HTTPException(404, "Account not found") if row.owner and row.owner != owner: # Treat as 404 (not 403) so we don't leak existence. raise HTTPException(404, "Account not found") finally: db.close() except HTTPException: raise except Exception as e: # Fail closed — a DB hiccup must not let cross-tenant access slip # through. 503 tells the caller to retry; logs preserve detail. logger.error(f"Account-owner check failed: {e}") raise HTTPException(503, "Account check failed") def _q(name: str) -> str: """Quote an IMAP mailbox name. Defensive: escapes `\\` and `"` and wraps in double quotes so user-supplied folder names with spaces or quotes can't confuse `SELECT` / `COPY`. imaplib already rejects CRLF, but quoting also handles `[Gmail]/Sent Mail`-style names that need wrapping anyway.""" return '"' + (name or "").replace("\\", "\\\\").replace('"', '\\"') + '"' def _attach_compose_uploads(outer: MIMEMultipart, tokens) -> None: """Read each staged upload token, build a MIMEBase part, and attach to `outer`. Tokens are sanitized via Path(token).name to prevent traversal. Missing files are skipped silently. Used by /send, scheduled delivery, and the agent send pipeline.""" if not tokens: return for token in tokens: safe_token = Path(token).name path = COMPOSE_UPLOADS_DIR / safe_token if not path.exists(): logger.warning(f"Attachment token not found: {safe_token}") continue ctype, encoding = mimetypes.guess_type(str(path)) if ctype is None or encoding is not None: ctype = "application/octet-stream" maintype, subtype = ctype.split("/", 1) with open(path, "rb") as f: part = MIMEBase(maintype, subtype) part.set_payload(f.read()) encoders.encode_base64(part) # Token format: "_" original_name = safe_token.split("_", 1)[1] if "_" in safe_token else safe_token part.add_header("Content-Disposition", "attachment", filename=original_name) outer.attach(part) def _cleanup_compose_uploads(tokens) -> None: """Best-effort unlink of staged uploads after delivery (or failure).""" if not tokens: return for token in tokens: try: (COMPOSE_UPLOADS_DIR / Path(token).name).unlink(missing_ok=True) except Exception: pass DATA_DIR = Path(__file__).resolve().parent.parent / "data" SETTINGS_FILE = DATA_DIR / "settings.json" # Override at deploy time via ODYSSEUS_MAIL_ATTACHMENTS_DIR. Defaults to a # subdir of the install's data/ tree so the app works out-of-the-box without # a hardcoded /home// path. ATTACHMENTS_DIR = Path(os.environ.get("ODYSSEUS_MAIL_ATTACHMENTS_DIR", str(DATA_DIR / "mail-attachments"))) ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True) COMPOSE_UPLOADS_DIR = ATTACHMENTS_DIR / "_compose" COMPOSE_UPLOADS_DIR.mkdir(parents=True, exist_ok=True) SCHEDULED_DB = DATA_DIR / "scheduled_emails.db" def _init_scheduled_db(): import sqlite3 conn = sqlite3.connect(SCHEDULED_DB) conn.execute(""" CREATE TABLE IF NOT EXISTS scheduled_emails ( id TEXT PRIMARY KEY, to_addr TEXT NOT NULL, cc TEXT, bcc TEXT, subject TEXT, body TEXT NOT NULL, in_reply_to TEXT, references_hdr TEXT, attachments TEXT, send_at TEXT NOT NULL, created_at TEXT NOT NULL, status TEXT NOT NULL DEFAULT 'pending', error TEXT ) """) # Email summary cache (keyed by Message-ID) conn.execute(""" CREATE TABLE IF NOT EXISTS email_summaries ( message_id TEXT PRIMARY KEY, uid TEXT, folder TEXT, subject TEXT, sender TEXT, summary TEXT NOT NULL, model_used TEXT, created_at TEXT NOT NULL ) """) # Email AI reply cache (pre-generated draft replies) conn.execute(""" CREATE TABLE IF NOT EXISTS email_ai_replies ( message_id TEXT PRIMARY KEY, uid TEXT, folder TEXT, reply TEXT NOT NULL, model_used TEXT, created_at TEXT NOT NULL ) """) # Email tags / spam classification cache. SECURITY: keyed by # (message_id, owner) because Message-IDs are GLOBAL (a newsletter goes # to many users with the same Message-ID). Without owner-scoping, a # tag-write for user A's row clobbered user B's row and surfaced A's # UID in B's `tag:urgent` IMAP filter (review C2). conn.execute(""" CREATE TABLE IF NOT EXISTS email_tags ( message_id TEXT, owner TEXT DEFAULT '', uid TEXT, folder TEXT, subject TEXT, sender TEXT, tags TEXT, spam_verdict INTEGER DEFAULT 0, spam_reason TEXT, moved_to TEXT, model_used TEXT, created_at TEXT NOT NULL, PRIMARY KEY (message_id, owner) ) """) # Backfill migration: older installs created the table with # message_id as a bare PK and no owner column. Add the column + # promote it into the PK by rebuild-copy-swap (SQLite can't ALTER PK). try: _cols = [r[1] for r in conn.execute("PRAGMA table_info(email_tags)")] if "owner" not in _cols: # Add the column first so reads/writes don't break mid-migration. conn.execute("ALTER TABLE email_tags ADD COLUMN owner TEXT DEFAULT ''") # Rebuild with composite PK. Existing rows get owner='' (legacy # single-user); the urgency scanner will overwrite as it # re-classifies. No data loss. conn.execute(""" CREATE TABLE IF NOT EXISTS email_tags__new ( message_id TEXT, owner TEXT DEFAULT '', uid TEXT, folder TEXT, subject TEXT, sender TEXT, tags TEXT, spam_verdict INTEGER DEFAULT 0, spam_reason TEXT, moved_to TEXT, model_used TEXT, created_at TEXT NOT NULL, PRIMARY KEY (message_id, owner) ) """) conn.execute(""" INSERT OR IGNORE INTO email_tags__new (message_id, owner, uid, folder, subject, sender, tags, spam_verdict, spam_reason, moved_to, model_used, created_at) SELECT message_id, COALESCE(owner, ''), uid, folder, subject, sender, tags, spam_verdict, spam_reason, moved_to, model_used, created_at FROM email_tags """) conn.execute("DROP TABLE email_tags") conn.execute("ALTER TABLE email_tags__new RENAME TO email_tags") except Exception as _mig_e: # Best-effort — log via the module logger if available import logging as _lg _lg.getLogger(__name__).warning(f"email_tags owner-migration skipped: {_mig_e}") conn.execute(""" CREATE TABLE IF NOT EXISTS email_calendar_extractions ( message_id TEXT PRIMARY KEY, uid TEXT, events_created INTEGER DEFAULT 0, created_at TEXT NOT NULL ) """) conn.execute(""" CREATE TABLE IF NOT EXISTS email_urgency_alerts ( message_id TEXT PRIMARY KEY, uid TEXT, folder TEXT, subject TEXT, sender TEXT, urgency TEXT, reason TEXT, alerted INTEGER DEFAULT 0, created_at TEXT NOT NULL ) """) conn.execute(""" CREATE TABLE IF NOT EXISTS email_event_seen ( owner TEXT NOT NULL, account_key TEXT NOT NULL, folder TEXT NOT NULL, message_key TEXT NOT NULL, first_seen_at TEXT NOT NULL, PRIMARY KEY (owner, account_key, folder, message_key) ) """) # Boundary cache — LLM-detected sig/quote start positions in the body. # Stored as char offsets (-1 = no boundary found). Once cached, the # client uses these to fold without ever re-calling the LLM. conn.execute(""" CREATE TABLE IF NOT EXISTS email_boundaries ( message_id TEXT PRIMARY KEY, uid TEXT, folder TEXT, sig_start INTEGER, quote_start INTEGER, model_used TEXT, created_at TEXT NOT NULL ) """) # Lazy migration: add account_id column to scheduled_emails if missing try: cols = [r[1] for r in conn.execute("PRAGMA table_info(scheduled_emails)").fetchall()] if "account_id" not in cols: conn.execute("ALTER TABLE scheduled_emails ADD COLUMN account_id TEXT") if "odysseus_kind" not in cols: conn.execute("ALTER TABLE scheduled_emails ADD COLUMN odysseus_kind TEXT") except Exception: pass # Lazy migration: add turns_json to email_boundaries for server-side # thread parsing cache (talon-style precomputed reply chain). try: cols = [r[1] for r in conn.execute("PRAGMA table_info(email_boundaries)").fetchall()] if "turns_json" not in cols: conn.execute("ALTER TABLE email_boundaries ADD COLUMN turns_json TEXT") except Exception: pass # Per-sender signature cache. Populated by `learn_sender_signatures` # action: the LLM extracts the common trailing block across N emails # from each sender; the renderer folds it consistently for every # future email from that address. conn.execute(""" CREATE TABLE IF NOT EXISTS sender_signatures ( from_address TEXT PRIMARY KEY, signature_text TEXT, sample_count INTEGER, last_built_at TEXT NOT NULL, model_used TEXT, source TEXT ) """) conn.commit() conn.close() _init_scheduled_db() def _load_settings(): if SETTINGS_FILE.exists(): return json.loads(SETTINGS_FILE.read_text()) return {} def _save_settings(settings): from core.atomic_io import atomic_write_json atomic_write_json(str(SETTINGS_FILE), settings, indent=2) def _get_email_config(account_id: str | None = None, owner: str = "") -> dict: """Return IMAP/SMTP config as a dict. Resolution order: 1. If account_id given → that specific EmailAccount row. 2. Else → the row with is_default=True (scoped to `owner` when given). 3. Else → the first enabled row (scoped to `owner` when given). 4. Else → legacy flat keys in data/settings.json (kept for envs where the migration hasn't run yet or accounts table is empty). 5. Else → env vars (SMTP_HOST / IMAP_HOST / ...). Returned dict always has the same shape as before; an `account_id` key is added so callers can stamp derivative records (email_ai_replies etc.). SECURITY: without `owner`, the fallback queries (is_default, first-enabled) don't filter by user — so on a multi-user deploy a brand-new account would inherit whoever else's IMAP/SMTP creds happened to be the default. Pass `owner` from the route's auth dependency to scope the lookup. """ import os from core.database import SessionLocal as _SL, EmailAccount as _EA def _owner_or_matching_legacy_account(query): if not owner: return query from sqlalchemy import and_, or_ unowned = or_(_EA.owner == None, _EA.owner == "") # noqa: E711 same_mailbox = or_(_EA.imap_user == owner, _EA.from_address == owner) return query.filter(or_(_EA.owner == owner, and_(unowned, same_mailbox))) resolved_id = None row = None try: db = _SL() try: if account_id: row = db.query(_EA).filter(_EA.id == account_id, _EA.enabled == True).first() # noqa: E712 # If the resolved row belongs to a different owner, treat as # not-found rather than silently serving it. This is a defense # in depth — `require_owner` already calls `_assert_owns_account` # for query-param account_ids, but other callers (cookbook # rules, scheduled poller) may not. if row is not None and owner and row.owner and row.owner != owner: row = None # Fallback path — restrict to this owner's accounts so we don't # leak another user's default mailbox to an unconfigured user. if row is None: q = db.query(_EA).filter(_EA.is_default == True, _EA.enabled == True) # noqa: E712 q = _owner_or_matching_legacy_account(q) row = q.first() if row is None: q = db.query(_EA).filter(_EA.enabled == True) # noqa: E712 q = _owner_or_matching_legacy_account(q) row = q.order_by(_EA.created_at.asc()).first() if row is not None: resolved_id = row.id cfg = { "account_id": row.id, "account_name": row.name, "smtp_host": row.smtp_host or "", "smtp_port": int(row.smtp_port or 465), "smtp_user": row.smtp_user or "", "smtp_password": _decrypt(row.smtp_password or ""), "imap_host": row.imap_host or "", "imap_port": int(row.imap_port or 993), "imap_user": row.imap_user or "", "imap_password": _decrypt(row.imap_password or ""), "imap_starttls": bool(row.imap_starttls), "from_address": row.from_address or row.imap_user or "", } if not (cfg["smtp_host"] and cfg["smtp_user"] and cfg["smtp_password"]): logger.warning(f"SMTP not configured for account {row.name!r}") if not (cfg["imap_host"] and cfg["imap_user"] and cfg["imap_password"]): logger.warning(f"IMAP not configured for account {row.name!r}") return cfg finally: db.close() except Exception as e: logger.debug(f"email_accounts lookup failed, falling back to settings.json: {e}") # Legacy fallback — flat keys in settings.json / env vars settings = _load_settings() cfg = { "account_id": resolved_id, "account_name": "legacy", "smtp_host": settings.get("smtp_host", os.environ.get("SMTP_HOST", "")), "smtp_port": int(settings.get("smtp_port", os.environ.get("SMTP_PORT", "465")) or 465), "smtp_user": settings.get("smtp_user", os.environ.get("SMTP_USER", "")), "smtp_password": settings.get("smtp_password", os.environ.get("SMTP_PASSWORD", "")), "imap_host": settings.get("imap_host", os.environ.get("IMAP_HOST", "")), "imap_port": int(settings.get("imap_port", os.environ.get("IMAP_PORT", "993")) or 993), "imap_user": settings.get("imap_user", os.environ.get("IMAP_USER", "")), "imap_password": settings.get("imap_password", os.environ.get("IMAP_PASSWORD", "")), "imap_starttls": settings.get("imap_starttls", True), "from_address": settings.get("email_from", os.environ.get("EMAIL_FROM", "")), } if not (cfg["smtp_host"] and cfg["smtp_user"] and cfg["smtp_password"]): logger.warning("SMTP not configured — add an Email Account in Settings or set env vars") if not (cfg["imap_host"] and cfg["imap_user"] and cfg["imap_password"]): logger.warning("IMAP not configured — add an Email Account in Settings or set env vars") return cfg def _list_email_accounts() -> list[dict]: """Return all enabled accounts in creation order. Used by background loops that iterate over every account (auto-summarize, urgency, etc.).""" from core.database import SessionLocal as _SL, EmailAccount as _EA try: db = _SL() try: rows = ( db.query(_EA) .filter(_EA.enabled == True) # noqa: E712 .order_by(_EA.is_default.desc(), _EA.created_at.asc()) .all() ) return [_get_email_config(r.id) for r in rows] finally: db.close() except Exception as e: logger.debug(f"_list_email_accounts failed, returning [default]: {e}") return [_get_email_config()] # ── IMAP helpers ── _IMAP_TIMEOUT_SECONDS = 15 def _imap_connect(account_id: str | None = None, owner: str = ""): # SECURITY: passing `owner` scopes the fallback config lookup so a brand # new user doesn't get connected against another user's default mailbox # when they have no account configured. cfg = _get_email_config(account_id, owner=owner) # Connection mode: # STARTTLS on → plain + upgrade # STARTTLS off + port 993 → implicit SSL (IMAPS) # STARTTLS off + any other port → plain (local Dovecot, custom ports) # The last branch is critical: previously this fell into IMAP4_SSL # for any non-STARTTLS port, which would fail the TLS handshake on # plain local servers (Dovecot on 31143, etc.). if cfg.get("imap_starttls"): conn = imaplib.IMAP4(cfg["imap_host"], cfg["imap_port"], timeout=_IMAP_TIMEOUT_SECONDS) conn.starttls() elif int(cfg.get("imap_port") or 993) == 993: conn = imaplib.IMAP4_SSL(cfg["imap_host"], cfg["imap_port"], timeout=_IMAP_TIMEOUT_SECONDS) else: conn = imaplib.IMAP4(cfg["imap_host"], cfg["imap_port"], timeout=_IMAP_TIMEOUT_SECONDS) try: conn.sock.settimeout(_IMAP_TIMEOUT_SECONDS) except Exception: pass conn.login(cfg["imap_user"], cfg["imap_password"]) return conn from contextlib import contextmanager # Filled in by setup_email_routes() once its closure-scoped pool helpers are # defined. Keyed so we can swap them out in tests. _POOL_HOOKS: dict = {"connect": None, "release": None} @contextmanager def _imap(account_id: str | None = None, owner: str = ""): """IMAP connection scoped to a `with` block. Uses the connection pool when available so we don't pay the TCP+TLS+LOGIN handshake (~30-100ms with Dovecot) on every request. Falls back to a fresh connect+logout pair before `setup_email_routes()` has run (e.g. background pollers spinning up early). SECURITY: `owner` flows through `_imap_connect` → `_get_email_config` so the fallback config lookup (when `account_id` is missing) is scoped to this user's accounts. """ pool_connect = _POOL_HOOKS.get("connect") pool_release = _POOL_HOOKS.get("release") if pool_connect and pool_release: # SECURITY: forward owner so the pool slot is per-user and the # fresh-connection fallback runs through a scoped config lookup. try: conn, _reused = pool_connect(account_id, owner=owner) except TypeError: # Older hook signature without owner — fall back transparently. conn, _reused = pool_connect(account_id) ok = True try: yield conn except Exception: ok = False raise finally: try: try: pool_release(account_id, conn, ok=ok, owner=owner) except TypeError: pool_release(account_id, conn, ok=ok) except Exception: pass return # Fallback: plain connect+logout. Used pre-setup or in tests. conn = _imap_connect(account_id, owner=owner) try: yield conn finally: try: conn.logout() except Exception: pass def _decode_header(raw): if not raw: return "" parts = email.header.decode_header(raw) decoded = [] for data, charset in parts: if isinstance(data, bytes): decoded.append(data.decode(charset or "utf-8", errors="replace")) else: decoded.append(data) return " ".join(decoded) def _detect_sent_folder(conn): """Find the server's Sent folder name. Returns 'Sent' if nothing matches. Different IMAP servers expose the sent folder under different names: Dovecot/typical: "Sent" Gmail: "[Gmail]/Sent Mail" Outlook/EWS: "Sent Items" Some hosts: "INBOX.Sent" """ candidates = ("Sent", "[Gmail]/Sent Mail", "Sent Mail", "Sent Items", "INBOX.Sent") try: status, folders = conn.list() if status != "OK" or not folders: return "Sent" names = [] for f in folders: decoded = f.decode() if isinstance(f, bytes) else str(f) m = re.search(r'"([^"]*)"\s*$|(\S+)\s*$', decoded) if m: names.append(m.group(1) or m.group(2)) # Prefer \Sent flag in LIST response if present. for f in folders: decoded = f.decode() if isinstance(f, bytes) else str(f) if r"\Sent" in decoded: m = re.search(r'"([^"]*)"\s*$|(\S+)\s*$', decoded) if m: return m.group(1) or m.group(2) for c in candidates: if c in names: return c except Exception: pass return "Sent" def _detect_drafts_folder(conn): """Find the server's Drafts folder name. Gmail usually exposes "[Gmail]/Drafts"; other servers often use "Drafts".""" candidates = ("Drafts", "[Gmail]/Drafts", "Draft", "INBOX.Drafts") try: status, folders = conn.list() if status != "OK" or not folders: return "Drafts" names = [] for f in folders: decoded = f.decode() if isinstance(f, bytes) else str(f) m = re.search(r'"([^"]*)"\s*$|(\S+)\s*$', decoded) if m: names.append(m.group(1) or m.group(2)) for f in folders: decoded = f.decode() if isinstance(f, bytes) else str(f) if r"\Drafts" in decoded or r"\Draft" in decoded: m = re.search(r'"([^"]*)"\s*$|(\S+)\s*$', decoded) if m: return m.group(1) or m.group(2) for c in candidates: if c in names: return c except Exception: pass return "Drafts" def _detect_spam_folder(conn): """Find the server's Junk/Spam folder name, if any.""" try: status, folders = conn.list() if status != "OK" or not folders: return None preferred = None fallback = None for f in folders: decoded = f.decode() if isinstance(f, bytes) else str(f) m = re.search(r'"([^"]*)"\s*$|(\S+)\s*$', decoded) if not m: continue name = m.group(1) or m.group(2) if r"\Junk" in decoded: preferred = name break low = name.lower() if low in ("junk", "spam", "junk mail", "junk e-mail") or low.endswith("/junk") or low.endswith("/spam"): fallback = fallback or name return preferred or fallback except Exception: return None def _imap_move(uid, dest, src="INBOX"): """Move a single IMAP UID from src folder to dest. Returns True on success.""" try: c = _imap_connect() c.select(_q(src)) status, _ = c.copy(uid, _q(dest)) if status != "OK": c.logout() return False c.store(uid, "+FLAGS", "\\Deleted") c.expunge() c.logout() return True except Exception as e: logger.warning(f"IMAP move {uid} → {dest} failed: {e}") return False def _extract_attachment_text(msg, max_chars: int = 6000) -> str: """Pull readable text out of an email's attachments — PDF (via PyMuPDF), plain text, markdown, csv, log. Caps total at `max_chars`. Returns a formatted string with `[Attachment: filename]\\n` blocks separated by `---`. Empty string if there's nothing useful. Used by the summarize/reply pipeline so an email like "see attached invoice" produces a summary that actually references the invoice. """ if not msg or not msg.is_multipart(): return "" out_parts: list[str] = [] total = 0 import os as _os import tempfile as _tempfile for part in msg.walk(): if part.is_multipart(): continue cd = str(part.get("Content-Disposition", "")) ct = (part.get_content_type() or "").lower() if ct in ("text/plain", "text/html") and "attachment" not in cd.lower(): continue filename = part.get_filename() or "" if filename: try: filename = _decode_header(filename) except Exception: pass fname_lower = (filename or "").lower() payload = part.get_payload(decode=True) if not payload: continue # Cap per-attachment size to avoid huge PDFs blowing the budget. if len(payload) > 2_000_000: continue text = "" try: if ct == "application/pdf" or fname_lower.endswith(".pdf"): tmp = _tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) try: tmp.write(payload) tmp.close() from src.personal_docs import extract_pdf_text text = extract_pdf_text(tmp.name) or "" finally: try: _os.unlink(tmp.name) except Exception: pass elif ct.startswith("text/") or fname_lower.endswith((".txt", ".md", ".csv", ".log", ".json")): text = payload.decode("utf-8", errors="replace") except Exception as e: logger.debug(f"attachment-text extract failed for {filename}: {e}") continue text = (text or "").strip() if not text: continue remaining = max_chars - total if remaining <= 0: break snippet = text[:remaining] out_parts.append(f"[Attachment: {filename or 'file'}]\n{snippet}") total += len(snippet) if total >= max_chars: break return "\n\n---\n\n".join(out_parts) def _list_attachments_from_msg(msg): """Return a list of attachment metadata from an email message.""" attachments = [] if not msg.is_multipart(): return attachments idx = 0 for part in msg.walk(): if part.is_multipart(): continue cd = str(part.get("Content-Disposition", "")) ct = part.get_content_type() # Skip text/html body parts (only consider real attachments) if ct in ("text/plain", "text/html") and "attachment" not in cd: continue filename = part.get_filename() if filename: filename = _decode_header(filename) else: # Inline images, etc. - generate a name ext = ct.split("/")[-1] if "/" in ct else "bin" filename = f"attachment_{idx}.{ext}" payload = part.get_payload(decode=True) size = len(payload) if payload else 0 attachments.append({ "index": idx, "filename": filename, "content_type": ct, "size": size, "is_inline": "inline" in cd.lower(), }) idx += 1 return attachments def _extract_attachment_to_disk(msg, index, target_dir): """Extract a specific attachment to disk and return the file path.""" if not msg.is_multipart(): return None idx = 0 for part in msg.walk(): if part.is_multipart(): continue cd = str(part.get("Content-Disposition", "")) ct = part.get_content_type() if ct in ("text/plain", "text/html") and "attachment" not in cd: continue if idx == index: filename = part.get_filename() if filename: filename = _decode_header(filename) else: ext = ct.split("/")[-1] if "/" in ct else "bin" filename = f"attachment_{idx}.{ext}" # Sanitize safe_name = re.sub(r"[^\w\s\-.]", "_", filename).strip() payload = part.get_payload(decode=True) if not payload: return None target_dir.mkdir(parents=True, exist_ok=True) filepath = target_dir / safe_name with open(filepath, "wb") as f: f.write(payload) return filepath idx += 1 return None def _extract_html(msg): """Extract raw HTML body from an email message, if present.""" if msg.is_multipart(): for part in msg.walk(): ct = part.get_content_type() cd = str(part.get("Content-Disposition", "")) if ct == "text/html" and "attachment" not in cd: payload = part.get_payload(decode=True) if payload: charset = part.get_content_charset() or "utf-8" return payload.decode(charset, errors="replace") elif msg.get_content_type() == "text/html": payload = msg.get_payload(decode=True) if payload: charset = msg.get_content_charset() or "utf-8" return payload.decode(charset, errors="replace") return None def _extract_text(msg): if msg.is_multipart(): text_parts = [] for part in msg.walk(): ct = part.get_content_type() cd = str(part.get("Content-Disposition", "")) if ct == "text/plain" and "attachment" not in cd: payload = part.get_payload(decode=True) if payload: charset = part.get_content_charset() or "utf-8" text_parts.append(payload.decode(charset, errors="replace")) elif ct == "text/html" and not text_parts and "attachment" not in cd: payload = part.get_payload(decode=True) if payload: charset = part.get_content_charset() or "utf-8" raw_html = payload.decode(charset, errors="replace") text = re.sub(r"", "\n", raw_html, flags=re.I) text = re.sub(r"<[^>]+>", "", text) text = html.unescape(text) text_parts.append(text.strip()) return "\n".join(text_parts) else: payload = msg.get_payload(decode=True) if payload: charset = msg.get_content_charset() or "utf-8" return payload.decode(charset, errors="replace") return "" def _fetch_sender_thread_context(sender_addr: str, exclude_uid: str = "", exclude_folder: str = "INBOX", limit: int = 3, max_chars_per_email: int = 1500, max_attachment_chars: int = 4000) -> str: """Pull the last N emails from `sender_addr` (across common folders), extract their body snippets + attachment text, and return one formatted block ready to be glued into an LLM system prompt as "REFERENCED MATERIAL". Returns empty string if nothing useful was found. Never raises. Used by the AI reply path so a follow-up like "regarding question 3 of the document you sent" can actually quote that document instead of pretending. """ if not sender_addr: return "" sender_addr = sender_addr.strip().lower() if not sender_addr: return "" blocks: list[str] = [] seen_uids: set[tuple[str, str]] = set() # (folder, uid) if exclude_uid: seen_uids.add((exclude_folder or "INBOX", str(exclude_uid))) try: conn = _imap_connect() except Exception as e: logger.warning(f"sender-thread-context: imap connect failed: {e}") return "" try: for folder in ["INBOX", "Sent", "Archive", "Drafts"]: if len(blocks) >= limit: break try: st_sel, _ = conn.select(_q(folder), readonly=True) if st_sel != "OK": continue except Exception: continue try: addr_escaped = sender_addr.replace('"', '\\"') status, sdata = conn.search(None, f'(FROM "{addr_escaped}")') if status != "OK" or not sdata or not sdata[0]: continue uids = sdata[0].split() # Most recent first. uids = list(reversed(uids)) except Exception: continue for raw_uid in uids: if len(blocks) >= limit: break uid = raw_uid.decode() if isinstance(raw_uid, bytes) else str(raw_uid) key = (folder, uid) if key in seen_uids: continue seen_uids.add(key) try: st_f, msg_data = conn.fetch(raw_uid, "(RFC822)") if st_f != "OK" or not msg_data: continue raw_bytes = None for part in msg_data: if isinstance(part, tuple) and len(part) >= 2 and part[1]: raw_bytes = part[1] break if not raw_bytes: continue msg = email_mod.message_from_bytes(raw_bytes) except Exception as e: logger.debug(f"sender-thread-context fetch fail uid={uid}: {e}") continue try: subj = _decode_header(msg.get("Subject", "(no subject)")) date_hdr = msg.get("Date", "") body_text = (_extract_text(msg) or "").strip() body_text = re.sub(r"\n{3,}", "\n\n", body_text) if len(body_text) > max_chars_per_email: body_text = body_text[:max_chars_per_email].rstrip() + "…" atts_text = _extract_attachment_text(msg, max_chars=max_attachment_chars) except Exception as e: logger.debug(f"sender-thread-context parse fail uid={uid}: {e}") continue if not body_text and not atts_text: continue lines = [f"— {folder} · {date_hdr} · Subject: {subj}"] if body_text: lines.append(body_text) if atts_text: lines.append(atts_text) blocks.append("\n".join(lines)) finally: try: conn.close() except Exception: pass try: conn.logout() except Exception: pass if not blocks: return "" return "\n\n=====\n\n".join(blocks) def _pre_retrieve_context(body: str, sender: str) -> tuple: """Extract key terms from an incoming email and search past emails + contacts. Returns (context_snippets, terms_list). Best-effort; never raises. Sec note: this is called from the auto-reply path. An attacker who can craft an inbound email's content to contain Capitalized words matching private context (legal/medical names, project codenames) can coerce the LLM reply to quote that context back in the auto-reply. To narrow the blast radius: - require terms ≥ 5 chars (was 4), - require multiword for an unknown sender, - cap to 3 terms (was 4), - skip entirely for senders with no prior contact / no past mail. """ STOPWORDS = {"dear", "hello", "hi", "hey", "thanks", "thank", "regards", "best", "kind", "sincerely", "cheers", "the", "this", "that", "from", "subject", "re", "fwd", "yours", "my", "our", "your"} context_snippets = [] terms_list = [] try: # ── Known-sender check: only retrieve context for senders we already # have a relationship with. New / cold senders get an empty context. sender_addr = email.utils.parseaddr(sender or "")[1].lower() is_known = False try: from routes.contacts_routes import _fetch_contacts for c in _fetch_contacts() or []: if (c.get("email") or "").lower() == sender_addr: is_known = True break except Exception: pass if not is_known and sender_addr: try: with _imap() as _ck: _ck.select("INBOX", readonly=True) st_known, dk = _ck.search(None, f'(FROM "{sender_addr}")') if st_known == "OK" and dk and dk[0]: is_known = True except Exception: pass if not is_known: logger.info(f"Pre-retrieval skipped — unknown sender {sender_addr}") return [], [] seen = set() multiword = [] singleword = [] for m in re.finditer(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b", body or ""): term = m.group(1).strip() key = term.lower() if key in seen: continue first = term.split()[0].lower() if first in STOPWORDS: continue if len(term) < 5: continue seen.add(key) (multiword if " " in term else singleword).append(term) sender_name_clean = _decode_header(sender or "").split("<")[0].strip().lower() # Multiword terms are far less likely to collide with unrelated context # than single capitalized words. Prefer them; only fall back to # singletons when we don't have enough multiwords. ranked = [t for t in (multiword + singleword) if t.lower() != sender_name_clean] terms_list = ranked[:3] logger.info(f"Pre-retrieval terms={terms_list}") if not terms_list: return context_snippets, terms_list try: ctx_conn = _imap_connect() for folder in ["INBOX", "Sent", "Archive", "Drafts"]: try: st_sel, _sd = ctx_conn.select(_q(folder), readonly=True) if st_sel != "OK": continue except Exception: continue for term in terms_list: try: safe_term = term.replace('"', '').replace('\\', '') st, data2 = ctx_conn.search(None, "TEXT", f'"{safe_term}"') if st != "OK" or not data2 or not data2[0]: continue all_hits = data2[0].split() hit_uids = all_hits[-2:] logger.info(f" [{folder}] term={term!r} hits={len(all_hits)}") for huid in hit_uids: try: st2, hd = ctx_conn.fetch(huid, "(RFC822)") if st2 != "OK" or not hd or not hd[0]: continue hmsg = email_mod.message_from_bytes(hd[0][1]) hsubj = _decode_header(hmsg.get("Subject", "")) hfrom = _decode_header(hmsg.get("From", "")) hdate = hmsg.get("Date", "") hbody = _extract_text(hmsg)[:600] context_snippets.append( f"[{folder} match for \"{term}\"]\nFrom: {hfrom}\nDate: {hdate}\nSubject: {hsubj}\n{hbody}" ) except Exception: continue except Exception as _e: logger.warning(f" search {folder} {term!r} failed: {_e}") continue try: ctx_conn.logout() except Exception: pass except Exception as _e: logger.warning(f"IMAP context search failed: {_e}") try: from routes.contacts_routes import _fetch_contacts all_contacts = _fetch_contacts() for term in terms_list: t_lower = term.lower() matches = [c for c in all_contacts if t_lower in (c.get("name") or "").lower() or t_lower in (c.get("email") or "").lower()] for c in matches[:2]: parts = [f"Name: {c.get('name','')}"] if c.get("email"): parts.append(f"Email: {c['email']}") if c.get("phone"): parts.append(f"Phone: {c['phone']}") context_snippets.append(f"[Contact match for \"{term}\"] " + ", ".join(parts)) except Exception: pass except Exception as e: logger.warning(f"Pre-retrieval failed: {e}") logger.info(f"Pre-retrieval snippets={len(context_snippets)}") return context_snippets, terms_list _EMAIL_REPLY_SYS_PROMPT_BASE = ( "You are drafting an email reply. Write only the reply body, no subject line, " "and no extra commentary. The saved WRITING STYLE below outranks generic tone guidance. " "If the saved style says to use a greeting/sign-off, include them. For English replies, " "default to 'Hi [Name]' rather than 'Hey'. Be direct and concise. Match the tone of the " "original email without violating the saved style.\n\n" "MECHANICAL STYLE RULES — CRITICAL: Never use an em dash or en dash; use -- instead. " "Never use curly apostrophes; write I'm, don't, we'll with straight '. Do not start " "with 'Hey' unless the saved style explicitly requests it.\n\n" "IDENTITY RULE — CRITICAL: write as the user/mailbox owner only. NEVER sign as, " "speak as, or imply you are the recipient, original sender, quoted sender, spouse, " "assistant, company, or any third party. Do not copy a name from the quoted thread " "into the sign-off. If a writing style below names a signature, use only that " "signature; otherwise omit the sign-off.\n\n" "CRITICAL RULE: NEVER invent facts, names, dates, phone numbers, emails, addresses, " "or any specifics not explicitly present in the RELEVANT CONTEXT section below or " "the original email itself. If the sender asks for information you don't have in " "the context, say plainly that you don't have it on hand — do NOT guess or fabricate. " "Do not promise to 'look it up' or 'get back to you soon' as a way to pad the reply. " "If you have no real information to offer, write a short honest reply (2-4 sentences max).\n\n" "OUTPUT FORMAT — IMPORTANT: Put ONLY the final email reply between these exact markers, " "each on its own line:\n" "<<>>\n" "(the reply body goes here)\n" "<<>>\n" "Any reasoning, planning, or notes-to-self must come BEFORE the <<>> marker " "(ideally wrapped in ...). Only the text between <<>> and <<>> " "is sent as the email — nothing else is shown to anyone." ) # ── Request models ── class SendEmailRequest(BaseModel): to: str cc: Optional[str] = None bcc: Optional[str] = None subject: str body: str # WYSIWYG compose sends the rendered HTML here; the server sanitizes it and # uses it for the text/html part (body stays the plain-text fallback). When # absent, the server renders markdown from `body` instead. body_html: Optional[str] = None in_reply_to: Optional[str] = None references: Optional[str] = None # List of uploaded attachment tokens (filenames in COMPOSE_UPLOADS_DIR) attachments: Optional[List[str]] = None # Which account to send from. None = default account. account_id: Optional[str] = None # Internal marker for Odysseus-generated mail (e.g. reminder, scheduled). odysseus_kind: Optional[str] = None # If true, /send waits for SMTP + Sent append and returns the sent UID. wait_for_delivery: bool = False class ExtractStyleRequest(BaseModel): sample_count: Optional[int] = 20