Fix email-thread HTML injection, attachment path traversal, and missing authz (#475)

Hardens issues found in a security review of the current tree (separate from the cookbook SSH PR): - Email thread rendering (static/js/emailLibrary.js): the flat read path runs inbound HTML through the allowlist sanitizer, but the two threaded paths (_renderTurnsAsBubbles / _renderTurnsFromServer — the default view) injected server-parsed `body_html` raw into the DOM. A crafted inbound email could inject arbitrary markup (phishing/form/credential-capture/tracking; full XSS if a deployment relaxes the script CSP). Now sanitized on all paths. - Attachment extraction (routes/email_routes.py, routes/email_helpers.py): the on-disk extraction dir was `ATTACHMENTS_DIR / f"{folder}_{uid}"` with user-controlled folder/uid and no containment, so a folder like `../../tmp` could escape ATTACHMENTS_DIR. New attachment_extract_dir() flattens both to a single safe segment and asserts containment. - Diagnostics routes (routes/diagnostics_routes.py): /api/db/stats, /api/rag/stats, /api/test/youtube, /api/test-research relied only on the global session check (any logged-in user). Now require_admin-gated. - Defense-in-depth HTML escaping: session HTML export escapes the session name (routes/session_routes.py); the MCP OAuth page escapes the reflected Host header / server_id (routes/mcp_routes.py). - Internal-tool token now compared with secrets.compare_digest (constant time) in core/middleware.py and app.py. Adds regression tests in tests/test_security_regressions.py.
2026-06-01 23:20:17 +10:00
parent 9e8de43f25
commit 171c29dcf3
9 changed files with 113 additions and 16 deletions
--- a/app.py
+++ b/app.py
@@ -21,6 +21,7 @@ import uuid

 import asyncio
 import logging
+import secrets
 from datetime import datetime
 from typing import Dict

@@ -222,7 +223,7 @@ if AUTH_ENABLED:
            try:
                from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN as _ITT
                _hdr = request.headers.get(INTERNAL_TOOL_HEADER)
-                if _hdr and _hdr == _ITT and _is_trusted_loopback(request):
+                if _hdr and secrets.compare_digest(_hdr, _ITT) and _is_trusted_loopback(request):
                    # Impersonation: when the agent's loopback call sets
                    # X-Odysseus-Owner, attribute the request to that user only
                    # if they exist. Authorization checks remain separate; this
--- a/core/middleware.py
+++ b/core/middleware.py
@@ -27,7 +27,8 @@ def require_admin(request: Request):
    # (b) the auth middleware already validated the token and stamped
    #     request.state.current_user = "internal-tool".
    try:
-        if request.headers.get(INTERNAL_TOOL_HEADER) == INTERNAL_TOOL_TOKEN:
+        hdr = request.headers.get(INTERNAL_TOOL_HEADER)
+        if hdr and secrets.compare_digest(hdr, INTERNAL_TOOL_TOKEN):
            return
        if getattr(request.state, "current_user", None) == "internal-tool":
            return
--- a/routes/diagnostics_routes.py
+++ b/routes/diagnostics_routes.py
@@ -3,10 +3,11 @@
 import logging
 from typing import Dict, Any

-from fastapi import APIRouter, HTTPException, Form
+from fastapi import APIRouter, HTTPException, Form, Request

 from services.youtube.youtube_handler import extract_youtube_id, extract_transcript_async
 from core.constants import DEFAULT_HOST
+from core.middleware import require_admin

 logger = logging.getLogger(__name__)

@@ -19,7 +20,8 @@ def setup_diagnostics_routes(
    router = APIRouter(tags=["diagnostics"])

    @router.get("/api/db/stats")
-    async def get_database_stats() -> Dict[str, Any]:
+    async def get_database_stats(request: Request) -> Dict[str, Any]:
+        require_admin(request)
        try:
            from core.database import get_detailed_stats
            return get_detailed_stats()
@@ -28,13 +30,15 @@ def setup_diagnostics_routes(
            raise HTTPException(500, "Failed to retrieve database statistics")

    @router.get("/api/rag/stats")
-    async def get_rag_stats() -> Dict[str, Any]:
+    async def get_rag_stats(request: Request) -> Dict[str, Any]:
+        require_admin(request)
        if rag_available and rag_manager:
            return rag_manager.get_stats()
        return {"error": "RAG system not available"}

    @router.get("/api/test/youtube")
-    async def test_youtube(url: str) -> Dict[str, Any]:
+    async def test_youtube(request: Request, url: str) -> Dict[str, Any]:
+        require_admin(request)
        try:
            video_id = extract_youtube_id(url)
            if not video_id:
@@ -54,7 +58,8 @@ def setup_diagnostics_routes(
            return {"error": str(e)}

    @router.post("/api/test-research")
-    async def test_research(query: str = Form("What is machine learning?")) -> Dict[str, Any]:
+    async def test_research(request: Request, query: str = Form("What is machine learning?")) -> Dict[str, Any]:
+        require_admin(request)
        try:
            endpoint = f"http://{DEFAULT_HOST}:8000/v1/chat/completions"
            model = "gpt-oss-120b"
--- a/routes/email_helpers.py
+++ b/routes/email_helpers.py
@@ -269,6 +269,20 @@ COMPOSE_UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
 SCHEDULED_DB = DATA_DIR / "scheduled_emails.db"


+def attachment_extract_dir(folder: str, uid: str) -> Path:
+    """Containment-safe extraction directory for an attachment.
+
+    `folder` and `uid` are user-controlled (query/path params). Flatten them to
+    a single safe path segment so a value like folder='../../tmp' can't escape
+    ATTACHMENTS_DIR, then assert containment as belt-and-suspenders."""
+    key = re.sub(r"[^A-Za-z0-9._-]", "_", f"{folder}_{uid}") or "_"
+    target = (ATTACHMENTS_DIR / key).resolve()
+    base = ATTACHMENTS_DIR.resolve()
+    if target != base and base not in target.parents:
+        raise HTTPException(400, "Invalid attachment location")
+    return target
+
+
 def _init_scheduled_db():
    import sqlite3
    conn = sqlite3.connect(SCHEDULED_DB)
--- a/routes/email_routes.py
+++ b/routes/email_routes.py
@@ -48,6 +48,7 @@ from routes.email_helpers import (
    _EMAIL_REPLY_SYS_PROMPT_BASE, _POOL_HOOKS,
    SendEmailRequest, ExtractStyleRequest,
    ATTACHMENTS_DIR, COMPOSE_UPLOADS_DIR, SCHEDULED_DB,
+    attachment_extract_dir,
 )
 from routes.email_pollers import _start_poller

@@ -1390,7 +1391,7 @@ def setup_email_routes():
            msg = email_mod.message_from_bytes(raw)

            # Extract to a per-email folder
-            target_dir = ATTACHMENTS_DIR / f"{folder}_{uid}"
+            target_dir = attachment_extract_dir(folder, uid)
            filepath = _extract_attachment_to_disk(msg, index, target_dir)
            if not filepath:
                return {"error": f"Attachment index {index} not found"}
@@ -1425,7 +1426,7 @@ def setup_email_routes():
            raw = msg_data[0][1]
            msg = email_mod.message_from_bytes(raw)

-            target_dir = ATTACHMENTS_DIR / f"{folder}_{uid}"
+            target_dir = attachment_extract_dir(folder, uid)
            filepath = _extract_attachment_to_disk(msg, index, target_dir)
            if not filepath:
                return {"error": f"Attachment index {index} not found"}
@@ -1633,7 +1634,7 @@ def setup_email_routes():
            raw = msg_data[0][1]
            msg = email_mod.message_from_bytes(raw)

-            target_dir = ATTACHMENTS_DIR / f"{folder}_{uid}"
+            target_dir = attachment_extract_dir(folder, uid)
            filepath = _extract_attachment_to_disk(msg, index, target_dir)
            if not filepath:
                return {"error": f"Attachment index {index} not found"}
--- a/routes/mcp_routes.py
+++ b/routes/mcp_routes.py
@@ -499,6 +499,11 @@ def setup_mcp_routes(mcp_manager: McpManager):

 def _oauth_authorize_page(auth_url: str, server_id: str, host: str) -> str:
    """Page with Google sign-in link and URL paste-back form for remote access."""
+    # Escape values interpolated into the page: `host` comes from the request
+    # Host header and `server_id` from the OAuth state — neither is trusted.
+    auth_url = html.escape(auth_url, quote=True)
+    server_id = html.escape(server_id, quote=True)
+    host = html.escape(host, quote=True)
    return f"""<!DOCTYPE html>
 <html><head>
 <meta charset="UTF-8"><title>Authorize — Odysseus</title>
--- a/routes/session_routes.py
+++ b/routes/session_routes.py
@@ -1,5 +1,6 @@
 # routes/session_routes.py
 import re
+import html
 import json
 import uuid
 from datetime import datetime
@@ -587,15 +588,16 @@ def setup_session_routes(session_manager: SessionManager, config: dict, webhook_
            )

        if fmt == "html":
+            safe_title = html.escape(session.name or "")
            html_parts = [
                "<!DOCTYPE html><html><head>",
-                f"<meta charset='utf-8'><title>{session.name}</title>",
+                f"<meta charset='utf-8'><title>{safe_title}</title>",
                "<style>body{font-family:monospace;max-width:800px;margin:2rem auto;padding:0 1rem;background:#111;color:#ddd}",
                ".msg{margin:1rem 0;padding:0.8rem;border-radius:6px;border:1px solid #333}",
                ".user{background:#1a1a2e}.ai{background:#1a2e1a}",
                ".role{font-weight:bold;margin-bottom:0.4rem;opacity:0.7;text-transform:uppercase;font-size:0.85em}",
                "pre{background:#000;padding:0.5rem;border-radius:4px;overflow-x:auto}</style></head><body>",
-                f"<h1>{session.name}</h1>",
+                f"<h1>{safe_title}</h1>",
            ]
            for m in session.history:
                cls = "user" if m.role == "user" else "ai"
--- a/static/js/emailLibrary.js
+++ b/static/js/emailLibrary.js
@@ -2469,7 +2469,7 @@ function _renderTurnsAsBubbles(turns, data) {
      + (isMine ? '' : avatar)
      + `<div class="email-bubble">`
      +   head
-      +   `<div class="email-bubble-body">${t.body_html || ''}</div>`
+      +   `<div class="email-bubble-body">${_sanitizeHtml(t.body_html || '')}</div>`
      + `</div>`
      + (isMine ? avatar : '')
      + `</div>`
@@ -2499,7 +2499,7 @@ function _renderTurnsFromServer(turns) {
        const w = wrap(top);
        if (stack.length) stack[stack.length - 1].html += w; else out += w;
      }
-      out += t.body_html || '';
+      out += _sanitizeHtml(t.body_html || '');
    } else {
      while (stack.length && stack[stack.length - 1].level > t.level) {
        const top = stack.pop();
@@ -2507,9 +2507,9 @@ function _renderTurnsFromServer(turns) {
        if (stack.length) stack[stack.length - 1].html += w; else out += w;
      }
      if (!stack.length || stack[stack.length - 1].level < t.level) {
-        stack.push({ level: t.level, meta: t.meta, html: t.body_html || '' });
+        stack.push({ level: t.level, meta: t.meta, html: _sanitizeHtml(t.body_html || '') });
      } else {
-        stack[stack.length - 1].html += t.body_html || '';
+        stack[stack.length - 1].html += _sanitizeHtml(t.body_html || '');
        if (t.meta && !stack[stack.length - 1].meta) {
          stack[stack.length - 1].meta = t.meta;
        }
--- a/tests/test_security_regressions.py
+++ b/tests/test_security_regressions.py
@@ -622,3 +622,71 @@ def test_web_fetch_guard_blocks_redirect_into_private(monkeypatch):
    with _pytest.raises(httpx.RequestError) as exc:
        content._get_public_url("http://public.example/start", headers={}, timeout=5)
    assert "non-public" in str(exc.value)
+
+
+# ── audit fixes (2026-06-01): email XSS, attachment traversal, authz ──
+
+def _import_attachment_extract_dir():
+    sys.modules.pop("routes.email_helpers", None)
+    from routes.email_helpers import attachment_extract_dir, ATTACHMENTS_DIR
+    return attachment_extract_dir, ATTACHMENTS_DIR
+
+
+@pytest.mark.parametrize("folder,uid", [
+    ("../../../../tmp/evil", "1"),
+    ("INBOX", "../../etc/cron.d/x"),
+    ("a/../../b", "x"),
+    ("..", ".."),
+    ("/abs/path", "2"),
+])
+def test_attachment_extract_dir_stays_contained(folder, uid):
+    """User-controlled folder/uid must never escape ATTACHMENTS_DIR — pins the
+    fix for the attachment-extraction path traversal."""
+    aed, base = _import_attachment_extract_dir()
+    target = aed(folder, uid)
+    base_r = base.resolve()
+    assert target == base_r or base_r in target.parents
+    # exactly one extra path segment, and no `..` component survived
+    rel = target.relative_to(base_r)
+    assert ".." not in rel.parts
+
+
+def test_attachment_extract_dir_normal_inputs_unchanged():
+    aed, base = _import_attachment_extract_dir()
+    assert aed("INBOX", "123") == base.resolve() / "INBOX_123"
+
+
+def test_diagnostics_routes_are_admin_gated():
+    """db/rag stats + test endpoints must require admin (they relied only on
+    the global session check before)."""
+    src = Path(__file__).resolve().parents[1] / "routes" / "diagnostics_routes.py"
+    text = src.read_text()
+    for handler in ("get_database_stats", "get_rag_stats", "test_youtube", "test_research"):
+        assert f"def {handler}(request: Request" in text, handler
+    assert text.count("require_admin(request)") >= 4
+
+
+def test_email_thread_rendering_sanitizes_body_html():
+    """Both threaded render paths must run server-parsed body_html through the
+    allowlist sanitizer (the flat path already did)."""
+    src = Path(__file__).resolve().parents[1] / "static" / "js" / "emailLibrary.js"
+    text = src.read_text()
+    # every `t.body_html` reference is wrapped by _sanitizeHtml(...)
+    assert text.count("t.body_html") == text.count("_sanitizeHtml(t.body_html")
+    assert "t.body_html" in text  # guard against the file being refactored away
+
+
+def test_session_html_export_escapes_name():
+    src = Path(__file__).resolve().parents[1] / "routes" / "session_routes.py"
+    text = src.read_text()
+    assert "safe_title = html.escape(session.name" in text
+    assert "<title>{session.name}" not in text
+    assert "<h1>{session.name}</h1>" not in text
+
+
+def test_mcp_oauth_page_escapes_reflected_values():
+    src = Path(__file__).resolve().parents[1] / "routes" / "mcp_routes.py"
+    text = src.read_text()
+    body = text.split("def _oauth_authorize_page(", 1)[1].split("return f", 1)[0]
+    for var in ("auth_url", "server_id", "host"):
+        assert f"{var} = html.escape({var}" in body, var