From 171c29dcf3f1ea2465f3ed0e968022359ee22e79 Mon Sep 17 00:00:00 2001 From: Jamieson O'Reilly <6668807+orlyjamie@users.noreply.github.com> Date: Mon, 1 Jun 2026 23:20:17 +1000 Subject: [PATCH] Fix email-thread HTML injection, attachment path traversal, and missing authz (#475) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hardens issues found in a security review of the current tree (separate from the cookbook SSH PR): - Email thread rendering (static/js/emailLibrary.js): the flat read path runs inbound HTML through the allowlist sanitizer, but the two threaded paths (_renderTurnsAsBubbles / _renderTurnsFromServer — the default view) injected server-parsed `body_html` raw into the DOM. A crafted inbound email could inject arbitrary markup (phishing/form/credential-capture/tracking; full XSS if a deployment relaxes the script CSP). Now sanitized on all paths. - Attachment extraction (routes/email_routes.py, routes/email_helpers.py): the on-disk extraction dir was `ATTACHMENTS_DIR / f"{folder}_{uid}"` with user-controlled folder/uid and no containment, so a folder like `../../tmp` could escape ATTACHMENTS_DIR. New attachment_extract_dir() flattens both to a single safe segment and asserts containment. - Diagnostics routes (routes/diagnostics_routes.py): /api/db/stats, /api/rag/stats, /api/test/youtube, /api/test-research relied only on the global session check (any logged-in user). Now require_admin-gated. - Defense-in-depth HTML escaping: session HTML export escapes the session name (routes/session_routes.py); the MCP OAuth page escapes the reflected Host header / server_id (routes/mcp_routes.py). - Internal-tool token now compared with secrets.compare_digest (constant time) in core/middleware.py and app.py. Adds regression tests in tests/test_security_regressions.py. --- app.py | 3 +- core/middleware.py | 3 +- routes/diagnostics_routes.py | 15 ++++--- routes/email_helpers.py | 14 ++++++ routes/email_routes.py | 7 +-- routes/mcp_routes.py | 5 +++ routes/session_routes.py | 6 ++- static/js/emailLibrary.js | 8 ++-- tests/test_security_regressions.py | 68 ++++++++++++++++++++++++++++++ 9 files changed, 113 insertions(+), 16 deletions(-) diff --git a/app.py b/app.py index 0ff6e42..d45161e 100644 --- a/app.py +++ b/app.py @@ -21,6 +21,7 @@ import uuid import asyncio import logging +import secrets from datetime import datetime from typing import Dict @@ -222,7 +223,7 @@ if AUTH_ENABLED: try: from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN as _ITT _hdr = request.headers.get(INTERNAL_TOOL_HEADER) - if _hdr and _hdr == _ITT and _is_trusted_loopback(request): + if _hdr and secrets.compare_digest(_hdr, _ITT) and _is_trusted_loopback(request): # Impersonation: when the agent's loopback call sets # X-Odysseus-Owner, attribute the request to that user only # if they exist. Authorization checks remain separate; this diff --git a/core/middleware.py b/core/middleware.py index a3e9e9a..82d1d03 100644 --- a/core/middleware.py +++ b/core/middleware.py @@ -27,7 +27,8 @@ def require_admin(request: Request): # (b) the auth middleware already validated the token and stamped # request.state.current_user = "internal-tool". try: - if request.headers.get(INTERNAL_TOOL_HEADER) == INTERNAL_TOOL_TOKEN: + hdr = request.headers.get(INTERNAL_TOOL_HEADER) + if hdr and secrets.compare_digest(hdr, INTERNAL_TOOL_TOKEN): return if getattr(request.state, "current_user", None) == "internal-tool": return diff --git a/routes/diagnostics_routes.py b/routes/diagnostics_routes.py index 8f3a915..daebef8 100644 --- a/routes/diagnostics_routes.py +++ b/routes/diagnostics_routes.py @@ -3,10 +3,11 @@ import logging from typing import Dict, Any -from fastapi import APIRouter, HTTPException, Form +from fastapi import APIRouter, HTTPException, Form, Request from services.youtube.youtube_handler import extract_youtube_id, extract_transcript_async from core.constants import DEFAULT_HOST +from core.middleware import require_admin logger = logging.getLogger(__name__) @@ -19,7 +20,8 @@ def setup_diagnostics_routes( router = APIRouter(tags=["diagnostics"]) @router.get("/api/db/stats") - async def get_database_stats() -> Dict[str, Any]: + async def get_database_stats(request: Request) -> Dict[str, Any]: + require_admin(request) try: from core.database import get_detailed_stats return get_detailed_stats() @@ -28,13 +30,15 @@ def setup_diagnostics_routes( raise HTTPException(500, "Failed to retrieve database statistics") @router.get("/api/rag/stats") - async def get_rag_stats() -> Dict[str, Any]: + async def get_rag_stats(request: Request) -> Dict[str, Any]: + require_admin(request) if rag_available and rag_manager: return rag_manager.get_stats() return {"error": "RAG system not available"} @router.get("/api/test/youtube") - async def test_youtube(url: str) -> Dict[str, Any]: + async def test_youtube(request: Request, url: str) -> Dict[str, Any]: + require_admin(request) try: video_id = extract_youtube_id(url) if not video_id: @@ -54,7 +58,8 @@ def setup_diagnostics_routes( return {"error": str(e)} @router.post("/api/test-research") - async def test_research(query: str = Form("What is machine learning?")) -> Dict[str, Any]: + async def test_research(request: Request, query: str = Form("What is machine learning?")) -> Dict[str, Any]: + require_admin(request) try: endpoint = f"http://{DEFAULT_HOST}:8000/v1/chat/completions" model = "gpt-oss-120b" diff --git a/routes/email_helpers.py b/routes/email_helpers.py index 27d7338..c14fd8c 100644 --- a/routes/email_helpers.py +++ b/routes/email_helpers.py @@ -269,6 +269,20 @@ COMPOSE_UPLOADS_DIR.mkdir(parents=True, exist_ok=True) SCHEDULED_DB = DATA_DIR / "scheduled_emails.db" +def attachment_extract_dir(folder: str, uid: str) -> Path: + """Containment-safe extraction directory for an attachment. + + `folder` and `uid` are user-controlled (query/path params). Flatten them to + a single safe path segment so a value like folder='../../tmp' can't escape + ATTACHMENTS_DIR, then assert containment as belt-and-suspenders.""" + key = re.sub(r"[^A-Za-z0-9._-]", "_", f"{folder}_{uid}") or "_" + target = (ATTACHMENTS_DIR / key).resolve() + base = ATTACHMENTS_DIR.resolve() + if target != base and base not in target.parents: + raise HTTPException(400, "Invalid attachment location") + return target + + def _init_scheduled_db(): import sqlite3 conn = sqlite3.connect(SCHEDULED_DB) diff --git a/routes/email_routes.py b/routes/email_routes.py index 94ce9dc..8b82aa5 100644 --- a/routes/email_routes.py +++ b/routes/email_routes.py @@ -48,6 +48,7 @@ from routes.email_helpers import ( _EMAIL_REPLY_SYS_PROMPT_BASE, _POOL_HOOKS, SendEmailRequest, ExtractStyleRequest, ATTACHMENTS_DIR, COMPOSE_UPLOADS_DIR, SCHEDULED_DB, + attachment_extract_dir, ) from routes.email_pollers import _start_poller @@ -1390,7 +1391,7 @@ def setup_email_routes(): msg = email_mod.message_from_bytes(raw) # Extract to a per-email folder - target_dir = ATTACHMENTS_DIR / f"{folder}_{uid}" + target_dir = attachment_extract_dir(folder, uid) filepath = _extract_attachment_to_disk(msg, index, target_dir) if not filepath: return {"error": f"Attachment index {index} not found"} @@ -1425,7 +1426,7 @@ def setup_email_routes(): raw = msg_data[0][1] msg = email_mod.message_from_bytes(raw) - target_dir = ATTACHMENTS_DIR / f"{folder}_{uid}" + target_dir = attachment_extract_dir(folder, uid) filepath = _extract_attachment_to_disk(msg, index, target_dir) if not filepath: return {"error": f"Attachment index {index} not found"} @@ -1633,7 +1634,7 @@ def setup_email_routes(): raw = msg_data[0][1] msg = email_mod.message_from_bytes(raw) - target_dir = ATTACHMENTS_DIR / f"{folder}_{uid}" + target_dir = attachment_extract_dir(folder, uid) filepath = _extract_attachment_to_disk(msg, index, target_dir) if not filepath: return {"error": f"Attachment index {index} not found"} diff --git a/routes/mcp_routes.py b/routes/mcp_routes.py index 5b1a51d..c09108f 100644 --- a/routes/mcp_routes.py +++ b/routes/mcp_routes.py @@ -499,6 +499,11 @@ def setup_mcp_routes(mcp_manager: McpManager): def _oauth_authorize_page(auth_url: str, server_id: str, host: str) -> str: """Page with Google sign-in link and URL paste-back form for remote access.""" + # Escape values interpolated into the page: `host` comes from the request + # Host header and `server_id` from the OAuth state — neither is trusted. + auth_url = html.escape(auth_url, quote=True) + server_id = html.escape(server_id, quote=True) + host = html.escape(host, quote=True) return f"""