Harden PDF document markers against cross-owner upload access (#445)

Route PDF lookups through UploadHandler.resolve_upload, reject poisoned pdf_source markers on document create/update, and add regression tests.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Duarte Antunes
2026-06-01 14:38:14 +01:00
committed by GitHub
parent b2e8d692a4
commit 448401a0fc
5 changed files with 183 additions and 106 deletions

View File

@@ -5,16 +5,16 @@
import logging
import os
import re
from typing import Dict, Any, Optional
from typing import Any, Dict, Optional
from fastapi import HTTPException
from fastapi import HTTPException, Request
from pydantic import BaseModel
from core.database import Document, DocumentVersion
from core.database import Session as DbSession
from src.upload_handler import UploadHandler
logger = logging.getLogger(__name__)
_UPLOAD_ID_RE = re.compile(r"^[0-9a-fA-F]{32}\.[A-Za-z0-9]+$")
# ---- Request schemas ----
@@ -138,78 +138,66 @@ def _upload_path_inside(upload_dir: str, path: str) -> bool:
return False
def _upload_owner_allowed(
meta: Optional[dict],
user: Optional[str],
def _resolve_user_upload_path(
upload_handler: Any,
upload_id: str,
owner: Optional[str],
auth_manager=None,
allow_admin: bool = True,
) -> bool:
if not user:
return (
not bool(auth_manager and getattr(auth_manager, "is_configured", False))
and not (meta and meta.get("owner") is not None)
) -> Optional[str]:
"""Resolve an upload id to a filesystem path the caller may read."""
if upload_handler is None:
return None
resolved = upload_handler.resolve_upload(
upload_id,
owner=owner,
auth_manager=auth_manager,
)
if not resolved:
return None
path = resolved.get("path")
upload_dir = getattr(upload_handler, "upload_dir", None)
if path and upload_dir and not _upload_path_inside(upload_dir, path):
logger.warning("Upload path outside upload directory: %s", path)
return None
return path
def _locate_upload(
upload_dir: str,
file_id: str,
owner: Optional[str] = None,
auth_manager=None,
upload_handler: Any = None,
):
"""Find an upload by its filename ID via UploadHandler.resolve_upload."""
if upload_handler is None:
from src.upload_handler import UploadHandler
base_dir = os.path.dirname(os.path.abspath(upload_dir))
upload_handler = UploadHandler(base_dir, upload_dir)
return _resolve_user_upload_path(upload_handler, file_id, owner, auth_manager)
def _assert_pdf_marker_upload_owned(
request: Request,
content: str,
user: Optional[str],
upload_handler: Any,
) -> None:
"""Reject document content whose pdf_source marker points at another user's upload."""
if upload_handler is None:
return
from src.pdf_form_doc import find_source_upload_id
upload_id = find_source_upload_id(content or "")
if not upload_id:
return
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
if not _resolve_user_upload_path(upload_handler, upload_id, user, auth_manager):
raise HTTPException(
400,
"Document PDF marker references an upload you do not own",
)
if allow_admin and auth_manager and hasattr(auth_manager, "is_admin"):
try:
if auth_manager.is_admin(user):
return True
except Exception:
pass
return bool(meta and meta.get("owner") == user)
def _locate_upload(upload_dir: str, file_id: str, owner: Optional[str] = None, auth_manager=None):
"""Find an upload by its filename ID.
Lookup order:
1. The `uploads.json` index that `UploadHandler.save_upload` maintains,
so owner can be verified before a document reads the source file.
2. Direct hit at `upload_dir/file_id` (very small deployments).
3. Fallback: `os.walk` the date-bucketed tree. Slow on large stores;
only allowed after the index owner check passes, or in single-user /
admin-style contexts where no owner is enforced.
`followlinks=False` keeps a stray symlink loop in `data/uploads/` from
spinning the walker into infinite recursion.
"""
import json as _json
if not _UPLOAD_ID_RE.fullmatch(file_id or ""):
logger.warning("Rejected invalid upload id in document lookup: %r", file_id)
return None
meta = None
try:
idx_path = os.path.join(upload_dir, "uploads.json")
if os.path.exists(idx_path):
with open(idx_path, "r", encoding="utf-8") as f:
idx = _json.load(f)
for item in (idx.values() if isinstance(idx, dict) else []):
if isinstance(item, dict) and item.get("id") == file_id:
meta = item
break
except Exception:
meta = None
if not _upload_owner_allowed(meta, owner, auth_manager):
logger.warning("Upload %s denied for document owner %s", file_id, owner)
return None
if meta:
p = meta.get("path")
if p and os.path.exists(p) and _upload_path_inside(upload_dir, p):
return p
direct = os.path.join(upload_dir, file_id)
if os.path.exists(direct) and _upload_path_inside(upload_dir, direct):
return direct
for root, _dirs, files in os.walk(upload_dir, followlinks=False):
if file_id in files:
p = os.path.join(root, file_id)
if _upload_path_inside(upload_dir, p):
return p
return None
def _derive_title(content: str) -> str:

View File

@@ -20,28 +20,28 @@ from routes.document_helpers import (
DocumentCreate, DocumentUpdate, DocumentPatch,
_doc_to_dict, _version_to_dict,
_verify_doc_owner, _owner_session_filter,
_slug, _locate_upload, _derive_title,
_slug, _resolve_user_upload_path, _assert_pdf_marker_upload_owned, _derive_title,
_PDF_RENDER_SCALE,
)
def _locate_current_user_upload(request: Request, upload_dir: str, upload_id: str, user: Optional[str]):
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
return _locate_upload(upload_dir, upload_id, owner=user, auth_manager=auth_manager)
def _load_pdf_viewer_fitz():
from src.pdf_runtime import load_pymupdf_for_pdf_viewer
try:
return load_pymupdf_for_pdf_viewer()
except RuntimeError as exc:
raise HTTPException(503, str(exc)) from exc
def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
router = APIRouter(tags=["documents"])
def _locate_current_user_upload(request: Request, upload_id: str, user: Optional[str]):
if upload_handler is None:
return None
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
return _resolve_user_upload_path(upload_handler, upload_id, user, auth_manager)
def _load_pdf_viewer_fitz():
from src.pdf_runtime import load_pymupdf_for_pdf_viewer
try:
return load_pymupdf_for_pdf_viewer()
except RuntimeError as exc:
raise HTTPException(503, str(exc)) from exc
# ---- POST /api/document ----
@router.post("/api/document")
async def create_document(request: Request, req: DocumentCreate) -> Dict[str, Any]:
@@ -82,6 +82,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if _looks_like_email_document(req.content, req.title):
language = "email"
_assert_pdf_marker_upload_owned(request, req.content, user, upload_handler)
doc = Document(
id=doc_id,
session_id=req.session_id,
@@ -176,7 +178,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
raise HTTPException(500, f"Upload failed: {e}")
upload_id = meta["id"]
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(500, "Saved PDF could not be located")
@@ -400,8 +402,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
text extraction was wired, plus for scanned/image-only PDFs where the
VL model picks up text the basic pypdf path missed."""
import re
from src.constants import UPLOAD_DIR
from src.document_processor import _process_pdf
from src.pdf_form_doc import find_source_upload_id
user = get_current_user(request)
db = SessionLocal()
@@ -412,12 +414,11 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
_verify_doc_owner(db, doc, user)
content = doc.current_content or ""
m = re.search(r'<!--\s*(?:pdf_source|pdf_form_source)\s+upload_id="([^"]+)"', content)
if not m:
upload_id = find_source_upload_id(content)
if not upload_id:
raise HTTPException(400, "Document is not a PDF — no pdf_source marker found")
upload_id = m.group(1)
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, "Source PDF could not be located")
@@ -528,6 +529,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if doc.current_content == req.content:
return _doc_to_dict(doc)
_assert_pdf_marker_upload_owned(request, req.content, user, upload_handler)
# Check if we can coalesce with the latest version
latest_ver = db.query(DocumentVersion).filter(
DocumentVersion.document_id == doc_id,
@@ -930,7 +933,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")
@@ -993,7 +996,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found")
@@ -1061,7 +1064,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, "Source PDF not found")
finally:
@@ -1117,7 +1120,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, "Source PDF not found")
finally:
@@ -1266,7 +1269,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found")
@@ -1361,7 +1364,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")
@@ -1505,7 +1508,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found")

View File

@@ -167,9 +167,18 @@ def find_source_upload_id(content: str) -> Optional[str]:
Matches both the form-source marker (`pdf_form_source`) used for fillable
PDFs and the plain marker (`pdf_source`) used for any imported PDF.
Rejects malformed ids (path traversal, wrong shape) before any lookup.
"""
from src.upload_handler import is_valid_upload_id
m = _FRONT_MATTER_RE.search(content or "") or _PLAIN_FRONT_MATTER_RE.search(content or "")
return m.group("upload_id") if m else None
if not m:
return None
upload_id = m.group("upload_id")
if not is_valid_upload_id(upload_id):
logger.warning("Ignoring invalid pdf_source upload_id in document content: %r", upload_id)
return None
return upload_id
def render_plain_pdf_markdown(upload_id: str, title: str, body_text: Optional[str] = None) -> str:

View File

@@ -29,6 +29,14 @@ import logging
logger = logging.getLogger(__name__)
UPLOAD_ID_RE = re.compile(r"^[0-9a-fA-F]{32}\.[A-Za-z0-9]+$")
def is_valid_upload_id(upload_id: str) -> bool:
"""Return True when *upload_id* matches the canonical uploads.json id format."""
return UPLOAD_ID_RE.fullmatch(upload_id or "") is not None
class UploadHandler:
def __init__(self, base_dir: str, upload_dir: str):
self.base_dir = base_dir
@@ -223,8 +231,7 @@ class UploadHandler:
def validate_upload_id(self, upload_id: str) -> bool:
"""Validate that the upload ID matches the expected pattern."""
pattern = r'^[0-9a-fA-F]{32}\.[A-Za-z0-9]+$'
return re.fullmatch(pattern, upload_id) is not None
return is_valid_upload_id(upload_id)
def _inside_upload_dir(self, path: str) -> bool:
"""Check if path is inside the upload directory."""

View File

@@ -367,17 +367,87 @@ def test_chat_preprocess_does_not_surface_cross_owner_attachment(tmp_path, monke
def test_document_upload_lookup_rejects_cross_owner_marker(tmp_path, monkeypatch):
from src.upload_handler import UploadHandler
sys.modules.pop("routes.document_helpers", None)
_stub_core_database_for_route_imports(monkeypatch)
from routes.document_helpers import _locate_upload
upload_dir, _alice_id, bob_id = _make_upload_store(tmp_path)
handler = UploadHandler(str(tmp_path), str(upload_dir))
assert _locate_upload(str(upload_dir), bob_id, owner="alice") is None
assert _locate_upload(str(upload_dir), bob_id, owner="bob").endswith(bob_id)
assert _locate_upload(str(upload_dir), bob_id, owner="alice", upload_handler=handler) is None
assert _locate_upload(str(upload_dir), bob_id, owner="bob", upload_handler=handler).endswith(bob_id)
sys.modules.pop("routes.document_helpers", None)
def test_find_source_upload_id_rejects_path_traversal_marker():
from src.pdf_form_doc import find_source_upload_id
content = '<!-- pdf_source upload_id="../../etc/passwd" -->\n\n# x\n'
assert find_source_upload_id(content) is None
def test_pdf_marker_write_rejects_cross_owner_upload(tmp_path, monkeypatch):
"""Saving a doc whose front-matter points at another user's upload must 400."""
from src.upload_handler import UploadHandler
sys.modules.pop("routes.document_helpers", None)
_stub_core_database_for_route_imports(monkeypatch)
from fastapi import HTTPException
from routes.document_helpers import _assert_pdf_marker_upload_owned
upload_dir, _alice_id, bob_id = _make_upload_store(tmp_path)
handler = UploadHandler(str(tmp_path), str(upload_dir))
class _AuthMgr:
is_configured = True
@staticmethod
def is_admin(_user):
return False
class _AppState:
auth_manager = _AuthMgr()
class _App:
state = _AppState()
class _Req:
app = _App()
marker = f'<!-- pdf_source upload_id="{bob_id}" -->\n\n# Notes\n'
with pytest.raises(HTTPException) as exc:
_assert_pdf_marker_upload_owned(_Req(), marker, "alice", handler)
assert exc.value.status_code == 400
# Own upload is allowed
own_marker = f'<!-- pdf_source upload_id="{_alice_id}" -->\n\n# Notes\n'
_assert_pdf_marker_upload_owned(_Req(), own_marker, "alice", handler)
sys.modules.pop("routes.document_helpers", None)
def test_pdf_marker_render_lookup_denies_cross_owner_without_doc_leak(tmp_path):
"""Read path: cross-owner marker resolves to None (404 at route layer)."""
from src.upload_handler import UploadHandler
upload_dir, alice_id, bob_id = _make_upload_store(tmp_path)
handler = UploadHandler(str(tmp_path), str(upload_dir))
class _AuthMgr:
is_configured = True
@staticmethod
def is_admin(_user):
return False
assert handler.resolve_upload(bob_id, owner="alice", auth_manager=_AuthMgr()) is None
resolved = handler.resolve_upload(alice_id, owner="alice", auth_manager=_AuthMgr())
assert resolved is not None
assert resolved["path"].endswith(alice_id)
# ── require_user dependency rejects anon callers ────────────────
def test_require_user_rejects_unauthenticated(monkeypatch):