fix: uploads with _ or - in the extension become permanently unreadable (#1756)

This commit is contained in:
Afonso Coutinho
2026-06-03 05:28:45 +01:00
committed by GitHub
parent 992866e167
commit 133948cc78
2 changed files with 52 additions and 2 deletions

View File

@@ -43,6 +43,20 @@ def is_valid_upload_id(upload_id: str) -> bool:
return UPLOAD_ID_RE.fullmatch(upload_id or "") is not None return UPLOAD_ID_RE.fullmatch(upload_id or "") is not None
def _build_upload_id(safe_filename: str) -> str:
"""Build a unique upload id whose extension matches UPLOAD_ID_RE.
secure_filename keeps '_' and '-', so an extension like '.jpg-1' (the
suffix browsers append to duplicate downloads) or '.v1_final' produced an
id that failed is_valid_upload_id, making the saved file permanently
unreadable (every read path gates on validate_upload_id). Sanitize the
extension to the single-alnum shape the id contract requires.
"""
_, ext = os.path.splitext(safe_filename or "")
ext = re.sub(r"[^A-Za-z0-9]", "", ext)
return uuid.uuid4().hex + (("." + ext) if ext else "")
def count_recent_uploads(timestamps, now: float, window: float = 10.0) -> int: def count_recent_uploads(timestamps, now: float, window: float = 10.0) -> int:
"""Number of upload events in *timestamps* within the last *window* seconds. """Number of upload events in *timestamps* within the last *window* seconds.
@@ -596,8 +610,7 @@ class UploadHandler:
} }
# Generate unique ID and determine save location # Generate unique ID and determine save location
_, ext = os.path.splitext(safe_filename) file_id = _build_upload_id(safe_filename)
file_id = f"{uuid.uuid4().hex}{ext}"
# Create date-based directory structure # Create date-based directory structure
upload_dir = self.get_upload_dir() upload_dir = self.get_upload_dir()

View File

@@ -0,0 +1,37 @@
"""Upload ids must satisfy UPLOAD_ID_RE for every accepted filename.
secure_filename keeps '_' and '-', so a filename whose final extension
contains them (e.g. "photo.jpg-1" — the suffix browsers add to duplicate
downloads, or "doc.v1_final") produced an id like "<hex>.jpg-1" that fails
is_valid_upload_id. Since every read path (download, resolve, vision)
validates the id first, the saved bytes became permanently unreachable.
"""
import pytest
from src.upload_handler import _build_upload_id, is_valid_upload_id
@pytest.mark.parametrize("name", [
"photo.jpg-1",
"doc.v1_final",
"invoice.2024-01",
"file.JPG_backup",
"report.pdf",
"image.png",
"noextension",
"",
])
def test_built_id_is_always_valid(name):
fid = _build_upload_id(name)
assert is_valid_upload_id(fid), (name, fid)
def test_normal_extension_is_preserved():
assert _build_upload_id("photo.png").endswith(".png")
assert _build_upload_id("doc.pdf").endswith(".pdf")
def test_problem_extension_is_sanitized_not_dropped_to_invalid():
fid = _build_upload_id("photo.jpg-1")
assert is_valid_upload_id(fid)
assert fid.endswith(".jpg1") # the '-' is stripped, alnum kept