diff --git a/src/upload_handler.py b/src/upload_handler.py index 61df395..8c8b2bd 100644 --- a/src/upload_handler.py +++ b/src/upload_handler.py @@ -43,6 +43,20 @@ def is_valid_upload_id(upload_id: str) -> bool: return UPLOAD_ID_RE.fullmatch(upload_id or "") is not None +def _build_upload_id(safe_filename: str) -> str: + """Build a unique upload id whose extension matches UPLOAD_ID_RE. + + secure_filename keeps '_' and '-', so an extension like '.jpg-1' (the + suffix browsers append to duplicate downloads) or '.v1_final' produced an + id that failed is_valid_upload_id, making the saved file permanently + unreadable (every read path gates on validate_upload_id). Sanitize the + extension to the single-alnum shape the id contract requires. + """ + _, ext = os.path.splitext(safe_filename or "") + ext = re.sub(r"[^A-Za-z0-9]", "", ext) + return uuid.uuid4().hex + (("." + ext) if ext else "") + + def count_recent_uploads(timestamps, now: float, window: float = 10.0) -> int: """Number of upload events in *timestamps* within the last *window* seconds. @@ -596,8 +610,7 @@ class UploadHandler: } # Generate unique ID and determine save location - _, ext = os.path.splitext(safe_filename) - file_id = f"{uuid.uuid4().hex}{ext}" + file_id = _build_upload_id(safe_filename) # Create date-based directory structure upload_dir = self.get_upload_dir() diff --git a/tests/test_upload_id_extension.py b/tests/test_upload_id_extension.py new file mode 100644 index 0000000..70e2613 --- /dev/null +++ b/tests/test_upload_id_extension.py @@ -0,0 +1,37 @@ +"""Upload ids must satisfy UPLOAD_ID_RE for every accepted filename. + +secure_filename keeps '_' and '-', so a filename whose final extension +contains them (e.g. "photo.jpg-1" — the suffix browsers add to duplicate +downloads, or "doc.v1_final") produced an id like ".jpg-1" that fails +is_valid_upload_id. Since every read path (download, resolve, vision) +validates the id first, the saved bytes became permanently unreachable. +""" +import pytest + +from src.upload_handler import _build_upload_id, is_valid_upload_id + + +@pytest.mark.parametrize("name", [ + "photo.jpg-1", + "doc.v1_final", + "invoice.2024-01", + "file.JPG_backup", + "report.pdf", + "image.png", + "noextension", + "", +]) +def test_built_id_is_always_valid(name): + fid = _build_upload_id(name) + assert is_valid_upload_id(fid), (name, fid) + + +def test_normal_extension_is_preserved(): + assert _build_upload_id("photo.png").endswith(".png") + assert _build_upload_id("doc.pdf").endswith(".pdf") + + +def test_problem_extension_is_sanitized_not_dropped_to_invalid(): + fid = _build_upload_id("photo.jpg-1") + assert is_valid_upload_id(fid) + assert fid.endswith(".jpg1") # the '-' is stripped, alnum kept