Ignore stale duplicate upload rows (#1256)

This commit is contained in:
red person
2026-06-02 18:59:01 +03:00
committed by GitHub
parent a04553013d
commit aa420e2060
2 changed files with 46 additions and 3 deletions

View File

@@ -512,11 +512,23 @@ class UploadHandler:
existing_key = None
with self._index_lock:
existing_files = self._load_upload_index()
stale_keys = []
for key, info in existing_files.items():
if info.get("hash") == file_hash and info.get("owner") == owner:
existing_key = key
existing_file = info
break
stored_path = info.get("path")
if stored_path and os.path.exists(stored_path) and self._inside_upload_dir(stored_path):
existing_key = key
existing_file = info
break
stale_keys.append(key)
if stale_keys:
for key in stale_keys:
existing_files.pop(key, None)
try:
self._atomic_write_json(uploads_db_path, existing_files)
logger.info("Removed %d stale upload index entries for missing duplicates", len(stale_keys))
except Exception as e:
logger.warning(f"Failed to remove stale upload index entries: {e}")
if existing_file:
logger.info(f"Duplicate file upload detected: {original_filename} -> {existing_file['id']}")

View File

@@ -339,6 +339,37 @@ def test_smoke_duplicate_upload(tmp_path):
assert len(final) == 1, f"Duplicate upload should not add a new row, got {len(final)}"
def test_duplicate_upload_ignores_stale_missing_file(tmp_path):
    """Re-uploading bytes whose stored file is gone must yield a fresh upload.

    The first upload leaves a row in uploads.json; its file is then deleted
    from disk (as a cleanup pass would do). A second upload of identical
    content by the same owner must not be reported as a duplicate of that
    stale row, must get a new id and an existing file on disk, and the
    stale row must no longer appear in the persisted index.
    """
    handler = _make_handler(tmp_path)
    # NOTE(review): presumably raised so two back-to-back uploads are never
    # rate-limited -- confirm against the handler's limiter.
    handler.upload_rate_limit = 100
    payload = b"same-content-after-cleanup"

    def upload_once():
        # A new BytesIO per call so each upload reads the payload from the start.
        incoming = SimpleNamespace(filename="cleanup.txt", file=io.BytesIO(payload))
        return handler.save_upload(incoming, "127.0.0.1", "owner_a")

    original = upload_once()
    # Remove only the stored file; the index row for it is left in place.
    os.remove(original["path"])
    replacement = upload_once()

    assert replacement.get("is_duplicate") is not True
    assert replacement["id"] != original["id"]
    assert os.path.exists(replacement["path"])

    with open(_db_path(handler), "r", encoding="utf-8") as db_file:
        index_rows = json.load(db_file)
    indexed_ids = {row.get("id") for row in index_rows.values()}

    assert original["id"] not in indexed_ids
    assert replacement["id"] in indexed_ids
def test_smoke_info_lookup_after_bak_recovery(tmp_path):
"""Smoke test: after a torn write is recovered from the ``.bak`` sibling,
``get_upload_info`` still finds the original entry by id."""