From d25a860f715ccebf714fadf12da4c65a52a6e3d4 Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 05:23:01 +0100 Subject: [PATCH] fix: document tidy crashes on a duplicate with NULL timestamps (#1772) --- src/document_actions.py | 16 +++++- tests/test_document_tidy_null_timestamp.py | 60 ++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tests/test_document_tidy_null_timestamp.py diff --git a/src/document_actions.py b/src/document_actions.py index cd74b4b..4fb7af2 100644 --- a/src/document_actions.py +++ b/src/document_actions.py @@ -6,6 +6,7 @@ Reusable document actions callable from both REST routes and the task scheduler. import logging import re +from datetime import datetime logger = logging.getLogger(__name__) @@ -140,7 +141,20 @@ async def run_document_tidy(owner: str) -> str: # Keep the most complete (longest real content), then most recent. def _updated(d): return d.updated_at or d.created_at - members.sort(key=lambda d: (_real_len(d.current_content), _updated(d)), reverse=True) + # Sort key must be total-order safe: a document with both + # updated_at and created_at NULL would otherwise make Python + # compare None against a datetime on a real-length tie, raising + # TypeError and aborting the whole tidy run. Rank "has a + # timestamp" before the timestamp itself so a None is never + # compared against a datetime. + members.sort( + key=lambda d: ( + _real_len(d.current_content), + _updated(d) is not None, + _updated(d) or datetime.min, + ), + reverse=True, + ) keeper = members[0] kept += 1 dupes = members[1:] diff --git a/tests/test_document_tidy_null_timestamp.py b/tests/test_document_tidy_null_timestamp.py new file mode 100644 index 0000000..331a89d --- /dev/null +++ b/tests/test_document_tidy_null_timestamp.py @@ -0,0 +1,60 @@ +"""run_document_tidy must not crash when a duplicate has NULL timestamps. + +The duplicate-keeper sort used key=(real_len, updated_at or created_at). When +two duplicates tie on real length and one has both timestamps NULL, Python +compared None against a datetime and raised TypeError, aborting the entire +tidy run. The sort key is now total-order safe. +""" +import asyncio +import tempfile +import uuid +from datetime import datetime + +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from sqlalchemy.pool import NullPool + +import core.database as cdb +from core.database import Document + + +@pytest.fixture +def db_factory(monkeypatch): + tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + engine = create_engine(f"sqlite:///{tmp.name}", connect_args={"check_same_thread": False}, poolclass=NullPool) + cdb.Base.metadata.create_all(engine) + TS = sessionmaker(bind=engine, autoflush=False, autocommit=False) + monkeypatch.setattr(cdb, "SessionLocal", TS) + return TS + + +def test_tidy_survives_duplicate_with_null_timestamps(db_factory): + content = "This is a real document body long enough to survive junk rules." + db = db_factory() + try: + # Same title + content => same dedup group, equal real length. + db.add(Document(id=str(uuid.uuid4()), owner="alice", title="My Report", + current_content=content, updated_at=None, created_at=None)) + db.add(Document(id=str(uuid.uuid4()), owner="alice", title="My Report", + current_content=content, + updated_at=datetime(2026, 6, 1, 9, 0), created_at=datetime(2026, 6, 1, 9, 0))) + db.commit() + finally: + db.close() + + # Old code raised TypeError (None vs datetime) and aborted. + result = asyncio.run(run_tidy()) + assert isinstance(result, str) + + db = db_factory() + try: + remaining = db.query(Document).filter(Document.owner == "alice").count() + assert remaining == 1 # one duplicate kept, the other removed + finally: + db.close() + + +async def run_tidy(): + from src.document_actions import run_document_tidy + return await run_document_tidy("alice")