fix: document tidy crashes on a duplicate with NULL timestamps (#1772)
This commit is contained in:
@@ -6,6 +6,7 @@ Reusable document actions callable from both REST routes and the task scheduler.
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -140,7 +141,20 @@ async def run_document_tidy(owner: str) -> str:
|
|||||||
# Keep the most complete (longest real content), then most recent.
|
# Keep the most complete (longest real content), then most recent.
|
||||||
def _updated(d):
|
def _updated(d):
|
||||||
return d.updated_at or d.created_at
|
return d.updated_at or d.created_at
|
||||||
members.sort(key=lambda d: (_real_len(d.current_content), _updated(d)), reverse=True)
|
# Sort key must be total-order safe: a document with both
|
||||||
|
# updated_at and created_at NULL would otherwise make Python
|
||||||
|
# compare None against a datetime on a real-length tie, raising
|
||||||
|
# TypeError and aborting the whole tidy run. Rank "has a
|
||||||
|
# timestamp" before the timestamp itself so a None is never
|
||||||
|
# compared against a datetime.
|
||||||
|
members.sort(
|
||||||
|
key=lambda d: (
|
||||||
|
_real_len(d.current_content),
|
||||||
|
_updated(d) is not None,
|
||||||
|
_updated(d) or datetime.min,
|
||||||
|
),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
keeper = members[0]
|
keeper = members[0]
|
||||||
kept += 1
|
kept += 1
|
||||||
dupes = members[1:]
|
dupes = members[1:]
|
||||||
|
|||||||
60
tests/test_document_tidy_null_timestamp.py
Normal file
60
tests/test_document_tidy_null_timestamp.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""run_document_tidy must not crash when a duplicate has NULL timestamps.
|
||||||
|
|
||||||
|
The duplicate-keeper sort used key=(real_len, updated_at or created_at). When
|
||||||
|
two duplicates tie on real length and one has both timestamps NULL, Python
|
||||||
|
compared None against a datetime and raised TypeError, aborting the entire
|
||||||
|
tidy run. The sort key is now total-order safe.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
from sqlalchemy.pool import NullPool
|
||||||
|
|
||||||
|
import core.database as cdb
|
||||||
|
from core.database import Document
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def db_factory(monkeypatch):
|
||||||
|
tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
|
||||||
|
engine = create_engine(f"sqlite:///{tmp.name}", connect_args={"check_same_thread": False}, poolclass=NullPool)
|
||||||
|
cdb.Base.metadata.create_all(engine)
|
||||||
|
TS = sessionmaker(bind=engine, autoflush=False, autocommit=False)
|
||||||
|
monkeypatch.setattr(cdb, "SessionLocal", TS)
|
||||||
|
return TS
|
||||||
|
|
||||||
|
|
||||||
|
def test_tidy_survives_duplicate_with_null_timestamps(db_factory):
|
||||||
|
content = "This is a real document body long enough to survive junk rules."
|
||||||
|
db = db_factory()
|
||||||
|
try:
|
||||||
|
# Same title + content => same dedup group, equal real length.
|
||||||
|
db.add(Document(id=str(uuid.uuid4()), owner="alice", title="My Report",
|
||||||
|
current_content=content, updated_at=None, created_at=None))
|
||||||
|
db.add(Document(id=str(uuid.uuid4()), owner="alice", title="My Report",
|
||||||
|
current_content=content,
|
||||||
|
updated_at=datetime(2026, 6, 1, 9, 0), created_at=datetime(2026, 6, 1, 9, 0)))
|
||||||
|
db.commit()
|
||||||
|
finally:
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
# Old code raised TypeError (None vs datetime) and aborted.
|
||||||
|
result = asyncio.run(run_tidy())
|
||||||
|
assert isinstance(result, str)
|
||||||
|
|
||||||
|
db = db_factory()
|
||||||
|
try:
|
||||||
|
remaining = db.query(Document).filter(Document.owner == "alice").count()
|
||||||
|
assert remaining == 1 # one duplicate kept, the other removed
|
||||||
|
finally:
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def run_tidy():
|
||||||
|
from src.document_actions import run_document_tidy
|
||||||
|
return await run_document_tidy("alice")
|
||||||
Reference in New Issue
Block a user