From 992866e1670acbdcbb17380b4626a09975306356 Mon Sep 17 00:00:00 2001
From: Afonso Coutinho
Date: Wed, 3 Jun 2026 05:28:38 +0100
Subject: [PATCH] fix: document library language facet undercounts text documents (#1758)

---
Review notes (free text after the three-dash line; not part of the commit
message):

* This copy of the patch had lost its line breaks and indentation. The layout
  below was rebuilt from the hunk header counts and from Python syntax; no
  code token was changed. The indentation of the context lines in the second
  hunk (8 spaces) is an assumption - confirm it against
  routes/document_routes.py, or apply with `git apply --ignore-whitespace`.
* The From: header carries no e-mail address in this copy - presumably it was
  stripped in transit; restore it before running `git am`.
* The first hunk inserts _aggregate_language_facets between `logger = ...`
  and `from routes.document_helpers import (`, i.e. ahead of later
  module-level imports. Consider moving the helper below the import section.
* `lang or "text"` folds every falsy language into "text", including the
  empty string. The test docstring says the filter matches only NULL and
  "text", so if empty-string languages can be stored the facet would
  over-count. NOTE(review): confirm whether "" can occur; if it can, key on
  `lang is None` instead.

 routes/document_routes.py                     | 17 ++++++++++-
 tests/test_document_library_language_facet.py | 28 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_document_library_language_facet.py

diff --git a/routes/document_routes.py b/routes/document_routes.py
index baf84b2..5625df8 100644
--- a/routes/document_routes.py
+++ b/routes/document_routes.py
@@ -15,6 +15,21 @@ from src.auth_helpers import get_current_user
 
 logger = logging.getLogger(__name__)
 
+def _aggregate_language_facets(lang_rows):
+    """Sum document counts per display language for the library facet.
+
+    NULL-language and explicit "text" rows share the "text" bucket (the
+    language filter treats them as one), so they must be ADDED. The old dict
+    comprehension keyed both to "text", silently overwriting one group and
+    undercounting the facet versus what the filter actually returns.
+    """
+    out = {}
+    for lang, cnt in lang_rows:
+        key = lang or "text"
+        out[key] = out.get(key, 0) + cnt
+    return out
+
+
 
 from routes.document_helpers import (
     DocumentCreate, DocumentUpdate, DocumentPatch,
@@ -258,7 +273,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
         )
         lang_q = _owner_session_filter(lang_q, user)
         lang_rows = lang_q.group_by(Document.language).all()
-        languages = {lang or "text": cnt for lang, cnt in lang_rows}
+        languages = _aggregate_language_facets(lang_rows)
 
         # Session count (owner-filtered)
         sc_q = (
diff --git a/tests/test_document_library_language_facet.py b/tests/test_document_library_language_facet.py
new file mode 100644
index 0000000..ee23eb4
--- /dev/null
+++ b/tests/test_document_library_language_facet.py
@@ -0,0 +1,28 @@
+"""Library language facet must SUM NULL-language and "text" docs.
+
+documents_library built the facet with {lang or "text": cnt ...}, so a
+NULL-language row and an explicit "text" row both keyed "text" and one
+silently overwrote the other. The language FILTER treats NULL and "text"
+as a single bucket ((language == None) | (language == "text")), so the
+facet count must add them, otherwise clicking the facet returns more docs
+than the count promised.
+"""
+from routes.document_routes import _aggregate_language_facets
+
+
+def test_null_and_text_are_summed():
+    rows = [(None, 3), ("text", 2), ("python", 5)]
+    assert _aggregate_language_facets(rows) == {"text": 5, "python": 5}
+
+
+def test_only_null():
+    assert _aggregate_language_facets([(None, 4)]) == {"text": 4}
+
+
+def test_distinct_languages_preserved():
+    rows = [("python", 2), ("javascript", 7), ("text", 1)]
+    assert _aggregate_language_facets(rows) == {"python": 2, "javascript": 7, "text": 1}
+
+
+def test_empty():
+    assert _aggregate_language_facets([]) == {}