From 2e2da2aefe626c8f575468f99eb448d76f73a0e7 Mon Sep 17 00:00:00 2001
From: Afonso Coutinho <116525378+afonsopc@users.noreply.github.com>
Date: Tue, 2 Jun 2026 14:35:30 +0100
Subject: [PATCH] fix: extract_statistics drops large numbers and trailing % signs (#1153)

* fix: extract_statistics misses comma-less numbers and drops trailing %

* fix: same extract_statistics number/percent bug in services copy

* test: extract_statistics captures full numbers and percent signs

* fix: keep the word boundary after alphabetic units so that a longer
  word such as "percentage" is not truncated to "percent"
---
 services/search/content.py       |  5 ++++-
 src/search/content.py            |  5 ++++-
 tests/test_extract_statistics.py | 31 +++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_extract_statistics.py

diff --git a/services/search/content.py b/services/search/content.py
index 0d1b762..290dc35 100644
--- a/services/search/content.py
+++ b/services/search/content.py
@@ -392,8 +392,11 @@ def extract_quotes(text: str) -> List[str]:
 
 def extract_statistics(text: str) -> List[str]:
     """Find numbers, percentages, dates and simple measurements."""
+    # Match a comma-grouped number (1,000,000) OR a plain digit run (50000);
+    # the old `\d{1,3}(?:,\d{3})*` could not span a longer comma-less run.
+    # `\b` follows word units only: after `%`/`‰` it would drop the sign.
     pattern = re.compile(
-        r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?\b",
+        r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|‰|(?:percent|per cent|[a-zA-Z]+)\b)?",
         re.IGNORECASE,
     )
     return [m.group(0).strip() for m in pattern.finditer(text)]
diff --git a/src/search/content.py b/src/search/content.py
index d449b66..a7eddb4 100644
--- a/src/search/content.py
+++ b/src/search/content.py
@@ -397,8 +397,11 @@ def extract_quotes(text: str) -> List[str]:
 
 def extract_statistics(text: str) -> List[str]:
     """Find numbers, percentages, dates and simple measurements."""
+    # Match a comma-grouped number (1,000,000) OR a plain digit run (50000);
+    # the old `\d{1,3}(?:,\d{3})*` could not span a longer comma-less run.
+    # `\b` follows word units only: after `%`/`‰` it would drop the sign.
     pattern = re.compile(
-        r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?\b",
+        r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|‰|(?:percent|per cent|[a-zA-Z]+)\b)?",
         re.IGNORECASE,
     )
     return [m.group(0).strip() for m in pattern.finditer(text)]
diff --git a/tests/test_extract_statistics.py b/tests/test_extract_statistics.py
new file mode 100644
index 0000000..c567477
--- /dev/null
+++ b/tests/test_extract_statistics.py
@@ -0,0 +1,31 @@
+"""Tests for extract_statistics (src/search/content.py)."""
+import pytest
+
+pytest.importorskip("bs4")  # content.py imports BeautifulSoup at module load
+
+from src.search.content import extract_statistics
+
+
+def test_captures_comma_less_large_number():
+    # Regression: `\d{1,3}(?:,\d{3})*` matched only the first 3 digits of a
+    # comma-less number, so "50000" was never captured whole.
+    assert any(s.startswith("50000") for s in extract_statistics("about 50000 users"))
+
+
+def test_keeps_percent_sign():
+    # Regression: a trailing `\b` after the optional unit dropped the "%".
+    assert "12%" in extract_statistics("conversion rose to 12% this quarter")
+
+
+def test_comma_grouped_number():
+    assert any(s.startswith("1,000,000") for s in extract_statistics("revenue of 1,000,000 dollars"))
+
+
+def test_four_digit_year_captured():
+    assert any("2024" in s for s in extract_statistics("released in 2024"))
+
+
+def test_word_unit_not_truncated():
+    # The unit alternation must not stop at the "percent" prefix of a
+    # longer word such as "percentage".
+    assert "5 percentage" in extract_statistics("up 5 percentage points")