fix: extract_statistics drops large numbers and trailing % signs (#1153)

* fix: extract_statistics misses comma-less numbers and drops trailing %

* fix: same extract_statistics number/percent bug in services copy

* test: extract_statistics captures full numbers and percent signs
This commit is contained in:
Afonso Coutinho
2026-06-02 14:35:30 +01:00
committed by GitHub
parent 2b2943a7b7
commit 2e2da2aefe
3 changed files with 33 additions and 2 deletions

View File

@@ -392,8 +392,11 @@ def extract_quotes(text: str) -> List[str]:
def extract_statistics(text: str) -> List[str]:
"""Find numbers, percentages, dates and simple measurements."""
# Match a comma-grouped number (1,000,000) OR a plain digit run (50000) —
# the old `\d{1,3}(?:,\d{3})*` matched only the first 3 digits of a
# comma-less number, and the trailing `\b` dropped a closing `%`.
pattern = re.compile(
r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?\b",
r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?",
re.IGNORECASE,
)
return [m.group(0).strip() for m in pattern.finditer(text)]