fix: extract_statistics drops large numbers and trailing % signs (#1153)
* fix: extract_statistics misses comma-less numbers and drops trailing %
* fix: same extract_statistics number/percent bug in services copy
* test: extract_statistics captures full numbers and percent signs
This commit is contained in:
@@ -392,8 +392,11 @@ def extract_quotes(text: str) -> List[str]:
|
|||||||
|
|
||||||
def extract_statistics(text: str) -> List[str]:
|
def extract_statistics(text: str) -> List[str]:
|
||||||
"""Find numbers, percentages, dates and simple measurements."""
|
"""Find numbers, percentages, dates and simple measurements."""
|
||||||
|
# Match a comma-grouped number (1,000,000) OR a plain digit run (50000) —
|
||||||
|
# the old `\d{1,3}(?:,\d{3})*` matched only the first 3 digits of a
|
||||||
|
# comma-less number, and the trailing `\b` dropped a closing `%`.
|
||||||
pattern = re.compile(
|
pattern = re.compile(
|
||||||
r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?\b",
|
r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
return [m.group(0).strip() for m in pattern.finditer(text)]
|
return [m.group(0).strip() for m in pattern.finditer(text)]
|
||||||
|
|||||||
@@ -397,8 +397,11 @@ def extract_quotes(text: str) -> List[str]:
|
|||||||
|
|
||||||
def extract_statistics(text: str) -> List[str]:
|
def extract_statistics(text: str) -> List[str]:
|
||||||
"""Find numbers, percentages, dates and simple measurements."""
|
"""Find numbers, percentages, dates and simple measurements."""
|
||||||
|
# Match a comma-grouped number (1,000,000) OR a plain digit run (50000) —
|
||||||
|
# the old `\d{1,3}(?:,\d{3})*` matched only the first 3 digits of a
|
||||||
|
# comma-less number, and the trailing `\b` dropped a closing `%`.
|
||||||
pattern = re.compile(
|
pattern = re.compile(
|
||||||
r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?\b",
|
r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
return [m.group(0).strip() for m in pattern.finditer(text)]
|
return [m.group(0).strip() for m in pattern.finditer(text)]
|
||||||
|
|||||||
25
tests/test_extract_statistics.py
Normal file
25
tests/test_extract_statistics.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
"""Tests for extract_statistics (src/search/content.py)."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pytest.importorskip("bs4") # content.py imports BeautifulSoup at module load
|
||||||
|
|
||||||
|
from src.search.content import extract_statistics
|
||||||
|
|
||||||
|
|
||||||
|
def test_captures_comma_less_large_number():
|
||||||
|
# Regression: `\d{1,3}(?:,\d{3})*` matched only the first 3 digits of a
|
||||||
|
# comma-less number, so "50000" was never captured whole.
|
||||||
|
assert any(s.startswith("50000") for s in extract_statistics("about 50000 users"))
|
||||||
|
|
||||||
|
|
||||||
|
def test_keeps_percent_sign():
|
||||||
|
# Regression: a trailing `\b` after the optional unit dropped the "%".
|
||||||
|
assert "12%" in extract_statistics("conversion rose to 12% this quarter")
|
||||||
|
|
||||||
|
|
||||||
|
def test_comma_grouped_number():
|
||||||
|
assert any(s.startswith("1,000,000") for s in extract_statistics("revenue of 1,000,000 dollars"))
|
||||||
|
|
||||||
|
|
||||||
|
def test_four_digit_year_captured():
|
||||||
|
assert any("2024" in s for s in extract_statistics("released in 2024"))
|
||||||
Reference in New Issue
Block a user