fix: extract_quotes accepts mismatched opening/closing quotes (#1113)
* fix: only extract quotes whose closing quote matches the opening one
* fix: same mismatched-quote bug in the services search copy
* test: extract_quotes requires matching open/close quotes
This commit is contained in:
@@ -385,7 +385,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
|
|||||||
|
|
||||||
def extract_quotes(text: str) -> List[str]:
|
def extract_quotes(text: str) -> List[str]:
|
||||||
"""Return quoted excerpts that are at least 15 characters long."""
|
"""Return quoted excerpts that are at least 15 characters long."""
|
||||||
return [m.group(1).strip() for m in re.finditer(r'["\']([^"\']{15,}?)["\']', text)]
|
# Backreference the opening quote so the closing quote must match it —
|
||||||
|
# otherwise `"text'` (open double, close single) is treated as a quote.
|
||||||
|
return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
|
||||||
|
|
||||||
|
|
||||||
def extract_statistics(text: str) -> List[str]:
|
def extract_statistics(text: str) -> List[str]:
|
||||||
|
|||||||
@@ -390,7 +390,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
|
|||||||
|
|
||||||
def extract_quotes(text: str) -> List[str]:
|
def extract_quotes(text: str) -> List[str]:
|
||||||
"""Return quoted excerpts that are at least 15 characters long."""
|
"""Return quoted excerpts that are at least 15 characters long."""
|
||||||
return [m.group(1).strip() for m in re.finditer(r'["\']([^"\']{15,}?)["\']', text)]
|
# Backreference the opening quote so the closing quote must match it —
|
||||||
|
# otherwise `"text'` (open double, close single) is treated as a quote.
|
||||||
|
return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
|
||||||
|
|
||||||
|
|
||||||
def extract_statistics(text: str) -> List[str]:
|
def extract_statistics(text: str) -> List[str]:
|
||||||
|
|||||||
28
tests/test_extract_quotes.py
Normal file
28
tests/test_extract_quotes.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
"""Tests for extract_quotes (src/search/content.py)."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pytest.importorskip("bs4") # content.py imports BeautifulSoup at module load
|
||||||
|
|
||||||
|
from src.search.content import extract_quotes
|
||||||
|
|
||||||
|
|
||||||
|
def test_matched_double_quotes():
    """A long excerpt wrapped in double quotes is returned without the quotes."""
    sentence = 'She said "this is a proper long quote" today'
    expected = ["this is a proper long quote"]
    assert extract_quotes(sentence) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_matched_single_quotes():
    """A long excerpt wrapped in single quotes is returned without the quotes."""
    sentence = "He wrote 'another sufficiently long quote' here"
    expected = ["another sufficiently long quote"]
    assert extract_quotes(sentence) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_mismatched_quotes_are_not_extracted():
    """Spans whose closing quote differs from the opening quote are rejected."""
    # Regression: `"text'` (open double, close single) used to be accepted
    # because the closing quote wasn't required to match the opening one.
    # The spans below are >= 15 characters and bounded by *different* quote
    # characters, so a pattern that lets either quote close the match would
    # extract them; the backreferenced pattern must not.
    assert extract_quotes("""He said "this is a dangling long opener' and left""") == []
    assert extract_quotes("""He wrote 'another dangling long opener" and left""") == []
    # An in-word apostrophe followed by an unterminated double quote must
    # also yield nothing.
    assert extract_quotes("""apostrophe d'accord then a "dangling long opener""") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_short_quotes_ignored():
    """A quoted span shorter than 15 characters yields no results."""
    found = extract_quotes('say "too short" please')
    assert found == []
|
||||||
Reference in New Issue
Block a user