fix: extract_quotes accepts mismatched opening/closing quotes (#1113)
* fix: only extract quotes whose closing quote matches the opening one
* fix: same mismatched-quote bug in the services search copy
* test: extract_quotes requires matching open/close quotes
This commit is contained in:
@@ -385,7 +385,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
|
||||
|
||||
def extract_quotes(text: str) -> List[str]:
    """Return quoted excerpts that are at least 15 characters long.

    An excerpt is a run of 15 or more characters containing neither ``"``
    nor ``'``, enclosed by a pair of identical quote characters. The length
    is measured between the quotes, before whitespace is stripped.

    Args:
        text: The text to scan for quoted excerpts.

    Returns:
        The excerpts in order of appearance, without the enclosing quotes
        and with leading/trailing whitespace stripped. Empty if none match.
    """
    # Backreference the opening quote so the closing quote must match it —
    # otherwise `"text'` (open double, close single) is treated as a quote.
    return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
|
||||
|
||||
|
||||
def extract_statistics(text: str) -> List[str]:
|
||||
|
||||
@@ -390,7 +390,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
|
||||
|
||||
def extract_quotes(text: str) -> List[str]:
    """Return quoted excerpts that are at least 15 characters long.

    Only text enclosed by two identical quote characters (both ``"`` or
    both ``'``) counts as a quote. The enclosed text must be 15 or more
    characters, measured before stripping, and may not itself contain
    either quote character.

    Args:
        text: The text to scan for quoted excerpts.

    Returns:
        The matched excerpts in order of appearance, with the enclosing
        quotes removed and surrounding whitespace stripped.
    """
    # Backreference the opening quote so the closing quote must match it —
    # otherwise `"text'` (open double, close single) is treated as a quote.
    return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
|
||||
|
||||
|
||||
def extract_statistics(text: str) -> List[str]:
|
||||
|
||||
28
tests/test_extract_quotes.py
Normal file
28
tests/test_extract_quotes.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Tests for extract_quotes (src/search/content.py)."""
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("bs4") # content.py imports BeautifulSoup at module load
|
||||
|
||||
from src.search.content import extract_quotes
|
||||
|
||||
|
||||
def test_matched_double_quotes():
    """A long excerpt wrapped in double quotes is returned without the quotes."""
    sentence = 'She said "this is a proper long quote" today'
    expected = ["this is a proper long quote"]
    assert extract_quotes(sentence) == expected
|
||||
|
||||
|
||||
def test_matched_single_quotes():
    """A long excerpt wrapped in single quotes is returned without the quotes."""
    sentence = "He wrote 'another sufficiently long quote' here"
    expected = ["another sufficiently long quote"]
    assert extract_quotes(sentence) == expected
|
||||
|
||||
|
||||
def test_mismatched_quotes_are_not_extracted():
    """Text opened with one quote character and closed with the other is not a quote."""
    # Regression: `"text'` (open double, close single) used to be accepted
    # because the closing quote wasn't required to match the opening one.
    # The enclosed text is 15+ characters, so a pattern that accepts any
    # closing quote would extract it; only the matching-quote rule rejects it.
    assert extract_quotes("""He said "this opens double but closes single' today""") == []
    # An apostrophe followed by an unterminated double-quote opener must not
    # pair up into a quote either.
    assert extract_quotes("""apostrophe d'accord then a "dangling long opener""") == []
|
||||
|
||||
|
||||
def test_short_quotes_ignored():
    """Quoted text shorter than the minimum length yields no excerpts."""
    found = extract_quotes('say "too short" please')
    assert found == []
|
||||
Reference in New Issue
Block a user