fix: extract_quotes accepts mismatched opening/closing quotes (#1113)

* fix: only extract quotes whose closing quote matches the opening one

* fix: same mismatched-quote bug in the services search copy

* test: extract_quotes requires matching open/close quotes
This commit is contained in:
Afonso Coutinho
2026-06-02 14:34:52 +01:00
committed by GitHub
parent 5236a62de1
commit 2b2943a7b7
3 changed files with 34 additions and 2 deletions

View File

@@ -385,7 +385,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
def extract_quotes(text: str) -> List[str]:
    """Return quoted excerpts that are at least 15 characters long.

    An excerpt is a run of 15 or more characters, containing neither ``"``
    nor ``'``, enclosed by a pair of identical quote characters. The length
    is measured before surrounding whitespace is stripped from the excerpt.

    Args:
        text: The text to scan for quoted excerpts.

    Returns:
        The excerpts in order of appearance, without their quote characters.
    """
    # Backreference the opening quote so the closing quote must match it —
    # otherwise `"text'` (open double, close single) is treated as a quote.
    pattern = r'(["\'])([^"\']{15,}?)\1'
    # Group 1 is the quote character itself; group 2 is the quoted content.
    return [m.group(2).strip() for m in re.finditer(pattern, text)]
def extract_statistics(text: str) -> List[str]:

View File

@@ -390,7 +390,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
def extract_quotes(text: str) -> List[str]:
    """Return quoted excerpts that are at least 15 characters long.

    Only spans whose opening and closing quote characters are identical
    (both ``"`` or both ``'``) are extracted, and the quoted content may not
    contain either quote character. The 15-character minimum applies to the
    content before its surrounding whitespace is stripped.

    Args:
        text: The text to scan for quoted excerpts.

    Returns:
        The excerpts in order of appearance, without their quote characters.
    """
    # Backreference the opening quote so the closing quote must match it —
    # otherwise `"text'` (open double, close single) is treated as a quote.
    matches = re.finditer(r'(["\'])([^"\']{15,}?)\1', text)
    # Group 2 holds the content; group 1 is only the captured quote character.
    return [match.group(2).strip() for match in matches]
def extract_statistics(text: str) -> List[str]:

View File

@@ -0,0 +1,28 @@
"""Tests for extract_quotes (src/search/content.py)."""
import pytest
pytest.importorskip("bs4") # content.py imports BeautifulSoup at module load
from src.search.content import extract_quotes
def test_matched_double_quotes():
    """A long span wrapped in double quotes is returned without the quotes."""
    sentence = 'She said "this is a proper long quote" today'
    expected = ["this is a proper long quote"]
    assert extract_quotes(sentence) == expected
def test_matched_single_quotes():
    """A long span wrapped in single quotes is returned without the quotes."""
    extracted = extract_quotes("He wrote 'another sufficiently long quote' here")
    assert extracted == ["another sufficiently long quote"]
def test_mismatched_quotes_are_not_extracted():
    """Spans opened by one quote character and closed by the other are skipped."""
    # Regression: `"text'` (open double, close single) used to be accepted
    # because the closing quote wasn't required to match the opening one.
    # Each span below is well over 15 characters, so the length filter
    # cannot be what rejects it — only the open/close mismatch can.
    assert extract_quotes("""He said "this is a mismatched quote' and left""") == []
    assert extract_quotes("""He said 'this is a mismatched quote" and left""") == []
    # An apostrophe followed by an unterminated double-quoted opener also
    # yields nothing: the text after the apostrophe is under 15 characters
    # and the double quote never closes.
    assert extract_quotes("""apostrophe d'accord then a "dangling long opener""") == []
def test_short_quotes_ignored():
    """A quoted span below the minimum length produces no results."""
    found = extract_quotes('say "too short" please')
    assert found == []