diff --git a/services/search/content.py b/services/search/content.py
index 57385ec..0d1b762 100644
--- a/services/search/content.py
+++ b/services/search/content.py
@@ -385,7 +385,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
 
 def extract_quotes(text: str) -> List[str]:
     """Return quoted excerpts that are at least 15 characters long."""
-    return [m.group(1).strip() for m in re.finditer(r'["\']([^"\']{15,}?)["\']', text)]
+    # Backreference the opening quote so the closing quote must match it —
+    # otherwise `"text'` (open double, close single) is treated as a quote.
+    return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
 
 
 def extract_statistics(text: str) -> List[str]:
diff --git a/src/search/content.py b/src/search/content.py
index 9711a03..d449b66 100644
--- a/src/search/content.py
+++ b/src/search/content.py
@@ -390,7 +390,9 @@ def get_tldr(text: str, max_sentences: int = 3) -> str:
 
 def extract_quotes(text: str) -> List[str]:
     """Return quoted excerpts that are at least 15 characters long."""
-    return [m.group(1).strip() for m in re.finditer(r'["\']([^"\']{15,}?)["\']', text)]
+    # Backreference the opening quote so the closing quote must match it —
+    # otherwise `"text'` (open double, close single) is treated as a quote.
+    return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
 
 
 def extract_statistics(text: str) -> List[str]:
diff --git a/tests/test_extract_quotes.py b/tests/test_extract_quotes.py
new file mode 100644
index 0000000..a418336
--- /dev/null
+++ b/tests/test_extract_quotes.py
@@ -0,0 +1,31 @@
+"""Tests for extract_quotes (src/search/content.py)."""
+import pytest
+
+pytest.importorskip("bs4")  # content.py imports BeautifulSoup at module load
+
+from src.search.content import extract_quotes
+
+
+def test_matched_double_quotes():
+    assert extract_quotes('She said "this is a proper long quote" today') == [
+        "this is a proper long quote"
+    ]
+
+
+def test_matched_single_quotes():
+    assert extract_quotes("He wrote 'another sufficiently long quote' here") == [
+        "another sufficiently long quote"
+    ]
+
+
+def test_mismatched_quotes_are_not_extracted():
+    # Regression: `"text'` (open double, close single) used to be accepted
+    # because the closing quote wasn't required to match the opening one.
+    # The span between the quotes must be >= 15 chars, or the old pattern
+    # rejects it too and this test cannot catch a regression.
+    assert extract_quotes("""He said "this opener is closed by an apostrophe' today""") == []
+    assert extract_quotes("""apostrophe d'accord then a "dangling long opener""") == []
+
+
+def test_short_quotes_ignored():
+    assert extract_quotes('say "too short" please') == []