fix: source thumbnails dropped for http-only og:image URLs (#667)
* fix: accept http (not just https) og:image URLs for source thumbnails
* test: og:image extraction accepts http and skips relative/svg
This commit is contained in:
@@ -130,9 +130,9 @@ def _extract_og_image(soup: BeautifulSoup) -> str:
|
|||||||
tag = soup.find("meta", attrs={"name": "thumbnail"})
|
tag = soup.find("meta", attrs={"name": "thumbnail"})
|
||||||
if tag and tag.get("content", "").strip():
|
if tag and tag.get("content", "").strip():
|
||||||
candidates.append(tag["content"].strip())
|
candidates.append(tag["content"].strip())
|
||||||
# Return first absolute https URL
|
# Return first absolute http(s) URL
|
||||||
for url in candidates:
|
for url in candidates:
|
||||||
if url.startswith("https://") and not url.endswith((".svg", ".ico")):
|
if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")):
|
||||||
return url
|
return url
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|||||||
32
tests/test_og_image_extraction.py
Normal file
32
tests/test_og_image_extraction.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Tests for og:image extraction (src/search/content.py)."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pytest.importorskip("bs4")
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from src.search.content import _extract_og_image
|
||||||
|
|
||||||
|
|
||||||
|
def _soup(html: str) -> BeautifulSoup:
    """Parse *html* into a BeautifulSoup tree using the ``html.parser`` parser."""
    parsed = BeautifulSoup(html, "html.parser")
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def test_accepts_http_og_image():
    """A plain-http og:image URL is returned unchanged."""
    # Regression guard: the extractor used to return only https URLs, so
    # pages advertising a plain-http og:image (still common) produced no
    # thumbnail even though the docstring promised "http(s)".
    markup = '<meta property="og:image" content="http://example.com/cover.jpg">'
    thumbnail = _extract_og_image(_soup(markup))
    assert thumbnail == "http://example.com/cover.jpg"
|
||||||
|
|
||||||
|
|
||||||
|
def test_still_accepts_https_og_image():
    """An https og:image URL is still returned unchanged."""
    markup = '<meta property="og:image" content="https://example.com/cover.png">'
    thumbnail = _extract_og_image(_soup(markup))
    assert thumbnail == "https://example.com/cover.png"
|
||||||
|
|
||||||
|
|
||||||
|
def test_skips_relative_and_svg():
    """A relative og:image plus an .svg twitter:image yields an empty string."""
    # Neither candidate qualifies: the first is not an absolute http(s) URL,
    # and the second ends in ".svg".
    relative_og_tag = '<meta property="og:image" content="/relative/logo.png">'
    svg_twitter_tag = '<meta name="twitter:image" content="https://example.com/icon.svg">'
    thumbnail = _extract_og_image(_soup(relative_og_tag + svg_twitter_tag))
    assert thumbnail == ""
|
||||||
Reference in New Issue
Block a user