fix: source thumbnails dropped for http-only og:image URLs (#667)
* fix: accept http (not just https) og:image URLs for source thumbnails
* test: og:image extraction accepts http and skips relative/svg
This commit is contained in:
@@ -130,9 +130,9 @@ def _extract_og_image(soup: BeautifulSoup) -> str:
|
||||
tag = soup.find("meta", attrs={"name": "thumbnail"})
|
||||
if tag and tag.get("content", "").strip():
|
||||
candidates.append(tag["content"].strip())
|
||||
# Return first absolute https URL
|
||||
# Return first absolute http(s) URL
|
||||
for url in candidates:
|
||||
if url.startswith("https://") and not url.endswith((".svg", ".ico")):
|
||||
if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")):
|
||||
return url
|
||||
return ""
|
||||
|
||||
|
||||
32
tests/test_og_image_extraction.py
Normal file
32
tests/test_og_image_extraction.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Tests for og:image extraction (src/search/content.py)."""
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("bs4")
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.search.content import _extract_og_image
|
||||
|
||||
|
||||
def _soup(html: str) -> BeautifulSoup:
    """Parse an HTML snippet into a BeautifulSoup tree via ``html.parser``."""
    parsed = BeautifulSoup(html, "html.parser")
    return parsed
|
||||
|
||||
|
||||
def test_accepts_http_og_image():
    """A plain-http og:image URL is returned as the thumbnail."""
    # Regression guard: previously only https URLs were returned, so a
    # plain-http og:image (still common) produced no thumbnail even though
    # the docstring promised "http(s)".
    markup = '<meta property="og:image" content="http://example.com/cover.jpg">'
    found = _extract_og_image(_soup(markup))
    assert found == "http://example.com/cover.jpg"
|
||||
|
||||
|
||||
def test_still_accepts_https_og_image():
    """An https og:image URL continues to be returned unchanged."""
    markup = '<meta property="og:image" content="https://example.com/cover.png">'
    found = _extract_og_image(_soup(markup))
    assert found == "https://example.com/cover.png"
|
||||
|
||||
|
||||
def test_skips_relative_and_svg():
    """Relative paths and .svg candidates are rejected, yielding ""."""
    meta_tags = [
        '<meta property="og:image" content="/relative/logo.png">',
        '<meta name="twitter:image" content="https://example.com/icon.svg">',
    ]
    # Neither candidate qualifies, so the extractor falls through to "".
    found = _extract_og_image(_soup("".join(meta_tags)))
    assert found == ""
|
||||
Reference in New Issue
Block a user