fix: source thumbnails dropped for http-only og:image URLs (#667)

* fix: accept http (not just https) og:image URLs for source thumbnails
* test: og:image extraction accepts http and skips relative/svg
2026-06-02 03:41:33 +01:00
parent c303a29670
commit 9d8eebfa63
2 changed files with 34 additions and 2 deletions
--- a/src/search/content.py
+++ b/src/search/content.py
@@ -130,9 +130,9 @@ def _extract_og_image(soup: BeautifulSoup) -> str:
    tag = soup.find("meta", attrs={"name": "thumbnail"})
    if tag and tag.get("content", "").strip():
        candidates.append(tag["content"].strip())
-    # Return first absolute https URL
+    # Return first absolute http(s) URL
    for url in candidates:
-        if url.startswith("https://") and not url.endswith((".svg", ".ico")):
+        if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")):
            return url
    return ""
--- a/tests/test_og_image_extraction.py
+++ b/tests/test_og_image_extraction.py
@@ -0,0 +1,32 @@
 """Tests for og:image extraction (src/search/content.py)."""
 import pytest
 pytest.importorskip("bs4")
 from bs4 import BeautifulSoup
 from src.search.content import _extract_og_image
 def _soup(html: str) -> BeautifulSoup:
    """Parse *html* with the ``html.parser`` backend and return the soup."""
    return BeautifulSoup(html, "html.parser")
 def test_accepts_http_og_image():
    """A plain-http ``og:image`` URL is returned unchanged."""
    # Regression: only https URLs were returned, so plain-http og:image
    # (still common) yielded no thumbnail despite the docstring promising
    # "http(s)".
    html = '<meta property="og:image" content="http://example.com/cover.jpg">'
    assert _extract_og_image(_soup(html)) == "http://example.com/cover.jpg"
 def test_still_accepts_https_og_image():
    """An https ``og:image`` URL is still returned unchanged."""
    html = '<meta property="og:image" content="https://example.com/cover.png">'
    assert _extract_og_image(_soup(html)) == "https://example.com/cover.png"
 def test_skips_relative_and_svg():
    """Expect ``""`` when no usable image URL is offered.

    The page carries only a relative ``og:image`` path and a
    ``twitter:image`` URL ending in ``.svg``; the extractor must return
    the empty string for this input.
    """
    html = (
        '<meta property="og:image" content="/relative/logo.png">'
        '<meta name="twitter:image" content="https://example.com/icon.svg">'
    )
    assert _extract_og_image(_soup(html)) == ""