From 9d8eebfa6397f0838937e1d6bdedcd2e484332fe Mon Sep 17 00:00:00 2001
From: Afonso Coutinho <116525378+afonsopc@users.noreply.github.com>
Date: Tue, 2 Jun 2026 03:41:33 +0100
Subject: [PATCH] fix: source thumbnails dropped for http-only og:image URLs (#667)

* fix: accept http (not just https) og:image URLs for source thumbnails

* test: og:image extraction accepts http and skips relative/svg
---
NOTE(review): this patch had been flattened onto two lines and the HTML string
literals in tests/test_og_image_extraction.py had lost their <meta> tags. The
line structure was rebuilt from the hunk headers and diffstat. The <meta>
markup in the tests is a reconstruction (og:image per the test names, and
name="thumbnail" per the lookup visible in the content.py hunk) and should be
confirmed against the upstream commit. The blob id on the new file's "index"
line is the original one and may not match the reconstructed content.

 src/search/content.py             |  4 ++--
 tests/test_og_image_extraction.py | 32 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_og_image_extraction.py

diff --git a/src/search/content.py b/src/search/content.py
index 1c469e8..9711a03 100644
--- a/src/search/content.py
+++ b/src/search/content.py
@@ -130,9 +130,9 @@ def _extract_og_image(soup: BeautifulSoup) -> str:
     tag = soup.find("meta", attrs={"name": "thumbnail"})
     if tag and tag.get("content", "").strip():
         candidates.append(tag["content"].strip())
-    # Return first absolute https URL
+    # Return first absolute http(s) URL
     for url in candidates:
-        if url.startswith("https://") and not url.endswith((".svg", ".ico")):
+        if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")):
             return url
     return ""
 
diff --git a/tests/test_og_image_extraction.py b/tests/test_og_image_extraction.py
new file mode 100644
index 0000000..164d51a
--- /dev/null
+++ b/tests/test_og_image_extraction.py
@@ -0,0 +1,32 @@
+"""Tests for og:image extraction (src/search/content.py)."""
+import pytest
+
+pytest.importorskip("bs4")
+from bs4 import BeautifulSoup
+
+from src.search.content import _extract_og_image
+
+
+def _soup(html: str) -> BeautifulSoup:
+    return BeautifulSoup(html, "html.parser")
+
+
+def test_accepts_http_og_image():
+    # Regression: only https URLs were returned, so plain-http og:image
+    # (still common) yielded no thumbnail despite the docstring promising
+    # "http(s)".
+    html = '<meta property="og:image" content="http://example.com/cover.jpg">'
+    assert _extract_og_image(_soup(html)) == "http://example.com/cover.jpg"
+
+
+def test_still_accepts_https_og_image():
+    html = '<meta property="og:image" content="https://example.com/cover.png">'
+    assert _extract_og_image(_soup(html)) == "https://example.com/cover.png"
+
+
+def test_skips_relative_and_svg():
+    html = (
+        '<meta property="og:image" content="/images/cover.jpg">'
+        '<meta name="thumbnail" content="https://example.com/logo.svg">'
+    )
+    assert _extract_og_image(_soup(html)) == ""