diff --git a/services/search/content.py b/services/search/content.py index 7702937..57385ec 100644 --- a/services/search/content.py +++ b/services/search/content.py @@ -1,5 +1,6 @@ """Webpage content fetching with caching, PDF extraction, and summarization helpers.""" +import copy import io import ipaddress import json @@ -115,6 +116,28 @@ def _extract_meta(soup: BeautifulSoup) -> dict: return {"description": description, "keywords": keywords} +def _extract_og_image(soup: BeautifulSoup) -> str: + """Extract the best representative image URL from meta tags. + + Only returns absolute http(s) URLs -- skips relative paths and data URIs. + """ + candidates = [] + for prop in ("og:image", "og:image:url", "og:image:secure_url"): + tag = soup.find("meta", attrs={"property": prop}) + if tag and tag.get("content", "").strip(): + candidates.append(tag["content"].strip()) + tag = soup.find("meta", attrs={"name": "twitter:image"}) + if tag and tag.get("content", "").strip(): + candidates.append(tag["content"].strip()) + tag = soup.find("meta", attrs={"name": "thumbnail"}) + if tag and tag.get("content", "").strip(): + candidates.append(tag["content"].strip()) + for url in candidates: + if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")): + return url + return "" + + def _extract_lists(soup: BeautifulSoup) -> List[List[str]]: """Return a list of lists, each inner list representing a