def _extract_og_image(soup: BeautifulSoup) -> str:
    """Extract the best representative image URL from meta tags.

    Candidates are examined in priority order: the Open Graph image
    properties (``og:image``, ``og:image:url``, ``og:image:secure_url``),
    then ``twitter:image``, then ``thumbnail``.  Every matching ``<meta>``
    tag is inspected, so an empty or unusable first tag does not hide a
    usable later one with the same selector.

    Only absolute http(s) URLs are returned -- relative paths and data URIs
    are skipped, as are SVG and ICO files.  The scheme and the file extension
    are compared case-insensitively, and the extension is judged on the part
    of the URL before any query string or fragment, so ``logo.svg?v=2`` is
    still recognised as an SVG.

    Args:
        soup: Parsed document whose ``<meta>`` tags are searched.

    Returns:
        The first acceptable image URL with surrounding whitespace removed,
        or ``""`` when no candidate qualifies.
    """
    # (attribute name, attribute value) pairs, highest priority first.
    selectors = (
        ("property", "og:image"),
        ("property", "og:image:url"),
        ("property", "og:image:secure_url"),
        ("name", "twitter:image"),
        ("name", "thumbnail"),
    )
    for attr_name, attr_value in selectors:
        for tag in soup.find_all("meta", attrs={attr_name: attr_value}):
            # ``or ""`` covers a missing attribute as well as an empty one.
            url = (tag.get("content") or "").strip()
            lowered = url.lower()
            if not lowered.startswith(("https://", "http://")):
                continue
            # Drop the fragment and query string before checking the extension.
            resource = lowered.partition("#")[0].partition("?")[0]
            if resource.endswith((".svg", ".ico")):
                continue
            return url
    return ""