"""Webpage content fetching with caching, PDF extraction, and summarization helpers.""" import copy import io import ipaddress import json import os import re import logging import socket from datetime import datetime, timedelta from typing import List from urllib.parse import urljoin, urlparse import httpx from bs4 import BeautifulSoup from .analytics import RateLimitError, error_logger from .cache import ( CONTENT_CACHE_DIR, content_cache_index, generate_cache_key, cleanup_cache, ) logger = logging.getLogger(__name__) _PRIVATE_NETWORKS = ( ipaddress.ip_network("0.0.0.0/8"), ipaddress.ip_network("10.0.0.0/8"), ipaddress.ip_network("127.0.0.0/8"), ipaddress.ip_network("169.254.0.0/16"), ipaddress.ip_network("172.16.0.0/12"), ipaddress.ip_network("192.168.0.0/16"), ipaddress.ip_network("::1/128"), ipaddress.ip_network("fc00::/7"), ipaddress.ip_network("fe80::/10"), ) def _is_private_address(addr: ipaddress._BaseAddress) -> bool: return any(addr in net for net in _PRIVATE_NETWORKS) or addr.is_private or addr.is_loopback def _resolve_hostname_ips(hostname: str) -> List[ipaddress._BaseAddress]: ips = [] for family, _, _, _, sockaddr in socket.getaddrinfo(hostname, None): if family in (socket.AF_INET, socket.AF_INET6): ips.append(ipaddress.ip_address(sockaddr[0])) return ips def _public_http_url(url: str) -> bool: parsed = urlparse(url) if parsed.scheme not in ("http", "https") or not parsed.hostname: return False host = parsed.hostname.strip().lower() if host in ("localhost", "metadata.google.internal", "metadata"): return False try: return not _is_private_address(ipaddress.ip_address(host)) except ValueError: pass try: ips = _resolve_hostname_ips(host) except OSError: return False # Fail closed: a hostname that resolves to nothing is treated as # non-public (an empty all(...) would otherwise return True). return bool(ips) and all(not _is_private_address(ip) for ip in ips) def _get_public_url(url: str, *, headers: dict, timeout: int) -> httpx.Response: if not _public_http_url(url): raise httpx.RequestError(f"Blocked non-public URL: {url}") current = url with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as client: for _ in range(8): response = client.get(current) if response.status_code not in (301, 302, 303, 307, 308): return response location = response.headers.get("location") if not location: return response current = urljoin(current, location) if not _public_http_url(current): raise httpx.RequestError(f"Blocked redirect to non-public URL: {current}") raise httpx.RequestError("Too many redirects") # PDF extraction (optional dependency) try: from pdfminer.high_level import extract_text as pdf_extract_text except ImportError: pdf_extract_text = None # type: ignore # ---------------------------------------------------------------------- # HTML extraction helpers # ---------------------------------------------------------------------- def _extract_meta(soup: BeautifulSoup) -> dict: """Pull meta description and keywords if present.""" description = "" keywords = "" desc_tag = soup.find("meta", attrs={"name": re.compile("description", re.I)}) if desc_tag and desc_tag.get("content"): description = desc_tag["content"].strip() kw_tag = soup.find("meta", attrs={"name": re.compile("keywords", re.I)}) if kw_tag and kw_tag.get("content"): keywords = kw_tag["content"].strip() return {"description": description, "keywords": keywords} def _extract_og_image(soup: BeautifulSoup) -> str: """Extract the best representative image URL from meta tags. Only returns absolute http(s) URLs — skips relative paths and data URIs. """ candidates = [] # Open Graph image (most reliable) for prop in ("og:image", "og:image:url", "og:image:secure_url"): tag = soup.find("meta", attrs={"property": prop}) if tag and tag.get("content", "").strip(): candidates.append(tag["content"].strip()) # Twitter card image tag = soup.find("meta", attrs={"name": "twitter:image"}) if tag and tag.get("content", "").strip(): candidates.append(tag["content"].strip()) # Thumbnail meta tag = soup.find("meta", attrs={"name": "thumbnail"}) if tag and tag.get("content", "").strip(): candidates.append(tag["content"].strip()) # Return first absolute http(s) URL for url in candidates: if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")): return url return "" def _extract_lists(soup: BeautifulSoup) -> List[List[str]]: """Return a list of lists, each inner list representing a