"""Webpage content fetching with caching, PDF extraction, and summarization helpers.""" import copy import io import ipaddress import json import os import re import logging import socket from datetime import datetime, timedelta from typing import List from urllib.parse import urljoin, urlparse import httpx from bs4 import BeautifulSoup from .analytics import RateLimitError, error_logger from .cache import ( CONTENT_CACHE_DIR, content_cache_index, generate_cache_key, cleanup_cache, ) logger = logging.getLogger(__name__) _PRIVATE_NETWORKS = ( ipaddress.ip_network("0.0.0.0/8"), ipaddress.ip_network("10.0.0.0/8"), ipaddress.ip_network("127.0.0.0/8"), ipaddress.ip_network("169.254.0.0/16"), ipaddress.ip_network("172.16.0.0/12"), ipaddress.ip_network("192.168.0.0/16"), ipaddress.ip_network("::1/128"), ipaddress.ip_network("fc00::/7"), ipaddress.ip_network("fe80::/10"), ) def _is_private_address(addr: ipaddress._BaseAddress) -> bool: return addr.is_private or addr.is_loopback or addr.is_link_local or any(addr in net for net in _PRIVATE_NETWORKS) def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]: try: infos = socket.getaddrinfo(hostname, None) except Exception: return [] out = [] for info in infos: try: out.append(ipaddress.ip_address(info[4][0])) except Exception: continue return out def _public_http_url(url: str) -> bool: try: parsed = urlparse(url) if parsed.scheme not in ("http", "https"): return False host = (parsed.hostname or "").strip() if not host: return False lower = host.lower() if lower in ("localhost", "metadata", "metadata.google.internal"): return False if lower.endswith((".local", ".localhost", ".internal", ".lan", ".intranet")): return False try: return not _is_private_address(ipaddress.ip_address(host)) except ValueError: pass addrs = _resolve_hostname_ips(host) return bool(addrs) and not any(_is_private_address(a) for a in addrs) except Exception: return False def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5) -> httpx.Response: current = url for _ in range(max_redirects + 1): if not _public_http_url(current): raise httpx.RequestError("Blocked private/internal URL", request=httpx.Request("GET", current)) response = httpx.get(current, headers=headers, timeout=timeout, follow_redirects=False) if response.status_code not in (301, 302, 303, 307, 308): return response location = response.headers.get("location") if not location: return response current = urljoin(str(response.url), location) raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", current)) # PDF extraction (optional dependency) try: from pdfminer.high_level import extract_text as pdf_extract_text except ImportError: pdf_extract_text = None # type: ignore # ---------------------------------------------------------------------- # HTML extraction helpers # ---------------------------------------------------------------------- def _extract_meta(soup: BeautifulSoup) -> dict: """Pull meta description and keywords if present.""" description = "" keywords = "" desc_tag = soup.find("meta", attrs={"name": re.compile("description", re.I)}) if desc_tag and desc_tag.get("content"): description = desc_tag["content"].strip() kw_tag = soup.find("meta", attrs={"name": re.compile("keywords", re.I)}) if kw_tag and kw_tag.get("content"): keywords = kw_tag["content"].strip() return {"description": description, "keywords": keywords} def _extract_og_image(soup: BeautifulSoup) -> str: """Extract the best representative image URL from meta tags. Only returns absolute http(s) URLs -- skips relative paths and data URIs. """ candidates = [] for prop in ("og:image", "og:image:url", "og:image:secure_url"): tag = soup.find("meta", attrs={"property": prop}) if tag and tag.get("content", "").strip(): candidates.append(tag["content"].strip()) tag = soup.find("meta", attrs={"name": "twitter:image"}) if tag and tag.get("content", "").strip(): candidates.append(tag["content"].strip()) tag = soup.find("meta", attrs={"name": "thumbnail"}) if tag and tag.get("content", "").strip(): candidates.append(tag["content"].strip()) for url in candidates: if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")): return url return "" def _extract_lists(soup: BeautifulSoup) -> List[List[str]]: """Return a list of lists, each inner list representing a