diff --git a/src/search/cache.py b/src/search/cache.py index 11fe722..e66aaff 100644 --- a/src/search/cache.py +++ b/src/search/cache.py @@ -1,57 +1,11 @@ -"""Search and content caching with LRU eviction.""" +"""Compatibility wrapper for the canonical services.search.cache module. -import hashlib -import logging -from datetime import datetime, timedelta -from pathlib import Path -from typing import Dict +``src.search.cache`` stays importable for older agent/deep-research code, but the +implementation now lives in ``services.search.cache`` so the two cannot drift. +""" -logger = logging.getLogger(__name__) +import sys -# Cache directories -CACHE_DIR = Path(__file__).resolve().parent.parent / "cache" -SEARCH_CACHE_DIR = CACHE_DIR / "search" -CONTENT_CACHE_DIR = CACHE_DIR / "content" -CACHE_MAX_ENTRIES = 1000 +from services.search import cache as _cache -# Create cache directories -SEARCH_CACHE_DIR.mkdir(parents=True, exist_ok=True) -CONTENT_CACHE_DIR.mkdir(parents=True, exist_ok=True) - -# Track cache size for LRU eviction -search_cache_index: Dict[str, datetime] = {} -content_cache_index: Dict[str, datetime] = {} - -# Cache metrics (shared across modules) -cache_metrics = {"hits": 0, "misses": 0, "evictions": 0} - - -def generate_cache_key(data: str) -> str: - """Generate a unique cache key using SHA-256 hash.""" - return hashlib.sha256(data.encode("utf-8")).hexdigest() - - -def cleanup_cache(cache_dir: Path, cache_index: Dict[str, datetime], max_age: timedelta): - """Remove expired cache entries and enforce LRU policy.""" - current_time = datetime.now() - files_in_dir = {f.name.split(".")[0]: f for f in cache_dir.glob("*.cache")} - - to_remove = [] - for key, timestamp in list(cache_index.items()): - if current_time - timestamp > max_age or key not in files_in_dir: - to_remove.append(key) - if key in files_in_dir: - files_in_dir[key].unlink(missing_ok=True) - - for key in to_remove: - cache_index.pop(key, None) - cache_metrics["evictions"] += 1 - - if len(cache_index) > CACHE_MAX_ENTRIES: - sorted_items = sorted(cache_index.items(), key=lambda x: x[1]) - excess_count = len(cache_index) - CACHE_MAX_ENTRIES - for key, _ in sorted_items[:excess_count]: - cache_index.pop(key, None) - cache_file = cache_dir / f"{key}.cache" - cache_file.unlink(missing_ok=True) - cache_metrics["evictions"] += 1 +sys.modules[__name__] = _cache diff --git a/src/search/content.py b/src/search/content.py index 42f8e34..971d4c2 100644 --- a/src/search/content.py +++ b/src/search/content.py @@ -1,419 +1,11 @@ -"""Webpage content fetching with caching, PDF extraction, and summarization helpers.""" +"""Compatibility wrapper for the canonical services.search.content module. -import copy -import io -import ipaddress -import json -import os -import re -import logging -import socket -from datetime import datetime, timedelta -from typing import List -from urllib.parse import urljoin, urlparse +``src.search.content`` stays importable for older agent/deep-research code, but the +implementation now lives in ``services.search.content`` so the two cannot drift. +""" -import httpx -from bs4 import BeautifulSoup +import sys -from .analytics import RateLimitError, error_logger -from .cache import ( - CONTENT_CACHE_DIR, - content_cache_index, - generate_cache_key, - cleanup_cache, -) +from services.search import content as _content -logger = logging.getLogger(__name__) - -_PRIVATE_NETWORKS = ( - ipaddress.ip_network("0.0.0.0/8"), - ipaddress.ip_network("10.0.0.0/8"), - ipaddress.ip_network("127.0.0.0/8"), - ipaddress.ip_network("169.254.0.0/16"), - ipaddress.ip_network("172.16.0.0/12"), - ipaddress.ip_network("192.168.0.0/16"), - ipaddress.ip_network("::1/128"), - ipaddress.ip_network("fc00::/7"), - ipaddress.ip_network("fe80::/10"), -) - - -def _is_private_address(addr: ipaddress._BaseAddress) -> bool: - if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None: - addr = addr.ipv4_mapped - return ( - addr.is_private - or addr.is_loopback - or addr.is_link_local - or addr.is_reserved - or addr.is_multicast - or addr.is_unspecified - or any(addr in net for net in _PRIVATE_NETWORKS) - ) - - -def _resolve_hostname_ips(hostname: str) -> List[ipaddress._BaseAddress]: - ips = [] - for family, _, _, _, sockaddr in socket.getaddrinfo(hostname, None): - if family in (socket.AF_INET, socket.AF_INET6): - ips.append(ipaddress.ip_address(sockaddr[0])) - return ips - - -def _public_http_url(url: str) -> bool: - parsed = urlparse(url) - if parsed.scheme not in ("http", "https") or not parsed.hostname: - return False - host = parsed.hostname.strip().lower() - if host in ("localhost", "metadata.google.internal", "metadata"): - return False - if host.endswith((".local", ".localhost", ".internal", ".lan", ".intranet")): - return False - try: - return not _is_private_address(ipaddress.ip_address(host)) - except ValueError: - pass - try: - ips = _resolve_hostname_ips(host) - except OSError: - return False - # Fail closed: a hostname that resolves to nothing is treated as - # non-public (an empty all(...) would otherwise return True). - return bool(ips) and all(not _is_private_address(ip) for ip in ips) - - -def _get_public_url(url: str, *, headers: dict, timeout: int) -> httpx.Response: - if not _public_http_url(url): - raise httpx.RequestError(f"Blocked non-public URL: {url}") - - current = url - with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as client: - for _ in range(8): - response = client.get(current) - if response.status_code not in (301, 302, 303, 307, 308): - return response - location = response.headers.get("location") - if not location: - return response - current = urljoin(current, location) - if not _public_http_url(current): - raise httpx.RequestError(f"Blocked redirect to non-public URL: {current}") - raise httpx.RequestError("Too many redirects") - -# PDF extraction (optional dependency) -try: - from pdfminer.high_level import extract_text as pdf_extract_text -except ImportError: - pdf_extract_text = None # type: ignore - - -# ---------------------------------------------------------------------- -# HTML extraction helpers -# ---------------------------------------------------------------------- -def _extract_meta(soup: BeautifulSoup) -> dict: - """Pull meta description and keywords if present.""" - description = "" - keywords = "" - desc_tag = soup.find("meta", attrs={"name": re.compile("description", re.I)}) - if desc_tag and desc_tag.get("content"): - description = desc_tag["content"].strip() - kw_tag = soup.find("meta", attrs={"name": re.compile("keywords", re.I)}) - if kw_tag and kw_tag.get("content"): - keywords = kw_tag["content"].strip() - return {"description": description, "keywords": keywords} - - -def _extract_og_image(soup: BeautifulSoup) -> str: - """Extract the best representative image URL from meta tags. - - Only returns absolute http(s) URLs — skips relative paths and data URIs. - """ - candidates = [] - # Open Graph image (most reliable) - for prop in ("og:image", "og:image:url", "og:image:secure_url"): - tag = soup.find("meta", attrs={"property": prop}) - if tag and tag.get("content", "").strip(): - candidates.append(tag["content"].strip()) - # Twitter card image - tag = soup.find("meta", attrs={"name": "twitter:image"}) - if tag and tag.get("content", "").strip(): - candidates.append(tag["content"].strip()) - # Thumbnail meta - tag = soup.find("meta", attrs={"name": "thumbnail"}) - if tag and tag.get("content", "").strip(): - candidates.append(tag["content"].strip()) - # Return first absolute http(s) URL - for url in candidates: - if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")): - return url - return "" - - -def _extract_lists(soup: BeautifulSoup) -> List[List[str]]: - """Return a list of lists, each inner list representing a