Odysseus v1.0

2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
--- a/src/search/core.py
+++ b/src/search/core.py
@@ -0,0 +1,447 @@
+"""Core search orchestrators: searxng_search_results, comprehensive_web_search, config, cache invalidation."""
+
+import json
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timedelta
+from typing import Dict, Any, Optional, List, Set
+from urllib.parse import urlparse
+
+from .analytics import (
+    NetworkError,
+    ParseError,
+    RateLimitError,
+    error_logger,
+    _record_query,
+)
+from .cache import (
+    SEARCH_CACHE_DIR,
+    search_cache_index,
+    generate_cache_key,
+    cleanup_cache,
+)
+from .query import _cache_duration_for_query
+from .ranking import rank_search_results
+from .providers import (
+    searxng_search_api,
+    brave_search,
+    duckduckgo_search,
+    google_pse_search,
+    tavily_search,
+    serper_search,
+    _get_search_settings,
+    _get_result_count,
+)
+from .content import (
+    fetch_webpage_content,
+    extract_key_points,
+    get_tldr,
+    extract_quotes,
+    extract_statistics,
+)
+
+logger = logging.getLogger(__name__)
+
+# ========= CONFIG =========
+SEARCH_CONFIG: Dict[str, Any] = {
+    "primary_provider": "searxng",
+}
+
+
+def get_search_config() -> Dict[str, Any]:
+    """Get current search configuration including active provider info."""
+    config = SEARCH_CONFIG.copy()
+    settings = _get_search_settings()
+    provider = settings.get("search_provider", "searxng")
+    config["active_provider"] = provider
+    config["has_api_key"] = bool((settings.get("search_api_key") or "").strip())
+    config["result_count"] = _get_result_count()
+    if provider == "searxng":
+        from .providers import _get_search_instance
+        config["search_url"] = _get_search_instance()
+    return config
+
+
+def update_search_config(api_key: str = None, **kwargs):
+    """Update search configuration (e.g. Brave API key)."""
+    if api_key:
+        SEARCH_CONFIG["brave_api_key"] = api_key
+
+
+def _call_provider(provider_name: str, query: str, count: int, time_filter: str = None) -> List[dict]:
+    """Call a search provider by name. Returns list of results or empty list."""
+    if provider_name == "searxng":
+        return searxng_search_api(query, count, time_filter=time_filter)
+    elif provider_name == "brave":
+        return brave_search(query, count, time_filter)
+    elif provider_name == "duckduckgo":
+        return duckduckgo_search(query, count, time_filter)
+    elif provider_name == "google_pse":
+        return google_pse_search(query, count, time_filter)
+    elif provider_name == "tavily":
+        return tavily_search(query, count, time_filter)
+    elif provider_name == "serper":
+        return serper_search(query, count, time_filter)
+    return []
+
+
+# If the self-hosted SearXNG instance is up but all enabled engines return
+# empty, fall back to the no-key provider so "search X" still works on fresh
+# installs. Users can override/disable with `search_fallback_chain`.
+_FALLBACK_ORDER = ["duckduckgo"]
+
+
+def _build_provider_chain(primary: str) -> List[str]:
+    """Build ordered list: primary first, then fallbacks (skipping primary
+    and dedupes). The fallback list comes from
+    `settings.search_fallback_chain` if the user configured one, otherwise
+    the hardcoded default above."""
+    chain = [primary]
+    settings = _get_search_settings()
+    user_chain = settings.get("search_fallback_chain") or []
+    if isinstance(user_chain, str):
+        # Tolerate comma-separated form from older payloads.
+        user_chain = [s.strip() for s in user_chain.split(",") if s.strip()]
+    fallbacks = user_chain if user_chain else _FALLBACK_ORDER
+    for fb in fallbacks:
+        if fb and fb != primary and fb not in chain and fb != "disabled":
+            chain.append(fb)
+    return chain
+
+
+# ----------------------------------------------------------------------
+# Unified search with caching and retry
+# ----------------------------------------------------------------------
+def searxng_search_results(query: str, count: int = 10, time_filter: str = None) -> list[dict]:
+    """Perform a web search using configured provider with caching and retry."""
+    settings = _get_search_settings()
+    search_provider = settings.get("search_provider", "searxng")
+    result_count = _get_result_count()
+    # Use configured count if caller used default
+    if count == 10:
+        count = result_count
+
+    cache_key = generate_cache_key(f"{query}|{count}|{time_filter}")
+    cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
+
+    # Check cache
+    if cache_file.exists():
+        try:
+            with open(cache_file, "r", encoding="utf-8") as f:
+                cached_data = json.load(f)
+            expiry_raw = cached_data.get("expiry")
+            expiry = datetime.fromisoformat(expiry_raw) if expiry_raw else None
+            if expiry and datetime.now() < expiry:
+                logger.debug(f"Search cache hit for query: {query}")
+                results = cached_data["data"]
+                _record_query(query, bool(results), cache_hit=True)
+                return results
+            else:
+                cache_file.unlink(missing_ok=True)
+                search_cache_index.pop(cache_key, None)
+        except Exception as e:
+            logger.warning(f"Failed to read search cache for {query}: {e}")
+            cache_file.unlink(missing_ok=True)
+            search_cache_index.pop(cache_key, None)
+
+    logger.debug(f"Search cache miss for query: {query}")
+
+    if search_provider == "disabled":
+        logger.info("Search is disabled via admin settings")
+        return []
+
+    provider_chain = _build_provider_chain(search_provider)
+
+    results: List[dict] = []
+    for provider_name in provider_chain:
+        for attempt in range(2):
+            try:
+                logger.info(f"Attempting {provider_name} search (attempt {attempt + 1})")
+                results = _call_provider(provider_name, query, count, time_filter)
+                if results:
+                    logger.info(f"{provider_name} search succeeded with {len(results)} results")
+                    break
+            except (NetworkError, ParseError, RateLimitError) as e:
+                error_logger.error(f"{provider_name} search error (attempt {attempt + 1}): {e}")
+            except Exception as e:
+                error_logger.error(f"Unexpected error during {provider_name} search (attempt {attempt + 1}): {e}")
+        if results:
+            break
+
+    success = bool(results)
+    _record_query(query, success, cache_hit=False)
+
+    if success:
+        results = rank_search_results(query, results)
+        try:
+            expiry = datetime.now() + _cache_duration_for_query(query)
+            cache_data = {
+                "timestamp": datetime.now().isoformat(),
+                "expiry": expiry.isoformat(),
+                "data": results,
+            }
+            with open(cache_file, "w", encoding="utf-8") as f:
+                json.dump(cache_data, f)
+            search_cache_index[cache_key] = datetime.now()
+            cleanup_cache(SEARCH_CACHE_DIR, search_cache_index, timedelta(hours=1))
+        except Exception as e:
+            logger.warning(f"Failed to write search cache for {query}: {e}")
+
+    if not success:
+        logger.error(f"All search providers failed for query: {query}")
+
+    return results
+
+
+# ----------------------------------------------------------------------
+# Cache invalidation
+# ----------------------------------------------------------------------
+def invalidate_search_cache(query: Optional[str] = None) -> None:
+    """Invalidate cached search results. None clears all, otherwise just the given query."""
+    if query is None:
+        for file in SEARCH_CACHE_DIR.glob("*.cache"):
+            try:
+                file.unlink(missing_ok=True)
+            except Exception as e:
+                error_logger.warning(f"Failed to delete cache file {file}: {e}")
+        search_cache_index.clear()
+        logger.info("All search cache entries have been cleared.")
+    else:
+        cache_key = generate_cache_key(f"{query}|10|None")
+        cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
+        if cache_file.exists():
+            try:
+                cache_file.unlink(missing_ok=True)
+                search_cache_index.pop(cache_key, None)
+                logger.info(f"Cache entry for query '{query}' has been invalidated.")
+            except Exception as e:
+                error_logger.warning(f"Failed to delete cache file for query '{query}': {e}")
+        else:
+            logger.info(f"No cache entry found for query '{query}'.")
+
+
+# ----------------------------------------------------------------------
+# Comprehensive web search (with advanced filtering)
+# ----------------------------------------------------------------------
+def comprehensive_web_search(
+    query: str,
+    max_pages: int = 3,
+    max_workers: int = 4,
+    time_filter: str = None,
+    domain_whitelist: Optional[Set[str]] = None,
+    domain_blacklist: Optional[Set[str]] = None,
+    content_type: Optional[str] = None,
+    language: Optional[str] = None,
+    min_content_length: int = 0,
+    return_sources: bool = False,
+):
+    """Perform comprehensive web search with content fetching and advanced filtering."""
+    logger.info(f"Starting comprehensive search for: {query}")
+    if time_filter:
+        logger.info(f"Applying time filter: {time_filter}")
+
+    settings = _get_search_settings()
+    search_provider = settings.get("search_provider", "searxng")
+    result_count = _get_result_count()
+
+    if search_provider == "disabled":
+        logger.info("Search is disabled via admin settings")
+        msg = "Web search is disabled by the administrator."
+        return (msg, []) if return_sources else msg
+
+    # Use configured result count (at least max_pages for content fetching)
+    fetch_count = max(result_count, max_pages)
+
+    provider_chain = _build_provider_chain(search_provider)
+
+    # Each provider gets 2 attempts (matches the inner unified_search behavior).
+    # Empty results are tracked separately from exceptions so the failure
+    # message can tell a soft-fail (provider returned []) apart from a real
+    # error (network blow-up, rate limit, etc.) — useful both for logging
+    # and for the model when it sees the response.
+    search_results = []
+    provider_attempts = {}  # provider -> "ok N", "empty", "error: ..."
+    for provider_name in provider_chain:
+        last_err = None
+        empty = False
+        for attempt in range(2):
+            try:
+                search_results = _call_provider(provider_name, query, fetch_count, time_filter)
+                if search_results:
+                    provider_attempts[provider_name] = f"ok ({len(search_results)})"
+                    logger.info(f"Comprehensive search: {provider_name} returned {len(search_results)} results")
+                    break
+                # Empty result — try once more (transient empties are common on flaky instances)
+                empty = True
+            except Exception as e:
+                last_err = e
+                logger.warning(f"Comprehensive search: {provider_name} attempt {attempt + 1} failed: {e}")
+        if search_results:
+            break
+        if last_err is not None:
+            provider_attempts[provider_name] = f"error: {last_err}"
+        elif empty:
+            provider_attempts[provider_name] = "empty"
+
+    if not search_results:
+        # Build a per-provider tally so the model (and logs) see which
+        # providers were tried and how each one fared, instead of the
+        # uninformative "No search results found".
+        tally = ", ".join(f"{p}:{r}" for p, r in provider_attempts.items()) or "no providers configured"
+        any_errors = any(r.startswith("error") for r in provider_attempts.values())
+        if any_errors:
+            msg = f"Web search failed — all providers errored or returned empty. Tried: {tally}"
+            logger.error(msg)
+        else:
+            msg = (
+                f"No search results found. Tried: {tally}. "
+                "All providers returned empty — possibly a niche query or upstream rate-limiting; "
+                "rephrasing or using the browser tool for a specific URL may help."
+            )
+            logger.warning(msg)
+        return (msg, []) if return_sources else msg
+
+    search_results = rank_search_results(query, search_results)
+
+    # URL filter helper
+    def url_passes_filters(url: str) -> bool:
+        try:
+            netloc = urlparse(url).netloc.lower()
+        except Exception:
+            return False
+        if domain_whitelist is not None and netloc not in domain_whitelist:
+            return False
+        if domain_blacklist is not None and netloc in domain_blacklist:
+            return False
+        if content_type:
+            ct = content_type.lower()
+            if ct == "article":
+                if not any(k in url.lower() for k in ("article", "blog", "news", "post")):
+                    return False
+            elif ct == "forum":
+                if not any(k in url.lower() for k in ("forum", "discussion", "thread", "topic")):
+                    return False
+            elif ct == "academic":
+                if not any(k in url.lower() for k in ("pdf", "doi", "scholar", "arxiv", "journal", "research")):
+                    return False
+        if language:
+            lang_pat = language.lower()
+            if not (f"/{lang_pat}/" in url.lower() or f"?lang={lang_pat}" in url.lower() or f"&lang={lang_pat}" in url.lower()):
+                return False
+        return True
+
+    filtered_urls = [r["url"] for r in search_results[:max_pages] if url_passes_filters(r["url"])]
+    if not filtered_urls:
+        logger.warning("All URLs filtered out by advanced criteria")
+        msg = "No suitable results after applying filters."
+        return (msg, []) if return_sources else msg
+
+    # Build sources list for the frontend (before content fetching)
+    _source_list = [
+        {"url": r.get("url", ""), "title": r.get("title", "")}
+        for r in search_results if r.get("url")
+    ]
+
+    # Fetch content in parallel
+    fetched_content = []
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {
+            executor.submit(fetch_webpage_content, url, 8, retry_attempt=0): url
+            for url in filtered_urls
+        }
+        for future in as_completed(future_to_url):
+            url = future_to_url[future]
+            try:
+                result = future.result()
+                if result["success"] and result["content"] and len(result["content"]) >= min_content_length:
+                    fetched_content.append(result)
+            except Exception as e:
+                logger.error(f"Exception while fetching {url}: {str(e)}")
+
+    logger.info(f"Successfully fetched content from {len(fetched_content)} pages")
+
+    # Format results
+    output_parts = []
+
+    if search_results:
+        output_parts.append("```sources")
+        for i, result in enumerate(search_results, 1):
+            output_parts.append(f"[{i}] {result['title']}")
+            output_parts.append(f"    {result['url']}")
+            if result.get("age"):
+                output_parts.append(f"    {result['age']}")
+        output_parts.append("```")
+        output_parts.append("")
+
+    output_parts.append("=" * 70)
+    output_parts.append("WEB SEARCH RESULTS AND FETCHED CONTENT")
+    output_parts.append(f"Query: {query}")
+    output_parts.append(f"Searched {len(search_results)} results, fetched {len(fetched_content)} pages")
+    output_parts.append("=" * 70)
+    output_parts.append("")
+
+    output_parts.append("SEARCH RESULTS SUMMARY:")
+    output_parts.append("-" * 50)
+    for i, result in enumerate(search_results, 1):
+        output_parts.append(f"\n[{i}] {result['title']}")
+        output_parts.append(f"    URL: {result['url']}")
+        output_parts.append(f"    Snippet: {result['snippet'][:200]}...")
+        if result.get("age"):
+            output_parts.append(f"    Age: {result['age']}")
+
+    if fetched_content:
+        output_parts.append("\n" + "=" * 70)
+        output_parts.append("FETCHED PAGE CONTENT:")
+        output_parts.append("-" * 50)
+
+        for i, content in enumerate(fetched_content, 1):
+            output_parts.append(f"\n[CONTENT {i}] From: {content['url']}")
+            output_parts.append(f"Title: {content['title']}")
+            output_parts.append("-" * 30)
+
+            text = content["content"][:3000]
+            if len(content["content"]) > 3000:
+                text += "... [truncated]"
+            output_parts.append(text)
+
+            key_points = extract_key_points(content["content"])
+            if key_points:
+                output_parts.append("\nKey Points:")
+                for pt in key_points[:5]:
+                    output_parts.append(f"- {pt}")
+
+            tldr = get_tldr(content["content"])
+            if tldr:
+                output_parts.append("\nTL;DR:")
+                output_parts.append(tldr)
+
+            quotes = extract_quotes(content["content"])
+            if quotes:
+                output_parts.append("\nImportant Quotes:")
+                for q in quotes[:3]:
+                    output_parts.append(f"\u201c{q}\u201d")
+
+            stats = extract_statistics(content["content"])
+            if stats:
+                output_parts.append("\nData / Statistics:")
+                for s in stats[:5]:
+                    output_parts.append(f"- {s}")
+
+            output_parts.append("")
+
+    output_parts.append("=" * 70)
+    output_parts.append("END OF WEB SEARCH RESULTS")
+    output_parts.append("=" * 70)
+
+    instructions = (
+        "\n\nIMPORTANT INSTRUCTIONS:\n"
+        "1. Use the above web search results and fetched content to answer the user's question\n"
+        "2. Prioritize information from the FETCHED PAGE CONTENT section as it contains actual page data\n"
+        "3. Cross-reference multiple sources when possible\n"
+        "4. If the information is time-sensitive, pay attention to the age of the results\n"
+        "5. Be explicit if the search results don't contain sufficient information to fully answer the question"
+    )
+    output_parts.append(instructions)
+
+    result = "\n".join(output_parts)
+    return (result, _source_list) if return_sources else result