Odysseus v1.0

2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
--- a/services/research/init.py
+++ b/services/research/init.py
@@ -0,0 +1,12 @@
+# services/research/__init__.py
+"""Research service — deep research with LLM-in-the-loop."""
+
+from .service import ResearchService, ResearchResult, ResearchSource
+from .research_handler import ResearchHandler
+
+__all__ = [
+    "ResearchService",
+    "ResearchResult",
+    "ResearchSource",
+    "ResearchHandler",
+]
--- a/services/research/research_handler.py
+++ b/services/research/research_handler.py
@@ -0,0 +1,463 @@
+# src/research_handler.py
+"""Handler for research service integration with expandable UI support.
+
+Uses the IterResearch-style DeepResearcher (LLM-in-the-loop) as the primary
+engine, falling back to the legacy ResearchOrchestrator or basic web search
+if needed.
+
+Includes a task registry so research survives page refreshes and can be cancelled.
+"""
+import asyncio
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Optional, Dict
+
+logger = logging.getLogger(__name__)
+
+RESEARCH_DATA_DIR = Path("data/deep_research")
+
+
+class ResearchHandler:
+    """Handles research service operations with iterative deep research."""
+
+    def __init__(self):
+        self._legacy_engine = None
+        self._active_tasks: Dict[str, dict] = {}
+        self._initialize_legacy_engine()
+        RESEARCH_DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+    def _initialize_legacy_engine(self):
+        """Initialize the legacy research engine as a fallback."""
+        try:
+            from research_engine import ResearchOrchestrator, Config
+            config = Config(max_searches=12, max_content_per_page=15000)
+            self._legacy_engine = ResearchOrchestrator(config)
+            logger.info("Legacy ResearchOrchestrator initialized (fallback)")
+        except ImportError:
+            logger.info("Legacy research_engine.py not found — DeepResearcher only")
+            self._legacy_engine = None
+        except Exception as e:
+            logger.warning(f"Legacy research engine init failed: {e}")
+            self._legacy_engine = None
+
+    # ------------------------------------------------------------------
+    # Task registry — background research with persistence
+    # ------------------------------------------------------------------
+
+    def start_research(
+        self,
+        session_id: str,
+        query: str,
+        llm_endpoint: str,
+        llm_model: str,
+        max_time: int = 300,
+        llm_headers: dict = None,
+    ) -> dict:
+        """Start research as a background task. Returns task info dict."""
+        # Cancel any existing research for this session
+        if session_id in self._active_tasks:
+            existing = self._active_tasks[session_id]
+            if existing.get("status") == "running":
+                self.cancel_research(session_id)
+
+        entry = {
+            "task": None,
+            "researcher": None,
+            "query": query,
+            "status": "running",
+            "progress": {},
+            "result": None,
+            "started_at": time.time(),
+        }
+        self._active_tasks[session_id] = entry
+
+        def on_progress(event):
+            entry["progress"] = event
+
+        async def _run():
+            try:
+                result = await self.call_research_service(
+                    query, llm_endpoint, llm_model,
+                    max_time=max_time,
+                    progress_callback=on_progress,
+                    _task_entry=entry,
+                    llm_headers=llm_headers,
+                )
+                entry["result"] = result
+                entry["status"] = "done"
+                self._save_result(session_id, entry)
+            except asyncio.CancelledError:
+                entry["status"] = "cancelled"
+                raise
+            except Exception as e:
+                logger.error(f"Background research failed: {e}", exc_info=True)
+                entry["result"] = str(e)
+                entry["status"] = "error"
+
+        task = asyncio.create_task(_run())
+        entry["task"] = task
+        return {"session_id": session_id, "status": "running", "query": query}
+
+    def get_status(self, session_id: str) -> Optional[dict]:
+        """Get current research status for a session."""
+        if session_id in self._active_tasks:
+            entry = self._active_tasks[session_id]
+            return {
+                "status": entry["status"],
+                "progress": entry["progress"],
+                "query": entry["query"],
+                "started_at": entry["started_at"],
+            }
+        # Check disk for completed research
+        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        if path.exists():
+            try:
+                data = json.loads(path.read_text())
+                return {
+                    "status": data.get("status", "done"),
+                    "progress": {},
+                    "query": data.get("query", ""),
+                    "started_at": data.get("started_at", 0),
+                }
+            except Exception:
+                pass
+        return None
+
+    def cancel_research(self, session_id: str) -> bool:
+        """Cancel running research for a session."""
+        if session_id not in self._active_tasks:
+            return False
+        entry = self._active_tasks[session_id]
+        if entry["status"] != "running":
+            return False
+        researcher = entry.get("researcher")
+        if researcher:
+            researcher.cancel()
+        task = entry.get("task")
+        if task and not task.done():
+            task.cancel()
+        entry["status"] = "cancelled"
+        return True
+
+    def get_result(self, session_id: str) -> Optional[str]:
+        """Get the completed research result."""
+        if session_id in self._active_tasks:
+            entry = self._active_tasks[session_id]
+            if entry["status"] in ("done", "error", "cancelled"):
+                return entry.get("result")
+        # Check disk
+        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        if path.exists():
+            try:
+                data = json.loads(path.read_text())
+                return data.get("result")
+            except Exception:
+                pass
+        return None
+
+    def get_sources(self, session_id: str) -> Optional[list]:
+        """Get deduplicated source list from research findings."""
+        # Check in-memory first
+        if session_id in self._active_tasks:
+            entry = self._active_tasks[session_id]
+            if entry.get("sources"):
+                return entry["sources"]
+            researcher = entry.get("researcher")
+            if researcher and researcher.findings:
+                return self._extract_sources(researcher.findings)
+        # Check disk
+        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        if path.exists():
+            try:
+                data = json.loads(path.read_text())
+                return data.get("sources")
+            except Exception:
+                pass
+        return None
+
+    @staticmethod
+    def _extract_sources(findings: list) -> list:
+        """Extract deduplicated [{url, title}] from findings."""
+        seen = set()
+        sources = []
+        for f in findings:
+            url = f.get("url", "")
+            title = f.get("title", "") or url
+            if url and url not in seen:
+                seen.add(url)
+                sources.append({"url": url, "title": title})
+        return sources
+
+    def clear_result(self, session_id: str):
+        """Remove persisted result after it's been consumed."""
+        self._active_tasks.pop(session_id, None)
+        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        if path.exists():
+            try:
+                path.unlink()
+            except Exception:
+                pass
+
+    def _save_result(self, session_id: str, entry: dict):
+        """Persist completed research result to disk."""
+        try:
+            # Extract and cache sources
+            sources = []
+            researcher = entry.get("researcher")
+            if researcher and researcher.findings:
+                sources = self._extract_sources(researcher.findings)
+            entry["sources"] = sources
+
+            path = RESEARCH_DATA_DIR / f"{session_id}.json"
+            data = {
+                "query": entry["query"],
+                "status": entry["status"],
+                "result": entry["result"],
+                "sources": sources,
+                "started_at": entry["started_at"],
+                "completed_at": time.time(),
+            }
+            path.write_text(json.dumps(data))
+            logger.info(f"Research result saved to {path}")
+        except Exception as e:
+            logger.error(f"Failed to save research result: {e}")
+
+    async def call_research_service(
+        self,
+        query: str,
+        llm_endpoint: str,
+        llm_model: str,
+        max_time: int = 300,
+        progress_callback=None,
+        _task_entry: dict = None,
+        llm_headers: dict = None,
+    ) -> str:
+        """
+        Run iterative deep research using the LLM-in-the-loop DeepResearcher.
+
+        Args:
+            query: Research question
+            llm_endpoint: LLM endpoint URL for chat completions
+            llm_model: Model name/ID
+            max_time: Maximum research time in seconds (default 5 minutes)
+            _task_entry: Internal - registry entry to store researcher ref
+
+        Returns:
+            Formatted research report with expandable section and summary
+        """
+        logger.info("Starting IterResearch Deep Research")
+        logger.info(f"Query: {query}")
+        logger.info(f"LLM: {llm_endpoint} / {llm_model}")
+        logger.info(f"Max time: {max_time}s")
+
+        try:
+            from src.deep_research import DeepResearcher
+            from src.settings import get_setting
+
+            researcher = DeepResearcher(
+                llm_endpoint=llm_endpoint,
+                llm_model=llm_model,
+                llm_headers=llm_headers,
+                max_rounds=8,
+                max_time=max_time,
+                max_report_tokens=int(get_setting("research_max_tokens", 8192)),
+                progress_callback=progress_callback,
+            )
+            if _task_entry is not None:
+                _task_entry["researcher"] = researcher
+
+            start_time = time.time()
+            report = await researcher.research(query)
+            elapsed = time.time() - start_time
+
+            stats = researcher.get_stats()
+            logger.info("IterResearch completed successfully")
+            for key, value in stats.items():
+                logger.info(f"  {key}: {value}")
+
+            return self._format_research_report(
+                query, report, stats, elapsed,
+                findings=researcher.findings,
+                evolving_report=researcher.evolving_report,
+            )
+
+        except Exception as e:
+            logger.error(f"DeepResearcher failed: {e}", exc_info=True)
+            return await self._fallback_research(query, llm_endpoint, llm_model, max_time, str(e))
+
+    async def _fallback_research(
+        self, query: str, llm_endpoint: str, llm_model: str,
+        max_time: int, primary_error: str,
+    ) -> str:
+        """Fall back to legacy engine, then to basic web search."""
+        # Try legacy orchestrator
+        if self._legacy_engine:
+            try:
+                import asyncio
+                logger.info("Falling back to legacy ResearchOrchestrator...")
+                loop = asyncio.get_event_loop()
+                result = await loop.run_in_executor(
+                    None, self._legacy_engine.start_research, query, max_time
+                )
+                stats = self._get_legacy_stats()
+                elapsed = float(stats.get("Duration", "0").rstrip("s") or 0)
+                return self._format_research_report(query, result, stats, elapsed)
+            except Exception as e:
+                logger.error(f"Legacy engine also failed: {e}")
+
+        # Fall back to basic web search
+        return self._handle_research_failure(query, primary_error)
+
+    def _get_legacy_stats(self) -> dict:
+        """Get statistics from the legacy research engine."""
+        if not self._legacy_engine:
+            return {}
+        try:
+            tracker = self._legacy_engine.progress_tracker
+            return {
+                "Findings": len(self._legacy_engine.findings),
+                "Sources": len(self._legacy_engine.source_reports),
+                "Searches": tracker.counters['searches_executed'],
+                "URLs": tracker.counters['urls_processed'],
+            }
+        except Exception:
+            return {}
+
+    def _format_research_report(
+        self, query: str, full_report: str, stats: dict, elapsed: float,
+        findings: list = None, evolving_report: str = None,
+    ) -> str:
+        """Format research report with sources list and expandable raw findings."""
+        summary_lines = [
+            f"**Duration:** {elapsed:.1f}s",
+            f"**Rounds:** {stats.get('Rounds', stats.get('Findings', '?'))}",
+            f"**Queries:** {stats.get('Queries', stats.get('Searches', '?'))}",
+            f"**URLs Analyzed:** {stats.get('URLs', '?')}",
+        ]
+        summary_text = " | ".join(summary_lines)
+
+        # Build sources list with clickable links
+        sources_section = ""
+        if findings:
+            seen_urls = set()
+            source_lines = []
+            for f in findings:
+                url = f.get("url", "")
+                title = f.get("title", "") or url
+                if url and url not in seen_urls:
+                    seen_urls.add(url)
+                    source_lines.append(f"- [{title}]({url})")
+            if source_lines:
+                sources_section = "\n### Sources\n\n" + "\n".join(source_lines) + "\n"
+
+        # Build raw findings section (individual extractions per source)
+        raw_findings_section = ""
+        if findings:
+            parts = []
+            for i, f in enumerate(findings, 1):
+                url = f.get("url", "")
+                title = f.get("title", "") or "Untitled"
+                summary = f.get("summary", "")
+                evidence = f.get("evidence", "")
+                content = summary if summary else (evidence[:2000] if evidence else "(no content)")
+                parts.append(f"**{i}. [{title}]({url})**\n\n{content}")
+            raw_findings_section = "\n\n".join(parts)
+
+        # Build expandable collected info section
+        collected_section = ""
+        if evolving_report or raw_findings_section:
+            collected_section = "\n<details>\n<summary><strong>Raw collected findings ({} sources)</strong></summary>\n\n".format(
+                len(findings) if findings else 0
+            )
+            if raw_findings_section:
+                collected_section += raw_findings_section + "\n"
+            collected_section += "\n</details>\n"
+
+        formatted = f"""---
+
+## Research Summary
+
+{summary_text}
+
+---
+
+{full_report}
+
+{sources_section}
+{collected_section}
+---
+
+**The AI has analyzed all research findings above. Ask me anything about: "{query}"**
+"""
+        return formatted
+
+    def _format_error_response(self, error_msg: str, query: str) -> str:
+        """Format error response in a user-friendly way."""
+        return f"""## Research Engine Unavailable
+
+**Query:** {query}
+
+**Error:** {error_msg}
+
+**Please check:**
+1. LLM endpoint is reachable
+2. SearXNG is running at the configured instance
+3. Application logs for detailed error information
+
+**Troubleshooting:**
+- Test basic search: Try the web search toggle first
+- Check search config: `/api/search/config`
+- Review logs for initialization errors
+"""
+
+    def _handle_research_failure(self, query: str, error: str) -> str:
+        """Handle research failure with fallback to basic search."""
+        try:
+            logger.info("Attempting fallback to basic web search...")
+            from src.search import comprehensive_web_search
+
+            search_result = comprehensive_web_search(query)
+
+            return f"""## Research Failed - Basic Search Fallback
+
+**Query:** {query}
+
+**Error:** {error}
+
+**Note:** The deep research engine encountered an error. Here are basic search results instead:
+
+---
+
+### Basic Web Search Results
+
+{search_result}
+
+---
+
+**To fix deep research:**
+1. Check that your LLM endpoint and search provider are properly configured
+2. Verify network connectivity
+3. Review application logs for detailed error information
+
+Try the web search toggle for simpler queries, or fix the research engine for comprehensive analysis.
+"""
+
+        except Exception as e2:
+            logger.error(f"Fallback search also failed: {e2}", exc_info=True)
+            return f"""## Complete Research Failure
+
+**Primary Error:** {error}
+**Fallback Error:** {str(e2)}
+
+**Please check:**
+1. Search provider configuration in Settings -> Search Settings
+2. Network connectivity to search APIs
+3. Application logs for detailed error information
+4. That SearXNG is running (if using SearXNG)
+
+**Debug Info:**
+- Search config endpoint: `/api/search/config`
+- Test basic search toggle with a simple query first
+"""
--- a/services/research/service.py
+++ b/services/research/service.py
@@ -0,0 +1,117 @@
+# services/research/service.py
+"""Research service — deep research with LLM-in-the-loop."""
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Callable
+
+from .research_handler import ResearchHandler
+
+
+@dataclass
+class ResearchSource:
+    """A source found during research."""
+    url: str
+    title: str
+    snippet: str
+    relevance: float = 0.0
+
+
+@dataclass
+class ResearchResult:
+    """Result of a deep research query."""
+    query: str
+    summary: str
+    sources: List[ResearchSource] = field(default_factory=list)
+    sections: List[str] = field(default_factory=list)
+    tokens_used: int = 0
+    duration_seconds: float = 0.0
+
+
+class ResearchService:
+    """
+    Deep research service.
+
+    Usage:
+        service = ResearchService()
+        result = await service.research("quantum computing advances 2024")
+        print(result.summary)
+    """
+
+    def __init__(self):
+        self.handler = ResearchHandler()
+        self._active: dict = {}
+
+    async def research(
+        self,
+        topic: str,
+        llm_endpoint: str,
+        llm_model: str,
+        max_time: int = 300,
+        on_progress: Optional[Callable[[dict], None]] = None,
+    ) -> ResearchResult:
+        """
+        Perform deep research on a topic.
+
+        Args:
+            topic: Research topic/question
+            llm_endpoint: LLM API endpoint
+            llm_model: Model to use
+            max_time: Maximum time in seconds
+            on_progress: Optional progress callback
+
+        Returns:
+            ResearchResult with findings
+        """
+        import time
+        start = time.time()
+
+        result = await self.handler.call_research_service(
+            topic,
+            llm_endpoint,
+            llm_model,
+            max_time=max_time,
+            progress_callback=on_progress,
+        )
+
+        duration = time.time() - start
+
+        # Parse result into structured format
+        sources = [
+            ResearchSource(
+                url=s.get("url", ""),
+                title=s.get("title", ""),
+                snippet=s.get("snippet", ""),
+                relevance=s.get("relevance", 0.0),
+            )
+            for s in result.get("sources", [])
+        ]
+
+        return ResearchResult(
+            query=topic,
+            summary=result.get("summary", result.get("answer", "")),
+            sources=sources,
+            sections=result.get("sections", []),
+            tokens_used=result.get("tokens_used", 0),
+            duration_seconds=duration,
+        )
+
+    def start_background(
+        self,
+        session_id: str,
+        topic: str,
+        llm_endpoint: str,
+        llm_model: str,
+        max_time: int = 300,
+    ) -> dict:
+        """Start research in background. Returns task info."""
+        return self.handler.start_research(
+            session_id, topic, llm_endpoint, llm_model, max_time
+        )
+
+    def get_status(self, session_id: str) -> Optional[dict]:
+        """Get status of background research."""
+        return self.handler.get_status(session_id)
+
+    def cancel(self, session_id: str) -> bool:
+        """Cancel background research."""
+        return self.handler.cancel_research(session_id)