Odysseus v1.0
This commit is contained in:
12
services/research/__init__.py
Normal file
12
services/research/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
# services/research/__init__.py
|
||||
"""Research service — deep research with LLM-in-the-loop."""
|
||||
|
||||
from .service import ResearchService, ResearchResult, ResearchSource
|
||||
from .research_handler import ResearchHandler
|
||||
|
||||
# Public API of the research package: the service facade, its result
# dataclasses, and the lower-level handler.
__all__ = ["ResearchService", "ResearchResult", "ResearchSource", "ResearchHandler"]
|
||||
463
services/research/research_handler.py
Normal file
463
services/research/research_handler.py
Normal file
@@ -0,0 +1,463 @@
|
||||
# services/research/research_handler.py
|
||||
"""Handler for research service integration with expandable UI support.
|
||||
|
||||
Uses the IterResearch-style DeepResearcher (LLM-in-the-loop) as the primary
|
||||
engine, falling back to the legacy ResearchOrchestrator or basic web search
|
||||
if needed.
|
||||
|
||||
Includes a task registry so research survives page refreshes and can be cancelled.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
RESEARCH_DATA_DIR = Path("data/deep_research")
|
||||
|
||||
|
||||
class ResearchHandler:
    """Handles research service operations with iterative deep research.

    The handler keeps an in-memory registry of research tasks keyed by
    session id and persists completed results as JSON files under
    ``RESEARCH_DATA_DIR`` so they can be read back after the in-memory
    entry is gone.

    Registry entry layout (one dict per session)::

        task        asyncio.Task running the research (or None)
        researcher  DeepResearcher instance once created (or None)
        query       the research question
        status      "running" | "done" | "error" | "cancelled"
        progress    last event passed to the progress callback
        result      formatted report (or error text when status is "error")
        started_at  time.time() at start
        sources     cached [{url, title}] list, set when the result is saved
    """

    def __init__(self):
        self._legacy_engine = None
        # session_id -> registry entry (layout described in the class docstring)
        self._active_tasks: Dict[str, dict] = {}
        self._initialize_legacy_engine()
        RESEARCH_DATA_DIR.mkdir(parents=True, exist_ok=True)

    def _initialize_legacy_engine(self):
        """Initialize the legacy research engine as a fallback.

        Any failure leaves ``self._legacy_engine`` as None; it never raises.
        """
        try:
            from research_engine import ResearchOrchestrator, Config
            config = Config(max_searches=12, max_content_per_page=15000)
            self._legacy_engine = ResearchOrchestrator(config)
            logger.info("Legacy ResearchOrchestrator initialized (fallback)")
        except ImportError:
            logger.info("Legacy research_engine.py not found — DeepResearcher only")
            self._legacy_engine = None
        except Exception as e:
            logger.warning(f"Legacy research engine init failed: {e}")
            self._legacy_engine = None

    # ------------------------------------------------------------------
    # Task registry — background research with persistence
    # ------------------------------------------------------------------

    def start_research(
        self,
        session_id: str,
        query: str,
        llm_endpoint: str,
        llm_model: str,
        max_time: int = 300,
        llm_headers: Optional[dict] = None,
    ) -> dict:
        """Start research as a background task and return a task info dict.

        Any research already running for ``session_id`` is cancelled first.

        Args:
            session_id: Key under which the task is registered and persisted.
            query: Research question.
            llm_endpoint: LLM endpoint URL, forwarded to call_research_service.
            llm_model: Model name/ID, forwarded to call_research_service.
            max_time: Maximum research time in seconds.
            llm_headers: Optional headers forwarded to call_research_service.

        Returns:
            ``{"session_id": ..., "status": "running", "query": ...}``

        Raises:
            RuntimeError: If called without a running asyncio event loop.
        """
        # Fail fast, before touching the registry: without a running loop the
        # task cannot be scheduled, and we must not cancel the previous run or
        # leave behind an entry that claims to be "running" forever.
        loop = asyncio.get_running_loop()

        # Cancel any existing research for this session
        existing = self._active_tasks.get(session_id)
        if existing is not None and existing.get("status") == "running":
            self.cancel_research(session_id)

        entry = {
            "task": None,
            "researcher": None,
            "query": query,
            "status": "running",
            "progress": {},
            "result": None,
            "started_at": time.time(),
        }
        self._active_tasks[session_id] = entry

        def on_progress(event):
            # Only the most recent progress event is kept.
            entry["progress"] = event

        async def _run():
            try:
                result = await self.call_research_service(
                    query, llm_endpoint, llm_model,
                    max_time=max_time,
                    progress_callback=on_progress,
                    _task_entry=entry,
                    llm_headers=llm_headers,
                )
                entry["result"] = result
                entry["status"] = "done"
                self._save_result(session_id, entry)
            except asyncio.CancelledError:
                entry["status"] = "cancelled"
                raise
            except Exception as e:
                logger.error(f"Background research failed: {e}", exc_info=True)
                entry["result"] = str(e)
                entry["status"] = "error"

        # The task reference is stored on the entry so it can be cancelled.
        entry["task"] = loop.create_task(_run())
        return {"session_id": session_id, "status": "running", "query": query}

    def get_status(self, session_id: str) -> Optional[dict]:
        """Get current research status for a session.

        The in-memory registry is consulted first, then the persisted JSON
        file. Returns None when the session is unknown in both places.
        """
        entry = self._active_tasks.get(session_id)
        if entry is not None:
            return {
                "status": entry["status"],
                "progress": entry["progress"],
                "query": entry["query"],
                "started_at": entry["started_at"],
            }
        # Check disk for completed research
        data = self._load_persisted(session_id)
        if data is None:
            return None
        return {
            "status": data.get("status", "done"),
            "progress": {},
            "query": data.get("query", ""),
            "started_at": data.get("started_at", 0),
        }

    def cancel_research(self, session_id: str) -> bool:
        """Cancel running research for a session.

        Returns:
            True if a running task was cancelled, False if the session is
            unknown or its research is not currently running.
        """
        entry = self._active_tasks.get(session_id)
        if entry is None or entry["status"] != "running":
            return False
        researcher = entry.get("researcher")
        if researcher:
            researcher.cancel()
        task = entry.get("task")
        if task and not task.done():
            task.cancel()
        entry["status"] = "cancelled"
        return True

    def get_result(self, session_id: str) -> Optional[str]:
        """Get the completed research result.

        Returns the in-memory result once the session's research has reached
        a terminal state, otherwise the persisted result from disk. While a
        run is still in progress this returns None: any file on disk at that
        point belongs to an earlier run of the same session and would be
        stale.
        """
        entry = self._active_tasks.get(session_id)
        if entry is not None:
            if entry["status"] in ("done", "error", "cancelled"):
                return entry.get("result")
            if entry["status"] == "running":
                return None
        # Check disk
        data = self._load_persisted(session_id)
        return data.get("result") if data is not None else None

    def get_sources(self, session_id: str) -> Optional[list]:
        """Get deduplicated source list from research findings.

        Order of precedence: sources cached on the in-memory entry, sources
        extracted live from the researcher's findings, then the persisted
        file. While a run is in progress the persisted file is skipped,
        because it can only describe an earlier run of the same session.
        """
        # Check in-memory first
        entry = self._active_tasks.get(session_id)
        if entry is not None:
            if entry.get("sources"):
                return entry["sources"]
            researcher = entry.get("researcher")
            if researcher and researcher.findings:
                return self._extract_sources(researcher.findings)
            if entry.get("status") == "running":
                return None
        # Check disk
        data = self._load_persisted(session_id)
        return data.get("sources") if data is not None else None

    @staticmethod
    def _extract_sources(findings: list) -> list:
        """Extract deduplicated [{url, title}] from findings.

        Findings without a URL are skipped; the first occurrence of each URL
        wins. A missing or empty title falls back to the URL. ``None`` is
        treated as an empty list.
        """
        seen = set()
        sources = []
        for f in findings or []:
            url = f.get("url", "")
            if not url or url in seen:
                continue
            seen.add(url)
            sources.append({"url": url, "title": f.get("title", "") or url})
        return sources

    @staticmethod
    def _result_path(session_id: str) -> Path:
        """Return the JSON file path used to persist a session's result."""
        # NOTE(review): session_id is interpolated into the file name as-is;
        # confirm callers never pass untrusted values containing path parts.
        return RESEARCH_DATA_DIR / f"{session_id}.json"

    def _load_persisted(self, session_id: str) -> Optional[dict]:
        """Load a session's persisted result dict, or None if unavailable.

        Unreadable or malformed files are logged and treated as missing so
        that the public getters stay best-effort.
        """
        path = self._result_path(session_id)
        if not path.exists():
            return None
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers both JSON and Unicode decoding errors.
            logger.warning(f"Could not read persisted research result {path}: {e}")
            return None
        return data if isinstance(data, dict) else None

    def clear_result(self, session_id: str):
        """Remove persisted result after it's been consumed.

        Drops the in-memory entry and deletes the JSON file if present.
        Deletion failures are logged, not raised.
        """
        self._active_tasks.pop(session_id, None)
        path = self._result_path(session_id)
        if path.exists():
            try:
                path.unlink()
            except OSError as e:
                logger.warning(f"Could not remove persisted research result {path}: {e}")

    def _save_result(self, session_id: str, entry: dict):
        """Persist completed research result to disk.

        Also caches the extracted source list on ``entry["sources"]``.
        Best-effort: any failure is logged and swallowed so a persistence
        problem does not turn a finished research run into an error.
        """
        try:
            # Extract and cache sources
            sources = []
            researcher = entry.get("researcher")
            if researcher and researcher.findings:
                sources = self._extract_sources(researcher.findings)
            entry["sources"] = sources

            path = self._result_path(session_id)
            data = {
                "query": entry["query"],
                "status": entry["status"],
                "result": entry["result"],
                "sources": sources,
                "started_at": entry["started_at"],
                "completed_at": time.time(),
            }
            path.write_text(json.dumps(data), encoding="utf-8")
            logger.info(f"Research result saved to {path}")
        except Exception as e:
            logger.error(f"Failed to save research result: {e}")

    async def call_research_service(
        self,
        query: str,
        llm_endpoint: str,
        llm_model: str,
        max_time: int = 300,
        progress_callback=None,
        _task_entry: Optional[dict] = None,
        llm_headers: Optional[dict] = None,
    ) -> str:
        """
        Run iterative deep research using the LLM-in-the-loop DeepResearcher.

        If the DeepResearcher cannot be imported, constructed or run, the
        error is logged and the fallback chain (_fallback_research) is used
        instead, so this coroutine returns a report string rather than
        raising for ordinary failures. Task cancellation still propagates.

        Args:
            query: Research question
            llm_endpoint: LLM endpoint URL for chat completions
            llm_model: Model name/ID
            max_time: Maximum research time in seconds (default 5 minutes)
            progress_callback: Optional callable handed to the DeepResearcher
            _task_entry: Internal - registry entry to store researcher ref
            llm_headers: Optional headers handed to the DeepResearcher

        Returns:
            Formatted research report with expandable section and summary
        """
        logger.info("Starting IterResearch Deep Research")
        logger.info(f"Query: {query}")
        logger.info(f"LLM: {llm_endpoint} / {llm_model}")
        logger.info(f"Max time: {max_time}s")

        try:
            from src.deep_research import DeepResearcher
            from src.settings import get_setting

            researcher = DeepResearcher(
                llm_endpoint=llm_endpoint,
                llm_model=llm_model,
                llm_headers=llm_headers,
                max_rounds=8,
                max_time=max_time,
                max_report_tokens=int(get_setting("research_max_tokens", 8192)),
                progress_callback=progress_callback,
            )
            # Expose the researcher to the registry so it can be cancelled
            # and its findings read while the run is in progress.
            if _task_entry is not None:
                _task_entry["researcher"] = researcher

            start_time = time.time()
            report = await researcher.research(query)
            elapsed = time.time() - start_time

            stats = researcher.get_stats()
            logger.info("IterResearch completed successfully")
            for key, value in stats.items():
                logger.info(f"  {key}: {value}")

            return self._format_research_report(
                query, report, stats, elapsed,
                findings=researcher.findings,
                evolving_report=researcher.evolving_report,
            )

        except Exception as e:
            logger.error(f"DeepResearcher failed: {e}", exc_info=True)
            return await self._fallback_research(query, llm_endpoint, llm_model, max_time, str(e))

    async def _fallback_research(
        self, query: str, llm_endpoint: str, llm_model: str,
        max_time: int, primary_error: str,
    ) -> str:
        """Fall back to legacy engine, then to basic web search.

        ``llm_endpoint`` and ``llm_model`` are accepted for signature
        compatibility; neither fallback uses them.
        """
        # Try legacy orchestrator
        if self._legacy_engine:
            try:
                logger.info("Falling back to legacy ResearchOrchestrator...")
                loop = asyncio.get_running_loop()
                # The legacy stats carry no duration, so time the run here;
                # otherwise the report would always show 0.0s.
                started = time.time()
                # The legacy engine is synchronous: run it in the default
                # executor so the event loop stays responsive.
                result = await loop.run_in_executor(
                    None, self._legacy_engine.start_research, query, max_time
                )
                elapsed = time.time() - started
                stats = self._get_legacy_stats()
                return self._format_research_report(query, result, stats, elapsed)
            except Exception as e:
                logger.error(f"Legacy engine also failed: {e}")

        # Fall back to basic web search
        # NOTE(review): this call is synchronous and runs on the event loop.
        return self._handle_research_failure(query, primary_error)

    def _get_legacy_stats(self) -> dict:
        """Get statistics from the legacy research engine.

        Returns an empty dict when there is no legacy engine or when its
        attributes cannot be read.
        """
        if not self._legacy_engine:
            return {}
        try:
            tracker = self._legacy_engine.progress_tracker
            return {
                "Findings": len(self._legacy_engine.findings),
                "Sources": len(self._legacy_engine.source_reports),
                "Searches": tracker.counters['searches_executed'],
                "URLs": tracker.counters['urls_processed'],
            }
        except Exception:
            return {}

    def _format_research_report(
        self, query: str, full_report: str, stats: dict, elapsed: float,
        findings: Optional[list] = None, evolving_report: Optional[str] = None,
    ) -> str:
        """Format research report with sources list and expandable raw findings.

        Args:
            query: The research question, echoed in the closing line.
            full_report: Report body inserted verbatim.
            stats: Engine statistics. "Rounds"/"Queries" are preferred, with
                the legacy "Findings"/"Searches" keys as fallbacks.
            elapsed: Duration in seconds.
            findings: Optional list of finding dicts; the keys read are
                "url", "title", "summary" and "evidence".
            evolving_report: Accepted for backward compatibility. It is not
                rendered, and no longer forces an empty expandable section
                when there are no findings.

        Returns:
            Markdown text for the report.
        """
        summary_text = " | ".join([
            f"**Duration:** {elapsed:.1f}s",
            f"**Rounds:** {stats.get('Rounds', stats.get('Findings', '?'))}",
            f"**Queries:** {stats.get('Queries', stats.get('Searches', '?'))}",
            f"**URLs Analyzed:** {stats.get('URLs', '?')}",
        ])

        # Build sources list with clickable links (same dedupe rules as
        # get_sources, via the shared helper).
        sources_section = ""
        source_lines = [
            f"- [{s['title']}]({s['url']})" for s in self._extract_sources(findings)
        ]
        if source_lines:
            sources_section = "\n### Sources\n\n" + "\n".join(source_lines) + "\n"

        # Build raw findings section (individual extractions per source)
        parts = []
        for i, f in enumerate(findings or [], 1):
            url = f.get("url", "")
            title = f.get("title", "") or "Untitled"
            summary = f.get("summary", "")
            evidence = f.get("evidence", "")
            content = summary if summary else (evidence[:2000] if evidence else "(no content)")
            # Without a URL a markdown link would render as a dead "[title]()".
            heading = f"[{title}]({url})" if url else title
            parts.append(f"**{i}. {heading}**\n\n{content}")
        raw_findings_section = "\n\n".join(parts)

        # Build expandable collected info section — only when there is
        # something to put inside it.
        collected_section = ""
        if raw_findings_section:
            collected_section = (
                "\n<details>\n<summary><strong>Raw collected findings ({} sources)</strong></summary>\n\n".format(
                    len(findings)
                )
                + raw_findings_section + "\n"
                + "\n</details>\n"
            )

        # The template is written flush-left on purpose: leading whitespace
        # would change how the markdown renders.
        return f"""---

## Research Summary

{summary_text}

---

{full_report}

{sources_section}
{collected_section}
---

**The AI has analyzed all research findings above. Ask me anything about: "{query}"**
"""

    def _format_error_response(self, error_msg: str, query: str) -> str:
        """Format error response in a user-friendly way."""
        return f"""## Research Engine Unavailable

**Query:** {query}

**Error:** {error_msg}

**Please check:**
1. LLM endpoint is reachable
2. SearXNG is running at the configured instance
3. Application logs for detailed error information

**Troubleshooting:**
- Test basic search: Try the web search toggle first
- Check search config: `/api/search/config`
- Review logs for initialization errors
"""

    def _handle_research_failure(self, query: str, error: str) -> str:
        """Handle research failure with fallback to basic search.

        Returns a markdown report containing the basic search results, or a
        "complete failure" report if the basic search fails too. Never raises
        for ordinary exceptions.
        """
        try:
            logger.info("Attempting fallback to basic web search...")
            from src.search import comprehensive_web_search

            search_result = comprehensive_web_search(query)

            return f"""## Research Failed - Basic Search Fallback

**Query:** {query}

**Error:** {error}

**Note:** The deep research engine encountered an error. Here are basic search results instead:

---

### Basic Web Search Results

{search_result}

---

**To fix deep research:**
1. Check that your LLM endpoint and search provider are properly configured
2. Verify network connectivity
3. Review application logs for detailed error information

Try the web search toggle for simpler queries, or fix the research engine for comprehensive analysis.
"""

        except Exception as e2:
            logger.error(f"Fallback search also failed: {e2}", exc_info=True)
            return f"""## Complete Research Failure

**Primary Error:** {error}
**Fallback Error:** {str(e2)}

**Please check:**
1. Search provider configuration in Settings -> Search Settings
2. Network connectivity to search APIs
3. Application logs for detailed error information
4. That SearXNG is running (if using SearXNG)

**Debug Info:**
- Search config endpoint: `/api/search/config`
- Test basic search toggle with a simple query first
"""
|
||||
117
services/research/service.py
Normal file
117
services/research/service.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# services/research/service.py
|
||||
"""Research service — deep research with LLM-in-the-loop."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Callable
|
||||
|
||||
from .research_handler import ResearchHandler
|
||||
|
||||
|
||||
@dataclass
class ResearchSource:
    """One source discovered while researching a topic."""

    # Location of the source.
    url: str
    # Human-readable title of the source.
    title: str
    # Short excerpt associated with the source.
    snippet: str
    # Relevance score; defaults to 0.0 when not provided.
    relevance: float = field(default=0.0)
|
||||
|
||||
|
||||
@dataclass
class ResearchResult:
    """Outcome of one deep research query."""

    # The question that was researched.
    query: str
    # Summary text of the research.
    summary: str
    # Sources backing the result; a fresh empty list per instance.
    sources: List[ResearchSource] = field(default_factory=list)
    # Report sections; a fresh empty list per instance.
    sections: List[str] = field(default_factory=list)
    # Token count attributed to the run; 0 when not reported.
    tokens_used: int = field(default=0)
    # Duration of the run in seconds.
    duration_seconds: float = field(default=0.0)
|
||||
|
||||
|
||||
class ResearchService:
    """
    Deep research service.

    Thin facade over ResearchHandler: ``research`` runs a query to completion
    and returns a structured ResearchResult, while ``start_background`` /
    ``get_status`` / ``cancel`` drive the handler's background task registry.

    Usage:
        service = ResearchService()
        result = await service.research(
            "quantum computing advances 2024",
            llm_endpoint="http://localhost:8000/v1/chat/completions",
            llm_model="my-model",
        )
        print(result.summary)
    """

    def __init__(self):
        self.handler = ResearchHandler()
        self._active: dict = {}

    async def research(
        self,
        topic: str,
        llm_endpoint: str,
        llm_model: str,
        max_time: int = 300,
        on_progress: Optional[Callable[[dict], None]] = None,
        llm_headers: Optional[dict] = None,
    ) -> ResearchResult:
        """
        Perform deep research on a topic.

        Args:
            topic: Research topic/question
            llm_endpoint: LLM API endpoint
            llm_model: Model to use
            max_time: Maximum time in seconds
            on_progress: Optional progress callback
            llm_headers: Optional headers forwarded to the handler

        Returns:
            ResearchResult with findings. When the handler returns its
            formatted report string, that string becomes ``summary`` and
            ``sources`` is built from the researcher's findings; ``sections``
            and ``tokens_used`` are not reported in that case and keep their
            defaults.
        """
        import time
        start = time.time()

        # The handler stores the researcher it creates on this dict, which is
        # the only way to reach the findings behind the formatted report.
        captured: dict = {}
        result = await self.handler.call_research_service(
            topic,
            llm_endpoint,
            llm_model,
            max_time=max_time,
            progress_callback=on_progress,
            _task_entry=captured,
            llm_headers=llm_headers,
        )

        duration = time.time() - start

        if isinstance(result, dict):
            # Structured payload: kept for compatibility with a handler that
            # returns a dict instead of a report string.
            raw_sources = result.get("sources", [])
            summary = result.get("summary", result.get("answer", ""))
            sections = result.get("sections", [])
            tokens_used = result.get("tokens_used", 0)
        else:
            # call_research_service is declared to return the report as a
            # string; calling .get() on it raised AttributeError.
            researcher = captured.get("researcher")
            findings = getattr(researcher, "findings", None) or []
            raw_sources = self._sources_from_findings(findings)
            summary = "" if result is None else str(result)
            sections = []
            tokens_used = 0

        # Parse result into structured format
        sources = [
            ResearchSource(
                url=s.get("url", ""),
                title=s.get("title", ""),
                snippet=s.get("snippet", ""),
                relevance=s.get("relevance", 0.0),
            )
            for s in raw_sources
        ]

        return ResearchResult(
            query=topic,
            summary=summary,
            sources=sources,
            sections=sections,
            tokens_used=tokens_used,
            duration_seconds=duration,
        )

    @staticmethod
    def _sources_from_findings(findings: list) -> list:
        """Turn researcher findings into deduplicated source dicts.

        Findings are assumed to be dicts with "url", "title" and "summary"
        keys, matching how the handler reads them. Findings without a URL
        are skipped and the first occurrence of each URL wins.
        """
        seen = set()
        sources = []
        for f in findings:
            url = f.get("url", "")
            if not url or url in seen:
                continue
            seen.add(url)
            sources.append({
                "url": url,
                "title": f.get("title", "") or url,
                "snippet": f.get("summary", "") or "",
            })
        return sources

    def start_background(
        self,
        session_id: str,
        topic: str,
        llm_endpoint: str,
        llm_model: str,
        max_time: int = 300,
        llm_headers: Optional[dict] = None,
    ) -> dict:
        """Start research in background. Returns task info.

        Must be called while an asyncio event loop is running, because the
        handler schedules the work as an asyncio task.
        """
        return self.handler.start_research(
            session_id, topic, llm_endpoint, llm_model, max_time,
            llm_headers=llm_headers,
        )

    def get_status(self, session_id: str) -> Optional[dict]:
        """Get status of background research."""
        return self.handler.get_status(session_id)

    def cancel(self, session_id: str) -> bool:
        """Cancel background research."""
        return self.handler.cancel_research(session_id)
|
||||
Reference in New Issue
Block a user