Add Deep Research extraction controls

This commit is contained in:
pewdiepie-archdaemon
2026-06-01 14:55:24 +09:00
parent 6872679f31
commit b998c52dd0
8 changed files with 165 additions and 6 deletions

View File

@@ -180,6 +180,8 @@ class DeepResearcher:
max_urls_per_round: int = 3,
max_content_chars: int = 15000,
max_report_tokens: int = 8192,
extraction_timeout: int = 90,
extraction_concurrency: int = 3,
min_rounds: int = 2,
max_empty_rounds: int = 2,
synthesis_window: int = 10,
@@ -197,6 +199,8 @@ class DeepResearcher:
self.max_urls_per_round = max_urls_per_round
self.max_content_chars = max_content_chars
self.max_report_tokens = max_report_tokens
self.extraction_timeout = min(600, max(15, int(extraction_timeout or 90)))
self.extraction_concurrency = min(12, max(1, int(extraction_concurrency or 3)))
self.min_rounds = min_rounds
self.max_empty_rounds = max_empty_rounds
self.synthesis_window = synthesis_window
@@ -492,11 +496,16 @@ class DeepResearcher:
if self._cancelled or self._time_exceeded():
return all_findings
# Fetch and extract all URLs concurrently
extract_tasks = [
self._fetch_and_extract(r["url"], question, r.get("title", ""))
for r in urls_to_fetch
]
# Fetch and extract URLs with backpressure. Local model servers often
# serialize requests behind one GPU; flooding them makes every request
# slower and can trip the extraction timeout.
semaphore = asyncio.Semaphore(self.extraction_concurrency)
async def _bounded_extract(result: Dict) -> Optional[Dict]:
async with semaphore:
return await self._fetch_and_extract(result["url"], question, result.get("title", ""))
extract_tasks = [_bounded_extract(r) for r in urls_to_fetch]
results_gathered = await asyncio.gather(*extract_tasks, return_exceptions=True)
for result in results_gathered:
@@ -576,7 +585,7 @@ class DeepResearcher:
[{"role": "user", "content": prompt}],
temperature=0.2,
max_tokens=2048,
timeout=45,
timeout=self.extraction_timeout,
)
parsed = self._parse_json_object(response)
if parsed: