From ec43ba83dd1a624af37f8532dcbc9b05a6b7c2b9 Mon Sep 17 00:00:00 2001 From: LittleLlama <72672345+Ninjayeti@users.noreply.github.com> Date: Sun, 31 May 2026 22:23:19 -0700 Subject: [PATCH] Fix NPX MCP server crash (skip if not installed, alternative shape to #242 / #252) (#253) * Fix NPX MCP server crash by checking install state instead of timing out When @playwright/mcp (or any future npx-based built-in server) isn't already cached, npx tries to download and install it on first invoke. That can take minutes or hang on a fresh install missing Playwright system deps. The previous code bounded that wait with asyncio.wait_for(mcp_manager.connect_server(...), timeout=30), but the cancellation that wait_for fires on timeout propagates into mcp.client.stdio.stdio_client's internal anyio task group, which raises: RuntimeError: Attempted to exit cancel scope in a different task than it was entered in The error fires in a sibling background task (Task exception was never retrieved) so the surrounding try/except BaseException doesn't catch it, and the orphaned cancel scope cascades cancellations into other tasks in the same event loop. Running requests start failing and the process needs a restart. Fix: detect whether the package is already cached before invoking connect_server, instead of trying to bound the connect with a timeout. A new _is_npx_package_cached helper runs: npx --no-install --version The --no-install flag makes npx fail fast on a cache miss instead of downloading, so the probe returns in <500ms either way. If the package isn't cached, we log a warning with the exact command the user can run to install it, and skip the server. If it is cached, we call connect_server normally with no wait_for wrapper, so there's no cancellation that could enter stdio_client's task group. This removes the entire bug class instead of papering over it. No asyncio.wait_for around stdio_client, no shielded-task leak, no shutdown-time RuntimeError. Verified against current versions (mcp library on Python 3.14, anyio 4.13.0) with the existing @playwright/mcp@latest cached, and with a deliberately uncached package spec to exercise the skip path. * Make first-run setup explicit when NPX MCP package isn't cached Per @pewdiepie-archdaemon review on #253: - src/builtin_mcp.py: expand the skip-server warning into a multi-line block with Reason/Impact/Fix/Notes lines, so the message stands out in startup logs and clearly tells the user what to run. - README.md: add 'Built-in MCP servers (optional setup)' subsection under Configuration, with the install command and a brief note that it's optional and skipped if not cached. --- README.md | 12 ++++++ src/builtin_mcp.py | 91 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 91 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4c6042b..c895976 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,18 @@ Docker Compose includes these by default. The bundled service ports bind to `127 ### Optional external services - **Ollama** → local LLM server -- [ollama.ai](https://ollama.ai) +### Built-in MCP servers (optional setup) + +Odysseus auto-registers a few built-in MCP servers at startup. The npx-based ones (currently the browser server, `@playwright/mcp`) only start when their npm package is already in the local npx cache. If a package isn't cached, that server is skipped with a startup log message explaining what to do, so a fresh install does not block on a multi-minute npm download or hang if Playwright system deps are missing. + +To enable the browser MCP (page navigation, screenshots, vision), run once: + +```bash +npx -y @playwright/mcp@latest --version +``` + +That installs `@playwright/mcp` plus Playwright (~300MB total). Restart Odysseus and the server will register at startup. + ### Ollama with Docker If Odysseus is running in Docker and Ollama is running on the host, add the endpoint in Settings as: diff --git a/src/builtin_mcp.py b/src/builtin_mcp.py index 14e2851..c570044 100644 --- a/src/builtin_mcp.py +++ b/src/builtin_mcp.py @@ -108,27 +108,94 @@ async def register_builtin_servers(mcp_manager): async def _start_npx_servers(): await asyncio.sleep(3) # let Python servers finish first for server_id, cfg in _BUILTIN_NPX_SERVERS.items(): + # Skip the server if its npx package isn't cached. Without this + # check, npx would try to download/install the package on first + # use, which can take minutes (or hang) on fresh installs without + # Playwright system deps. Wrapping that in asyncio.wait_for to + # bound the wait sounds reasonable, but mcp.client.stdio uses an + # internal anyio task group that can't survive the resulting + # cross-task cancellation: it raises "Attempted to exit cancel + # scope in a different task than it was entered in" in a sibling + # task, which cascades cancellations into the rest of the event + # loop and downs the app. Detecting installed-state up-front lets + # us bail with a useful warning before we ever touch stdio_client. + args = cfg["args"] + pkg_spec = _npx_package_from_args(args) + if pkg_spec and not await _is_npx_package_cached(npx_path, pkg_spec): + logger.warning( + f"{cfg['name']} is not available.\n" + f" Reason: npm package {pkg_spec!r} is not installed in the npx cache.\n" + f" Impact: tools provided by this MCP server will be unavailable.\n" + f" Fix: {os.path.basename(npx_path)} -y {pkg_spec} --version\n" + f" (run once, then restart Odysseus)\n" + f" Notes: this server is optional; see README.md " + f"'Built-in MCP servers' for details." + ) + continue + + logger.info(f"Starting NPX server: {cfg['name']} ({npx_path} {' '.join(args)})") try: - logger.info(f"Starting NPX server: {cfg['name']} ({npx_path} {' '.join(cfg['args'])})") - ok = await asyncio.wait_for( - mcp_manager.connect_server( - server_id=server_id, - name=cfg["name"], - transport="stdio", - command=npx_path, - args=cfg["args"], - ), - timeout=30, + ok = await mcp_manager.connect_server( + server_id=server_id, + name=cfg["name"], + transport="stdio", + command=npx_path, + args=args, ) if ok: logger.info(f"Built-in NPX server registered: {cfg['name']}") else: logger.warning(f"Built-in NPX server failed to connect: {cfg['name']}") - except asyncio.TimeoutError: - logger.warning(f"Built-in NPX server timed out: {cfg['name']}") except asyncio.CancelledError: raise except BaseException as e: logger.warning(f"Built-in NPX server {cfg['name']} error: {type(e).__name__}: {e}") asyncio.create_task(_start_npx_servers()) + + +def _npx_package_from_args(args): + """Pick the package spec out of an npx args list shaped like + ['-y', '', ...flags]. Returns None if the + convention doesn't match (we then skip the cache check and just + try the connect).""" + if not args: + return None + if "-y" in args: + idx = args.index("-y") + 1 + if idx < len(args) and not args[idx].startswith("-"): + return args[idx] + # No -y prefix: first non-flag arg is the package + for a in args: + if not a.startswith("-"): + return a + return None + + +async def _is_npx_package_cached(npx_path, package_spec, timeout_s=5): + """Probe whether an npx package is already in the local cache. + + Runs `npx --no-install --version`. --no-install tells npx to + fail instead of downloading, so a cache miss returns fast. We treat + "exited 0 with non-empty stdout" as proof of a working cached copy. + Anything else (non-zero exit, empty stdout, timeout, missing npx, + network error) means we should skip the server. + """ + try: + proc = await asyncio.create_subprocess_exec( + npx_path, "--no-install", package_spec, "--version", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + except (OSError, ValueError): + return False + try: + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout_s) + except asyncio.TimeoutError: + try: + proc.kill() + await proc.wait() + except Exception: + pass + return False + return proc.returncode == 0 and bool(stdout.strip())