fix(cookbook): surface backend diagnosis when serve fails in background (#1636)

* refactor(cookbook): move _diagnose_serve_output to module level in cookbook_helpers Extracts the nested _diagnose_serve_output function from inside setup_cookbook_routes() and moves it to module level in cookbook_helpers.py, alongside the other helper functions it logically belongs with. No behaviour change — the function is now importable directly for testing and by other callers without going through the route factory closure. * fix(cookbook): surface backend diagnosis when serve fails in background The background poll (_pollBackgroundStatus) already received `diagnosis` and `cmd` from /api/cookbook/tasks/status but discarded both. When a serve job died while the Cookbook modal was closed, reopening it showed only a red error badge with no context. - Persist live.diagnosis into task._backendDiagnosis in localStorage so it survives modal close/reopen and page refresh - Persist live.cmd into task.payload._cmd for agent-spawned tasks so the crash report includes the actual command - After _renderRunningTab(), walk rendered cards and call _showDiagnosis() for any that have a stored _backendDiagnosis but no panel yet - In _renderTaskCard(), use _backendDiagnosis as a fallback when the client-side _terminalServeDiagnosis() finds nothing * test(cookbook): add coverage for _diagnose_serve_output error patterns 10 tests covering 6 of the 16 serve-failure patterns plus the traceback fallback and the no-match cases: - CUDA OOM, port-in-use, vLLM missing, gated model - Traceback fallback fires without startup success marker - Traceback suppressed when server actually started - Clean/empty output returns None - trust-remote-code and no-GGUF patterns
2026-06-05 05:52:07 -03:00
parent 367858a587
commit f5d834b0c5
4 changed files with 210 additions and 122 deletions
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -804,3 +804,125 @@ def _ssh_ps(host, script_path, port=None):

 # Windows session dir — stored in user's temp on the remote
 WIN_SESSION_DIR = "$env:TEMP\\\\odysseus-sessions"
+
+
+def _diagnose_serve_output(text: str) -> dict | None:
+    """Server-side mirror of the Cookbook UI's common serve diagnoses.
+
+    The browser uses cookbook-diagnosis.js for clickable fixes. This gives
+    the agent/tool path the same structured signal so it can retry with an
+    adjusted command instead of guessing from raw tmux output.
+    """
+    if not text:
+        return None
+    tail = text[-6000:]
+    patterns = [
+        (
+            r"No available memory for the cache blocks|Available KV cache memory:.*-",
+            "No GPU memory left for KV cache after loading model.",
+            [
+                {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
+                {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
+            ],
+        ),
+        (
+            r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
+            "GPU ran out of memory during startup or warmup.",
+            [
+                {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+                {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
+                {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
+            ],
+        ),
+        (
+            r"not divisib|must be divisible|attention heads.*divisible",
+            "Tensor parallel size is incompatible with the model.",
+            [
+                {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
+                {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
+            ],
+        ),
+        (
+            r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
+            "Context length is too large for available GPU memory.",
+            [
+                {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
+                {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+            ],
+        ),
+        (
+            r"enable-auto-tool-choice requires --tool-call-parser",
+            "Auto tool choice requires an explicit tool call parser.",
+            [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
+        ),
+        (
+            r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
+            "Model requires custom code or newer model support.",
+            [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
+        ),
+        (
+            r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
+            "vLLM/Transformers kernel package mismatch.",
+            [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
+        ),
+        (
+            r"Address already in use|bind.*address.*in use",
+            "Port is already in use.",
+            [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
+        ),
+        (
+            r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
+            "No GPUs are visible to the serve process.",
+            [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
+        ),
+        (
+            r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
+            "vLLM could not find a supported GPU (CUDA or ROCm). "
+            "This machine may have integrated or unsupported graphics only.",
+            [
+                {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+                {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+            ],
+        ),
+        (
+            r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
+            "vLLM is not installed or not in PATH on this server.",
+            [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
+        ),
+        (
+            r"sglang.*command not found|No module named sglang|SGLang is not installed",
+            "SGLang is not installed or not in PATH on this server.",
+            [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
+        ),
+        (
+            r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+            "llama.cpp / llama-cpp-python dependencies are missing.",
+            [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
+        ),
+        (
+            r"No GGUF found on this host|no \.gguf file|No GGUF file found",
+            "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
+            [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
+        ),
+        (
+            r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
+            "Diffusion serving requires PyTorch and diffusers.",
+            [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
+        ),
+        (
+            r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
+            "Model access is gated or unauthorized.",
+            [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
+        ),
+    ]
+    for pattern, message, suggestions in patterns:
+        if re.search(pattern, tail, re.I):
+            return {"message": message, "suggestions": suggestions}
+    if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
+        r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
+    ):
+        return {
+            "message": "Python traceback detected during serve startup.",
+            "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
+        }
+    return None
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -40,7 +40,7 @@ from routes.cookbook_helpers import (
    _append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
    _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
    _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
-    ModelDownloadRequest, ServeRequest,
+    ModelDownloadRequest, ServeRequest, _diagnose_serve_output,
 )

 _HF_TOKEN_STATUS_SNIPPET = (
@@ -81,127 +81,6 @@ def setup_cookbook_routes() -> APIRouter:
                    task["payload"].pop("hf_token", None)
        return state

-    def _diagnose_serve_output(text: str) -> dict | None:
-        """Server-side mirror of the Cookbook UI's common serve diagnoses.
-
-        The browser uses cookbook-diagnosis.js for clickable fixes. This gives
-        the agent/tool path the same structured signal so it can retry with an
-        adjusted command instead of guessing from raw tmux output.
-        """
-        if not text:
-            return None
-        tail = text[-6000:]
-        patterns = [
-            (
-                r"No available memory for the cache blocks|Available KV cache memory:.*-",
-                "No GPU memory left for KV cache after loading model.",
-                [
-                    {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
-                    {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
-                ],
-            ),
-            (
-                r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
-                "GPU ran out of memory during startup or warmup.",
-                [
-                    {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
-                    {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
-                    {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
-                ],
-            ),
-            (
-                r"not divisib|must be divisible|attention heads.*divisible",
-                "Tensor parallel size is incompatible with the model.",
-                [
-                    {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
-                    {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
-                ],
-            ),
-            (
-                r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
-                "Context length is too large for available GPU memory.",
-                [
-                    {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
-                    {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
-                ],
-            ),
-            (
-                r"enable-auto-tool-choice requires --tool-call-parser",
-                "Auto tool choice requires an explicit tool call parser.",
-                [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
-            ),
-            (
-                r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
-                "Model requires custom code or newer model support.",
-                [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
-            ),
-            (
-                r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
-                "vLLM/Transformers kernel package mismatch.",
-                [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
-            ),
-            (
-                r"Address already in use|bind.*address.*in use",
-                "Port is already in use.",
-                [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
-            ),
-            (
-                r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
-                "No GPUs are visible to the serve process.",
-                [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
-            ),
-            (
-                r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
-                "vLLM could not find a supported GPU (CUDA or ROCm). "
-                "This machine may have integrated or unsupported graphics only.",
-                [
-                    {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
-                    {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
-                ],
-            ),
-            (
-                r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
-                "vLLM is not installed or not in PATH on this server.",
-                [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
-            ),
-            (
-                r"sglang.*command not found|No module named sglang|SGLang is not installed",
-                "SGLang is not installed or not in PATH on this server.",
-                [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
-            ),
-            (
-                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
-                "llama.cpp / llama-cpp-python dependencies are missing.",
-                [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
-            ),
-            (
-                r"No GGUF found on this host|no \.gguf file|No GGUF file found",
-                "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
-                [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
-            ),
-            (
-                r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
-                "Diffusion serving requires PyTorch and diffusers.",
-                [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
-            ),
-            (
-                r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
-                "Model access is gated or unauthorized.",
-                [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
-            ),
-        ]
-        for pattern, message, suggestions in patterns:
-            if re.search(pattern, tail, re.I):
-                return {"message": message, "suggestions": suggestions}
-        if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
-            r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
-        ):
-            return {
-                "message": "Python traceback detected during serve startup.",
-                "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
-            }
-        return None
-
    def _state_for_client(state):
        """Return cookbook state without raw secrets for browser clients."""
        _strip_task_secrets(state)
--- a/static/js/cookbookRunning.js
+++ b/static/js/cookbookRunning.js
@@ -1900,6 +1900,9 @@ export function _renderRunningTab() {

    const terminalDiag = _terminalServeDiagnosis(task, task.output || '');
    if (terminalDiag) _showDiagnosis(el, terminalDiag, task.output || '');
+    if (!terminalDiag && (task.status === 'error' || task.status === 'crashed') && task._backendDiagnosis) {
+      _showDiagnosis(el, task._backendDiagnosis, task.output || '');
+    }

    const _uptimeEl = el.querySelector('.cookbook-task-uptime');
    if (_uptimeEl && (task.type === 'serve' || task.type === 'download') && task.status === 'running') {
@@ -3515,6 +3518,12 @@ async function _pollBackgroundStatus() {
            updates.output = `${previous ? `${previous}\n` : ''}${tail}`.slice(-5000);
          }
        }
+        if (live.diagnosis && !task._diagnosisDismissed) {
+          updates._backendDiagnosis = live.diagnosis;
+        }
+        if (live.cmd && !task.payload?._cmd) {
+          updates.payload = { ...(task.payload || {}), _cmd: live.cmd };
+        }
        if (Object.keys(updates).length) {
          Object.assign(task, updates);
          changed = true;
@@ -3523,6 +3532,12 @@ async function _pollBackgroundStatus() {
      if (changed) {
        _saveTasks(localTasks);
        _renderRunningTab();
+        for (const task of localTasks) {
+          if (!task._backendDiagnosis) continue;
+          const el = document.querySelector(`[data-session-id="${CSS.escape(task.sessionId)}"]`);
+          if (!el || el.querySelector('.cookbook-diagnosis')) continue;
+          _showDiagnosis(el, task._backendDiagnosis, task.output || '');
+        }
        completedDeps.forEach(t => _refreshDepsAfterInstall(t));
      }
    } catch (_) { /* non-fatal: background status should never break polling */ }
--- a/tests/test_cookbook_error_feedback.py
+++ b/tests/test_cookbook_error_feedback.py
@@ -0,0 +1,72 @@
+from routes.cookbook_helpers import _diagnose_serve_output
+
+
+def test_cuda_oom_returns_diagnosis():
+    out = "torch.cuda.OutOfMemoryError: CUDA out of memory."
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "memory" in result["message"].lower()
+    assert any(s["op"] == "replace" for s in result["suggestions"])
+
+
+def test_port_in_use_returns_diagnosis():
+    out = "OSError: [Errno 98] Address already in use"
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "port" in result["message"].lower()
+    assert result["suggestions"][0]["flag"] == "--port"
+
+
+def test_vllm_not_installed_returns_diagnosis():
+    out = "No module named vllm"
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "vLLM" in result["message"]
+    assert result["suggestions"][0]["package"] == "vllm"
+
+
+def test_gated_model_returns_diagnosis():
+    out = "403 Forbidden\nAccess to model is restricted"
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "gated" in result["message"].lower() or "unauthorized" in result["message"].lower()
+
+
+def test_traceback_fallback_fires_without_startup_success():
+    out = "Traceback (most recent call last):\n  File 'serve.py', line 1\nRuntimeError: bad config"
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "traceback" in result["message"].lower()
+
+
+def test_traceback_suppressed_when_server_started():
+    out = (
+        "Traceback (most recent call last):\n  File 'x.py'\nValueError: ...\n"
+        "Application startup complete."
+    )
+    result = _diagnose_serve_output(out)
+    assert result is None
+
+
+def test_clean_output_returns_none():
+    out = "INFO: Application startup complete.\nINFO: Uvicorn running on http://0.0.0.0:8000"
+    assert _diagnose_serve_output(out) is None
+
+
+def test_empty_input_returns_none():
+    assert _diagnose_serve_output("") is None
+    assert _diagnose_serve_output(None) is None
+
+
+def test_trust_remote_code_pattern():
+    out = "Please pass trust_remote_code=True when loading this model."
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "--trust-remote-code" in result["suggestions"][0]["arg"]
+
+
+def test_no_gguf_found_pattern():
+    out = "No GGUF found on this host for model qwen/qwen2-7b"
+    result = _diagnose_serve_output(out)
+    assert result is not None
+    assert "GGUF" in result["message"]