fix(cookbook): surface backend diagnosis when serve fails in background (#1636)
* refactor(cookbook): move _diagnose_serve_output to module level in cookbook_helpers

  Extracts the nested _diagnose_serve_output function from inside setup_cookbook_routes() and moves it to module level in cookbook_helpers.py, alongside the other helper functions it logically belongs with. No behaviour change — the function is now importable directly for testing and by other callers without going through the route factory closure.

* fix(cookbook): surface backend diagnosis when serve fails in background

  The background poll (_pollBackgroundStatus) already received `diagnosis` and `cmd` from /api/cookbook/tasks/status but discarded both. When a serve job died while the Cookbook modal was closed, reopening it showed only a red error badge with no context.

  - Persist live.diagnosis into task._backendDiagnosis in localStorage so it survives modal close/reopen and page refresh
  - Persist live.cmd into task.payload._cmd for agent-spawned tasks so the crash report includes the actual command
  - After _renderRunningTab(), walk rendered cards and call _showDiagnosis() for any that have a stored _backendDiagnosis but no panel yet
  - In _renderTaskCard(), use _backendDiagnosis as a fallback when the client-side _terminalServeDiagnosis() finds nothing

* test(cookbook): add coverage for _diagnose_serve_output error patterns

  10 tests covering a representative subset of the 16 serve-failure patterns plus the traceback fallback:
  - CUDA OOM, port-in-use, vLLM missing, gated model
  - Traceback fallback fires without startup success marker
  - Traceback suppressed when server actually started
  - Clean/empty output returns None
  - trust-remote-code and no-GGUF patterns
This commit is contained in:
@@ -804,3 +804,125 @@ def _ssh_ps(host, script_path, port=None):
|
||||
|
||||
# Windows session dir — stored in user's temp on the remote
|
||||
WIN_SESSION_DIR = "$env:TEMP\\\\odysseus-sessions"
|
||||
|
||||
|
||||
def _diagnose_serve_output(text: str) -> dict | None:
|
||||
"""Server-side mirror of the Cookbook UI's common serve diagnoses.
|
||||
|
||||
The browser uses cookbook-diagnosis.js for clickable fixes. This gives
|
||||
the agent/tool path the same structured signal so it can retry with an
|
||||
adjusted command instead of guessing from raw tmux output.
|
||||
"""
|
||||
if not text:
|
||||
return None
|
||||
tail = text[-6000:]
|
||||
patterns = [
|
||||
(
|
||||
r"No available memory for the cache blocks|Available KV cache memory:.*-",
|
||||
"No GPU memory left for KV cache after loading model.",
|
||||
[
|
||||
{"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
|
||||
{"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
|
||||
"GPU ran out of memory during startup or warmup.",
|
||||
[
|
||||
{"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
|
||||
{"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
|
||||
{"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"not divisib|must be divisible|attention heads.*divisible",
|
||||
"Tensor parallel size is incompatible with the model.",
|
||||
[
|
||||
{"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
|
||||
{"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
|
||||
"Context length is too large for available GPU memory.",
|
||||
[
|
||||
{"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
|
||||
{"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"enable-auto-tool-choice requires --tool-call-parser",
|
||||
"Auto tool choice requires an explicit tool call parser.",
|
||||
[{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
|
||||
),
|
||||
(
|
||||
r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
|
||||
"Model requires custom code or newer model support.",
|
||||
[{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
|
||||
),
|
||||
(
|
||||
r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
|
||||
"vLLM/Transformers kernel package mismatch.",
|
||||
[{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
|
||||
),
|
||||
(
|
||||
r"Address already in use|bind.*address.*in use",
|
||||
"Port is already in use.",
|
||||
[{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
|
||||
),
|
||||
(
|
||||
r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
|
||||
"No GPUs are visible to the serve process.",
|
||||
[{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
|
||||
),
|
||||
(
|
||||
r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
|
||||
"vLLM could not find a supported GPU (CUDA or ROCm). "
|
||||
"This machine may have integrated or unsupported graphics only.",
|
||||
[
|
||||
{"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
|
||||
{"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
|
||||
"vLLM is not installed or not in PATH on this server.",
|
||||
[{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
|
||||
),
|
||||
(
|
||||
r"sglang.*command not found|No module named sglang|SGLang is not installed",
|
||||
"SGLang is not installed or not in PATH on this server.",
|
||||
[{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
|
||||
),
|
||||
(
|
||||
r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
|
||||
"llama.cpp / llama-cpp-python dependencies are missing.",
|
||||
[{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
|
||||
),
|
||||
(
|
||||
r"No GGUF found on this host|no \.gguf file|No GGUF file found",
|
||||
"No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
|
||||
[{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
|
||||
),
|
||||
(
|
||||
r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
|
||||
"Diffusion serving requires PyTorch and diffusers.",
|
||||
[{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
|
||||
),
|
||||
(
|
||||
r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
|
||||
"Model access is gated or unauthorized.",
|
||||
[{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
|
||||
),
|
||||
]
|
||||
for pattern, message, suggestions in patterns:
|
||||
if re.search(pattern, tail, re.I):
|
||||
return {"message": message, "suggestions": suggestions}
|
||||
if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
|
||||
r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
|
||||
):
|
||||
return {
|
||||
"message": "Python traceback detected during serve startup.",
|
||||
"suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
|
||||
}
|
||||
return None
|
||||
|
||||
@@ -40,7 +40,7 @@ from routes.cookbook_helpers import (
|
||||
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
|
||||
_ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
|
||||
_user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
|
||||
ModelDownloadRequest, ServeRequest,
|
||||
ModelDownloadRequest, ServeRequest, _diagnose_serve_output,
|
||||
)
|
||||
|
||||
_HF_TOKEN_STATUS_SNIPPET = (
|
||||
@@ -81,127 +81,6 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
task["payload"].pop("hf_token", None)
|
||||
return state
|
||||
|
||||
def _diagnose_serve_output(text: str) -> dict | None:
|
||||
"""Server-side mirror of the Cookbook UI's common serve diagnoses.
|
||||
|
||||
The browser uses cookbook-diagnosis.js for clickable fixes. This gives
|
||||
the agent/tool path the same structured signal so it can retry with an
|
||||
adjusted command instead of guessing from raw tmux output.
|
||||
"""
|
||||
if not text:
|
||||
return None
|
||||
tail = text[-6000:]
|
||||
patterns = [
|
||||
(
|
||||
r"No available memory for the cache blocks|Available KV cache memory:.*-",
|
||||
"No GPU memory left for KV cache after loading model.",
|
||||
[
|
||||
{"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
|
||||
{"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
|
||||
"GPU ran out of memory during startup or warmup.",
|
||||
[
|
||||
{"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
|
||||
{"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
|
||||
{"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"not divisib|must be divisible|attention heads.*divisible",
|
||||
"Tensor parallel size is incompatible with the model.",
|
||||
[
|
||||
{"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
|
||||
{"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
|
||||
"Context length is too large for available GPU memory.",
|
||||
[
|
||||
{"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
|
||||
{"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"enable-auto-tool-choice requires --tool-call-parser",
|
||||
"Auto tool choice requires an explicit tool call parser.",
|
||||
[{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
|
||||
),
|
||||
(
|
||||
r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
|
||||
"Model requires custom code or newer model support.",
|
||||
[{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
|
||||
),
|
||||
(
|
||||
r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
|
||||
"vLLM/Transformers kernel package mismatch.",
|
||||
[{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
|
||||
),
|
||||
(
|
||||
r"Address already in use|bind.*address.*in use",
|
||||
"Port is already in use.",
|
||||
[{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
|
||||
),
|
||||
(
|
||||
r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
|
||||
"No GPUs are visible to the serve process.",
|
||||
[{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
|
||||
),
|
||||
(
|
||||
r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
|
||||
"vLLM could not find a supported GPU (CUDA or ROCm). "
|
||||
"This machine may have integrated or unsupported graphics only.",
|
||||
[
|
||||
{"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
|
||||
{"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
|
||||
"vLLM is not installed or not in PATH on this server.",
|
||||
[{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
|
||||
),
|
||||
(
|
||||
r"sglang.*command not found|No module named sglang|SGLang is not installed",
|
||||
"SGLang is not installed or not in PATH on this server.",
|
||||
[{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
|
||||
),
|
||||
(
|
||||
r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
|
||||
"llama.cpp / llama-cpp-python dependencies are missing.",
|
||||
[{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
|
||||
),
|
||||
(
|
||||
r"No GGUF found on this host|no \.gguf file|No GGUF file found",
|
||||
"No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
|
||||
[{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
|
||||
),
|
||||
(
|
||||
r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
|
||||
"Diffusion serving requires PyTorch and diffusers.",
|
||||
[{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
|
||||
),
|
||||
(
|
||||
r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
|
||||
"Model access is gated or unauthorized.",
|
||||
[{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
|
||||
),
|
||||
]
|
||||
for pattern, message, suggestions in patterns:
|
||||
if re.search(pattern, tail, re.I):
|
||||
return {"message": message, "suggestions": suggestions}
|
||||
if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
|
||||
r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
|
||||
):
|
||||
return {
|
||||
"message": "Python traceback detected during serve startup.",
|
||||
"suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
|
||||
}
|
||||
return None
|
||||
|
||||
def _state_for_client(state):
|
||||
"""Return cookbook state without raw secrets for browser clients."""
|
||||
_strip_task_secrets(state)
|
||||
|
||||
@@ -1900,6 +1900,9 @@ export function _renderRunningTab() {
|
||||
|
||||
const terminalDiag = _terminalServeDiagnosis(task, task.output || '');
|
||||
if (terminalDiag) _showDiagnosis(el, terminalDiag, task.output || '');
|
||||
if (!terminalDiag && (task.status === 'error' || task.status === 'crashed') && task._backendDiagnosis) {
|
||||
_showDiagnosis(el, task._backendDiagnosis, task.output || '');
|
||||
}
|
||||
|
||||
const _uptimeEl = el.querySelector('.cookbook-task-uptime');
|
||||
if (_uptimeEl && (task.type === 'serve' || task.type === 'download') && task.status === 'running') {
|
||||
@@ -3515,6 +3518,12 @@ async function _pollBackgroundStatus() {
|
||||
updates.output = `${previous ? `${previous}\n` : ''}${tail}`.slice(-5000);
|
||||
}
|
||||
}
|
||||
if (live.diagnosis && !task._diagnosisDismissed) {
|
||||
updates._backendDiagnosis = live.diagnosis;
|
||||
}
|
||||
if (live.cmd && !task.payload?._cmd) {
|
||||
updates.payload = { ...(task.payload || {}), _cmd: live.cmd };
|
||||
}
|
||||
if (Object.keys(updates).length) {
|
||||
Object.assign(task, updates);
|
||||
changed = true;
|
||||
@@ -3523,6 +3532,12 @@ async function _pollBackgroundStatus() {
|
||||
if (changed) {
|
||||
_saveTasks(localTasks);
|
||||
_renderRunningTab();
|
||||
for (const task of localTasks) {
|
||||
if (!task._backendDiagnosis) continue;
|
||||
const el = document.querySelector(`[data-session-id="${CSS.escape(task.sessionId)}"]`);
|
||||
if (!el || el.querySelector('.cookbook-diagnosis')) continue;
|
||||
_showDiagnosis(el, task._backendDiagnosis, task.output || '');
|
||||
}
|
||||
completedDeps.forEach(t => _refreshDepsAfterInstall(t));
|
||||
}
|
||||
} catch (_) { /* non-fatal: background status should never break polling */ }
|
||||
|
||||
72
tests/test_cookbook_error_feedback.py
Normal file
72
tests/test_cookbook_error_feedback.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from routes.cookbook_helpers import _diagnose_serve_output
|
||||
|
||||
|
||||
def test_cuda_oom_returns_diagnosis():
    """CUDA OOM output yields a memory diagnosis offering a flag replacement."""
    diagnosis = _diagnose_serve_output("torch.cuda.OutOfMemoryError: CUDA out of memory.")
    assert diagnosis is not None
    message = diagnosis["message"].lower()
    assert "memory" in message
    # At least one suggested fix must rewrite an existing flag value.
    assert "replace" in (fix["op"] for fix in diagnosis["suggestions"])
|
||||
|
||||
|
||||
def test_port_in_use_returns_diagnosis():
    """A bind failure is reported as a port conflict with a --port fix first."""
    diagnosis = _diagnose_serve_output("OSError: [Errno 98] Address already in use")
    assert diagnosis is not None
    assert "port" in diagnosis["message"].lower()
    first_fix = diagnosis["suggestions"][0]
    assert first_fix["flag"] == "--port"
|
||||
|
||||
|
||||
def test_vllm_not_installed_returns_diagnosis():
    """A missing vllm module maps to the vLLM dependency-install suggestion."""
    diagnosis = _diagnose_serve_output("No module named vllm")
    assert diagnosis is not None
    assert "vLLM" in diagnosis["message"]
    first_fix = diagnosis["suggestions"][0]
    assert first_fix["package"] == "vllm"
|
||||
|
||||
|
||||
def test_gated_model_returns_diagnosis():
    """HTTP 403 / restricted-access output is diagnosed as gated or unauthorized."""
    diagnosis = _diagnose_serve_output("403 Forbidden\nAccess to model is restricted")
    assert diagnosis is not None
    message = diagnosis["message"].lower()
    assert any(word in message for word in ("gated", "unauthorized"))
|
||||
|
||||
|
||||
def test_traceback_fallback_fires_without_startup_success():
    """An unrecognised traceback with no success marker gets the generic diagnosis."""
    serve_log = "Traceback (most recent call last):\n File 'serve.py', line 1\nRuntimeError: bad config"
    diagnosis = _diagnose_serve_output(serve_log)
    assert diagnosis is not None
    message = diagnosis["message"].lower()
    assert "traceback" in message
|
||||
|
||||
|
||||
def test_traceback_suppressed_when_server_started():
    """A traceback followed by a startup-success marker produces no diagnosis."""
    serve_log = "".join(
        [
            "Traceback (most recent call last):\n File 'x.py'\nValueError: ...\n",
            "Application startup complete.",
        ]
    )
    assert _diagnose_serve_output(serve_log) is None
|
||||
|
||||
|
||||
def test_clean_output_returns_none():
    """Healthy startup logs produce no diagnosis."""
    healthy_log = "INFO: Application startup complete.\nINFO: Uvicorn running on http://0.0.0.0:8000"
    diagnosis = _diagnose_serve_output(healthy_log)
    assert diagnosis is None
|
||||
|
||||
|
||||
def test_empty_input_returns_none():
    """Empty string and None both short-circuit to no diagnosis."""
    empty_result = _diagnose_serve_output("")
    assert empty_result is None
    none_result = _diagnose_serve_output(None)
    assert none_result is None
|
||||
|
||||
|
||||
def test_trust_remote_code_pattern():
    """The trust_remote_code hint maps to appending --trust-remote-code."""
    diagnosis = _diagnose_serve_output("Please pass trust_remote_code=True when loading this model.")
    assert diagnosis is not None
    first_fix = diagnosis["suggestions"][0]
    assert "--trust-remote-code" in first_fix["arg"]
|
||||
|
||||
|
||||
def test_no_gguf_found_pattern():
    """A missing-GGUF message is surfaced with a GGUF-specific diagnosis."""
    diagnosis = _diagnose_serve_output("No GGUF found on this host for model qwen/qwen2-7b")
    assert diagnosis is not None
    message = diagnosis["message"]
    assert "GGUF" in message
|
||||
Reference in New Issue
Block a user