diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index 454c67b..1748bbb 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -804,3 +804,125 @@ def _ssh_ps(host, script_path, port=None): # Windows session dir — stored in user's temp on the remote WIN_SESSION_DIR = "$env:TEMP\\\\odysseus-sessions" + + +def _diagnose_serve_output(text: str) -> dict | None: + """Server-side mirror of the Cookbook UI's common serve diagnoses. + + The browser uses cookbook-diagnosis.js for clickable fixes. This gives + the agent/tool path the same structured signal so it can retry with an + adjusted command instead of guessing from raw tmux output. + """ + if not text: + return None + tail = text[-6000:] + patterns = [ + ( + r"No available memory for the cache blocks|Available KV cache memory:.*-", + "No GPU memory left for KV cache after loading model.", + [ + {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"}, + {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"}, + ], + ), + ( + r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization", + "GPU ran out of memory during startup or warmup.", + [ + {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"}, + {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"}, + {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"}, + ], + ), + ( + r"not divisib|must be divisible|attention heads.*divisible", + "Tensor parallel size is incompatible with the model.", + [ + {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"}, + {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"}, + ], + ), + ( + r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context", + "Context length is too large for available GPU memory.", + [ + {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"}, + {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"}, + ], + ), + ( + r"enable-auto-tool-choice requires --tool-call-parser", + "Auto tool choice requires an explicit tool call parser.", + [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}], + ), + ( + r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not", + "Model requires custom code or newer model support.", + [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}], + ), + ( + r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer", + "vLLM/Transformers kernel package mismatch.", + [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}], + ), + ( + r"Address already in use|bind.*address.*in use", + "Port is already in use.", + [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}], + ), + ( + r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid", + "No GPUs are visible to the serve process.", + [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}], + ), + ( + r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available", + "vLLM could not find a supported GPU (CUDA or ROCm). " + "This machine may have integrated or unsupported graphics only.", + [ + {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"}, + {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"}, + ], + ), + ( + r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed", + "vLLM is not installed or not in PATH on this server.", + [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}], + ), + ( + r"sglang.*command not found|No module named sglang|SGLang is not installed", + "SGLang is not installed or not in PATH on this server.", + [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], + ), + ( + r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", + "llama.cpp / llama-cpp-python dependencies are missing.", + [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"No GGUF found on this host|no \.gguf file|No GGUF file found", + "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.", + [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}], + ), + ( + r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers", + "Diffusion serving requires PyTorch and diffusers.", + [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}], + ), + ( + r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review", + "Model access is gated or unauthorized.", + [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}], + ), + ] + for pattern, message, suggestions in patterns: + if re.search(pattern, tail, re.I): + return {"message": message, "suggestions": suggestions} + if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search( + r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I + ): + return { + "message": "Python traceback detected during serve startup.", + "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}], + } + return None diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index bf2365b..f25c7d7 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -40,7 +40,7 @@ from routes.cookbook_helpers import ( _append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script, _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache, _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd, - ModelDownloadRequest, ServeRequest, + ModelDownloadRequest, ServeRequest, _diagnose_serve_output, ) _HF_TOKEN_STATUS_SNIPPET = ( @@ -81,127 +81,6 @@ def setup_cookbook_routes() -> APIRouter: task["payload"].pop("hf_token", None) return state - def _diagnose_serve_output(text: str) -> dict | None: - """Server-side mirror of the Cookbook UI's common serve diagnoses. - - The browser uses cookbook-diagnosis.js for clickable fixes. This gives - the agent/tool path the same structured signal so it can retry with an - adjusted command instead of guessing from raw tmux output. - """ - if not text: - return None - tail = text[-6000:] - patterns = [ - ( - r"No available memory for the cache blocks|Available KV cache memory:.*-", - "No GPU memory left for KV cache after loading model.", - [ - {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"}, - {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"}, - ], - ), - ( - r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization", - "GPU ran out of memory during startup or warmup.", - [ - {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"}, - {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"}, - {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"}, - ], - ), - ( - r"not divisib|must be divisible|attention heads.*divisible", - "Tensor parallel size is incompatible with the model.", - [ - {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"}, - {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"}, - ], - ), - ( - r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context", - "Context length is too large for available GPU memory.", - [ - {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"}, - {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"}, - ], - ), - ( - r"enable-auto-tool-choice requires --tool-call-parser", - "Auto tool choice requires an explicit tool call parser.", - [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}], - ), - ( - r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not", - "Model requires custom code or newer model support.", - [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}], - ), - ( - r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer", - "vLLM/Transformers kernel package mismatch.", - [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}], - ), - ( - r"Address already in use|bind.*address.*in use", - "Port is already in use.", - [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}], - ), - ( - r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid", - "No GPUs are visible to the serve process.", - [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}], - ), - ( - r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available", - "vLLM could not find a supported GPU (CUDA or ROCm). " - "This machine may have integrated or unsupported graphics only.", - [ - {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"}, - {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"}, - ], - ), - ( - r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed", - "vLLM is not installed or not in PATH on this server.", - [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}], - ), - ( - r"sglang.*command not found|No module named sglang|SGLang is not installed", - "SGLang is not installed or not in PATH on this server.", - [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], - ), - ( - r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", - "llama.cpp / llama-cpp-python dependencies are missing.", - [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], - ), - ( - r"No GGUF found on this host|no \.gguf file|No GGUF file found", - "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.", - [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}], - ), - ( - r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers", - "Diffusion serving requires PyTorch and diffusers.", - [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}], - ), - ( - r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review", - "Model access is gated or unauthorized.", - [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}], - ), - ] - for pattern, message, suggestions in patterns: - if re.search(pattern, tail, re.I): - return {"message": message, "suggestions": suggestions} - if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search( - r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I - ): - return { - "message": "Python traceback detected during serve startup.", - "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}], - } - return None - def _state_for_client(state): """Return cookbook state without raw secrets for browser clients.""" _strip_task_secrets(state) diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index 7f3cedd..30d78f8 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -1900,6 +1900,9 @@ export function _renderRunningTab() { const terminalDiag = _terminalServeDiagnosis(task, task.output || ''); if (terminalDiag) _showDiagnosis(el, terminalDiag, task.output || ''); + if (!terminalDiag && (task.status === 'error' || task.status === 'crashed') && task._backendDiagnosis) { + _showDiagnosis(el, task._backendDiagnosis, task.output || ''); + } const _uptimeEl = el.querySelector('.cookbook-task-uptime'); if (_uptimeEl && (task.type === 'serve' || task.type === 'download') && task.status === 'running') { @@ -3515,6 +3518,12 @@ async function _pollBackgroundStatus() { updates.output = `${previous ? `${previous}\n` : ''}${tail}`.slice(-5000); } } + if (live.diagnosis && !task._diagnosisDismissed) { + updates._backendDiagnosis = live.diagnosis; + } + if (live.cmd && !task.payload?._cmd) { + updates.payload = { ...(task.payload || {}), _cmd: live.cmd }; + } if (Object.keys(updates).length) { Object.assign(task, updates); changed = true; @@ -3523,6 +3532,12 @@ async function _pollBackgroundStatus() { if (changed) { _saveTasks(localTasks); _renderRunningTab(); + for (const task of localTasks) { + if (!task._backendDiagnosis) continue; + const el = document.querySelector(`[data-session-id="${CSS.escape(task.sessionId)}"]`); + if (!el || el.querySelector('.cookbook-diagnosis')) continue; + _showDiagnosis(el, task._backendDiagnosis, task.output || ''); + } completedDeps.forEach(t => _refreshDepsAfterInstall(t)); } } catch (_) { /* non-fatal: background status should never break polling */ } diff --git a/tests/test_cookbook_error_feedback.py b/tests/test_cookbook_error_feedback.py new file mode 100644 index 0000000..1eb8871 --- /dev/null +++ b/tests/test_cookbook_error_feedback.py @@ -0,0 +1,72 @@ +from routes.cookbook_helpers import _diagnose_serve_output + + +def test_cuda_oom_returns_diagnosis(): + out = "torch.cuda.OutOfMemoryError: CUDA out of memory." + result = _diagnose_serve_output(out) + assert result is not None + assert "memory" in result["message"].lower() + assert any(s["op"] == "replace" for s in result["suggestions"]) + + +def test_port_in_use_returns_diagnosis(): + out = "OSError: [Errno 98] Address already in use" + result = _diagnose_serve_output(out) + assert result is not None + assert "port" in result["message"].lower() + assert result["suggestions"][0]["flag"] == "--port" + + +def test_vllm_not_installed_returns_diagnosis(): + out = "No module named vllm" + result = _diagnose_serve_output(out) + assert result is not None + assert "vLLM" in result["message"] + assert result["suggestions"][0]["package"] == "vllm" + + +def test_gated_model_returns_diagnosis(): + out = "403 Forbidden\nAccess to model is restricted" + result = _diagnose_serve_output(out) + assert result is not None + assert "gated" in result["message"].lower() or "unauthorized" in result["message"].lower() + + +def test_traceback_fallback_fires_without_startup_success(): + out = "Traceback (most recent call last):\n File 'serve.py', line 1\nRuntimeError: bad config" + result = _diagnose_serve_output(out) + assert result is not None + assert "traceback" in result["message"].lower() + + +def test_traceback_suppressed_when_server_started(): + out = ( + "Traceback (most recent call last):\n File 'x.py'\nValueError: ...\n" + "Application startup complete." + ) + result = _diagnose_serve_output(out) + assert result is None + + +def test_clean_output_returns_none(): + out = "INFO: Application startup complete.\nINFO: Uvicorn running on http://0.0.0.0:8000" + assert _diagnose_serve_output(out) is None + + +def test_empty_input_returns_none(): + assert _diagnose_serve_output("") is None + assert _diagnose_serve_output(None) is None + + +def test_trust_remote_code_pattern(): + out = "Please pass trust_remote_code=True when loading this model." + result = _diagnose_serve_output(out) + assert result is not None + assert "--trust-remote-code" in result["suggestions"][0]["arg"] + + +def test_no_gguf_found_pattern(): + out = "No GGUF found on this host for model qwen/qwen2-7b" + result = _diagnose_serve_output(out) + assert result is not None + assert "GGUF" in result["message"]