Improve Cookbook serve diagnostics and recommendations

This commit is contained in:
pewdiepie-archdaemon
2026-06-02 12:15:41 +09:00
parent bdc99d746a
commit 966b53df77
14 changed files with 1113 additions and 191 deletions

View File

@@ -962,13 +962,23 @@ def setup_cookbook_routes() -> APIRouter:
# failed CUDA attempt) doesn't cause the next configure to reuse
# stale settings and silently produce a CPU-only binary.
runner_lines.append(' cd ~/llama.cpp && rm -rf build')
runner_lines.append(' _ody_has_cuda_runtime=0')
runner_lines.append(' if command -v nvcc &>/dev/null; then')
runner_lines.append(' for _cudalib in "${CUDA_HOME:-}/lib64"/libcudart.so* "${CUDA_HOME:-}/lib"/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib*/libcudart.so*; do')
runner_lines.append(' [ -e "$_cudalib" ] && _ody_has_cuda_runtime=1 && break')
runner_lines.append(' done')
runner_lines.append(' fi')
runner_lines.append(' if command -v nvcc &>/dev/null && [ "$_ody_has_cuda_runtime" = "1" ]; then')
runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' else')
runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
runner_lines.append(' if command -v nvcc &>/dev/null; then')
runner_lines.append(' echo "[odysseus] WARNING: nvcc found but CUDA runtime library was not found — building llama-server for CPU only."')
runner_lines.append(' else')
runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
runner_lines.append(' fi')
runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
runner_lines.append(' echo "[odysseus] To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
runner_lines.append(' echo "[odysseus] (its CUDA wheels include nvcc), then re-launch this serve task."')
@@ -982,6 +992,10 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python', python_cmd='pip')} || true")
runner_lines.append(' fi')
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append(' fi')
runner_lines.append('fi')
elif "ollama" in req.cmd:
handled_ollama_serve = True
@@ -1037,19 +1051,24 @@ def setup_cookbook_routes() -> APIRouter:
# find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! command -v vllm &>/dev/null; then')
runner_lines.append(' echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
runner_lines.append(' echo "ERROR: vLLM is not installed."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
elif "sglang.launch_server" in req.cmd:
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
runner_lines.append(' echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
runner_lines.append('if ! command -v sglang &>/dev/null; then')
runner_lines.append(' echo "ERROR: SGLang is not installed."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('elif ! ODYSSEUS_SGLANG_IMPORT_ERROR="$(python3 -c "import sglang" 2>&1)"; then')
runner_lines.append(' echo "ERROR: SGLang is installed but failed to import."')
runner_lines.append(' printf "%s\\n" "$ODYSSEUS_SGLANG_IMPORT_ERROR"')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
elif "scripts/diffusion_server.py" in req.cmd or ".diffusion_server.py" in req.cmd:
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! python3 -c "import torch, diffusers" 2>/dev/null; then')
runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers. Open Cookbook -> Dependencies and install diffusers on this server, then launch again."')
runner_lines.append('if ! ODYSSEUS_DIFFUSION_IMPORT_ERROR="$(python3 -c "import torch, diffusers" 2>&1)"; then')
runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers."')
runner_lines.append(' printf "%s\\n" "$ODYSSEUS_DIFFUSION_IMPORT_ERROR"')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')

View File

@@ -88,6 +88,8 @@ def _quant_from_name(name):
if "6bit" in n:
return "mlx-6bit"
return "mlx-8bit" if is8 else "mlx-4bit"
if "nvfp4" in n:
return "NVFP4"
if "fp8" in n:
return "FP8"
if "int4" in n or "4bit" in n or "4-bit" in n:
@@ -136,7 +138,7 @@ def _entry_from_modelinfo(mi, overrides):
params_by_dtype = getattr(st, "parameters", None) or {}
if quant.endswith("4bit") or quant.endswith("Int4"):
pack_factor = 8
elif quant.endswith("8bit") or quant.endswith("Int8") or quant == "FP8":
elif quant.endswith("8bit") or quant.endswith("Int8") or quant in ("FP8", "NVFP4"):
pack_factor = 4
else:
pack_factor = 1
@@ -158,7 +160,7 @@ def _entry_from_modelinfo(mi, overrides):
rel = created.strftime("%Y-%m-%d") if created else datetime.utcnow().strftime("%Y-%m-%d")
# Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant).
_BPP = {"AWQ-4bit": 0.58, "GPTQ-Int4": 0.58, "mlx-4bit": 0.55, "mlx-6bit": 0.85,
"AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "Q4_K_M": 0.6}
"AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "NVFP4": 0.6, "Q4_K_M": 0.6}
bpp = _BPP.get(quant, 0.6)
vram = round(pb * bpp + 0.5, 1)
entry = {

View File

@@ -13919,7 +13919,12 @@
"architecture": "gemma4",
"pipeline_tag": "image-text-to-text",
"release_date": "2026-04-01",
"gguf_sources": [],
"gguf_sources": [
{
"repo": "unsloth/gemma-4-E2B-it-GGUF",
"provider": "unsloth"
}
],
"capabilities": [
"vision"
]
@@ -13942,7 +13947,12 @@
"architecture": "gemma4",
"pipeline_tag": "image-text-to-text",
"release_date": "2026-04-01",
"gguf_sources": [],
"gguf_sources": [
{
"repo": "unsloth/gemma-4-E4B-it-GGUF",
"provider": "unsloth"
}
],
"capabilities": [
"vision"
]
@@ -13965,7 +13975,12 @@
"architecture": "gemma4",
"pipeline_tag": "image-text-to-text",
"release_date": "2026-04-01",
"gguf_sources": [],
"gguf_sources": [
{
"repo": "unsloth/gemma-4-31B-it-GGUF",
"provider": "unsloth"
}
],
"capabilities": [
"vision"
]
@@ -13988,7 +14003,12 @@
"architecture": "gemma4",
"pipeline_tag": "image-text-to-text",
"release_date": "2026-04-01",
"gguf_sources": [],
"gguf_sources": [
{
"repo": "unsloth/gemma-4-26B-A4B-it-GGUF",
"provider": "unsloth"
}
],
"capabilities": [
"vision"
]
@@ -18719,5 +18739,307 @@
"hf_likes": 0,
"release_date": "2026-04-19",
"_discovered": true
},
{
"name": "Qwen/Qwen3.6-27B-MTP",
"provider": "Qwen",
"parameter_count": "27.8B",
"parameters_raw": 27781427952,
"min_ram_gb": 16.6,
"recommended_ram_gb": 21.6,
"min_vram_gb": 16.6,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, coding, MTP",
"is_moe": false,
"num_experts": null,
"active_experts": null,
"active_parameters": null,
"architecture": "qwen3",
"pipeline_tag": "text-generation",
"release_date": "2026-04-01",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.6-27B-MTP-GGUF",
"provider": "unsloth"
}
],
"capabilities": [
"mtp"
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.6-35B-A3B-MTP",
"provider": "Qwen",
"parameter_count": "36.0B",
"parameters_raw": 35951822704,
"min_ram_gb": 21.4,
"recommended_ram_gb": 27.8,
"min_vram_gb": 21.4,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose (MoE), MTP",
"is_moe": true,
"num_experts": null,
"active_experts": null,
"active_parameters": 3000000000,
"architecture": "qwen3_moe",
"pipeline_tag": "text-generation",
"release_date": "2026-04-01",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.6-35B-A3B-MTP-GGUF",
"provider": "unsloth"
}
],
"capabilities": [
"mtp"
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-0.8B-MTP",
"provider": "Qwen",
"parameter_count": "873M",
"parameters_raw": 873438784,
"min_ram_gb": 1.0,
"recommended_ram_gb": 2.0,
"min_vram_gb": 0.5,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5",
"hf_downloads": 93448,
"hf_likes": 208,
"release_date": "2026-02-28",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-0.8B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-2B-MTP",
"provider": "Qwen",
"parameter_count": "2.3B",
"parameters_raw": 2274069824,
"min_ram_gb": 1.3,
"recommended_ram_gb": 2.1,
"min_vram_gb": 1.2,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5",
"hf_downloads": 46974,
"hf_likes": 115,
"release_date": "2026-02-28",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-2B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-4B-MTP",
"provider": "Qwen",
"parameter_count": "4.7B",
"parameters_raw": 4659865088,
"min_ram_gb": 2.6,
"recommended_ram_gb": 4.3,
"min_vram_gb": 2.4,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5",
"hf_downloads": 99087,
"hf_likes": 202,
"release_date": "2026-02-27",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-4B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-9B-MTP",
"provider": "Qwen",
"parameter_count": "9.7B",
"parameters_raw": 9653104368,
"min_ram_gb": 5.4,
"recommended_ram_gb": 9.0,
"min_vram_gb": 4.9,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5",
"hf_downloads": 172298,
"hf_likes": 345,
"release_date": "2026-02-27",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-9B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-27B-MTP",
"provider": "Qwen",
"parameter_count": "27.8B",
"parameters_raw": 27781427952,
"min_ram_gb": 15.5,
"recommended_ram_gb": 25.9,
"min_vram_gb": 14.2,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5",
"hf_downloads": 406808,
"hf_likes": 565,
"release_date": "2026-02-24",
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-27B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-35B-A3B-MTP",
"provider": "Qwen",
"parameter_count": "36.0B",
"parameters_raw": 35951822704,
"min_ram_gb": 20.1,
"recommended_ram_gb": 33.5,
"min_vram_gb": 18.4,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5_moe",
"hf_downloads": 769032,
"hf_likes": 905,
"release_date": "2026-02-24",
"is_moe": true,
"num_experts": 256,
"active_experts": 8,
"active_parameters": 3000000000,
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-35B-A3B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-122B-A10B-MTP",
"provider": "Qwen",
"parameter_count": "125.1B",
"parameters_raw": 125086497008,
"min_ram_gb": 69.9,
"recommended_ram_gb": 116.5,
"min_vram_gb": 64.1,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5_moe",
"hf_downloads": 171055,
"hf_likes": 389,
"release_date": "2026-02-24",
"is_moe": true,
"num_experts": 256,
"active_experts": 8,
"active_parameters": 10000000000,
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-122B-A10B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
},
{
"name": "Qwen/Qwen3.5-397B-A17B-MTP",
"provider": "Qwen",
"parameter_count": "403.4B",
"parameters_raw": 403397928944,
"min_ram_gb": 225.4,
"recommended_ram_gb": 375.7,
"min_vram_gb": 206.6,
"quantization": "Q4_K_M",
"context_length": 262144,
"use_case": "General purpose, MTP",
"capabilities": [
"mtp",
"tool_use",
"vision"
],
"pipeline_tag": "image-text-to-text",
"architecture": "qwen3_5_moe",
"hf_downloads": 1291825,
"hf_likes": 1214,
"release_date": "2026-02-16",
"is_moe": true,
"num_experts": 256,
"active_experts": 8,
"active_parameters": 17000000000,
"gguf_sources": [
{
"repo": "unsloth/Qwen3.5-397B-A17B-MTP-GGUF",
"provider": "unsloth"
}
],
"_discovered": true
}
]

View File

@@ -99,6 +99,27 @@ def _estimate_speed(model, quant, run_mode, system):
return k / pb * sm
def _architecture_bonus(model):
name = (model.get("name") or "").lower()
arch = (model.get("architecture") or "").lower()
text = f"{name} {arch}"
# Keep this intentionally small: hardware fit and speed still matter, but
# current model families should not be scored the same as older Qwen2/LLama
# era entries just because the parameter count is similar.
if "qwen3.6" in text or "qwen3_6" in text:
return 9
if "qwen3.5" in text or "qwen3_5" in text:
return 8
if "qwen3-next" in text or "qwen3_next" in text:
return 6
if "qwen3" in text or arch.startswith("qwen3"):
return 4
if "qwen2.5" in text or "qwen2_5" in text:
return 2
return 0
def _quality_score(model, quant, use_case):
pb = params_b(model)
if pb < 1:
@@ -128,6 +149,7 @@ def _quality_score(model, quant, use_case):
if "gemma" in name_lower:
base += 1
base += _architecture_bonus(model)
base += QUANT_QUALITY_PENALTY.get(quant, 0)
model_uc = infer_use_case(model)
@@ -220,12 +242,13 @@ def _quant_bits(q):
return 0
def analyze_model(model, system, target_quant=None):
def analyze_model(model, system, target_quant=None, scoring_use_case=None):
pb = params_b(model)
if pb <= 0:
return None
use_case = infer_use_case(model)
model_use_case = infer_use_case(model)
score_use_case = scoring_use_case or "general"
has_gpu = system.get("has_gpu", False)
gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
gpu_count = system.get("gpu_count", 1) or 1
@@ -242,6 +265,8 @@ def analyze_model(model, system, target_quant=None):
ctx = model.get("context_length", 4096) or 4096
native_quant = model.get("quantization", "Q4_K_M")
if "nvfp4" in (model.get("name") or "").lower():
native_quant = "NVFP4"
preq = is_prequantized(model)
# GGUF models can't be sharded across GPUs — use single GPU VRAM
@@ -260,10 +285,13 @@ def analyze_model(model, system, target_quant=None):
# Determine which quant to evaluate at
if preq:
# AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
# specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
# native bit-width matches — otherwise selecting Q8 would still surface
# AWQ-4bit models, mixing 4- and 8-bit in one view.
# GGUF quant tier (Q4/Q8/etc.), do not treat a same-bit AWQ/GPTQ build
# as equivalent. "Q4" means llama.cpp/Ollama-style GGUF in this UI;
# AWQ/GPTQ/FP8 are separate GPU-serving formats and must only appear
# when explicitly selected or when no quant filter is applied.
if target_quant:
if not any(target_quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")):
return None
_tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
if _tb and _nb and _tb != _nb:
return None
@@ -300,7 +328,7 @@ def analyze_model(model, system, target_quant=None):
"parameter_count": model.get("parameter_count"),
"params_b": round(pb, 1),
"is_moe": is_moe,
"use_case": use_case,
"use_case": model_use_case,
"fit_level": "too_tight",
"run_mode": "no_fit",
"quant": quant_to_try,
@@ -334,12 +362,12 @@ def analyze_model(model, system, target_quant=None):
tps = _estimate_speed(model, quant, run_mode, system)
q_score = _quality_score(model, quant, use_case)
s_score = _speed_score(tps, use_case)
q_score = _quality_score(model, quant, score_use_case)
s_score = _speed_score(tps, score_use_case)
f_score = _fit_score(required_gb, budget)
c_score = _context_score(fit_ctx, use_case)
c_score = _context_score(fit_ctx, score_use_case)
wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10))
composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc
return {
@@ -348,7 +376,7 @@ def analyze_model(model, system, target_quant=None):
"parameter_count": model.get("parameter_count"),
"params_b": round(pb, 1),
"is_moe": is_moe,
"use_case": use_case,
"use_case": model_use_case,
"fit_level": fit_level,
"run_mode": run_mode,
"quant": quant,
@@ -419,21 +447,29 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
results.sort(key=sort_fn, reverse=(sort != "vram"))
return results[:limit]
# If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8"))
# If user picked a prequantized format (AWQ/FP8/GPTQ/NVFP4), filter to only those models
filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4"))
system_backend = (system.get("backend") or "").lower()
apple_silicon = system_backend in ("mps", "metal", "apple")
rocm = system_backend == "rocm"
for m in models:
native_q = m.get("quantization", "")
if "nvfp4" in (m.get("name") or "").lower():
native_q = "NVFP4"
# MLX-quantized models need the MLX runtime (mlx_lm), which Odysseus
# doesn't generate serve commands for — only llama.cpp/Ollama (Metal)
# and vLLM/SGLang (CUDA). MLX repos ship no GGUF alternative, so they're
# unrunnable on every backend we support. Always drop them, on Apple
# Silicon too, so the Cookbook never recommends a model it can't serve.
if native_q.startswith("mlx-"):
# MLX is Apple Silicon only. Hide MLX rows on non-Mac hardware scans,
# but leave them visible on Metal/MPS so Mac support is not broken.
if not apple_silicon and (native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()):
continue
# ROCm support for vLLM/SGLang quantized safetensors is too brittle to
# recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
# only when the user explicitly picks that format from the quant filter;
# otherwise prefer GGUF/Q* entries that Odysseus can route through
# llama.cpp/Ollama without pretending "fits VRAM" means "servable".
if rocm and is_prequantized(m) and not filter_native:
continue
# On Apple Silicon the only serving engines are llama.cpp and Ollama,
@@ -443,7 +479,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
# this the Cookbook recommends models the Mac can't run; on CUDA these
# stay visible because vLLM serves safetensors directly.
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
is_mlx = native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources") or is_mlx):
continue
# Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
@@ -454,6 +491,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
continue
if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
continue
if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
continue
if search:
name = m.get("name", "").lower()
@@ -461,7 +500,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
if search.lower() not in name and search.lower() not in provider:
continue
result = analyze_model(m, system, target_quant=quant)
result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"))
if result is None:
continue

View File

@@ -5,7 +5,7 @@ import re
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
QUANT_BPP = {
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
"Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
@@ -14,7 +14,7 @@ QUANT_BPP = {
}
QUANT_SPEED_MULT = {
"F16": 0.6, "BF16": 0.6, "FP8": 0.85,
"F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1,
"Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
@@ -23,7 +23,7 @@ QUANT_SPEED_MULT = {
}
QUANT_QUALITY_PENALTY = {
"F16": 0.0, "BF16": 0.0, "FP8": 0.0,
"F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0,
"Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
"Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
"AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
@@ -32,7 +32,7 @@ QUANT_QUALITY_PENALTY = {
}
QUANT_BYTES_PER_PARAM = {
"F16": 2.0, "BF16": 2.0, "FP8": 1.0,
"F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
"Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
@@ -41,12 +41,13 @@ QUANT_BYTES_PER_PARAM = {
}
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "NVFP4")
def is_prequantized(model):
    """Return True when *model* ships at a fixed, pre-quantized format.

    A model counts as pre-quantized when its name contains ``nvfp4``
    (case-insensitive) or when its ``quantization`` field starts with one of
    ``PREQUANTIZED_PREFIXES``.

    Args:
        model: Mapping describing a catalogue entry; ``name`` and
            ``quantization`` may be missing or ``None``.

    Returns:
        bool: whether the model should bypass the GGUF quant hierarchy.
    """
    # `or ""` (rather than a .get default) also covers an explicit None value.
    quant = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    # The name check runs first so an NVFP4 repo is recognised even when its
    # `quantization` field carries a different label.
    if "nvfp4" in name:
        return True
    # str.startswith accepts a tuple of candidate prefixes.
    return quant.startswith(PREQUANTIZED_PREFIXES)
def params_b(model):

View File

@@ -502,6 +502,11 @@ async def _direct_fallback(
)
except asyncio.TimeoutError:
return {"error": f"web_fetch: timed out fetching {url}", "exit_code": 1}
except Exception as e:
# Direct URL fetches can hit bot protection / auth walls
# (e.g. eBay 403). Treat that as a tool failure the model can
# reason around, not an uncaught chat-stream 500.
return {"error": f"web_fetch: {url}: {e}", "exit_code": 1}
err = result.get("error")
text = (result.get("content") or "").strip()
title = result.get("title") or ""

View File

@@ -27,6 +27,56 @@ import spinnerModule from './spinner.js';
// ── Error diagnosis ──
/**
 * Open the Cookbook modal on its Dependencies tab and, when a package name is
 * given, scroll to the matching dependency row and flash it briefly.
 *
 * The dependency rows may not be in the DOM yet when this is called, so the
 * lookup is retried every 100 ms (up to 45 retries) until at least one row
 * exists.
 *
 * @param {string} [pkgName=''] Package to highlight; compared case-insensitively
 *   against each row's `data-pkg-name` and `data-dep-pip` attributes.
 */
function _openCookbookDependencies(pkgName = '') {
  const cookbook = window.cookbookModule;
  if (cookbook && typeof cookbook.open === 'function') {
    cookbook.open({ tab: 'Dependencies' });
  } else {
    // No module API available: fall back to clicking the toolbar button.
    document.getElementById('tool-cookbook-btn')?.click();
  }

  const wanted = String(pkgName || '').toLowerCase();

  const tryHighlight = (attempt = 0) => {
    const modal = document.getElementById('cookbook-modal');
    const tab = modal?.querySelector('.cookbook-tab[data-backend="Dependencies"]');
    if (tab && !tab.classList.contains('active')) tab.click();

    const rows = [...document.querySelectorAll('#cookbook-deps-list [data-pkg-name]')];
    if (!rows.length) {
      if (attempt < 45) setTimeout(() => tryHighlight(attempt + 1), 100);
      return;
    }
    if (!wanted) return;

    const candidates = rows.map((el) => ({
      el,
      name: (el.dataset.pkgName || '').toLowerCase(),
      pip: (el.dataset.depPip || '').toLowerCase(),
    }));
    // Prefer an exact name match so an earlier fuzzy hit cannot shadow it.
    // The `c.name &&` guard matters because ''.includes-style checks are
    // always true: wanted.includes('') would match every row with an empty name.
    const hit = candidates.find((c) => c.name === wanted)
      || candidates.find((c) => c.pip.includes(wanted) || (c.name && wanted.includes(c.name)));

    if (hit) {
      hit.el.scrollIntoView({ block: 'center' });
      hit.el.classList.add('cookbook-pkg-flash');
      setTimeout(() => hit.el.classList.remove('cookbook-pkg-flash'), 1800);
    }
  };

  tryHighlight();
}
/**
 * Ask the task that owns a diagnosis panel to open its serve editor.
 *
 * Dispatches a bubbling `cookbook:edit-serve` CustomEvent on the closest
 * `.cookbook-task` ancestor of `panel`. Does nothing when no such ancestor
 * can be resolved.
 *
 * @param {Element|null|undefined} panel Diagnosis panel element.
 * @param {Object|null} [fields=null] Passed through as `detail.fields`.
 */
function _openServeEditFromDiagnosis(panel, fields = null) {
  const taskEl = panel?.closest?.('.cookbook-task');
  if (taskEl) {
    const editEvent = new CustomEvent('cookbook:edit-serve', {
      bubbles: true,
      detail: { fields },
    });
    taskEl.dispatchEvent(editEvent);
  }
}
/**
 * Open the serve editor for the panel's task with a fixed set of field
 * overrides that select the llama.cpp backend.
 *
 * @param {Element|null|undefined} panel Diagnosis panel element.
 */
function _openCpuServeEdit(panel) {
  const cpuFields = {
    backend: 'llamacpp',
    gpus: '',
    tp: '1',
    gpu_mem: '0.80',
    _forceBackend: true,
  };
  _openServeEditFromDiagnosis(panel, cpuFields);
}
// Infer the gated base repo that single-file checkpoints need configs from
function _inferBaseRepo(text) {
if (!text) return null;
@@ -218,6 +268,7 @@ export const ERROR_PATTERNS = [
pattern: /vllm.*command not found|No module named vllm/i,
message: 'vLLM is not installed or not in PATH.',
fixes: [
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('vllm') },
{ label: 'Check environment is set', action: (panel) => {
const el = panel.querySelector('[data-field="env_type"]');
if (el) { el.focus(); el.style.borderColor = 'var(--red)'; }
@@ -226,11 +277,21 @@ export const ERROR_PATTERNS = [
},
{
pattern: /sglang.*command not found|No module named sglang|SGLang is not installed/i,
message: 'SGLang is not installed or not in PATH. Open Cookbook → Dependencies and install sglang on this server.',
message: 'SGLang is not installed or not in PATH.',
fixes: [
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') },
{ label: 'Copy install command', action: () => _copyText('python3 -m pip install "sglang[all]"') },
],
},
{
pattern: /No accelerator \(CUDA, XPU, HPU, NPU, MUSA, MPS\) is available|Triton is not supported on current platform/i,
message: 'SGLang needs a visible GPU/accelerator on this server.',
suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.',
fixes: [
{ label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) },
{ label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) },
],
},
{
pattern: /flashinfer.*version.*does not match|flashinfer-cubin version/i,
message: 'FlashInfer version mismatch.',
@@ -241,8 +302,12 @@ export const ERROR_PATTERNS = [
},
{
pattern: /torch\.cuda\.is_available\(\).*False|No CUDA runtime/i,
message: 'CUDA not available in this environment.',
fixes: [],
message: 'vLLM needs a visible CUDA/ROCm GPU.',
suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.',
fixes: [
{ label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) },
{ label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) },
],
},
{
pattern: /Engine core initialization failed/i,
@@ -295,17 +360,20 @@ export const ERROR_PATTERNS = [
},
{
pattern: /Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels\/layer/i,
message: 'vLLM/Transformers kernel package mismatch.',
message: 'Transformers/kernels package mismatch.',
fixes: [
{ label: 'Update vLLM/Transformers/kernels', action: (panel) => {
{ label: 'Repair kernel package', action: (panel) => {
const taskEl = panel.closest('.cookbook-task');
const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
const host = task?.remoteHost || '';
const prefix = _buildEnvPrefix();
const pipCmd = prefix ? prefix + ' python3 -m pip install -U vllm transformers kernels' : 'python3 -m pip install -U vllm transformers kernels';
const pipCmd = prefix
? prefix + ' python3 -m pip install --user --break-system-packages "kernels<0.15"'
: 'python3 -m pip install --user --break-system-packages "kernels<0.15"';
const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
_launchServeTask('update-vllm-stack', 'pip-update', cmd);
_launchServeTask('repair-kernels', 'pip-update', cmd);
}},
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') },
],
},
{
@@ -319,13 +387,24 @@ export const ERROR_PATTERNS = [
pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i,
message: 'llama-cpp-python server is not installed. Run: pip install "llama-cpp-python[server]"',
fixes: [
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
{ label: 'Copy install command', action: () => _copyText('pip install "llama-cpp-python[server]"') },
],
},
{
pattern: /CUDA Toolkit not found|Unable to find cudart library|missing:\s*CUDA_CUDART/i,
message: 'llama.cpp found nvcc, but the CUDA runtime library is missing.',
suggestion: 'Suggested action: relaunch with the updated runner so llama.cpp builds CPU-only, or install a complete CUDA toolkit/runtime on this server for GPU llama.cpp.',
fixes: [
{ label: 'Edit serve', action: (panel) => _openServeEditFromDiagnosis(panel) },
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
],
},
{
pattern: /No module named ['"]?torch|No module named ['"]?diffusers|diffusers.*command not found/i,
message: 'Diffusion serving needs PyTorch and diffusers. Install diffusers from Cookbook → Dependencies.',
fixes: [
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('diffusers') },
{ label: 'Copy install command', action: () => _copyText('python3 -m pip install "diffusers[torch]"') },
],
},
@@ -402,10 +481,32 @@ export function _diagnose(text) {
return null;
}
/**
 * Build the Markdown troubleshooting bundle that is copied to the clipboard.
 *
 * Sections are emitted in a fixed order, each preceded by a blank line:
 * Task (only when `task` is given), Diagnosis, Suggested action (only when
 * `suggestionText` is non-empty), Launch command (only when the task payload
 * has `_cmd`), and Captured output (only when `sourceText` is non-empty).
 *
 * @param {Object|null|undefined} task Task record, if one was found.
 * @param {Object|null|undefined} diagnosis Diagnosis; only `message` is read.
 * @param {*} sourceText Captured output; stringified and trimmed.
 * @param {string} suggestionText Suggestion; a leading "Suggested action:"
 *   label is stripped because the section heading already says it.
 * @returns {string} Newline-joined Markdown text.
 */
function _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText) {
  const out = ['## Odysseus Cookbook troubleshooting'];
  const addSection = (heading, ...body) => {
    out.push('', heading, ...body);
  };

  if (task) {
    const taskId = task.sessionId || task.id || 'unknown';
    const kind = task.type || 'unknown';
    const state = task.status || 'unknown';
    const modelName = task.payload?.repo_id || task.name || 'unknown';
    const hostName = task.remoteHost || 'local';
    const portSuffix = task.sshPort ? `:${task.sshPort}` : '';
    addSection(
      '### Task',
      `- ID: ${taskId}`,
      `- Type: ${kind}`,
      `- Status: ${state}`,
      `- Model: ${modelName}`,
      `- Host: ${hostName}${portSuffix}`,
    );
  }

  addSection('### Diagnosis', diagnosis?.message || '(none)');

  if (suggestionText) {
    addSection('### Suggested action', suggestionText.replace(/^Suggested action:\s*/i, ''));
  }

  const launchCmd = task?.payload?._cmd || '';
  if (launchCmd) {
    addSection('### Launch command', '```bash', launchCmd, '```');
  }

  if (sourceText) {
    addSection('### Captured output', '```text', String(sourceText).trim(), '```');
  }

  return out.join('\n');
}
export function _showDiagnosis(panel, diagnosis, sourceText) {
if (panel._lastDiagMsg === diagnosis.message) return;
if (panel._diagDismissed === diagnosis.message) return; // stay dismissed until new error
const wasCollapsed = panel._lastDiagMsg === diagnosis.message && panel._diagCollapsed;
if (panel._diagDismissed === diagnosis.message) return;
panel._lastDiagMsg = diagnosis.message;
panel._diagCollapsed = !!wasCollapsed;
let diag = panel.querySelector('.cookbook-diagnosis');
if (!diag) {
@@ -417,57 +518,161 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
}
diag.classList.remove('hidden');
diag.innerHTML = '';
const taskEl = panel?.closest?.('.cookbook-task');
const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
const fixes = [...(diagnosis.fixes || [])];
if (task?.type === 'serve' && task.payload?._cmd && !fixes.some(f => f.label === 'Edit serve')) {
fixes.push({ label: 'Edit serve', action: (p) => _openServeEditFromDiagnosis(p) });
}
const suggestionText = diagnosis.suggestion || (fixes.length
? `Suggested action: ${fixes[0].label}.`
: 'Suggested action: copy the error and adjust the serve settings.');
const header = document.createElement('div');
header.style.cssText = 'display:flex;align-items:center;justify-content:space-between;';
header.className = 'cookbook-diag-header';
const msg = document.createElement('div');
msg.className = 'cookbook-diag-message';
msg.textContent = diagnosis.message;
header.appendChild(msg);
const fold = document.createElement('button');
fold.className = 'cookbook-diag-fold';
fold.type = 'button';
fold.innerHTML = '<span class="cookbook-diag-chevron">▾</span><span>Error message:</span>';
header.appendChild(fold);
const copy = document.createElement('button');
copy.className = 'cookbook-diag-copy';
copy.type = 'button';
copy.title = 'Copy troubleshooting bundle';
copy.setAttribute('aria-label', 'Copy troubleshooting bundle');
copy.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>';
copy.addEventListener('click', (e) => {
e.stopPropagation();
_copyText(_diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText));
copy.classList.add('copied');
copy.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg>';
setTimeout(() => {
if (!copy.isConnected) return;
copy.classList.remove('copied');
copy.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>';
}, 1200);
});
header.appendChild(copy);
const dismiss = document.createElement('button');
dismiss.className = 'close-btn';
dismiss.style.cssText = 'width:16px;height:16px;font-size:9px;flex-shrink:0;';
dismiss.textContent = '\u2715';
dismiss.addEventListener('click', () => { panel._diagDismissed = diagnosis.message; _clearDiagnosis(panel); });
dismiss.className = 'cookbook-diag-dismiss';
dismiss.type = 'button';
dismiss.title = 'Dismiss error';
dismiss.setAttribute('aria-label', 'Dismiss error');
dismiss.textContent = '×';
dismiss.addEventListener('click', (e) => {
e.stopPropagation();
panel._diagDismissed = diagnosis.message;
_clearDiagnosis(panel);
});
header.appendChild(dismiss);
diag.appendChild(header);
if (diagnosis.fixes && diagnosis.fixes.length) {
const body = document.createElement('div');
body.className = 'cookbook-diag-body';
body.classList.toggle('hidden', panel._diagCollapsed);
fold.querySelector('.cookbook-diag-chevron').textContent = panel._diagCollapsed ? '▸' : '▾';
const msg = document.createElement('div');
msg.className = 'cookbook-diag-message';
msg.textContent = diagnosis.message;
body.appendChild(msg);
const suggestion = document.createElement('div');
suggestion.className = 'cookbook-diag-suggestion';
suggestion.textContent = suggestionText;
body.appendChild(suggestion);
fold.addEventListener('click', (e) => {
e.stopPropagation();
panel._diagCollapsed = !panel._diagCollapsed;
body.classList.toggle('hidden', panel._diagCollapsed);
fold.querySelector('.cookbook-diag-chevron').textContent = panel._diagCollapsed ? '▸' : '▾';
});
diag.appendChild(body);
const runFix = async (fix, button, busyLabel = fix.label, onStart = null, onDone = null) => {
if (!fix || !button || button.dataset.busy) return;
button.dataset.busy = '1';
const _orig = button.textContent;
const wp = spinnerModule.createWhirlpool(12);
wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;';
button.textContent = '';
button.appendChild(wp.element);
const _lbl = document.createElement('span');
_lbl.textContent = busyLabel;
_lbl.style.verticalAlign = 'middle';
button.appendChild(_lbl);
try {
if (typeof onStart === 'function') onStart();
await fix.action(panel, sourceText);
} catch (err) {
console.error('[cookbook] diagnosis fix failed', err);
} finally {
if (button.isConnected) {
try { wp.destroy(); } catch {}
button.textContent = _orig;
delete button.dataset.busy;
}
if (typeof onDone === 'function') onDone();
}
};
if (fixes.length) {
const row = document.createElement('div');
row.className = 'cookbook-diag-fixes';
for (const fix of diagnosis.fixes) {
const btn = document.createElement('button');
btn.className = 'cookbook-btn cookbook-diag-btn';
btn.textContent = fix.label;
btn.addEventListener('click', async () => {
if (btn.dataset.busy) return;
btn.dataset.busy = '1';
// Spinner feedback while the fix runs (kill + relaunch takes a moment).
const _orig = btn.textContent;
const wp = spinnerModule.createWhirlpool(12);
wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;';
btn.textContent = '';
btn.appendChild(wp.element);
const _lbl = document.createElement('span');
_lbl.textContent = _orig;
_lbl.style.verticalAlign = 'middle';
btn.appendChild(_lbl);
try {
await fix.action(panel, sourceText);
} catch (e) {
console.error('[cookbook] diagnosis fix failed', e);
} finally {
// Retries animate the whole card away (button goes with it). For fixes
// that leave the card in place, restore the label.
if (btn.isConnected) { try { wp.destroy(); } catch {} btn.textContent = _orig; delete btn.dataset.busy; }
}
});
row.appendChild(btn);
if (fixes.length <= 3) {
for (const fix of fixes) {
const btn = document.createElement('button');
btn.className = 'cookbook-btn cookbook-diag-btn';
btn.type = 'button';
btn.textContent = fix.label;
btn.addEventListener('click', (e) => {
e.stopPropagation();
runFix(fix, btn);
});
row.appendChild(btn);
}
body.appendChild(row);
return;
}
diag.appendChild(row);
const wrap = document.createElement('div');
wrap.className = 'cookbook-diag-actions';
const trigger = document.createElement('button');
trigger.className = 'cookbook-btn cookbook-diag-action-trigger';
trigger.type = 'button';
trigger.textContent = 'Actions';
trigger.appendChild(document.createTextNode(' ▾'));
wrap.appendChild(trigger);
const menu = document.createElement('div');
menu.className = 'dropdown cookbook-diag-menu hidden';
for (const fix of fixes) {
const item = document.createElement('button');
item.type = 'button';
item.textContent = fix.label;
item.addEventListener('click', async (e) => {
e.stopPropagation();
if (item.dataset.busy || trigger.dataset.busy) return;
item.dataset.busy = '1';
await runFix(fix, trigger, fix.label, () => menu.classList.add('hidden'), () => delete item.dataset.busy);
});
menu.appendChild(item);
}
wrap.appendChild(menu);
trigger.addEventListener('click', (e) => {
e.stopPropagation();
if (trigger.dataset.busy) return;
document.querySelectorAll('.cookbook-diag-menu').forEach(m => {
if (m !== menu) m.classList.add('hidden');
});
menu.classList.toggle('hidden');
});
row.appendChild(wrap);
body.appendChild(row);
}
}

View File

@@ -193,6 +193,8 @@ export function _renderGpuToggles(system) {
if (quantSel) {
if (count <= 1) {
quantSel.value = 'Q4_K_M'; // RAM or 1 GPU -> Q4 sweet spot
} else if (String(system?.backend || '').toLowerCase() === 'rocm') {
quantSel.value = 'Q4_K_M'; // ROCm default stays GGUF/local-safe; AWQ is explicit only
} else {
quantSel.value = 'AWQ-4bit'; // Multi-GPU -> AWQ for vLLM
}

View File

@@ -260,12 +260,31 @@ export function _detectBackend(model) {
const q = (model.quant || '').toUpperCase();
const sysBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
const isRocm = sysBackend === 'rocm';
const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend);
const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
if (!isAppleSilicon && (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX'))) {
return { backend: 'unsupported', label: 'Unsupported' };
}
const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || q === 'FP8' || /\b(awq|gptq|fp8|nvfp4)\b/i.test(_nm);
const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf');
// Image gen models → diffusers
if (model.is_image_gen || model.is_diffusion || model._tag === 'image') {
return { backend: 'diffusers', label: 'Diffusers' };
}
// AWQ / GPTQ / FP8 are safetensors GPU-serving formats. Never route them
// through llama.cpp/Ollama just because the host is Mac/Windows; those engines
// need GGUF. The UI will warn/block on Metal where vLLM/SGLang aren't viable.
if (isAwqLike) {
return { backend: 'vllm', label: 'vLLM' };
}
// GGUF → llama.cpp/Ollama-compatible.
if (isGgufLike) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// Windows → default to llama.cpp (no vLLM support on Windows)
if (_isWindows()) {
return { backend: 'llamacpp', label: 'llama.cpp' };
@@ -278,19 +297,6 @@ export function _detectBackend(model) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// AWQ / GPTQ / FP8 → vLLM
if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
return { backend: 'vllm', label: 'vLLM' };
}
// GGUF → llama.cpp. Match the quant tag OR a gguf hint in the repo/path/name:
// a raw .gguf file often has no quant field, which made it fall through to the
// vLLM default below.
const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
if (model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf')) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// ROCm/AMD machines should not blindly default HF safetensors models to
// vLLM. SGLang is the safer OpenAI-compatible default for plain HF text
// repos there; llama.cpp still wins above whenever the model is GGUF.
@@ -1020,6 +1026,16 @@ function _wireTabEvents(body) {
// Download input
const dlBtn = document.getElementById('cookbook-dl-btn');
const dlInput = document.getElementById('cookbook-dl-repo');
const dlCardToggle = document.getElementById('cookbook-download-card-toggle');
const dlCardBody = document.getElementById('cookbook-download-card-body');
const dlCardArrow = document.getElementById('cookbook-download-card-arrow');
if (dlCardToggle && dlCardBody) {
dlCardToggle.addEventListener('click', () => {
const isOpen = dlCardBody.style.display !== 'none';
dlCardBody.style.display = isOpen ? 'none' : 'block';
if (dlCardArrow) dlCardArrow.style.transform = isOpen ? 'rotate(0deg)' : 'rotate(90deg)';
});
}
if (dlBtn && dlInput) {
function _stripHfUrl(input) {
let repo = input.trim();
@@ -1099,8 +1115,12 @@ function _wireTabEvents(body) {
if (hfToggle && hfList) {
let _loaded = false;
// Per-server VRAM cache so we don't re-probe on every expand
const _vramCache = {};
async function _getSelectedServerVram() {
const _hwCache = {};
// Heuristic: true when a HuggingFace listing entry's repo id or tags mention
// one of the awq / gptq / fp8 / 4bit / int4 markers (case-insensitive,
// matched as whole words). A missing entry, repo id, or tag list is treated
// as empty text.
function _hfModelLooksAwqLike(m) {
  const repoId = m?.repo_id || '';
  const tagText = (m?.tags || []).join(' ');
  const haystack = `${repoId} ${tagText}`.toLowerCase();
  return /\b(awq|gptq|fp8|4bit|int4)\b/.test(haystack);
}
async function _getSelectedServerHw() {
// Prefer the "What Fits" dropdown (the main control that shows hardware);
// fall back to the download dropdown. This is the server the list ranks for.
const dlSrv = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
@@ -1117,7 +1137,7 @@ function _wireTabEvents(body) {
}
}
const cacheKey = host || 'local';
if (_vramCache[cacheKey] !== undefined) return _vramCache[cacheKey];
if (_hwCache[cacheKey]) return _hwCache[cacheKey];
// Fetch system info for this server from hwfit
try {
const qp = new URLSearchParams();
@@ -1127,13 +1147,13 @@ function _wireTabEvents(body) {
const r = await fetch(`/api/hwfit/system?${qp}`);
if (r.ok) {
const sys = await r.json();
const v = sys?.gpu_vram_gb || 0;
_vramCache[cacheKey] = v;
return v;
const hw = { vram: sys?.gpu_vram_gb || 0, backend: String(sys?.backend || '').toLowerCase() };
_hwCache[cacheKey] = hw;
return hw;
}
} catch {}
_vramCache[cacheKey] = 0;
return 0;
_hwCache[cacheKey] = { vram: 0, backend: '' };
return _hwCache[cacheKey];
}
async function _loadLatest() {
// Match the Dependencies loader: whirlpool spinner + text label so the
@@ -1152,7 +1172,8 @@ function _wireTabEvents(body) {
} catch {
hfList.innerHTML = '<div class="hwfit-loading">Scanning models…</div>';
}
const vram = await _getSelectedServerVram();
const hwInfo = await _getSelectedServerHw();
const vram = hwInfo.vram || 0;
try {
let lastErr = '';
const _fetchLatest = async (v) => {
@@ -1168,6 +1189,9 @@ function _wireTabEvents(body) {
if (!models.length && vram > 0) {
models = await _fetchLatest(0);
}
if (['rocm', 'metal', 'mps', 'apple', 'generic', 'cpu'].includes(hwInfo.backend)) {
models = models.filter(m => !_hfModelLooksAwqLike(m));
}
if (!models.length) {
// Distinguish "the HF API failed" from "nothing matched" so an outage
// doesn't masquerade as no-fitting-models.
@@ -1351,10 +1375,12 @@ function _renderRecipes() {
// Search group
html += '<div class="cookbook-group" data-backend-group="Search" style="flex:0 0 auto;">';
html += '<div class="admin-card" style="display:flex;flex-direction:column;overflow:hidden;">';
html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
html += '<button type="button" id="cookbook-download-card-toggle" style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;width:100%;background:transparent;border:0;padding:0;color:inherit;text-align:left;cursor:pointer;">';
html += '<h2 style="margin:0;padding:0;line-height:1;">Download</h2>';
html += '</div>';
html += '<p class="memory-desc doclib-desc" style="margin-top:6px;">Download from <a href="https://huggingface.co/models" target="_blank" rel="noopener" style="color:var(--accent,var(--red));text-decoration:none;"><svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:1px;"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"/><polyline points="15 3 21 3 21 9"/><line x1="10" y1="14" x2="21" y2="3"/></svg>HuggingFace</a> by pasting model link, or download directly in the Scan section below.</p>';
html += '<span id="cookbook-download-card-arrow" style="margin-left:auto;display:inline-block;transition:transform 0.15s;font-size:13px;line-height:1;">\u25B8</span>';
html += '</button>';
html += '<div id="cookbook-download-card-body" style="display:none;">';
html += '<p class="memory-desc doclib-desc" style="margin-top:6px;">Download directly from Scan, or paste a HuggingFace model link.</p>';
html += '<div class="hwfit-container" id="hwfit-container">';
// Section 1: Settings
@@ -1383,7 +1409,7 @@ function _renderRecipes() {
// silently sending downloads to the wrong server. An empty selection means Local; the user
// chooses a remote server explicitly via the dropdown.
// Download input
// Manual download input
html += `<div style="margin-top:7px;margin-bottom:2px;display:flex;gap:4px;align-items:center;">`;
if (_es.servers.length > 1) {
html += `<select class="cookbook-field-input hwfit-dl-server" id="hwfit-dl-server" style="height:28px;position:relative;top:0px;">`;
@@ -1399,7 +1425,7 @@ function _renderRecipes() {
html += `<button class="cookbook-btn cookbook-dl-btn" id="cookbook-dl-btn">Download</button>`;
html += `</div>`;
// Latest HF models that fit — collapsible card list
html += `<div style="margin-top:2px;position:relative;top:-8px;">`;
html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
html += `<div style="display:flex;gap:4px;align-items:center;">`;
html += `<button type="button" class="memory-toolbar-btn" id="cookbook-hf-latest-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
html += `<span id="cookbook-hf-latest-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">\u25B8</span>`;
@@ -1411,7 +1437,7 @@ function _renderRecipes() {
html += `</div>`;
// Search section
html += '</div></div></div>';
html += '</div></div></div></div>';
html += '<div class="cookbook-group" data-backend-group="Search">';
html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';

View File

@@ -86,6 +86,9 @@ function _ggufIncludePattern(model, source) {
// Build the user-facing explanation for a model that has no GGUF source.
// Models whose name contains "nvfp4" (whole word, any case) get a dedicated
// message; everything else gets the generic "no GGUF source configured" text.
// A missing model or name is reported as "this model".
function _missingGgufMessage(model) {
  const label = model?.name || 'this model';
  const isNvfp4 = /\bnvfp4\b/i.test(label);
  return isNvfp4
    ? `${label} is an NVIDIA NVFP4 checkpoint, not a GGUF download. Pick the base model row with an Unsloth GGUF source, or paste the GGUF repo directly.`
    : `No GGUF source is configured for ${label}. Pick a model with a GGUF source, or paste the GGUF repo in Download.`;
}

View File

@@ -34,12 +34,106 @@ function _taskBadge(task) {
return { text: _statusLabel(task.status, task.type), cls: 'cookbook-task-' + task.status };
}
// Decide whether a task card may be cleared.
// Never clearable: a missing task, a running task, a serve task that is ready
// (status 'ready' or the _serveReady flag), and a finished download that has
// no payload._dep flag. Anything else is clearable only once it has reached
// one of the terminal statuses listed below.
function _canClearTask(task) {
  if (!task) return false;
  const { status, type } = task;
  if (status === 'running') return false;

  const liveServe = type === 'serve' && (status === 'ready' || task._serveReady);
  const finishedModelDownload = type === 'download' && status === 'done' && !task.payload?._dep;
  if (liveServe || finishedModelDownload) return false;

  switch (status) {
    case 'done':
    case 'stopped':
    case 'error':
    case 'crashed':
    case 'failed':
      return true;
    default:
      return false;
  }
}
// Label text for the clear pill on a task card. The task argument is accepted
// but not consulted: every task currently gets the same label.
function _clearPillLabel(task) {
  const label = 'clear';
  return label;
}
// Whether a crash report should be offered for this task: always for a serve
// task flagged _unreachable, otherwise only when the status is 'error',
// 'crashed', or 'failed'. A missing task never qualifies.
function _shouldOfferCrashReport(task) {
  if (!task) return false;

  const unreachableServe = task._unreachable && task.type === 'serve';
  if (unreachableServe) return true;

  const { status } = task;
  return status === 'error' || status === 'crashed' || status === 'failed';
}
// Heuristic: the task's repo id / name mentions awq, gptq, or fp8 AND the
// launch command or captured output mentions a llama.cpp / Ollama marker
// (llama-server, llama_cpp.server, ollama, ggml_cuda_enable_unified_memory).
// All matching is done on lower-cased text.
function _serveTaskLooksAwqOnLocalBackend(task, outputText = '') {
  const payload = task?.payload;

  const identity = [payload?.repo_id || '', task?.name || ''].join(' ').toLowerCase();
  if (!/\b(awq|gptq|fp8)\b/.test(identity)) return false;

  const evidence = [payload?._cmd || '', outputText || ''].join(' ').toLowerCase();
  return /(llama-server|llama_cpp\.server|ollama|ggml_cuda_enable_unified_memory)/.test(evidence);
}
// Heuristic: the task's repo id / name mentions awq, gptq, or fp8 AND the
// captured output contains one of the "no accelerator" style phrases matched
// below. Only the output text is inspected for those phrases, not the command.
function _serveTaskLooksAwqWithoutUsableAccelerator(task, outputText = '') {
  const identity = [task?.payload?.repo_id || '', task?.name || ''].join(' ').toLowerCase();
  if (!/\b(awq|gptq|fp8)\b/.test(identity)) return false;

  const log = String(outputText || '').toLowerCase();
  return /(no accelerator|no cuda runtime|failed to infer device type|triton is not supported|0 active driver)/i.test(log);
}
// Open the Cookbook on its Search tab and pre-fill the search for a GGUF
// variant of the model behind `task`.
// The search term is the last path segment of the task's repo id (or name)
// with a trailing AWQ/GPTQ/FP8/4bit/8bit/Int4/Int8 suffix and any trailing
// "-" / "_" removed; if stripping leaves nothing, the unstripped segment (or
// the raw value) is used instead.
async function _openDownloadForGgufTask(task) {
  const source = task?.payload?.repo_id || task?.name || '';
  const leaf = String(source).split('/').pop();
  const trimmed = leaf
    .replace(/[-_](?:AWQ|GPTQ|FP8|4bit|8bit|Int4|Int8).*$/i, '')
    .replace(/[-_]+$/g, '');
  const modelName = trimmed || leaf || source;

  // Prefer the module API when it is exposed on window; otherwise click the
  // toolbar button.
  const cookbook = window.cookbookModule;
  const canOpenDirectly = cookbook && typeof cookbook.open === 'function';
  if (canOpenDirectly) {
    cookbook.open({ tab: 'Search' });
  } else {
    document.getElementById('tool-cookbook-btn')?.click();
  }

  const primeSearchTab = async () => {
    const searchTab = document
      .getElementById('cookbook-modal')
      ?.querySelector('.cookbook-tab[data-backend="Search"]');
    if (searchTab && !searchTab.classList.contains('active')) searchTab.click();

    const searchInput = document.getElementById('hwfit-search');
    if (searchInput) {
      searchInput.value = modelName;
      searchInput.dispatchEvent(new Event('input', { bubbles: true }));
      searchInput.focus();
    }

    const quantSelect = document.getElementById('hwfit-quant');
    if (quantSelect) {
      quantSelect.value = 'Q4_K_M';
      quantSelect.dispatchEvent(new Event('change', { bubbles: true }));
    }

    // Best-effort refresh of the results; a failed import or fetch is ignored.
    try {
      const hwfit = await import('./cookbook-hwfit.js');
      if (typeof hwfit._hwfitFetch === 'function') hwfit._hwfitFetch(true);
    } catch {}
  };

  // The fields are filled after a short delay rather than synchronously.
  setTimeout(primeSearchTab, 80);
}
// Produce a diagnosis object ({ message, suggestion, fixes }) for a serve task
// that has ended in 'stopped', 'error', 'crashed', or 'failed' with non-blank
// output. Returns null for any other task, or when there is no output.
// Checks run in this order: the two AWQ/GPTQ/FP8 heuristics, then the
// generic _diagnose(), then a fallback whose wording depends on whether the
// output mentions a llama.cpp build.
function _terminalServeDiagnosis(task, outputText) {
  const out = String(outputText || task?.output || '');
  if (!task || task.type !== 'serve') return null;
  const ended = ['stopped', 'error', 'crashed', 'failed'].includes(task.status);
  if (!ended || !out.trim()) return null;

  const editServeFix = () => ({ label: 'Edit serve', action: (panel) => _openServeEditForTask(task) });
  const ggufFixes = () => [
    { label: 'Find GGUF download', action: () => _openDownloadForGgufTask(task) },
    editServeFix(),
  ];

  if (_serveTaskLooksAwqOnLocalBackend(task, out)) {
    return {
      message: 'AWQ/GPTQ/FP8 cannot be served through llama.cpp/Ollama unified-memory mode.',
      suggestion: 'Suggested action: use vLLM/SGLang on a compatible CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama/unified-memory serving.',
      fixes: ggufFixes(),
    };
  }

  if (_serveTaskLooksAwqWithoutUsableAccelerator(task, out)) {
    return {
      message: 'AWQ/GPTQ/FP8 needs a working vLLM/SGLang accelerator path; this server did not expose one.',
      suggestion: 'Suggested action: choose a CUDA/ROCm server where vLLM/SGLang can see the GPU, or download a GGUF version and serve it with llama.cpp/Ollama.',
      fixes: ggufFixes(),
    };
  }

  const known = _diagnose(out);
  if (known) return known;

  // Nothing specific matched: fall back to a generic message.
  const duringLlamaBuild = /Native llama-server not found|building llama-server|llama\.cpp/i.test(out);
  return {
    message: duringLlamaBuild
      ? 'llama.cpp build stopped before the server became reachable.'
      : 'Serve stopped before the model became reachable.',
    suggestion: duringLlamaBuild
      ? 'Suggested action: copy the troubleshooting bundle, then edit serve settings. For the quickest local/CPU path, use Ollama or a prebuilt llama-server; source builds can take several minutes and fail if build dependencies are incomplete.'
      : 'Suggested action: copy the troubleshooting bundle, then edit serve settings or relaunch with a CPU/backend fallback.',
    fixes: [editServeFix()],
  };
}
function _redactCrashReportText(text) {
if (!text) return '';
return String(text)
@@ -173,6 +267,23 @@ export function _parseServePhase(snapshot) {
if (/Ollama API ready on port\s+\d+/i.test(flat)) {
return { phase: 'ready', status: 'ready' };
}
const llamaBuildMatches = [...flat.matchAll(/\[\s*(\d{1,3})%\]\s*(?:Building|Linking)/gi)];
if (llamaBuildMatches.length) {
const pct = Math.min(100, parseInt(llamaBuildMatches[llamaBuildMatches.length - 1][1], 10));
return { phase: `building llama.cpp ${pct}%`, status: 'running', pct };
}
if (/Native llama-server not found|building from source/i.test(flat)) {
if (/Cloning into ['"]?llama\.cpp/i.test(flat) && !/Receiving objects:\s*100%/i.test(flat)) {
return { phase: 'cloning llama.cpp', status: 'running' };
}
if (/Configuring incomplete|CMake Error/i.test(flat)) {
return {};
}
if (/CMAKE_BUILD_TYPE|Detecting CXX|Found Threads|Including CPU backend|CUDA nvcc found|building llama-server/i.test(flat)) {
return { phase: 'configuring llama.cpp', status: 'running' };
}
return { phase: 'building llama.cpp', status: 'running' };
}
// HTTP access logs (e.g. GET /v1/models 200 OK) mean the server is up
if (/(?:GET|POST)\s+\/[^\s]*\s+HTTP\/[\d.]+"\s*\d{3}/.test(flat)) {
return { phase: 'idle', status: 'ready' };
@@ -341,8 +452,24 @@ async function _startQueuedDownload(task) {
// ── Task CRUD ──
// True when a serve task shows evidence of having become reachable: either
// its _serveReady flag is set, or its stored output contains a startup-complete
// line, an "Ollama API ready on port N" line, or an HTTP access-log entry for
// a GET/POST request with a 2xx status.
function _serveOutputLooksReady(task) {
  if (task?._serveReady) return true;

  const log = String(task?.output || '');
  const readyMarkers = [
    /Application startup complete/i,
    /Ollama API ready on port\s+\d+/i,
    /(?:GET|POST)\s+\/[^\s]*\s+HTTP\/[\d.]+"\s*2\d\d/i,
  ];
  return readyMarkers.some((marker) => marker.test(log));
}
// Normalise a stored task before it is shown. A serve task recorded as 'done'
// whose output never looked ready is returned as a shallow copy with status
// 'error'. Every other value (including non-objects) is returned unchanged,
// and the input object itself is never mutated.
function _normalizeTaskForDisplay(task) {
  const isRecord = Boolean(task) && typeof task === 'object';
  if (!isRecord) return task;

  const claimsDone = task.type === 'serve' && task.status === 'done';
  if (!claimsDone || _serveOutputLooksReady(task)) return task;

  return { ...task, status: 'error' };
}
export function _loadTasks() {
try { return JSON.parse(localStorage.getItem(TASKS_KEY)) || []; }
try { return (JSON.parse(localStorage.getItem(TASKS_KEY)) || []).map(_normalizeTaskForDisplay); }
catch { return []; }
}
@@ -876,7 +1003,7 @@ export async function _serveAutoFix(panel, envVar) {
// Edit button, but optionally with a modified command (used by the diagnosis
// "Retry with X" buttons so a retry lands in the editable Serve panel with the
// adjusted setting, instead of blindly relaunching).
async function _openServeEditForTask(task, cmdOverride) {
async function _openServeEditForTask(task, cmdOverride, fieldOverrides = null) {
const repo = task.payload?.repo_id;
if (!repo) { uiModule.showToast('No model info on this task'); return; }
const cmd = cmdOverride || task.payload?._cmd;
@@ -884,6 +1011,9 @@ async function _openServeEditForTask(task, cmdOverride) {
let fields = cmdOverride
? _parseServeCmdToFields(cmd)
: (task.payload?._fields || (cmd ? _parseServeCmdToFields(cmd) : null));
if (fieldOverrides && typeof fieldOverrides === 'object') {
fields = { ...(fields || {}), ...fieldOverrides };
}
// Switch the active server to the one this serve ran on (mirrors _openEdit).
const _tHost = task.remoteHost || '';
_envState.remoteHost = _tHost;
@@ -1352,8 +1482,8 @@ export function _renderRunningTab() {
const host = btn.dataset.clearServer;
if (!await window.styledConfirm(`Clear finished tasks on ${_serverName(host)}?`, { confirmText: 'Clear' })) return;
const allTasks = _loadTasks();
const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && t.status !== 'running');
const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || t.status === 'running');
const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && _canClearTask(t));
const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || !_canClearTask(t));
_saveTasks(remaining);
// Fade/slide each finished card out (same exit as the per-card clear)
// instead of yanking them instantly.
@@ -1443,16 +1573,19 @@ export function _renderRunningTab() {
const _bdg = _taskBadge(task);
badge.textContent = _bdg.text;
badge.className = 'cookbook-task-status' + (_bdg.cls ? ' ' + _bdg.cls : '');
badge.style.display = isDone ? 'none' : ''; // hidden — type chip carries it
badge.style.display = '';
}
// Indicator: spinning wave while running, green check when finished.
const wave = el.querySelector('.cookbook-task-wave');
if (wave) wave.style.display = task.status === 'running' ? '' : 'none';
// Model downloads (which have a Serve → button) don't get a clear pill —
// pressing Serve clears them. Dep installs / serve tasks keep it.
const check = el.querySelector('.cookbook-task-check');
const _showClear = isDone && !(task.type === 'download' && !task.payload?._dep);
if (check) check.style.display = _showClear ? '' : 'none';
if (check) {
check.style.display = _canClearTask(task) ? '' : 'none';
const label = check.querySelector('.cookbook-task-done-label');
if (label) label.textContent = _clearPillLabel(task);
}
const terminalDiag = _terminalServeDiagnosis(task, el.querySelector('.cookbook-output-pre')?.textContent || task.output || '');
if (terminalDiag) _showDiagnosis(el, terminalDiag, el.querySelector('.cookbook-output-pre')?.textContent || task.output || '');
}
if (!task) {
if (el._uptimeInterval) { clearInterval(el._uptimeInterval); el._uptimeInterval = null; }
@@ -1476,11 +1609,8 @@ export function _renderRunningTab() {
<div class="cookbook-task-header">
<span class="cookbook-task-type${(task.status === 'done' && task.type === 'download') ? ' cookbook-task-type-done' : ''}" data-type="${esc(task.type)}">${esc((task.status === 'done' && task.type === 'download') ? 'finished' : task.type)}</span>
<span class="cookbook-task-name">${modelLogo(task.name)}${esc(task.name)}</span>
<span class="cookbook-task-status ${_bdg.cls}" style="display:${task.status === 'done' ? 'none' : ''}"${_bdgTitle}>${esc(_bdg.text)}</span>
${task.type === 'serve' && task.payload?._cmd ? '<button class="cookbook-task-edit-btn" title="Edit settings &amp; relaunch"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M11 4H4a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2v-7"/><path d="M18.5 2.5a2.121 2.121 0 0 1 3 3L12 15l-4 1 1-4 9.5-9.5z"/></svg></button>' : ''}
${task.type === 'serve' && task.payload?._cmd ? '<button class="cookbook-task-save-btn" title="Save preset"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M19 21H5a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2h11l5 5v11a2 2 0 0 1-2 2z"/><polyline points="17 21 17 13 7 13 7 21"/><polyline points="7 3 7 8 15 8"/></svg></button>' : ''}
<span class="cookbook-task-indicator"><span class="cookbook-task-wave" style="display:${task.status === 'running' ? '' : 'none'}"></span><span class="cookbook-task-check" title="Clear" style="display:${(task.status === 'done' && !(task.type === 'download' && !task.payload?._dep)) ? '' : 'none'}"><svg class="cookbook-task-check-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg><svg class="cookbook-task-clear-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg><span class="cookbook-task-done-label">done</span><span class="cookbook-task-clear-label">clear</span></span></span>
${task.type === 'download' && !task.payload?._dep && task.status === 'done' ? `<span class="cookbook-task-status cookbook-task-done">finished</span>` : ''}
<span class="cookbook-task-indicator"><span class="cookbook-task-wave" style="display:${task.status === 'running' ? '' : 'none'}"></span><span class="cookbook-task-check" title="Clear" style="display:${_canClearTask(task) ? '' : 'none'}"><svg class="cookbook-task-check-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg><svg class="cookbook-task-clear-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg><span class="cookbook-task-done-label">${esc(_clearPillLabel(task))}</span><span class="cookbook-task-clear-label">clear</span></span></span>
<span class="cookbook-task-status ${_bdg.cls}"${_bdgTitle}>${esc(_bdg.text)}</span>
<button class="cookbook-task-menu-btn" title="Actions">&#8942;</button>
</div>
<div class="cookbook-task-sub"><span class="cookbook-task-session">${esc(task.sessionId)}</span><span class="cookbook-task-uptime" style="display:${((task.type === 'serve' || task.type === 'download') && task.status === 'running') ? '' : 'none'}"></span></div>
@@ -1490,6 +1620,9 @@ export function _renderRunningTab() {
const _waveEl = el.querySelector('.cookbook-task-wave');
if (_waveEl && task.status === 'running') _registerWaveEl(_waveEl);
const terminalDiag = _terminalServeDiagnosis(task, task.output || '');
if (terminalDiag) _showDiagnosis(el, terminalDiag, task.output || '');
const _uptimeEl = el.querySelector('.cookbook-task-uptime');
if (_uptimeEl && (task.type === 'serve' || task.type === 'download') && task.status === 'running') {
const _startedAt = task.ts || Date.now();
@@ -1506,35 +1639,12 @@ export function _renderRunningTab() {
}
// Re-open the Serve panel for this model, pre-filled with the EXACT
// settings this instance launched with, and on the SERVER it runs on
// shared by the edit icon button and the ⋮ "Edit settings" menu item.
// settings this instance launched with, and on the SERVER it runs on.
const _openEdit = () => _openServeEditForTask(task);
const editBtn = el.querySelector('.cookbook-task-edit-btn');
if (editBtn) {
editBtn.addEventListener('click', (e) => { e.stopPropagation(); _openEdit(); });
}
// Wire save icon button
const saveBtn = el.querySelector('.cookbook-task-save-btn');
if (saveBtn) {
saveBtn.addEventListener('click', async (e) => {
e.stopPropagation();
// Tell them it's already saved up front (often true now that working
// configs auto-save) instead of after they've typed a name.
if (_loadPresets().some(p => p.cmd === task.payload?._cmd)) {
uiModule.showToast('Already saved');
return;
}
const label = (await uiModule.styledPrompt('Name this config so you can recall it later.', {
title: 'Save Config', defaultValue: task.name, placeholder: 'e.g. 8-bit, fast', confirmText: 'Save',
}) || '').trim();
if (!label) return;
if (!_saveTaskAsPreset(task, label)) { uiModule.showToast('Already saved'); return; }
saveBtn.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="2.5" stroke-linecap="round"><polyline points="20 6 9 17 4 12"/></svg>';
uiModule.showToast(`Saved "${label}"`);
setTimeout(() => { saveBtn.style.display = 'none'; }, 1500);
});
}
el.addEventListener('cookbook:edit-serve', (e) => {
e.stopPropagation();
_openServeEditForTask(task, null, e.detail?.fields || null);
});
// Finished download → an explicit "Serve →" button jumps straight to the
// Serve tab with this model pre-selected (on the server it downloaded to).
@@ -2018,12 +2128,31 @@ async function _reconnectTask(el, task) {
if (badge) { badge.textContent = _statusLabel('error', task.type); badge.className = 'cookbook-task-status cookbook-task-error'; }
_showCookbookNotif(true);
} else {
const looksSuccessful = !lastOutput.includes('DOWNLOAD_FAILED') && (lastOutput.includes('DONE') || lastOutput.includes('100%') || lastOutput.includes('Application startup complete') || lastOutput.includes('/snapshots/') || lastOutput.includes('Download complete') || lastOutput.includes('DOWNLOAD_OK'));
if (!lastOutput.trim() || (task.type === 'download' && !looksSuccessful)) {
const downloadLooksSuccessful = !lastOutput.includes('DOWNLOAD_FAILED')
&& (lastOutput.includes('DONE') || lastOutput.includes('100%') || lastOutput.includes('/snapshots/') || lastOutput.includes('Download complete') || lastOutput.includes('DOWNLOAD_OK'));
const serveLooksReady = task.type === 'serve' && _serveOutputLooksReady({ ...task, output: lastOutput });
const looksSuccessful = task.type === 'download' ? downloadLooksSuccessful : serveLooksReady;
if (!lastOutput.trim() || !looksSuccessful) {
_updateTask(task.sessionId, { status: 'crashed' });
el.dataset.status = 'crashed';
const badge = el.querySelector('.cookbook-task-status');
if (badge) { badge.textContent = _statusLabel('crashed', task.type); badge.className = 'cookbook-task-status cookbook-task-crashed'; }
if (task.type === 'serve') {
const diag = _diagnose(lastOutput) || {
message: _serveTaskLooksAwqOnLocalBackend(task, lastOutput)
? 'AWQ/GPTQ/FP8 cannot be served through llama.cpp/Ollama unified-memory mode.'
: /Native llama-server not found|building llama-server|llama\.cpp/i.test(lastOutput)
? 'llama.cpp build stopped before the server became reachable.'
: 'Serve stopped before the model became reachable.',
suggestion: _serveTaskLooksAwqOnLocalBackend(task, lastOutput)
? 'Suggested action: use vLLM/SGLang on a compatible CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama/unified-memory serving.'
: /Native llama-server not found|building llama-server|llama\.cpp/i.test(lastOutput)
? 'Suggested action: copy the troubleshooting bundle, then edit serve settings. For the quickest local/CPU path, use Ollama or a prebuilt llama-server; source builds can take several minutes and fail if build dependencies are incomplete.'
: 'Suggested action: copy the troubleshooting bundle, then edit serve settings or relaunch with a CPU/backend fallback.',
fixes: [{ label: 'Edit serve', action: (panel) => _openServeEditForTask(task) }],
};
_showDiagnosis(el, diag, lastOutput);
}
_showCookbookNotif(true);
} else {
_updateTask(task.sessionId, { status: 'done' });

View File

@@ -41,6 +41,48 @@ const SERVE_STATE_KEY = 'cookbook-serve-state';
let _cachedAllModels = [];
// Heuristic: does this model look like AWQ / GPTQ / FP8 quantized weights?
//
// model - model record (may be null/undefined); only `quant`, `repo_id`,
//         `name` and `path` are read.
// repo  - repo id / name the caller is working with (may be empty).
//
// Returns true when the quant label starts with AWQ or GPTQ, is exactly FP8,
// or when any of the name-like strings contains `awq`, `gptq` or `fp8` as a
// stand-alone token; false otherwise.
function _repoLooksAwqLike(model, repo) {
  const quant = String(model?.quant || '').toUpperCase();
  const names = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase();
  if (/^AWQ|^GPTQ/.test(quant) || quant === 'FP8') return true;
  // `\b` counts `_` as a word character, so `\bawq\b` misses
  // underscore-delimited names such as `qwen_awq` or `llama-fp8_dynamic`.
  // Delimit the token by "not a letter or digit" instead. `names` is already
  // lower-cased, so no case-insensitive flag is needed.
  return /(?<![a-z0-9])(?:awq|gptq|fp8)(?![a-z0-9])/.test(names);
}
// Heuristic: does this model look like a GGUF file/repo?
//
// True when the record is flagged `is_gguf`, when its quant label is `GGUF`
// or starts with `Q2`..`Q8` or `IQ`, or when the repo / repo_id / name / path
// text contains "gguf" (case-insensitive). Always returns a boolean.
function _repoLooksGgufLike(model, repo) {
  const quantLabel = String(model?.quant || '').toUpperCase();
  const haystack = [repo || '', model?.repo_id || '', model?.name || '', model?.path || '']
    .join(' ')
    .toLowerCase();

  if (model?.is_gguf) return true;
  if (quantLabel === 'GGUF') return true;
  if (/^Q[2-8]/.test(quantLabel) || /^IQ/.test(quantLabel)) return true;
  return haystack.includes('gguf');
}
// Check a model/backend pairing before serving and describe the mismatch.
//
// model, repo - passed through to _repoLooksAwqLike / _repoLooksGgufLike.
// backend     - backend id; compared against 'llamacpp', 'ollama', 'vllm'
//               and 'sglang'.
// fields      - serve form values; only `unified_mem` is read.
//
// Returns `{ title, body }` for the first rule that matches, or null when no
// rule objects to the combination.
function _serveBackendWarning(model, repo, backend, fields = {}) {
  const looksAwq = _repoLooksAwqLike(model, repo);
  const looksGguf = _repoLooksGgufLike(model, repo);
  const onGgufBackend = backend === 'llamacpp' || backend === 'ollama';
  const onTensorBackend = backend === 'vllm' || backend === 'sglang';

  if (looksAwq) {
    // AWQ-style weights on a backend that only loads GGUF.
    if (onGgufBackend) {
      return {
        title: 'AWQ needs vLLM or SGLang',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors. llama.cpp and Ollama need GGUF files, so this backend cannot serve it. Choose vLLM/SGLang on a CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama.',
      };
    }
    // _isMetal() is consulted before the backend comparison, as in the
    // original short-circuit order.
    if (_isMetal() && onTensorBackend) {
      return {
        title: 'AWQ is not a unified-memory path',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors. AWQ is for vLLM/SGLang on CUDA/ROCm-style GPU servers, not local unified-memory llama.cpp/Ollama serving. For unified memory, download a GGUF model and use llama.cpp/Ollama.',
      };
    }
    // Unified-memory option switched on for AWQ-style weights.
    if (fields.unified_mem) {
      return {
        title: 'AWQ is not a unified-memory path',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors, but unified-memory local serving expects GGUF. Use vLLM/SGLang on a compatible GPU server, or download a GGUF version for llama.cpp/Ollama.',
      };
    }
  }

  // GGUF weights pointed at a safetensors-oriented backend.
  if (looksGguf && onTensorBackend) {
    return {
      title: 'GGUF needs llama.cpp or Ollama',
      body: 'This model looks like GGUF. vLLM/SGLang expect HuggingFace safetensors-style repos. Choose llama.cpp/Ollama for GGUF, or download a safetensors model for vLLM/SGLang.',
    };
  }

  return null;
}
// Own-property check that tolerates a missing object: any falsy `obj` is
// treated as an empty object, so the result is false rather than a throw.
// Inherited (prototype) properties do not count.
function _hasOwn(obj, key) {
  const target = obj ? obj : {};
  const ownCheck = Object.prototype.hasOwnProperty;
  return ownCheck.call(target, key);
}
@@ -324,12 +366,6 @@ function _rerenderCachedModels() {
c.style.alignItems = '';
});
// Capture grid height
const _tb = list.closest('.admin-card')?.querySelector('.memory-toolbar');
const _tbH = _tb ? _tb.offsetHeight : 0;
list.style.minHeight = (list.offsetHeight + _tbH) + 'px';
list.style.maxHeight = (list.offsetHeight + _tbH) + 'px';
const shortName = repo.split('/').pop();
const _es = _envState;
// The venv set per-server in Settings (server.envPath). Used as the venv
@@ -350,8 +386,13 @@ function _rerenderCachedModels() {
? _byRepo[repo]
: (_lastUsed || (_isLegacyFlat ? _allSs : {}));
const detectedBackend = _detectBackend(m).backend;
const defaultBackend = detectedBackend;
const savedMatchesBackend = (ss.backend || 'vllm') === detectedBackend;
const _allowedBackends = new Set(_isWindows()
? ['llamacpp']
: (_isMetal() ? ['llamacpp', 'ollama'] : ['vllm', 'sglang', 'llamacpp', 'ollama', 'diffusers']));
const defaultBackend = (ss._forceBackend && ss.backend && _allowedBackends.has(ss.backend))
? ss.backend
: detectedBackend;
const savedMatchesBackend = !!ss._forceBackend || (ss.backend || 'vllm') === detectedBackend;
const sv = (k, def) => (ss[k] !== undefined && savedMatchesBackend) ? ss[k] : def;
const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', '1');
const detectedGpuIds = _allGpuIds(_getGpuToggleTotal?.());
@@ -1200,7 +1241,16 @@ function _rerenderCachedModels() {
if (el.type === 'checkbox') serveState[el.dataset.field] = el.checked;
else serveState[el.dataset.field] = el.value;
});
serveState.backend = (_detectBackend(m).backend) || serveState.backend || 'vllm';
serveState.backend = serveState.backend || (_detectBackend(m).backend) || 'vllm';
const backendWarning = _serveBackendWarning(m, repo, serveState.backend, serveState);
if (backendWarning) {
await window.styledConfirm(backendWarning.body, {
title: backendWarning.title,
confirmText: 'Edit settings',
cancelText: 'Close',
});
return;
}
// Save in the { _byRepo, _lastUsed } schema — no legacy flat keys at
// the root so per-model state doesn't leak between models.
try {

View File

@@ -2253,8 +2253,9 @@ function _renderActivityEntry(entry) {
const hue = _categoryHue(entry.taskName, entry.kind);
// CSS vars feed the colored title + accent stripe.
const styleVars = `--cat-hue:${hue};`;
const _runningPlaceholder = /^(Starting…|Starting\.\.\.|_Running…_|_Running\.\.\._|_Queued\b)/i.test((entry.result || '').trim());
const hasResult = !!(entry.result && entry.result.trim() && entry.status !== 'running' && entry.status !== 'queued');
const hasRunningProgress = !!(entry.result && entry.result.trim() && (entry.status === 'running' || entry.status === 'queued'));
const hasRunningProgress = !!(entry.result && entry.result.trim() && !_runningPlaceholder && (entry.status === 'running' || entry.status === 'queued'));
// "Open in chat" only makes sense for runs whose result is a real assistant
// message (Prompt / Research tasks). Action/event runs are just log lines
// (e.g. "No recent emails", "Tidied N memories") — for those, replace the
@@ -2299,9 +2300,10 @@ function _renderActivityEntry(entry) {
let rightHtml;
if (_isRunning) {
const isQueued = entry.status === 'queued';
const label = isQueued ? 'Queued' : 'Running';
// Initial elapsed for the first paint; the 1s interval below keeps it live.
const startMs = entry.ts ? new Date(entry.ts).getTime() : Date.now();
const stale = !isQueued && (Date.now() - startMs) > 30 * 60 * 1000;
const label = isQueued ? 'Queued' : stale ? 'Still running' : 'Running';
const elapsedInit = isQueued ? '' : `<span class="task-log-running-elapsed" data-since="${startMs}">${_fmtElapsed(Date.now() - startMs)}</span>`;
const forceBtn = isQueued && entry.taskId ? `<button class="task-log-force-run" type="button" title="Start now in parallel, bypassing the queue" style="border:0;background:transparent;box-shadow:none;margin-left:5px;padding:0;width:12px;height:12px;display:inline-flex;align-items:center;justify-content:center;font-size:10px;line-height:1;color:inherit;opacity:.8;"><svg width="9" height="9" viewBox="0 0 24 24" fill="currentColor" style="display:block;"><polygon points="6 4 20 12 6 20 6 4"/></svg></button>` : '';
const stopBtn = entry.taskId ? `<button class="task-log-stop" type="button" title="Stop this task"><svg width="9" height="9" viewBox="0 0 24 24" fill="currentColor"><rect x="6" y="6" width="12" height="12" rx="1"/></svg></button>` : '';

View File

@@ -5363,19 +5363,20 @@ body.bg-pattern-sparkles {
/* The compare-modal title ignores pointer events, so clicks and drags on it
fall through to whatever is underneath (presumably the header itself;
confirm against the modal's drag handling). */
#compare-model-overlay .modal-header h4 {
pointer-events: none;
}
/* Compare modal sizes to content the global .modal-content max-height
+ .modal-body overflow combo makes BOTH the outer card and the inner
body scrollable, so even when the content fits the viewport you get
a stray vertical scrollbar. Drop the cap and disable inner scroll
here; if the viewport is genuinely tiny the modal still won't exceed
it because it's centered and the parent .modal flex layout shrinks. */
/* Compare model selector: keep manually-resized/tiny windows contained.
Picker dropdowns are appended to document.body, so the card itself can
clip and scroll without cropping the dropdown list. */
#compare-model-overlay .modal-content {
max-height: none;
overflow: visible;
display: flex;
flex-direction: column;
max-height: min(720px, calc(100dvh - 48px));
overflow: hidden;
min-height: 180px;
}
#compare-model-overlay .modal-body {
overflow: visible;
flex: 0 0 auto;
overflow: auto;
flex: 1 1 auto;
min-height: 0;
}
.vis-hint {
font-size: 10px;
@@ -6955,6 +6956,8 @@ pre { background: var(--code-bg, var(--hl-bg, #282c34)) !important; }
/* Tab strip: a flex row with 4px gaps. flex-wrap lets tabs flow onto extra
lines instead of overflowing; min-width: 0 lets the strip shrink below its
content width when it is itself a flex item. */
.compare-mode-tabs {
display: flex;
gap: 4px;
flex-wrap: wrap;
min-width: 0;
}
/* Type tabs match Mode toggles 1:1 (same flex column layout, same metrics) */
.compare-mode-tab {
@@ -19015,7 +19018,7 @@ body.gallery-selecting .gallery-dl-btn,
align-items: center;
gap: 3px;
position: relative;
top: 2px;
top: 0;
cursor: pointer;
padding: 1px 6px 1px 4px;
border-radius: 9px;
@@ -19024,22 +19027,17 @@ body.gallery-selecting .gallery-dl-btn,
}
.cookbook-task-check svg { flex-shrink: 0; }
.cookbook-task-check:hover { background: color-mix(in srgb, var(--red, #ff5555) 18%, transparent); }
/* Shows "done" (green) normally; on hover the icon + label swap to a red /
"clear" to reveal it's a dismiss action. */
/* Terminal task clear pill. */
.cookbook-task-done-label,
.cookbook-task-clear-label {
font-size: 9px;
line-height: 1;
text-transform: lowercase;
}
.cookbook-task-done-label { color: var(--green, #50fa7b); }
.cookbook-task-clear-label { display: none; color: var(--red, #ff5555); }
.cookbook-task-check:hover .cookbook-task-done-label { display: none; }
.cookbook-task-check:hover .cookbook-task-clear-label { display: inline; }
/* Default: show the green check. On hover: swap to a red ✕ to signal "clear". */
.cookbook-task-clear-ico { display: none; }
.cookbook-task-check:hover .cookbook-task-check-ico { display: none; }
.cookbook-task-check:hover .cookbook-task-clear-ico { display: inline; }
.cookbook-task-done-label { color: var(--red, #ff5555); }
.cookbook-task-clear-label { display: none; }
.cookbook-task-check-ico { display: none; }
.cookbook-task-clear-ico { display: inline; }
/* "Serve" button on a finished download: a green pill matching the "running" /
finished badge (it sits next to the green FINISHED chip + check). */
.cookbook-task-serve-btn {
@@ -19583,17 +19581,136 @@ body.gallery-selecting .gallery-dl-btn,
border: 1px solid color-mix(in srgb, var(--color-error) 30%, transparent);
border-radius: 6px;
}
/* Header row of the cookbook-diag panel: a flex row with vertically centred
items and a 7px gap. The relative -4px offset shifts the row up visually; the
matching negative bottom margin pulls the content after it up by the same
amount. */
.cookbook-diag-header {
display: flex;
align-items: center;
gap: 7px;
position: relative;
top: -4px;
margin-bottom: -4px;
}
/* Fold control (presumably the collapse/expand toggle; confirm against the
markup). Rendered as a bare inline-flex element: no padding, border,
background or minimum height, with bold 11px text in the error colour.
`font: inherit` comes first so the explicit font-size / font-weight that
follow override the inherited values. margin-right: auto absorbs the free
space when the parent is a flex row, pushing later siblings to the far end. */
.cookbook-diag-fold {
display: inline-flex;
align-items: center;
gap: 5px;
padding: 0;
min-height: 0;
border: 0;
background: transparent;
color: var(--color-error);
font: inherit;
font-size: 11px;
font-weight: 700;
cursor: pointer;
margin-right: auto;
}
/* Hover restates the transparent background and error colour and only dims
the control to 85% opacity. NOTE(review): presumably this neutralises a
global button hover style; confirm. */
.cookbook-diag-fold:hover {
background: transparent;
color: var(--color-error);
opacity: 0.85;
}
/* Chevron glyph: inline-block with a fixed 10px width and 10px font size, so
it occupies the same horizontal space whatever character it contains. */
.cookbook-diag-chevron {
display: inline-block;
width: 10px;
font-size: 10px;
}
/* Copy control: a borderless, transparent 18x18 box in the muted foreground
colour, with its content centred on both axes via inline-flex. */
.cookbook-diag-copy {
border: 0;
background: transparent;
color: var(--fg-muted);
padding: 0 2px;
width: 18px;
height: 18px;
min-height: 18px;
cursor: pointer;
display: inline-flex;
align-items: center;
justify-content: center;
}
/* Hover: background stays transparent; only the colour brightens to --fg. */
.cookbook-diag-copy:hover {
background: transparent;
color: var(--fg);
}
/* `.copied` state: green (falls back to #50fa7b when --green is unset).
NOTE(review): presumably toggled by script after a successful copy; confirm. */
.cookbook-diag-copy.copied {
color: var(--green, #50fa7b);
}
/* Block display removes the inline baseline gap under the icon. */
.cookbook-diag-copy svg {
display: block;
}
/* Dismiss control: a borderless, transparent 16x18 box in the muted
foreground colour with a 13px glyph centred via inline-flex. The relative
-2px offset nudges it up without affecting the layout of its siblings. */
.cookbook-diag-dismiss {
border: 0;
background: transparent;
color: var(--fg-muted);
padding: 0;
width: 16px;
height: 18px;
min-height: 18px;
line-height: 16px;
font-size: 13px;
cursor: pointer;
display: inline-flex;
align-items: center;
justify-content: center;
position: relative;
top: -2px;
}
/* Hover: background stays transparent; the glyph turns the error colour. */
.cookbook-diag-dismiss:hover {
background: transparent;
color: var(--color-error);
}
/* Body area: separated from whatever precedes it by a 7px top margin. */
.cookbook-diag-body {
margin-top: 7px;
}
/* Message line: semi-bold 12px text in the error colour. user-select: text
makes it selectable even if an ancestor disables selection. */
.cookbook-diag-message {
font-size: 12px;
font-weight: 600;
color: var(--color-error);
margin-bottom: 4px;
margin-left: 2px;
user-select: text;
}
/* Suggestion text: smaller (11px), muted, with a slightly open line height;
shares the 2px left inset of the message and is likewise selectable. */
.cookbook-diag-suggestion {
font-size: 11px;
line-height: 1.35;
color: var(--fg-muted);
margin-bottom: 8px;
margin-left: 2px;
user-select: text;
}
/* Row of fix controls: a wrapping flex container with 6px gaps between items
and between wrapped lines. */
.cookbook-diag-fixes {
display: flex;
flex-wrap: wrap;
gap: 6px;
}
/* Actions wrapper: position: relative makes it the containing block for
absolutely positioned descendants (presumably .cookbook-diag-menu; confirm
against the markup). */
.cookbook-diag-actions {
position: relative;
display: inline-flex;
}
/* Action trigger: compact 11px control, at least 24px tall, on the panel
background with a 1px border mixed from 40% error colour and transparent. */
.cookbook-diag-action-trigger {
font-size: 11px;
padding: 4px 10px;
min-height: 24px;
background: var(--panel);
border: 1px solid color-mix(in srgb, var(--color-error) 40%, transparent);
color: var(--fg);
}
/* Hover: the border goes to the full error colour and the background gets a
12% error tint. */
.cookbook-diag-action-trigger:hover {
border-color: var(--color-error);
background: color-mix(in srgb, var(--color-error) 12%, transparent);
}
/* Menu: absolutely positioned at the left edge, 4px below the bottom of its
containing block (the nearest positioned ancestor; presumably
.cookbook-diag-actions, confirm against the markup). At least 180px wide,
stacked at z-index 80. */
.cookbook-diag-menu {
position: absolute;
left: 0;
top: calc(100% + 4px);
min-width: 180px;
z-index: 80;
}
/* Menu items: each button spans the full menu width, with left-aligned
content that never wraps. */
.cookbook-diag-menu button {
width: 100%;
justify-content: flex-start;
text-align: left;
white-space: nowrap;
}
.cookbook-diag-btn {
font-size: 11px;
padding: 4px 10px;