diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index 5a94a6a..a6f03d5 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -962,13 +962,23 @@ def setup_cookbook_routes() -> APIRouter: # failed CUDA attempt) doesn't cause the next configure to reuse # stale settings and silently produce a CPU-only binary. runner_lines.append(' cd ~/llama.cpp && rm -rf build') + runner_lines.append(' _ody_has_cuda_runtime=0') runner_lines.append(' if command -v nvcc &>/dev/null; then') + runner_lines.append(' for _cudalib in "${CUDA_HOME:-}/lib64"/libcudart.so* "${CUDA_HOME:-}/lib"/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib*/libcudart.so*; do') + runner_lines.append(' [ -e "$_cudalib" ] && _ody_has_cuda_runtime=1 && break') + runner_lines.append(' done') + runner_lines.append(' fi') + runner_lines.append(' if command -v nvcc &>/dev/null && [ "$_ody_has_cuda_runtime" = "1" ]; then') runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\') runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\') runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') runner_lines.append(' else') - runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."') + runner_lines.append(' if command -v nvcc &>/dev/null; then') + runner_lines.append(' echo "[odysseus] WARNING: nvcc found but CUDA runtime library was not found — building llama-server for CPU only."') + runner_lines.append(' else') + runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."') + runner_lines.append(' fi') runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."') runner_lines.append(' echo "[odysseus] To get a GPU build, first install vLLM via Cookbook -> 
Dependencies"') runner_lines.append(' echo "[odysseus] (its CUDA wheels include nvcc), then re-launch this serve task."') @@ -982,6 +992,10 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."') runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python', python_cmd='pip')} || true") runner_lines.append(' fi') + runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."') + runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') + runner_lines.append(' fi') runner_lines.append('fi') elif "ollama" in req.cmd: handled_ollama_serve = True @@ -1037,19 +1051,24 @@ def setup_cookbook_routes() -> APIRouter: # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above. runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') runner_lines.append('if ! command -v vllm &>/dev/null; then') - runner_lines.append(' echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."') + runner_lines.append(' echo "ERROR: vLLM is not installed."') runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') elif "sglang.launch_server" in req.cmd: runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') - runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then') - runner_lines.append(' echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."') + runner_lines.append('if ! command -v sglang &>/dev/null; then') + runner_lines.append(' echo "ERROR: SGLang is not installed."') + runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') + runner_lines.append('elif ! 
ODYSSEUS_SGLANG_IMPORT_ERROR="$(python3 -c "import sglang" 2>&1)"; then') + runner_lines.append(' echo "ERROR: SGLang is installed but failed to import."') + runner_lines.append(' printf "%s\\n" "$ODYSSEUS_SGLANG_IMPORT_ERROR"') runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') elif "scripts/diffusion_server.py" in req.cmd or ".diffusion_server.py" in req.cmd: runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') - runner_lines.append('if ! python3 -c "import torch, diffusers" 2>/dev/null; then') - runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers. Open Cookbook -> Dependencies and install diffusers on this server, then launch again."') + runner_lines.append('if ! ODYSSEUS_DIFFUSION_IMPORT_ERROR="$(python3 -c "import torch, diffusers" 2>&1)"; then') + runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers."') + runner_lines.append(' printf "%s\\n" "$ODYSSEUS_DIFFUSION_IMPORT_ERROR"') runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') diff --git a/scripts/add_hwfit_models.py b/scripts/add_hwfit_models.py index fa48de9..b981379 100644 --- a/scripts/add_hwfit_models.py +++ b/scripts/add_hwfit_models.py @@ -88,6 +88,8 @@ def _quant_from_name(name): if "6bit" in n: return "mlx-6bit" return "mlx-8bit" if is8 else "mlx-4bit" + if "nvfp4" in n: + return "NVFP4" if "fp8" in n: return "FP8" if "int4" in n or "4bit" in n or "4-bit" in n: @@ -136,7 +138,7 @@ def _entry_from_modelinfo(mi, overrides): params_by_dtype = getattr(st, "parameters", None) or {} if quant.endswith("4bit") or quant.endswith("Int4"): pack_factor = 8 - elif quant.endswith("8bit") or quant.endswith("Int8") or quant == "FP8": + elif quant.endswith("8bit") or quant.endswith("Int8") or quant in ("FP8", "NVFP4"): pack_factor = 4 else: pack_factor = 1 @@ -158,7 +160,7 @@ def _entry_from_modelinfo(mi, overrides): rel = created.strftime("%Y-%m-%d") if created else 
datetime.utcnow().strftime("%Y-%m-%d") # Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant). _BPP = {"AWQ-4bit": 0.58, "GPTQ-Int4": 0.58, "mlx-4bit": 0.55, "mlx-6bit": 0.85, - "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "Q4_K_M": 0.6} + "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "NVFP4": 0.6, "Q4_K_M": 0.6} bpp = _BPP.get(quant, 0.6) vram = round(pb * bpp + 0.5, 1) entry = { diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json index 0267535..09a3dc9 100644 --- a/services/hwfit/data/hf_models.json +++ b/services/hwfit/data/hf_models.json @@ -13919,7 +13919,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-E2B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -13942,7 +13947,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-E4B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -13965,7 +13975,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-31B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -13988,7 +14003,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-26B-A4B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -18719,5 +18739,307 @@ "hf_likes": 0, "release_date": "2026-04-19", "_discovered": true + }, + { + "name": "Qwen/Qwen3.6-27B-MTP", + "provider": "Qwen", + "parameter_count": "27.8B", + "parameters_raw": 27781427952, + "min_ram_gb": 16.6, + "recommended_ram_gb": 21.6, + 
"min_vram_gb": 16.6, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, coding, MTP", + "is_moe": false, + "num_experts": null, + "active_experts": null, + "active_parameters": null, + "architecture": "qwen3", + "pipeline_tag": "text-generation", + "release_date": "2026-04-01", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.6-27B-MTP-GGUF", + "provider": "unsloth" + } + ], + "capabilities": [ + "mtp" + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.6-35B-A3B-MTP", + "provider": "Qwen", + "parameter_count": "36.0B", + "parameters_raw": 35951822704, + "min_ram_gb": 21.4, + "recommended_ram_gb": 27.8, + "min_vram_gb": 21.4, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose (MoE), MTP", + "is_moe": true, + "num_experts": null, + "active_experts": null, + "active_parameters": 3000000000, + "architecture": "qwen3_moe", + "pipeline_tag": "text-generation", + "release_date": "2026-04-01", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.6-35B-A3B-MTP-GGUF", + "provider": "unsloth" + } + ], + "capabilities": [ + "mtp" + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-0.8B-MTP", + "provider": "Qwen", + "parameter_count": "873M", + "parameters_raw": 873438784, + "min_ram_gb": 1.0, + "recommended_ram_gb": 2.0, + "min_vram_gb": 0.5, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 93448, + "hf_likes": 208, + "release_date": "2026-02-28", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-0.8B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-2B-MTP", + "provider": "Qwen", + "parameter_count": "2.3B", + "parameters_raw": 2274069824, + "min_ram_gb": 1.3, + "recommended_ram_gb": 2.1, + "min_vram_gb": 1.2, + "quantization": "Q4_K_M", + "context_length": 
262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 46974, + "hf_likes": 115, + "release_date": "2026-02-28", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-2B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-4B-MTP", + "provider": "Qwen", + "parameter_count": "4.7B", + "parameters_raw": 4659865088, + "min_ram_gb": 2.6, + "recommended_ram_gb": 4.3, + "min_vram_gb": 2.4, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 99087, + "hf_likes": 202, + "release_date": "2026-02-27", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-4B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-9B-MTP", + "provider": "Qwen", + "parameter_count": "9.7B", + "parameters_raw": 9653104368, + "min_ram_gb": 5.4, + "recommended_ram_gb": 9.0, + "min_vram_gb": 4.9, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 172298, + "hf_likes": 345, + "release_date": "2026-02-27", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-9B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-27B-MTP", + "provider": "Qwen", + "parameter_count": "27.8B", + "parameters_raw": 27781427952, + "min_ram_gb": 15.5, + "recommended_ram_gb": 25.9, + "min_vram_gb": 14.2, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + 
"architecture": "qwen3_5", + "hf_downloads": 406808, + "hf_likes": 565, + "release_date": "2026-02-24", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-27B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-35B-A3B-MTP", + "provider": "Qwen", + "parameter_count": "36.0B", + "parameters_raw": 35951822704, + "min_ram_gb": 20.1, + "recommended_ram_gb": 33.5, + "min_vram_gb": 18.4, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5_moe", + "hf_downloads": 769032, + "hf_likes": 905, + "release_date": "2026-02-24", + "is_moe": true, + "num_experts": 256, + "active_experts": 8, + "active_parameters": 3000000000, + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-35B-A3B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-122B-A10B-MTP", + "provider": "Qwen", + "parameter_count": "125.1B", + "parameters_raw": 125086497008, + "min_ram_gb": 69.9, + "recommended_ram_gb": 116.5, + "min_vram_gb": 64.1, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5_moe", + "hf_downloads": 171055, + "hf_likes": 389, + "release_date": "2026-02-24", + "is_moe": true, + "num_experts": 256, + "active_experts": 8, + "active_parameters": 10000000000, + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-122B-A10B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-397B-A17B-MTP", + "provider": "Qwen", + "parameter_count": "403.4B", + "parameters_raw": 403397928944, + "min_ram_gb": 225.4, + "recommended_ram_gb": 375.7, + "min_vram_gb": 206.6, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General 
purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5_moe", + "hf_downloads": 1291825, + "hf_likes": 1214, + "release_date": "2026-02-16", + "is_moe": true, + "num_experts": 256, + "active_experts": 8, + "active_parameters": 17000000000, + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-397B-A17B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true } ] diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py index aac7823..fa5fa32 100644 --- a/services/hwfit/fit.py +++ b/services/hwfit/fit.py @@ -99,6 +99,27 @@ def _estimate_speed(model, quant, run_mode, system): return k / pb * sm +def _architecture_bonus(model): + name = (model.get("name") or "").lower() + arch = (model.get("architecture") or "").lower() + text = f"{name} {arch}" + + # Keep this intentionally small: hardware fit and speed still matter, but + # current model families should not be scored the same as older Qwen2/LLama + # era entries just because the parameter count is similar. 
+ if "qwen3.6" in text or "qwen3_6" in text: + return 9 + if "qwen3.5" in text or "qwen3_5" in text: + return 8 + if "qwen3-next" in text or "qwen3_next" in text: + return 6 + if "qwen3" in text or arch.startswith("qwen3"): + return 4 + if "qwen2.5" in text or "qwen2_5" in text: + return 2 + return 0 + + def _quality_score(model, quant, use_case): pb = params_b(model) if pb < 1: @@ -128,6 +149,7 @@ def _quality_score(model, quant, use_case): if "gemma" in name_lower: base += 1 + base += _architecture_bonus(model) base += QUANT_QUALITY_PENALTY.get(quant, 0) model_uc = infer_use_case(model) @@ -220,12 +242,13 @@ def _quant_bits(q): return 0 -def analyze_model(model, system, target_quant=None): +def analyze_model(model, system, target_quant=None, scoring_use_case=None): pb = params_b(model) if pb <= 0: return None - use_case = infer_use_case(model) + model_use_case = infer_use_case(model) + score_use_case = scoring_use_case or "general" has_gpu = system.get("has_gpu", False) gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0 gpu_count = system.get("gpu_count", 1) or 1 @@ -242,6 +265,8 @@ def analyze_model(model, system, target_quant=None): ctx = model.get("context_length", 4096) or 4096 native_quant = model.get("quantization", "Q4_K_M") + if "nvfp4" in (model.get("name") or "").lower(): + native_quant = "NVFP4" preq = is_prequantized(model) # GGUF models can't be sharded across GPUs — use single GPU VRAM @@ -260,10 +285,13 @@ def analyze_model(model, system, target_quant=None): # Determine which quant to evaluate at if preq: # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a - # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose - # native bit-width matches — otherwise selecting Q8 would still surface - # AWQ-4bit models, mixing 4- and 8-bit in one view. + # GGUF quant tier (Q4/Q8/etc.), do not treat a same-bit AWQ/GPTQ build + # as equivalent. 
"Q4" means llama.cpp/Ollama-style GGUF in this UI; + # AWQ/GPTQ/FP8 are separate GPU-serving formats and must only appear + # when explicitly selected or when no quant filter is applied. if target_quant: + if not any(target_quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")): + return None _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant) if _tb and _nb and _tb != _nb: return None @@ -300,7 +328,7 @@ def analyze_model(model, system, target_quant=None): "parameter_count": model.get("parameter_count"), "params_b": round(pb, 1), "is_moe": is_moe, - "use_case": use_case, + "use_case": model_use_case, "fit_level": "too_tight", "run_mode": "no_fit", "quant": quant_to_try, @@ -334,12 +362,12 @@ def analyze_model(model, system, target_quant=None): tps = _estimate_speed(model, quant, run_mode, system) - q_score = _quality_score(model, quant, use_case) - s_score = _speed_score(tps, use_case) + q_score = _quality_score(model, quant, score_use_case) + s_score = _speed_score(tps, score_use_case) f_score = _fit_score(required_gb, budget) - c_score = _context_score(fit_ctx, use_case) + c_score = _context_score(fit_ctx, score_use_case) - wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10)) + wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10)) composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc return { @@ -348,7 +376,7 @@ def analyze_model(model, system, target_quant=None): "parameter_count": model.get("parameter_count"), "params_b": round(pb, 1), "is_moe": is_moe, - "use_case": use_case, + "use_case": model_use_case, "fit_level": fit_level, "run_mode": run_mode, "quant": quant, @@ -419,21 +447,29 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan results.sort(key=sort_fn, reverse=(sort != "vram")) return results[:limit] - # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models - filter_native = quant and any(quant.startswith(p) 
for p in ("AWQ-", "GPTQ-", "FP8")) + # If user picked a prequantized format (AWQ/FP8/GPTQ/NVFP4), filter to only those models + filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")) system_backend = (system.get("backend") or "").lower() apple_silicon = system_backend in ("mps", "metal", "apple") + rocm = system_backend == "rocm" for m in models: native_q = m.get("quantization", "") + if "nvfp4" in (m.get("name") or "").lower(): + native_q = "NVFP4" - # MLX-quantized models need the MLX runtime (mlx_lm), which Odysseus - # doesn't generate serve commands for — only llama.cpp/Ollama (Metal) - # and vLLM/SGLang (CUDA). MLX repos ship no GGUF alternative, so they're - # unrunnable on every backend we support. Always drop them, on Apple - # Silicon too, so the Cookbook never recommends a model it can't serve. - if native_q.startswith("mlx-"): + # MLX is Apple Silicon only. Hide MLX rows on non-Mac hardware scans, + # but leave them visible on Metal/MPS so Mac support is not broken. + if not apple_silicon and (native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()): + continue + + # ROCm support for vLLM/SGLang quantized safetensors is too brittle to + # recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable + # only when the user explicitly picks that format from the quant filter; + # otherwise prefer GGUF/Q* entries that Odysseus can route through + # llama.cpp/Ollama without pretending "fits VRAM" means "servable". + if rocm and is_prequantized(m) and not filter_native: continue # On Apple Silicon the only serving engines are llama.cpp and Ollama, @@ -443,7 +479,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without # this the Cookbook recommends models the Mac can't run; on CUDA these # stay visible because vLLM serves safetensors directly. 
- if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")): + is_mlx = native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower() + if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources") or is_mlx): continue # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models @@ -454,6 +491,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan continue if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"): continue + if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"): + continue if search: name = m.get("name", "").lower() @@ -461,7 +500,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan if search.lower() not in name and search.lower() not in provider: continue - result = analyze_model(m, system, target_quant=quant) + result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general")) if result is None: continue diff --git a/services/hwfit/models.py b/services/hwfit/models.py index 642983d..b62184e 100644 --- a/services/hwfit/models.py +++ b/services/hwfit/models.py @@ -5,7 +5,7 @@ import re QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"] QUANT_BPP = { - "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, + "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5, "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68, "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37, "AWQ-4bit": 0.50, "AWQ-8bit": 1.0, @@ -14,7 +14,7 @@ QUANT_BPP = { } QUANT_SPEED_MULT = { - "F16": 0.6, "BF16": 0.6, "FP8": 0.85, + "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1, "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0, "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35, "AWQ-4bit": 1.2, "AWQ-8bit": 0.85, @@ -23,7 +23,7 @@ QUANT_SPEED_MULT = { } QUANT_QUALITY_PENALTY = { - "F16": 0.0, "BF16": 0.0, "FP8": 0.0, + "F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0, "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0, 
def is_prequantized(model):
    """Return True when *model* ships in a fixed, pre-quantized format.

    A model counts as pre-quantized when its ``quantization`` field starts
    with one of ``PREQUANTIZED_PREFIXES`` or when its name contains
    ``nvfp4``.  Such formats should not go through the GGUF quant hierarchy.

    Args:
        model: Mapping describing a model.  ``quantization`` and ``name`` are
            optional and may be ``None``.

    Returns:
        bool: whether the model is treated as pre-quantized.
    """
    # `or ""` instead of a .get() default: a key that is present but null
    # would otherwise yield None and crash on .startswith().
    quant = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    # NVFP4 is also detected by name — presumably because catalogue entries
    # for NVFP4 repos can carry a different quantization tag (TODO confirm).
    # str.startswith accepts a tuple of prefixes, so no explicit loop needed.
    return "nvfp4" in name or quant.startswith(tuple(PREQUANTIZED_PREFIXES))
/**
 * Canonicalise a package identifier for comparison: lower-case, with runs of
 * "-", "_" and "." collapsed to a single "-", so that "llama_cpp" and
 * "llama-cpp-python" can be matched against each other.
 */
function _normalizePkgName(value) {
    return String(value || '').toLowerCase().replace(/[-_.]+/g, '-');
}

/**
 * Open the Cookbook on its Dependencies tab and, if `pkgName` is given,
 * scroll to and briefly flash the matching dependency row.
 *
 * The dependency list may not be rendered yet when this runs, so the lookup
 * is retried every 100 ms (up to 45 retries, roughly 4.5 s) until at least
 * one row exists.
 *
 * @param {string} [pkgName] Package to highlight; empty means "just open".
 */
function _openCookbookDependencies(pkgName = '') {
    const cookbook = window.cookbookModule;
    if (cookbook && typeof cookbook.open === 'function') {
        cookbook.open({ tab: 'Dependencies' });
    } else {
        // No module API available: fall back to clicking the toolbar button.
        document.getElementById('tool-cookbook-btn')?.click();
    }

    const wanted = _normalizePkgName(pkgName);
    const tryHighlight = (attempt = 0) => {
        const modal = document.getElementById('cookbook-modal');
        const tab = modal?.querySelector('.cookbook-tab[data-backend="Dependencies"]');
        if (tab && !tab.classList.contains('active')) tab.click();

        const rows = [...document.querySelectorAll('#cookbook-deps-list [data-pkg-name]')];
        if (!rows.length) {
            if (attempt < 45) setTimeout(() => tryHighlight(attempt + 1), 100);
            return;
        }
        if (!wanted) return;

        const row = rows.find((r) => {
            const name = _normalizePkgName(r.dataset.pkgName);
            const pip = _normalizePkgName(r.dataset.depPip);
            if (name === wanted || pip.includes(wanted)) return true;
            // Guard against an empty name: ''.includes-style matching would
            // otherwise make the first unnamed row match every query.
            return name !== '' && wanted.includes(name);
        });
        if (row) {
            row.scrollIntoView({ block: 'center' });
            row.classList.add('cookbook-pkg-flash');
            setTimeout(() => row.classList.remove('cookbook-pkg-flash'), 1800);
        }
    };
    tryHighlight();
}

/**
 * Ask the enclosing task card to open its serve editor by dispatching a
 * bubbling `cookbook:edit-serve` event on it.
 *
 * @param {Element} panel  Element inside a `.cookbook-task` card.
 * @param {Object|null} [fields] Optional field presets, passed through as
 *     `event.detail.fields`.
 */
function _openServeEditFromDiagnosis(panel, fields = null) {
    const task = panel?.closest?.('.cookbook-task');
    if (!task) return;
    task.dispatchEvent(new CustomEvent('cookbook:edit-serve', { bubbles: true, detail: { fields } }));
}

/**
 * Open the serve editor preset for llama.cpp with no GPUs selected.
 *
 * @param {Element} panel  Element inside a `.cookbook-task` card.
 */
function _openCpuServeEdit(panel) {
    _openServeEditFromDiagnosis(panel, {
        backend: 'llamacpp',
        gpus: '',
        tp: '1',
        gpu_mem: '0.80',
        // NOTE(review): presumably tells the editor to override the saved
        // backend — confirm against the `cookbook:edit-serve` listener.
        _forceBackend: true,
    });
}
GPU.', + suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.', + fixes: [ + { label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) }, + { label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) }, + ], }, { pattern: /Engine core initialization failed/i, @@ -295,17 +360,20 @@ export const ERROR_PATTERNS = [ }, { pattern: /Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels\/layer/i, - message: 'vLLM/Transformers kernel package mismatch.', + message: 'Transformers/kernels package mismatch.', fixes: [ - { label: 'Update vLLM/Transformers/kernels', action: (panel) => { + { label: 'Repair kernel package', action: (panel) => { const taskEl = panel.closest('.cookbook-task'); const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null; const host = task?.remoteHost || ''; const prefix = _buildEnvPrefix(); - const pipCmd = prefix ? prefix + ' python3 -m pip install -U vllm transformers kernels' : 'python3 -m pip install -U vllm transformers kernels'; + const pipCmd = prefix + ? prefix + ' python3 -m pip install --user --break-system-packages "kernels<0.15"' + : 'python3 -m pip install --user --break-system-packages "kernels<0.15"'; const cmd = host ? _sshCmd(host, pipCmd) : pipCmd; - _launchServeTask('update-vllm-stack', 'pip-update', cmd); + _launchServeTask('repair-kernels', 'pip-update', cmd); }}, + { label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') }, ], }, { @@ -319,13 +387,24 @@ export const ERROR_PATTERNS = [ pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i, message: 'llama-cpp-python server is not installed. 
/**
 * Pick a Markdown code fence that `text` cannot close early: at least three
 * backticks, and one more than the longest backtick run inside `text`.
 */
function _mdFence(text) {
    const runs = String(text).match(/`+/g) || [];
    const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
    return '`'.repeat(Math.max(3, longest + 1));
}

/**
 * Build a Markdown troubleshooting bundle for the clipboard.
 *
 * Sections are emitted only when their data is present: task summary,
 * diagnosis message, suggested action, launch command and captured output.
 *
 * @param {Object|null} task  Stored task record, or null/undefined.
 * @param {Object|null} diagnosis  Matched diagnosis; `message` is used.
 * @param {*} sourceText  Raw output that triggered the diagnosis.
 * @param {*} suggestionText  Suggestion line; a leading
 *     "Suggested action:" label is stripped.
 * @returns {string} Markdown text joined with "\n".
 */
function _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText) {
    const lines = ['## Odysseus Cookbook troubleshooting'];

    if (task) {
        // An SSH port only makes sense next to a remote host; never "local:22".
        const host = task.remoteHost
            ? `${task.remoteHost}${task.sshPort ? `:${task.sshPort}` : ''}`
            : 'local';
        lines.push(
            '',
            '### Task',
            `- ID: ${task.sessionId || task.id || 'unknown'}`,
            `- Type: ${task.type || 'unknown'}`,
            `- Status: ${task.status || 'unknown'}`,
            `- Model: ${task.payload?.repo_id || task.name || 'unknown'}`,
            `- Host: ${host}`,
        );
    }

    lines.push('', '### Diagnosis', diagnosis?.message || '(none)');

    // Coerce first: callers may hand over a non-string value.
    const suggestion = suggestionText == null
        ? ''
        : String(suggestionText).replace(/^Suggested action:\s*/i, '');
    if (suggestion) lines.push('', '### Suggested action', suggestion);

    const cmd = String(task?.payload?._cmd || '');
    if (cmd) {
        const fence = _mdFence(cmd);
        lines.push('', '### Launch command', `${fence}bash`, cmd, fence);
    }

    const output = sourceText ? String(sourceText).trim() : '';
    if (output) {
        const fence = _mdFence(output);
        lines.push('', '### Captured output', `${fence}text`, output, fence);
    }

    return lines.join('\n');
}
`Suggested action: ${fixes[0].label}.` + : 'Suggested action: copy the error and adjust the serve settings.'); const header = document.createElement('div'); - header.style.cssText = 'display:flex;align-items:center;justify-content:space-between;'; + header.className = 'cookbook-diag-header'; - const msg = document.createElement('div'); - msg.className = 'cookbook-diag-message'; - msg.textContent = diagnosis.message; - header.appendChild(msg); + const fold = document.createElement('button'); + fold.className = 'cookbook-diag-fold'; + fold.type = 'button'; + fold.innerHTML = '▾Error message:'; + header.appendChild(fold); + + const copy = document.createElement('button'); + copy.className = 'cookbook-diag-copy'; + copy.type = 'button'; + copy.title = 'Copy troubleshooting bundle'; + copy.setAttribute('aria-label', 'Copy troubleshooting bundle'); + copy.innerHTML = ''; + copy.addEventListener('click', (e) => { + e.stopPropagation(); + _copyText(_diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText)); + copy.classList.add('copied'); + copy.innerHTML = ''; + setTimeout(() => { + if (!copy.isConnected) return; + copy.classList.remove('copied'); + copy.innerHTML = ''; + }, 1200); + }); + header.appendChild(copy); const dismiss = document.createElement('button'); - dismiss.className = 'close-btn'; - dismiss.style.cssText = 'width:16px;height:16px;font-size:9px;flex-shrink:0;'; - dismiss.textContent = '\u2715'; - dismiss.addEventListener('click', () => { panel._diagDismissed = diagnosis.message; _clearDiagnosis(panel); }); + dismiss.className = 'cookbook-diag-dismiss'; + dismiss.type = 'button'; + dismiss.title = 'Dismiss error'; + dismiss.setAttribute('aria-label', 'Dismiss error'); + dismiss.textContent = '×'; + dismiss.addEventListener('click', (e) => { + e.stopPropagation(); + panel._diagDismissed = diagnosis.message; + _clearDiagnosis(panel); + }); header.appendChild(dismiss); diag.appendChild(header); - if (diagnosis.fixes && diagnosis.fixes.length) { + 
const body = document.createElement('div'); + body.className = 'cookbook-diag-body'; + body.classList.toggle('hidden', panel._diagCollapsed); + fold.querySelector('.cookbook-diag-chevron').textContent = panel._diagCollapsed ? '▸' : '▾'; + const msg = document.createElement('div'); + msg.className = 'cookbook-diag-message'; + msg.textContent = diagnosis.message; + body.appendChild(msg); + const suggestion = document.createElement('div'); + suggestion.className = 'cookbook-diag-suggestion'; + suggestion.textContent = suggestionText; + body.appendChild(suggestion); + fold.addEventListener('click', (e) => { + e.stopPropagation(); + panel._diagCollapsed = !panel._diagCollapsed; + body.classList.toggle('hidden', panel._diagCollapsed); + fold.querySelector('.cookbook-diag-chevron').textContent = panel._diagCollapsed ? '▸' : '▾'; + }); + diag.appendChild(body); + + const runFix = async (fix, button, busyLabel = fix.label, onStart = null, onDone = null) => { + if (!fix || !button || button.dataset.busy) return; + button.dataset.busy = '1'; + const _orig = button.textContent; + const wp = spinnerModule.createWhirlpool(12); + wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;'; + button.textContent = ''; + button.appendChild(wp.element); + const _lbl = document.createElement('span'); + _lbl.textContent = busyLabel; + _lbl.style.verticalAlign = 'middle'; + button.appendChild(_lbl); + try { + if (typeof onStart === 'function') onStart(); + await fix.action(panel, sourceText); + } catch (err) { + console.error('[cookbook] diagnosis fix failed', err); + } finally { + if (button.isConnected) { + try { wp.destroy(); } catch {} + button.textContent = _orig; + delete button.dataset.busy; + } + if (typeof onDone === 'function') onDone(); + } + }; + + if (fixes.length) { const row = document.createElement('div'); row.className = 'cookbook-diag-fixes'; - for (const fix of diagnosis.fixes) { - const btn = 
document.createElement('button'); - btn.className = 'cookbook-btn cookbook-diag-btn'; - btn.textContent = fix.label; - btn.addEventListener('click', async () => { - if (btn.dataset.busy) return; - btn.dataset.busy = '1'; - // Spinner feedback while the fix runs (kill + relaunch takes a moment). - const _orig = btn.textContent; - const wp = spinnerModule.createWhirlpool(12); - wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;'; - btn.textContent = ''; - btn.appendChild(wp.element); - const _lbl = document.createElement('span'); - _lbl.textContent = _orig; - _lbl.style.verticalAlign = 'middle'; - btn.appendChild(_lbl); - try { - await fix.action(panel, sourceText); - } catch (e) { - console.error('[cookbook] diagnosis fix failed', e); - } finally { - // Retries animate the whole card away (button goes with it). For fixes - // that leave the card in place, restore the label. - if (btn.isConnected) { try { wp.destroy(); } catch {} btn.textContent = _orig; delete btn.dataset.busy; } - } - }); - row.appendChild(btn); + + if (fixes.length <= 3) { + for (const fix of fixes) { + const btn = document.createElement('button'); + btn.className = 'cookbook-btn cookbook-diag-btn'; + btn.type = 'button'; + btn.textContent = fix.label; + btn.addEventListener('click', (e) => { + e.stopPropagation(); + runFix(fix, btn); + }); + row.appendChild(btn); + } + body.appendChild(row); + return; } - diag.appendChild(row); + + const wrap = document.createElement('div'); + wrap.className = 'cookbook-diag-actions'; + + const trigger = document.createElement('button'); + trigger.className = 'cookbook-btn cookbook-diag-action-trigger'; + trigger.type = 'button'; + trigger.textContent = 'Actions'; + trigger.appendChild(document.createTextNode(' ▾')); + wrap.appendChild(trigger); + + const menu = document.createElement('div'); + menu.className = 'dropdown cookbook-diag-menu hidden'; + for (const fix of fixes) { + const item = 
document.createElement('button'); + item.type = 'button'; + item.textContent = fix.label; + item.addEventListener('click', async (e) => { + e.stopPropagation(); + if (item.dataset.busy || trigger.dataset.busy) return; + item.dataset.busy = '1'; + await runFix(fix, trigger, fix.label, () => menu.classList.add('hidden'), () => delete item.dataset.busy); + }); + menu.appendChild(item); + } + wrap.appendChild(menu); + trigger.addEventListener('click', (e) => { + e.stopPropagation(); + if (trigger.dataset.busy) return; + document.querySelectorAll('.cookbook-diag-menu').forEach(m => { + if (m !== menu) m.classList.add('hidden'); + }); + menu.classList.toggle('hidden'); + }); + row.appendChild(wrap); + body.appendChild(row); } } diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js index 4253553..7a46666 100644 --- a/static/js/cookbook-hwfit.js +++ b/static/js/cookbook-hwfit.js @@ -193,6 +193,8 @@ export function _renderGpuToggles(system) { if (quantSel) { if (count <= 1) { quantSel.value = 'Q4_K_M'; // RAM or 1 GPU -> Q4 sweet spot + } else if (String(system?.backend || '').toLowerCase() === 'rocm') { + quantSel.value = 'Q4_K_M'; // ROCm default stays GGUF/local-safe; AWQ is explicit only } else { quantSel.value = 'AWQ-4bit'; // Multi-GPU -> AWQ for vLLM } diff --git a/static/js/cookbook.js b/static/js/cookbook.js index cac5a90..3443156 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -260,12 +260,31 @@ export function _detectBackend(model) { const q = (model.quant || '').toUpperCase(); const sysBackend = String(_hwfitCache?.system?.backend || '').toLowerCase(); const isRocm = sysBackend === 'rocm'; + const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend); + const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase(); + if (!isAppleSilicon && (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX'))) { + return { backend: 'unsupported', label: 'Unsupported' }; + } + const isAwqLike = 
/^AWQ|^GPTQ|^NVFP4/.test(q) || q === 'FP8' || /\b(awq|gptq|fp8|nvfp4)\b/i.test(_nm); + const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf'); // Image gen models → diffusers if (model.is_image_gen || model.is_diffusion || model._tag === 'image') { return { backend: 'diffusers', label: 'Diffusers' }; } + // AWQ / GPTQ / FP8 are safetensors GPU-serving formats. Never route them + // through llama.cpp/Ollama just because the host is Mac/Windows; those engines + // need GGUF. The UI will warn/block on Metal where vLLM/SGLang aren't viable. + if (isAwqLike) { + return { backend: 'vllm', label: 'vLLM' }; + } + + // GGUF → llama.cpp/Ollama-compatible. + if (isGgufLike) { + return { backend: 'llamacpp', label: 'llama.cpp' }; + } + // Windows → default to llama.cpp (no vLLM support on Windows) if (_isWindows()) { return { backend: 'llamacpp', label: 'llama.cpp' }; @@ -278,19 +297,6 @@ export function _detectBackend(model) { return { backend: 'llamacpp', label: 'llama.cpp' }; } - // AWQ / GPTQ / FP8 → vLLM - if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') { - return { backend: 'vllm', label: 'vLLM' }; - } - - // GGUF → llama.cpp. Match the quant tag OR a gguf hint in the repo/path/name: - // a raw .gguf file often has no quant field, which made it fall through to the - // vLLM default below. - const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase(); - if (model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf')) { - return { backend: 'llamacpp', label: 'llama.cpp' }; - } - // ROCm/AMD machines should not blindly default HF safetensors models to // vLLM. SGLang is the safer OpenAI-compatible default for plain HF text // repos there; llama.cpp still wins above whenever the model is GGUF. 
@@ -1020,6 +1026,16 @@ function _wireTabEvents(body) { // Download input const dlBtn = document.getElementById('cookbook-dl-btn'); const dlInput = document.getElementById('cookbook-dl-repo'); + const dlCardToggle = document.getElementById('cookbook-download-card-toggle'); + const dlCardBody = document.getElementById('cookbook-download-card-body'); + const dlCardArrow = document.getElementById('cookbook-download-card-arrow'); + if (dlCardToggle && dlCardBody) { + dlCardToggle.addEventListener('click', () => { + const isOpen = dlCardBody.style.display !== 'none'; + dlCardBody.style.display = isOpen ? 'none' : 'block'; + if (dlCardArrow) dlCardArrow.style.transform = isOpen ? 'rotate(0deg)' : 'rotate(90deg)'; + }); + } if (dlBtn && dlInput) { function _stripHfUrl(input) { let repo = input.trim(); @@ -1099,8 +1115,12 @@ function _wireTabEvents(body) { if (hfToggle && hfList) { let _loaded = false; // Per-server VRAM cache so we don't re-probe on every expand - const _vramCache = {}; - async function _getSelectedServerVram() { + const _hwCache = {}; + function _hfModelLooksAwqLike(m) { + const text = `${m?.repo_id || ''} ${(m?.tags || []).join(' ')}`.toLowerCase(); + return /\b(awq|gptq|fp8|4bit|int4)\b/.test(text); + } + async function _getSelectedServerHw() { // Prefer the "What Fits" dropdown (the main control that shows hardware); // fall back to the download dropdown. This is the server the list ranks for. 
const dlSrv = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server'); @@ -1117,7 +1137,7 @@ function _wireTabEvents(body) { } } const cacheKey = host || 'local'; - if (_vramCache[cacheKey] !== undefined) return _vramCache[cacheKey]; + if (_hwCache[cacheKey]) return _hwCache[cacheKey]; // Fetch system info for this server from hwfit try { const qp = new URLSearchParams(); @@ -1127,13 +1147,13 @@ function _wireTabEvents(body) { const r = await fetch(`/api/hwfit/system?${qp}`); if (r.ok) { const sys = await r.json(); - const v = sys?.gpu_vram_gb || 0; - _vramCache[cacheKey] = v; - return v; + const hw = { vram: sys?.gpu_vram_gb || 0, backend: String(sys?.backend || '').toLowerCase() }; + _hwCache[cacheKey] = hw; + return hw; } } catch {} - _vramCache[cacheKey] = 0; - return 0; + _hwCache[cacheKey] = { vram: 0, backend: '' }; + return _hwCache[cacheKey]; } async function _loadLatest() { // Match the Dependencies loader: whirlpool spinner + text label so the @@ -1152,7 +1172,8 @@ function _wireTabEvents(body) { } catch { hfList.innerHTML = '