diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index ca954ab..c311b24 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -434,6 +434,8 @@ def _parse_serve_phase(snapshot: str, task_type: str = "serve") -> dict: } if "Application startup complete" in flat: return {"phase": "ready", "status": "ready"} + if re.search(r'Ollama API ready on port\s+\d+', flat, re.I): + return {"phase": "ready", "status": "ready"} # HTTP access logs (e.g. GET /v1/models 200 OK) mean the server is up and serving if re.search(r'(?:GET|POST)\s+/[^\s]*\s+HTTP/[\d.]+"\s*\d{3}', flat): return {"phase": "idle", "status": "ready"} diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index c622d38..d794aee 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -905,6 +905,7 @@ def setup_cookbook_routes() -> APIRouter: # Show whether the HF token reached this server (masked) — a gated # model vLLM has to download will be denied without it. runner_lines.append(_HF_TOKEN_STATUS_SNIPPET) + handled_ollama_serve = False # Auto-install inference engine if missing if "llama_cpp" in req.cmd or "llama-server" in req.cmd: # Prefer the NATIVE llama-server binary — its minja templating @@ -978,17 +979,48 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' fi') runner_lines.append('fi') elif "ollama" in req.cmd: - # Ollama manages its own model store and HTTP server. Just make - # sure the binary exists and the daemon is up before running the - # command (the natural serving engine on Apple Silicon / Metal). + handled_ollama_serve = True + _ollama_port = "11434" + _ollama_match = re.search(r"OLLAMA_HOST=[^\s:]+:(\d+)", req.cmd) + if _ollama_match: + _ollama_port = _ollama_match.group(1) + # Ollama can be a host binary, a system service, or a Docker + # container. If the HTTP API is already reachable, the model is + # already served and we should not require a host `ollama` CLI. 
+ runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"') + runner_lines.append('ODYSSEUS_OLLAMA_URL=""') + runner_lines.append('for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do') + runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue') + runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do') + runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"') + runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then') + runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"') + runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"') + runner_lines.append(' break 2') + runner_lines.append(' fi') + runner_lines.append(' done') + runner_lines.append('done') + runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then') + runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then') + runner_lines.append(' echo "[odysseus] Selected Ollama port ' + _ollama_port + ' was not reachable; using running Ollama on port ${ODYSSEUS_OLLAMA_PORT}."') + runner_lines.append(' fi') + runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"') + runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."') + runner_lines.append(' exec bash -i') + runner_lines.append('fi') runner_lines.append('if ! command -v ollama &>/dev/null; then') - runner_lines.append(' echo "ERROR: Ollama not found. Install it (macOS: brew install ollama, or https://ollama.com/download), then launch again."') - runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') - runner_lines.append('fi') - runner_lines.append('if ! 
curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then') - runner_lines.append(' echo "Starting ollama server..."; (ollama serve >/dev/null 2>&1 &)') - runner_lines.append(' for _ in 1 2 3 4 5 6 7 8 9 10; do curl -sf http://localhost:11434/api/tags >/dev/null 2>&1 && break; sleep 1; done') + runner_lines.append(' echo "ERROR: Ollama not found and no Ollama API is reachable on 127.0.0.1, localhost, or host.docker.internal (ports ${ODYSSEUS_OLLAMA_PORT}/11434)."') + runner_lines.append(' echo "Install Ollama, start an Ollama service/container on this server, or pick the port where it is already listening."') + runner_lines.append(' echo') + runner_lines.append(' echo "=== Process exited with code 127 ==="') + runner_lines.append(' exec bash -i') runner_lines.append('fi') + runner_lines.append('echo "Starting ollama server on 0.0.0.0:${ODYSSEUS_OLLAMA_PORT}..."') + runner_lines.append('OLLAMA_HOST="0.0.0.0:${ODYSSEUS_OLLAMA_PORT}" ollama serve') + runner_lines.append('_ody_exit=$?') + runner_lines.append('echo') + runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="') + runner_lines.append('exec bash -i') elif "vllm serve" in req.cmd: # vLLM is CUDA/ROCm-only and does not run on macOS at all. runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then') @@ -1016,18 +1048,19 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') - _append_serve_preflight_exit_lines( - runner_lines, - keep_shell_open=not local_windows, - ) - runner_lines.append(req.cmd) - if local_windows: - # Detached background process — no interactive shell to keep open. - # Print the exit marker the status poller looks for, then stop. 
- _append_serve_exit_code_lines(runner_lines, keep_shell_open=False) - else: - # Keep shell open after exit so user can see errors - _append_serve_exit_code_lines(runner_lines, keep_shell_open=True) + if not handled_ollama_serve: + _append_serve_preflight_exit_lines( + runner_lines, + keep_shell_open=not local_windows, + ) + runner_lines.append(req.cmd) + if local_windows: + # Detached background process — no interactive shell to keep open. + # Print the exit marker the status poller looks for, then stop. + _append_serve_exit_code_lines(runner_lines, keep_shell_open=False) + else: + # Keep shell open after exit so user can see errors + _append_serve_exit_code_lines(runner_lines, keep_shell_open=True) runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh" runner_path.write_text("\n".join(runner_lines) + "\n", encoding="utf-8") @@ -1692,10 +1725,14 @@ def setup_cookbook_routes() -> APIRouter: if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb: continue - # Skip if no size info — without a size we can't tell if it's a real - # full-weight model or a tiny adapter, so we'd rather drop it - if est_vram is None: - continue + # Unknown-size models (e.g. MiniMax-M2.7, DeepSeek-V4-Flash) have no + # "NB" in the repo id, so the regex above can't extract their + # param count. Previously we dropped them entirely, which made + # brand-new flagship releases silently vanish from this list even + # on rigs with hundreds of GB of VRAM. Adapters/LoRAs are already + # filtered by _is_excluded(), so what falls through here is + # overwhelmingly full models — keep them, just without a size + # badge (the frontend handles needed_vram_gb=null gracefully). 
out.append({ "repo_id": repo_id, diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py index 9a0a4e9..e49b56e 100644 --- a/routes/hwfit_routes.py +++ b/routes/hwfit_routes.py @@ -90,7 +90,7 @@ def setup_hwfit_routes(): return detect_system(host=host, ssh_port=ssh_port, platform=platform, fresh=fresh) @router.get("/models") - def get_models(use_case: str = "", sort: str = "score", limit: int = 50, search: str = "", host: str = "", quant: str = "", gpu_count: str = "", gpu_group: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False): + def get_models(use_case: str = "", sort: str = "score", limit: int = 50, search: str = "", host: str = "", quant: str = "", ctx: str = "", gpu_count: str = "", gpu_group: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False, fit_only: bool = False): """Rank LLM models against detected hardware and return scored results. gpu_count: override GPU count (0 = CPU only, 1-N = simulate N GPUs of the active group). gpu_group: index into system.gpu_groups (the homogeneous @@ -171,7 +171,14 @@ def setup_hwfit_routes(): # gpu_only stays off here so the default view still surfaces offload. 
_apply_group(grp, grp["count"]) - results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None) + try: + target_context = int(ctx) if ctx else None + except ValueError: + target_context = None + if target_context is not None: + target_context = max(1024, min(target_context, 1000000)) + + results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only) return {"system": system, "models": results} @router.get("/image-models") diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json index 0267535..b7c45ab 100644 --- a/services/hwfit/data/hf_models.json +++ b/services/hwfit/data/hf_models.json @@ -4375,7 +4375,14 @@ "hf_downloads": 51135, "hf_likes": 2, "release_date": "2025-09-23", - "_discovered": true + "_discovered": true, + "gguf_sources": [ + { + "repo": "typhoon-ai/typhoon2.5-qwen3-4b-gguf", + "file": "typhoon2.5-qwen3-4b-q4_k_m.gguf", + "quant": "Q4_K_M" + } + ] }, { "name": "JunHowie/Qwen3-4B-Instruct-2507-GPTQ-Int4", @@ -8994,7 +9001,14 @@ "num_experts": 128, "active_experts": 8, "active_parameters": 3339450907, - "_discovered": true + "_discovered": true, + "gguf_sources": [ + { + "repo": "typhoon-ai/typhoon2.5-qwen3-30b-a3b-gguf", + "file": "typhoon2.5-qwen3-30b-a3b-q4_k_m.gguf", + "quant": "Q4_K_M" + } + ] }, { "name": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ", @@ -12078,7 +12092,7 @@ "min_ram_gb": 421.3, "recommended_ram_gb": 702.1, "min_vram_gb": 386.1, - "quantization": "Q4_K_M", + "quantization": "BF16", "context_length": 202752, "use_case": "General purpose text generation", "capabilities": [], @@ -12088,6 +12102,24 @@ "hf_likes": 1698, "release_date": "2026-02-11" }, + { + "name": "zai-org/GLM-5.1", + "provider": "zai-org", + "parameter_count": "753.9B", + "parameters_raw": 753864139008, + "min_ram_gb": 421.3, + "recommended_ram_gb": 702.1, + 
"min_vram_gb": 386.1, + "quantization": "BF16", + "context_length": 202752, + "use_case": "General purpose text generation", + "capabilities": [], + "pipeline_tag": "text-generation", + "architecture": "glm_moe_dsa", + "hf_downloads": 141194, + "hf_likes": 0, + "release_date": "2026-04-03" + }, { "name": "moonshotai/Kimi-K2-Instruct", "provider": "moonshotai", @@ -13919,7 +13951,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-E2B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -13942,7 +13979,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-E4B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -13965,7 +14007,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-31B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -13988,7 +14035,12 @@ "architecture": "gemma4", "pipeline_tag": "image-text-to-text", "release_date": "2026-04-01", - "gguf_sources": [], + "gguf_sources": [ + { + "repo": "unsloth/gemma-4-26B-A4B-it-GGUF", + "provider": "unsloth" + } + ], "capabilities": [ "vision" ] @@ -18719,5 +18771,307 @@ "hf_likes": 0, "release_date": "2026-04-19", "_discovered": true + }, + { + "name": "Qwen/Qwen3.6-27B-MTP", + "provider": "Qwen", + "parameter_count": "27.8B", + "parameters_raw": 27781427952, + "min_ram_gb": 16.6, + "recommended_ram_gb": 21.6, + "min_vram_gb": 16.6, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, coding, MTP", + "is_moe": false, + "num_experts": null, + "active_experts": null, + "active_parameters": null, + "architecture": "qwen3", + "pipeline_tag": "text-generation", + 
"release_date": "2026-04-01", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.6-27B-MTP-GGUF", + "provider": "unsloth" + } + ], + "capabilities": [ + "mtp" + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.6-35B-A3B-MTP", + "provider": "Qwen", + "parameter_count": "36.0B", + "parameters_raw": 35951822704, + "min_ram_gb": 21.4, + "recommended_ram_gb": 27.8, + "min_vram_gb": 21.4, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose (MoE), MTP", + "is_moe": true, + "num_experts": null, + "active_experts": null, + "active_parameters": 3000000000, + "architecture": "qwen3_moe", + "pipeline_tag": "text-generation", + "release_date": "2026-04-01", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.6-35B-A3B-MTP-GGUF", + "provider": "unsloth" + } + ], + "capabilities": [ + "mtp" + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-0.8B-MTP", + "provider": "Qwen", + "parameter_count": "873M", + "parameters_raw": 873438784, + "min_ram_gb": 1.0, + "recommended_ram_gb": 2.0, + "min_vram_gb": 0.5, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 93448, + "hf_likes": 208, + "release_date": "2026-02-28", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-0.8B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-2B-MTP", + "provider": "Qwen", + "parameter_count": "2.3B", + "parameters_raw": 2274069824, + "min_ram_gb": 1.3, + "recommended_ram_gb": 2.1, + "min_vram_gb": 1.2, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 46974, + "hf_likes": 115, + "release_date": "2026-02-28", + "gguf_sources": [ + { + "repo": 
"unsloth/Qwen3.5-2B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-4B-MTP", + "provider": "Qwen", + "parameter_count": "4.7B", + "parameters_raw": 4659865088, + "min_ram_gb": 2.6, + "recommended_ram_gb": 4.3, + "min_vram_gb": 2.4, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 99087, + "hf_likes": 202, + "release_date": "2026-02-27", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-4B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-9B-MTP", + "provider": "Qwen", + "parameter_count": "9.7B", + "parameters_raw": 9653104368, + "min_ram_gb": 5.4, + "recommended_ram_gb": 9.0, + "min_vram_gb": 4.9, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 172298, + "hf_likes": 345, + "release_date": "2026-02-27", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-9B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-27B-MTP", + "provider": "Qwen", + "parameter_count": "27.8B", + "parameters_raw": 27781427952, + "min_ram_gb": 15.5, + "recommended_ram_gb": 25.9, + "min_vram_gb": 14.2, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5", + "hf_downloads": 406808, + "hf_likes": 565, + "release_date": "2026-02-24", + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-27B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-35B-A3B-MTP", + 
"provider": "Qwen", + "parameter_count": "36.0B", + "parameters_raw": 35951822704, + "min_ram_gb": 20.1, + "recommended_ram_gb": 33.5, + "min_vram_gb": 18.4, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5_moe", + "hf_downloads": 769032, + "hf_likes": 905, + "release_date": "2026-02-24", + "is_moe": true, + "num_experts": 256, + "active_experts": 8, + "active_parameters": 3000000000, + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-35B-A3B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-122B-A10B-MTP", + "provider": "Qwen", + "parameter_count": "125.1B", + "parameters_raw": 125086497008, + "min_ram_gb": 69.9, + "recommended_ram_gb": 116.5, + "min_vram_gb": 64.1, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5_moe", + "hf_downloads": 171055, + "hf_likes": 389, + "release_date": "2026-02-24", + "is_moe": true, + "num_experts": 256, + "active_experts": 8, + "active_parameters": 10000000000, + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-122B-A10B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true + }, + { + "name": "Qwen/Qwen3.5-397B-A17B-MTP", + "provider": "Qwen", + "parameter_count": "403.4B", + "parameters_raw": 403397928944, + "min_ram_gb": 225.4, + "recommended_ram_gb": 375.7, + "min_vram_gb": 206.6, + "quantization": "Q4_K_M", + "context_length": 262144, + "use_case": "General purpose, MTP", + "capabilities": [ + "mtp", + "tool_use", + "vision" + ], + "pipeline_tag": "image-text-to-text", + "architecture": "qwen3_5_moe", + "hf_downloads": 1291825, + "hf_likes": 1214, + "release_date": "2026-02-16", + "is_moe": true, + "num_experts": 256, + 
"active_experts": 8, + "active_parameters": 17000000000, + "gguf_sources": [ + { + "repo": "unsloth/Qwen3.5-397B-A17B-MTP-GGUF", + "provider": "unsloth" + } + ], + "_discovered": true } -] +] \ No newline at end of file diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py index f3207f1..3d52f81 100644 --- a/services/hwfit/fit.py +++ b/services/hwfit/fit.py @@ -18,7 +18,7 @@ GPU_BANDWIDTH = { "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288, "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224, "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229, - "9070 xt": 624, "9070": 488, + "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322, # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed # before the bare "m_" keys matters less than length-sorting (done below), @@ -26,7 +26,8 @@ GPU_BANDWIDTH = { "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68, "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100, "m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100, - "m4 max": 410, "m4 pro": 273, "m4": 120, + "m4 max": 546, "m4 pro": 273, "m4": 120, + "m5 max": 546, "m5 pro": 273, "m5": 150, } # Pre-sort keys by length descending for correct substring matching @@ -69,8 +70,18 @@ def _lookup_bandwidth(gpu_name): return None -def _estimate_speed(model, quant, run_mode, system): - """Estimate tok/s. Uses active params for MoE (only active experts run per token).""" +def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0): + """Estimate tok/s. Uses active params for MoE (only active experts run per token). + + offload_frac (0..1): fraction of the model's weights that spill to system RAM + (CPU) because they don't fit VRAM. 
Generation reads every active weight per + token, so when part lives in CPU RAM the per-token time is dominated by the + slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and + system-RAM bandwidth weighted by what's where — far more accurate than a flat + "halve it" for partial offload, which under/over-shoots depending on amount. + Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with + light offload → ~59 t/s est vs 59.8 measured. + """ pb = _active_params_b(model) is_moe = model.get("is_moe", False) bw = _lookup_bandwidth(system.get("gpu_name")) @@ -82,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system): if model_gb <= 0: return 0.0 efficiency = 0.55 - raw_tps = (bw / model_gb) * efficiency if run_mode == "cpu_offload": - mode_factor = 0.5 - elif is_moe: - mode_factor = 0.8 - else: - mode_factor = 1.0 - return raw_tps * mode_factor + # Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be + # conservative since offloaded MoE is also compute-bound on CPU. + cpu_bw = 55.0 + frac = min(max(offload_frac, 0.0), 1.0) + # If we don't know the fraction (legacy callers pass 0 with + # cpu_offload), assume a meaningful spill so we don't overestimate. + if frac <= 0.0: + frac = 0.5 + # Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the + # slow CPU portion dominates as it grows (matches the steep real-world + # drop-off when more experts offload). + eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw) + raw_tps = (eff_bw / model_gb) * efficiency + return raw_tps * (0.8 if is_moe else 1.0) + # Fully on GPU. 
+ raw_tps = (bw / model_gb) * efficiency + return raw_tps * (0.8 if is_moe else 1.0) k = FALLBACK_K.get(backend, 70) if pb <= 0: @@ -98,6 +119,27 @@ def _estimate_speed(model, quant, run_mode, system): return k / pb * sm +def _architecture_bonus(model): + name = (model.get("name") or "").lower() + arch = (model.get("architecture") or "").lower() + text = f"{name} {arch}" + + # Keep this intentionally small: hardware fit and speed still matter, but + # current model families should not be scored the same as older Qwen2/LLama + # era entries just because the parameter count is similar. + if "qwen3.6" in text or "qwen3_6" in text: + return 9 + if "qwen3.5" in text or "qwen3_5" in text: + return 8 + if "qwen3-next" in text or "qwen3_next" in text: + return 6 + if "qwen3" in text or arch.startswith("qwen3"): + return 4 + if "qwen2.5" in text or "qwen2_5" in text: + return 2 + return 0 + + def _quality_score(model, quant, use_case): pb = params_b(model) if pb < 1: @@ -127,13 +169,21 @@ def _quality_score(model, quant, use_case): if "gemma" in name_lower: base += 1 + base += _architecture_bonus(model) base += QUANT_QUALITY_PENALTY.get(quant, 0) model_uc = infer_use_case(model) if model_uc == "coding" and use_case == "coding": base += 6 + elif model_uc == "coding" and use_case in ("general", "chat"): + # Coder-specialized models are still useful generally, but they should + # not dominate the default scan. If the user wants code, the Coding + # filter gives them the boost above. 
+ base -= 10 if model_uc == "reasoning" and use_case == "reasoning" and pb >= 13: base += 5 + elif model_uc == "reasoning" and use_case == "chat": + base -= 4 if model_uc == "multimodal" and use_case == "multimodal": base += 6 @@ -196,9 +246,9 @@ def _quant_bits(q): Returns 0 when unknown (caller treats unknown as "don't filter").""" qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "") # GGUF k-quants + float formats - if qu.startswith("Q8") or "FP8" in qu: + if qu.startswith("Q8") or "FP8" in qu or "INT8" in qu or qu.startswith("W8"): return 8 - if qu.startswith("Q4") or qu.startswith("IQ4"): + if qu.startswith("Q4") or qu.startswith("IQ4") or "FP4" in qu or "NF4" in qu or "INT4" in qu or qu.startswith("W4"): return 4 if qu.startswith("Q2") or qu.startswith("IQ2"): return 2 @@ -210,7 +260,7 @@ def _quant_bits(q): return 6 if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"): return 16 - # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 …) + # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...) 
m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu) if m: b = int(m.group(1)) @@ -219,12 +269,36 @@ def _quant_bits(q): return 0 -def analyze_model(model, system, target_quant=None): +def _native_quant(model): + native_quant = model.get("quantization", "Q4_K_M") + name = (model.get("name") or "").lower() + fmt = (model.get("format") or "").lower() + text = f"{name} {fmt}" + if "nvfp4" in text: + return "NVFP4" + if re.search(r"(^|[-_/])fp8($|[-_/\s])", text): + return "FP8" + if "gptq" in text: + m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text) + return f"GPTQ-{m.group(1)}bit" if m else "GPTQ" + if "awq" in text: + m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text) + return f"AWQ-{m.group(1)}bit" if m else "AWQ" + if "mlx" in text: + m = re.search(r"mlx[-_]?(\d{1,2})bit", text) + return f"mlx-{m.group(1)}bit" if m else native_quant + if not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text): + return "INT8" + return native_quant + + +def analyze_model(model, system, target_quant=None, scoring_use_case=None, target_context=None): pb = params_b(model) if pb <= 0: return None - use_case = infer_use_case(model) + model_use_case = infer_use_case(model) + score_use_case = scoring_use_case or "general" has_gpu = system.get("has_gpu", False) gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0 gpu_count = system.get("gpu_count", 1) or 1 @@ -238,9 +312,14 @@ def analyze_model(model, system, target_quant=None): gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0 eff_ram = 0 if gpu_only else available_ram is_moe = model.get("is_moe", False) - ctx = model.get("context_length", 4096) or 4096 + model_ctx = model.get("context_length", 4096) or 4096 + try: + target_context = int(target_context or 0) + except (TypeError, ValueError): + target_context = 0 + ctx = min(model_ctx, target_context) if target_context > 0 else 
model_ctx - native_quant = model.get("quantization", "Q4_K_M") + native_quant = _native_quant(model) preq = is_prequantized(model) # GGUF models can't be sharded across GPUs — use single GPU VRAM @@ -256,13 +335,22 @@ def analyze_model(model, system, target_quant=None): else: effective_vram = gpu_vram + native_gpu_only = preq and not native_quant.startswith("mlx-") + # Determine which quant to evaluate at + native_quant_prefixes = ( + "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4", + "INT4", "INT8", "W4A16", "W8A8", "W8A16", + ) + if preq: - # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a - # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose - # native bit-width matches — otherwise selecting Q8 would still surface - # AWQ-4bit models, mixing 4- and 8-bit in one view. + # Native HF/vLLM quantized repos come at a fixed format. If the user + # picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit + # AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate + # serving paths and only appear when explicitly selected or unfiltered. if target_quant: + if not any(target_quant.startswith(p) for p in native_quant_prefixes): + return None _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant) if _tb and _nb and _tb != _nb: return None @@ -270,20 +358,25 @@ def analyze_model(model, system, target_quant=None): elif target_quant: # User picked a specific quant quant_to_try = target_quant + elif gpu_count >= 2: + # Multi-GPU box: vLLM/SGLang can't serve GGUF Q* quants (those are + # llama.cpp-only). Default non-prequantized models to BF16 so the row + # is meaningful on a multi-GPU rig. If BF16 doesn't fit, the model + # surfaces as too_tight — better than showing a Q4 row the user + # can't actually serve with vLLM on >1 GPU. 
+ quant_to_try = "BF16" else: - # Default: Q4_K_M (user's stated preference) + # Default: Q4_K_M (user's stated preference) — kept for single-GPU + # and RAM modes where llama.cpp serving is the natural path. quant_to_try = "Q4_K_M" - result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram) + # Multi-GPU filter: skip the row if the resolved quant is a GGUF tier + # (Q*/IQ-prefixed) — vLLM/SGLang can't serve those, so showing them on + # a 2+ GPU rig just clutters the list with unservable candidates. + if gpu_count >= 2 and quant_to_try and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")): + return None - # If target quant doesn't fit and it's not pre-quantized, try lower quants - if result is None and not preq and target_quant: - from services.hwfit.models import QUANT_HIERARCHY - idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1 - for q in QUANT_HIERARCHY[idx + 1:]: - result = _try_quant_at(model, q, ctx, effective_vram, eff_ram) - if result: - break + result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram) if result is None: # Model doesn't fit on the user's current hardware. 
Surface it @@ -299,7 +392,7 @@ def analyze_model(model, system, target_quant=None): "parameter_count": model.get("parameter_count"), "params_b": round(pb, 1), "is_moe": is_moe, - "use_case": use_case, + "use_case": model_use_case, "fit_level": "too_tight", "run_mode": "no_fit", "quant": quant_to_try, @@ -309,7 +402,8 @@ def analyze_model(model, system, target_quant=None): "score": 0, "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0}, "gguf_sources": model.get("gguf_sources", []), - "context_length": model.get("context_length", 4096), + "context_length": model_ctx, + "target_context": target_context or None, } run_mode, quant, fit_ctx, required_gb = result @@ -331,14 +425,19 @@ def analyze_model(model, system, target_quant=None): else: fit_level = "marginal" - tps = _estimate_speed(model, quant, run_mode, system) + # Fraction of the model that spills to CPU RAM (drives the offload speed + # model). When offloading, anything beyond the GPU's VRAM lives in system RAM. + offload_frac = 0.0 + if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0: + offload_frac = max(0.0, (required_gb - effective_vram) / required_gb) + tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac) - q_score = _quality_score(model, quant, use_case) - s_score = _speed_score(tps, use_case) + q_score = _quality_score(model, quant, score_use_case) + s_score = _speed_score(tps, score_use_case) f_score = _fit_score(required_gb, budget) - c_score = _context_score(fit_ctx, use_case) + c_score = _context_score(fit_ctx, score_use_case) - wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10)) + wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10)) composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc return { @@ -347,7 +446,7 @@ def analyze_model(model, system, target_quant=None): "parameter_count": model.get("parameter_count"), "params_b": round(pb, 1), "is_moe": is_moe, - "use_case": 
use_case, + "use_case": model_use_case, "fit_level": fit_level, "run_mode": run_mode, "quant": quant, @@ -362,21 +461,67 @@ def analyze_model(model, system, target_quant=None): "context": round(c_score, 1), }, "gguf_sources": model.get("gguf_sources", []), - "context_length": model.get("context_length", 4096), + "context_length": model_ctx, + "release_date": model.get("release_date", ""), + "target_context": target_context or None, } +def _version_key(name): + """Parse the model's version number from its display name so equal-score + rows can break ties in favor of the newer release (e.g. M2.7 > M2.5). + Returns a float; 0.0 for names with no recognizable version. The regex + grabs the FIRST 'word-with-digits' pattern after a hyphen/underscore, + so e.g. 'MiniMax-M2.7' -> 2.7, 'Qwen3.6-35B' -> 3.6, 'M2' -> 2.0.""" + import re as _re + if not name: + return 0.0 + # Match the version-marker word: a letter followed by a number with + # optional decimal, e.g. M2.7, V4, Pro3. Take the first hit; ignore + # "B" param-count suffixes (Qwen3-235B should yield 3, not 235). + for m in _re.finditer(r"[A-Za-z](\d+(?:\.\d+)?)(?![A-Za-z])", name): + val = m.group(1) + # Skip param-count tokens (e.g. "235B" gives "235" but the next + # char would be "B" — already excluded by the negative lookahead). + try: + f = float(val) + except ValueError: + continue + # Heuristic: bare integers >= 100 are almost certainly param counts + # (1B/3B/8B/70B/235B…), not version numbers. Skip them. + if "." not in val and f >= 100: + continue + return f + return 0.0 + + SORT_KEYS = { - "score": lambda r: r["score"], + # Score sort with version-aware tiebreaker — when two rows tie on + # composite score (a common case for the SAME base model in different + # versions, e.g. MiniMax-M2.5 vs M2.7 both at the same FP8 budget), + # prefer the newer version. 
Without this, ties resolved to whatever + # order they came out of the registry, which let older releases land + # above newer ones in user-facing lists. + "score": lambda r: (r["score"], _version_key(r.get("name") or "")), "speed": lambda r: r["speed_tps"], "vram": lambda r: r["required_gb"], "params": lambda r: r["params_b"], "context": lambda r: r["context"], + # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain + # string sort is chronological. Missing dates sort last (empty < any date, + # and we sort reverse=True for newest, so "" lands at the bottom). + "newest": lambda r: r.get("release_date") or "", } -def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None): - """Rank all models against detected hardware. Returns sorted list of fit results.""" +def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None, target_context=None, fit_only=False): + """Rank all models against detected hardware. Returns sorted list of fit results. + + fit_only: when True, drop rows whose fit_level is "too_tight" (model doesn't + actually fit on the chosen budget). When False (default), every model is + shown — sorting by Param means highest-param PERIOD, even ones that won't + run, so the user can see the truth. + """ models = get_models() results = [] @@ -418,21 +563,30 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan results.sort(key=sort_fn, reverse=(sort != "vram")) return results[:limit] - # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models - filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8")) + # If user picked a native prequantized format, filter to only those models. 
+ filter_native = quant and any(quant.startswith(p) for p in ( + "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4", + "INT4", "INT8", "W4A16", "W8A8", "W8A16", + )) system_backend = (system.get("backend") or "").lower() apple_silicon = system_backend in ("mps", "metal", "apple") + # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path + # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter + # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels + # are largely unsupported there and FP8 needs out-of-tree patches. So treat + # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched. + # Unknown family (no rocminfo) is left untouched to avoid hiding models from + # a possibly-capable Instinct box on a misdetect. + gpu_family = (system.get("gpu_family") or "").lower() + consumer_amd = system_backend == "rocm" and gpu_family == "rdna" for m in models: - native_q = m.get("quantization", "") + native_q = _native_quant(m) - # MLX-quantized models need the MLX runtime (mlx_lm), which Odysseus - # doesn't generate serve commands for — only llama.cpp/Ollama (Metal) - # and vLLM/SGLang (CUDA). MLX repos ship no GGUF alternative, so they're - # unrunnable on every backend we support. Always drop them, on Apple - # Silicon too, so the Cookbook never recommends a model it can't serve. - if native_q.startswith("mlx-"): + # MLX needs the mlx_lm runtime, which Odysseus does not generate serve + # commands for. Hide it on every backend, including Metal. + if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower(): continue # On Apple Silicon the only serving engines are llama.cpp and Ollama, @@ -442,17 +596,28 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without # this the Cookbook recommends models the Mac can't run; on CUDA these # stay visible because vLLM serves safetensors directly. 
- if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")): + # + # Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the + # servable path, so a model needs a real GGUF to be recommended. + # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a + # Radeon that can't actually serve them. + if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")): continue - # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models + # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc. if filter_native: if quant == "FP8" and native_q != "FP8": continue + if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"): + continue if quant.startswith("AWQ") and not native_q.startswith("AWQ"): continue if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"): continue + if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"): + continue + if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant: + continue if search: name = m.get("name", "").lower() @@ -460,7 +625,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan if search.lower() not in name and search.lower() not in provider: continue - result = analyze_model(m, system, target_quant=quant) + result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"), target_context=target_context) if result is None: continue @@ -471,14 +636,18 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan results.append(result) - # Pick the visible SET by best fit (score) first, so it stays the same no - # matter which column the user sorts by — otherwise sorting by params would - # truncate to the N biggest models (huge ones that don't even fit) while - # sorting by vram showed the N smallest. Only AFTER choosing the set do we - # order it by the requested column. 
- results.sort(key=SORT_KEYS["score"], reverse=True) - results = results[:limit] + # Pick the visible SET by the REQUESTED column. Per-user feedback: sorting + # by Param should show the highest-param models PERIOD, not just those that + # already fit. Same for every other column. Models that don't fit are still + # in the list with their fit_level marking the constraint, so the user can + # see the truth instead of a quietly-truncated view. Score sort is unchanged + # (it's the default ranking and naturally pushes non-fits to the bottom). + if fit_only: + # Hide rows that definitely don't fit (the "too_tight" badge) — user + # explicitly asked for a Fit-only view. + results = [r for r in results if r.get("fit_level") != "too_tight"] sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"]) # vram ascending (smallest first), everything else descending (biggest first) results.sort(key=sort_fn, reverse=(sort != "vram")) + results = results[:limit] return results diff --git a/services/hwfit/models.py b/services/hwfit/models.py index 642983d..41b8ddc 100644 --- a/services/hwfit/models.py +++ b/services/hwfit/models.py @@ -5,7 +5,7 @@ import re QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"] QUANT_BPP = { - "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, + "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5, "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68, "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37, "AWQ-4bit": 0.50, "AWQ-8bit": 1.0, @@ -14,7 +14,7 @@ QUANT_BPP = { } QUANT_SPEED_MULT = { - "F16": 0.6, "BF16": 0.6, "FP8": 0.85, + "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "INT8": 0.85, "NVFP4": 1.1, "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0, "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35, "AWQ-4bit": 1.2, "AWQ-8bit": 0.85, @@ -23,16 +23,20 @@ QUANT_SPEED_MULT = { } QUANT_QUALITY_PENALTY = { - "F16": 0.0, "BF16": 0.0, "FP8": 0.0, - "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0, + "F16": 0.0, "BF16": 0.0, "FP8": 0.0, 
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "INT8", "NVFP4")


def is_prequantized(model):
    """Return True when the model entry looks like a pre-quantized build.

    A model counts as pre-quantized when any of these hold:
      * its lower-cased name or ``format`` contains "nvfp4", "awq", "gptq"
        or "mlx";
      * its name or ``format`` contains a delimited "fp8" token;
      * it has no GGUF build (``is_gguf`` / ``gguf_sources`` both falsy) and
        its name or ``format`` contains a delimited "8bit" / "int8bit" token;
      * its ``quantization`` field starts with one of PREQUANTIZED_PREFIXES.

    Args:
        model: Mapping describing the model. Missing keys and keys explicitly
            set to None are both treated as empty.

    Returns:
        bool: whether the model should bypass the GGUF quant hierarchy.
    """
    # `or ""` (rather than a .get default) also covers an explicit None value,
    # which would otherwise crash on .startswith below.
    q = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    fmt = (model.get("format") or "").lower()
    # Name and format are joined with a space, so the token patterns accept
    # whitespace on BOTH sides; otherwise a bare format of "fp8" is never
    # matched because it is preceded by the joining space.
    text = f"{name} {fmt}"
    has_gguf = bool(model.get("is_gguf") or model.get("gguf_sources"))
    return (
        "nvfp4" in text
        or re.search(r"(^|[-_/\s])fp8($|[-_/\s])", text) is not None
        or (
            not has_gguf
            and re.search(r"(^|[-_/\s])(?:int)?8bit($|[-_/\s])", text) is not None
        )
        or any(x in text for x in ("awq", "gptq", "mlx"))
        or q.startswith(PREQUANTIZED_PREFIXES)
    )
/**
 * Open the Cookbook on its Dependencies tab and, when `pkgName` is given,
 * scroll to and briefly flash the matching package row.
 *
 * Uses `window.cookbookModule.open` when available, otherwise clicks the
 * `#tool-cookbook-btn` button. The dependency rows may not be rendered yet,
 * so the lookup polls every 100 ms and gives up after 45 retries.
 *
 * @param {string} [pkgName] Package to highlight; empty means "just open".
 */
function _openCookbookDependencies(pkgName = '') {
  const cookbook = window.cookbookModule;
  if (cookbook && typeof cookbook.open === 'function') {
    cookbook.open({ tab: 'Dependencies' });
  } else {
    document.getElementById('tool-cookbook-btn')?.click();
  }

  const wanted = String(pkgName || '').toLowerCase();
  const tryHighlight = (attempt = 0) => {
    const modal = document.getElementById('cookbook-modal');
    const tab = modal?.querySelector('.cookbook-tab[data-backend="Dependencies"]');
    if (tab && !tab.classList.contains('active')) tab.click();

    const rows = [...document.querySelectorAll('#cookbook-deps-list [data-pkg-name]')];
    if (!rows.length) {
      if (attempt < 45) setTimeout(() => tryHighlight(attempt + 1), 100);
      return;
    }
    if (!wanted) return;
    const row = rows.find(r => {
      const name = (r.dataset.pkgName || '').toLowerCase();
      const pip = (r.dataset.depPip || '').toLowerCase();
      // `wanted.includes('')` is always true, so a row with an empty
      // data-pkg-name must not take part in the reverse-substring match.
      return name === wanted || pip.includes(wanted) || (name !== '' && wanted.includes(name));
    });
    if (row) {
      row.scrollIntoView({ block: 'center' });
      row.classList.add('cookbook-pkg-flash');
      setTimeout(() => row.classList.remove('cookbook-pkg-flash'), 1800);
    }
  };
  tryHighlight();
}

/**
 * Dispatch a bubbling `cookbook:edit-serve` event from the task card that
 * contains `panel`. Does nothing when the panel is not inside a
 * `.cookbook-task` element.
 *
 * @param {Element} panel Element inside the task card.
 * @param {?Object} [fields] Field overrides passed through as `detail.fields`.
 */
function _openServeEditFromDiagnosis(panel, fields = null) {
  const task = panel?.closest?.('.cookbook-task');
  if (!task) return;
  task.dispatchEvent(new CustomEvent('cookbook:edit-serve', { bubbles: true, detail: { fields } }));
}

/**
 * Request the serve editor with the llama.cpp backend preselected and the
 * GPU-related fields reset (no GPUs, TP=1, gpu_mem 0.80).
 *
 * @param {Element} panel Element inside the task card.
 */
function _openCpuServeEdit(panel) {
  _openServeEditFromDiagnosis(panel, {
    backend: 'llamacpp',
    gpus: '',
    tp: '1',
    gpu_mem: '0.80',
    _forceBackend: true,
  });
}
- message: 'Model quantization format incompatible with this vLLM version. Try a different quant (AWQ) or update vLLM.', + message: 'FP8 MoE quantization is incompatible with this tensor-parallel split.', + suggestion: 'Suggested action: retry with a lower tensor-parallel size, such as TP=4 or TP=2. If it still fails, use a non-FP8/GGUF version of the model.', fixes: [ - { label: 'Update vLLM on server', action: (panel) => { - const taskEl = panel.closest('.cookbook-task'); - const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null; - const host = task?.remoteHost || ''; - const prefix = _buildEnvPrefix(); - const pipCmd = prefix ? prefix + ' pip install -U vllm' : 'pip install -U vllm'; - const cmd = host ? _sshCmd(host, pipCmd) : pipCmd; - _launchServeTask('update-vllm', 'pip-update', cmd); - }}, + { label: 'Retry with TP=4', action: (panel) => _serveAutoRetryReplace(panel, '--tensor-parallel-size', '4') }, + { label: 'Retry with TP=2', action: (panel) => _serveAutoRetryReplace(panel, '--tensor-parallel-size', '2') }, + { label: 'Edit serve', action: (panel) => _openServeEditFromDiagnosis(panel) }, ], }, { @@ -218,6 +263,7 @@ export const ERROR_PATTERNS = [ pattern: /vllm.*command not found|No module named vllm/i, message: 'vLLM is not installed or not in PATH.', fixes: [ + { label: 'Open Dependencies', action: () => _openCookbookDependencies('vllm') }, { label: 'Check environment is set', action: (panel) => { const el = panel.querySelector('[data-field="env_type"]'); if (el) { el.focus(); el.style.borderColor = 'var(--red)'; } @@ -226,11 +272,21 @@ export const ERROR_PATTERNS = [ }, { pattern: /sglang.*command not found|No module named sglang|SGLang is not installed/i, - message: 'SGLang is not installed or not in PATH. 
Open Cookbook → Dependencies and install sglang on this server.', + message: 'SGLang is not installed or not in PATH.', fixes: [ + { label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') }, { label: 'Copy install command', action: () => _copyText('python3 -m pip install "sglang[all]"') }, ], }, + { + pattern: /No accelerator \(CUDA, XPU, HPU, NPU, MUSA, MPS\) is available|Triton is not supported on current platform/i, + message: 'SGLang needs a visible GPU/accelerator on this server.', + suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.', + fixes: [ + { label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) }, + { label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) }, + ], + }, { pattern: /flashinfer.*version.*does not match|flashinfer-cubin version/i, message: 'FlashInfer version mismatch.', @@ -241,8 +297,12 @@ export const ERROR_PATTERNS = [ }, { pattern: /torch\.cuda\.is_available\(\).*False|No CUDA runtime/i, - message: 'CUDA not available in this environment.', - fixes: [], + message: 'vLLM needs a visible CUDA/ROCm GPU.', + suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.', + fixes: [ + { label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) }, + { label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) }, + ], }, { pattern: /Engine core initialization failed/i, @@ -295,17 +355,20 @@ export const ERROR_PATTERNS = [ }, { pattern: /Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels\/layer/i, - message: 'vLLM/Transformers kernel package mismatch.', + message: 'Transformers/kernels package mismatch.', fixes: [ - { label: 'Update vLLM/Transformers/kernels', action: (panel) => { + { label: 'Repair kernel package', action: (panel) => { const taskEl = 
panel.closest('.cookbook-task'); const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null; const host = task?.remoteHost || ''; const prefix = _buildEnvPrefix(); - const pipCmd = prefix ? prefix + ' python3 -m pip install -U vllm transformers kernels' : 'python3 -m pip install -U vllm transformers kernels'; + const pipCmd = prefix + ? prefix + ' python3 -m pip install --user --break-system-packages "kernels<0.15"' + : 'python3 -m pip install --user --break-system-packages "kernels<0.15"'; const cmd = host ? _sshCmd(host, pipCmd) : pipCmd; - _launchServeTask('update-vllm-stack', 'pip-update', cmd); + _launchServeTask('repair-kernels', 'pip-update', cmd); }}, + { label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') }, ], }, { @@ -319,13 +382,24 @@ export const ERROR_PATTERNS = [ pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i, message: 'llama-cpp-python server is not installed. Run: pip install "llama-cpp-python[server]"', fixes: [ + { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') }, { label: 'Copy install command', action: () => _copyText('pip install "llama-cpp-python[server]"') }, ], }, + { + pattern: /CUDA Toolkit not found|Unable to find cudart library|missing:\s*CUDA_CUDART/i, + message: 'llama.cpp found nvcc, but the CUDA runtime library is missing.', + suggestion: 'Suggested action: relaunch with the updated runner so llama.cpp builds CPU-only, or install a complete CUDA toolkit/runtime on this server for GPU llama.cpp.', + fixes: [ + { label: 'Edit serve', action: (panel) => _openServeEditFromDiagnosis(panel) }, + { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') }, + ], + }, { pattern: /No module named ['"]?torch|No module named ['"]?diffusers|diffusers.*command not found/i, message: 'Diffusion serving needs PyTorch and diffusers. 
/**
 * Build a Markdown troubleshooting report for a diagnosed task.
 *
 * Sections are emitted in a fixed order and only when their data is present:
 * Task (when `task` is truthy), Diagnosis (always), Suggested action,
 * Launch command (from `task.payload._cmd`) and Captured output.
 *
 * @param {?Object} task Task record, or a falsy value when unknown.
 * @param {?Object} diagnosis Diagnosis whose `message` is reported.
 * @param {*} sourceText Captured output; stringified and trimmed.
 * @param {string} suggestionText Suggestion; a leading "Suggested action:" is stripped.
 * @returns {string} The report, lines joined with "\n".
 */
function _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText) {
  const report = ['## Odysseus Cookbook troubleshooting'];
  // Every section is a blank separator line, a heading, then its body lines.
  const addSection = (heading, ...bodyLines) => {
    report.push('', heading, ...bodyLines);
  };

  if (task) {
    const portSuffix = task.sshPort ? `:${task.sshPort}` : '';
    addSection(
      '### Task',
      `- ID: ${task.sessionId || task.id || 'unknown'}`,
      `- Type: ${task.type || 'unknown'}`,
      `- Status: ${task.status || 'unknown'}`,
      `- Model: ${task.payload?.repo_id || task.name || 'unknown'}`,
      `- Host: ${task.remoteHost || 'local'}${portSuffix}`,
    );
  }

  addSection('### Diagnosis', diagnosis?.message || '(none)');

  if (suggestionText) {
    addSection('### Suggested action', suggestionText.replace(/^Suggested action:\s*/i, ''));
  }

  const launchCmd = task?.payload?._cmd || '';
  if (launchCmd) {
    addSection('### Launch command', '```bash', launchCmd, '```');
  }

  if (sourceText) {
    addSection('### Captured output', '```text', String(sourceText).trim(), '```');
  }

  return report.join('\n');
}
_loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null; + const fixes = [...(diagnosis.fixes || [])]; + if (task?.type === 'serve' && task.payload?._cmd && !fixes.some(f => f.label === 'Edit serve')) { + fixes.push({ label: 'Edit serve', action: (p) => _openServeEditFromDiagnosis(p) }); + } + const suggestionText = diagnosis.suggestion || (fixes.length + ? `Suggested action: ${fixes[0].label}.` + : 'Suggested action: copy the error and adjust the serve settings.'); - const header = document.createElement('div'); - header.style.cssText = 'display:flex;align-items:center;justify-content:space-between;'; + // Simplified diagnosis card: just the error message + suggestion + fix + // button(s). Removed the fold toggle, copy button, and × dismiss — they + // made the card noisy without earning their keep. _diagCollapsed is kept + // as a stub so callers don't have to change. + panel._diagCollapsed = false; + const body = document.createElement('div'); + body.className = 'cookbook-diag-body'; const msg = document.createElement('div'); msg.className = 'cookbook-diag-message'; msg.textContent = diagnosis.message; - header.appendChild(msg); + body.appendChild(msg); + const suggestion = document.createElement('div'); + suggestion.className = 'cookbook-diag-suggestion'; + suggestion.textContent = suggestionText; + body.appendChild(suggestion); + diag.appendChild(body); - const dismiss = document.createElement('button'); - dismiss.className = 'close-btn'; - dismiss.style.cssText = 'width:16px;height:16px;font-size:9px;flex-shrink:0;'; - dismiss.textContent = '\u2715'; - dismiss.addEventListener('click', () => { panel._diagDismissed = diagnosis.message; _clearDiagnosis(panel); }); - header.appendChild(dismiss); + const runFix = async (fix, button, busyLabel = fix.label, onStart = null, onDone = null) => { + if (!fix || !button || button.dataset.busy) return; + button.dataset.busy = '1'; + const _orig = button.textContent; + const wp = 
spinnerModule.createWhirlpool(12); + wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;'; + button.textContent = ''; + button.appendChild(wp.element); + const _lbl = document.createElement('span'); + _lbl.textContent = busyLabel; + _lbl.style.verticalAlign = 'middle'; + button.appendChild(_lbl); + try { + if (typeof onStart === 'function') onStart(); + await fix.action(panel, sourceText); + } catch (err) { + console.error('[cookbook] diagnosis fix failed', err); + } finally { + if (button.isConnected) { + try { wp.destroy(); } catch {} + button.textContent = _orig; + delete button.dataset.busy; + } + if (typeof onDone === 'function') onDone(); + } + }; - diag.appendChild(header); - - if (diagnosis.fixes && diagnosis.fixes.length) { + if (fixes.length) { const row = document.createElement('div'); row.className = 'cookbook-diag-fixes'; - for (const fix of diagnosis.fixes) { - const btn = document.createElement('button'); - btn.className = 'cookbook-btn cookbook-diag-btn'; - btn.textContent = fix.label; - btn.addEventListener('click', async () => { - if (btn.dataset.busy) return; - btn.dataset.busy = '1'; - // Spinner feedback while the fix runs (kill + relaunch takes a moment). - const _orig = btn.textContent; - const wp = spinnerModule.createWhirlpool(12); - wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;'; - btn.textContent = ''; - btn.appendChild(wp.element); - const _lbl = document.createElement('span'); - _lbl.textContent = _orig; - _lbl.style.verticalAlign = 'middle'; - btn.appendChild(_lbl); - try { - await fix.action(panel, sourceText); - } catch (e) { - console.error('[cookbook] diagnosis fix failed', e); - } finally { - // Retries animate the whole card away (button goes with it). For fixes - // that leave the card in place, restore the label. 
- if (btn.isConnected) { try { wp.destroy(); } catch {} btn.textContent = _orig; delete btn.dataset.busy; } - } - }); - row.appendChild(btn); + + if (fixes.length <= 3) { + for (const fix of fixes) { + const btn = document.createElement('button'); + btn.className = 'cookbook-btn cookbook-diag-btn'; + btn.type = 'button'; + btn.textContent = fix.label; + btn.addEventListener('click', (e) => { + e.stopPropagation(); + runFix(fix, btn); + }); + row.appendChild(btn); + } + body.appendChild(row); + return; } - diag.appendChild(row); + + const wrap = document.createElement('div'); + wrap.className = 'cookbook-diag-actions'; + + const trigger = document.createElement('button'); + trigger.className = 'cookbook-btn cookbook-diag-action-trigger'; + trigger.type = 'button'; + trigger.textContent = 'Actions'; + trigger.appendChild(document.createTextNode(' ▾')); + wrap.appendChild(trigger); + + const menu = document.createElement('div'); + menu.className = 'dropdown cookbook-diag-menu hidden'; + for (const fix of fixes) { + const item = document.createElement('button'); + item.type = 'button'; + item.textContent = fix.label; + item.addEventListener('click', async (e) => { + e.stopPropagation(); + if (item.dataset.busy || trigger.dataset.busy) return; + item.dataset.busy = '1'; + await runFix(fix, trigger, fix.label, () => menu.classList.add('hidden'), () => delete item.dataset.busy); + }); + menu.appendChild(item); + } + wrap.appendChild(menu); + trigger.addEventListener('click', (e) => { + e.stopPropagation(); + if (trigger.dataset.busy) return; + document.querySelectorAll('.cookbook-diag-menu').forEach(m => { + if (m !== menu) m.classList.add('hidden'); + }); + menu.classList.toggle('hidden'); + }); + row.appendChild(wrap); + body.appendChild(row); } } diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js index e6445f8..73ec109 100644 --- a/static/js/cookbook-hwfit.js +++ b/static/js/cookbook-hwfit.js @@ -153,14 +153,31 @@ export function 
_renderGpuToggles(system) { } const validCounts = _validTpCounts(poolSize); const maxGpu = validCounts.length ? validCounts[validCounts.length - 1] : 0; + // Commit the data layer to maxGpu on initial render so it matches the + // visual highlight. Before this, _activeCount stayed undefined → no + // gpu_count param sent → backend's fallback could rank against RAM on + // mixed-resource boxes ("tightest" sorted by RAM instead of GPU). + if (container._activeCount === undefined && validCounts.length) { + container._activeCount = maxGpu; + } html += ''; const hasExplicitCount = typeof container._activeCount === 'number'; for (const n of validCounts) { const text = n === 1 ? 'GPU' : n + ' GPU'; - const isActive = hasExplicitCount ? (n === container._activeCount) : (container._activeCount === undefined && n === maxGpu); + const isActive = hasExplicitCount && n === container._activeCount; html += ``; } + // Also mark the RAM button active when the user explicitly chose RAM (0) + // — the loop above only handles GPU buttons. + if (container._activeCount === 0) { + const ramBtn = container.querySelector('.hwfit-gpu-btn[data-count="0"]'); + // (we just set innerHTML so we re-mark below after assignment) + } container.innerHTML = html; + if (container._activeCount === 0) { + const ramBtn = container.querySelector('.hwfit-gpu-btn[data-count="0"]'); + if (ramBtn) ramBtn.classList.add('active'); + } // Pool dropdown: switch pools, reset the count to the new pool's max, rebuild. const sel = container.querySelector('#hwfit-gpu-group'); @@ -188,9 +205,12 @@ export function _renderGpuToggles(system) { } else { btn.classList.add('active'); container._activeCount = count; - // Auto-set quant based on hardware selection + // Auto-suggest a quant based on hardware selection — but ONLY when the + // user has already picked a specific quant. 
// reload paints instantly, then we refresh in the background and swap.
const _SCAN_CACHE_KEY = 'hwfit_scan_cache_v1';
const _MANUAL_HW_KEY = 'hwfit_manual_hardware_v1';
const _CTX_KEY = 'hwfit_target_context_v1';
const _CTX_PRESETS = [8192, 16384, 32768, 50000, 131072, 0]; // 0 = model max
const _CTX_DEFAULT_IDX = 3; // slider position used when nothing valid is saved
const _SCAN_CACHE_MAX = 12; // keep the newest N signatures
const _SCAN_CACHE_TTL = 6 * 3600 * 1000; // 6 h — hardware rarely changes

// Ctx slider helpers. The slider picks an INDEX into _CTX_PRESETS;
// _ctxValue() resolves it to a token count (0 = "Max"). The label next to the
// slider re-renders to "8k" / "16k" / "32k" / … / "Max".

/**
 * Format a context length as a short label.
 *
 * Multiples of 1024 are shown in binary thousands (32768 -> "32k",
 * 131072 -> "128k") because dividing by 1000 and rounding would show "33k"
 * and "131k". Other values >= 1000 are rounded to decimal thousands
 * (50000 -> "50k"); smaller values are shown verbatim.
 *
 * @param {number|string} value Token count; 0 or non-numeric means model max.
 * @returns {string} The label, or "Max" for 0 / non-numeric input.
 */
function _ctxLabel(value) {
  const n = Number(value) || 0;
  if (!n) return 'Max';
  if (n < 1000) return String(n);
  const thousands = n % 1024 === 0 ? n / 1024 : Math.round(n / 1000);
  return thousands + 'k';
}

/**
 * Resolve the `#hwfit-context` slider position to a token count.
 *
 * @returns {number} The preset at the clamped slider index; 0 means "Max".
 *   A missing slider falls back to the default index.
 */
function _ctxValue() {
  const slider = document.getElementById('hwfit-context');
  const raw = Number(slider?.value ?? _CTX_DEFAULT_IDX) || 0;
  const idx = Math.max(0, Math.min(_CTX_PRESETS.length - 1, raw));
  return _CTX_PRESETS[idx] || 0;
}

/**
 * Restore the slider position from localStorage and refresh its label.
 * An absent, unreadable or unrecognized saved value selects the default index.
 */
function _syncCtxControl() {
  const slider = document.getElementById('hwfit-context');
  const label = document.getElementById('hwfit-context-label');
  if (!slider) return;
  let saved = null;
  try {
    saved = localStorage.getItem(_CTX_KEY);
  } catch {
    // Storage can be unavailable (e.g. blocked by the browser); use the default.
  }
  const savedIdx = saved == null ? _CTX_DEFAULT_IDX : _CTX_PRESETS.indexOf(Number(saved));
  slider.value = String(savedIdx >= 0 ? savedIdx : _CTX_DEFAULT_IDX);
  if (label) label.textContent = _ctxLabel(_ctxValue());
}