diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py
index 9a45b53..09aea29 100644
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -576,6 +576,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
     system_backend = (system.get("backend") or "").lower()
     apple_silicon = system_backend in ("mps", "metal", "apple")
     rocm = system_backend == "rocm"
+    # Normalise like the backend above: callers may pass "Windows" or None.
+    is_windows = (system.get("platform") or "").lower() == "windows"
 
     # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
     # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
@@ -615,7 +617,11 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
         # servable path, so a model needs a real GGUF to be recommended.
         # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
         # Radeon that can't actually serve them.
-        if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
+        #
+        # Windows is the same: Odysseus only supports llama.cpp on Windows,
+        # which requires GGUF. vLLM/SGLang are explicitly blocked, so AWQ/GPTQ
+        # models without a GGUF source are unservable there.
+        if (apple_silicon or consumer_amd or is_windows) and not (m.get("is_gguf") or m.get("gguf_sources")):
             continue
 
         # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py
index 9815327..f961b70 100644
--- a/services/hwfit/hardware.py
+++ b/services/hwfit/hardware.py
@@ -539,6 +539,7 @@ def _detect_windows():
         "backend": d.get("gpu_backend", "cpu_x86"),
         "homogeneous": True,
         "gpu_error": None,
+        "platform": "windows",
     }
     # PowerShell only reports aggregate GPU info, not per-card detail, so we
     # can't tell a mixed box from a uniform one here — assume one homogeneous
diff --git a/tests/test_hwfit_windows.py b/tests/test_hwfit_windows.py
new file mode 100644
index 0000000..7a96fb6
--- /dev/null
+++ b/tests/test_hwfit_windows.py
@@ -0,0 +1,74 @@
+"""Windows support for Cookbook hardware-fit.
+
+Odysseus only supports llama.cpp on Windows (vLLM/SGLang are explicitly
+blocked). llama.cpp requires GGUF, so non-GGUF models — including AWQ/GPTQ/
+FP8 safetensors repos — must be filtered out on Windows so the Cookbook does
+not recommend models the user cannot actually serve.
+"""
+
+from services.hwfit.fit import rank_models
+from services.hwfit.models import get_models
+
+
+def _windows_system(ram_gb=32.0, vram_gb=16.0):
+    return {
+        "has_gpu": True,
+        "backend": "cuda",
+        "gpu_name": "NVIDIA RTX 4060",
+        "gpu_vram_gb": vram_gb,
+        "gpu_count": 1,
+        "available_ram_gb": ram_gb * 0.7,
+        "total_ram_gb": ram_gb,
+        "platform": "windows",
+    }
+
+
+def _cuda_system():
+    return {
+        "has_gpu": True,
+        "backend": "cuda",
+        "gpu_name": "NVIDIA RTX 4090",
+        "gpu_vram_gb": 24.0,
+        "gpu_count": 1,
+        "available_ram_gb": 32.0,
+        "total_ram_gb": 64.0,
+    }
+
+
+def test_only_gguf_models_recommended_on_windows():
+    """llama.cpp (GGUF) is the only servable path on Windows, so every model
+    recommended there must ship a real GGUF — no vLLM-only AWQ/GPTQ/FP8."""
+    catalog = {m["name"]: m for m in get_models()}
+    unservable = [
+        r["name"] for r in rank_models(_windows_system(), limit=900)
+        if not (catalog.get(r["name"], {}).get("is_gguf")
+                or catalog.get(r["name"], {}).get("gguf_sources"))
+    ]
+    assert unservable == [], f"{len(unservable)} non-GGUF models on Windows, e.g. {unservable[:3]}"
+
+
+def test_safetensors_models_still_recommended_on_cuda():
+    """Regression guard: the GGUF-only rule must not leak onto CUDA."""
+    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
+    assert "microsoft/Phi-mini-MoE-instruct" in names
+
+
+def test_awq_model_hidden_on_windows():
+    """The user's reported issue: Qwen2.5-3B-Instruct-AWQ is AWQ-only and must
+    not be recommended on Windows where it cannot be served."""
+    names = {r["name"] for r in rank_models(_windows_system(), limit=900)}
+    assert "Qwen/Qwen2.5-3B-Instruct-AWQ" not in names
+
+
+def test_awq_model_visible_on_cuda():
+    """The same AWQ model should still be visible on CUDA where vLLM can
+    serve it."""
+    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
+    assert "Qwen/Qwen2.5-3B-Instruct-AWQ" in names
+
+
+def test_gguf_alternate_still_recommended_on_windows():
+    """Qwen2.5-3B-Instruct (the base model) has a GGUF source, so it should
+    still appear on Windows even though the AWQ variant is hidden."""
+    names = {r["name"] for r in rank_models(_windows_system(), limit=900)}
+    assert "Qwen/Qwen2.5-3B-Instruct" in names