fix(hwfit): filter non-GGUF models on Windows (#2530)
Odysseus only supports llama.cpp on Windows (vLLM/SGLang are explicitly blocked). llama.cpp requires GGUF, so AWQ/GPTQ/FP8 safetensors models without a GGUF alternate should not be recommended in the Cookbook on Windows hosts.

Changes:
- hardware.py: add 'platform': 'windows' to _detect_windows() so downstream logic can identify Windows hosts.
- fit.py: include is_windows in the existing GGUF-only filter alongside apple_silicon and consumer_amd.
- tests: add test_hwfit_windows.py with regression tests.

Fixes #122, #614 (root cause: unservable models were being recommended).
This commit is contained in:
@@ -576,6 +576,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
|||||||
system_backend = (system.get("backend") or "").lower()
|
system_backend = (system.get("backend") or "").lower()
|
||||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||||
rocm = system_backend == "rocm"
|
rocm = system_backend == "rocm"
|
||||||
|
is_windows = system.get("platform") == "windows"
|
||||||
|
|
||||||
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
|
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
|
||||||
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
|
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
|
||||||
@@ -615,7 +616,11 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
|||||||
# servable path, so a model needs a real GGUF to be recommended.
|
# servable path, so a model needs a real GGUF to be recommended.
|
||||||
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
|
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
|
||||||
# Radeon that can't actually serve them.
|
# Radeon that can't actually serve them.
|
||||||
if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
|
#
|
||||||
|
# Windows is the same: Odysseus only supports llama.cpp on Windows,
|
||||||
|
# which requires GGUF. vLLM/SGLang are explicitly blocked, so AWQ/GPTQ
|
||||||
|
# models without a GGUF source are unservable there.
|
||||||
|
if (apple_silicon or consumer_amd or is_windows) and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
||||||
|
|||||||
@@ -539,6 +539,7 @@ def _detect_windows():
|
|||||||
"backend": d.get("gpu_backend", "cpu_x86"),
|
"backend": d.get("gpu_backend", "cpu_x86"),
|
||||||
"homogeneous": True,
|
"homogeneous": True,
|
||||||
"gpu_error": None,
|
"gpu_error": None,
|
||||||
|
"platform": "windows",
|
||||||
}
|
}
|
||||||
# PowerShell only reports aggregate GPU info, not per-card detail, so we
|
# PowerShell only reports aggregate GPU info, not per-card detail, so we
|
||||||
# can't tell a mixed box from a uniform one here — assume one homogeneous
|
# can't tell a mixed box from a uniform one here — assume one homogeneous
|
||||||
|
|||||||
74
tests/test_hwfit_windows.py
Normal file
74
tests/test_hwfit_windows.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
"""Windows support for Cookbook hardware-fit.
|
||||||
|
|
||||||
|
Odysseus only supports llama.cpp on Windows (vLLM/SGLang are explicitly
|
||||||
|
blocked). llama.cpp requires GGUF, so non-GGUF models — including AWQ/GPTQ/
|
||||||
|
FP8 safetensors repos — must be filtered out on Windows so the Cookbook does
|
||||||
|
not recommend models the user cannot actually serve.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from services.hwfit.fit import rank_models
|
||||||
|
from services.hwfit.models import get_models
|
||||||
|
|
||||||
|
|
||||||
|
def _windows_system(ram_gb=32.0, vram_gb=16.0):
|
||||||
|
return {
|
||||||
|
"has_gpu": True,
|
||||||
|
"backend": "cuda",
|
||||||
|
"gpu_name": "NVIDIA RTX 4060",
|
||||||
|
"gpu_vram_gb": vram_gb,
|
||||||
|
"gpu_count": 1,
|
||||||
|
"available_ram_gb": ram_gb * 0.7,
|
||||||
|
"total_ram_gb": ram_gb,
|
||||||
|
"platform": "windows",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_system():
|
||||||
|
return {
|
||||||
|
"has_gpu": True,
|
||||||
|
"backend": "cuda",
|
||||||
|
"gpu_name": "NVIDIA RTX 4090",
|
||||||
|
"gpu_vram_gb": 24.0,
|
||||||
|
"gpu_count": 1,
|
||||||
|
"available_ram_gb": 32.0,
|
||||||
|
"total_ram_gb": 64.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_only_gguf_models_recommended_on_windows():
    """llama.cpp (GGUF) is the only servable path on Windows, so every model
    recommended there must ship a real GGUF — no vLLM-only AWQ/GPTQ/FP8."""
    catalog = {m["name"]: m for m in get_models()}
    unservable = []
    for r in rank_models(_windows_system(), limit=900):
        # A ranked name missing from the catalog counts as unservable too,
        # because the empty fallback has neither GGUF marker.
        entry = catalog.get(r["name"], {})
        if not (entry.get("is_gguf") or entry.get("gguf_sources")):
            unservable.append(r["name"])
    assert unservable == [], f"{len(unservable)} non-GGUF models on Windows, e.g. {unservable[:3]}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_safetensors_models_still_recommended_on_cuda():
    """Regression guard: the GGUF-only rule must not leak onto CUDA."""
    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names
|
||||||
|
|
||||||
|
|
||||||
|
def test_awq_model_hidden_on_windows():
    """The user's reported issue: Qwen2.5-3B-Instruct-AWQ is AWQ-only and must
    not be recommended on Windows where it cannot be served."""
    names = {r["name"] for r in rank_models(_windows_system(), limit=900)}
    assert "Qwen/Qwen2.5-3B-Instruct-AWQ" not in names
|
||||||
|
|
||||||
|
|
||||||
|
def test_awq_model_visible_on_cuda():
    """The same AWQ model should still be visible on CUDA where vLLM can
    serve it."""
    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
    assert "Qwen/Qwen2.5-3B-Instruct-AWQ" in names
|
||||||
|
|
||||||
|
|
||||||
|
def test_gguf_alternate_still_recommended_on_windows():
    """Qwen2.5-3B-Instruct (the base model) has a GGUF source, so it should
    still appear on Windows even though the AWQ variant is hidden."""
    names = {r["name"] for r in rank_models(_windows_system(), limit=900)}
    assert "Qwen/Qwen2.5-3B-Instruct" in names
|
||||||
Reference in New Issue
Block a user