* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic) Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests first (UI changes follow separately). Backend files only: no static/js here. services/hwfit/fit.py, services/hwfit/hardware.py: - Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and CUDA are left untouched. - More accurate speed estimates across more GPUs (adds RDNA bandwidth data). - Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it. tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate realism, gfx RDNA-vs-CDNA classification. Rebased onto current main (analyze_model gained a scoring_use_case param there; kept it). Vision detection intentionally NOT added here — main already ships a "Vision" type filter + multimodal use-case handling; duplicating it was dropped. Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites = 28 passed; full suite 0 new failures vs main. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards against vacuity by asserting such repos exist in the catalog. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
196 lines
8.8 KiB
Python
196 lines
8.8 KiB
Python
"""AMD ROCm support for Cookbook hardware-fit.
|
|
|
|
Consumer AMD Radeon (RDNA: gfx10/11/12) can realistically only serve GGUF via
|
|
llama.cpp — vLLM/SGLang on ROCm are validated for datacenter Instinct (CDNA,
|
|
gfx9xx), not consumer cards, where AWQ kernels are largely unsupported and FP8
|
|
needs out-of-tree patches. These tests lock in that consumer RDNA is treated
|
|
like Apple Silicon (GGUF-only recommendations) while datacenter CDNA and
|
|
unknown-family AMD are left untouched, and that CUDA is unchanged.
|
|
"""
|
|
|
|
from services.hwfit import hardware
|
|
from services.hwfit.fit import rank_models
|
|
from services.hwfit.models import get_models
|
|
|
|
|
|
def _rocm_system(family="rdna", ram_gb=32.0, vram_gb=16.0):
|
|
return {
|
|
"has_gpu": True,
|
|
"backend": "rocm",
|
|
"gpu_name": "AMD Radeon RX 9060 XT" if family == "rdna" else "AMD Instinct MI300X",
|
|
"gpu_vram_gb": vram_gb,
|
|
"gpu_count": 1,
|
|
"available_ram_gb": ram_gb * 0.7,
|
|
"total_ram_gb": ram_gb,
|
|
"gpu_arch": "gfx1200" if family == "rdna" else "gfx942",
|
|
"gpu_family": family,
|
|
}
|
|
|
|
|
|
def _cuda_system():
|
|
return {
|
|
"has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
|
|
"gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
|
|
}
|
|
|
|
|
|
def test_only_gguf_models_recommended_on_consumer_rdna():
    """Every model ranked for a consumer RDNA card must be GGUF-servable.

    llama.cpp (GGUF) is the servable path on consumer Radeon, so a ranked
    model passes only if its catalog entry is flagged ``is_gguf`` or lists
    ``gguf_sources`` — no vLLM-only AWQ/GPTQ/FP8.
    """
    by_name = {entry["name"]: entry for entry in get_models()}
    ranked = rank_models(_rocm_system(family="rdna"), limit=900)

    unservable = []
    for result in ranked:
        # Ranked names missing from the catalog count as unservable.
        entry = by_name.get(result["name"], {})
        if entry.get("is_gguf") or entry.get("gguf_sources"):
            continue
        unservable.append(result["name"])

    assert unservable == [], f"{len(unservable)} non-GGUF models on RDNA, e.g. {unservable[:3]}"
|
|
|
|
|
|
def test_safetensors_models_still_recommended_on_cdna():
    """The GGUF-only rule applies to consumer RDNA only.

    Datacenter Instinct (CDNA) runs vLLM/SGLang on ROCm fine, so a non-GGUF
    repo must still show up in the ranking for a CDNA system.
    """
    recommended = set()
    for result in rank_models(_rocm_system(family="cdna"), limit=900):
        recommended.add(result["name"])

    assert "microsoft/Phi-mini-MoE-instruct" in recommended
|
|
|
|
|
|
def test_unknown_amd_family_not_filtered():
    """An AMD GPU of family 'unknown' keeps its non-GGUF recommendations.

    When rocminfo is unavailable the family is 'unknown'; a possibly-capable
    Instinct box must not lose models because of a misdetect.
    """
    recommended = set()
    for result in rank_models(_rocm_system(family="unknown"), limit=900):
        recommended.add(result["name"])

    assert "microsoft/Phi-mini-MoE-instruct" in recommended
|
|
|
|
|
|
def test_safetensors_models_still_recommended_on_cuda():
    """Regression guard: CUDA systems must not inherit the GGUF-only rule."""
    recommended = set()
    for result in rank_models(_cuda_system(), limit=900):
        recommended.add(result["name"])

    assert "microsoft/Phi-mini-MoE-instruct" in recommended
|
|
|
|
|
|
def test_classify_amd_gfx_rdna_vs_cdna():
    """classify_amd_gfx must sort gfx targets into the right GPU family.

    Consumer RDNA (gfx10/11/12), datacenter CDNA (gfx9xx Instinct), older GCN,
    and unparseable inputs ('unknown') are all covered. For every recognised
    target the first element of the returned pair must echo the input.
    """
    expectations = [
        ("gfx1200", "rdna"),   # RX 9060 XT (RDNA4)
        ("gfx1201", "rdna"),   # RX 9070 (RDNA4)
        ("gfx1100", "rdna"),   # RX 7900 (RDNA3)
        ("gfx1030", "rdna"),   # RX 6800 (RDNA2)
        ("gfx942", "cdna"),    # MI300 (CDNA3)
        ("gfx950", "cdna"),    # MI350 (CDNA4)
        ("gfx90a", "cdna"),    # MI200 (CDNA2)
        ("gfx908", "cdna"),    # MI100 (CDNA1)
        ("gfx906", "gcn"),     # Radeon VII / MI50 (GCN5/Vega)
        ("", "unknown"),
        ("gfx", "unknown"),
    ]

    for target, want in expectations:
        returned_target, got = hardware.classify_amd_gfx(target)
        assert got == want, f"{target} -> {got}, expected {want}"
        if want == "unknown":
            # The echoed target is only checked for recognised inputs.
            continue
        assert returned_target == target
|
|
|
|
|
|
def test_detect_amd_reports_family(monkeypatch):
    """_detect_amd must report gpu_family and gpu_arch taken from rocminfo.

    fit/serve branch on consumer-RDNA vs datacenter-CDNA, so the family has to
    surface in the detection result. The canned rocminfo output lists the CPU
    agent first and the GPU's gfx target second. Everything is driven through
    the remote-read path, so no real sysfs is touched.
    """
    rocminfo_text = " Name: AMD Ryzen 7 3700X\n Name: gfx1200\n Marketing Name: AMD Radeon RX 9060 XT\n"

    # Canned file contents for `cat`, matched by path suffix in this order.
    canned_files = (
        ("/vendor", "0x1002"),
        ("/mem_info_vram_total", str(16 * 1024**3)),
        ("/product_name", "AMD Radeon RX 9060 XT"),
    )

    def fake_run(cmd):
        # Stand-in for hardware._run: answers rocminfo, `ls` and `cat` only.
        if not cmd:
            return None
        program = cmd[0]
        if "rocminfo" in program:
            return rocminfo_text
        if program == "ls":
            return "card1\ncard1-DP-1\nrenderD128"
        if program == "cat":
            target = cmd[1]
            return next(
                (text for suffix, text in canned_files if target.endswith(suffix)),
                None,
            )
        return None

    # _remote_host truthy routes _read/_list_drm_cards through _run (no real sysfs).
    monkeypatch.setattr(hardware, "_remote_host", "fake-host")
    monkeypatch.setattr(hardware, "_run", fake_run)

    detected = hardware._detect_amd()

    assert detected is not None
    assert detected["backend"] == "rocm"
    assert detected["gpu_family"] == "rdna"
    assert detected["gpu_arch"] == "gfx1200"
|
|
|
|
|
|
def test_consumer_amd_cards_have_real_bandwidth():
    """Consumer AMD cards must resolve to a real bandwidth figure.

    Speed estimates should use real VRAM bandwidth rather than the crude rocm
    FALLBACK_K constant. The RX 9060 XT used to be missing from the table
    entirely, so its estimates fell back to the constant and were off.
    """
    from services.hwfit.fit import _lookup_bandwidth

    # (card name, minimum acceptable bandwidth in GB/s)
    minimums = (
        ("AMD Radeon RX 9060 XT", 300),
        ("AMD Radeon RX 9070 XT", 600),
        ("AMD Radeon RX 7900 XTX", 900),
    )

    for card, floor in minimums:
        bandwidth = _lookup_bandwidth(card)
        assert bandwidth and bandwidth >= floor, f"{card}: {bandwidth} GB/s (expected >= {floor})"
|
|
|
|
|
|
def test_9060xt_speed_estimate_is_realistic():
    """Calibration guard for the RX 9060 XT speed estimate.

    A small MoE held fully on the GPU at Q4 should land in a believable range,
    not the absurd numbers the missing-bandwidth fallback produced.
    Measured reference: DeepSeek-Coder-V2-Lite Q4 ~60-86 t/s on this card.
    """
    from services.hwfit.fit import _estimate_speed

    moe_model = {
        "name": "DeepSeek-Coder-V2-Lite-Instruct",
        "parameter_count": "16B",
        "is_moe": True,
        "active_parameters": 2_400_000_000,
    }
    system = {
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT",
        "gpu_vram_gb": 15.9,
    }

    tps = _estimate_speed(moe_model, "Q4_K_M", "gpu", system)

    assert 40 <= tps <= 130, f"unrealistic estimate: {tps} t/s"
|
|
|
|
|
|
def test_offload_is_slower_than_full_gpu():
    """Speed estimates must fall as more of the model is offloaded to CPU.

    Fully-on-GPU must beat a 20% offload, which must beat a 60% offload —
    the blend model, not a flat halving.
    """
    from services.hwfit.fit import _estimate_speed

    moe_model = {
        "name": "X",
        "parameter_count": "35B",
        "is_moe": True,
        "active_parameters": 3_000_000_000,
    }
    system = {
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT",
        "gpu_vram_gb": 15.9,
    }
    quant = "Q4_K_M"

    full = _estimate_speed(moe_model, quant, "gpu", system)
    light = _estimate_speed(moe_model, quant, "cpu_offload", system, offload_frac=0.2)
    heavy = _estimate_speed(moe_model, quant, "cpu_offload", system, offload_frac=0.6)

    assert full > light > heavy, (full, light, heavy)
|
|
|
|
|
|
def test_sort_by_newest_orders_by_release_date():
    """sort='newest' must list results newest-first with undated models last.

    Two properties are checked on the top 50 results: the release dates that
    are present are in descending order, and no dated entry follows an
    undated one.
    """
    system = {
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT",
        "gpu_vram_gb": 15.9,
        "gpu_family": "rdna",
        "gpu_count": 1,
        "available_ram_gb": 22.0,
        "total_ram_gb": 31.0,
    }
    results = rank_models(system, sort="newest", limit=50)

    # Dates that are present must be in descending order.
    release_dates = []
    for entry in results:
        stamp = entry.get("release_date")
        if stamp:
            release_dates.append(stamp)
    assert release_dates == sorted(release_dates, reverse=True), "release dates not descending"

    # Undated entries must trail all dated ones: the dated/undated flags must
    # be all True followed by all False, i.e. already sorted descending.
    has_date = [bool(entry.get("release_date")) for entry in results]
    assert has_date == sorted(has_date, reverse=True), "a dated model appeared after an undated one"
|
|
|
|
|
|
def test_no_vendor_specific_formats_on_consumer_rdna():
    """No NVFP4 / MLX / FP8 / AWQ / GPTQ repo may be ranked for consumer RDNA.

    Consumer Radeon can't run NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/
    GPTQ builds, even though such repos DO exist in the catalog. This checks
    the format filter directly by repo name (not just via is_gguf).
    """
    import re

    bad = re.compile(r"NVFP4|FP8|FP4|-MLX-|\bMLX\b|AWQ|GPTQ", re.IGNORECASE)

    offenders = []
    for result in rank_models(_rocm_system(family="rdna"), limit=900):
        repo = result["name"]
        if bad.search(repo):
            offenders.append(repo)
    assert offenders == [], f"non-runnable formats recommended on RDNA: {offenders[:5]}"

    # Guard against a vacuous test: such formats must actually be in the catalog.
    assert any(bad.search(entry["name"]) for entry in get_models()), (
        "catalog has no NVFP4/MLX/FP8 repos — test would be vacuous"
    )
|