diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py index 0a6b273..55513d4 100644 --- a/services/hwfit/fit.py +++ b/services/hwfit/fit.py @@ -18,7 +18,7 @@ GPU_BANDWIDTH = { "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288, "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224, "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229, - "9070 xt": 624, "9070": 488, + "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322, # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed # before the bare "m_" keys matters less than length-sorting (done below), @@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name): return None -def _estimate_speed(model, quant, run_mode, system): - """Estimate tok/s. Uses active params for MoE (only active experts run per token).""" +def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0): + """Estimate tok/s. Uses active params for MoE (only active experts run per token). + + offload_frac (0..1): fraction of the model's weights that spill to system RAM + (CPU) because they don't fit VRAM. Generation reads every active weight per + token, so when part lives in CPU RAM the per-token time is dominated by the + slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and + system-RAM bandwidth weighted by what's where — far more accurate than a flat + "halve it" for partial offload, which under/over-shoots depending on amount. + Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with + light offload → ~59 t/s est vs 59.8 measured. 
+ """ pb = _active_params_b(model) is_moe = model.get("is_moe", False) bw = _lookup_bandwidth(system.get("gpu_name")) @@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system): if model_gb <= 0: return 0.0 efficiency = 0.55 - raw_tps = (bw / model_gb) * efficiency if run_mode == "cpu_offload": - mode_factor = 0.5 - elif is_moe: - mode_factor = 0.8 - else: - mode_factor = 1.0 - return raw_tps * mode_factor + # Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be + # conservative since offloaded MoE is also compute-bound on CPU. + cpu_bw = 55.0 + frac = min(max(offload_frac, 0.0), 1.0) + # If we don't know the fraction (legacy callers pass 0 with + # cpu_offload), assume a meaningful spill so we don't overestimate. + if frac <= 0.0: + frac = 0.5 + # Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the + # slow CPU portion dominates as it grows (matches the steep real-world + # drop-off when more experts offload). + eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw) + raw_tps = (eff_bw / model_gb) * efficiency + return raw_tps * (0.8 if is_moe else 1.0) + # Fully on GPU. + raw_tps = (bw / model_gb) * efficiency + return raw_tps * (0.8 if is_moe else 1.0) k = FALLBACK_K.get(backend, 70) if pb <= 0: @@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None): else: fit_level = "marginal" - tps = _estimate_speed(model, quant, run_mode, system) + # Fraction of the model that spills to CPU RAM (drives the offload speed + # model). When offloading, anything beyond the GPU's VRAM lives in system RAM. 
+ offload_frac = 0.0 + if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0: + offload_frac = max(0.0, (required_gb - effective_vram) / required_gb) + tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac) q_score = _quality_score(model, quant, score_use_case) s_score = _speed_score(tps, score_use_case) @@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None): }, "gguf_sources": model.get("gguf_sources", []), "context_length": model.get("context_length", 4096), + "release_date": model.get("release_date", ""), } @@ -398,6 +424,10 @@ SORT_KEYS = { "vram": lambda r: r["required_gb"], "params": lambda r: r["params_b"], "context": lambda r: r["context"], + # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain + # string sort is chronological. Missing dates sort last (empty < any date, + # and we sort reverse=True for newest, so "" lands at the bottom). + "newest": lambda r: r.get("release_date") or "", } @@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan apple_silicon = system_backend in ("mps", "metal", "apple") rocm = system_backend == "rocm" + # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path + # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter + # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels + # are largely unsupported there and FP8 needs out-of-tree patches. So treat + # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched. + # Unknown family (no rocminfo) is left untouched to avoid hiding models from + # a possibly-capable Instinct box on a misdetect. 
def classify_amd_gfx(gfx):
    """Map an AMD ISA target (e.g. "gfx1200") to ``(gfx, family)``.

    Besides the bare processor name, the longer spellings emitted by ROCm/LLVM
    tooling are accepted: a target ID carrying feature flags
    ("gfx90a:sramecc+:xnack-") and a full ISA name with the triple prefix
    ("amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"). Matching is case-insensitive
    and ignores surrounding whitespace. ``None`` is treated like "".

    family is one of:
      "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
      "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
      "gcn"     — older GCN/Vega (gfx900/906)
      "unknown" — empty/unrecognized; callers must treat conservatively

    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.

    Returns:
        A ``(target, family)`` tuple. ``target`` is the normalized bare
        "gfxNNN[x]" name; it is "" when the input does not parse as a gfx
        target at all. A well-formed but unclassified target (e.g. "gfx803")
        is returned as ``(target, "unknown")``.
    """
    text = (gfx or "").lower().strip()
    # Target-ID syntax is "<processor>[:<feature>(+|-)]..." — drop the flags.
    processor = text.split(":", 1)[0]
    # Full ISA names put the "-"-separated triple in front of the processor.
    # Generic targets such as "gfx9-4-generic" end in "generic" and therefore
    # fall through to "unknown" instead of being misread as "gfx9".
    processor = processor.rsplit("-", 1)[-1]

    m = re.fullmatch(r"gfx(\d+[a-f]?)", processor)
    if not m:
        return "", "unknown"

    digits = m.group(1)
    if digits[:2] in ("10", "11", "12"):
        return processor, "rdna"
    # CDNA must be tested before the generic gfx9 fallback: it shares the
    # leading "9" with Vega-era GCN parts.
    if digits in ("908", "90a") or digits[:2] in ("94", "95"):
        return processor, "cdna"
    if digits[:1] == "9":
        return processor, "gcn"
    return processor, "unknown"
"""AMD ROCm support for Cookbook hardware-fit.

Consumer AMD Radeon (RDNA: gfx10/11/12) can realistically only serve GGUF via
llama.cpp — vLLM/SGLang on ROCm are validated for datacenter Instinct (CDNA,
gfx9xx), not consumer cards, where AWQ kernels are largely unsupported and FP8
needs out-of-tree patches. These tests lock in that consumer RDNA is treated
like Apple Silicon (GGUF-only recommendations) while datacenter CDNA and
unknown-family AMD are left untouched, and that CUDA is unchanged.
"""

from services.hwfit import hardware
from services.hwfit.fit import rank_models
from services.hwfit.models import get_models


def _rocm_system(family="rdna", ram_gb=32.0, vram_gb=16.0):
    """Build a ROCm system dict; any family other than "rdna" gets MI300X identity."""
    return {
        "has_gpu": True,
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT" if family == "rdna" else "AMD Instinct MI300X",
        "gpu_vram_gb": vram_gb,
        "gpu_count": 1,
        "available_ram_gb": ram_gb * 0.7,
        "total_ram_gb": ram_gb,
        "gpu_arch": "gfx1200" if family == "rdna" else "gfx942",
        "gpu_family": family,
    }


def _cuda_system():
    """Build a single-GPU CUDA system dict (RTX 4090, 24 GB) for regression checks."""
    return {
        "has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
        "gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
    }


def test_only_gguf_models_recommended_on_consumer_rdna():
    """llama.cpp (GGUF) is the servable path on consumer Radeon, so every model
    recommended on RDNA must ship a real GGUF — no vLLM-only AWQ/GPTQ/FP8."""
    catalog = {m["name"]: m for m in get_models()}
    unservable = [
        r["name"] for r in rank_models(_rocm_system(family="rdna"), limit=900)
        if not (catalog.get(r["name"], {}).get("is_gguf")
                or catalog.get(r["name"], {}).get("gguf_sources"))
    ]
    assert unservable == [], f"{len(unservable)} non-GGUF models on RDNA, e.g. {unservable[:3]}"


def test_safetensors_models_still_recommended_on_cdna():
    """Datacenter Instinct (CDNA) runs vLLM/SGLang on ROCm fine, so non-GGUF
    repos must NOT be filtered there — the GGUF-only rule is consumer-RDNA only."""
    names = {r["name"] for r in rank_models(_rocm_system(family="cdna"), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names


def test_unknown_amd_family_not_filtered():
    """When rocminfo is unavailable (family 'unknown'), don't hide non-GGUF
    models — a possibly-capable Instinct box shouldn't lose models on misdetect."""
    names = {r["name"] for r in rank_models(_rocm_system(family="unknown"), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names


def test_safetensors_models_still_recommended_on_cuda():
    """Regression guard: the GGUF-only rule must not leak onto CUDA."""
    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names


def test_classify_amd_gfx_rdna_vs_cdna():
    """classify_amd_gfx maps gfx targets to the right family: consumer RDNA
    (gfx10/11/12) vs datacenter CDNA (gfx9xx Instinct) vs older GCN."""
    cases = {
        "gfx1200": "rdna",   # RX 9060 XT (RDNA4)
        "gfx1201": "rdna",   # RX 9070 (RDNA4)
        "gfx1100": "rdna",   # RX 7900 (RDNA3)
        "gfx1030": "rdna",   # RX 6800 (RDNA2)
        "gfx942": "cdna",    # MI300 (CDNA3)
        "gfx950": "cdna",    # MI350 (CDNA4)
        "gfx90a": "cdna",    # MI200 (CDNA2)
        "gfx908": "cdna",    # MI100 (CDNA1)
        "gfx906": "gcn",     # Radeon VII / MI50 (GCN5/Vega)
        "": "unknown",
        "gfx": "unknown",
    }
    for gfx, expected_family in cases.items():
        out_gfx, family = hardware.classify_amd_gfx(gfx)
        assert family == expected_family, f"{gfx} -> {family}, expected {expected_family}"
        if expected_family != "unknown":
            assert out_gfx == gfx


def test_detect_amd_reports_family(monkeypatch):
    """_detect_amd surfaces gpu_family from rocminfo so fit/serve can branch on
    consumer-RDNA vs datacenter-CDNA. rocminfo lists the CPU agent first, then
    the GPU's gfx target. Drive it through the remote-read path (no real sysfs)."""
    rocminfo_out = " Name: AMD Ryzen 7 3700X\n Name: gfx1200\n Marketing Name: AMD Radeon RX 9060 XT\n"

    def fake_run(cmd):
        if not cmd:
            return None
        if "rocminfo" in cmd[0]:
            return rocminfo_out
        if cmd[0] == "ls":
            return "card1\ncard1-DP-1\nrenderD128"
        if cmd[0] == "cat":
            path = cmd[1]
            if path.endswith("/vendor"):
                return "0x1002"
            if path.endswith("/mem_info_vram_total"):
                return str(16 * 1024**3)
            if path.endswith("/product_name"):
                return "AMD Radeon RX 9060 XT"
            return None
        return None

    # _remote_host truthy routes _read/_list_drm_cards through _run (no real sysfs).
    monkeypatch.setattr(hardware, "_remote_host", "fake-host")
    monkeypatch.setattr(hardware, "_run", fake_run)

    info = hardware._detect_amd()
    assert info is not None
    assert info["backend"] == "rocm"
    assert info["gpu_family"] == "rdna"
    assert info["gpu_arch"] == "gfx1200"


def test_consumer_amd_cards_have_real_bandwidth():
    """Consumer AMD cards must be in the bandwidth table so speed estimates use
    real VRAM bandwidth, not the crude rocm FALLBACK_K constant. The RX 9060 XT
    was missing entirely, so its estimates fell back to the constant and were off."""
    from services.hwfit.fit import _lookup_bandwidth
    for name, expected_min in [
        ("AMD Radeon RX 9060 XT", 300),
        ("AMD Radeon RX 9070 XT", 600),
        ("AMD Radeon RX 7900 XTX", 900),
    ]:
        bw = _lookup_bandwidth(name)
        assert bw and bw >= expected_min, f"{name}: {bw} GB/s (expected >= {expected_min})"


def test_9060xt_speed_estimate_is_realistic():
    """Calibration guard: a small MoE fully on a 9060 XT at Q4 should estimate in
    a believable range, not the absurd numbers the missing-bandwidth fallback gave.
    Measured reference: DeepSeek-Coder-V2-Lite Q4 ~60-86 t/s on this card."""
    from services.hwfit.fit import _estimate_speed
    model = {"name": "DeepSeek-Coder-V2-Lite-Instruct", "parameter_count": "16B",
             "is_moe": True, "active_parameters": 2_400_000_000}
    # Named `system`, not `sys`, so the stdlib module name is never shadowed.
    system = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9}
    tps = _estimate_speed(model, "Q4_K_M", "gpu", system)
    assert 40 <= tps <= 130, f"unrealistic estimate: {tps} t/s"


def test_offload_is_slower_than_full_gpu():
    """Partial CPU offload must estimate slower than the same model fully on GPU,
    and heavier offload slower than lighter — the blend model, not a flat halving."""
    from services.hwfit.fit import _estimate_speed
    model = {"name": "X", "parameter_count": "35B", "is_moe": True,
             "active_parameters": 3_000_000_000}
    system = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9}
    full = _estimate_speed(model, "Q4_K_M", "gpu", system)
    light = _estimate_speed(model, "Q4_K_M", "cpu_offload", system, offload_frac=0.2)
    heavy = _estimate_speed(model, "Q4_K_M", "cpu_offload", system, offload_frac=0.6)
    assert full > light > heavy, (full, light, heavy)


def test_sort_by_newest_orders_by_release_date():
    """sort='newest' orders results by release_date descending (newest first),
    with undated models sorted last."""
    system = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9,
              "gpu_family": "rdna", "gpu_count": 1, "available_ram_gb": 22.0,
              "total_ram_gb": 31.0}
    res = rank_models(system, sort="newest", limit=50)
    dated = [r.get("release_date") for r in res if r.get("release_date")]
    # dates present must be in descending order
    assert dated == sorted(dated, reverse=True), "release dates not descending"
    # Any undated entries must come after all dated ones. True sorts above
    # False, so "every dated row precedes every undated row" is exactly "the
    # has-a-date flags are already in descending order". This replaces the
    # loop that failed through `assert False`, which is a no-op under -O.
    has_date = [bool(r.get("release_date")) for r in res]
    assert has_date == sorted(has_date, reverse=True), \
        "a dated model appeared after an undated one"
def test_no_vendor_specific_formats_on_consumer_rdna():
    """RDNA recommendations must not include vendor-locked model formats.

    Repo names matching NVFP4/FP8/FP4, MLX, AWQ or GPTQ are rejected by name,
    so this checks the ranking output directly rather than relying on the
    catalog's is_gguf flag. A second assertion requires that at least one
    catalog entry matches the same pattern, so the first one cannot pass
    merely because the catalog contains no such repos.
    """
    import re

    vendor_only = re.compile(r"NVFP4|FP8|FP4|-MLX-|\bMLX\b|AWQ|GPTQ", re.IGNORECASE)

    flagged = []
    for row in rank_models(_rocm_system(family="rdna"), limit=900):
        repo = row["name"]
        if vendor_only.search(repo):
            flagged.append(repo)
    assert flagged == [], f"non-runnable formats recommended on RDNA: {flagged[:5]}"

    # Guard against a vacuous test: such formats must actually be in the catalog.
    catalog_has_vendor_only = False
    for entry in get_models():
        if vendor_only.search(entry["name"]):
            catalog_has_vendor_only = True
            break
    assert catalog_has_vendor_only, \
        "catalog has no NVFP4/MLX/FP8 repos — test would be vacuous"