Cookbook fit: steer consumer AMD to GGUF recommendations

* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic)

  Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests first (UI changes follow separately). Backend files only: no static/js here.

  services/hwfit/fit.py, services/hwfit/hardware.py:
  - Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and CUDA are left untouched.
  - More accurate speed estimates across more GPUs (adds RDNA bandwidth data).
  - Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it.

  tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate realism, gfx RDNA-vs-CDNA classification.

  Rebased onto current main (analyze_model gained a scoring_use_case param there; kept it).

  Vision detection intentionally NOT added here — main already ships a "Vision" type filter + multimodal use-case handling; duplicating it was dropped.

  Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites = 28 passed; full suite 0 new failures vs main.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA

  Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards against vacuity by asserting such repos exist in the catalog.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 14:01:42 +02:00
parent fd89d098a1
commit de92bbe47a
3 changed files with 299 additions and 12 deletions
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
    "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
-    "9070 xt": 624, "9070": 488,
+    "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
    # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
    # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
    # before the bare "m_" keys matters less than length-sorting (done below),
@@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
    return None
-def _estimate_speed(model, quant, run_mode, system):
+def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
-    """Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
+    """Estimate tok/s. Uses active params for MoE (only active experts run per token).
    offload_frac (0..1): fraction of the model's weights that spill to system RAM
    (CPU) because they don't fit VRAM. Generation reads every active weight per
    token, so when part lives in CPU RAM the per-token time is dominated by the
    slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
    system-RAM bandwidth weighted by what's where — far more accurate than a flat
    "halve it" for partial offload, which under/over-shoots depending on amount.
    Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
    light offload → ~59 t/s est vs 59.8 measured.
    """
    pb = _active_params_b(model)
    is_moe = model.get("is_moe", False)
    bw = _lookup_bandwidth(system.get("gpu_name"))
@@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
        if model_gb <= 0:
            return 0.0
        efficiency = 0.55
        raw_tps = (bw / model_gb) * efficiency
        if run_mode == "cpu_offload":
-            mode_factor = 0.5
+            # Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be
-        elif is_moe:
+            # conservative since offloaded MoE is also compute-bound on CPU.
-            mode_factor = 0.8
+            cpu_bw = 55.0
-        else:
+            frac = min(max(offload_frac, 0.0), 1.0)
-            mode_factor = 1.0
+            # If we don't know the fraction (legacy callers pass 0 with
-        return raw_tps * mode_factor
+            # cpu_offload), assume a meaningful spill so we don't overestimate.
            if frac <= 0.0:
                frac = 0.5
            # Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
            # slow CPU portion dominates as it grows (matches the steep real-world
            # drop-off when more experts offload).
            eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
            raw_tps = (eff_bw / model_gb) * efficiency
            return raw_tps * (0.8 if is_moe else 1.0)
        # Fully on GPU.
        raw_tps = (bw / model_gb) * efficiency
        return raw_tps * (0.8 if is_moe else 1.0)
    k = FALLBACK_K.get(backend, 70)
    if pb <= 0:
@@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
    else:
        fit_level = "marginal"
-    tps = _estimate_speed(model, quant, run_mode, system)
+    # Fraction of the model that spills to CPU RAM (drives the offload speed
    # model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
    offload_frac = 0.0
    if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
        offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
    tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
    q_score = _quality_score(model, quant, score_use_case)
    s_score = _speed_score(tps, score_use_case)
@@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
        },
        "gguf_sources": model.get("gguf_sources", []),
        "context_length": model.get("context_length", 4096),
        "release_date": model.get("release_date", ""),
    }
@@ -398,6 +424,10 @@ SORT_KEYS = {
    "vram": lambda r: r["required_gb"],
    "params": lambda r: r["params_b"],
    "context": lambda r: r["context"],
    # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
    # string sort is chronological. Missing dates sort last (empty < any date,
    # and we sort reverse=True for newest, so "" lands at the bottom).
    "newest": lambda r: r.get("release_date") or "",
 }
@@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
    apple_silicon = system_backend in ("mps", "metal", "apple")
    rocm = system_backend == "rocm"
    # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
    # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
    # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
    # are largely unsupported there and FP8 needs out-of-tree patches. So treat
    # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
    # Unknown family (no rocminfo) is left untouched to avoid hiding models from
    # a possibly-capable Instinct box on a misdetect.
    gpu_family = (system.get("gpu_family") or "").lower()
    consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
    for m in models:
        native_q = m.get("quantization", "")
        if "nvfp4" in (m.get("name") or "").lower():
@@ -479,7 +519,12 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
        # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
        # this the Cookbook recommends models the Mac can't run; on CUDA these
        # stay visible because vLLM serves safetensors directly.
-        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
+        #
        # Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
        # servable path, so a model needs a real GGUF to be recommended.
        # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
        # Radeon that can't actually serve them.
        if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
            continue
        # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
--- a/services/hwfit/hardware.py
+++ b/services/hwfit/hardware.py
@@ -1,5 +1,6 @@
 import os
 import platform
 import re
 import shutil
 import subprocess
 import time
@@ -130,6 +131,33 @@ def _detect_nvidia():
    }
 def classify_amd_gfx(gfx):
    """Map an AMD ISA target (e.g. "gfx1200") to ``(gfx, family)``.

    Args:
        gfx: ISA target string, or None/empty. Case and surrounding whitespace
            are ignored. A target ID with feature flags appended after a colon
            (e.g. "gfx90a:sramecc+:xnack-") is accepted; only the processor
            name before the first ":" is classified and returned.

    Returns:
        A ``(gfx, family)`` tuple. ``gfx`` is the normalized processor name, or
        "" when the input is not a recognizable ``gfx<digits>[a-f]`` target.
        ``family`` is one of:
          "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
          "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
          "gcn"     — older GCN/Vega (gfx900/906)
          "unknown" — empty/unrecognized; callers must treat conservatively

    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.
    """
    target = (gfx or "").lower().strip()
    # Feature flags ride after the processor name, separated by ":". They do
    # not change the family, so classify on the processor name alone instead of
    # letting the strict fullmatch below reject the whole string.
    target = target.split(":", 1)[0].strip()
    m = re.fullmatch(r"gfx(\d+[a-f]?)", target)
    if not m:
        return "", "unknown"
    digits = m.group(1)
    if digits[:2] in ("10", "11", "12"):
        return target, "rdna"
    # CDNA must be tested before the generic gfx9 check: it shares the leading
    # "9" with the older GCN/Vega parts.
    if digits in ("908", "90a") or digits[:2] in ("94", "95"):
        return target, "cdna"
    if digits[:1] == "9":
        return target, "gcn"
    return target, "unknown"
 def _detect_amd():
    """Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
    and APUs / unified-memory SoCs like Strix Halo (which expose
@@ -155,6 +183,17 @@ def _detect_amd():
        except Exception:
            return []
    def _amd_arch():
        """Return ``(gfx, family)`` for the AMD GPU, best effort, via rocminfo.

        Tries ``rocminfo`` on PATH first, then ``/opt/rocm/bin/rocminfo``. The
        first ``gfx...`` token in the output is taken as the GPU ISA: GPU
        agents report a ``Name: gfxNNNN`` line, while CPU agents report a brand
        string rather than a gfx target. With no output or no match, the empty
        string is classified (see classify_amd_gfx).
        """
        info = ""
        for binary in ("rocminfo", "/opt/rocm/bin/rocminfo"):
            output = _run([binary])
            if output:
                info = output
                break
        hit = re.search(r"gfx\d+[a-f]?", info)
        if not hit:
            return classify_amd_gfx("")
        return classify_amd_gfx(hit.group(0))
    try:
        cards = []
        is_apu = False
@@ -187,6 +226,7 @@ def _detect_amd():
            return None
        total_vram = sum(c["vram_gb"] for c in cards)
        groups = _group_gpus(cards)
        gfx, family = _amd_arch()
        # NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
        # is the real usable GPU memory — it's physically backed but reserved
        # by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
@@ -200,6 +240,13 @@ def _detect_amd():
            "homogeneous": len(groups) <= 1,
            "backend": "rocm",
            "unified_memory": is_apu,
            # AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
            # where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
            # (RDNA, where the practical path is GGUF via llama.cpp). Empty/
            # "unknown" when rocminfo isn't available — callers must treat
            # unknown conservatively, not assume vLLM works.
            "gpu_arch": gfx,
            "gpu_family": family,
        }
    except Exception:
        return None
--- a/tests/test_hwfit_amd.py
+++ b/tests/test_hwfit_amd.py
@@ -0,0 +1,195 @@
 """AMD ROCm support for Cookbook hardware-fit.
 Consumer AMD Radeon (RDNA: gfx10/11/12) can realistically only serve GGUF via
 llama.cpp — vLLM/SGLang on ROCm are validated for datacenter Instinct (CDNA,
 gfx9xx), not consumer cards, where AWQ kernels are largely unsupported and FP8
 needs out-of-tree patches. These tests lock in that consumer RDNA is treated
 like Apple Silicon (GGUF-only recommendations) while datacenter CDNA and
 unknown-family AMD are left untouched, and that CUDA is unchanged.
 """
 from services.hwfit import hardware
 from services.hwfit.fit import rank_models
 from services.hwfit.models import get_models
 def _rocm_system(family="rdna", ram_gb=32.0, vram_gb=16.0):
    return {
        "has_gpu": True,
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT" if family == "rdna" else "AMD Instinct MI300X",
        "gpu_vram_gb": vram_gb,
        "gpu_count": 1,
        "available_ram_gb": ram_gb * 0.7,
        "total_ram_gb": ram_gb,
        "gpu_arch": "gfx1200" if family == "rdna" else "gfx942",
        "gpu_family": family,
    }
 def _cuda_system():
    return {
        "has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
        "gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
    }
 def test_only_gguf_models_recommended_on_consumer_rdna():
    """Every model ranked for consumer RDNA must ship a real GGUF.

    llama.cpp (GGUF) is the servable path on consumer Radeon, so a catalog
    entry with neither ``is_gguf`` nor ``gguf_sources`` (vLLM-only AWQ/GPTQ/
    FP8) must not appear in the results.
    """
    by_name = {entry["name"]: entry for entry in get_models()}
    unservable = []
    for rec in rank_models(_rocm_system(family="rdna"), limit=900):
        # Results whose name is not in the catalog count as non-GGUF.
        entry = by_name.get(rec["name"], {})
        if entry.get("is_gguf") or entry.get("gguf_sources"):
            continue
        unservable.append(rec["name"])
    assert unservable == [], f"{len(unservable)} non-GGUF models on RDNA, e.g. {unservable[:3]}"
 def test_safetensors_models_still_recommended_on_cdna():
    """The GGUF-only rule is consumer-RDNA only.

    Datacenter Instinct (CDNA) runs vLLM/SGLang on ROCm fine, so a non-GGUF
    repo must still be ranked there.
    """
    ranked = rank_models(_rocm_system(family="cdna"), limit=900)
    ranked_names = [rec["name"] for rec in ranked]
    assert "microsoft/Phi-mini-MoE-instruct" in ranked_names
 def test_unknown_amd_family_not_filtered():
    """An 'unknown' AMD family (rocminfo unavailable) must not hide non-GGUF models.

    A possibly-capable Instinct box shouldn't lose models on a misdetect.
    """
    ranked = rank_models(_rocm_system(family="unknown"), limit=900)
    ranked_names = [rec["name"] for rec in ranked]
    assert "microsoft/Phi-mini-MoE-instruct" in ranked_names
 def test_safetensors_models_still_recommended_on_cuda():
    """Regression guard: the GGUF-only rule must not leak onto CUDA."""
    ranked = rank_models(_cuda_system(), limit=900)
    ranked_names = [rec["name"] for rec in ranked]
    assert "microsoft/Phi-mini-MoE-instruct" in ranked_names
 def test_classify_amd_gfx_rdna_vs_cdna():
    """classify_amd_gfx assigns each gfx target its family.

    Covers consumer RDNA (gfx10/11/12), datacenter CDNA (gfx9xx Instinct),
    older GCN, and unrecognized input. For recognized targets the gfx string
    must be returned unchanged.
    """
    expectations = [
        ("gfx1200", "rdna"),   # RX 9060 XT (RDNA4)
        ("gfx1201", "rdna"),   # RX 9070 (RDNA4)
        ("gfx1100", "rdna"),   # RX 7900 (RDNA3)
        ("gfx1030", "rdna"),   # RX 6800 (RDNA2)
        ("gfx942", "cdna"),    # MI300 (CDNA3)
        ("gfx950", "cdna"),    # MI350 (CDNA4)
        ("gfx90a", "cdna"),    # MI200 (CDNA2)
        ("gfx908", "cdna"),    # MI100 (CDNA1)
        ("gfx906", "gcn"),     # Radeon VII / MI50 (GCN5/Vega)
        ("", "unknown"),
        ("gfx", "unknown"),
    ]
    for gfx, expected_family in expectations:
        out_gfx, family = hardware.classify_amd_gfx(gfx)
        assert family == expected_family, f"{gfx} -> {family}, expected {expected_family}"
        if expected_family == "unknown":
            # The returned gfx is not pinned for unrecognized input.
            continue
        assert out_gfx == gfx
 def test_detect_amd_reports_family(monkeypatch):
    """_detect_amd surfaces gpu_arch/gpu_family parsed from rocminfo.

    The canned rocminfo output lists the CPU agent first, then the GPU's gfx
    target. Every command is answered by a fake ``_run``, and ``_remote_host``
    is set so reads go through ``_run`` rather than the real sysfs.
    """
    rocminfo_out = "  Name:  AMD Ryzen 7 3700X\n  Name:  gfx1200\n  Marketing Name: AMD Radeon RX 9060 XT\n"
    # Canned sysfs file contents, matched by path suffix in this order.
    sysfs_by_suffix = (
        ("/vendor", "0x1002"),
        ("/mem_info_vram_total", str(16 * 1024**3)),
        ("/product_name", "AMD Radeon RX 9060 XT"),
    )

    def fake_run(cmd):
        if not cmd:
            return None
        tool = cmd[0]
        if "rocminfo" in tool:
            return rocminfo_out
        if tool == "ls":
            return "card1\ncard1-DP-1\nrenderD128"
        if tool != "cat":
            return None
        path = cmd[1]
        for suffix, content in sysfs_by_suffix:
            if path.endswith(suffix):
                return content
        return None

    # _remote_host truthy routes _read/_list_drm_cards through _run (no real sysfs).
    monkeypatch.setattr(hardware, "_remote_host", "fake-host")
    monkeypatch.setattr(hardware, "_run", fake_run)
    info = hardware._detect_amd()
    assert info is not None
    expected = {"backend": "rocm", "gpu_family": "rdna", "gpu_arch": "gfx1200"}
    for key, value in expected.items():
        assert info[key] == value
 def test_consumer_amd_cards_have_real_bandwidth():
    """Consumer AMD cards must resolve to a real VRAM bandwidth.

    Speed estimates should use the bandwidth table, not the crude rocm
    FALLBACK_K constant. The RX 9060 XT was missing entirely, so its estimates
    fell back to the constant and were off.
    """
    from services.hwfit.fit import _lookup_bandwidth
    # Lower bound (GB/s) each card's table entry must meet.
    minimum_bandwidth = {
        "AMD Radeon RX 9060 XT": 300,
        "AMD Radeon RX 9070 XT": 600,
        "AMD Radeon RX 7900 XTX": 900,
    }
    for name, expected_min in minimum_bandwidth.items():
        bw = _lookup_bandwidth(name)
        assert bw and bw >= expected_min, f"{name}: {bw} GB/s (expected >= {expected_min})"
 def test_9060xt_speed_estimate_is_realistic():
    """Calibration guard for a small MoE fully on a 9060 XT at Q4.

    The estimate must land in a believable range, not the absurd numbers the
    missing-bandwidth fallback gave. Measured reference: DeepSeek-Coder-V2-Lite
    Q4 ~60-86 t/s on this card.
    """
    from services.hwfit.fit import _estimate_speed
    model = dict(
        name="DeepSeek-Coder-V2-Lite-Instruct",
        parameter_count="16B",
        is_moe=True,
        active_parameters=2_400_000_000,
    )
    system = dict(backend="rocm", gpu_name="AMD Radeon RX 9060 XT", gpu_vram_gb=15.9)
    tps = _estimate_speed(model, "Q4_K_M", "gpu", system)
    in_range = 40 <= tps <= 130
    assert in_range, f"unrealistic estimate: {tps} t/s"
 def test_offload_is_slower_than_full_gpu():
    """Estimates must fall strictly as more of the model spills to CPU RAM.

    Full GPU beats light offload (20%), which beats heavy offload (60%) — the
    blend model, not a flat halving.
    """
    from services.hwfit.fit import _estimate_speed
    model = dict(name="X", parameter_count="35B", is_moe=True,
                 active_parameters=3_000_000_000)
    system = dict(backend="rocm", gpu_name="AMD Radeon RX 9060 XT", gpu_vram_gb=15.9)
    full = _estimate_speed(model, "Q4_K_M", "gpu", system)
    light, heavy = (
        _estimate_speed(model, "Q4_K_M", "cpu_offload", system, offload_frac=frac)
        for frac in (0.2, 0.6)
    )
    assert full > light > heavy, (full, light, heavy)
 def test_sort_by_newest_orders_by_release_date():
    """sort='newest' returns release dates in descending order.

    Models without a release date must come after every dated model.
    """
    system = dict(
        backend="rocm",
        gpu_name="AMD Radeon RX 9060 XT",
        gpu_vram_gb=15.9,
        gpu_family="rdna",
        gpu_count=1,
        available_ram_gb=22.0,
        total_ram_gb=31.0,
    )
    res = rank_models(system, sort="newest", limit=50)
    # Dates that are present must be in descending order.
    dated = [r.get("release_date") for r in res if r.get("release_date")]
    assert dated == sorted(dated, reverse=True), "release dates not descending"
    # False (dated) sorts before True (undated), so the flags are already in
    # sorted order exactly when no dated entry follows an undated one.
    undated_flags = [not r.get("release_date") for r in res]
    assert undated_flags == sorted(undated_flags), "a dated model appeared after an undated one"
 def test_no_vendor_specific_formats_on_consumer_rdna():
    """No NVFP4/MLX/FP8/FP4/AWQ/GPTQ repo may be ranked on consumer RDNA.

    Consumer Radeon can't run NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/
    GPTQ builds. This checks the format filter by repo name (not just is_gguf)
    and also requires such repos to exist in the catalog, so the test cannot
    pass vacuously.
    """
    import re
    bad = re.compile(r"NVFP4|FP8|FP4|-MLX-|\bMLX\b|AWQ|GPTQ", re.IGNORECASE)
    names = [r["name"] for r in rank_models(_rocm_system(family="rdna"), limit=900)]
    offenders = list(filter(bad.search, names))
    assert offenders == [], f"non-runnable formats recommended on RDNA: {offenders[:5]}"
    # Guard against a vacuous test: such formats must actually be in the catalog.
    catalog_names = (m["name"] for m in get_models())
    assert any(map(bad.search, catalog_names)), \
        "catalog has no NVFP4/MLX/FP8 repos — test would be vacuous"