Cookbook fit: steer consumer AMD to GGUF recommendations

* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic)

  Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests first (UI changes follow separately). Backend files only: no static/js here.

  services/hwfit/fit.py, services/hwfit/hardware.py:
  - Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and CUDA are left untouched.
  - More accurate speed estimates across more GPUs (adds RDNA bandwidth data).
  - Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it.

  tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate realism, gfx RDNA-vs-CDNA classification.

  Rebased onto current main (analyze_model gained a scoring_use_case param there; kept it). Vision detection intentionally NOT added here — main already ships a "Vision" type filter + multimodal use-case handling; duplicating it was dropped.

  Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites = 28 passed; full suite 0 new failures vs main.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA

  Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards against vacuity by asserting such repos exist in the catalog.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 14:01:42 +02:00
parent fd89d098a1
commit de92bbe47a
3 changed files with 299 additions and 12 deletions
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
    "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
-    "9070 xt": 624, "9070": 488,
+    "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
    # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
    # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
    # before the bare "m_" keys matters less than length-sorting (done below),
@@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
    return None


-def _estimate_speed(model, quant, run_mode, system):
-    """Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
+def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
+    """Estimate tok/s. Uses active params for MoE (only active experts run per token).
+
+    offload_frac (0..1): fraction of the model's weights that spill to system RAM
+    (CPU) because they don't fit VRAM. Generation reads every active weight per
+    token, so when part lives in CPU RAM the per-token time is dominated by the
+    slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
+    system-RAM bandwidth weighted by what's where — far more accurate than a flat
+    "halve it" for partial offload, which under/over-shoots depending on amount.
+    Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
+    light offload → ~59 t/s est vs 59.8 measured.
+    """
    pb = _active_params_b(model)
    is_moe = model.get("is_moe", False)
    bw = _lookup_bandwidth(system.get("gpu_name"))
@@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
        if model_gb <= 0:
            return 0.0
        efficiency = 0.55
-        raw_tps = (bw / model_gb) * efficiency
        if run_mode == "cpu_offload":
-            mode_factor = 0.5
-        elif is_moe:
-            mode_factor = 0.8
-        else:
-            mode_factor = 1.0
-        return raw_tps * mode_factor
+            # Dual-channel DDR4-3200 peaks near 51 GB/s and DDR5 goes higher; stay
+            # near the DDR4 figure since offloaded MoE is also compute-bound on CPU.
+            cpu_bw = 55.0
+            frac = min(max(offload_frac, 0.0), 1.0)
+            # If we don't know the fraction (legacy callers pass 0 with
+            # cpu_offload), assume a meaningful spill so we don't overestimate.
+            if frac <= 0.0:
+                frac = 0.5
+            # Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
+            # slow CPU portion dominates as it grows (matches the steep real-world
+            # drop-off when more experts offload).
+            eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
+            raw_tps = (eff_bw / model_gb) * efficiency
+            return raw_tps * (0.8 if is_moe else 1.0)
+        # Fully on GPU.
+        raw_tps = (bw / model_gb) * efficiency
+        return raw_tps * (0.8 if is_moe else 1.0)

    k = FALLBACK_K.get(backend, 70)
    if pb <= 0:
@@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
    else:
        fit_level = "marginal"

-    tps = _estimate_speed(model, quant, run_mode, system)
+    # Fraction of the model that spills to CPU RAM (drives the offload speed
+    # model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
+    offload_frac = 0.0
+    if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
+        offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
+    tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)

    q_score = _quality_score(model, quant, score_use_case)
    s_score = _speed_score(tps, score_use_case)
@@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
        },
        "gguf_sources": model.get("gguf_sources", []),
        "context_length": model.get("context_length", 4096),
+        "release_date": model.get("release_date", ""),
    }


@@ -398,6 +424,10 @@ SORT_KEYS = {
    "vram": lambda r: r["required_gb"],
    "params": lambda r: r["params_b"],
    "context": lambda r: r["context"],
+    # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
+    # string sort is chronological. Missing dates sort last (empty < any date,
+    # and we sort reverse=True for newest, so "" lands at the bottom).
+    "newest": lambda r: r.get("release_date") or "",
 }


@@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
    apple_silicon = system_backend in ("mps", "metal", "apple")
    rocm = system_backend == "rocm"

+    # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
+    # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
+    # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
+    # are largely unsupported there and FP8 needs out-of-tree patches. So treat
+    # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
+    # Unknown family (no rocminfo) is left untouched to avoid hiding models from
+    # a possibly-capable Instinct box on a misdetect.
+    gpu_family = (system.get("gpu_family") or "").lower()
+    consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
+
    for m in models:
        native_q = m.get("quantization", "")
        if "nvfp4" in (m.get("name") or "").lower():
@@ -479,7 +519,12 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
        # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
        # this the Cookbook recommends models the Mac can't run; on CUDA these
        # stay visible because vLLM serves safetensors directly.
-        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
+        #
+        # Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
+        # servable path, so a model needs a real GGUF to be recommended.
+        # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
+        # Radeon that can't actually serve them.
+        if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
            continue

        # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
--- a/services/hwfit/hardware.py
+++ b/services/hwfit/hardware.py
@@ -1,5 +1,6 @@
 import os
 import platform
+import re
 import shutil
 import subprocess
 import time
@@ -130,6 +131,33 @@ def _detect_nvidia():
    }


+def classify_amd_gfx(gfx):
+    """Map an AMD ISA target (e.g. "gfx1200") to (gfx, family).
+
+    family is one of:
+      "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
+      "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
+      "gcn"     — older GCN/Vega (gfx900/906)
+      "unknown" — empty/unrecognized; callers must treat conservatively
+
+    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
+    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
+    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.
+    """
+    gfx = (gfx or "").lower().strip()
+    m = re.fullmatch(r"gfx(\d+[a-f]?)", gfx)
+    if not m:
+        return "", "unknown"
+    digits = m.group(1)
+    if digits[:2] in ("10", "11", "12"):
+        return gfx, "rdna"
+    if digits in ("908", "90a") or digits[:2] in ("94", "95"):
+        return gfx, "cdna"
+    if digits[:1] == "9":
+        return gfx, "gcn"
+    return gfx, "unknown"
+
+
 def _detect_amd():
    """Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
    and APUs / unified-memory SoCs like Strix Halo (which expose
@@ -155,6 +183,17 @@ def _detect_amd():
        except Exception:
            return []

+    def _amd_arch():
+        """Best-effort AMD GPU ISA + family from rocminfo.
+
+        rocminfo is the source of truth; GPU agents report `Name: gfxNNNN` (CPU
+        agents report a brand string), so the first gfx match is taken as the ISA
+        — first GPU agent only. Returns (gfx, family) — see classify_amd_gfx.
+        """
+        info = _run(["rocminfo"]) or _run(["/opt/rocm/bin/rocminfo"]) or ""
+        m = re.search(r"gfx\d+[a-f]?", info)
+        return classify_amd_gfx(m.group(0) if m else "")
+
    try:
        cards = []
        is_apu = False
@@ -187,6 +226,7 @@ def _detect_amd():
            return None
        total_vram = sum(c["vram_gb"] for c in cards)
        groups = _group_gpus(cards)
+        gfx, family = _amd_arch()
        # NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
        # is the real usable GPU memory — it's physically backed but reserved
        # by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
@@ -200,6 +240,13 @@ def _detect_amd():
            "homogeneous": len(groups) <= 1,
            "backend": "rocm",
            "unified_memory": is_apu,
+            # AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
+            # where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
+            # (RDNA, where the practical path is GGUF via llama.cpp). Empty/
+            # "unknown" when rocminfo isn't available — callers must treat
+            # unknown conservatively, not assume vLLM works.
+            "gpu_arch": gfx,
+            "gpu_family": family,
        }
    except Exception:
        return None
--- a/tests/test_hwfit_amd.py
+++ b/tests/test_hwfit_amd.py
@@ -0,0 +1,195 @@
+"""AMD ROCm support for Cookbook hardware-fit.
+
+Consumer AMD Radeon (RDNA: gfx10/11/12) can realistically only serve GGUF via
+llama.cpp — vLLM/SGLang on ROCm are validated for datacenter Instinct (CDNA,
+gfx9xx), not consumer cards, where AWQ kernels are largely unsupported and FP8
+needs out-of-tree patches. These tests lock in that consumer RDNA is treated
+like Apple Silicon (GGUF-only recommendations) while datacenter CDNA and
+unknown-family AMD are left untouched, and that CUDA is unchanged.
+"""
+
+from services.hwfit import hardware
+from services.hwfit.fit import rank_models
+from services.hwfit.models import get_models
+
+
+def _rocm_system(family="rdna", ram_gb=32.0, vram_gb=16.0):
+    return {
+        "has_gpu": True,
+        "backend": "rocm",
+        "gpu_name": "AMD Radeon RX 9060 XT" if family == "rdna" else "AMD Instinct MI300X",
+        "gpu_vram_gb": vram_gb,
+        "gpu_count": 1,
+        "available_ram_gb": ram_gb * 0.7,
+        "total_ram_gb": ram_gb,
+        "gpu_arch": "gfx1200" if family == "rdna" else "gfx942",
+        "gpu_family": family,
+    }
+
+
+def _cuda_system():
+    return {
+        "has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
+        "gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
+    }
+
+
+def test_only_gguf_models_recommended_on_consumer_rdna():
+    """llama.cpp (GGUF) is the servable path on consumer Radeon, so every model
+    recommended on RDNA must ship a real GGUF — no vLLM-only AWQ/GPTQ/FP8."""
+    catalog = {m["name"]: m for m in get_models()}
+    unservable = [
+        r["name"] for r in rank_models(_rocm_system(family="rdna"), limit=900)
+        if not (catalog.get(r["name"], {}).get("is_gguf")
+                or catalog.get(r["name"], {}).get("gguf_sources"))
+    ]
+    assert unservable == [], f"{len(unservable)} non-GGUF models on RDNA, e.g. {unservable[:3]}"
+
+
+def test_safetensors_models_still_recommended_on_cdna():
+    """Datacenter Instinct (CDNA) runs vLLM/SGLang on ROCm fine, so non-GGUF
+    repos must NOT be filtered there — the GGUF-only rule is consumer-RDNA only."""
+    names = {r["name"] for r in rank_models(_rocm_system(family="cdna"), limit=900)}
+    assert "microsoft/Phi-mini-MoE-instruct" in names
+
+
+def test_unknown_amd_family_not_filtered():
+    """When rocminfo is unavailable (family 'unknown'), don't hide non-GGUF
+    models — a possibly-capable Instinct box shouldn't lose models on misdetect."""
+    names = {r["name"] for r in rank_models(_rocm_system(family="unknown"), limit=900)}
+    assert "microsoft/Phi-mini-MoE-instruct" in names
+
+
+def test_safetensors_models_still_recommended_on_cuda():
+    """Regression guard: the GGUF-only rule must not leak onto CUDA."""
+    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
+    assert "microsoft/Phi-mini-MoE-instruct" in names
+
+
+def test_classify_amd_gfx_rdna_vs_cdna():
+    """classify_amd_gfx maps gfx targets to the right family: consumer RDNA
+    (gfx10/11/12) vs datacenter CDNA (gfx9xx Instinct) vs older GCN."""
+    cases = {
+        "gfx1200": "rdna",   # RX 9060 XT (RDNA4)
+        "gfx1201": "rdna",   # RX 9070 (RDNA4)
+        "gfx1100": "rdna",   # RX 7900 (RDNA3)
+        "gfx1030": "rdna",   # RX 6800 (RDNA2)
+        "gfx942": "cdna",    # MI300 (CDNA3)
+        "gfx950": "cdna",    # MI350 (CDNA4)
+        "gfx90a": "cdna",    # MI200 (CDNA2)
+        "gfx908": "cdna",    # MI100 (CDNA1)
+        "gfx906": "gcn",     # Radeon VII / MI50 (GCN5/Vega)
+        "": "unknown",
+        "gfx": "unknown",
+    }
+    for gfx, expected_family in cases.items():
+        out_gfx, family = hardware.classify_amd_gfx(gfx)
+        assert family == expected_family, f"{gfx} -> {family}, expected {expected_family}"
+        if expected_family != "unknown":
+            assert out_gfx == gfx
+
+
+def test_detect_amd_reports_family(monkeypatch):
+    """_detect_amd surfaces gpu_family from rocminfo so fit/serve can branch on
+    consumer-RDNA vs datacenter-CDNA. rocminfo lists the CPU agent first, then
+    the GPU's gfx target. Drive it through the remote-read path (no real sysfs)."""
+    rocminfo_out = "  Name:  AMD Ryzen 7 3700X\n  Name:  gfx1200\n  Marketing Name: AMD Radeon RX 9060 XT\n"
+
+    def fake_run(cmd):
+        if not cmd:
+            return None
+        if "rocminfo" in cmd[0]:
+            return rocminfo_out
+        if cmd[0] == "ls":
+            return "card1\ncard1-DP-1\nrenderD128"
+        if cmd[0] == "cat":
+            path = cmd[1]
+            if path.endswith("/vendor"):
+                return "0x1002"
+            if path.endswith("/mem_info_vram_total"):
+                return str(16 * 1024**3)
+            if path.endswith("/product_name"):
+                return "AMD Radeon RX 9060 XT"
+            return None
+        return None
+
+    # _remote_host truthy routes _read/_list_drm_cards through _run (no real sysfs).
+    monkeypatch.setattr(hardware, "_remote_host", "fake-host")
+    monkeypatch.setattr(hardware, "_run", fake_run)
+
+    info = hardware._detect_amd()
+    assert info is not None
+    assert info["backend"] == "rocm"
+    assert info["gpu_family"] == "rdna"
+    assert info["gpu_arch"] == "gfx1200"
+
+
+def test_consumer_amd_cards_have_real_bandwidth():
+    """Consumer AMD cards must be in the bandwidth table so speed estimates use
+    real VRAM bandwidth, not the crude rocm FALLBACK_K constant. The RX 9060 XT
+    was missing entirely, so its estimates fell back to the constant and were off."""
+    from services.hwfit.fit import _lookup_bandwidth
+    for name, expected_min in [
+        ("AMD Radeon RX 9060 XT", 300),
+        ("AMD Radeon RX 9070 XT", 600),
+        ("AMD Radeon RX 7900 XTX", 900),
+    ]:
+        bw = _lookup_bandwidth(name)
+        assert bw and bw >= expected_min, f"{name}: {bw} GB/s (expected >= {expected_min})"
+
+
+def test_9060xt_speed_estimate_is_realistic():
+    """Calibration guard: a small MoE fully on a 9060 XT at Q4 should estimate in
+    a believable range, not the absurd numbers the missing-bandwidth fallback gave.
+    Measured reference: DeepSeek-Coder-V2-Lite Q4 ~60-86 t/s on this card."""
+    from services.hwfit.fit import _estimate_speed
+    model = {"name": "DeepSeek-Coder-V2-Lite-Instruct", "parameter_count": "16B",
+             "is_moe": True, "active_parameters": 2_400_000_000}
+    sys = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9}
+    tps = _estimate_speed(model, "Q4_K_M", "gpu", sys)
+    assert 40 <= tps <= 130, f"unrealistic estimate: {tps} t/s"
+
+
+def test_offload_is_slower_than_full_gpu():
+    """Partial CPU offload must estimate slower than the same model fully on GPU,
+    and heavier offload slower than lighter — the blend model, not a flat halving."""
+    from services.hwfit.fit import _estimate_speed
+    model = {"name": "X", "parameter_count": "35B", "is_moe": True,
+             "active_parameters": 3_000_000_000}
+    sys = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9}
+    full = _estimate_speed(model, "Q4_K_M", "gpu", sys)
+    light = _estimate_speed(model, "Q4_K_M", "cpu_offload", sys, offload_frac=0.2)
+    heavy = _estimate_speed(model, "Q4_K_M", "cpu_offload", sys, offload_frac=0.6)
+    assert full > light > heavy, (full, light, heavy)
+
+
+def test_sort_by_newest_orders_by_release_date():
+    """sort='newest' orders results by release_date descending (newest first),
+    with undated models sorted last."""
+    sys = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9,
+           "gpu_family": "rdna", "gpu_count": 1, "available_ram_gb": 22.0, "total_ram_gb": 31.0}
+    res = rank_models(sys, sort="newest", limit=50)
+    dated = [r.get("release_date") for r in res if r.get("release_date")]
+    # dates present must be in descending order
+    assert dated == sorted(dated, reverse=True), "release dates not descending"
+    # any undated entries must come after all dated ones
+    seen_blank = False
+    for r in res:
+        if not r.get("release_date"):
+            seen_blank = True
+        elif seen_blank:
+            assert False, "a dated model appeared after an undated one"
+
+
+def test_no_vendor_specific_formats_on_consumer_rdna():
+    """Consumer Radeon can't run NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/
+    GPTQ builds — none should be recommended on RDNA even though such repos DO
+    exist in the catalog. Guards the format filter directly (not just is_gguf)."""
+    import re
+    bad = re.compile(r"NVFP4|FP8|FP4|-MLX-|\bMLX\b|AWQ|GPTQ", re.IGNORECASE)
+    names = [r["name"] for r in rank_models(_rocm_system(family="rdna"), limit=900)]
+    offenders = [n for n in names if bad.search(n)]
+    assert offenders == [], f"non-runnable formats recommended on RDNA: {offenders[:5]}"
+    # Guard against a vacuous test: such formats must actually be in the catalog.
+    assert any(bad.search(m["name"]) for m in get_models()), \
+        "catalog has no NVFP4/MLX/FP8 repos — test would be vacuous"