odysseus/tests/test_hwfit_amd.py

"""AMD ROCm support for Cookbook hardware-fit.

Consumer AMD Radeon (RDNA: gfx10/11/12) can realistically only serve GGUF via
llama.cpp — vLLM/SGLang on ROCm are validated for datacenter Instinct (CDNA,
gfx9xx), not consumer cards, where AWQ kernels are largely unsupported and FP8
needs out-of-tree patches. These tests lock in that consumer RDNA is treated
like Apple Silicon (GGUF-only recommendations) while datacenter CDNA and
unknown-family AMD are left untouched, and that CUDA is unchanged.
"""

from services.hwfit import hardware
from services.hwfit.fit import rank_models
from services.hwfit.models import get_models


def _rocm_system(family="rdna", ram_gb=32.0, vram_gb=16.0):
    return {
        "has_gpu": True,
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT" if family == "rdna" else "AMD Instinct MI300X",
        "gpu_vram_gb": vram_gb,
        "gpu_count": 1,
        "available_ram_gb": ram_gb * 0.7,
        "total_ram_gb": ram_gb,
        "gpu_arch": "gfx1200" if family == "rdna" else "gfx942",
        "gpu_family": family,
    }


def _cuda_system():
    return {
        "has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
        "gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
    }


def test_only_gguf_models_recommended_on_consumer_rdna():
    """llama.cpp (GGUF) is the servable path on consumer Radeon, so every model
    recommended on RDNA must ship a real GGUF — no vLLM-only AWQ/GPTQ/FP8."""
    catalog = {m["name"]: m for m in get_models()}
    unservable = [
        r["name"] for r in rank_models(_rocm_system(family="rdna"), limit=900)
        if not (catalog.get(r["name"], {}).get("is_gguf")
                or catalog.get(r["name"], {}).get("gguf_sources"))
    ]
    assert unservable == [], f"{len(unservable)} non-GGUF models on RDNA, e.g. {unservable[:3]}"


def test_safetensors_models_still_recommended_on_cdna():
    """Datacenter Instinct (CDNA) runs vLLM/SGLang on ROCm fine, so non-GGUF
    repos must NOT be filtered there — the GGUF-only rule is consumer-RDNA only."""
    names = {r["name"] for r in rank_models(_rocm_system(family="cdna"), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names


def test_unknown_amd_family_not_filtered():
    """When rocminfo is unavailable (family 'unknown'), don't hide non-GGUF
    models — a possibly-capable Instinct box shouldn't lose models on misdetect."""
    names = {r["name"] for r in rank_models(_rocm_system(family="unknown"), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names


def test_safetensors_models_still_recommended_on_cuda():
    """Regression guard: the GGUF-only rule must not leak onto CUDA."""
    names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
    assert "microsoft/Phi-mini-MoE-instruct" in names


def test_classify_amd_gfx_rdna_vs_cdna():
    """classify_amd_gfx maps gfx targets to the right family: consumer RDNA
    (gfx10/11/12) vs datacenter CDNA (gfx9xx Instinct) vs older GCN."""
    cases = {
        "gfx1200": "rdna",   # RX 9060 XT (RDNA4)
        "gfx1201": "rdna",   # RX 9070 (RDNA4)
        "gfx1100": "rdna",   # RX 7900 (RDNA3)
        "gfx1030": "rdna",   # RX 6800 (RDNA2)
        "gfx942": "cdna",    # MI300 (CDNA3)
        "gfx950": "cdna",    # MI350 (CDNA4)
        "gfx90a": "cdna",    # MI200 (CDNA2)
        "gfx908": "cdna",    # MI100 (CDNA1)
        "gfx906": "gcn",     # Radeon VII / MI50 (GCN5/Vega)
        "": "unknown",
        "gfx": "unknown",
    }
    for gfx, expected_family in cases.items():
        out_gfx, family = hardware.classify_amd_gfx(gfx)
        assert family == expected_family, f"{gfx} -> {family}, expected {expected_family}"
        if expected_family != "unknown":
            assert out_gfx == gfx


def test_detect_amd_reports_family(monkeypatch):
    """_detect_amd surfaces gpu_family from rocminfo so fit/serve can branch on
    consumer-RDNA vs datacenter-CDNA. rocminfo lists the CPU agent first, then
    the GPU's gfx target. Drive it through the remote-read path (no real sysfs)."""
    rocminfo_out = "  Name:  AMD Ryzen 7 3700X\n  Name:  gfx1200\n  Marketing Name: AMD Radeon RX 9060 XT\n"

    def fake_run(cmd):
        if not cmd:
            return None
        if "rocminfo" in cmd[0]:
            return rocminfo_out
        if cmd[0] == "ls":
            return "card1\ncard1-DP-1\nrenderD128"
        if cmd[0] == "cat":
            path = cmd[1]
            if path.endswith("/vendor"):
                return "0x1002"
            if path.endswith("/mem_info_vram_total"):
                return str(16 * 1024**3)
            if path.endswith("/product_name"):
                return "AMD Radeon RX 9060 XT"
            return None
        return None

    # _remote_host truthy routes _read/_list_drm_cards through _run (no real sysfs).
    monkeypatch.setattr(hardware, "_remote_host", "fake-host")
    monkeypatch.setattr(hardware, "_run", fake_run)

    info = hardware._detect_amd()
    assert info is not None
    assert info["backend"] == "rocm"
    assert info["gpu_family"] == "rdna"
    assert info["gpu_arch"] == "gfx1200"


def test_consumer_amd_cards_have_real_bandwidth():
    """Consumer AMD cards must be in the bandwidth table so speed estimates use
    real VRAM bandwidth, not the crude rocm FALLBACK_K constant. The RX 9060 XT
    was missing entirely, so its estimates fell back to the constant and were off."""
    from services.hwfit.fit import _lookup_bandwidth
    for name, expected_min in [
        ("AMD Radeon RX 9060 XT", 300),
        ("AMD Radeon RX 9070 XT", 600),
        ("AMD Radeon RX 7900 XTX", 900),
    ]:
        bw = _lookup_bandwidth(name)
        assert bw and bw >= expected_min, f"{name}: {bw} GB/s (expected >= {expected_min})"


def test_9060xt_speed_estimate_is_realistic():
    """Calibration guard: a small MoE fully on a 9060 XT at Q4 should estimate in
    a believable range, not the absurd numbers the missing-bandwidth fallback gave.
    Measured reference: DeepSeek-Coder-V2-Lite Q4 ~60-86 t/s on this card."""
    from services.hwfit.fit import _estimate_speed
    model = {"name": "DeepSeek-Coder-V2-Lite-Instruct", "parameter_count": "16B",
             "is_moe": True, "active_parameters": 2_400_000_000}
    sys = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9}
    tps = _estimate_speed(model, "Q4_K_M", "gpu", sys)
    assert 40 <= tps <= 130, f"unrealistic estimate: {tps} t/s"


def test_offload_is_slower_than_full_gpu():
    """Partial CPU offload must estimate slower than the same model fully on GPU,
    and heavier offload slower than lighter — the blend model, not a flat halving."""
    from services.hwfit.fit import _estimate_speed
    model = {"name": "X", "parameter_count": "35B", "is_moe": True,
             "active_parameters": 3_000_000_000}
    sys = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9}
    full = _estimate_speed(model, "Q4_K_M", "gpu", sys)
    light = _estimate_speed(model, "Q4_K_M", "cpu_offload", sys, offload_frac=0.2)
    heavy = _estimate_speed(model, "Q4_K_M", "cpu_offload", sys, offload_frac=0.6)
    assert full > light > heavy, (full, light, heavy)


def test_sort_by_newest_orders_by_release_date():
    """sort='newest' orders results by release_date descending (newest first),
    with undated models sorted last."""
    sys = {"backend": "rocm", "gpu_name": "AMD Radeon RX 9060 XT", "gpu_vram_gb": 15.9,
           "gpu_family": "rdna", "gpu_count": 1, "available_ram_gb": 22.0, "total_ram_gb": 31.0}
    res = rank_models(sys, sort="newest", limit=50)
    dated = [r.get("release_date") for r in res if r.get("release_date")]
    # dates present must be in descending order
    assert dated == sorted(dated, reverse=True), "release dates not descending"
    # any undated entries must come after all dated ones
    seen_blank = False
    for r in res:
        if not r.get("release_date"):
            seen_blank = True
        elif seen_blank:
            assert False, "a dated model appeared after an undated one"


def test_no_vendor_specific_formats_on_consumer_rdna():
    """Consumer Radeon can't run NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/
    GPTQ builds — none should be recommended on RDNA even though such repos DO
    exist in the catalog. Guards the format filter directly (not just is_gguf)."""
    import re
    bad = re.compile(r"NVFP4|FP8|FP4|-MLX-|\bMLX\b|AWQ|GPTQ", re.IGNORECASE)
    names = [r["name"] for r in rank_models(_rocm_system(family="rdna"), limit=900)]
    offenders = [n for n in names if bad.search(n)]
    assert offenders == [], f"non-runnable formats recommended on RDNA: {offenders[:5]}"
    # Guard against a vacuous test: such formats must actually be in the catalog.
    assert any(bad.search(m["name"]) for m in get_models()), \
        "catalog has no NVFP4/MLX/FP8 repos — test would be vacuous"