Cookbook fit: steer consumer AMD to GGUF recommendations
* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic) Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests first (UI changes follow separately). Backend files only: no static/js here. services/hwfit/fit.py, services/hwfit/hardware.py: - Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and CUDA are left untouched. - More accurate speed estimates across more GPUs (adds RDNA bandwidth data). - Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it. tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate realism, gfx RDNA-vs-CDNA classification. Rebased onto current main (analyze_model gained a scoring_use_case param there; kept it). Vision detection intentionally NOT added here — main already ships a "Vision" type filter + multimodal use-case handling; duplicating it was dropped. Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites = 28 passed; full suite 0 new failures vs main. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards against vacuity by asserting such repos exist in the catalog. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
|
|||||||
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
|
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
|
||||||
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
||||||
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
||||||
"9070 xt": 624, "9070": 488,
|
"9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
|
||||||
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
|
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
|
||||||
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
|
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
|
||||||
# before the bare "m_" keys matters less than length-sorting (done below),
|
# before the bare "m_" keys matters less than length-sorting (done below),
|
||||||
@@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _estimate_speed(model, quant, run_mode, system):
|
def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
|
||||||
"""Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
|
"""Estimate tok/s. Uses active params for MoE (only active experts run per token).
|
||||||
|
|
||||||
|
offload_frac (0..1): fraction of the model's weights that spill to system RAM
|
||||||
|
(CPU) because they don't fit VRAM. Generation reads every active weight per
|
||||||
|
token, so when part lives in CPU RAM the per-token time is dominated by the
|
||||||
|
slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
|
||||||
|
system-RAM bandwidth weighted by what's where — far more accurate than a flat
|
||||||
|
"halve it" for partial offload, which under/over-shoots depending on amount.
|
||||||
|
Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
|
||||||
|
light offload → ~59 t/s est vs 59.8 measured.
|
||||||
|
"""
|
||||||
pb = _active_params_b(model)
|
pb = _active_params_b(model)
|
||||||
is_moe = model.get("is_moe", False)
|
is_moe = model.get("is_moe", False)
|
||||||
bw = _lookup_bandwidth(system.get("gpu_name"))
|
bw = _lookup_bandwidth(system.get("gpu_name"))
|
||||||
@@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
|
|||||||
if model_gb <= 0:
|
if model_gb <= 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
efficiency = 0.55
|
efficiency = 0.55
|
||||||
raw_tps = (bw / model_gb) * efficiency
|
|
||||||
if run_mode == "cpu_offload":
|
if run_mode == "cpu_offload":
|
||||||
mode_factor = 0.5
|
# Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be
|
||||||
elif is_moe:
|
# conservative since offloaded MoE is also compute-bound on CPU.
|
||||||
mode_factor = 0.8
|
cpu_bw = 55.0
|
||||||
else:
|
frac = min(max(offload_frac, 0.0), 1.0)
|
||||||
mode_factor = 1.0
|
# If we don't know the fraction (legacy callers pass 0 with
|
||||||
return raw_tps * mode_factor
|
# cpu_offload), assume a meaningful spill so we don't overestimate.
|
||||||
|
if frac <= 0.0:
|
||||||
|
frac = 0.5
|
||||||
|
# Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
|
||||||
|
# slow CPU portion dominates as it grows (matches the steep real-world
|
||||||
|
# drop-off when more experts offload).
|
||||||
|
eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
|
||||||
|
raw_tps = (eff_bw / model_gb) * efficiency
|
||||||
|
return raw_tps * (0.8 if is_moe else 1.0)
|
||||||
|
# Fully on GPU.
|
||||||
|
raw_tps = (bw / model_gb) * efficiency
|
||||||
|
return raw_tps * (0.8 if is_moe else 1.0)
|
||||||
|
|
||||||
k = FALLBACK_K.get(backend, 70)
|
k = FALLBACK_K.get(backend, 70)
|
||||||
if pb <= 0:
|
if pb <= 0:
|
||||||
@@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
|||||||
else:
|
else:
|
||||||
fit_level = "marginal"
|
fit_level = "marginal"
|
||||||
|
|
||||||
tps = _estimate_speed(model, quant, run_mode, system)
|
# Fraction of the model that spills to CPU RAM (drives the offload speed
|
||||||
|
# model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
|
||||||
|
offload_frac = 0.0
|
||||||
|
if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
|
||||||
|
offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
|
||||||
|
tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
|
||||||
|
|
||||||
q_score = _quality_score(model, quant, score_use_case)
|
q_score = _quality_score(model, quant, score_use_case)
|
||||||
s_score = _speed_score(tps, score_use_case)
|
s_score = _speed_score(tps, score_use_case)
|
||||||
@@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
|||||||
},
|
},
|
||||||
"gguf_sources": model.get("gguf_sources", []),
|
"gguf_sources": model.get("gguf_sources", []),
|
||||||
"context_length": model.get("context_length", 4096),
|
"context_length": model.get("context_length", 4096),
|
||||||
|
"release_date": model.get("release_date", ""),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -398,6 +424,10 @@ SORT_KEYS = {
|
|||||||
"vram": lambda r: r["required_gb"],
|
"vram": lambda r: r["required_gb"],
|
||||||
"params": lambda r: r["params_b"],
|
"params": lambda r: r["params_b"],
|
||||||
"context": lambda r: r["context"],
|
"context": lambda r: r["context"],
|
||||||
|
# Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
|
||||||
|
# string sort is chronological. Missing dates sort last (empty < any date,
|
||||||
|
# and we sort reverse=True for newest, so "" lands at the bottom).
|
||||||
|
"newest": lambda r: r.get("release_date") or "",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
|||||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||||
rocm = system_backend == "rocm"
|
rocm = system_backend == "rocm"
|
||||||
|
|
||||||
|
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
|
||||||
|
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
|
||||||
|
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
|
||||||
|
# are largely unsupported there and FP8 needs out-of-tree patches. So treat
|
||||||
|
# consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
|
||||||
|
# Unknown family (no rocminfo) is left untouched to avoid hiding models from
|
||||||
|
# a possibly-capable Instinct box on a misdetect.
|
||||||
|
gpu_family = (system.get("gpu_family") or "").lower()
|
||||||
|
consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
|
||||||
|
|
||||||
for m in models:
|
for m in models:
|
||||||
native_q = m.get("quantization", "")
|
native_q = m.get("quantization", "")
|
||||||
if "nvfp4" in (m.get("name") or "").lower():
|
if "nvfp4" in (m.get("name") or "").lower():
|
||||||
@@ -479,7 +519,12 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
|||||||
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
|
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
|
||||||
# this the Cookbook recommends models the Mac can't run; on CUDA these
|
# this the Cookbook recommends models the Mac can't run; on CUDA these
|
||||||
# stay visible because vLLM serves safetensors directly.
|
# stay visible because vLLM serves safetensors directly.
|
||||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
|
#
|
||||||
|
# Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
|
||||||
|
# servable path, so a model needs a real GGUF to be recommended.
|
||||||
|
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
|
||||||
|
# Radeon that can't actually serve them.
|
||||||
|
if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
@@ -130,6 +131,33 @@ def _detect_nvidia():
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def classify_amd_gfx(gfx):
    """Map an AMD ISA target (e.g. "gfx1200") to ``(gfx, family)``.

    family is one of:
      "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
      "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
      "gcn"     — older GCN/Vega (gfx900/906)
      "unknown" — empty/unrecognized; callers must treat conservatively

    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.

    Args:
        gfx: ISA target string, or None/"" when it could not be detected.
            Case and surrounding whitespace are ignored. A target ID carrying
            feature flags (e.g. "gfx90a:sramecc+:xnack-") is accepted; the
            flags are dropped before classification.

    Returns:
        A ``(gfx, family)`` tuple. ``gfx`` is the normalized lowercase base
        target, or "" when the input does not look like a gfx target at all.
    """
    # Target IDs may append ":feature+/-" flags after the processor name; only
    # the processor name decides the family, so keep what precedes the first ":".
    base = (gfx or "").split(":", 1)[0].strip().lower()
    m = re.fullmatch(r"gfx(\d+[a-f]?)", base)
    if not m:
        return "", "unknown"
    digits = m.group(1)
    if digits[:2] in ("10", "11", "12"):
        return base, "rdna"
    # CDNA must be tested before the generic gfx9 check below, because the
    # Instinct parts share the leading "9" with GCN/Vega.
    if digits in ("908", "90a") or digits[:2] in ("94", "95"):
        return base, "cdna"
    if digits[:1] == "9":
        return base, "gcn"
    # Well-formed target that none of the known families claim.
    return base, "unknown"
|
||||||
|
|
||||||
|
|
||||||
def _detect_amd():
|
def _detect_amd():
|
||||||
"""Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
|
"""Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
|
||||||
and APUs / unified-memory SoCs like Strix Halo (which expose
|
and APUs / unified-memory SoCs like Strix Halo (which expose
|
||||||
@@ -155,6 +183,17 @@ def _detect_amd():
|
|||||||
except Exception:
|
except Exception:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def _amd_arch():
    """Return the AMD GPU ``(gfx, family)`` pair, best-effort, from rocminfo.

    Runs ``rocminfo`` from PATH first and falls back to the
    ``/opt/rocm/bin/rocminfo`` location. The first ``gfx...`` token found in
    the output is taken as the GPU ISA (GPU agents report a `Name: gfxNNNN`
    line, while CPU agents report a brand string rather than a gfx target).
    When nothing usable comes back, an empty target is classified, which
    yields the "unknown" family — see classify_amd_gfx.
    """
    output = _run(["rocminfo"])
    if not output:
        output = _run(["/opt/rocm/bin/rocminfo"])
    if not output:
        output = ""
    found = re.search(r"gfx\d+[a-f]?", output)
    target = found.group(0) if found else ""
    return classify_amd_gfx(target)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cards = []
|
cards = []
|
||||||
is_apu = False
|
is_apu = False
|
||||||
@@ -187,6 +226,7 @@ def _detect_amd():
|
|||||||
return None
|
return None
|
||||||
total_vram = sum(c["vram_gb"] for c in cards)
|
total_vram = sum(c["vram_gb"] for c in cards)
|
||||||
groups = _group_gpus(cards)
|
groups = _group_gpus(cards)
|
||||||
|
gfx, family = _amd_arch()
|
||||||
# NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
|
# NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
|
||||||
# is the real usable GPU memory — it's physically backed but reserved
|
# is the real usable GPU memory — it's physically backed but reserved
|
||||||
# by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
|
# by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
|
||||||
@@ -200,6 +240,13 @@ def _detect_amd():
|
|||||||
"homogeneous": len(groups) <= 1,
|
"homogeneous": len(groups) <= 1,
|
||||||
"backend": "rocm",
|
"backend": "rocm",
|
||||||
"unified_memory": is_apu,
|
"unified_memory": is_apu,
|
||||||
|
# AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
|
||||||
|
# where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
|
||||||
|
# (RDNA, where the practical path is GGUF via llama.cpp). Empty/
|
||||||
|
# "unknown" when rocminfo isn't available — callers must treat
|
||||||
|
# unknown conservatively, not assume vLLM works.
|
||||||
|
"gpu_arch": gfx,
|
||||||
|
"gpu_family": family,
|
||||||
}
|
}
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|||||||
195
tests/test_hwfit_amd.py
Normal file
195
tests/test_hwfit_amd.py
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
"""AMD ROCm support for Cookbook hardware-fit.
|
||||||
|
|
||||||
|
Consumer AMD Radeon (RDNA: gfx10/11/12) can realistically only serve GGUF via
|
||||||
|
llama.cpp — vLLM/SGLang on ROCm are validated for datacenter Instinct (CDNA,
|
||||||
|
gfx9xx), not consumer cards, where AWQ kernels are largely unsupported and FP8
|
||||||
|
needs out-of-tree patches. These tests lock in that consumer RDNA is treated
|
||||||
|
like Apple Silicon (GGUF-only recommendations) while datacenter CDNA and
|
||||||
|
unknown-family AMD are left untouched, and that CUDA is unchanged.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from services.hwfit import hardware
|
||||||
|
from services.hwfit.fit import rank_models
|
||||||
|
from services.hwfit.models import get_models
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_system(family="rdna", ram_gb=32.0, vram_gb=16.0):
|
||||||
|
return {
|
||||||
|
"has_gpu": True,
|
||||||
|
"backend": "rocm",
|
||||||
|
"gpu_name": "AMD Radeon RX 9060 XT" if family == "rdna" else "AMD Instinct MI300X",
|
||||||
|
"gpu_vram_gb": vram_gb,
|
||||||
|
"gpu_count": 1,
|
||||||
|
"available_ram_gb": ram_gb * 0.7,
|
||||||
|
"total_ram_gb": ram_gb,
|
||||||
|
"gpu_arch": "gfx1200" if family == "rdna" else "gfx942",
|
||||||
|
"gpu_family": family,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_system():
|
||||||
|
return {
|
||||||
|
"has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
|
||||||
|
"gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_only_gguf_models_recommended_on_consumer_rdna():
    """llama.cpp (GGUF) is the servable path on consumer Radeon, so every model
    recommended on RDNA must ship a real GGUF — no vLLM-only AWQ/GPTQ/FP8."""
    catalog = {m["name"]: m for m in get_models()}
    ranked = rank_models(_rocm_system(family="rdna"), limit=900)
    # Guard against a vacuous pass: an empty ranking trivially contains no
    # non-GGUF entries, which would hide a filter that drops everything.
    assert ranked, "rank_models returned nothing for consumer RDNA"
    unservable = []
    for r in ranked:
        entry = catalog.get(r["name"], {})
        if not (entry.get("is_gguf") or entry.get("gguf_sources")):
            unservable.append(r["name"])
    assert unservable == [], f"{len(unservable)} non-GGUF models on RDNA, e.g. {unservable[:3]}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_safetensors_models_still_recommended_on_cdna():
    """The GGUF-only rule applies to consumer RDNA only: on datacenter Instinct
    (CDNA), where vLLM/SGLang run on ROCm, a non-GGUF repo must stay listed."""
    ranked = rank_models(_rocm_system(family="cdna"), limit=900)
    recommended = [r["name"] for r in ranked]
    assert "microsoft/Phi-mini-MoE-instruct" in recommended
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_amd_family_not_filtered():
    """With family 'unknown' (rocminfo unavailable) non-GGUF models must stay
    visible, so a possibly-capable Instinct box loses nothing on a misdetect."""
    ranked = rank_models(_rocm_system(family="unknown"), limit=900)
    recommended = [r["name"] for r in ranked]
    assert "microsoft/Phi-mini-MoE-instruct" in recommended
|
||||||
|
|
||||||
|
|
||||||
|
def test_safetensors_models_still_recommended_on_cuda():
    """Regression guard: CUDA rankings must still include non-GGUF repos, i.e.
    the GGUF-only rule has not leaked onto CUDA."""
    ranked = rank_models(_cuda_system(), limit=900)
    recommended = [r["name"] for r in ranked]
    assert "microsoft/Phi-mini-MoE-instruct" in recommended
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_amd_gfx_rdna_vs_cdna():
    """Each gfx target must land in the right family: consumer RDNA
    (gfx10/11/12), datacenter CDNA (gfx9xx Instinct), older GCN, or unknown.
    For recognized targets the returned gfx string must echo the input."""
    table = [
        ("gfx1200", "rdna"),   # RX 9060 XT (RDNA4)
        ("gfx1201", "rdna"),   # RX 9070 (RDNA4)
        ("gfx1100", "rdna"),   # RX 7900 (RDNA3)
        ("gfx1030", "rdna"),   # RX 6800 (RDNA2)
        ("gfx942", "cdna"),    # MI300 (CDNA3)
        ("gfx950", "cdna"),    # MI350 (CDNA4)
        ("gfx90a", "cdna"),    # MI200 (CDNA2)
        ("gfx908", "cdna"),    # MI100 (CDNA1)
        ("gfx906", "gcn"),     # Radeon VII / MI50 (GCN5/Vega)
        ("", "unknown"),
        ("gfx", "unknown"),
    ]
    for gfx, expected_family in table:
        result = hardware.classify_amd_gfx(gfx)
        out_gfx = result[0]
        family = result[1]
        assert family == expected_family, f"{gfx} -> {family}, expected {expected_family}"
        if expected_family == "unknown":
            continue
        assert out_gfx == gfx
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_amd_reports_family(monkeypatch):
    """_detect_amd must expose gpu_family / gpu_arch taken from rocminfo so
    fit/serve can branch on consumer-RDNA vs datacenter-CDNA.

    The fake rocminfo output lists the CPU agent first and the GPU's gfx target
    second. Everything goes through the remote-read path, so no real sysfs is
    touched."""
    rocminfo_out = " Name: AMD Ryzen 7 3700X\n Name: gfx1200\n Marketing Name: AMD Radeon RX 9060 XT\n"

    # Canned sysfs file contents, keyed by the path suffix that `cat` is asked for.
    sysfs_values = {
        "/vendor": "0x1002",
        "/mem_info_vram_total": str(16 * 1024**3),
        "/product_name": "AMD Radeon RX 9060 XT",
    }

    def fake_run(cmd):
        if not cmd:
            return None
        program = cmd[0]
        if "rocminfo" in program:
            return rocminfo_out
        if program == "ls":
            return "card1\ncard1-DP-1\nrenderD128"
        if program != "cat":
            return None
        target = cmd[1]
        for suffix, value in sysfs_values.items():
            if target.endswith(suffix):
                return value
        return None

    # _remote_host truthy routes _read/_list_drm_cards through _run (no real sysfs).
    monkeypatch.setattr(hardware, "_remote_host", "fake-host")
    monkeypatch.setattr(hardware, "_run", fake_run)

    info = hardware._detect_amd()
    assert info is not None
    assert info["backend"] == "rocm"
    assert info["gpu_family"] == "rdna"
    assert info["gpu_arch"] == "gfx1200"
|
||||||
|
|
||||||
|
|
||||||
|
def test_consumer_amd_cards_have_real_bandwidth():
    """Consumer AMD cards need an entry in the bandwidth table so speed
    estimates use real VRAM bandwidth instead of the crude rocm FALLBACK_K
    constant. The RX 9060 XT used to be missing, which skewed its estimates."""
    from services.hwfit.fit import _lookup_bandwidth

    # Card name -> lowest bandwidth (GB/s) the lookup is allowed to report.
    floors = {
        "AMD Radeon RX 9060 XT": 300,
        "AMD Radeon RX 9070 XT": 600,
        "AMD Radeon RX 7900 XTX": 900,
    }
    for name, expected_min in floors.items():
        bw = _lookup_bandwidth(name)
        assert bw and bw >= expected_min, f"{name}: {bw} GB/s (expected >= {expected_min})"
|
||||||
|
|
||||||
|
|
||||||
|
def test_9060xt_speed_estimate_is_realistic():
    """Calibration guard: a small MoE running fully on a 9060 XT at Q4 must
    estimate inside a believable band rather than the absurd values the
    missing-bandwidth fallback produced.

    Measured reference: DeepSeek-Coder-V2-Lite Q4 ~60-86 t/s on this card."""
    from services.hwfit.fit import _estimate_speed

    model = {
        "name": "DeepSeek-Coder-V2-Lite-Instruct",
        "parameter_count": "16B",
        "is_moe": True,
        "active_parameters": 2_400_000_000,
    }
    system = {
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT",
        "gpu_vram_gb": 15.9,
    }
    tps = _estimate_speed(model, "Q4_K_M", "gpu", system)
    lower, upper = 40, 130
    assert lower <= tps <= upper, f"unrealistic estimate: {tps} t/s"
|
||||||
|
|
||||||
|
|
||||||
|
def test_offload_is_slower_than_full_gpu():
    """Estimates must fall strictly as more of the model spills to CPU: full
    GPU beats light offload, which beats heavy offload. This pins the blended
    bandwidth model rather than a flat halving."""
    from services.hwfit.fit import _estimate_speed

    model = {
        "name": "X",
        "parameter_count": "35B",
        "is_moe": True,
        "active_parameters": 3_000_000_000,
    }
    system = {
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT",
        "gpu_vram_gb": 15.9,
    }

    def estimate(run_mode, **extra):
        return _estimate_speed(model, "Q4_K_M", run_mode, system, **extra)

    full = estimate("gpu")
    light = estimate("cpu_offload", offload_frac=0.2)
    heavy = estimate("cpu_offload", offload_frac=0.6)
    assert full > light and light > heavy, (full, light, heavy)
|
||||||
|
|
||||||
|
|
||||||
|
def test_sort_by_newest_orders_by_release_date():
    """sort='newest' orders results by release_date descending (newest first),
    with undated models sorted last."""
    system = {
        "backend": "rocm",
        "gpu_name": "AMD Radeon RX 9060 XT",
        "gpu_vram_gb": 15.9,
        "gpu_family": "rdna",
        "gpu_count": 1,
        "available_ram_gb": 22.0,
        "total_ram_gb": 31.0,
    }
    res = rank_models(system, sort="newest", limit=50)

    # Dates that are present must be in descending order.
    dated = [r.get("release_date") for r in res if r.get("release_date")]
    assert dated == sorted(dated, reverse=True), "release dates not descending"

    # Undated entries must form a suffix: once the first blank appears, no
    # dated entry may follow it. Expressed as one real assertion instead of an
    # `assert False` buried in a flag-tracking loop.
    has_date = [bool(r.get("release_date")) for r in res]
    first_blank = has_date.index(False) if False in has_date else len(has_date)
    assert not any(has_date[first_blank:]), "a dated model appeared after an undated one"
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_vendor_specific_formats_on_consumer_rdna():
    """No repo whose name marks it as NVIDIA NVFP4, Apple MLX, or a vLLM-only
    FP8/AWQ/GPTQ build may be recommended on consumer RDNA, even though such
    repos DO exist in the catalog. This checks the format filter by name, not
    just via is_gguf."""
    import re

    bad = re.compile(r"NVFP4|FP8|FP4|-MLX-|\bMLX\b|AWQ|GPTQ", re.IGNORECASE)
    ranked = rank_models(_rocm_system(family="rdna"), limit=900)
    names = [r["name"] for r in ranked]
    offenders = list(filter(bad.search, names))
    assert offenders == [], f"non-runnable formats recommended on RDNA: {offenders[:5]}"

    # Guard against a vacuous test: such formats must actually be in the catalog.
    catalog_has_bad_format = any(bad.search(m["name"]) for m in get_models())
    assert catalog_has_bad_format, "catalog has no NVFP4/MLX/FP8 repos — test would be vacuous"
|
||||||
Reference in New Issue
Block a user