fix(hwfit): honor manual "metal" backend in the hardware simulator (#1090)
The Cookbook's manual hardware simulator ("what if I had this setup") let users
pick a backend, but _apply_manual_hardware only accepted cuda/rocm/cpu_x86/
cpu_arm and silently coerced anything else to cuda. So selecting Apple/Metal
simulated a CUDA box instead — and ranked safetensors-only repos a Mac can't
serve, even though the rest of hwfit (services.hwfit.fit, the serve-command
generation) already supports Metal as GGUF-only via llama.cpp/Ollama.
Add "metal" to the accepted backends (now a named _MANUAL_BACKENDS set, kept a
subset of what fit.py understands) and set unified_memory=True for it — Apple
Silicon shares one memory pool with the GPU — while clearing that flag for the
discrete (cuda/rocm) and CPU backends. _apply_manual_hardware is lifted to
module scope so it is directly unit-testable; both route call sites are
unchanged.
Adds tests/test_hwfit_manual_backend.py, including an end-to-end check that a
simulated Metal box only recommends GGUF-servable models.
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -4,85 +4,102 @@ from copy import deepcopy
|
||||
from fastapi import APIRouter
|
||||
|
||||
|
||||
# Backends the manual hardware simulator accepts. Must stay a subset of what
# services.hwfit.fit understands so a simulated box ranks like a real one:
# "metal" routes through the Apple-Silicon path (GGUF-only, llama.cpp/Ollama),
# the CPU backends through the RAM/offload path, cuda/rocm through vLLM.
_MANUAL_BACKENDS = {"cuda", "rocm", "metal", "cpu_x86", "cpu_arm"}


def _parse_manual_number(raw, default, cast=float):
    """Parse one user-supplied numeric field, falling back to *default*.

    Args:
        raw: The raw field value (normally a query-string str). Any falsy
            value ("", None, 0) means "not provided".
        default: Value returned when *raw* is missing or unusable.
        cast: Conversion to apply, ``float`` or ``int``.

    Returns:
        The converted number, or *default* when *raw* is empty, cannot be
        converted, or converts to a non-finite float.
    """
    import math

    if not raw:
        return default
    try:
        value = cast(raw)
    except (TypeError, ValueError, OverflowError):
        return default
    # float() happily parses "inf", "nan" and "Infinity". Treat those like
    # any other unparseable input instead of letting an infinite VRAM/RAM
    # figure flow into the system dict (Infinity/NaN are not valid JSON).
    if isinstance(value, float) and not math.isfinite(value):
        return default
    return value


def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""):
    """Overlay a "what if I had this setup" simulation onto *system*.

    Manual hardware REPLACES the detected hardware entirely instead of
    adding to it. The previous additive behavior averaged the manual VRAM
    across all GPUs (base + manual), which meant adding "1× 400 GB" on top
    of "2× 70 GB" only nudged the per-GPU cap from 70 to 180 GB
    (= 540 / 3), so GGUF models bigger than that still didn't surface
    — exactly the "cap stuck at detected level" bug the user hit.

    Args:
        system: Detected-hardware dict. It is mutated in place.
        manual_mode: "gpu" or "ram" (case-insensitive). Anything else
            disables the simulator and *system* is returned untouched.
        manual_gpu_count: Number of simulated GPUs; clamped to 1..16,
            defaults to 1 when missing or unparseable.
        manual_vram_gb: VRAM per simulated GPU in GB; floored at 1.0,
            defaults to 8.0 when missing, unparseable or non-finite.
        manual_ram_gb: TOTAL system RAM to simulate in GB. Missing,
            unparseable, non-finite or non-positive values leave the
            detected RAM figures alone.
        manual_backend: One of ``_MANUAL_BACKENDS``. When empty the
            detected backend is used; unknown names fall back to "cuda".

    Returns:
        The same *system* dict, updated for the requested simulation.
    """
    manual_mode = (manual_mode or "").lower()
    if manual_mode not in {"gpu", "ram"}:
        return system

    override_ram_gb = max(0.0, _parse_manual_number(manual_ram_gb, 0.0))
    if override_ram_gb:
        # Replace RAM, don't add. The number in the field is the
        # TOTAL system memory the user wants to simulate.
        system["available_ram_gb"] = round(override_ram_gb, 1)
        system["total_ram_gb"] = round(override_ram_gb, 1)
    system["manual_hardware"] = True

    if manual_mode == "ram":
        # RAM-only simulation — wipe GPU entirely so the ranker uses
        # CPU/RAM paths.
        system["has_gpu"] = False
        system["gpu_name"] = None
        system["gpu_vram_gb"] = 0
        system["gpu_count"] = 0
        system["gpus"] = []
        system["gpu_groups"] = []
        system["backend"] = "cpu_x86"
        system.pop("unified_memory", None)
        return system

    count = max(1, min(_parse_manual_number(manual_gpu_count, 1, cast=int), 16))
    vram_each = max(1.0, _parse_manual_number(manual_vram_gb, 8.0))

    # Tolerate stray whitespace/case in the backend name before checking it
    # against the whitelist; anything still unknown simulates a CUDA box.
    backend = (manual_backend or system.get("backend") or "cuda").strip().lower()
    if backend not in _MANUAL_BACKENDS:
        backend = "cuda"

    total_vram = round(vram_each * count, 1)
    gpu_name = f"Simulated {backend.upper()} GPU" + (f" × {count}" if count > 1 else "")
    system["has_gpu"] = True
    system["gpu_name"] = gpu_name
    system["gpu_vram_gb"] = total_vram
    system["gpu_count"] = count
    system["gpus"] = [
        {"index": i, "name": gpu_name, "vram_gb": vram_each}
        for i in range(count)
    ]
    # Single homogeneous pool — vram_each here is the ACTUAL per-GPU
    # VRAM the user entered, not an average. That's the whole point:
    # raising vram_each lifts the per-GPU cap (GGUF, tensor-parallel
    # math) all the way up, not just by a small fraction.
    system["gpu_groups"] = [{
        "name": gpu_name,
        "vram_each": vram_each,
        "count": count,
        "indices": list(range(count)),
        "vram_total": total_vram,
    }]
    system["homogeneous"] = True
    system["backend"] = backend
    # Apple Silicon shares one unified memory pool with the GPU; flag it so
    # the API/UI report it the way real Metal detection does. Discrete GPUs
    # (cuda/rocm) and the CPU backends carry separate VRAM, so clear any
    # stale flag a previous detection left on the dict.
    if backend == "metal":
        system["unified_memory"] = True
    else:
        system.pop("unified_memory", None)
    return system
|
||||
|
||||
|
||||
def setup_hwfit_routes():
|
||||
router = APIRouter(prefix="/api/hwfit", tags=["hwfit"])
|
||||
|
||||
def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""):
|
||||
"""Manual hardware is a "what if I had this setup" simulator —
|
||||
REPLACES the detected hardware entirely instead of adding to it.
|
||||
|
||||
The previous additive behavior averaged the manual VRAM across
|
||||
all GPUs (base + manual), which meant adding "1× 400 GB" on top
|
||||
of "2× 70 GB" only nudged the per-GPU cap from 70 to 180 GB
|
||||
(= 540 / 3), so GGUF models bigger than that still didn't surface
|
||||
— exactly the "cap stuck at detected level" bug the user hit.
|
||||
"""
|
||||
manual_mode = (manual_mode or "").lower()
|
||||
if manual_mode not in {"gpu", "ram"}:
|
||||
return system
|
||||
|
||||
try:
|
||||
override_ram_gb = float(manual_ram_gb) if manual_ram_gb else 0
|
||||
except ValueError:
|
||||
override_ram_gb = 0
|
||||
override_ram_gb = max(0.0, override_ram_gb)
|
||||
if override_ram_gb:
|
||||
# Replace RAM, don't add. The number in the field is the
|
||||
# TOTAL system memory the user wants to simulate.
|
||||
system["available_ram_gb"] = round(override_ram_gb, 1)
|
||||
system["total_ram_gb"] = round(override_ram_gb, 1)
|
||||
system["manual_hardware"] = True
|
||||
|
||||
if manual_mode == "ram":
|
||||
# RAM-only simulation — wipe GPU entirely so the ranker uses
|
||||
# CPU/RAM paths.
|
||||
system["has_gpu"] = False
|
||||
system["gpu_name"] = None
|
||||
system["gpu_vram_gb"] = 0
|
||||
system["gpu_count"] = 0
|
||||
system["gpus"] = []
|
||||
system["gpu_groups"] = []
|
||||
system["backend"] = "cpu_x86"
|
||||
return system
|
||||
|
||||
try:
|
||||
count = int(manual_gpu_count) if manual_gpu_count else 1
|
||||
except ValueError:
|
||||
count = 1
|
||||
try:
|
||||
vram_each = float(manual_vram_gb) if manual_vram_gb else 8.0
|
||||
except ValueError:
|
||||
vram_each = 8.0
|
||||
count = max(1, min(count, 16))
|
||||
vram_each = max(1.0, vram_each)
|
||||
backend = (manual_backend or system.get("backend") or "cuda").lower()
|
||||
if backend not in {"cuda", "rocm", "cpu_x86", "cpu_arm"}:
|
||||
backend = "cuda"
|
||||
total_vram = round(vram_each * count, 1)
|
||||
gpu_name = f"Simulated {backend.upper()} GPU" + (f" × {count}" if count > 1 else "")
|
||||
system["has_gpu"] = True
|
||||
system["gpu_name"] = gpu_name
|
||||
system["gpu_vram_gb"] = total_vram
|
||||
system["gpu_count"] = count
|
||||
system["gpus"] = [
|
||||
{"index": i, "name": gpu_name, "vram_gb": vram_each}
|
||||
for i in range(count)
|
||||
]
|
||||
# Single homogeneous pool — vram_each here is the ACTUAL per-GPU
|
||||
# VRAM the user entered, not an average. That's the whole point:
|
||||
# raising vram_each lifts the per-GPU cap (GGUF, tensor-parallel
|
||||
# math) all the way up, not just by a small fraction.
|
||||
system["gpu_groups"] = [{
|
||||
"name": gpu_name,
|
||||
"vram_each": vram_each,
|
||||
"count": count,
|
||||
"indices": list(range(count)),
|
||||
"vram_total": total_vram,
|
||||
}]
|
||||
system["homogeneous"] = True
|
||||
system["backend"] = backend
|
||||
return system
|
||||
|
||||
@router.get("/system")
|
||||
def get_system(host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False):
|
||||
"""Detect and return current system hardware info. Pass host=user@server for remote.
|
||||
|
||||
85
tests/test_hwfit_manual_backend.py
Normal file
85
tests/test_hwfit_manual_backend.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Manual hardware simulator backend handling (Cookbook "what if I had…").
|
||||
|
||||
`_apply_manual_hardware` replaces detected hardware with a user-described box so
|
||||
the Cookbook can rank models against hardware you don't have yet. These pin that
|
||||
the accepted backends stay in lock-step with what services.hwfit.fit can rank —
|
||||
notably that "metal" is honoured (Apple Silicon is GGUF-only via llama.cpp /
|
||||
Ollama) instead of being silently coerced to CUDA.
|
||||
"""
|
||||
|
||||
from routes.hwfit_routes import _apply_manual_hardware, _MANUAL_BACKENDS
|
||||
from services.hwfit.fit import rank_models
|
||||
from services.hwfit.models import get_models
|
||||
|
||||
|
||||
def test_no_manual_mode_leaves_system_untouched():
    """An empty or unrecognised manual_mode must hand the system back as-is."""
    detected = {"backend": "cuda", "gpu_vram_gb": 24.0, "has_gpu": True}
    for mode in ("", "bogus"):
        # Pass a copy so an accidental in-place edit shows up in the compare.
        assert _apply_manual_hardware(dict(detected), manual_mode=mode) == detected
|
||||
|
||||
|
||||
def test_manual_metal_backend_is_accepted():
    """'metal' must come back as 'metal' rather than being rewritten to
    'cuda', and the simulated box must be flagged as unified memory."""
    simulated = _apply_manual_hardware(
        {},
        manual_mode="gpu",
        manual_vram_gb="24",
        manual_backend="metal",
    )
    assert simulated["backend"] == "metal"
    assert simulated["unified_memory"] is True
    assert simulated["has_gpu"] is True
    assert "METAL" in simulated["gpu_name"]
|
||||
|
||||
|
||||
def test_manual_metal_vram_and_count_math():
    """Two simulated 24 GB devices: per-device VRAM stays 24, totals are 48."""
    simulated = _apply_manual_hardware(
        {},
        manual_mode="gpu",
        manual_gpu_count="2",
        manual_vram_gb="24",
        manual_backend="metal",
    )
    assert simulated["gpu_count"] == 2
    assert simulated["gpu_vram_gb"] == 48.0
    assert len(simulated["gpus"]) == 2

    group = simulated["gpu_groups"][0]
    assert group["vram_each"] == 24.0
    assert group["count"] == 2
    assert group["vram_total"] == 48.0
|
||||
|
||||
|
||||
def test_manual_backend_whitelist_matches_fit_backends():
    """Guard against drift in the manual-backend whitelist.

    Pins the exact accepted set, so adding or removing a backend forces a
    deliberate update here, and checks that every whitelisted backend is
    actually honoured by _apply_manual_hardware instead of being coerced to
    the fallback.
    """
    assert _MANUAL_BACKENDS == {"cuda", "rocm", "metal", "cpu_x86", "cpu_arm"}
    # sorted() keeps the failure order deterministic across runs.
    for backend in sorted(_MANUAL_BACKENDS):
        s = _apply_manual_hardware({}, manual_mode="gpu", manual_backend=backend)
        assert s["backend"] == backend, f"{backend!r} was rewritten to {s['backend']!r}"
|
||||
|
||||
|
||||
def test_unknown_manual_backend_falls_back_to_cuda():
    """A backend outside the whitelist simulates a CUDA box, with no
    unified-memory flag set."""
    simulated = _apply_manual_hardware(
        {},
        manual_mode="gpu",
        manual_backend="tpu",
    )
    assert simulated["backend"] == "cuda"
    assert "unified_memory" not in simulated
|
||||
|
||||
|
||||
def test_manual_rocm_and_cuda_are_not_unified_memory():
    """Simulating a discrete-GPU backend must drop a stale unified_memory flag."""
    for discrete in ("cuda", "rocm"):
        # Start from a dict that still carries the flag from an earlier detection.
        stale = {"unified_memory": True}
        simulated = _apply_manual_hardware(stale, manual_mode="gpu", manual_backend=discrete)
        assert simulated["backend"] == discrete
        # Discrete GPUs are not unified memory — a stale flag must be cleared.
        assert "unified_memory" not in simulated
|
||||
|
||||
|
||||
def test_manual_ram_mode_wipes_gpu_and_unified_flag():
    """RAM mode yields a GPU-less cpu_x86 box with the requested RAM and no
    unified-memory flag left over."""
    simulated = _apply_manual_hardware(
        {"unified_memory": True},
        manual_mode="ram",
        manual_ram_gb="64",
    )
    assert simulated["has_gpu"] is False
    assert simulated["backend"] == "cpu_x86"
    assert simulated["gpu_vram_gb"] == 0
    assert simulated["total_ram_gb"] == 64.0
    assert "unified_memory" not in simulated
|
||||
|
||||
|
||||
def test_simulated_metal_box_only_recommends_gguf():
    """End-to-end: a simulated Metal box must rank exactly like a real Mac —
    only models shipping a servable GGUF (llama.cpp/Ollama) survive. Before
    'metal' was accepted, this box ranked as CUDA and surfaced safetensors-only
    repos the Mac can't serve."""
    system = _apply_manual_hardware(
        {"backend": "cuda", "available_ram_gb": 32.0, "total_ram_gb": 64.0},
        manual_mode="gpu", manual_vram_gb="48", manual_backend="metal",
    )
    catalog = {m["name"]: m for m in get_models()}
    ranked = list(rank_models(system, limit=900))
    # Without this guard the GGUF check below passes vacuously whenever the
    # Metal path ranks nothing at all.
    # NOTE(review): assumes the catalog holds at least one GGUF-servable model
    # that fits this box — confirm against the shipped catalog.
    assert ranked, "simulated Metal box ranked no models at all"

    def _is_gguf_servable(name):
        # A model counts as servable if it is a GGUF itself or lists GGUF sources.
        entry = catalog.get(name, {})
        return bool(entry.get("is_gguf") or entry.get("gguf_sources"))

    unservable = [r["name"] for r in ranked if not _is_gguf_servable(r["name"])]
    assert unservable == [], f"{len(unservable)} non-GGUF models on simulated Metal, e.g. {unservable[:3]}"
|
||||
Reference in New Issue
Block a user