fix(hwfit): honor manual "metal" backend in the hardware simulator (#1090)

The Cookbook's manual hardware simulator ("what if I had this setup") let users pick a backend, but _apply_manual_hardware only accepted cuda/rocm/cpu_x86/cpu_arm and silently coerced anything else to cuda. So selecting Apple/Metal simulated a CUDA box instead — and ranked safetensors-only repos a Mac can't serve, even though the rest of hwfit (services.hwfit.fit, the serve-command generation) already supports Metal as GGUF-only via llama.cpp/Ollama. Add "metal" to the accepted backends (now a named _MANUAL_BACKENDS set, kept a subset of what fit.py understands) and set unified_memory=True for it — Apple Silicon shares one memory pool with the GPU — while clearing that flag for the discrete (cuda/rocm) and CPU backends. _apply_manual_hardware is lifted to module scope so it is directly unit-testable; both route call sites are unchanged. Adds tests/test_hwfit_manual_backend.py, including an end-to-end check that a simulated Metal box only recommends GGUF-servable models. Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 10:12:34 -04:00
parent c7ddfd7dd2
commit 16f7feee0a
2 changed files with 178 additions and 76 deletions
--- a/routes/hwfit_routes.py
+++ b/routes/hwfit_routes.py
@@ -4,10 +4,14 @@ from copy import deepcopy
 from fastapi import APIRouter


-def setup_hwfit_routes():
-    router = APIRouter(prefix="/api/hwfit", tags=["hwfit"])
+# Backends the manual hardware simulator accepts. Must stay a subset of what
+# services.hwfit.fit understands so a simulated box ranks like a real one:
+# "metal" routes through the Apple-Silicon path (GGUF-only, llama.cpp/Ollama),
+# the CPU backends through the RAM/offload path, cuda/rocm through vLLM.
+_MANUAL_BACKENDS = {"cuda", "rocm", "metal", "cpu_x86", "cpu_arm"}

-    def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""):
+
+def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""):
    """Manual hardware is a "what if I had this setup" simulator —
    REPLACES the detected hardware entirely instead of adding to it.

@@ -43,6 +47,7 @@ def setup_hwfit_routes():
        system["gpus"] = []
        system["gpu_groups"] = []
        system["backend"] = "cpu_x86"
+        system.pop("unified_memory", None)
        return system

    try:
@@ -56,7 +61,7 @@ def setup_hwfit_routes():
    count = max(1, min(count, 16))
    vram_each = max(1.0, vram_each)
    backend = (manual_backend or system.get("backend") or "cuda").lower()
-        if backend not in {"cuda", "rocm", "cpu_x86", "cpu_arm"}:
+    if backend not in _MANUAL_BACKENDS:
        backend = "cuda"
    total_vram = round(vram_each * count, 1)
    gpu_name = f"Simulated {backend.upper()} GPU" + (f" × {count}" if count > 1 else "")
@@ -81,8 +86,20 @@ def setup_hwfit_routes():
    }]
    system["homogeneous"] = True
    system["backend"] = backend
+    # Apple Silicon shares one unified memory pool with the GPU; flag it so
+    # the API/UI report it the way real Metal detection does. Discrete GPUs
+    # (cuda/rocm) and the CPU backends carry separate VRAM, so clear any
+    # stale flag a previous detection left on the dict.
+    if backend == "metal":
+        system["unified_memory"] = True
+    else:
+        system.pop("unified_memory", None)
    return system

+
+def setup_hwfit_routes():
+    router = APIRouter(prefix="/api/hwfit", tags=["hwfit"])
+
    @router.get("/system")
    def get_system(host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False):
        """Detect and return current system hardware info. Pass host=user@server for remote.
--- a/tests/test_hwfit_manual_backend.py
+++ b/tests/test_hwfit_manual_backend.py
@@ -0,0 +1,85 @@
+"""Manual hardware simulator backend handling (Cookbook "what if I had…").
+
+`_apply_manual_hardware` replaces detected hardware with a user-described box so
+the Cookbook can rank models against hardware you don't have yet. These pin that
+the accepted backends stay in lock-step with what services.hwfit.fit can rank —
+notably that "metal" is honoured (Apple Silicon is GGUF-only via llama.cpp /
+Ollama) instead of being silently coerced to CUDA.
+"""
+
+from routes.hwfit_routes import _apply_manual_hardware, _MANUAL_BACKENDS
+from services.hwfit.fit import rank_models
+from services.hwfit.models import get_models
+
+
+def test_no_manual_mode_leaves_system_untouched():
+    base = {"backend": "cuda", "gpu_vram_gb": 24.0, "has_gpu": True}
+    assert _apply_manual_hardware(dict(base), manual_mode="") == base
+    assert _apply_manual_hardware(dict(base), manual_mode="bogus") == base
+
+
+def test_manual_metal_backend_is_accepted():
+    """The whole point of this change: 'metal' must survive instead of being
+    rewritten to 'cuda', so the simulated Mac ranks through the Apple path."""
+    s = _apply_manual_hardware({}, manual_mode="gpu", manual_vram_gb="24", manual_backend="metal")
+    assert s["backend"] == "metal"
+    assert s["unified_memory"] is True
+    assert s["has_gpu"] is True
+    assert "METAL" in s["gpu_name"]
+
+
+def test_manual_metal_vram_and_count_math():
+    s = _apply_manual_hardware({}, manual_mode="gpu", manual_gpu_count="2", manual_vram_gb="24", manual_backend="metal")
+    assert s["gpu_count"] == 2
+    assert s["gpu_vram_gb"] == 48.0
+    assert len(s["gpus"]) == 2
+    grp = s["gpu_groups"][0]
+    assert grp["vram_each"] == 24.0
+    assert grp["count"] == 2
+    assert grp["vram_total"] == 48.0
+
+
+def test_manual_backend_whitelist_matches_fit_backends():
+    """Guard against drift: every manual backend must be one fit.py understands."""
+    assert _MANUAL_BACKENDS == {"cuda", "rocm", "metal", "cpu_x86", "cpu_arm"}
+
+
+def test_unknown_manual_backend_falls_back_to_cuda():
+    s = _apply_manual_hardware({}, manual_mode="gpu", manual_backend="tpu")
+    assert s["backend"] == "cuda"
+    assert "unified_memory" not in s
+
+
+def test_manual_rocm_and_cuda_are_not_unified_memory():
+    for backend in ("cuda", "rocm"):
+        s = _apply_manual_hardware({"unified_memory": True}, manual_mode="gpu", manual_backend=backend)
+        assert s["backend"] == backend
+        # Discrete GPUs are not unified memory — a stale flag must be cleared.
+        assert "unified_memory" not in s
+
+
+def test_manual_ram_mode_wipes_gpu_and_unified_flag():
+    s = _apply_manual_hardware({"unified_memory": True}, manual_mode="ram", manual_ram_gb="64")
+    assert s["has_gpu"] is False
+    assert s["backend"] == "cpu_x86"
+    assert s["gpu_vram_gb"] == 0
+    assert s["total_ram_gb"] == 64.0
+    assert "unified_memory" not in s
+
+
+def test_simulated_metal_box_only_recommends_gguf():
+    """End-to-end: a simulated Metal box must rank exactly like a real Mac —
+    only models shipping a servable GGUF (llama.cpp/Ollama) survive. Before
+    'metal' was accepted, this box ranked as CUDA and surfaced safetensors-only
+    repos the Mac can't serve."""
+    system = _apply_manual_hardware(
+        {"backend": "cuda", "available_ram_gb": 32.0, "total_ram_gb": 64.0},
+        manual_mode="gpu", manual_vram_gb="48", manual_backend="metal",
+    )
+    catalog = {m["name"]: m for m in get_models()}
+    unservable = [
+        r["name"] for r in rank_models(system, limit=900)
+        if not (catalog.get(r["name"], {}).get("is_gguf")
+                or catalog.get(r["name"], {}).get("gguf_sources"))
+    ]
+    assert unservable == [], f"{len(unservable)} non-GGUF models on simulated Metal, e.g. {unservable[:3]}"