diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py index c377e5e..22995e8 100644 --- a/routes/hwfit_routes.py +++ b/routes/hwfit_routes.py @@ -4,85 +4,102 @@ from copy import deepcopy from fastapi import APIRouter +# Backends the manual hardware simulator accepts. Must stay a subset of what +# services.hwfit.fit understands so a simulated box ranks like a real one: +# "metal" routes through the Apple-Silicon path (GGUF-only, llama.cpp/Ollama), +# the CPU backends through the RAM/offload path, cuda/rocm through vLLM. +_MANUAL_BACKENDS = {"cuda", "rocm", "metal", "cpu_x86", "cpu_arm"} + + +def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""): + """Manual hardware is a "what if I had this setup" simulator — + REPLACES the detected hardware entirely instead of adding to it. + + The previous additive behavior averaged the manual VRAM across + all GPUs (base + manual), which meant adding "1× 400 GB" on top + of "2× 70 GB" only nudged the per-GPU cap from 70 to 180 GB + (= 540 / 3), so GGUF models bigger than that still didn't surface + — exactly the "cap stuck at detected level" bug the user hit. + """ + manual_mode = (manual_mode or "").lower() + if manual_mode not in {"gpu", "ram"}: + return system + + try: + override_ram_gb = float(manual_ram_gb) if manual_ram_gb else 0 + except ValueError: + override_ram_gb = 0 + override_ram_gb = max(0.0, override_ram_gb) + if override_ram_gb: + # Replace RAM, don't add. The number in the field is the + # TOTAL system memory the user wants to simulate. + system["available_ram_gb"] = round(override_ram_gb, 1) + system["total_ram_gb"] = round(override_ram_gb, 1) + system["manual_hardware"] = True + + if manual_mode == "ram": + # RAM-only simulation — wipe GPU entirely so the ranker uses + # CPU/RAM paths. 
+ system["has_gpu"] = False + system["gpu_name"] = None + system["gpu_vram_gb"] = 0 + system["gpu_count"] = 0 + system["gpus"] = [] + system["gpu_groups"] = [] + system["backend"] = "cpu_x86" + system.pop("unified_memory", None) + return system + + try: + count = int(manual_gpu_count) if manual_gpu_count else 1 + except ValueError: + count = 1 + try: + vram_each = float(manual_vram_gb) if manual_vram_gb else 8.0 + except ValueError: + vram_each = 8.0 + count = max(1, min(count, 16)) + vram_each = max(1.0, vram_each) + backend = (manual_backend or system.get("backend") or "cuda").lower() + if backend not in _MANUAL_BACKENDS: + backend = "cuda" + total_vram = round(vram_each * count, 1) + gpu_name = f"Simulated {backend.upper()} GPU" + (f" × {count}" if count > 1 else "") + system["has_gpu"] = True + system["gpu_name"] = gpu_name + system["gpu_vram_gb"] = total_vram + system["gpu_count"] = count + system["gpus"] = [ + {"index": i, "name": gpu_name, "vram_gb": vram_each} + for i in range(count) + ] + # Single homogeneous pool — vram_each here is the ACTUAL per-GPU + # VRAM the user entered, not an average. That's the whole point: + # raising vram_each lifts the per-GPU cap (GGUF, tensor-parallel + # math) all the way up, not just by a small fraction. + system["gpu_groups"] = [{ + "name": gpu_name, + "vram_each": vram_each, + "count": count, + "indices": list(range(count)), + "vram_total": total_vram, + }] + system["homogeneous"] = True + system["backend"] = backend + # Apple Silicon shares one unified memory pool with the GPU; flag it so + # the API/UI report it the way real Metal detection does. Discrete GPUs + # (cuda/rocm) and the CPU backends carry separate VRAM, so clear any + # stale flag a previous detection left on the dict. 
+ if backend == "metal": + system["unified_memory"] = True + else: + system.pop("unified_memory", None) + return system + + def setup_hwfit_routes(): router = APIRouter(prefix="/api/hwfit", tags=["hwfit"]) - def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""): - """Manual hardware is a "what if I had this setup" simulator — - REPLACES the detected hardware entirely instead of adding to it. - - The previous additive behavior averaged the manual VRAM across - all GPUs (base + manual), which meant adding "1× 400 GB" on top - of "2× 70 GB" only nudged the per-GPU cap from 70 to 180 GB - (= 540 / 3), so GGUF models bigger than that still didn't surface - — exactly the "cap stuck at detected level" bug the user hit. - """ - manual_mode = (manual_mode or "").lower() - if manual_mode not in {"gpu", "ram"}: - return system - - try: - override_ram_gb = float(manual_ram_gb) if manual_ram_gb else 0 - except ValueError: - override_ram_gb = 0 - override_ram_gb = max(0.0, override_ram_gb) - if override_ram_gb: - # Replace RAM, don't add. The number in the field is the - # TOTAL system memory the user wants to simulate. - system["available_ram_gb"] = round(override_ram_gb, 1) - system["total_ram_gb"] = round(override_ram_gb, 1) - system["manual_hardware"] = True - - if manual_mode == "ram": - # RAM-only simulation — wipe GPU entirely so the ranker uses - # CPU/RAM paths. 
"""Tests for backend handling in the manual hardware simulator.

`_apply_manual_hardware` swaps the detected hardware for a user-described box
so models can be ranked against hardware that is not present. These tests pin
the accepted backend set and, in particular, that "metal" is kept as-is
rather than being coerced to "cuda".
"""

from routes.hwfit_routes import _apply_manual_hardware, _MANUAL_BACKENDS
from services.hwfit.fit import rank_models
from services.hwfit.models import get_models


def test_no_manual_mode_leaves_system_untouched():
    """An empty or unrecognised mode returns the system dict unchanged."""
    detected = {"backend": "cuda", "gpu_vram_gb": 24.0, "has_gpu": True}
    for mode in ("", "bogus"):
        assert _apply_manual_hardware(dict(detected), manual_mode=mode) == detected


def test_manual_metal_backend_is_accepted():
    """'metal' must survive instead of being rewritten to 'cuda'."""
    simulated = _apply_manual_hardware(
        {},
        manual_mode="gpu",
        manual_vram_gb="24",
        manual_backend="metal",
    )
    assert simulated["backend"] == "metal"
    assert simulated["unified_memory"] is True
    assert simulated["has_gpu"] is True
    assert "METAL" in simulated["gpu_name"]


def test_manual_metal_vram_and_count_math():
    """Two 24 GB devices give a 48 GB total and one two-member group."""
    simulated = _apply_manual_hardware(
        {},
        manual_mode="gpu",
        manual_gpu_count="2",
        manual_vram_gb="24",
        manual_backend="metal",
    )
    assert simulated["gpu_count"] == 2
    assert simulated["gpu_vram_gb"] == 48.0
    assert len(simulated["gpus"]) == 2
    group = simulated["gpu_groups"][0]
    assert group["vram_each"] == 24.0
    assert group["count"] == 2
    assert group["vram_total"] == 48.0


def test_manual_backend_whitelist_matches_fit_backends():
    """Pin the exact whitelist so any change to it is a deliberate edit.

    NOTE(review): this compares against a literal set, not against what
    services.hwfit.fit actually supports, so it cannot detect drift on the
    fit side.
    """
    expected = {"cuda", "rocm", "metal", "cpu_x86", "cpu_arm"}
    assert _MANUAL_BACKENDS == expected


def test_unknown_manual_backend_falls_back_to_cuda():
    """A backend outside the whitelist is replaced by 'cuda'."""
    simulated = _apply_manual_hardware({}, manual_mode="gpu", manual_backend="tpu")
    assert simulated["backend"] == "cuda"
    assert "unified_memory" not in simulated


def test_manual_rocm_and_cuda_are_not_unified_memory():
    """Discrete-GPU backends clear a stale unified_memory flag."""
    for requested in ("cuda", "rocm"):
        stale = {"unified_memory": True}
        simulated = _apply_manual_hardware(stale, manual_mode="gpu", manual_backend=requested)
        assert simulated["backend"] == requested
        assert "unified_memory" not in simulated


def test_manual_ram_mode_wipes_gpu_and_unified_flag():
    """RAM mode removes the GPU, forces cpu_x86 and drops unified_memory."""
    simulated = _apply_manual_hardware(
        {"unified_memory": True},
        manual_mode="ram",
        manual_ram_gb="64",
    )
    assert simulated["has_gpu"] is False
    assert simulated["backend"] == "cpu_x86"
    assert simulated["gpu_vram_gb"] == 0
    assert simulated["total_ram_gb"] == 64.0
    assert "unified_memory" not in simulated


def test_simulated_metal_box_only_recommends_gguf():
    """End-to-end: every model ranked for a simulated Metal box has a GGUF.

    A ranked model counts as servable when its catalog entry has a truthy
    `is_gguf` or `gguf_sources`. Before 'metal' was whitelisted the same
    request was coerced to the 'cuda' backend.
    """
    system = _apply_manual_hardware(
        {"backend": "cuda", "available_ram_gb": 32.0, "total_ram_gb": 64.0},
        manual_mode="gpu", manual_vram_gb="48", manual_backend="metal",
    )

    catalog = {}
    for model in get_models():
        catalog[model["name"]] = model

    unservable = []
    for ranked in rank_models(system, limit=900):
        entry = catalog.get(ranked["name"], {})
        if entry.get("is_gguf") or entry.get("gguf_sources"):
            continue
        unservable.append(ranked["name"])

    assert unservable == [], f"{len(unservable)} non-GGUF models on simulated Metal, e.g. {unservable[:3]}"