145 lines
6.1 KiB
Python
145 lines
6.1 KiB
Python
"""macOS / Apple Silicon (Metal) support for Cookbook hardware-fit.
|
|
|
|
Covers the Metal-specific behavior added for Apple Silicon and locks in the
|
|
guarantee that non-macOS (Linux/Windows) detection is unchanged.
|
|
"""
|
|
|
|
from services.hwfit import hardware
|
|
from services.hwfit.fit import rank_models
|
|
from services.hwfit.models import get_models
|
|
|
|
|
|
def _metal_system(ram_gb=16.0, vram_gb=10.7):
|
|
return {
|
|
"has_gpu": True,
|
|
"backend": "metal",
|
|
"gpu_name": "Apple M2",
|
|
"gpu_vram_gb": vram_gb,
|
|
"gpu_count": 1,
|
|
"available_ram_gb": ram_gb * 0.7,
|
|
"total_ram_gb": ram_gb,
|
|
"unified_memory": True,
|
|
}
|
|
|
|
|
|
def _fake_sysctl(brand="Apple M2 Pro", memsize_gb=32, wired_mb=None):
|
|
def run(cmd):
|
|
joined = " ".join(cmd)
|
|
if "machdep.cpu.brand_string" in joined:
|
|
return brand
|
|
if "hw.memsize" in joined:
|
|
return str(int(memsize_gb * 1024**3))
|
|
if "iogpu.wired_limit_mb" in joined:
|
|
return str(wired_mb) if wired_mb is not None else None
|
|
return None
|
|
return run
|
|
|
|
|
|
def test_mlx_models_hidden_on_metal():
|
|
"""MLX-quantized models can't be served by llama.cpp or Ollama (the only
|
|
Metal-capable engines Odysseus generates), so they must never be recommended
|
|
on Apple Silicon — even though the catalog tags them as Apple-only."""
|
|
results = rank_models(_metal_system(), limit=900)
|
|
mlx = [m for m in results if str(m.get("quant", "")).startswith("mlx-")]
|
|
assert mlx == [], f"MLX models surfaced but cannot be served: {[m['name'] for m in mlx]}"
|
|
|
|
|
|
def _cuda_system():
|
|
return {
|
|
"has_gpu": True, "backend": "cuda", "gpu_name": "NVIDIA RTX 4090",
|
|
"gpu_vram_gb": 24.0, "gpu_count": 1, "available_ram_gb": 32.0, "total_ram_gb": 64.0,
|
|
}
|
|
|
|
|
|
def test_mlx_hidden_on_cuda_backend_unchanged():
|
|
"""Regression guard: Linux/CUDA users never saw MLX before and still don't."""
|
|
mlx = [m for m in rank_models(_cuda_system(), limit=900) if str(m.get("quant", "")).startswith("mlx-")]
|
|
assert mlx == []
|
|
|
|
|
|
def test_only_gguf_models_recommended_on_metal():
|
|
"""llama.cpp and Ollama (the only Metal engines) need GGUF. Safetensors-only
|
|
repos — incl. vLLM-only AWQ/GPTQ/FP8 — can't be served on Metal, so every
|
|
model recommended on Apple Silicon must ship a servable GGUF."""
|
|
catalog = {m["name"]: m for m in get_models()}
|
|
unservable = [
|
|
r["name"] for r in rank_models(_metal_system(), limit=900)
|
|
if not (catalog.get(r["name"], {}).get("is_gguf")
|
|
or catalog.get(r["name"], {}).get("gguf_sources"))
|
|
]
|
|
assert unservable == [], f"{len(unservable)} non-GGUF models on Metal, e.g. {unservable[:3]}"
|
|
|
|
|
|
def test_qwen_catalog_entries_point_at_verified_gguf_repos():
|
|
"""Qwen GGUF-looking Cookbook rows must download GGUF repos, not the base
|
|
safetensors repositories."""
|
|
catalog = {m["name"]: m for m in get_models()}
|
|
expected = {
|
|
"Qwen/Qwen3.5-9B": ("unsloth/Qwen3.5-9B-GGUF", "Qwen3.5-9B-Q4_K_M.gguf"),
|
|
"Qwen/Qwen3.6-27B": ("unsloth/Qwen3.6-27B-GGUF", "Qwen3.6-27B-Q4_K_M.gguf"),
|
|
"Qwen/Qwen3.6-35B-A3B": ("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
|
|
}
|
|
|
|
for model_name, (repo, filename) in expected.items():
|
|
sources = catalog[model_name].get("gguf_sources") or []
|
|
assert any(src.get("repo") == repo and src.get("file") == filename for src in sources)
|
|
|
|
|
|
def test_safetensors_models_still_recommended_on_cuda():
|
|
"""Regression guard: vLLM serves safetensors on CUDA, so non-GGUF repos must
|
|
NOT be filtered there — the GGUF-only rule is Metal-specific."""
|
|
names = {r["name"] for r in rank_models(_cuda_system(), limit=900)}
|
|
assert "microsoft/Phi-mini-MoE-instruct" in names
|
|
|
|
|
|
def test_apple_silicon_detected_as_metal(monkeypatch):
|
|
"""On local Apple Silicon, detection reports a Metal GPU with a RAM-scaled
|
|
unified-memory budget."""
|
|
monkeypatch.setattr(hardware, "_remote_host", None)
|
|
monkeypatch.setattr(hardware.platform, "system", lambda: "Darwin")
|
|
monkeypatch.setattr(hardware.platform, "machine", lambda: "arm64")
|
|
monkeypatch.setattr(hardware, "_run", _fake_sysctl(memsize_gb=32))
|
|
|
|
info = hardware._detect_apple_silicon()
|
|
assert info is not None
|
|
assert info["backend"] == "metal"
|
|
assert info["gpu_name"] == "Apple M2 Pro"
|
|
assert info["unified_memory"] is True
|
|
assert info["gpu_vram_gb"] == 24.0 # 32GB * 0.75
|
|
|
|
|
|
def test_apple_silicon_skipped_on_linux(monkeypatch):
|
|
"""Guarantee Linux detection is untouched: the Metal probe bails immediately."""
|
|
monkeypatch.setattr(hardware, "_remote_host", None)
|
|
monkeypatch.setattr(hardware.platform, "system", lambda: "Linux")
|
|
monkeypatch.setattr(hardware.platform, "machine", lambda: "x86_64")
|
|
monkeypatch.setattr(hardware, "_run", _fake_sysctl())
|
|
assert hardware._detect_apple_silicon() is None
|
|
|
|
|
|
def test_intel_mac_skipped(monkeypatch):
|
|
"""Intel Macs have no Metal GPU worth serving LLMs on — fall through to CPU."""
|
|
monkeypatch.setattr(hardware, "_remote_host", None)
|
|
monkeypatch.setattr(hardware.platform, "system", lambda: "Darwin")
|
|
monkeypatch.setattr(hardware.platform, "machine", lambda: "x86_64")
|
|
monkeypatch.setattr(hardware, "_run", _fake_sysctl())
|
|
assert hardware._detect_apple_silicon() is None
|
|
|
|
|
|
def test_detect_system_propagates_unified_memory(monkeypatch):
|
|
"""The unified_memory flag set by GPU detection must survive into the
|
|
system dict so the API and UI can report it (it was being dropped)."""
|
|
monkeypatch.setattr(hardware, "_detect_apple_silicon", lambda: {
|
|
"gpu_name": "Apple M4", "gpu_vram_gb": 10.7, "gpu_count": 1,
|
|
"gpus": [], "gpu_groups": [], "homogeneous": True,
|
|
"backend": "metal", "unified_memory": True,
|
|
})
|
|
monkeypatch.setattr(hardware, "_get_ram_gb", lambda: 16.0)
|
|
monkeypatch.setattr(hardware, "_get_available_ram_gb", lambda: 11.0)
|
|
monkeypatch.setattr(hardware, "_get_cpu_count", lambda: 10)
|
|
monkeypatch.setattr(hardware, "_get_cpu_name", lambda: "Apple M4")
|
|
|
|
s = hardware.detect_system(fresh=True)
|
|
assert s["backend"] == "metal"
|
|
assert s.get("unified_memory") is True
|