diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py index b6e7980..0af62a0 100644 --- a/services/hwfit/hardware.py +++ b/services/hwfit/hardware.py @@ -105,6 +105,8 @@ def _detect_nvidia(): return None gpus = [] + # Devices nvidia-smi lists with a real name but a non-numeric memory.total. + unified = [] # nvidia-smi lists GPUs in index order (0,1,2,...), so the row position is # the CUDA device index we'd pass to CUDA_VISIBLE_DEVICES. for idx, line in enumerate(out.strip().split("\n")): @@ -114,9 +116,32 @@ def _detect_nvidia(): vram_mb = float(parts[0]) gpus.append({"index": idx, "name": parts[1], "vram_gb": vram_mb / 1024.0}) except ValueError: + # Grace Blackwell GB10 / DGX Spark and other unified-memory + # NVIDIA parts report memory.total as "[N/A]"/"Not Supported" + # because the GPU shares the system LPDDR pool instead of + # carrying discrete VRAM. Don't drop the device — remember it so + # we report a unified-memory GPU below rather than "No GPU" (#1340). + if parts[1]: + unified.append({"index": idx, "name": parts[1]}) continue if not gpus: + if unified: + # Unified-memory CUDA box: report the GPU backed by system RAM so the + # Cookbook recommends models and serving works. The pool is shared + # (not per-GPU discrete VRAM), so gpu_vram_gb is the RAM total, not a sum. 
+ ram_gb = round(_get_ram_gb(), 1) + gpus = [{"index": g["index"], "name": g["name"], "vram_gb": ram_gb} for g in unified] + return { + "gpu_name": gpus[0]["name"], + "gpu_vram_gb": ram_gb, + "gpu_count": len(gpus), + "gpus": gpus, + "gpu_groups": _group_gpus(gpus), + "homogeneous": True, + "backend": "cuda", + "unified_memory": True, + } return None total_vram = sum(g["vram_gb"] for g in gpus) groups = _group_gpus(gpus) diff --git a/tests/test_hwfit_unified_nvidia.py b/tests/test_hwfit_unified_nvidia.py new file mode 100644 index 0000000..009288e --- /dev/null +++ b/tests/test_hwfit_unified_nvidia.py @@ -0,0 +1,73 @@ +"""Unified-memory NVIDIA detection — Grace Blackwell GB10 / DGX Spark (#1340). + +GB10 (and other unified-memory NVIDIA parts) report `nvidia-smi +--query-gpu=memory.total` as "[N/A]"/"Not Supported" because the GPU shares the +system LPDDR pool instead of carrying discrete VRAM. The detector did +`float(memory.total)` and, on the ValueError, `continue`d — dropping the only +GPU row, so a real GB10 running vLLM was reported as "No GPU" and Cookbook +recommendations/model-switching broke. These tests pin that such a device is detected +as a unified-memory CUDA GPU backed by system RAM, while discrete GPUs are +unchanged. +""" + +import pytest + +from services.hwfit import hardware + + +@pytest.fixture(autouse=True) +def _local(monkeypatch): + monkeypatch.setattr(hardware, "_remote_host", None) + + +def test_gb10_unified_memory_detected_not_dropped(monkeypatch): + # Real GB10 nvidia-smi --query-gpu=memory.total,name output: memory is N/A. 
+ monkeypatch.setattr(hardware, "_run", lambda cmd: "[N/A], NVIDIA GB10") + monkeypatch.setattr(hardware, "_get_ram_gb", lambda: 128.0) + info = hardware._detect_nvidia() + assert info is not None, "GB10 was dropped as 'No GPU'" + assert info["gpu_name"] == "NVIDIA GB10" + assert info["backend"] == "cuda" + assert info["gpu_count"] == 1 + assert info["unified_memory"] is True + assert info["gpu_vram_gb"] == 128.0 # backed by the unified RAM pool + assert hardware._last_gpu_error is None + + +def test_detect_system_reports_gb10_as_gpu(monkeypatch): + """End-to-end through detect_system: has_gpu True + unified_memory propagated.""" + monkeypatch.setattr(hardware, "_run", lambda cmd: "[N/A], NVIDIA GB10") + monkeypatch.setattr(hardware, "_get_ram_gb", lambda: 128.0) + monkeypatch.setattr(hardware, "_get_available_ram_gb", lambda: 120.0) + monkeypatch.setattr(hardware, "_get_cpu_count", lambda: 20) + monkeypatch.setattr(hardware, "_get_cpu_name", lambda: "NVIDIA Grace") + monkeypatch.setattr(hardware, "_detect_apple_silicon", lambda: None) + s = hardware.detect_system(fresh=True) + assert s["has_gpu"] is True + assert s["gpu_name"] == "NVIDIA GB10" + assert s["backend"] == "cuda" + assert s.get("unified_memory") is True + + +def test_discrete_gpu_unchanged_and_not_unified(monkeypatch): + monkeypatch.setattr(hardware, "_run", lambda cmd: "24576, NVIDIA GeForce RTX 4090") + info = hardware._detect_nvidia() + assert info["gpu_vram_gb"] == 24.0 + assert info["gpu_count"] == 1 + assert not info.get("unified_memory") + + +def test_discrete_takes_precedence_over_unified_row(monkeypatch): + """A box with a real discrete-VRAM GPU keeps the discrete path; the + N/A-memory row is not conflated into a unified pool.""" + monkeypatch.setattr(hardware, "_run", lambda cmd: "24576, NVIDIA RTX 4090\n[N/A], NVIDIA GB10") + info = hardware._detect_nvidia() + assert info["gpu_name"] == "NVIDIA RTX 4090" + assert info["gpu_count"] == 1 + assert not info.get("unified_memory") + + +def 
test_no_gpu_still_none(monkeypatch): + """No nvidia-smi output → still None, no spurious unified GPU.""" + monkeypatch.setattr(hardware, "_run", lambda cmd: None) + assert hardware._detect_nvidia() is None