docs: add AMD Docker GPU preflight (#1168)

2026-06-02 23:54:08 +10:00
parent 4e769d537c
commit 18a445ba22
4 changed files with 230 additions and 8 deletions
--- a/.env.example
+++ b/.env.example
@@ -145,7 +145,8 @@ SEARXNG_INSTANCE=http://localhost:8080
 #
 # AMD ROCm (requires ROCm drivers on the host and the GID of the render group):
 # COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
-# RENDER_GID=992
+# Find the render GID with: getent group render | cut -d: -f3
+# RENDER_GID=989
 #
 # These overlays only expose the GPU devices. The slim Odysseus image
 # still needs CUDA/ROCm userspace via Cookbook -> Dependencies (vLLM,
--- a/README.md
+++ b/README.md
@@ -130,11 +130,13 @@ Odysseus SSH key and add the public key to the remote server's
 ssh-copy-id -i data/ssh/id_ed25519.pub user@server
 ```

-**NVIDIA Docker GPU overlay.** CPU-only users can skip this section.
-`scripts/check-docker-gpu.sh` diagnoses GPU passthrough and can optionally
-install the host runtime or update `.env`. Cookbook can only detect GPUs that
-Docker exposes to the container — if the host runtime is not configured,
-Cookbook sees the iGPU, another card, or CPU instead of your NVIDIA GPU.
+**Docker GPU overlays.** CPU-only users can skip this section. Cookbook can
+only detect GPUs that Docker exposes to the container — if the host runtime or
+device passthrough is not configured, Cookbook sees the iGPU, another card, or
+CPU instead of your intended GPU.
+
+For NVIDIA, `scripts/check-docker-gpu.sh` diagnoses GPU passthrough and can
+optionally install the host runtime or update `.env`.

 ```bash
 # Read-only diagnostic (default — installs nothing, never edits .env):
@@ -168,10 +170,18 @@ To enable manually without the script, add this to `.env`:
 COMPOSE_FILE=docker-compose.yml:docker/gpu.nvidia.yml
 ```

-**AMD / ROCm.** AMD GPU passthrough is not automated. Add manually:
+**AMD / ROCm.** AMD setup is a read-only diagnostic plus a manual `.env` edit. Run:
+
+```bash
+scripts/check-docker-amd-gpu.sh
+```
+
+Then add the reported values to `.env`, replacing the example `RENDER_GID`
+value with your host's numeric render group ID:

 ```bash
 COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
+RENDER_GID=989
 ```

 For NVIDIA/AMD GPU support, also read the comments in the selected overlay file: docker/gpu.nvidia.yml or docker/gpu.amd.yml.
@@ -180,7 +190,7 @@ Verify after enabling either overlay:

 ```bash
 docker compose exec odysseus nvidia-smi -L   # NVIDIA
-docker compose exec odysseus rocm-smi        # AMD
+docker compose exec odysseus sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls -l /dev/kfd /dev/dri/renderD*'  # AMD
 ```

 > **GPU passthrough ≠ llama.cpp CUDA.** `nvidia-smi` passing inside the
@@ -190,6 +200,11 @@ docker compose exec odysseus rocm-smi        # AMD
 > tensors/layers assigned to CPU, that is a Cookbook/llama.cpp build issue —
 > not a Docker passthrough failure. Re-install the serve engine via
 > **Cookbook → Dependencies** to get a CUDA-enabled build.
+>
+> The same split applies to AMD/ROCm: seeing `/dev/kfd` and `/dev/dri` inside
+> the container confirms device passthrough, not ROCm userspace or a
+> ROCm-enabled vLLM/llama.cpp build. `rocm-smi` and `rocminfo` are not expected
+> inside the slim Odysseus image.

 **Ollama with Docker.** If Ollama runs on the host, add this endpoint in
 Settings:
--- a/docker/gpu.amd.yml
+++ b/docker/gpu.amd.yml
@@ -1,5 +1,6 @@
 # AMD ROCm GPU overlay. Enable by setting COMPOSE_FILE in .env:
 #   COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
+#   RENDER_GID=<numeric output of: getent group render | cut -d: -f3>
 #
 # Requires ROCm drivers on the host (kfd + DRI devices). The host user
 # running Docker must be in the `video` and `render` groups.
--- a/scripts/check-docker-amd-gpu.sh
+++ b/scripts/check-docker-amd-gpu.sh
@@ -0,0 +1,205 @@
+#!/usr/bin/env bash
+# check-docker-amd-gpu.sh - read-only AMD/ROCm Docker passthrough diagnostic.
+#
+# This script does not install packages, edit .env, or restart Docker. It only
+# checks host AMD device nodes, Docker access, and whether a small container can
+# see /dev/kfd and /dev/dri. The Odysseus slim image does not include ROCm tools
+# such as rocm-smi, so container verification checks devices instead.
+
+set -u
+
+PASS=0
+FAIL=0
+WARN=0
+RENDER_GID=""
+VIDEO_GID=""
+TEST_IMAGE="${ODYSSEUS_AMD_TEST_IMAGE:-alpine:3.20}"
+
+_pass() { printf '\033[32m[PASS]\033[0m %s\n' "$*"; PASS=$((PASS + 1)); }
+_fail() { printf '\033[31m[FAIL]\033[0m %s\n' "$*"; FAIL=$((FAIL + 1)); }
+_warn() { printf '\033[33m[WARN]\033[0m %s\n' "$*"; WARN=$((WARN + 1)); }
+_info() { printf '\033[34m[INFO]\033[0m %s\n' "$*"; }
+
+# Print the help text to stdout. The quoted 'USAGE' delimiter keeps the
+# here-document literal (no variable or command expansion).
+_usage() {
+    cat <<'USAGE'
+Usage: scripts/check-docker-amd-gpu.sh
+
+Read-only AMD/ROCm Docker GPU diagnostic. Installs nothing, edits nothing, and
+does not restart Docker.
+
+Checks:
+  - host /dev/kfd and /dev/dri/renderD* exist
+  - host render group GID for RENDER_GID in .env
+  - optional host rocminfo visibility
+  - Docker can pass AMD device nodes into a small container
+
+Environment:
+  ODYSSEUS_AMD_TEST_IMAGE   Docker image for the passthrough smoke
+                            (default: alpine:3.20)
+USAGE
+}
+
+case "${1:-}" in
+    --help|-h)
+        _usage
+        exit 0
+        ;;
+    "")
+        ;;
+    *)
+        printf 'Unknown option: %s\n\n' "$1" >&2
+        _usage >&2
+        exit 1
+        ;;
+esac
+
+# Locate a command by name and print how to invoke it.
+# Arguments: $1 - command name
+# Outputs:   the `command -v` result when the name resolves in the current
+#            shell, otherwise /opt/rocm/bin/<name> when that file is
+#            executable
+# Returns:   0 when something was printed, 1 when the command was not found
+_find_cmd() {
+    local tool="$1"
+
+    if command -v "${tool}" >/dev/null 2>&1; then
+        command -v "${tool}"
+        return 0
+    elif [ -x "/opt/rocm/bin/${tool}" ]; then
+        printf '/opt/rocm/bin/%s\n' "${tool}"
+        return 0
+    fi
+    return 1
+}
+
+# Check that the host exposes the AMD device nodes Docker will pass through.
+# Records a pass/fail for /dev/kfd, for the /dev/dri directory, and for the
+# presence of at least one /dev/dri/renderD* character device.
+_check_host_devices() {
+    local render_nodes
+
+    _info "Checking host AMD device nodes..."
+    if [ -e /dev/kfd ]; then
+        _pass "/dev/kfd exists"
+    else
+        _fail "/dev/kfd is missing - ROCm kernel driver access is not available."
+    fi
+
+    if [ ! -d /dev/dri ]; then
+        _fail "/dev/dri is missing - render devices are not available."
+        # Without /dev/dri there is nothing to search; still emit the blank
+        # separator line so the next section's output stays spaced out.
+        echo
+        return
+    fi
+    _pass "/dev/dri exists"
+
+    # Only character devices directly inside /dev/dri count as render nodes.
+    render_nodes="$(find /dev/dri -maxdepth 1 -type c -name 'renderD*' -print 2>/dev/null | sort)"
+    if [ -n "${render_nodes}" ]; then
+        _pass "Render nodes found:"
+        # Indent the list so it reads as detail under the PASS line.
+        printf '%s\n' "${render_nodes}" | sed 's/^/        /'
+    else
+        _fail "No /dev/dri/renderD* node found."
+    fi
+    echo
+}
+
+_check_groups() {
+    _info "Checking host render/video groups..."
+    RENDER_GID="$(getent group render | awk -F: '{print $3; exit}')"
+    VIDEO_GID="$(getent group video | awk -F: '{print $3; exit}')"
+
+    if [ -n "${RENDER_GID}" ]; then
+        _pass "render group GID: ${RENDER_GID}"
+    else
+        _fail "render group not found - set RENDER_GID manually if your distro uses a different group."
+    fi
+
+    if [ -n "${VIDEO_GID}" ]; then
+        _pass "video group GID: ${VIDEO_GID}"
+    else
+        _warn "video group not found. /dev/kfd and renderD* may still be enough on some hosts."
+    fi
+    echo
+}
+
+_check_host_rocm() {
+    _info "Checking host ROCm tools..."
+    rocminfo_cmd="$(_find_cmd rocminfo || true)"
+    if [ -n "${rocminfo_cmd}" ]; then
+        if "${rocminfo_cmd}" 2>/dev/null | grep -Eq 'gfx[0-9a-f]+'; then
+            _pass "rocminfo works on the host: ${rocminfo_cmd}"
+            "${rocminfo_cmd}" 2>/dev/null \
+                | grep -E 'Marketing Name:|Name:[[:space:]]+gfx' \
+                | head -12 \
+                | sed 's/^/        /'
+        else
+            _warn "rocminfo exists but did not list a gfx target."
+        fi
+    else
+        _warn "rocminfo not found on PATH or /opt/rocm/bin. This does not block Docker passthrough, but host ROCm may be incomplete."
+    fi
+    echo
+}
+
+_check_docker() {
+    _info "Checking Docker..."
+    if ! command -v docker >/dev/null 2>&1; then
+        _fail "docker not found - install Docker first."
+        echo
+        return 1
+    fi
+    if docker info >/dev/null 2>&1; then
+        _pass "Docker daemon is running."
+    else
+        _fail "Docker daemon is not running or this user lacks Docker permission."
+        echo
+        return 1
+    fi
+    echo
+}
+
+_check_docker_passthrough() {
+    if [ -z "${RENDER_GID}" ]; then
+        _fail "Skipping Docker passthrough smoke because render GID is unknown."
+        echo
+        return
+    fi
+
+    _info "Testing AMD device passthrough with ${TEST_IMAGE} (may pull on first run)..."
+    group_args=(--group-add "${RENDER_GID}")
+    if [ -n "${VIDEO_GID}" ]; then
+        group_args+=(--group-add "${VIDEO_GID}")
+    fi
+
+    if docker run --rm \
+        --device=/dev/kfd \
+        --device=/dev/dri \
+        "${group_args[@]}" \
+        "${TEST_IMAGE}" \
+        sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls /dev/dri/renderD* >/dev/null' \
+        >/dev/null 2>&1; then
+        _pass "Docker can pass /dev/kfd and /dev/dri render nodes into a container."
+    else
+        _fail "Docker AMD device passthrough failed."
+        _info "Check that Docker can access /dev/kfd and /dev/dri, then retry."
+    fi
+    echo
+}
+
+# Print the .env lines to add and the follow-up verification command.
+# Uses the detected RENDER_GID when it is non-empty, otherwise a placeholder
+# the user has to fill in.
+_print_next_steps() {
+    local gid_value="<numeric render group id>"
+
+    if [ -n "${RENDER_GID}" ]; then
+        gid_value="${RENDER_GID}"
+    fi
+
+    echo "=== Suggested .env values ==="
+    printf 'COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml\n'
+    printf 'RENDER_GID=%s\n' "${gid_value}"
+    echo
+    echo "After restarting Odysseus, verify the slim app container sees devices:"
+    echo "  docker compose exec odysseus sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls -l /dev/kfd /dev/dri/renderD*'"
+    echo
+    echo "Note: rocm-smi/rocminfo are not expected inside the slim Odysseus image."
+    echo "Device passthrough is necessary but not sufficient for GPU serving; vLLM and"
+    echo "llama.cpp still need ROCm-compatible builds or ROCm-specific Docker images."
+}
+
+# Entry point: run every check in order, print the suggested .env values and
+# a summary line. The exit status is 0 only when no check recorded a failure.
+main() {
+    echo "=== Odysseus AMD Docker GPU diagnostic ==="
+    echo
+
+    _check_host_devices
+    _check_groups
+    _check_host_rocm
+
+    # The container test only makes sense when Docker itself is usable.
+    if _check_docker; then
+        _check_docker_passthrough
+    fi
+
+    _print_next_steps
+    echo
+    echo "=== Results: ${PASS} passed, ${WARN} warnings, ${FAIL} failed ==="
+
+    # Last command: its status becomes the script's exit status.
+    [ "${FAIL}" -eq 0 ]
+}
+
+main