docs: add AMD Docker GPU preflight (#1168)

This commit is contained in:
spooky
2026-06-02 23:54:08 +10:00
committed by GitHub
parent 4e769d537c
commit 18a445ba22
4 changed files with 230 additions and 8 deletions

View File

@@ -145,7 +145,8 @@ SEARXNG_INSTANCE=http://localhost:8080
#
# AMD ROCm (requires ROCm drivers on the host and the GID of the render group):
# COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
# RENDER_GID=992
# Find the render GID with: getent group render | cut -d: -f3
# RENDER_GID=989
#
# These overlays only expose the GPU devices. The slim Odysseus image
# still needs CUDA/ROCm userspace via Cookbook -> Dependencies (vLLM,

View File

@@ -130,11 +130,13 @@ Odysseus SSH key and add the public key to the remote server's
ssh-copy-id -i data/ssh/id_ed25519.pub user@server
```
**NVIDIA Docker GPU overlay.** CPU-only users can skip this section.
`scripts/check-docker-gpu.sh` diagnoses GPU passthrough and can optionally
install the host runtime or update `.env`. Cookbook can only detect GPUs that
Docker exposes to the container — if the host runtime is not configured,
Cookbook sees the iGPU, another card, or CPU instead of your NVIDIA GPU.
**Docker GPU overlays.** CPU-only users can skip this section. Cookbook can
only detect GPUs that Docker exposes to the container — if the host runtime or
device passthrough is not configured, Cookbook sees the iGPU, another card, or
CPU instead of your intended GPU.
For NVIDIA, `scripts/check-docker-gpu.sh` diagnoses GPU passthrough and can
optionally install the host runtime or update `.env`.
```bash
# Read-only diagnostic (default — installs nothing, never edits .env):
@@ -168,10 +170,18 @@ To enable manually without the script, add this to `.env`:
COMPOSE_FILE=docker-compose.yml:docker/gpu.nvidia.yml
```
**AMD / ROCm.** AMD GPU passthrough is not automated. Add manually:
**AMD / ROCm.** AMD setup is read-only diagnostic plus manual `.env` edit. Run:
```bash
scripts/check-docker-amd-gpu.sh
```
Then add the reported values to `.env`, replacing `RENDER_GID` with your host's
numeric render group id:
```bash
COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
RENDER_GID=989
```
For NVIDIA/AMD GPU support, also read the comments in the selected overlay file: docker/gpu.nvidia.yml or docker/gpu.amd.yml.
@@ -180,7 +190,7 @@ Verify after enabling either overlay:
```bash
docker compose exec odysseus nvidia-smi -L # NVIDIA
docker compose exec odysseus rocm-smi # AMD
docker compose exec odysseus sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls -l /dev/kfd /dev/dri/renderD*' # AMD
```
> **GPU passthrough ≠ llama.cpp CUDA.** `nvidia-smi` passing inside the
@@ -190,6 +200,11 @@ docker compose exec odysseus rocm-smi # AMD
> tensors/layers assigned to CPU, that is a Cookbook/llama.cpp build issue —
> not a Docker passthrough failure. Re-install the serve engine via
> **Cookbook → Dependencies** to get a CUDA-enabled build.
>
> The same split applies to AMD/ROCm: seeing `/dev/kfd` and `/dev/dri` inside
> the container confirms device passthrough, not ROCm userspace or a
> ROCm-enabled vLLM/llama.cpp build. `rocm-smi` and `rocminfo` are not expected
> inside the slim Odysseus image.
**Ollama with Docker.** If Ollama runs on the host, add this endpoint in
Settings:

View File

@@ -1,5 +1,6 @@
# AMD ROCm GPU overlay. Enable by setting COMPOSE_FILE in .env:
# COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
# RENDER_GID=<numeric output of: getent group render | cut -d: -f3>
#
# Requires ROCm drivers on the host (kfd + DRI devices). The host user
# running Docker must be in the `video` and `render` groups.

205
scripts/check-docker-amd-gpu.sh Executable file
View File

@@ -0,0 +1,205 @@
#!/usr/bin/env bash
# check-docker-amd-gpu.sh - read-only AMD/ROCm Docker passthrough diagnostic.
#
# This script does not install packages, edit .env, or restart Docker. It only
# checks host AMD device nodes, Docker access, and whether a small container can
# see /dev/kfd and /dev/dri. The Odysseus slim image does not include ROCm tools
# such as rocm-smi, so container verification checks devices instead.
set -u
PASS=0
FAIL=0
WARN=0
RENDER_GID=""
VIDEO_GID=""
TEST_IMAGE="${ODYSSEUS_AMD_TEST_IMAGE:-alpine:3.20}"
_pass() { printf '\033[32m[PASS]\033[0m %s\n' "$*"; PASS=$((PASS + 1)); }
_fail() { printf '\033[31m[FAIL]\033[0m %s\n' "$*"; FAIL=$((FAIL + 1)); }
_warn() { printf '\033[33m[WARN]\033[0m %s\n' "$*"; WARN=$((WARN + 1)); }
_info() { printf '\033[34m[INFO]\033[0m %s\n' "$*"; }
_usage() {
cat <<'USAGE'
Usage: scripts/check-docker-amd-gpu.sh
Read-only AMD/ROCm Docker GPU diagnostic. Installs nothing, edits nothing, and
does not restart Docker.
Checks:
- host /dev/kfd and /dev/dri/renderD* exist
- host render group GID for RENDER_GID in .env
- optional host rocminfo visibility
- Docker can pass AMD device nodes into a small container
Environment:
ODYSSEUS_AMD_TEST_IMAGE Docker image for the passthrough smoke
(default: alpine:3.20)
USAGE
}
case "${1:-}" in
--help|-h)
_usage
exit 0
;;
"")
;;
*)
printf 'Unknown option: %s\n\n' "$1" >&2
_usage >&2
exit 1
;;
esac
_find_cmd() {
if command -v "$1" >/dev/null 2>&1; then
command -v "$1"
return 0
fi
if [ -x "/opt/rocm/bin/$1" ]; then
printf '/opt/rocm/bin/%s\n' "$1"
return 0
fi
return 1
}
_check_host_devices() {
_info "Checking host AMD device nodes..."
if [ -e /dev/kfd ]; then
_pass "/dev/kfd exists"
else
_fail "/dev/kfd is missing - ROCm kernel driver access is not available."
fi
if [ -d /dev/dri ]; then
_pass "/dev/dri exists"
else
_fail "/dev/dri is missing - render devices are not available."
return
fi
render_nodes="$(find /dev/dri -maxdepth 1 -type c -name 'renderD*' -print 2>/dev/null | sort)"
if [ -n "${render_nodes}" ]; then
_pass "Render nodes found:"
printf '%s\n' "${render_nodes}" | sed 's/^/ /'
else
_fail "No /dev/dri/renderD* node found."
fi
echo
}
_check_groups() {
_info "Checking host render/video groups..."
RENDER_GID="$(getent group render | awk -F: '{print $3; exit}')"
VIDEO_GID="$(getent group video | awk -F: '{print $3; exit}')"
if [ -n "${RENDER_GID}" ]; then
_pass "render group GID: ${RENDER_GID}"
else
_fail "render group not found - set RENDER_GID manually if your distro uses a different group."
fi
if [ -n "${VIDEO_GID}" ]; then
_pass "video group GID: ${VIDEO_GID}"
else
_warn "video group not found. /dev/kfd and renderD* may still be enough on some hosts."
fi
echo
}
_check_host_rocm() {
_info "Checking host ROCm tools..."
rocminfo_cmd="$(_find_cmd rocminfo || true)"
if [ -n "${rocminfo_cmd}" ]; then
if "${rocminfo_cmd}" 2>/dev/null | grep -Eq 'gfx[0-9a-f]+'; then
_pass "rocminfo works on the host: ${rocminfo_cmd}"
"${rocminfo_cmd}" 2>/dev/null \
| grep -E 'Marketing Name:|Name:[[:space:]]+gfx' \
| head -12 \
| sed 's/^/ /'
else
_warn "rocminfo exists but did not list a gfx target."
fi
else
_warn "rocminfo not found on PATH or /opt/rocm/bin. This does not block Docker passthrough, but host ROCm may be incomplete."
fi
echo
}
_check_docker() {
_info "Checking Docker..."
if ! command -v docker >/dev/null 2>&1; then
_fail "docker not found - install Docker first."
echo
return 1
fi
if docker info >/dev/null 2>&1; then
_pass "Docker daemon is running."
else
_fail "Docker daemon is not running or this user lacks Docker permission."
echo
return 1
fi
echo
}
_check_docker_passthrough() {
if [ -z "${RENDER_GID}" ]; then
_fail "Skipping Docker passthrough smoke because render GID is unknown."
echo
return
fi
_info "Testing AMD device passthrough with ${TEST_IMAGE} (may pull on first run)..."
group_args=(--group-add "${RENDER_GID}")
if [ -n "${VIDEO_GID}" ]; then
group_args+=(--group-add "${VIDEO_GID}")
fi
if docker run --rm \
--device=/dev/kfd \
--device=/dev/dri \
"${group_args[@]}" \
"${TEST_IMAGE}" \
sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls /dev/dri/renderD* >/dev/null' \
>/dev/null 2>&1; then
_pass "Docker can pass /dev/kfd and /dev/dri render nodes into a container."
else
_fail "Docker AMD device passthrough failed."
_info "Check that Docker can access /dev/kfd and /dev/dri, then retry."
fi
echo
}
_print_next_steps() {
echo "=== Suggested .env values ==="
if [ -n "${RENDER_GID}" ]; then
printf 'COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml\n'
printf 'RENDER_GID=%s\n' "${RENDER_GID}"
else
printf 'COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml\n'
printf 'RENDER_GID=<numeric render group id>\n'
fi
echo
echo "After restarting Odysseus, verify the slim app container sees devices:"
echo " docker compose exec odysseus sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls -l /dev/kfd /dev/dri/renderD*'"
echo
echo "Note: rocm-smi/rocminfo are not expected inside the slim Odysseus image."
echo "Device passthrough is necessary but not sufficient for GPU serving; vLLM and"
echo "llama.cpp still need ROCm-compatible builds or ROCm-specific Docker images."
}
echo "=== Odysseus AMD Docker GPU diagnostic ==="
echo
_check_host_devices
_check_groups
_check_host_rocm
if _check_docker; then
_check_docker_passthrough
fi
_print_next_steps
echo
echo "=== Results: ${PASS} passed, ${WARN} warnings, ${FAIL} failed ==="
[ "${FAIL}" -eq 0 ]