Files
odysseus/scripts/check-docker-amd-gpu.sh
2026-06-02 22:54:08 +09:00

206 lines
6.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# check-docker-amd-gpu.sh - read-only AMD/ROCm Docker passthrough diagnostic.
#
# This script does not install packages, edit .env, or restart Docker. It only
# checks host AMD device nodes, Docker access, and whether a small container can
# see /dev/kfd and /dev/dri. The Odysseus slim image does not include ROCm tools
# such as rocm-smi, so container verification checks devices instead.
# Treat expansion of an unset variable as an error. Deliberately no `-e`:
# this is a diagnostic that must keep going after individual checks fail.
set -o nounset

# Running tallies, updated by _pass/_fail/_warn.
PASS=0 FAIL=0 WARN=0

# Numeric group ids discovered by _check_groups (empty until then).
RENDER_GID="" VIDEO_GID=""

# Image used for the passthrough smoke test; overridable from the environment.
TEST_IMAGE="${ODYSSEUS_AMD_TEST_IMAGE:-alpine:3.20}"
# Print a green "[PASS]" line built from all arguments and bump the PASS tally.
_pass() {
  printf '\033[%sm[%s]\033[0m %s\n' 32 PASS "$*"
  PASS=$(( PASS + 1 ))
}
# Print a red "[FAIL]" line built from all arguments and bump the FAIL tally.
_fail() {
  printf '\033[%sm[%s]\033[0m %s\n' 31 FAIL "$*"
  FAIL=$(( FAIL + 1 ))
}
# Print a yellow "[WARN]" line built from all arguments and bump the WARN tally.
_warn() {
  printf '\033[%sm[%s]\033[0m %s\n' 33 WARN "$*"
  WARN=$(( WARN + 1 ))
}
# Print a blue "[INFO]" line built from all arguments; touches no tally.
_info() {
  printf '\033[%sm[%s]\033[0m %s\n' 34 INFO "$*"
}
# Write the help text to stdout (callers redirect to stderr when needed).
_usage() {
  printf '%s\n' \
    'Usage: scripts/check-docker-amd-gpu.sh' \
    'Read-only AMD/ROCm Docker GPU diagnostic. Installs nothing, edits nothing, and' \
    'does not restart Docker.' \
    'Checks:' \
    '- host /dev/kfd and /dev/dri/renderD* exist' \
    '- host render group GID for RENDER_GID in .env' \
    '- optional host rocminfo visibility' \
    '- Docker can pass AMD device nodes into a small container' \
    'Environment:' \
    'ODYSSEUS_AMD_TEST_IMAGE Docker image for the passthrough smoke' \
    '(default: alpine:3.20)'
}
# Argument handling: the script accepts only --help/-h. No argument (or an
# empty first argument) falls through to the diagnostic; anything else is
# rejected with the usage text on stderr and exit status 1.
if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
  _usage
  exit 0
elif [[ -n "${1:-}" ]]; then
  printf 'Unknown option: %s\n\n' "$1" >&2
  _usage >&2
  exit 1
fi
#######################################
# Locate a tool, preferring PATH and falling back to /opt/rocm/bin.
# Arguments: $1 - command name to look up
# Outputs:   the resolved command (as reported by `command -v`) or the
#            /opt/rocm/bin path, followed by a newline
# Returns:   0 when found, 1 otherwise (nothing is printed)
#######################################
_find_cmd() {
  local resolved
  # Resolve once and reuse the answer instead of calling `command -v` twice.
  if resolved="$(command -v "$1" 2>/dev/null)"; then
    printf '%s\n' "${resolved}"
    return 0
  fi
  # -x alone is also true for directories (e.g. an empty name would match
  # /opt/rocm/bin/ itself), so require a regular file as well.
  if [ -f "/opt/rocm/bin/$1" ] && [ -x "/opt/rocm/bin/$1" ]; then
    printf '/opt/rocm/bin/%s\n' "$1"
    return 0
  fi
  return 1
}
#######################################
# Check that the host exposes /dev/kfd, /dev/dri and at least one
# /dev/dri/renderD* character device.
# Globals:   PASS/FAIL are updated indirectly through _pass/_fail
# Outputs:   one status line per check, the list of render nodes when any
#            exist, and always a trailing blank separator line
#######################################
_check_host_devices() {
  local render_nodes
  _info "Checking host AMD device nodes..."
  if [ -e /dev/kfd ]; then
    _pass "/dev/kfd exists"
  else
    _fail "/dev/kfd is missing - ROCm kernel driver access is not available."
  fi
  if [ -d /dev/dri ]; then
    _pass "/dev/dri exists"
    # Only character devices directly inside /dev/dri count as render nodes.
    render_nodes="$(find /dev/dri -maxdepth 1 -type c -name 'renderD*' -print 2>/dev/null | sort)"
    if [ -n "${render_nodes}" ]; then
      _pass "Render nodes found:"
      printf '%s\n' "${render_nodes}" | sed 's/^/ /'
    else
      _fail "No /dev/dri/renderD* node found."
    fi
  else
    # Without /dev/dri there is nothing to enumerate. Fall through instead of
    # returning early so the blank separator below is still printed.
    _fail "/dev/dri is missing - render devices are not available."
  fi
  echo
}
#######################################
# Look up the numeric ids of the host "render" and "video" groups.
# Globals:   RENDER_GID, VIDEO_GID (written; empty when the group is absent)
# Outputs:   one status line per group, then a blank separator line
#######################################
_check_groups() {
  _info "Checking host render/video groups..."

  # Field 3 of the first group-database entry is the numeric gid.
  RENDER_GID="$(getent group render | awk -F: '{print $3; exit}')"
  VIDEO_GID="$(getent group video | awk -F: '{print $3; exit}')"

  case "${RENDER_GID}" in
    "")
      _fail "render group not found - set RENDER_GID manually if your distro uses a different group."
      ;;
    *)
      _pass "render group GID: ${RENDER_GID}"
      ;;
  esac

  case "${VIDEO_GID}" in
    "")
      _warn "video group not found. /dev/kfd and renderD* may still be enough on some hosts."
      ;;
    *)
      _pass "video group GID: ${VIDEO_GID}"
      ;;
  esac

  echo
}
#######################################
# Optional check: does rocminfo run on the host and list a gfx target?
# Absence or failure only produces warnings; it never fails the run.
# Globals:   PASS/WARN are updated indirectly through _pass/_warn
# Outputs:   a status line, up to 12 matching rocminfo lines on success,
#            then a blank separator line
#######################################
_check_host_rocm() {
  local rocminfo_cmd rocminfo_out
  _info "Checking host ROCm tools..."
  rocminfo_cmd="$(_find_cmd rocminfo || true)"
  if [ -z "${rocminfo_cmd}" ]; then
    _warn "rocminfo not found on PATH or /opt/rocm/bin. This does not block Docker passthrough, but host ROCm may be incomplete."
    echo
    return 0
  fi

  # Run rocminfo once and reuse its output for both the probe and the
  # summary, instead of executing it twice. Its exit status is intentionally
  # ignored: only whether a gfx target appears in the output matters.
  rocminfo_out="$("${rocminfo_cmd}" 2>/dev/null || true)"

  if grep -Eq 'gfx[0-9a-f]+' <<<"${rocminfo_out}"; then
    _pass "rocminfo works on the host: ${rocminfo_cmd}"
    grep -E 'Marketing Name:|Name:[[:space:]]+gfx' <<<"${rocminfo_out}" \
      | head -12 \
      | sed 's/^/ /'
  else
    _warn "rocminfo exists but did not list a gfx target."
  fi
  echo
}
#######################################
# Verify that the docker CLI exists and that `docker info` succeeds.
# Globals:   PASS/FAIL are updated indirectly through _pass/_fail
# Outputs:   one status line followed by a blank separator line
# Returns:   0 when Docker is usable, 1 otherwise
#######################################
_check_docker() {
  local problem=""
  _info "Checking Docker..."

  # Work out which failure (if any) applies; the daemon probe only runs
  # when the CLI itself was found.
  if ! command -v docker >/dev/null 2>&1; then
    problem="docker not found - install Docker first."
  elif ! docker info >/dev/null 2>&1; then
    problem="Docker daemon is not running or this user lacks Docker permission."
  fi

  if [ -n "${problem}" ]; then
    _fail "${problem}"
    echo
    return 1
  fi

  _pass "Docker daemon is running."
  echo
}
#######################################
# Smoke-test AMD device passthrough: start a throwaway container with
# /dev/kfd and /dev/dri plus the render (and, if known, video) group and
# check that the device nodes are visible inside it.
# Globals:   RENDER_GID, VIDEO_GID, TEST_IMAGE (read);
#            PASS/FAIL are updated indirectly through _pass/_fail
# Outputs:   status lines; on failure also the last lines of docker's own
#            output; then a blank separator line
#######################################
_check_docker_passthrough() {
  local -a group_args
  local run_output
  if [ -z "${RENDER_GID}" ]; then
    _fail "Skipping Docker passthrough smoke because render GID is unknown."
    echo
    return
  fi
  _info "Testing AMD device passthrough with ${TEST_IMAGE} (may pull on first run)..."
  group_args=(--group-add "${RENDER_GID}")
  if [ -n "${VIDEO_GID}" ]; then
    group_args+=(--group-add "${VIDEO_GID}")
  fi
  # Capture docker's output instead of discarding it: a failed image pull, a
  # daemon error and a missing device node all end up in the same branch, and
  # the captured text is the only way to tell them apart.
  if run_output="$(docker run --rm \
    --device=/dev/kfd \
    --device=/dev/dri \
    "${group_args[@]}" \
    "${TEST_IMAGE}" \
    sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls /dev/dri/renderD* >/dev/null' \
    2>&1)"; then
    _pass "Docker can pass /dev/kfd and /dev/dri render nodes into a container."
  else
    _fail "Docker AMD device passthrough failed."
    if [ -n "${run_output}" ]; then
      # The tail is enough: the actual error comes after any pull progress.
      printf '%s\n' "${run_output}" | tail -n 5 | sed 's/^/ /'
    fi
    _info "Check that Docker can access /dev/kfd and /dev/dri, then retry."
  fi
  echo
}
#######################################
# Print the suggested .env settings and the follow-up verification command.
# Globals:   RENDER_GID (read; a placeholder is printed when it is empty)
# Outputs:   ten lines of guidance on stdout
#######################################
_print_next_steps() {
  # Discovered gid, or a placeholder the user has to fill in by hand.
  local gid_value="${RENDER_GID:-<numeric render group id>}"

  printf '%s\n' \
    "=== Suggested .env values ===" \
    "COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml" \
    "RENDER_GID=${gid_value}" \
    "" \
    "After restarting Odysseus, verify the slim app container sees devices:" \
    " docker compose exec odysseus sh -lc 'test -e /dev/kfd && test -d /dev/dri && ls -l /dev/kfd /dev/dri/renderD*'" \
    "" \
    "Note: rocm-smi/rocminfo are not expected inside the slim Odysseus image." \
    "Device passthrough is necessary but not sufficient for GPU serving; vLLM and" \
    "llama.cpp still need ROCm-compatible builds or ROCm-specific Docker images."
}
# Driver: run every check in order, print the suggested settings and a
# summary, and exit non-zero when at least one check failed.
printf '%s\n\n' "=== Odysseus AMD Docker GPU diagnostic ==="

_check_host_devices
_check_groups
_check_host_rocm

# The passthrough smoke test only makes sense when Docker itself is usable.
_check_docker && _check_docker_passthrough

_print_next_steps
printf '\n=== Results: %s passed, %s warnings, %s failed ===\n' \
  "${PASS}" "${WARN}" "${FAIL}"

# The last command's status becomes the script's exit status.
(( FAIL == 0 ))