diff --git a/.dockerignore b/.dockerignore index ed30dd7..aed7e93 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,6 +9,7 @@ __pycache__/ dist/ build/ .env +.env.bak.* /data/ /logs/ .git/ diff --git a/.gitignore b/.gitignore index cba02b2..c48f6cd 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ venv/ # Environment .env +.env.bak.* !.env.example # Data — all user data stays local diff --git a/README.md b/README.md index 0022b4d..5079bd7 100644 --- a/README.md +++ b/README.md @@ -124,21 +124,65 @@ Odysseus SSH key and add the public key to the remote server's ssh-copy-id -i data/ssh/id_ed25519.pub user@server ``` -**NVIDIA / AMD Docker GPU overlays.** Install the host runtime first, then add -one of these to `.env`: +**NVIDIA Docker GPU overlay.** CPU-only users can skip this section. +`scripts/check-docker-gpu.sh` diagnoses GPU passthrough and can optionally +install the host runtime or update `.env`. Cookbook can only detect GPUs that +Docker exposes to the container — if the host runtime is not configured, +Cookbook sees the iGPU, another card, or CPU instead of your NVIDIA GPU. + +```bash +# Read-only diagnostic (default — installs nothing, never edits .env): +scripts/check-docker-gpu.sh + +# Print OS-specific install commands without running them: +scripts/check-docker-gpu.sh --print-install-commands + +# Install NVIDIA Container Toolkit on Ubuntu/Debian (requires sudo): +scripts/check-docker-gpu.sh --install-nvidia-toolkit + +# Write COMPOSE_FILE to .env (only when GPU passthrough is confirmed working): +scripts/check-docker-gpu.sh --enable-nvidia-overlay + +# Full assisted setup — install toolkit, then enable overlay if passthrough works: +scripts/check-docker-gpu.sh --install-nvidia-toolkit --enable-nvidia-overlay +``` + +Safety notes: +- The app never installs host GPU runtime automatically. +- The app never edits `.env` automatically. 
+- `.env` is only modified when `--enable-nvidia-overlay` is explicitly passed, + and only after GPU passthrough succeeds. `--yes` skips prompts but does not + bypass the passthrough gate. +- `.env.bak.*` backups created by `--enable-nvidia-overlay` are ignored by + Git and the Docker build context. + +To enable manually without the script, add this to `.env`: ```bash COMPOSE_FILE=docker-compose.yml:docker/gpu.nvidia.yml +``` + +**AMD / ROCm.** AMD GPU passthrough is not automated. Add manually: + +```bash COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml ``` -Verify with: +Verify after enabling either overlay: ```bash -docker compose exec odysseus nvidia-smi -L -docker compose exec odysseus rocm-smi +docker compose exec odysseus nvidia-smi -L # NVIDIA +docker compose exec odysseus rocm-smi # AMD ``` +> **GPU passthrough ≠ llama.cpp CUDA.** `nvidia-smi` passing inside the +> container confirms Docker GPU access, but llama.cpp also needs `cudart` and +> the CUDA Toolkit at runtime. If Cookbook logs show `Unable to find cudart +> library`, `Could NOT find CUDAToolkit`, `CUDA Toolkit not found`, or +> tensors/layers assigned to CPU, that is a Cookbook/llama.cpp build issue — +> not a Docker passthrough failure. Re-install the serve engine via +> **Cookbook → Dependencies** to get a CUDA-enabled build. + **Ollama with Docker.** If Ollama runs on the host, add this endpoint in Settings: diff --git a/docker/gpu.nvidia.yml b/docker/gpu.nvidia.yml index 32f7fb2..5590ba4 100644 --- a/docker/gpu.nvidia.yml +++ b/docker/gpu.nvidia.yml @@ -1,6 +1,11 @@ # NVIDIA GPU overlay. Enable by setting COMPOSE_FILE in .env: # COMPOSE_FILE=docker-compose.yml:docker/gpu.nvidia.yml # +# Use scripts/check-docker-gpu.sh to diagnose GPU passthrough, optionally +# install the NVIDIA Container Toolkit (Ubuntu/Debian), and write COMPOSE_FILE +# to .env. The script is read-only by default — it installs nothing and never +# edits .env unless explicitly asked. 
+#
 # Requires the NVIDIA Container Toolkit on the host.
 # Arch: sudo pacman -S nvidia-container-toolkit
 # Debian: sudo apt install nvidia-container-toolkit
diff --git a/scripts/check-docker-gpu.sh b/scripts/check-docker-gpu.sh
new file mode 100755
index 0000000..b80122e
--- /dev/null
+++ b/scripts/check-docker-gpu.sh
@@ -0,0 +1,668 @@
+#!/usr/bin/env bash
+# check-docker-gpu.sh — Diagnostic and optional setup helper for NVIDIA Docker GPU access.
+#
+# Default mode is READ-ONLY — does not install packages, modify config, or restart Docker.
+# The Odysseus app never calls this script automatically.
+#
+# USAGE
+#   scripts/check-docker-gpu.sh                            # read-only diagnostics (default)
+#   scripts/check-docker-gpu.sh --enable-nvidia-overlay    # also write COMPOSE_FILE to .env
+#   scripts/check-docker-gpu.sh --print-install-commands   # show OS-specific commands, don't run
+#   scripts/check-docker-gpu.sh --install-nvidia-toolkit   # install toolkit (Ubuntu/Debian only)
+#   scripts/check-docker-gpu.sh --install-nvidia-toolkit --enable-nvidia-overlay
+#   scripts/check-docker-gpu.sh --install-nvidia-toolkit --enable-nvidia-overlay --yes
+#   scripts/check-docker-gpu.sh --help
+
+MODE="check"
+OPT_YES=0
+OPT_ENABLE_OVERLAY=0
+_GPU_PASSTHROUGH_OK=0
+
+# ─── output helpers ──────────────────────────────────────────────────────────
+
+PASS=0
+FAIL=0
+
+# Each helper prints one tagged, colorized line to stdout; _pass and _fail also
+# bump the counters reported in the final "Results" summary.
+_pass() { printf '\033[32m[PASS]\033[0m %s\n' "$*"; PASS=$((PASS + 1)); }
+_fail() { printf '\033[31m[FAIL]\033[0m %s\n' "$*"; FAIL=$((FAIL + 1)); }
+_info() { printf '\033[34m[INFO]\033[0m %s\n' "$*"; }
+_warn() { printf '\033[33m[WARN]\033[0m %s\n' "$*"; }
+_step() { printf '\033[36m[STEP]\033[0m %s\n' "$*"; }
+
+# Print a yes/no prompt and read the reply from stdin.
+# Returns 0 only for y/yes (any case); anything else, including EOF, means no.
+_confirm() {
+  local _ans=""
+  printf '%s [y/N] ' "$1"
+  read -r _ans
+  case "${_ans}" in
+    [Yy]|[Yy][Ee][Ss]) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+# ─── paths ───────────────────────────────────────────────────────────────────
+
+# Resolve the repo root from this file's location so any working directory works.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." && pwd)"
+
+# ─── arg parsing ─────────────────────────────────────────────────────────────
+
+# Print the help text to stdout.
+_usage() {
+  cat <<'USAGE'
+Usage: scripts/check-docker-gpu.sh [OPTIONS]
+
+Read-only diagnostic (default — safe to run at any time, installs nothing):
+  (no flags)                  Check host nvidia-smi, Docker daemon, and Docker
+                              GPU passthrough. Prints PASS/FAIL and next steps.
+
+Informational:
+  --print-install-commands    Detect the OS and print recommended NVIDIA
+                              Container Toolkit commands without running them.
+                              Inspect these before deciding to install.
+  --help                      Show this help.
+
+Opt-in .env update (requires .env or .env.example in the repo root):
+  --enable-nvidia-overlay     Write COMPOSE_FILE=docker-compose.yml:docker/gpu.nvidia.yml
+                              into .env. Creates a timestamped backup first.
+                              Blocked if GPU passthrough is not working — fix
+                              passthrough first, then re-run. --yes does not
+                              override this gate.
+                              Never edits .env unless this flag is passed.
+
+Opt-in install (Ubuntu/Debian only, requires sudo):
+  --install-nvidia-toolkit    Add NVIDIA's apt repository, install
+                              nvidia-container-toolkit, configure the Docker
+                              runtime, and optionally restart Docker.
+                              Shows all commands and prompts before any
+                              privileged action.
+  --yes                       Skip confirmation prompts (for use with
+                              --install-nvidia-toolkit and/or
+                              --enable-nvidia-overlay in automated setups).
+
+Examples:
+  # Diagnose GPU passthrough before enabling the NVIDIA compose overlay:
+  scripts/check-docker-gpu.sh
+
+  # See what install commands apply to this system without running them:
+  scripts/check-docker-gpu.sh --print-install-commands
+
+  # Diagnose and automatically update .env with the NVIDIA overlay:
+  scripts/check-docker-gpu.sh --enable-nvidia-overlay
+
+  # Install toolkit interactively, then enable the overlay if it works:
+  scripts/check-docker-gpu.sh --install-nvidia-toolkit --enable-nvidia-overlay
+
+  # Full assisted setup without prompts (automated/CI use):
+  scripts/check-docker-gpu.sh --install-nvidia-toolkit --enable-nvidia-overlay --yes
+
+After a successful setup, start Odysseus:
+  docker compose up -d --build
+
+Full guide: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+USAGE
+}
+
+# Record the mode chosen by a flag. Exits 1 if a different mode flag was already
+# given, so print-only and install can never silently override each other.
+#   $1 - mode name   $2 - the flag as typed (used in the error message)
+_MODE_FLAG=""
+_select_mode() {
+  if [ -n "${_MODE_FLAG}" ] && [ "${MODE}" != "$1" ]; then
+    printf 'Conflicting options: %s and %s cannot be combined.\n\n' "${_MODE_FLAG}" "$2" >&2
+    _usage >&2
+    exit 1
+  fi
+  MODE="$1"
+  _MODE_FLAG="$2"
+}
+
+for _arg in "$@"; do
+  case "${_arg}" in
+    --help|-h)
+      _usage
+      exit 0
+      ;;
+    --print-install-commands)
+      _select_mode "print" "${_arg}"
+      ;;
+    --install-nvidia-toolkit)
+      _select_mode "install" "${_arg}"
+      ;;
+    --enable-nvidia-overlay)
+      OPT_ENABLE_OVERLAY=1
+      ;;
+    --yes|-y)
+      OPT_YES=1
+      ;;
+    *)
+      printf 'Unknown option: %s\n\n' "${_arg}" >&2
+      _usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+# --print-install-commands never touches .env; say so rather than silently
+# dropping the request.
+if [ "${MODE}" = "print" ] && [ "${OPT_ENABLE_OVERLAY}" -eq 1 ]; then
+  _warn "--enable-nvidia-overlay is ignored with --print-install-commands."
+fi
+
+# ─── OS/distro detection ─────────────────────────────────────────────────────
+
+DISTRO_ID=""
+DISTRO_LIKE=""
+DISTRO_VERSION=""
+DISTRO_ARCH="$(uname -m 2>/dev/null || echo unknown)"
+
+# Fields are extracted as text (the file is never sourced, so nothing in it is
+# executed). Values may be wrapped in double or single quotes.
+if [ -f /etc/os-release ]; then
+  DISTRO_ID="$(grep '^ID=' /etc/os-release | cut -d= -f2 | tr -d "\"'")"
+  DISTRO_LIKE="$(grep '^ID_LIKE=' /etc/os-release | cut -d= -f2 | tr -d "\"'")"
+  DISTRO_VERSION="$(grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | tr -d "\"'")"
+fi
+
+# Succeeds when the detected distro is Debian/Ubuntu or lists one in ID_LIKE.
+_is_debian_family() {
+  case "${DISTRO_ID}" in
+    ubuntu|debian|linuxmint|pop|elementary) return 0 ;;
+  esac
+  # ID_LIKE can be a space-separated list, e.g. "ubuntu debian"
+  case " ${DISTRO_LIKE} " in
+    *" debian "*|*" ubuntu "*) return 0 ;;
+  esac
+  return 1
+}
+
+# Print "<id> <version> (<arch>)" for status messages, without a trailing newline.
+_distro_label() {
+  if [ -n "${DISTRO_ID}" ]; then
+    printf '%s%s (%s)' \
+      "${DISTRO_ID}" \
+      "${DISTRO_VERSION:+ ${DISTRO_VERSION}}" \
+      "${DISTRO_ARCH}"
+  else
+    printf 'unknown Linux (%s)' "${DISTRO_ARCH}"
+  fi
+}
+
+# ─── Ubuntu/Debian install command text ──────────────────────────────────────
+# Printed both by --print-install-commands and shown before --install runs.
+# Keep this text in step with the commands executed by _mode_install.
+
+_debian_install_steps() {
+  cat <<'STEPS'
+
+  # 1. Install prerequisites
+  sudo apt-get update
+  sudo apt-get install -y curl gpg
+
+  # 2. Add NVIDIA's signing key
+  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+    | sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+
+  # 3. Add NVIDIA's apt repository
+  curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+  # 4. Install the toolkit
+  sudo apt-get update
+  sudo apt-get install -y nvidia-container-toolkit
+
+  # 5. Configure the Docker runtime
+  sudo nvidia-ctk runtime configure --runtime=docker
+
+  # 6. Restart Docker
+  sudo systemctl restart docker
+
+  # 7. Verify
+  docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
+
+STEPS
+}
+
+# ─── read-only checks ────────────────────────────────────────────────────────
+
+# Check that the host has nvidia-smi and that it lists at least one GPU.
+_check_nvidia_smi() {
+  _info "Checking host nvidia-smi..."
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    if nvidia-smi -L 2>/dev/null | grep -q 'GPU '; then
+      _pass "nvidia-smi is working. Detected GPUs:"
+      nvidia-smi -L 2>/dev/null | sed 's/^/       /'
+    else
+      _fail "nvidia-smi found but no GPUs listed — check your NVIDIA driver installation."
+    fi
+  else
+    _fail "nvidia-smi not found — install the NVIDIA driver for your distribution."
+    _info "No NVIDIA GPU? Skip this script — the NVIDIA overlay is not needed for CPU-only use."
+  fi
+  echo
+}
+
+# Returns 1 if Docker is unavailable (callers should stop further GPU checks).
+_check_docker() {
+  _info "Checking Docker..."
+  if ! command -v docker >/dev/null 2>&1; then
+    _fail "docker not found — install Docker: https://docs.docker.com/engine/install/"
+    echo "Cannot continue without Docker."
+    return 1
+  fi
+  if docker info >/dev/null 2>&1; then
+    _pass "Docker daemon is running."
+  else
+    _fail "Docker daemon is not running or current user lacks permission."
+    _info "Try: sudo systemctl start docker"
+    _info "Or add your user to the docker group: sudo usermod -aG docker \$USER"
+    echo "Cannot continue — GPU passthrough test requires a running Docker daemon."
+    return 1
+  fi
+  echo
+}
+
+# Run nvidia-smi inside a CUDA base container. Sets _GPU_PASSTHROUGH_OK=1 on
+# success; on failure prints manual remediation steps.
+_check_gpu_passthrough() {
+  _info "Testing GPU passthrough (may pull image on first run):"
+  _info "  docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi"
+  echo
+  if docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi 2>&1; then
+    echo
+    _GPU_PASSTHROUGH_OK=1
+    _pass "GPU passthrough is working — the NVIDIA compose overlay should work."
+    _info "Passthrough means Docker can see your GPU. It does NOT guarantee"
+    _info "llama.cpp will use CUDA. If Cookbook logs show:"
+    _info "  'Unable to find cudart library'"
+    _info "  'Could NOT find CUDAToolkit' / 'CUDA Toolkit not found'"
+    _info "  tensors or layers assigned to CPU"
+    _info "that is a Cookbook/llama.cpp CUDA build or runtime issue, not a"
+    _info "passthrough failure. Re-install the serve engine via"
+    _info "Cookbook -> Dependencies to get a CUDA-enabled build."
+    if [ "${OPT_ENABLE_OVERLAY}" -eq 0 ]; then
+      _info "Enable the overlay in .env with:"
+      _info "  scripts/check-docker-gpu.sh --enable-nvidia-overlay"
+    fi
+  else
+    echo
+    _fail "GPU passthrough failed. Check these steps in order:"
+    echo
+    echo "  1. Install NVIDIA Container Toolkit (if not already installed):"
+    echo "       Arch:   sudo pacman -S nvidia-container-toolkit"
+    echo "       Debian: sudo apt install nvidia-container-toolkit"
+    echo "       Fedora: sudo dnf install nvidia-container-toolkit"
+    echo "       Full guide: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+    echo
+    echo "  2. Configure the Docker runtime:"
+    echo "       sudo nvidia-ctk runtime configure --runtime=docker"
+    echo
+    echo "  3. Restart Docker:"
+    echo "       sudo systemctl restart docker"
+    echo
+    echo "  Then re-run this script to confirm."
+    echo
+    _warn "Without GPU passthrough, Cookbook will detect the iGPU, another card, or"
+    _warn "CPU instead of your NVIDIA GPU — model recommendations will use the wrong VRAM."
+    _info "Run with --print-install-commands to see OS-specific commands."
+    _info "Run with --install-nvidia-toolkit to install on Ubuntu/Debian."
+  fi
+  echo
+}
+
+# ─── --enable-nvidia-overlay ─────────────────────────────────────────────────
+
+# Add docker/gpu.nvidia.yml to COMPOSE_FILE in ${REPO_ROOT}/.env.
+# Creates .env from .env.example if needed (after a prompt unless --yes),
+# copies .env to .env.bak.<timestamp> before editing, and changes nothing when
+# the overlay is already listed. Callers gate this on passthrough success.
+# Returns 0 on success or no-op, 1 on failure (every failure path calls _fail).
+_enable_nvidia_overlay() {
+  echo "=== Enabling NVIDIA compose overlay ==="
+  echo
+
+  local _env_file="${REPO_ROOT}/.env"
+  local _env_example="${REPO_ROOT}/.env.example"
+  local _overlay_fragment="docker/gpu.nvidia.yml"
+  local _backup_ts
+  _backup_ts="$(date +%Y%m%d-%H%M%S)"
+
+  # Ensure .env exists
+  if [ ! -f "${_env_file}" ]; then
+    if [ -f "${_env_example}" ]; then
+      _info ".env not found. .env.example is available."
+      local _do_copy=0
+      if [ "${OPT_YES}" -eq 1 ]; then
+        _do_copy=1
+      elif _confirm "Copy .env.example to .env?"; then
+        _do_copy=1
+      fi
+      if [ "${_do_copy}" -eq 1 ]; then
+        if ! cp "${_env_example}" "${_env_file}"; then
+          _fail "Failed to copy .env.example to .env."
+          return 1
+        fi
+        _pass "Copied .env.example to .env."
+      else
+        _fail ".env is required to set COMPOSE_FILE — aborted."
+        return 1
+      fi
+    else
+      _fail ".env not found and .env.example is missing."
+      _info "Create a .env file in the repo root, then re-run."
+      return 1
+    fi
+  fi
+
+  # Read current active (uncommented) COMPOSE_FILE value, if any.
+  # _has_cf_line separates "no line at all" from "line present but empty".
+  local _has_cf_line=0
+  local _current_cf
+  if grep -q '^COMPOSE_FILE=' "${_env_file}"; then
+    _has_cf_line=1
+  fi
+  _current_cf="$(grep '^COMPOSE_FILE=' "${_env_file}" | tail -1 | cut -d= -f2-)"
+
+  # Idempotency check
+  case "${_current_cf}" in
+    *"${_overlay_fragment}"*)
+      _pass "COMPOSE_FILE already includes the NVIDIA overlay — nothing to change."
+      echo
+      _info "Start or restart Odysseus to apply:"
+      _info "  docker compose up -d --build"
+      return 0
+      ;;
+  esac
+
+  # Back up .env before any edit
+  local _backup="${_env_file}.bak.${_backup_ts}"
+  if ! cp "${_env_file}" "${_backup}"; then
+    _fail "Failed to create backup of .env — aborting to avoid data loss."
+    return 1
+  fi
+  _info "Backup created: .env.bak.${_backup_ts}"
+
+  # Keep the value's quoting style: for a quoted value the overlay goes
+  # inside the quotes instead of after the closing quote.
+  local _quote=""
+  case "${_current_cf}" in
+    \"*\") _quote='"' ;;
+    \'*\') _quote="'" ;;
+  esac
+  if [ -n "${_quote}" ]; then
+    _current_cf="${_current_cf#?}"
+    _current_cf="${_current_cf%?}"
+  fi
+
+  # An empty existing value falls back to the default base compose file.
+  local _new_cf="${_current_cf:-docker-compose.yml}:${_overlay_fragment}"
+
+  if [ "${_has_cf_line}" -eq 0 ]; then
+    # No active COMPOSE_FILE line — append one
+    if ! printf '\nCOMPOSE_FILE=%s\n' "${_new_cf}" >> "${_env_file}"; then
+      _fail "Failed to write COMPOSE_FILE to .env."
+      return 1
+    fi
+  else
+    # Rewrite the existing line(s). awk gets the new line through the
+    # environment, so characters such as '|', '&' or '\' in the current value
+    # are copied literally rather than interpreted as in a sed replacement.
+    local _tmp
+    if ! _tmp="$(mktemp "${_env_file}.tmp.XXXXXX")"; then
+      _fail "Failed to update COMPOSE_FILE in .env."
+      return 1
+    fi
+    if ! _CF_LINE="COMPOSE_FILE=${_quote}${_new_cf}${_quote}" awk '
+      /^COMPOSE_FILE=/ { print ENVIRON["_CF_LINE"]; next }
+      { print }
+    ' "${_env_file}" > "${_tmp}"; then
+      _fail "Failed to update COMPOSE_FILE in .env."
+      rm -f "${_tmp}"
+      return 1
+    fi
+    # Copy the contents back instead of moving the temp file over .env, so
+    # .env keeps its permissions, owner, and (if it is one) symlink target.
+    if ! cat "${_tmp}" > "${_env_file}"; then
+      _fail "Failed to write updated .env."
+      rm -f "${_tmp}"
+      return 1
+    fi
+    rm -f "${_tmp}"
+  fi
+
+  _pass "COMPOSE_FILE set to: ${_new_cf}"
+  echo
+  _info "Start or restart Odysseus with the NVIDIA overlay:"
+  _info "  docker compose up -d --build"
+  echo
+  _info "To undo, restore the backup:"
+  _info "  cp ${_backup} ${_env_file}"
+}
+
+# ─── mode: default read-only diagnostic ──────────────────────────────────────
+
+# Run the diagnostics, then apply --enable-nvidia-overlay if requested and
+# passthrough works. Returns non-zero when any check failed.
+_mode_check() {
+  echo "=== Odysseus Docker GPU diagnostic ==="
+  echo
+  _check_nvidia_smi
+  _check_docker || { echo "=== Results: ${PASS} passed, ${FAIL} failed ==="; return 1; }
+  _check_gpu_passthrough
+
+  if [ "${OPT_ENABLE_OVERLAY}" -eq 1 ]; then
+    if [ "${_GPU_PASSTHROUGH_OK}" -eq 0 ]; then
+      # Hard gate: broken passthrough blocks .env edits regardless of --yes.
+      # Writing COMPOSE_FILE before passthrough works causes Odysseus to fail
+      # at startup, so this is not a prompt — it is a stop.
+      _fail "GPU passthrough is not working — .env will not be modified."
+      _info "Fix passthrough first, then re-run with --enable-nvidia-overlay:"
+      _info "  Ubuntu/Debian: scripts/check-docker-gpu.sh --install-nvidia-toolkit"
+      _info "  Other distros: scripts/check-docker-gpu.sh --print-install-commands"
+      echo
+    else
+      _enable_nvidia_overlay
+    fi
+  fi
+
+  echo "=== Results: ${PASS} passed, ${FAIL} failed ==="
+  [ "${FAIL}" -eq 0 ]
+}
+
+# ─── mode: --print-install-commands ──────────────────────────────────────────
+
+# Print the toolkit install commands for the detected distro; runs nothing.
+_mode_print() {
+  echo "=== NVIDIA Container Toolkit — install commands ==="
+  echo
+  _info "Detected system: $(_distro_label)"
+  echo
+
+  if _is_debian_family; then
+    _info "Ubuntu/Debian — recommended install commands:"
+    _debian_install_steps
+    _info "After running these, re-run the diagnostic to confirm:"
+    _info "  scripts/check-docker-gpu.sh"
+  else
+    case "${DISTRO_ID}" in
+      fedora|rhel|centos|rocky|almalinux)
+        _info "Fedora/RHEL — install commands:"
+        echo
+        echo "  sudo dnf install -y nvidia-container-toolkit"
+        echo "  sudo nvidia-ctk runtime configure --runtime=docker"
+        echo "  sudo systemctl restart docker"
+        echo "  docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi"
+        ;;
+      opensuse*|sles)
+        _info "OpenSUSE/SLES — install commands:"
+        echo
+        echo "  sudo zypper install nvidia-container-toolkit"
+        echo "  sudo nvidia-ctk runtime configure --runtime=docker"
+        echo "  sudo systemctl restart docker"
+        echo "  docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi"
+        ;;
+      arch|manjaro|endeavouros)
+        _info "Arch Linux — install commands:"
+        echo
+        echo "  sudo pacman -S nvidia-container-toolkit"
+        echo "  sudo nvidia-ctk runtime configure --runtime=docker"
+        echo "  sudo systemctl restart docker"
+        echo "  docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi"
+        ;;
+      *)
+        _warn "Distro '${DISTRO_ID:-unknown}' is not specifically recognized."
+        echo
+        echo "  See the full guide for your distribution:"
+        echo "  https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+        ;;
+    esac
+    echo
+    _info "Automated install (--install-nvidia-toolkit) supports Ubuntu/Debian only."
+    _info "For other distros, run the commands above manually, then re-run:"
+    _info "  scripts/check-docker-gpu.sh"
+  fi
+}
+
+# ─── mode: --install-nvidia-toolkit ──────────────────────────────────────────
+
+# Install and configure the NVIDIA Container Toolkit on Ubuntu/Debian, then
+# verify passthrough and optionally enable the overlay. Exits the script when
+# an install step fails; returns non-zero when any check failed.
+_mode_install() {
+  echo "=== NVIDIA Container Toolkit — interactive installer ==="
+  echo
+
+  if [ "$(uname -s)" != "Linux" ]; then
+    _fail "Install mode is Linux-only. Detected: $(uname -s)"
+    exit 1
+  fi
+
+  if ! _is_debian_family; then
+    _fail "Automated install currently supports Ubuntu/Debian only."
+    _info "Detected: $(_distro_label)"
+    _info "Run --print-install-commands to see manual steps for your distro."
+    exit 1
+  fi
+
+  _info "Detected system: $(_distro_label)"
+  echo
+
+  echo "This will run the following commands with sudo:"
+  _debian_install_steps
+
+  if [ "${OPT_YES}" -eq 0 ]; then
+    if ! _confirm "Proceed with the above steps?"; then
+      echo "Aborted — nothing was changed."
+      exit 0
+    fi
+    echo
+  fi
+
+  # Step 1: prerequisites
+  _step "Updating package lists..."
+  sudo apt-get update -qq || { _fail "apt-get update failed."; exit 1; }
+  _step "Installing prerequisites (curl, gpg)..."
+  sudo apt-get install -y curl gpg || { _fail "Failed to install prerequisites."; exit 1; }
+  _pass "Prerequisites ready."
+  echo
+
+  # Step 2: signing key
+  # pipefail is scoped to a subshell so a failed download fails this step
+  # without changing pipeline semantics for the rest of the script.
+  _step "Adding NVIDIA GPG signing key..."
+  (
+    set -o pipefail
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+      | sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+  ) || { _fail "Failed to add NVIDIA GPG key."; exit 1; }
+  _pass "Signing key added."
+  echo
+
+  # Step 3: apt repository
+  # Download first so a failed or empty response never overwrites the apt
+  # sources list.
+  _step "Adding NVIDIA apt repository..."
+  local _repo_list
+  if ! _repo_list="$(curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list)" \
+    || [ -z "${_repo_list}" ]; then
+    _fail "Failed to add NVIDIA apt repository."
+    exit 1
+  fi
+  printf '%s\n' "${_repo_list}" \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null \
+    || { _fail "Failed to add NVIDIA apt repository."; exit 1; }
+  _pass "apt repository added."
+  echo
+
+  # Step 4: install toolkit
+  _step "Installing nvidia-container-toolkit..."
+  sudo apt-get update -qq || { _fail "apt-get update failed after adding NVIDIA repo."; exit 1; }
+  sudo apt-get install -y nvidia-container-toolkit \
+    || { _fail "Failed to install nvidia-container-toolkit."; exit 1; }
+  _pass "nvidia-container-toolkit installed."
+  echo
+
+  # Step 5: configure Docker runtime
+  _step "Configuring Docker runtime..."
+  sudo nvidia-ctk runtime configure --runtime=docker \
+    || { _fail "nvidia-ctk runtime configure failed."; exit 1; }
+  _pass "Docker runtime configured."
+  echo
+
+  # Step 6: restart Docker
+  _step "A Docker restart is required for the runtime change to take effect."
+  local _do_restart=0
+  if [ "${OPT_YES}" -eq 1 ]; then
+    _do_restart=1
+  elif _confirm "Restart Docker now?"; then
+    _do_restart=1
+  else
+    _warn "Docker not restarted."
+    _warn "Run 'sudo systemctl restart docker' before testing GPU passthrough."
+  fi
+
+  if [ "${_do_restart}" -eq 1 ]; then
+    _step "Restarting Docker..."
+    if sudo systemctl restart docker; then
+      _pass "Docker restarted."
+    else
+      _fail "Docker restart failed — run: sudo systemctl restart docker"
+    fi
+  fi
+  echo
+
+  # Step 7: verification
+  _info "Running GPU passthrough verification..."
+  echo
+  _check_docker || { echo "=== Results: ${PASS} passed, ${FAIL} failed ==="; exit 1; }
+  _check_gpu_passthrough
+
+  # Step 8: enable overlay (only if passthrough verified)
+  if [ "${OPT_ENABLE_OVERLAY}" -eq 1 ]; then
+    if [ "${_GPU_PASSTHROUGH_OK}" -eq 1 ]; then
+      _enable_nvidia_overlay
+    else
+      _warn "GPU passthrough verification failed — skipping overlay setup."
+      _warn "Fix the passthrough issue, then run:"
+      _warn "  scripts/check-docker-gpu.sh --enable-nvidia-overlay"
+      echo
+    fi
+  fi
+
+  echo "=== Results: ${PASS} passed, ${FAIL} failed ==="
+  [ "${FAIL}" -eq 0 ]
+}
+
+# ─── dispatch ────────────────────────────────────────────────────────────────
+
+case "${MODE}" in
+  check) _mode_check ;;
+  print) _mode_print ;;
+  install) _mode_install ;;
+esac