diff --git a/.env.example b/.env.example
index ed4adf2..e3d6a13 100644
--- a/.env.example
+++ b/.env.example
@@ -137,6 +137,7 @@ SEARXNG_INSTANCE=http://localhost:8080
 # NVIDIA (requires nvidia-container-toolkit + `nvidia-ctk runtime
 # configure --runtime=docker` on the host):
 # COMPOSE_FILE=docker-compose.yml:docker/gpu.nvidia.yml
+# COMPOSE_FILE=docker-compose.yml;docker/gpu.nvidia.yml    # Windows: ';' is the path separator
 #
 # AMD ROCm (requires ROCm drivers on the host):
 # COMPOSE_FILE=docker-compose.yml:docker/gpu.amd.yml
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 1af879c..a378ff2 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -56,13 +56,25 @@ done
 # Auto-set CUDA_HOME if a pip-installed nvcc is present, and disable the
 # FlashInfer JIT sampler — sampler only, no impact on attention path.
 # No-op when vllm isn't installed.
-for cu in /app/.local/lib/python*/site-packages/nvidia/cu13; do
+#
+# Checked layouts (all are real pip-wheel install paths):
+#   nvidia/cu13        — nvidia-nvcc-cu13 (CUDA 13.x wheel style)
+#   nvidia/cu12        — nvidia-nvcc-cu12 (CUDA 12.x wheel style)
+#   nvidia/cuda_nvcc   — nvidia-cuda-nvcc-cu12 (older cu12 sub-package style)
+for cu in \
+    /app/.local/lib/python*/site-packages/nvidia/cu13 \
+    /app/.local/lib/python*/site-packages/nvidia/cu12 \
+    /app/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do
     if [ -x "$cu/bin/nvcc" ]; then
         export CUDA_HOME="$cu"
-        export VLLM_USE_FLASHINFER_SAMPLER="${VLLM_USE_FLASHINFER_SAMPLER:-0}"
         break
     fi
 done
+# Default the FlashInfer JIT sampler to off whether or not nvcc was found
+# above (an explicitly set VLLM_USE_FLASHINFER_SAMPLER still wins). It is
+# sampler-only, with no impact on the attention path, but requires nvcc +
+# matching CUDA headers at startup. Without this, vLLM crashes with "Could not find nvcc" even when the GPU itself is fully visible to the container.
+export VLLM_USE_FLASHINFER_SAMPLER="${VLLM_USE_FLASHINFER_SAMPLER:-0}"
 
 # Drop root and run the actual app. `gosu` is preferred over `su` /
 # `sudo` because it cleans up the process tree (no extra shell layer)
diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py
index b14a147..909cc6d 100644
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -1004,9 +1004,33 @@ def setup_cookbook_routes() -> APIRouter:
                 runner_lines.append('      && cmake --build build -j"$NPROC" --target llama-server \\')
                 runner_lines.append('      && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
                 runner_lines.append('  else')
-                runner_lines.append('    cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\')
-                runner_lines.append('      && cmake --build build -j"$NPROC" --target llama-server \\')
-                runner_lines.append('      && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+                # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put
+                # it on PATH so cmake's CUDA configure can find it.  We check the
+                # same three layouts as entrypoint.sh:
+                #   nvidia/cu13       — nvidia-nvcc-cu13
+                #   nvidia/cu12       — nvidia-nvcc-cu12
+                #   nvidia/cuda_nvcc  — nvidia-cuda-nvcc-cu12 (sub-package style)
+                runner_lines.append('    for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
+                runner_lines.append('      [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
+                runner_lines.append('    done')
+                # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a
+                # failed CUDA attempt) doesn't cause the next configure to reuse
+                # stale settings and silently produce a CPU-only binary.
+                runner_lines.append('    cd ~/llama.cpp && rm -rf build')
+                runner_lines.append('    if command -v nvcc &>/dev/null; then')
+                runner_lines.append('      echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
+                runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
+                runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\')
+                runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+                runner_lines.append('    else')
+                runner_lines.append('      echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
+                runner_lines.append('      echo "[odysseus]   GPU inference will not be available for this llama.cpp build."')
+                runner_lines.append('      echo "[odysseus]   To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
+                runner_lines.append('      echo "[odysseus]   (its CUDA wheels include nvcc), then re-launch this serve task."')
+                runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
+                runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\')
+                runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+                runner_lines.append('    fi')
                 runner_lines.append('  fi')
                 runner_lines.append('  # If the native build failed, fall back to the Python bindings.')
                 runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')