fix: Cookbook local GGUF serving inside Docker (#1264)

* fix: Cookbook local GGUF serving inside Docker

Cookbook’s in-container GGUF serve flow had multiple Docker-specific breakages that made local llama.cpp models fail or register against the wrong endpoint.

Fixes included here:

- Use the scanned model cache root when generating GGUF serve commands instead of hardcoding $HOME/.cache/huggingface/hub.
- Fix malformed llama.cpp preflight build lines that generated invalid bash in serve runner scripts.
- Preserve loopback model URLs inside Docker when the target port is already reachable from the Odysseus container, instead of rewriting them unconditionally to host.docker.internal.
Before this change, Docker local serves could fail in several ways:

- Cookbook pointed llama.cpp at the wrong GGUF path.
- Generated serve runner scripts crashed before launch with a shell syntax error.
- Successfully started in-container model servers were auto-registered as host.docker.internal:PORT instead of localhost/127.0.0.1.
This makes the Docker Cookbook path work as expected for: downloaded GGUF -> local llama.cpp serve -> endpoint registration

* test: add test for docker-local endpoint rewrites
This commit is contained in:
Michael Gerber
2026-06-02 19:08:09 +02:00
committed by GitHub
parent dc6711b3c5
commit e392be0d65
4 changed files with 52 additions and 12 deletions

View File

@@ -541,21 +541,15 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
runner_lines.append(' export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
runner_lines.append(' fi')
runner_lines.append(' echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON \\\\')
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' elif command -v nvcc &>/dev/null; then')
runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\\\')
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' else')
runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
runner_lines.append(' echo "[odysseus] Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\\\')
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' fi')
class ModelDownloadRequest(BaseModel):

View File

@@ -148,6 +148,32 @@ def _docker_host_gateway_reachable() -> bool:
except OSError:
return False
def _container_loopback_reachable(base_url: str, timeout: float = 0.2) -> bool:
"""True when the requested loopback host:port is already reachable from
inside the current container.
This distinguishes "a model server running alongside Odysseus in the same
container" from "a model server running on the Docker host". Only the
latter should be rewritten to host.docker.internal.
"""
try:
parsed = urlparse(base_url)
except Exception:
return False
host = (parsed.hostname or "").lower()
port = parsed.port
if host not in _LOOPBACK_HOSTS or not port:
return False
probe_host = "::1" if host == "::1" else "127.0.0.1"
family = socket.AF_INET6 if probe_host == "::1" else socket.AF_INET
try:
with socket.socket(family, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout)
sock.connect((probe_host, port))
return True
except OSError:
return False
def _rewrite_loopback_for_docker(base_url: str, *, container_local: bool = False) -> str:
"""Rewrite a loopback model-endpoint URL to ``host.docker.internal`` when
@@ -176,6 +202,8 @@ def _rewrite_loopback_for_docker(base_url: str, *, container_local: bool = False
if host in _ANY_BIND_HOSTS and not _docker_host_gateway_reachable():
netloc = "127.0.0.1" + (f":{parsed.port}" if parsed.port else "")
return urlunparse(parsed._replace(netloc=netloc))
if _container_loopback_reachable(base_url):
return base_url
if not _docker_host_gateway_reachable():
return base_url
netloc = "host.docker.internal" + (f":{parsed.port}" if parsed.port else "")

View File

@@ -246,10 +246,20 @@ function _selectedGgufExpr(model, repo, relPath) {
const base = String(model.path || '').replace(/\/+$/, '');
return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
}
if (model.path) {
const base = String(model.path || '').replace(/\/+$/, '');
return `$(printf %s ${_shellPathExpr(`${base}/models--${repo.replace(/\//g, '--')}/snapshots/${rel}`)})`;
}
const cacheRepo = repo.replace(/\//g, '--');
return `$(printf %s \${HOME}${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`)})`;
}
/**
 * Build the shell word naming the directory to search for a repo's GGUF files.
 *
 * - Local-dir model with a path: "<path>/<repo>".
 * - Any other model with a path: "<path>/models--<org>--<name>/snapshots".
 * - No path: the same cache layout under $HOME/.cache/huggingface/hub.
 *
 * Trailing slashes on model.path are dropped before joining.
 *
 * @param {object} model Scanned model entry; reads `path` and `is_local_dir`.
 * @param {string} repo  Repository id; every "/" becomes "--" in the cache name.
 * @returns {string} A quoted shell expression for the search directory.
 */
function _ggufSearchDirExpr(model, repo) {
  const base = String(model.path || '').replace(/\/+$/, '');
  if (model.is_local_dir && model.path) return _shellQuote(`${base}/${repo}`);

  const cacheRepo = repo.replace(/\//g, '--');
  if (model.path) return _shellQuote(`${base}/models--${cacheRepo}/snapshots`);

  // Keep $HOME expandable by the shell, but pass the repo-derived suffix
  // through _shellQuote like the branches above instead of interpolating it
  // raw inside a double-quoted string.
  // NOTE(review): assumes _shellQuote returns one self-contained quoted word
  // that can be concatenated after "$HOME" — confirm against its definition.
  return `"$HOME"${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots`)}`;
}
function _rerenderCachedModels() {
const list = document.getElementById('hwfit-cached-list');
const tagContainer = document.getElementById('serve-tags');
@@ -736,13 +746,12 @@ function _rerenderCachedModels() {
// For multi-part GGUFs, llama.cpp requires the first split
// (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
// before Q4_K_M/001 etc); fall back to any single GGUF sorted.
// Use $HOME (not ~) so tilde survives variable interpolation inside $(...).
const dir = `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
const dir = _ggufSearchDirExpr(m, repo);
// GGUF needs the actual .gguf FILE, not the folder. For a custom-dir
// model the file lives under "<path>/<repo>" — search there just like we
// search the HF snapshots dir, so serving a GGUF from a custom dir works
// instead of handing llama.cpp a directory (which fails).
const _ldir = `"${m.path}/${repo}"`;
const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
f._gguf_path = selectedGguf
? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
: m.is_local_dir && m.path

View File

@@ -198,11 +198,20 @@ class TestPingEndpoint:
class TestDockerLoopbackRewrite:
def test_manual_loopback_rewrites_to_docker_host_when_available(self, monkeypatch):
    """A loopback URL is rewritten to host.docker.internal when the Docker
    host gateway is reported reachable and the in-container probe reports
    nothing reachable."""
    stubs = {
        "_docker_host_gateway_reachable": lambda: True,
        "_container_loopback_reachable": lambda base_url: False,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(model_routes, attr_name, stub)

    rewritten = _rewrite_loopback_for_docker("http://localhost:8000/v1")

    assert rewritten == "http://host.docker.internal:8000/v1"
def test_reachable_container_loopback_stays_local_even_without_container_flag(self, monkeypatch):
    """A loopback URL is returned unchanged when the in-container probe
    reports it reachable, even though the Docker host gateway is also
    reported reachable and no container_local flag is passed."""
    stubs = {
        "_docker_host_gateway_reachable": lambda: True,
        "_container_loopback_reachable": lambda base_url: True,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(model_routes, attr_name, stub)

    original_url = "http://127.0.0.1:8001/v1"
    rewritten = _rewrite_loopback_for_docker("http://127.0.0.1:8001/v1")

    assert rewritten == original_url
def test_cookbook_container_local_loopback_stays_inside_container(self, monkeypatch):
monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)
assert (