fix: Cookbook local GGUF serving inside Docker (#1264)

* fix: Cookbook local GGUF serving inside Docker

  Cookbook’s in-container GGUF serve flow had multiple Docker-specific breakages that made local llama.cpp models fail or register against the wrong endpoint.

  Fixes included here:
  - use the scanned model cache root when generating GGUF serve commands instead of hardcoding $HOME/.cache/huggingface/hub
  - fix malformed llama.cpp preflight build lines that generated invalid bash in serve runner scripts
  - preserve loopback model URLs inside Docker when the target port is already reachable from the Odysseus container, instead of rewriting them unconditionally to host.docker.internal

  Before this change, Docker local serves could fail in several ways:
  - Cookbook pointed llama.cpp at the wrong GGUF path
  - generated serve runner scripts crashed before launch with a shell syntax error
  - successfully started in-container model servers were auto-registered as host.docker.internal:PORT instead of localhost/127.0.0.1

  This makes the Docker Cookbook path work as expected for: downloaded GGUF -> local llama.cpp serve -> endpoint registration

* test: add test for docker-local endpoint rewrites
2026-06-02 19:08:09 +02:00
parent dc6711b3c5
commit e392be0d65
4 changed files with 52 additions and 12 deletions
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -541,21 +541,15 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
    runner_lines.append('        export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
    runner_lines.append('      fi')
    runner_lines.append('      echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
-    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON \\\\')
-    runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\\\')
-    runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    elif command -v nvcc &>/dev/null; then')
    runner_lines.append('      echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
-    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\\\')
-    runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\\\')
-    runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    else')
    runner_lines.append('      echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
    runner_lines.append('      echo "[odysseus]   GPU inference will not be available for this llama.cpp build."')
    runner_lines.append('      echo "[odysseus]   Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
-    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release \\\\')
-    runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\\\')
-    runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    fi')

 class ModelDownloadRequest(BaseModel):
--- a/routes/model_routes.py
+++ b/routes/model_routes.py
@@ -148,6 +148,32 @@ def _docker_host_gateway_reachable() -> bool:
    except OSError:
        return False

+def _container_loopback_reachable(base_url: str, timeout: float = 0.2) -> bool:
+    """True when the requested loopback host:port is already reachable from
+    inside the current container (a TCP connect within ``timeout`` seconds).
+
+    This distinguishes "a model server running alongside Odysseus in the same
+    container" from "a model server running on the Docker host". Only the
+    latter should be rewritten to host.docker.internal.
+
+    Returns False for non-loopback hosts, URLs without a port, and URLs whose
+    host or port cannot be parsed.
+    """
+    try:
+        parsed = urlparse(base_url)
+        host = (parsed.hostname or "").lower()
+        # Reading ``.port`` raises ValueError for a non-numeric or out-of-range
+        # port, so it has to sit inside the guarded block as well.
+        port = parsed.port
+    except (AttributeError, TypeError, ValueError):
+        return False
+    if host not in _LOOPBACK_HOSTS or not port:
+        return False
+    # "localhost" and 127.0.0.1 are probed over IPv4; only a literal ::1 uses IPv6.
+    probe_host = "::1" if host == "::1" else "127.0.0.1"
+    family = socket.AF_INET6 if probe_host == "::1" else socket.AF_INET
+    try:
+        with socket.socket(family, socket.SOCK_STREAM) as sock:
+            sock.settimeout(timeout)
+            sock.connect((probe_host, port))
+        return True
+    except OSError:
+        # Refused, timed out, or address family unavailable: treat as unreachable.
+        return False
+

 def _rewrite_loopback_for_docker(base_url: str, *, container_local: bool = False) -> str:
    """Rewrite a loopback model-endpoint URL to ``host.docker.internal`` when
@@ -176,6 +202,8 @@ def _rewrite_loopback_for_docker(base_url: str, *, container_local: bool = False
    if host in _ANY_BIND_HOSTS and not _docker_host_gateway_reachable():
        netloc = "127.0.0.1" + (f":{parsed.port}" if parsed.port else "")
        return urlunparse(parsed._replace(netloc=netloc))
+    if _container_loopback_reachable(base_url):
+        return base_url
    if not _docker_host_gateway_reachable():
        return base_url
    netloc = "host.docker.internal" + (f":{parsed.port}" if parsed.port else "")
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -246,10 +246,20 @@ function _selectedGgufExpr(model, repo, relPath) {
    const base = String(model.path || '').replace(/\/+$/, '');
    return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
  }
+  if (model.path) {
+    const base = String(model.path || '').replace(/\/+$/, '');
+    return `$(printf %s ${_shellPathExpr(`${base}/models--${repo.replace(/\//g, '--')}/snapshots/${rel}`)})`;
+  }
  const cacheRepo = repo.replace(/\//g, '--');
  return `$(printf %s \${HOME}${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`)})`;
 }

+function _ggufSearchDirExpr(model, repo) { // Builds the shell word naming the directory to search for this repo's .gguf files.
+  if (model.is_local_dir && model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/${repo}`); // local-dir model: <path>/<repo>, trailing slashes on path stripped
+  if (model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/models--${repo.replace(/\//g, '--')}/snapshots`); // path set but not a local dir: <path>/models--<repo with "/" turned into "--">/snapshots
+  return `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`; // no path: default hub cache; double-quoted rather than _shellQuote'd, presumably so the shell expands $HOME — TODO confirm
+}
+
 function _rerenderCachedModels() {
  const list = document.getElementById('hwfit-cached-list');
  const tagContainer = document.getElementById('serve-tags');
@@ -736,13 +746,12 @@ function _rerenderCachedModels() {
          // For multi-part GGUFs, llama.cpp requires the first split
          // (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
          // before Q4_K_M/001 etc); fall back to any single GGUF sorted.
-          // Use $HOME (not ~) so tilde survives variable interpolation inside $(...).
-          const dir = `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
+          const dir = _ggufSearchDirExpr(m, repo);
          // GGUF needs the actual .gguf FILE, not the folder. For a custom-dir
          // model the file lives under "<path>/<repo>" — search there just like we
          // search the HF snapshots dir, so serving a GGUF from a custom dir works
          // instead of handing llama.cpp a directory (which fails).
-          const _ldir = `"${m.path}/${repo}"`;
+          const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
          f._gguf_path = selectedGguf
            ? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
            : m.is_local_dir && m.path
--- a/tests/test_endpoint_probing.py
+++ b/tests/test_endpoint_probing.py
@@ -198,11 +198,20 @@ class TestPingEndpoint:
 class TestDockerLoopbackRewrite:
    def test_manual_loopback_rewrites_to_docker_host_when_available(self, monkeypatch):
        monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)  # pretend the Docker host gateway answers
+        monkeypatch.setattr(model_routes, "_container_loopback_reachable", lambda base_url: False)  # nothing is listening on the port inside the container
        assert (  # with no in-container listener, localhost is rewritten to the host gateway
            _rewrite_loopback_for_docker("http://localhost:8000/v1")
            == "http://host.docker.internal:8000/v1"
        )

+    def test_reachable_container_loopback_stays_local_even_without_container_flag(self, monkeypatch):
+        monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)  # gateway is up, so the rewrite path is otherwise open
+        monkeypatch.setattr(model_routes, "_container_loopback_reachable", lambda base_url: True)  # the port answers inside the container
+        assert (  # called without container_local: the URL must still come back unchanged
+            _rewrite_loopback_for_docker("http://127.0.0.1:8001/v1")
+            == "http://127.0.0.1:8001/v1"
+        )
+
    def test_cookbook_container_local_loopback_stays_inside_container(self, monkeypatch):
        monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)
        assert (