diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py
index 30f99e7..9dc232a 100644
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -541,21 +541,15 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
     runner_lines.append(' export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
     runner_lines.append(' fi')
     runner_lines.append(' echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
-    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON \\\\')
-    runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
-    runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
     runner_lines.append(' elif command -v nvcc &>/dev/null; then')
     runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
-    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\\\')
-    runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
-    runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
     runner_lines.append(' else')
     runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
     runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
     runner_lines.append(' echo "[odysseus] Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
-    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\\\')
-    runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
-    runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
     runner_lines.append(' fi')
 
 class ModelDownloadRequest(BaseModel):
diff --git a/routes/model_routes.py b/routes/model_routes.py
index f66cdd6..0135d1c 100644
--- a/routes/model_routes.py
+++ b/routes/model_routes.py
@@ -148,6 +148,32 @@ def _docker_host_gateway_reachable() -> bool:
     except OSError:
         return False
 
+def _container_loopback_reachable(base_url: str, timeout: float = 0.2) -> bool:
+    """True when the requested loopback host:port is already reachable from
+    inside the current container.
+
+    This distinguishes "a model server running alongside Odysseus in the same
+    container" from "a model server running on the Docker host". Only the
+    latter should be rewritten to host.docker.internal.
+    """
+    try:
+        parsed = urlparse(base_url)
+        port = parsed.port  # validated lazily: raises ValueError on a malformed port
+    except Exception:
+        return False
+    host = (parsed.hostname or "").lower()
+    if host not in _LOOPBACK_HOSTS or not port:
+        return False
+    probe_host = "::1" if host == "::1" else "127.0.0.1"
+    family = socket.AF_INET6 if probe_host == "::1" else socket.AF_INET
+    try:
+        with socket.socket(family, socket.SOCK_STREAM) as sock:
+            sock.settimeout(timeout)
+            sock.connect((probe_host, port))
+            return True
+    except OSError:
+        return False
+
 
 def _rewrite_loopback_for_docker(base_url: str, *, container_local: bool = False) -> str:
     """Rewrite a loopback model-endpoint URL to ``host.docker.internal`` when
@@ -176,6 +202,8 @@ def _rewrite_loopback_for_docker(base_url: str, *, container_local: bool = False
     if host in _ANY_BIND_HOSTS and not _docker_host_gateway_reachable():
         netloc = "127.0.0.1" + (f":{parsed.port}" if parsed.port else "")
         return urlunparse(parsed._replace(netloc=netloc))
+    if _container_loopback_reachable(base_url):
+        return base_url
     if not _docker_host_gateway_reachable():
         return base_url
     netloc = "host.docker.internal" + (f":{parsed.port}" if parsed.port else "")
diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index 6e71791..0c2fcd2 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -246,10 +246,20 @@ function _selectedGgufExpr(model, repo, relPath) {
         const base = String(model.path || '').replace(/\/+$/, '');
         return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
     }
+    if (model.path) {
+        const base = String(model.path || '').replace(/\/+$/, '');
+        return `$(printf %s ${_shellPathExpr(`${base}/models--${repo.replace(/\//g, '--')}/snapshots/${rel}`)})`;
+    }
     const cacheRepo = repo.replace(/\//g, '--');
     return `$(printf %s \${HOME}${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`)})`;
 }
 
+function _ggufSearchDirExpr(model, repo) {
+    if (model.is_local_dir && model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/${repo}`);
+    if (model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/models--${repo.replace(/\//g, '--')}/snapshots`);
+    return `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
+}
+
 function _rerenderCachedModels() {
     const list = document.getElementById('hwfit-cached-list');
     const tagContainer = document.getElementById('serve-tags');
@@ -736,13 +746,12 @@ function _rerenderCachedModels() {
             // For multi-part GGUFs, llama.cpp requires the first split
             // (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
             // before Q4_K_M/001 etc); fall back to any single GGUF sorted.
-            // Use $HOME (not ~) so tilde survives variable interpolation inside $(...).
-            const dir = `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
+            const dir = _ggufSearchDirExpr(m, repo);
             // GGUF needs the actual .gguf FILE, not the folder. For a custom-dir
             // model the file lives under "/" — search there just like we
             // search the HF snapshots dir, so serving a GGUF from a custom dir works
             // instead of handing llama.cpp a directory (which fails).
-            const _ldir = `"${m.path}/${repo}"`;
+            const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
             f._gguf_path = selectedGguf
                 ? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
                 : m.is_local_dir && m.path
diff --git a/tests/test_endpoint_probing.py b/tests/test_endpoint_probing.py
index aab4c52..0c7a2ca 100644
--- a/tests/test_endpoint_probing.py
+++ b/tests/test_endpoint_probing.py
@@ -198,11 +198,20 @@ class TestPingEndpoint:
 class TestDockerLoopbackRewrite:
     def test_manual_loopback_rewrites_to_docker_host_when_available(self, monkeypatch):
         monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)
+        monkeypatch.setattr(model_routes, "_container_loopback_reachable", lambda base_url: False)
         assert (
             _rewrite_loopback_for_docker("http://localhost:8000/v1")
             == "http://host.docker.internal:8000/v1"
         )
 
+    def test_reachable_container_loopback_stays_local_even_without_container_flag(self, monkeypatch):
+        monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)
+        monkeypatch.setattr(model_routes, "_container_loopback_reachable", lambda base_url: True)
+        assert (
+            _rewrite_loopback_for_docker("http://127.0.0.1:8001/v1")
+            == "http://127.0.0.1:8001/v1"
+        )
+
     def test_cookbook_container_local_loopback_stays_inside_container(self, monkeypatch):
         monkeypatch.setattr(model_routes, "_docker_host_gateway_reachable", lambda: True)
         assert (