Add a 'Rebuild llama.cpp' Cookbook action to force a fresh GPU build (#1787)
The serve bootstrap builds llama-server from source only when it is missing from PATH, so a host that first compiled CPU-only (no nvcc present at build time) reuses that CPU-only binary on every later serve and never gets a GPU build, even after a CUDA/ROCm toolkit is installed. There was no UI lever to force a rebuild.

Adds a 'Rebuild llama.cpp' button to the Cookbook Dependencies tab. It clears the cached ~/bin/llama-server symlink and ~/llama.cpp/build directory (locally or on the selected remote server) so the next serve recompiles and picks up CUDA/HIP if a toolchain is now present. It installs and downloads nothing.

- routes/cookbook_helpers.py: _llama_cpp_rebuild_cmd() (single source of truth)
- routes/shell_routes.py: POST /api/cookbook/rebuild-engine (admin-only, reuses the existing SSH plumbing for remote hosts)
- static/js/cookbook.js: header button + handler honoring the deps server selector
- tests: cover the command shape and a clean run on a fresh HOME

Motivated by #831 (RTX 4070 user stuck on a CPU-only build with no way to re-trigger the build).

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
@@ -552,6 +552,27 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
|
||||
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
runner_lines.append(' fi')
|
||||
|
||||
|
||||
def _llama_cpp_rebuild_cmd() -> str:
|
||||
"""Shell command that clears the Cookbook-managed llama.cpp build.
|
||||
|
||||
Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build``
|
||||
directory so the next llama.cpp serve recompiles from source, picking up a
|
||||
CUDA or HIP toolchain if one is now available. The serve bootstrap only
|
||||
builds when ``llama-server`` is missing from PATH, so without this an
|
||||
existing CPU-only build is reused forever. It deliberately installs and
|
||||
downloads nothing; the rebuild itself happens on the next serve.
|
||||
"""
|
||||
return (
|
||||
'mkdir -p "$HOME/bin" && '
|
||||
'rm -f "$HOME/bin/llama-server" && '
|
||||
'rm -rf "$HOME/llama.cpp/build" && '
|
||||
'echo "[odysseus] Cleared the cached llama.cpp build. '
|
||||
'Re-launch the serve task to rebuild llama-server from source '
|
||||
'(CUDA or HIP will be used if a toolchain is now available)."'
|
||||
)
|
||||
|
||||
|
||||
class ModelDownloadRequest(BaseModel):
|
||||
repo_id: str
|
||||
include: str | None = None # glob pattern e.g. "*Q4_K_M*"
|
||||
|
||||
@@ -1058,4 +1058,39 @@ def setup_shell_routes() -> APIRouter:
|
||||
return {"ok": True, "output": stdout.decode()[-200:]}
|
||||
return {"ok": False, "error": stderr.decode()[-300:]}
|
||||
|
||||
@router.post("/api/cookbook/rebuild-engine")
async def rebuild_engine(request: Request):
    """Clear the cached llama.cpp build so the next serve recompiles.

    Admin only — this removes the Cookbook-managed ``~/bin/llama-server``
    symlink and ``~/llama.cpp/build`` directory, locally or on the selected
    remote server. It installs and downloads nothing; the next llama.cpp
    serve rebuilds from source and picks up CUDA/HIP if a toolchain is now
    present. This is the missing "force a fresh GPU build" lever for hosts
    stuck on a CPU-only llama-server.

    Request JSON (all keys optional):
        engine: only ``"llamacpp"`` is supported (also the default).
        remote_host: when non-empty, the command is run over SSH on that
            host; otherwise it runs locally via ``bash -lc``.
        ssh_port: forwarded unchanged to ``_ssh_base_argv``.

    Returns:
        ``{"ok": True, "output": ...}`` with the last 400 characters of
        stdout on success, or ``{"ok": False, "error": ...}`` on an
        unsupported engine, spawn failure, timeout, or non-zero exit.

    Raises:
        HTTPException: 400 when the body is not a JSON object or when
            ``_ssh_base_argv`` rejects the host/port with ``ValueError``.
            ``_require_admin`` is presumably what rejects non-admin
            callers — confirm against its definition.
    """
    _require_admin(request)
    from routes.cookbook_helpers import _llama_cpp_rebuild_cmd

    # Reject malformed payloads with a 400 instead of crashing into a 500.
    try:
        body = await request.json()
    except ValueError as e:
        raise HTTPException(400, "Request body must be valid JSON.") from e
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object.")

    engine = str(body.get("engine") or "llamacpp").strip()
    if engine != "llamacpp":
        return {"ok": False, "error": f"Unsupported engine: {engine}"}

    host = str(body.get("remote_host") or "").strip()
    ssh_port = body.get("ssh_port")
    cmd = _llama_cpp_rebuild_cmd()
    try:
        argv = (_ssh_base_argv(host, ssh_port) + [cmd]) if host else ["bash", "-lc", cmd]
    except ValueError as e:
        raise HTTPException(400, str(e)) from e

    try:
        proc = await asyncio.create_subprocess_exec(
            *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
    except OSError as e:
        # e.g. the ssh/bash binary is missing or not executable on this host.
        return {"ok": False, "error": f"Could not start rebuild-engine command: {e}"}

    try:
        out, err = await asyncio.wait_for(proc.communicate(), timeout=30)
    except asyncio.TimeoutError:
        # wait_for only cancels communicate(); without an explicit kill the
        # child keeps running and is never reaped.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # The child exited between the timeout and the kill.
        await proc.wait()
        return {"ok": False, "error": "Rebuild-engine command timed out."}

    stdout_tail = out.decode("utf-8", errors="replace")[-400:]
    if proc.returncode == 0:
        return {"ok": True, "output": stdout_tail}

    error = err.decode("utf-8", errors="replace")[-400:]
    if not error.strip():
        # Never hand the UI an empty error: fall back to stdout, then to the
        # exit status, so the failure toast always carries a reason.
        error = stdout_tail.strip() or f"Command exited with status {proc.returncode}."
    return {"ok": False, "error": error}
|
||||
|
||||
return router
|
||||
|
||||
@@ -1018,6 +1018,51 @@ function _wireTabEvents(body) {
|
||||
});
|
||||
}
|
||||
|
||||
// "Rebuild llama.cpp" clears the cached build so the next serve recompiles.
|
||||
// The serve bootstrap only builds llama-server when it is missing from PATH,
|
||||
// so a host that first built CPU-only (no nvcc at build time) keeps reusing
|
||||
// that binary forever; this is the lever to force a fresh GPU build after a
|
||||
// CUDA/ROCm toolkit is installed.
|
||||
const rebuildBtn = document.getElementById('cookbook-rebuild-engine');
|
||||
if (rebuildBtn && !rebuildBtn._wired) {
|
||||
rebuildBtn._wired = true;
|
||||
rebuildBtn.addEventListener('click', async () => {
|
||||
// Match _installDep: honor the Dependencies server selector so the clear
|
||||
// runs on the same host the build runs on.
|
||||
const sel = document.getElementById('hwfit-deps-server');
|
||||
if (sel) _applyServerSelection(sel.value);
|
||||
const host = _envState.remoteHost || '';
|
||||
const where = host || 'this server';
|
||||
if (!confirm(`Rebuild the llama.cpp engine on ${where}?\n\nThis clears the cached llama-server build so the next serve recompiles from source (with CUDA/HIP if a toolchain is present). It does not download or install anything.`)) return;
|
||||
const _label = rebuildBtn.textContent;
|
||||
rebuildBtn.disabled = true;
|
||||
rebuildBtn.textContent = 'Clearing...';
|
||||
try {
|
||||
const res = await fetch('/api/cookbook/rebuild-engine', {
|
||||
method: 'POST', credentials: 'same-origin',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
engine: 'llamacpp',
|
||||
remote_host: host || undefined,
|
||||
ssh_port: _getPort(host) || undefined,
|
||||
}),
|
||||
});
|
||||
const data = await res.json().catch(() => ({}));
|
||||
if (!res.ok || !data.ok) {
|
||||
const reason = data.detail || data.error || `HTTP ${res.status}`;
|
||||
uiModule.showToast('Rebuild failed: ' + String(reason).slice(0, 200));
|
||||
} else {
|
||||
uiModule.showToast(`Cleared llama.cpp build on ${where}. Re-launch the serve task to rebuild with GPU support.`);
|
||||
}
|
||||
} catch (err) {
|
||||
uiModule.showToast('Rebuild failed: ' + err.message);
|
||||
} finally {
|
||||
rebuildBtn.disabled = false;
|
||||
rebuildBtn.textContent = _label;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Serve sort
|
||||
const serveSort = document.getElementById('serve-sort');
|
||||
if (serveSort) {
|
||||
@@ -1616,6 +1661,7 @@ function _renderRecipes() {
|
||||
html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
|
||||
html += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px;">';
|
||||
html += '<h2 style="margin:0;padding:0;line-height:1;">Dependencies</h2>';
|
||||
html += '<button class="cookbook-field-input" id="cookbook-rebuild-engine" title="Clear the cached llama.cpp build so the next serve recompiles from source (use after installing a CUDA/ROCm toolkit to turn a CPU-only build into a GPU build)." style="height:24px;font-size:10px;padding:0 8px;cursor:pointer;width:auto;">Rebuild llama.cpp</button>';
|
||||
html += '<span style="font-size:10px;opacity:0.5;margin-left:auto;">Server</span>';
|
||||
html += '<select class="cookbook-field-input" id="hwfit-deps-server" style="height:28px;min-width:70px;">';
|
||||
html += _buildServerOpts(false);
|
||||
|
||||
@@ -10,6 +10,7 @@ from routes.cookbook_helpers import (
|
||||
_append_llama_cpp_linux_accel_build_lines,
|
||||
_append_serve_exit_code_lines,
|
||||
_append_serve_preflight_exit_lines,
|
||||
_llama_cpp_rebuild_cmd,
|
||||
_local_tooling_path_export,
|
||||
_pip_install_attempt,
|
||||
_pip_install_fallback_chain,
|
||||
@@ -338,6 +339,38 @@ def test_llama_cpp_linux_bootstrap_keeps_cpu_fallback_when_no_gpu_toolchain():
|
||||
assert 'WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only.' in script
|
||||
assert 'Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA' in script
|
||||
|
||||
|
||||
def test_llama_cpp_rebuild_cmd_clears_cached_build_paths():
    """The rebuild command removes the cached build and fetches nothing."""
    cmd = _llama_cpp_rebuild_cmd()

    required_fragments = (
        # Both the cached symlink and the build dir the serve bootstrap
        # links/creates must go, so the next serve recompiles from source.
        'rm -f "$HOME/bin/llama-server"',
        'rm -rf "$HOME/llama.cpp/build"',
        # ~/bin is recreated so a never-served host does not error on a
        # missing directory.
        'mkdir -p "$HOME/bin"',
    )
    for fragment in required_fragments:
        assert fragment in cmd

    # The command only clears state: it must not install or download.
    forbidden_fragments = ('pip install', 'git clone', 'curl', 'wget')
    for fragment in forbidden_fragments:
        assert fragment not in cmd
|
||||
|
||||
|
||||
def test_llama_cpp_rebuild_cmd_runs_clean_on_a_fresh_home(tmp_path):
    """Running the command in an empty HOME exits 0 and creates ``bin``.

    Neither ``bin/llama-server`` nor ``llama.cpp/build`` exists under
    ``tmp_path``, so this covers the "nothing to clear yet" case.
    """
    import os

    # Point HOME at the empty temp dir while keeping the rest of the
    # environment, so bash and the coreutils still resolve.
    fresh_env = {**os.environ, "HOME": str(tmp_path)}

    completed = subprocess.run(
        ["bash", "-c", _llama_cpp_rebuild_cmd()],
        env=fresh_env,
        capture_output=True,
        text=True,
        timeout=10,
    )

    assert completed.returncode == 0, completed.stderr
    assert (tmp_path / "bin").is_dir()
    assert "Cleared the cached llama.cpp build" in completed.stdout
|
||||
|
||||
|
||||
def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
|
||||
"""Custom download dirs may sit inside the HF hub cache and contain plain
|
||||
per-model folders. They must show up in Serve and keep the GGUF signal."""
|
||||
|
||||
Reference in New Issue
Block a user