Drop GPU-only flags from the CPU-only (-ngl 0) serve command (#1433)
A CPU-only llama.cpp serve config (-ngl 0) still emitted --flash-attn on and exported GGML_CUDA_ENABLE_UNIFIED_MEMORY=1. Both come from independent toggles that an Auto profile often leaves on, so the generated command mixed "zero GPU layers" with CUDA unified memory and flash-attn and failed to start (issue #1291). Gate both on a _cpuOnly check (ngl == 0). GPU serving is unchanged — the gate only affects the ngl=0 path.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -401,13 +401,17 @@ export function _buildServeCmd(f, modelName, backend) {
|
|||||||
const ggufPath = f._gguf_path || 'model.gguf';
|
const ggufPath = f._gguf_path || 'model.gguf';
|
||||||
const gpuId = f.gpu_id?.trim() || '';
|
const gpuId = f.gpu_id?.trim() || '';
|
||||||
const py = _isWindows() ? 'python' : 'python3';
|
const py = _isWindows() ? 'python' : 'python3';
|
||||||
|
// CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
|
||||||
|
// mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
|
||||||
|
// start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
|
||||||
|
const _cpuOnly = String(f.ngl).trim() === '0';
|
||||||
const lcPrefix = (() => {
|
const lcPrefix = (() => {
|
||||||
let p = '';
|
let p = '';
|
||||||
if (f.unified_mem && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
|
if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
|
||||||
if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
|
if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
|
||||||
return p;
|
return p;
|
||||||
})();
|
})();
|
||||||
if (f.unified_mem && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
|
if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
|
||||||
if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
|
if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
|
||||||
if (!_isWindows()) {
|
if (!_isWindows()) {
|
||||||
// Resolve GGUF path once, fail loudly if nothing matched (prevents
|
// Resolve GGUF path once, fail loudly if nothing matched (prevents
|
||||||
@@ -439,7 +443,7 @@ export function _buildServeCmd(f, modelName, backend) {
|
|||||||
_lcExtra += ` --n-cpu-moe ${_ncm}`;
|
_lcExtra += ` --n-cpu-moe ${_ncm}`;
|
||||||
_lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
|
_lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
|
||||||
}
|
}
|
||||||
if (f.flash_attn) {
|
if (f.flash_attn && !_cpuOnly) {
|
||||||
_lcExtra += ' --flash-attn on';
|
_lcExtra += ' --flash-attn on';
|
||||||
_lcpExtra += ' --flash_attn true';
|
_lcpExtra += ' --flash_attn true';
|
||||||
}
|
}
|
||||||
|
|||||||
30
tests/test_cookbook_cpu_only_serve.py
Normal file
30
tests/test_cookbook_cpu_only_serve.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
"""Regression guard for issue #1291 — CPU-only serve still emitted GPU-only flags.
|
||||||
|
|
||||||
|
The llama.cpp serve command builder (static/js/cookbook.js) added
|
||||||
|
`--flash-attn on` and exported `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` from
|
||||||
|
independent toggles, so a CPU-only config (`-ngl 0`, often with flash-attn left
|
||||||
|
on by an Auto profile) produced a command that mixes "zero GPU layers" with
|
||||||
|
CUDA/flash-attn and fails to start. The builder now drops those GPU-only flags
|
||||||
|
when ngl == 0, per the maintainer's guidance.
|
||||||
|
|
||||||
|
cookbook.js pulls in browser globals so it can't run under node; guard the fix
|
||||||
|
at the source level: a `_cpuOnly` gate exists and is applied to flash-attn and
|
||||||
|
the CUDA unified-memory env.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js"
|
||||||
|
|
||||||
|
|
||||||
|
def test_cpu_only_drops_gpu_only_flags():
    """Source-level guard that the CPU-only (ngl == 0) gate is still in place.

    Reads the serve-command builder source from ``SRC`` and checks, by
    pattern matching on the JavaScript text, that:

    * a ``_cpuOnly`` flag is derived from ``String(f.ngl).trim() === '0'``;
    * the flash-attn branch is conditioned on ``!_cpuOnly``;
    * the ``f.unified_mem`` checks are conditioned on ``!_cpuOnly`` at
      both places they occur.

    Raises:
        AssertionError: if any of the gates is missing from the source.
    """
    text = SRC.read_text(encoding="utf-8")

    # A CPU-only flag derived from ngl == 0.
    assert re.search(r"_cpuOnly\s*=\s*String\(f\.ngl\)\.trim\(\)\s*===\s*'0'", text), \
        "expected a _cpuOnly gate derived from ngl==0"

    # flash-attn must be suppressed for CPU-only.
    assert re.search(r"if\s*\(\s*f\.flash_attn\s*&&\s*!_cpuOnly\s*\)", text), \
        "flash-attn must be gated on !_cpuOnly"

    # The CUDA unified-memory env is emitted from two separate sites: the
    # POSIX env-var prefix and the PowerShell `$env:` assignment. A single
    # substring hit would let one of them lose its gate unnoticed, so require
    # both, and tolerate reformatting around the operators.
    unified_mem_gates = re.findall(r"f\.unified_mem\s*&&\s*!_cpuOnly", text)
    assert len(unified_mem_gates) >= 2, \
        "GGML_CUDA_ENABLE_UNIFIED_MEMORY must be gated on !_cpuOnly " \
        "for both the POSIX prefix and the Windows $env: export"
|
||||||
Reference in New Issue
Block a user