From 0e6cbd83159018bc280ae35f463b9f9ab74b3729 Mon Sep 17 00:00:00 2001
From: lekt8
Date: Wed, 3 Jun 2026 03:26:15 +0800
Subject: [PATCH] Drop GPU-only flags from the CPU-only (-ngl 0) serve command (#1433)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A CPU-only llama.cpp serve config still emitted --flash-attn on and
exported GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 (independent toggles, often
left on by an Auto profile), so the command mixed "zero GPU layers" with
CUDA/flash-attn and failed to start (issue #1291). Gate both on a
_cpuOnly check (ngl == 0). GPU serving is unchanged — the gate only
affects the ngl=0 path.

Co-authored-by: Claude Opus 4.8 (1M context)
---
 static/js/cookbook.js                 | 10 ++++++---
 tests/test_cookbook_cpu_only_serve.py | 30 +++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_cookbook_cpu_only_serve.py

diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index ce17ecc..fe8b073 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -401,13 +401,17 @@ export function _buildServeCmd(f, modelName, backend) {
   const ggufPath = f._gguf_path || 'model.gguf';
   const gpuId = f.gpu_id?.trim() || '';
   const py = _isWindows() ? 'python' : 'python3';
+  // CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
+  // mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
+  // start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
+  const _cpuOnly = String(f.ngl).trim() === '0';
   const lcPrefix = (() => {
     let p = '';
-    if (f.unified_mem && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
+    if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
     if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
     return p;
   })();
-  if (f.unified_mem && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
+  if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
   if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
   if (!_isWindows()) {
     // Resolve GGUF path once, fail loudly if nothing matched (prevents
@@ -439,7 +443,7 @@ export function _buildServeCmd(f, modelName, backend) {
     _lcExtra += ` --n-cpu-moe ${_ncm}`;
     _lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
   }
-  if (f.flash_attn) {
+  if (f.flash_attn && !_cpuOnly) {
     _lcExtra += ' --flash-attn on';
     _lcpExtra += ' --flash_attn true';
   }
diff --git a/tests/test_cookbook_cpu_only_serve.py b/tests/test_cookbook_cpu_only_serve.py
new file mode 100644
index 0000000..ad4b795
--- /dev/null
+++ b/tests/test_cookbook_cpu_only_serve.py
@@ -0,0 +1,30 @@
+"""Regression guard for issue #1291 — CPU-only serve still emitted GPU-only flags.
+
+The llama.cpp serve command builder (static/js/cookbook.js) added
+`--flash-attn on` and exported `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` from
+independent toggles, so a CPU-only config (`-ngl 0`, often with flash-attn left
+on by an Auto profile) produced a command that mixes "zero GPU layers" with
+CUDA/flash-attn and fails to start. The builder now drops those GPU-only flags
+when ngl == 0, per the maintainer's guidance.
+
+cookbook.js pulls in browser globals so it can't run under node; guard the fix
+at the source level: a `_cpuOnly` gate exists and is applied to flash-attn and
+the CUDA unified-memory env.
+"""
+import re
+from pathlib import Path
+
+SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js"
+
+
+def test_cpu_only_drops_gpu_only_flags():
+    text = SRC.read_text(encoding="utf-8")
+    # A CPU-only flag derived from ngl == 0.
+    assert re.search(r"_cpuOnly\s*=\s*String\(f\.ngl\)\.trim\(\)\s*===\s*'0'", text), \
+        "expected a _cpuOnly gate derived from ngl==0"
+    # flash-attn must be suppressed for CPU-only.
+    assert re.search(r"if\s*\(\s*f\.flash_attn\s*&&\s*!_cpuOnly\s*\)", text), \
+        "flash-attn must be gated on !_cpuOnly"
+    # The CUDA unified-memory env must be suppressed for CPU-only too.
+    assert "f.unified_mem && !_cpuOnly" in text, \
+        "GGML_CUDA_ENABLE_UNIFIED_MEMORY must be gated on !_cpuOnly"