Expose advanced llama.cpp serve controls

This commit is contained in:
spooky
2026-06-02 13:46:16 +10:00
committed by GitHub
parent 05fb48e9d5
commit 0f3280ee05
4 changed files with 92 additions and 0 deletions

View File

@@ -423,6 +423,14 @@ export function _buildServeCmd(f, modelName, backend) {
// speed things up. Only emitted when set, so manual/older flows are unchanged.
const _ncm = (f.n_cpu_moe ?? '').toString().trim();
const _kv = (f.cache_type ?? '').toString().trim();
// Normalize an integer-valued form field to a digits-only string.
// Returns '' when the value is null/undefined, blank, or anything other than
// a run of ASCII digits, so callers can skip emitting the flag.
// Uses ?? rather than || so a numeric 0 (e.g. a GPU index of 0 supplied as a
// number) is kept instead of being mistaken for "not set".
const _llamaNum = (v) => {
  const s = String(v ?? '').trim();
  return /^\d+$/.test(s) ? s : '';
};
// Normalize a comma-separated list of non-negative decimals (e.g. "50, 50").
// All whitespace is removed first; the compacted text is returned only when
// every entry is digits with an optional fractional part, otherwise ''.
const _llamaCsv = (v) => {
  const compact = String(v || '').replace(/\s+/g, '');
  const isDecimalList = /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(compact);
  if (!isDecimalList) {
    return '';
  }
  return compact;
};
let _lcExtra = '';
let _lcpExtra = '';
if (_ncm !== '' && Number(_ncm) > 0) {
@@ -438,6 +446,27 @@ export function _buildServeCmd(f, modelName, backend) {
// llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
_lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
}
// Advanced llama.cpp overrides. Every flag below is appended to _lcExtra only
// (presumably the native llama-server argument string — confirm against the
// code that consumes it); _lcpExtra is not touched here. A field that is blank
// or fails its check emits nothing.
// --fit: only the literal values "on" / "off" are passed through.
const _llamaFit = String(f.llama_fit || '').trim();
if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`;
// Boolean toggles: the flag is either present or absent and takes no argument.
if (f.llama_no_mmap) _lcExtra += ' --no-mmap';
if (f.llama_no_warmup) _lcExtra += ' --no-warmup';
// --split-mode: restricted to this fixed list so arbitrary text never reaches
// the command line.
const _llamaSplitMode = String(f.llama_split_mode || '').trim();
if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`;
// --tensor-split: _llamaCsv returns '' for input it rejects, so the flag is skipped.
const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split);
if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`;
// Integer-valued flags: _llamaNum returns '' for input it rejects, so the
// flag is skipped. A returned "0" is a non-empty string and is still emitted.
const _llamaMainGpu = _llamaNum(f.llama_main_gpu);
if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`;
const _llamaParallel = _llamaNum(f.llama_parallel);
if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`;
const _llamaBatch = _llamaNum(f.llama_batch_size);
if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`;
const _llamaUBatch = _llamaNum(f.llama_ubatch_size);
if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`;
// MTP speculative decoding: always emits both --spec-type and
// --spec-draft-n-max; the count falls back to 3 when llama_spec_tokens does
// not parse to a positive integer. No upper bound is applied here.
if (f.llama_speculative_mtp) {
const specTokens = parseInt(f.llama_spec_tokens, 10);
const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3;
_lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`;
}
// Vision: serve the multimodal projector so the model can read images. The
// mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
// only emitted when the Vision toggle is on AND a projector was found.

View File

@@ -1195,10 +1195,24 @@ function _parseServeCmdToFields(cmd) {
dtype: ex(/--dtype\s+(\w+)/) || 'auto',
max_seqs: ex(/--max-num-seqs\s+(\d+)/) || '',
gpus: ex(/CUDA_VISIBLE_DEVICES=(\S+)/) || '',
cache_type: ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
llama_fit: ex(/(?:--fit|-fit)\s+(on|off)/) || '',
llama_split_mode: ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
llama_tensor_split: ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
llama_main_gpu: ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
llama_parallel: ex(/(?:--parallel|-np)\s+(\d+)/) || '',
llama_batch_size: ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
llama_ubatch_size: ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
llama_spec_tokens: ex(/--spec-draft-n-max\s+(\d+)/) || '3',
enforce_eager: cmd.includes('--enforce-eager'),
trust_remote: cmd.includes('--trust-remote-code'),
prefix_cache: cmd.includes('--enable-prefix-caching'),
auto_tool: cmd.includes('--enable-auto-tool-choice'),
flash_attn: /--flash-attn\s+on\b/.test(cmd),
unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
llama_no_mmap: /--no-mmap\b/.test(cmd),
llama_no_warmup: /--no-warmup\b/.test(cmd),
llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
speculative: cmd.includes('--speculative-config'),
};
const spec = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);

View File

@@ -544,11 +544,25 @@ function _rerenderCachedModels() {
panelHtml += `</div>`;
// Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => `<option value="${k}"${sv('cache_type','')===k?' selected':''}>${k||'default'}</option>`).join('');
const llamaFitOpts = ['', 'off', 'on'].map(d => `<option value="${d}"${sv('llama_fit','')===d?' selected':''}>${d||'default'}</option>`).join('');
const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `<option value="${d}"${sv('llama_split_mode','')===d?' selected':''}>${d||'default'}</option>`).join('');
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;" /></label>`;
panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')}</label>`;
panelHtml += `<label>${_l('Fit','llama.cpp --fit. Leave default unless you need explicit off/on behavior for a preset.')}<select class="hwfit-sf" data-field="llama_fit">${llamaFitOpts}</select></label>`;
panelHtml += `</div>`;
// Row 2d: native llama-server placement/runtime controls. These are
// explicit overrides for known-good advanced presets; blank keeps
// llama.cpp/profile defaults.
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode">${llamaSplitModeOpts}</select></label>`;
panelHtml += `<label>${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="50,50" /></label>`;
panelHtml += `<label>${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
panelHtml += `<label>${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
panelHtml += `<label>${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
panelHtml += `</div>`;
// Row 2e: Auto profiles — computed from detected hardware (see profiles.py).
// Buttons are injected after the panel mounts (needs an async fetch).
@@ -566,6 +580,9 @@ function _rerenderCachedModels() {
// Row 3a: Checkboxes (llama.cpp-only)
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-llamacpp">`;
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="unified_mem"${sv('unified_mem',false)?' checked':''} /> Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_mmap"${sv('llama_no_mmap',false)?' checked':''} /> No mmap${_h('Adds --no-mmap for native llama-server. Useful for some high-context/local-storage setups, but not a universal default.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_warmup"${sv('llama_no_warmup',false)?' checked':''} /> Skip warmup${_h('Adds --no-warmup. Can reduce startup memory spikes for tight launches, but llama.cpp defaults to warming up.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="llama_speculative_mtp"${sv('llama_speculative_mtp',false)?' checked':''} /> MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads and a recent llama-server build.')} <span class="hwfit-numstep"><button type="button" class="hwfit-numstep-btn" data-step="-1" tabindex="-1" aria-label="Decrease"></button><input type="number" class="hwfit-sf hwfit-spec-tokens" data-field="llama_spec_tokens" value="${esc(sv('llama_spec_tokens', '3'))}" min="1" max="10" title="--spec-draft-n-max" /><button type="button" class="hwfit-numstep-btn" data-step="1" tabindex="-1" aria-label="Increase"></button></span></label>`;
panelHtml += `</div>`;
// Row 3b: Checkboxes (diffusers)
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-diffusers">`;
@@ -866,6 +883,15 @@ function _rerenderCachedModels() {
swap: _ex(/--swap-space\s+(\d+)/) || '',
dtype: _ex(/--dtype\s+(\w+)/) || 'auto',
max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '',
cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '',
llama_split_mode: _ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
llama_tensor_split: _ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
llama_main_gpu: _ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
llama_parallel: _ex(/(?:--parallel|-np)\s+(\d+)/) || '',
llama_batch_size: _ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
llama_ubatch_size: _ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
llama_spec_tokens: _ex(/--spec-draft-n-max\s+(\d+)/) || '3',
venv: p.envPath || '',
};
const checks = {
@@ -873,6 +899,11 @@ function _rerenderCachedModels() {
trust_remote: cmd.includes('--trust-remote-code'),
prefix_cache: cmd.includes('--enable-prefix-caching'),
auto_tool: cmd.includes('--enable-auto-tool-choice'),
flash_attn: /--flash-attn\s+on\b/.test(cmd),
unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
llama_no_mmap: /--no-mmap\b/.test(cmd),
llama_no_warmup: /--no-warmup\b/.test(cmd),
llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
speculative: cmd.includes('--speculative-config'),
};
const _specMatch = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);

View File

@@ -15,6 +15,7 @@ from routes.cookbook_helpers import (
_safe_env_prefix,
_validate_gpus,
_validate_repo_id,
_validate_serve_cmd,
_validate_serve_model_id,
_validate_ssh_port,
)
@@ -131,6 +132,23 @@ def test_serve_runner_preserves_command_exit_code():
assert 'echo "=== Process exited with code $? ==="' not in script
def test_validate_serve_cmd_accepts_llama_advanced_controls():
    """A llama-server launch using the advanced flags must validate unchanged.

    The command is assembled from three segments (model lookup guard, native
    llama-server invocation, llama_cpp.server fallback) and the validator is
    expected to return exactly the string it was given.
    """
    model_lookup = "".join([
        "MODEL_FILE=$(printf %s ${HOME}'/.cache/huggingface/hub/models--Qwen--Qwen3-GGUF/snapshots/model.gguf') ",
        '&& { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } ',
        '|| { echo "ERROR: No GGUF found on this host."; exit 1; } && ',
    ])
    native_server = "".join([
        'GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 CUDA_VISIBLE_DEVICES=0,1 llama-server ',
        '--model "$MODEL_FILE" --host 0.0.0.0 --port 8000 -ngl 99 -c 131072 ',
        '--n-cpu-moe 0 --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on ',
        '--fit off --split-mode tensor --tensor-split 50,50 --main-gpu 0 ',
        '--parallel 1 --batch-size 2048 --ubatch-size 512 --no-mmap --no-warmup ',
        '--spec-type draft-mtp --spec-draft-n-max 3 ',
    ])
    python_fallback = '|| python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8000'
    cmd = model_lookup + native_server + python_fallback
    validated = _validate_serve_cmd(cmd)
    assert validated == cmd
def test_ollama_serve_defaults_to_loopback_bind():
assert _ollama_bind_from_cmd("ollama serve") == ("127.0.0.1", "11434")
assert _ollama_bind_from_cmd("ollama run qwen2.5:0.5b") == ("127.0.0.1", "11434")