From 0f3280ee058adda480476bae3bab1e60fe02045c Mon Sep 17 00:00:00 2001 From: spooky Date: Tue, 2 Jun 2026 13:46:16 +1000 Subject: [PATCH] Expose advanced llama.cpp serve controls --- static/js/cookbook.js | 29 +++++++++++++++++++++++++++++ static/js/cookbookRunning.js | 14 ++++++++++++++ static/js/cookbookServe.js | 31 +++++++++++++++++++++++++++++++ tests/test_cookbook_helpers.py | 18 ++++++++++++++++++ 4 files changed, 92 insertions(+) diff --git a/static/js/cookbook.js b/static/js/cookbook.js index 8c23a5a..89583a7 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -423,6 +423,14 @@ export function _buildServeCmd(f, modelName, backend) { // speed things up. Only emitted when set, so manual/older flows are unchanged. const _ncm = (f.n_cpu_moe ?? '').toString().trim(); const _kv = (f.cache_type ?? '').toString().trim(); + const _llamaNum = (v) => { + const s = String(v || '').trim(); + return /^\d+$/.test(s) ? s : ''; + }; + const _llamaCsv = (v) => { + const s = String(v || '').replace(/\s+/g, ''); + return /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(s) ? s : ''; + }; let _lcExtra = ''; let _lcpExtra = ''; if (_ncm !== '' && Number(_ncm) > 0) { @@ -438,6 +446,27 @@ export function _buildServeCmd(f, modelName, backend) { // llama-cpp-python exposes these as type_k/type_v; pass through best-effort. 
_lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`; } + const _llamaFit = String(f.llama_fit || '').trim(); + if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`; + if (f.llama_no_mmap) _lcExtra += ' --no-mmap'; + if (f.llama_no_warmup) _lcExtra += ' --no-warmup'; + const _llamaSplitMode = String(f.llama_split_mode || '').trim(); + if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`; + const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split); + if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`; + const _llamaMainGpu = _llamaNum(f.llama_main_gpu); + if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`; + const _llamaParallel = _llamaNum(f.llama_parallel); + if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`; + const _llamaBatch = _llamaNum(f.llama_batch_size); + if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`; + const _llamaUBatch = _llamaNum(f.llama_ubatch_size); + if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`; + if (f.llama_speculative_mtp) { + const specTokens = parseInt(f.llama_spec_tokens, 10); + const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3; + _lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`; + } // Vision: serve the multimodal projector so the model can read images. The // mmproj path is resolved at runtime (find mmproj-*.gguf next to the model); // only emitted when the Vision toggle is on AND a projector was found. 
diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index 51a32ce..1a6f700 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -1195,10 +1195,24 @@ function _parseServeCmdToFields(cmd) { dtype: ex(/--dtype\s+(\w+)/) || 'auto', max_seqs: ex(/--max-num-seqs\s+(\d+)/) || '', gpus: ex(/CUDA_VISIBLE_DEVICES=(\S+)/) || '', + cache_type: ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '', + llama_fit: ex(/(?:--fit|-fit)\s+(on|off)/) || '', + llama_split_mode: ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '', + llama_tensor_split: ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '', + llama_main_gpu: ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '', + llama_parallel: ex(/(?:--parallel|-np)\s+(\d+)/) || '', + llama_batch_size: ex(/(?:--batch-size|-b)\s+(\d+)/) || '', + llama_ubatch_size: ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '', + llama_spec_tokens: ex(/--spec-draft-n-max\s+(\d+)/) || '3', enforce_eager: cmd.includes('--enforce-eager'), trust_remote: cmd.includes('--trust-remote-code'), prefix_cache: cmd.includes('--enable-prefix-caching'), auto_tool: cmd.includes('--enable-auto-tool-choice'), + flash_attn: /--flash-attn\s+on\b/.test(cmd), + unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd), + llama_no_mmap: /--no-mmap\b/.test(cmd), + llama_no_warmup: /--no-warmup\b/.test(cmd), + llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd), speculative: cmd.includes('--speculative-config'), }; const spec = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/); diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index 0a863db..b8db61b 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -544,11 +544,25 @@ function _rerenderCachedModels() { panelHtml += ``; // Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand) const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => ``).join(''); + const llamaFitOpts = ['', 'off', 
'on'].map(d => ``).join(''); + const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => ``).join(''); panelHtml += `
`; panelHtml += ``; panelHtml += ``; panelHtml += ``; panelHtml += ``; + panelHtml += ``; + panelHtml += `
`; + // Row 2c (cont.): native llama-server placement/runtime controls. These are + // explicit overrides for known-good advanced presets; blank keeps + // llama.cpp/profile defaults. + panelHtml += `
`; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; panelHtml += `
`; // Row 2d: Auto profiles — computed from detected hardware (see profiles.py). // Buttons are injected after the panel mounts (needs an async fetch). @@ -566,6 +580,9 @@ function _rerenderCachedModels() { // Row 3a: Checkboxes (llama.cpp-only) panelHtml += `
`; panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; panelHtml += `
`; // Row 3b: Checkboxes (diffusers) panelHtml += `
`; @@ -866,6 +883,15 @@ function _rerenderCachedModels() { swap: _ex(/--swap-space\s+(\d+)/) || '', dtype: _ex(/--dtype\s+(\w+)/) || 'auto', max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '', + cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '', + llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '', + llama_split_mode: _ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '', + llama_tensor_split: _ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '', + llama_main_gpu: _ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '', + llama_parallel: _ex(/(?:--parallel|-np)\s+(\d+)/) || '', + llama_batch_size: _ex(/(?:--batch-size|-b)\s+(\d+)/) || '', + llama_ubatch_size: _ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '', + llama_spec_tokens: _ex(/--spec-draft-n-max\s+(\d+)/) || '3', venv: p.envPath || '', }; const checks = { @@ -873,6 +899,11 @@ function _rerenderCachedModels() { trust_remote: cmd.includes('--trust-remote-code'), prefix_cache: cmd.includes('--enable-prefix-caching'), auto_tool: cmd.includes('--enable-auto-tool-choice'), + flash_attn: /--flash-attn\s+on\b/.test(cmd), + unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd), + llama_no_mmap: /--no-mmap\b/.test(cmd), + llama_no_warmup: /--no-warmup\b/.test(cmd), + llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd), speculative: cmd.includes('--speculative-config'), }; const _specMatch = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/); diff --git a/tests/test_cookbook_helpers.py b/tests/test_cookbook_helpers.py index ff6f38b..073631e 100644 --- a/tests/test_cookbook_helpers.py +++ b/tests/test_cookbook_helpers.py @@ -15,6 +15,7 @@ from routes.cookbook_helpers import ( _safe_env_prefix, _validate_gpus, _validate_repo_id, + _validate_serve_cmd, _validate_serve_model_id, _validate_ssh_port, ) @@ -131,6 +132,23 @@ def test_serve_runner_preserves_command_exit_code(): assert 'echo "=== Process exited with code $? 
==="' not in script +def test_validate_serve_cmd_accepts_llama_advanced_controls(): + cmd = ( + "MODEL_FILE=$(printf %s ${HOME}'/.cache/huggingface/hub/models--Qwen--Qwen3-GGUF/snapshots/model.gguf') " + '&& { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } ' + '|| { echo "ERROR: No GGUF found on this host."; exit 1; } && ' + 'GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 CUDA_VISIBLE_DEVICES=0,1 llama-server ' + '--model "$MODEL_FILE" --host 0.0.0.0 --port 8000 -ngl 99 -c 131072 ' + '--n-cpu-moe 0 --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on ' + '--fit off --split-mode tensor --tensor-split 50,50 --main-gpu 0 ' + '--parallel 1 --batch-size 2048 --ubatch-size 512 --no-mmap --no-warmup ' + '--spec-type draft-mtp --spec-draft-n-max 3 ' + '|| python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8000' + ) + + assert _validate_serve_cmd(cmd) == cmd + + def test_ollama_serve_defaults_to_loopback_bind(): assert _ollama_bind_from_cmd("ollama serve") == ("127.0.0.1", "11434") assert _ollama_bind_from_cmd("ollama run qwen2.5:0.5b") == ("127.0.0.1", "11434")