Expose advanced llama.cpp serve controls

2026-06-02 13:46:16 +10:00
parent 05fb48e9d5
commit 0f3280ee05
4 changed files with 92 additions and 0 deletions
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -423,6 +423,14 @@ export function _buildServeCmd(f, modelName, backend) {
    // speed things up. Only emitted when set, so manual/older flows are unchanged.
    const _ncm = (f.n_cpu_moe ?? '').toString().trim();
    const _kv = (f.cache_type ?? '').toString().trim();
+    const _llamaNum = (v) => { // digits-only string for a numeric llama.cpp flag, or '' to omit the flag
+      const s = String(v ?? '').trim(); // `??` rather than `||`: a numeric 0 (e.g. GPU index 0) must survive
+      return /^\d+$/.test(s) ? s : ''; // strict allow-list, since the value is interpolated into the serve command
+    };
+    const _llamaCsv = (v) => { // comma-separated non-negative decimals (e.g. "50,50"), or '' to omit the flag
+      const s = String(v ?? '').replace(/\s+/g, ''); // `??` keeps a numeric 0; all whitespace is stripped first
+      return /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(s) ? s : ''; // anything else is rejected outright
+    };
    let _lcExtra = '';
    let _lcpExtra = '';
    if (_ncm !== '' && Number(_ncm) > 0) {
@@ -438,6 +446,27 @@ export function _buildServeCmd(f, modelName, backend) {
      // llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
      _lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
    }
+    const _llamaFit = String(f.llama_fit || '').trim();
+    if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`;
+    if (f.llama_no_mmap) _lcExtra += ' --no-mmap';
+    if (f.llama_no_warmup) _lcExtra += ' --no-warmup';
+    const _llamaSplitMode = String(f.llama_split_mode || '').trim();
+    if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`;
+    const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split);
+    if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`;
+    const _llamaMainGpu = _llamaNum(f.llama_main_gpu);
+    if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`;
+    const _llamaParallel = _llamaNum(f.llama_parallel);
+    if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`;
+    const _llamaBatch = _llamaNum(f.llama_batch_size);
+    if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`;
+    const _llamaUBatch = _llamaNum(f.llama_ubatch_size);
+    if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`;
+    if (f.llama_speculative_mtp) {
+      const specTokens = parseInt(f.llama_spec_tokens, 10);
+      const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3;
+      _lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`;
+    }
    // Vision: serve the multimodal projector so the model can read images. The
    // mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
    // only emitted when the Vision toggle is on AND a projector was found.
--- a/static/js/cookbookRunning.js
+++ b/static/js/cookbookRunning.js
@@ -1195,10 +1195,24 @@ function _parseServeCmdToFields(cmd) {
    dtype: ex(/--dtype\s+(\w+)/) || 'auto',
    max_seqs: ex(/--max-num-seqs\s+(\d+)/) || '',
    gpus: ex(/CUDA_VISIBLE_DEVICES=(\S+)/) || '',
+    cache_type: ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
+    llama_fit: ex(/(?:--fit|-fit)\s+(on|off)/) || '',
+    llama_split_mode: ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
+    llama_tensor_split: ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
+    llama_main_gpu: ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
+    llama_parallel: ex(/(?:--parallel|-np)\s+(\d+)/) || '',
+    llama_batch_size: ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
+    llama_ubatch_size: ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
+    llama_spec_tokens: ex(/--spec-draft-n-max\s+(\d+)/) || '3',
    enforce_eager: cmd.includes('--enforce-eager'),
    trust_remote: cmd.includes('--trust-remote-code'),
    prefix_cache: cmd.includes('--enable-prefix-caching'),
    auto_tool: cmd.includes('--enable-auto-tool-choice'),
+    flash_attn: /--flash-attn\s+on\b/.test(cmd),
+    unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
+    llama_no_mmap: /--no-mmap\b/.test(cmd),
+    llama_no_warmup: /--no-warmup\b/.test(cmd),
+    llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
    speculative: cmd.includes('--speculative-config'),
  };
  const spec = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -544,11 +544,25 @@ function _rerenderCachedModels() {
      panelHtml += `</div>`;
      // Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
      const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => `<option value="${k}"${sv('cache_type','')===k?' selected':''}>${k||'default'}</option>`).join('');
+      const llamaFitOpts = ['', 'off', 'on'].map(d => `<option value="${d}"${sv('llama_fit','')===d?' selected':''}>${d||'default'}</option>`).join('');
+      const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `<option value="${d}"${sv('llama_split_mode','')===d?' selected':''}>${d||'default'}</option>`).join('');
      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
      panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;" /></label>`;
      panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
      panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')}</label>`;
      panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')}</label>`;
+      panelHtml += `<label>${_l('Fit','llama.cpp --fit. Leave default unless you need explicit off/on behavior for a preset.')}<select class="hwfit-sf" data-field="llama_fit">${llamaFitOpts}</select></label>`;
+      panelHtml += `</div>`;
+      // Row 2c-2: native llama-server placement/runtime controls. These are
+      // explicit overrides for known-good advanced presets; blank keeps
+      // llama.cpp/profile defaults.
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
+      panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode">${llamaSplitModeOpts}</select></label>`;
+      panelHtml += `<label>${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="50,50" /></label>`;
+      panelHtml += `<label>${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
+      panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
+      panelHtml += `<label>${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
+      panelHtml += `<label>${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
      panelHtml += `</div>`;
      // Row 2d: Auto profiles — computed from detected hardware (see profiles.py).
      // Buttons are injected after the panel mounts (needs an async fetch).
@@ -566,6 +580,9 @@ function _rerenderCachedModels() {
      // Row 3a: Checkboxes (llama.cpp-only)
      panelHtml += `<div class="hwfit-serve-checks hwfit-backend-llamacpp">`;
      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="unified_mem"${sv('unified_mem',false)?' checked':''} /> Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_mmap"${sv('llama_no_mmap',false)?' checked':''} /> No mmap${_h('Adds --no-mmap for native llama-server. Useful for some high-context/local-storage setups, but not a universal default.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_warmup"${sv('llama_no_warmup',false)?' checked':''} /> Skip warmup${_h('Adds --no-warmup. Can reduce startup memory spikes for tight launches, but llama.cpp defaults to warming up.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="llama_speculative_mtp"${sv('llama_speculative_mtp',false)?' checked':''} /> MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads and a recent llama-server build.')} <span class="hwfit-numstep"><button type="button" class="hwfit-numstep-btn" data-step="-1" tabindex="-1" aria-label="Decrease">‹</button><input type="number" class="hwfit-sf hwfit-spec-tokens" data-field="llama_spec_tokens" value="${esc(sv('llama_spec_tokens', '3'))}" min="1" max="10" title="--spec-draft-n-max" /><button type="button" class="hwfit-numstep-btn" data-step="1" tabindex="-1" aria-label="Increase">›</button></span></label>`;
      panelHtml += `</div>`;
      // Row 3b: Checkboxes (diffusers)
      panelHtml += `<div class="hwfit-serve-checks hwfit-backend-diffusers">`;
@@ -866,6 +883,15 @@ function _rerenderCachedModels() {
            swap: _ex(/--swap-space\s+(\d+)/) || '',
            dtype: _ex(/--dtype\s+(\w+)/) || 'auto',
            max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '',
+            cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
+            llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '',
+            llama_split_mode: _ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
+            llama_tensor_split: _ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
+            llama_main_gpu: _ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
+            llama_parallel: _ex(/(?:--parallel|-np)\s+(\d+)/) || '',
+            llama_batch_size: _ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
+            llama_ubatch_size: _ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
+            llama_spec_tokens: _ex(/--spec-draft-n-max\s+(\d+)/) || '3',
            venv: p.envPath || '',
          };
          const checks = {
@@ -873,6 +899,11 @@ function _rerenderCachedModels() {
            trust_remote: cmd.includes('--trust-remote-code'),
            prefix_cache: cmd.includes('--enable-prefix-caching'),
            auto_tool: cmd.includes('--enable-auto-tool-choice'),
+            flash_attn: /--flash-attn\s+on\b/.test(cmd),
+            unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
+            llama_no_mmap: /--no-mmap\b/.test(cmd),
+            llama_no_warmup: /--no-warmup\b/.test(cmd),
+            llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
            speculative: cmd.includes('--speculative-config'),
          };
          const _specMatch = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);
--- a/tests/test_cookbook_helpers.py
+++ b/tests/test_cookbook_helpers.py
@@ -15,6 +15,7 @@ from routes.cookbook_helpers import (
    _safe_env_prefix,
    _validate_gpus,
    _validate_repo_id,
+    _validate_serve_cmd,
    _validate_serve_model_id,
    _validate_ssh_port,
 )
@@ -131,6 +132,23 @@ def test_serve_runner_preserves_command_exit_code():
    assert 'echo "=== Process exited with code $? ==="' not in script


+def test_validate_serve_cmd_accepts_llama_advanced_controls():  # advanced llama.cpp flags must pass validation
+    cmd = (  # one shell line: check the GGUF path, run llama-server, else fall back to llama_cpp.server
+        "MODEL_FILE=$(printf %s ${HOME}'/.cache/huggingface/hub/models--Qwen--Qwen3-GGUF/snapshots/model.gguf') "
+        '&& { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } '
+        '|| { echo "ERROR: No GGUF found on this host."; exit 1; } && '
+        'GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 CUDA_VISIBLE_DEVICES=0,1 llama-server '
+        '--model "$MODEL_FILE" --host 0.0.0.0 --port 8000 -ngl 99 -c 131072 '
+        '--n-cpu-moe 0 --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on '
+        '--fit off --split-mode tensor --tensor-split 50,50 --main-gpu 0 '  # fit + GPU placement flags
+        '--parallel 1 --batch-size 2048 --ubatch-size 512 --no-mmap --no-warmup '  # slots, batching, startup flags
+        '--spec-type draft-mtp --spec-draft-n-max 3 '  # MTP speculative decoding flags
+        '|| python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8000'
+    )
+
+    assert _validate_serve_cmd(cmd) == cmd  # the validator must return this command unchanged
+
+
 def test_ollama_serve_defaults_to_loopback_bind():
    assert _ollama_bind_from_cmd("ollama serve") == ("127.0.0.1", "11434")
    assert _ollama_bind_from_cmd("ollama run qwen2.5:0.5b") == ("127.0.0.1", "11434")