diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index 8c23a5a..89583a7 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -423,6 +423,14 @@ export function _buildServeCmd(f, modelName, backend) {
// speed things up. Only emitted when set, so manual/older flows are unchanged.
const _ncm = (f.n_cpu_moe ?? '').toString().trim();
const _kv = (f.cache_type ?? '').toString().trim();
+ const _llamaNum = (v) => {
+ const s = String(v || '').trim();
+ return /^\d+$/.test(s) ? s : '';
+ };
+ const _llamaCsv = (v) => {
+ const s = String(v || '').replace(/\s+/g, '');
+ return /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(s) ? s : '';
+ };
let _lcExtra = '';
let _lcpExtra = '';
if (_ncm !== '' && Number(_ncm) > 0) {
@@ -438,6 +446,27 @@ export function _buildServeCmd(f, modelName, backend) {
// llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
_lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
}
+ const _llamaFit = String(f.llama_fit || '').trim();
+ if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`;
+ if (f.llama_no_mmap) _lcExtra += ' --no-mmap';
+ if (f.llama_no_warmup) _lcExtra += ' --no-warmup';
+ const _llamaSplitMode = String(f.llama_split_mode || '').trim();
+ if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`;
+ const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split);
+ if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`;
+ const _llamaMainGpu = _llamaNum(f.llama_main_gpu);
+ if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`;
+ const _llamaParallel = _llamaNum(f.llama_parallel);
+ if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`;
+ const _llamaBatch = _llamaNum(f.llama_batch_size);
+ if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`;
+ const _llamaUBatch = _llamaNum(f.llama_ubatch_size);
+ if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`;
+ if (f.llama_speculative_mtp) {
+ const specTokens = parseInt(f.llama_spec_tokens, 10);
+ const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3;
+ _lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`;
+ }
// Vision: serve the multimodal projector so the model can read images. The
// mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
// only emitted when the Vision toggle is on AND a projector was found.
diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js
index 51a32ce..1a6f700 100644
--- a/static/js/cookbookRunning.js
+++ b/static/js/cookbookRunning.js
@@ -1195,10 +1195,24 @@ function _parseServeCmdToFields(cmd) {
dtype: ex(/--dtype\s+(\w+)/) || 'auto',
max_seqs: ex(/--max-num-seqs\s+(\d+)/) || '',
gpus: ex(/CUDA_VISIBLE_DEVICES=(\S+)/) || '',
+ cache_type: ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
+ llama_fit: ex(/(?:--fit|-fit)\s+(on|off)/) || '',
+ llama_split_mode: ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
+ llama_tensor_split: ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
+ llama_main_gpu: ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
+ llama_parallel: ex(/(?:--parallel|-np)\s+(\d+)/) || '',
+ llama_batch_size: ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
+ llama_ubatch_size: ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
+ llama_spec_tokens: ex(/--spec-draft-n-max\s+(\d+)/) || '3',
enforce_eager: cmd.includes('--enforce-eager'),
trust_remote: cmd.includes('--trust-remote-code'),
prefix_cache: cmd.includes('--enable-prefix-caching'),
auto_tool: cmd.includes('--enable-auto-tool-choice'),
+ flash_attn: /--flash-attn\s+on\b/.test(cmd),
+ unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
+ llama_no_mmap: /--no-mmap\b/.test(cmd),
+ llama_no_warmup: /--no-warmup\b/.test(cmd),
+ llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
speculative: cmd.includes('--speculative-config'),
};
const spec = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);
diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index 0a863db..b8db61b 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -544,11 +544,25 @@ function _rerenderCachedModels() {
panelHtml += ``;
// Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => `${k||'default'} `).join('');
+ const llamaFitOpts = ['', 'off', 'on'].map(d => `${d||'default'} `).join('');
+ const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `${d||'default'} `).join('');
panelHtml += `
`;
panelHtml += `${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')} `;
panelHtml += `${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}${_kvOpts} `;
panelHtml += ` Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')} `;
panelHtml += ` Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')} `;
+ panelHtml += `${_l('Fit','llama.cpp --fit. Leave default unless you need explicit off/on behavior for a preset.')}${llamaFitOpts} `;
+ panelHtml += `
`;
+ // Row 2c (cont.): native llama-server placement/runtime controls. These are
+ // explicit overrides for known-good advanced presets; blank keeps
+ // llama.cpp/profile defaults.
+ panelHtml += ``;
+ panelHtml += `${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}${llamaSplitModeOpts} `;
+ panelHtml += `${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')} `;
+ panelHtml += `${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')} `;
+ panelHtml += `${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')} `;
+ panelHtml += `${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')} `;
+ panelHtml += `${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')} `;
panelHtml += `
`;
// Row 2d: Auto profiles — computed from detected hardware (see profiles.py).
// Buttons are injected after the panel mounts (needs an async fetch).
@@ -566,6 +580,9 @@ function _rerenderCachedModels() {
// Row 3a: Checkboxes (llama.cpp-only)
panelHtml += ``;
panelHtml += ` Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')} `;
+ panelHtml += ` No mmap${_h('Adds --no-mmap for native llama-server. Useful for some high-context/local-storage setups, but not a universal default.')} `;
+ panelHtml += ` Skip warmup${_h('Adds --no-warmup. Can reduce startup memory spikes for tight launches, but llama.cpp defaults to warming up.')} `;
+ panelHtml += ` MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads and a recent llama-server build.')} ‹ › `;
panelHtml += `
`;
// Row 3b: Checkboxes (diffusers)
panelHtml += ``;
@@ -866,6 +883,15 @@ function _rerenderCachedModels() {
swap: _ex(/--swap-space\s+(\d+)/) || '',
dtype: _ex(/--dtype\s+(\w+)/) || 'auto',
max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '',
+ cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
+ llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '',
+ llama_split_mode: _ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
+ llama_tensor_split: _ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
+ llama_main_gpu: _ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
+ llama_parallel: _ex(/(?:--parallel|-np)\s+(\d+)/) || '',
+ llama_batch_size: _ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
+ llama_ubatch_size: _ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
+ llama_spec_tokens: _ex(/--spec-draft-n-max\s+(\d+)/) || '3',
venv: p.envPath || '',
};
const checks = {
@@ -873,6 +899,11 @@ function _rerenderCachedModels() {
trust_remote: cmd.includes('--trust-remote-code'),
prefix_cache: cmd.includes('--enable-prefix-caching'),
auto_tool: cmd.includes('--enable-auto-tool-choice'),
+ flash_attn: /--flash-attn\s+on\b/.test(cmd),
+ unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
+ llama_no_mmap: /--no-mmap\b/.test(cmd),
+ llama_no_warmup: /--no-warmup\b/.test(cmd),
+ llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
speculative: cmd.includes('--speculative-config'),
};
const _specMatch = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);
diff --git a/tests/test_cookbook_helpers.py b/tests/test_cookbook_helpers.py
index ff6f38b..073631e 100644
--- a/tests/test_cookbook_helpers.py
+++ b/tests/test_cookbook_helpers.py
@@ -15,6 +15,7 @@ from routes.cookbook_helpers import (
_safe_env_prefix,
_validate_gpus,
_validate_repo_id,
+ _validate_serve_cmd,
_validate_serve_model_id,
_validate_ssh_port,
)
@@ -131,6 +132,23 @@ def test_serve_runner_preserves_command_exit_code():
assert 'echo "=== Process exited with code $? ==="' not in script
+def test_validate_serve_cmd_accepts_llama_advanced_controls():
+ cmd = (
+ "MODEL_FILE=$(printf %s ${HOME}'/.cache/huggingface/hub/models--Qwen--Qwen3-GGUF/snapshots/model.gguf') "
+ '&& { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } '
+ '|| { echo "ERROR: No GGUF found on this host."; exit 1; } && '
+ 'GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 CUDA_VISIBLE_DEVICES=0,1 llama-server '
+ '--model "$MODEL_FILE" --host 0.0.0.0 --port 8000 -ngl 99 -c 131072 '
+ '--n-cpu-moe 0 --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on '
+ '--fit off --split-mode tensor --tensor-split 50,50 --main-gpu 0 '
+ '--parallel 1 --batch-size 2048 --ubatch-size 512 --no-mmap --no-warmup '
+ '--spec-type draft-mtp --spec-draft-n-max 3 '
+ '|| python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8000'
+ )
+
+ assert _validate_serve_cmd(cmd) == cmd
+
+
def test_ollama_serve_defaults_to_loopback_bind():
assert _ollama_bind_from_cmd("ollama serve") == ("127.0.0.1", "11434")
assert _ollama_bind_from_cmd("ollama run qwen2.5:0.5b") == ("127.0.0.1", "11434")