diff --git a/static/js/cookbook.js b/static/js/cookbook.js index d4f264e..33e2e10 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -366,6 +366,8 @@ export function _buildServeCmd(f, modelName, backend) { cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`; if (f.swap && f.swap !== '0') cmd += ` --swap-space ${f.swap}`; cmd += ` --dtype ${f.dtype || 'auto'}`; + const _kv = (f.vllm_kv_cache_dtype ?? '').toString().trim(); + if (_kv === 'fp8') cmd += ' --kv-cache-dtype fp8'; if (f.max_seqs && f.max_seqs.toString().trim()) cmd += ` --max-num-seqs ${f.max_seqs.toString().trim()}`; if (f.enforce_eager) cmd += ' --enforce-eager'; if (f.trust_remote) cmd += ' --trust-remote-code'; diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index 7cab3c7..ac27335 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -1332,6 +1332,7 @@ function _parseServeCmdToFields(cmd) { gpu_mem: ex(/--gpu-memory-utilization\s+([\d.]+)/) || '0.90', swap: ex(/--swap-space\s+(\d+)/) || '', dtype: ex(/--dtype\s+(\w+)/) || 'auto', + vllm_kv_cache_dtype: ex(/--kv-cache-dtype\s+([\w.-]+)/) || 'auto', max_seqs: ex(/--max-num-seqs\s+(\d+)/) || '', gpus: ex(/CUDA_VISIBLE_DEVICES=(\S+)/) || '', cache_type: ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '', diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index b8db61b..666d947 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -453,6 +453,7 @@ function _rerenderCachedModels() { : (_es.gpus || detectedGpuIds)); const tpOpts = [1,2,4,8].map(n => `${n}`).join(''); const dtypeOpts = ['auto','float16','bfloat16'].map(d => ``).join(''); + const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join(''); const _l = (name, tip) => `${name}?`; const _ggufChoices = _runnableGgufFiles(m); const _savedGguf = String(sv('gguf_file', '') || ''); @@ -524,6 +525,7 @@ function _rerenderCachedModels() { panelHtml += ``; panelHtml += ``; panelHtml += ``; + panelHtml += ``; panelHtml += ``; // Row 2b: Diffusers settings const diffDtypeOpts = ['bfloat16','float16','float32'].map(d => ``).join(''); @@ -882,6 +884,7 @@ function _rerenderCachedModels() { gpu_mem: _ex(/--gpu-memory-utilization\s+([\d.]+)/) || '0.90', swap: _ex(/--swap-space\s+(\d+)/) || '', dtype: _ex(/--dtype\s+(\w+)/) || 'auto', + vllm_kv_cache_dtype: _ex(/--kv-cache-dtype\s+([\w.-]+)/) || 'auto', max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '', cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '', llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '', diff --git a/tests/test_cookbook_helpers.py b/tests/test_cookbook_helpers.py index b52fb00..cd7c980 100644 --- a/tests/test_cookbook_helpers.py +++ b/tests/test_cookbook_helpers.py @@ -257,6 +257,16 @@ def test_serve_runner_preserves_command_exit_code(): assert 'echo "=== Process exited with code $? ==="' not in script +def test_validate_serve_cmd_accepts_vllm_kv_cache_dtype(): + cmd = ( + "CUDA_VISIBLE_DEVICES=0,1 vllm serve nvidia/Qwen3.6-35B-A3B-NVFP4 " + "--host 0.0.0.0 --port 8000 --tensor-parallel-size 2 " + "--max-model-len 4096 --dtype auto --kv-cache-dtype fp8" + ) + + assert _validate_serve_cmd(cmd) == cmd + + def test_validate_serve_cmd_accepts_llama_advanced_controls(): cmd = ( "MODEL_FILE=$(printf %s ${HOME}'/.cache/huggingface/hub/models--Qwen--Qwen3-GGUF/snapshots/model.gguf') "