feat: select cached gguf artifacts for serve (#891)

This commit is contained in:
spooky
2026-06-02 13:32:40 +10:00
committed by GitHub
parent 8455b88643
commit 8b3c0d8ad4
4 changed files with 131 additions and 9 deletions

View File

@@ -191,6 +191,38 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
" for root, dirs, fns in os.walk(top, followlinks=False):",
" dirs[:] = [d for d in dirs if not os.path.islink(os.path.join(root, d)) and safe_path(os.path.join(root, d))]",
" yield root, dirs, fns",
"def gguf_role(name):",
" n = name.lower()",
" if n.startswith('mmproj') or 'mmproj' in n: return 'projector'",
" return 'model'",
"def gguf_quant(name):",
" m = re.search(r'(?i)(UD-)?(IQ[0-9]_[A-Z0-9_]+|Q[0-9](?:_[A-Z0-9]+)+|BF16|F16|FP16|F32|Q8_0)', name)",
" return m.group(0).upper() if m else ''",
"def collect_ggufs(base):",
" files = []",
" split_groups = {}",
" if not os.path.isdir(base) or not safe_path(base): return files",
" for root, dirs, fns in safe_walk(base):",
" for fn in sorted(fns):",
" if not fn.lower().endswith('.gguf'): continue",
" fp = os.path.join(root, fn)",
" try: size = os.path.getsize(fp)",
" except Exception: size = 0",
" try: rel = os.path.relpath(fp, base).replace(os.sep, '/')",
" except Exception: rel = fn",
" sm = re.match(r'(?i)^(.+)-(\\d+)-of-(\\d+)\\.gguf$', fn)",
" if sm:",
" prefix, part_s, total_s = sm.group(1), sm.group(2), sm.group(3)",
" key = (root, prefix, total_s)",
" g = split_groups.setdefault(key, {'name':fn,'rel_path':rel,'size_bytes':0,'role':gguf_role(fn),'quant':gguf_quant(fn),'parts':int(total_s),'split':True})",
" g['size_bytes'] += size",
" if int(part_s) == 1:",
" g.update({'name':fn,'rel_path':rel,'role':gguf_role(fn),'quant':gguf_quant(fn)})",
" continue",
" files.append({'name':fn,'rel_path':rel,'size_bytes':size,'role':gguf_role(fn),'quant':gguf_quant(fn)})",
" files.extend(split_groups.values())",
" files.sort(key=lambda f: (f.get('role') != 'model', f.get('rel_path', '')))",
" return files",
"def scan_hf(cache):",
" if not os.path.isdir(cache): return",
" for d in sorted(os.listdir(cache)):",
@@ -205,16 +237,14 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
" if f.is_file(): nf += 1; sz += f.stat().st_size",
" if f.name.endswith('.incomplete'): ic = True",
" snap = os.path.join(cache, d, 'snapshots')",
" is_diffusion = False; is_gguf = False",
" is_diffusion = False; gguf_files = []",
" if os.path.isdir(snap):",
" for sd in os.listdir(snap):",
" sf = os.path.join(snap, sd)",
" if not os.path.isdir(sf): continue",
" if os.path.exists(os.path.join(sf, 'model_index.json')): is_diffusion = True",
" try:",
" if any(x.endswith('.gguf') for x in os.listdir(sf)): is_gguf = True",
" except Exception: pass",
" models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':is_gguf})",
" for f in collect_ggufs(sf): f['rel_path'] = sd + '/' + f['rel_path']; gguf_files.append(f)",
" models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':bool(gguf_files),'gguf_files':gguf_files})",
"def scan_dir(p):",
" if not os.path.isdir(p) or not safe_path(p): return",
" for d in sorted(os.listdir(p)):",
@@ -223,13 +253,14 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
" fp = os.path.join(p, d)",
" if not os.path.isdir(fp) or os.path.islink(fp) or not safe_path(fp): continue",
" if d in seen: continue",
" is_model = False; is_gguf = False",
" is_model = False; gguf_files = []",
" for root, dirs, fns in safe_walk(fp):",
" for fn in fns:",
" if fn.endswith('.gguf'): is_gguf = True; is_model = True",
" if fn.lower().endswith('.gguf'): is_model = True",
" elif fn == 'config.json' or fn.endswith('.safetensors') or fn.endswith('.bin'): is_model = True",
" if is_model: break",
" if not is_model: continue",
" gguf_files = collect_ggufs(fp)",
" seen.add(d)",
" sz, nf = 0, 0",
" for dp, _, fns in safe_walk(fp):",
@@ -237,7 +268,7 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
" try: nf += 1; sz += os.path.getsize(os.path.join(dp, fn))",
" except Exception: pass",
" is_diff = os.path.exists(os.path.join(fp, 'model_index.json'))",
" models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':is_gguf})",
" models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':bool(gguf_files),'gguf_files':gguf_files})",
"def parse_size(num, unit):",
" try: n = float(num)",
" except Exception: return 0",

View File

@@ -731,6 +731,8 @@ def setup_cookbook_routes() -> APIRouter:
entry["backend"] = m.get("backend")
if m.get("is_ollama"):
entry["is_ollama"] = True
if isinstance(m.get("gguf_files"), list):
entry["gguf_files"] = m["gguf_files"]
models.append(entry)
except Exception as e:
logger.warning(f"Failed to parse cached models: {e}")

View File

@@ -141,6 +141,54 @@ function _isActivelyServing(repoId) {
} catch { return false; }
}
function _formatGgufSize(bytes) {
const n = Number(bytes || 0);
if (!Number.isFinite(n) || n <= 0) return '';
if (n >= 1024 ** 3) return `${(n / (1024 ** 3)).toFixed(1)} GB`;
if (n >= 1024 ** 2) return `${Math.round(n / (1024 ** 2))} MB`;
return `${Math.max(1, Math.round(n / 1024))} KB`;
}
function _ggufFilesForModel(model) {
return Array.isArray(model?.gguf_files)
? model.gguf_files.filter(f => f && typeof f.rel_path === 'string' && f.rel_path)
: [];
}
function _runnableGgufFiles(model) {
const files = _ggufFilesForModel(model);
const primary = files.filter(f => (f.role || 'model') === 'model');
return primary.length ? primary : files;
}
function _ggufFileLabel(file) {
const base = (file.name || file.rel_path || '').split('/').pop();
const size = _formatGgufSize(file.size_bytes);
const quant = file.quant ? `${file.quant} ` : '';
const parts = Number(file.parts || 0);
const split = parts > 1 ? `, ${parts} parts` : '';
const role = file.role && file.role !== 'model' ? ` ${file.role}` : '';
return `${quant}${base}${size || split ? ` (${[size, split.replace(/^, /, '')].filter(Boolean).join(', ')})` : ''}${role}`;
}
function _shellPathExpr(path) {
const s = String(path || '');
if (s === '~') return '${HOME}';
if (s.startsWith('~/')) return '${HOME}' + _shellQuote(s.slice(1));
return _shellQuote(s);
}
function _selectedGgufExpr(model, repo, relPath) {
const rel = String(relPath || '').replace(/^\/+/, '');
if (!rel) return '';
if (model.is_local_dir && model.path) {
const base = String(model.path || '').replace(/\/+$/, '');
return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
}
const cacheRepo = repo.replace(/\//g, '--');
return `$(printf %s \${HOME}${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`)})`;
}
function _rerenderCachedModels() {
const list = document.getElementById('hwfit-cached-list');
const tagContainer = document.getElementById('serve-tags');
@@ -173,6 +221,8 @@ function _rerenderCachedModels() {
if (m.path) {
metaParts.push(`<span style="opacity:0.7;">${esc(m.path)}</span>`);
}
const ggufCount = _runnableGgufFiles(m).length;
if (ggufCount > 1) metaParts.push(`${ggufCount} GGUFs`);
if (m.status === 'downloading') {
const _active = _isActivelyDownloading(m.repo_id);
metaParts.push(`<span class="cookbook-dl-status" style="color:var(--accent,var(--red));">${_active ? 'downloading' : 'download stalled'}</span>`);
@@ -404,6 +454,14 @@ function _rerenderCachedModels() {
const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
const _ggufChoices = _runnableGgufFiles(m);
const _savedGguf = String(sv('gguf_file', '') || '');
const _defaultGguf = _ggufChoices.some(f => f.rel_path === _savedGguf)
? _savedGguf
: (_ggufChoices[0]?.rel_path || '');
const _ggufOptions = _ggufChoices.map(f =>
`<option value="${esc(f.rel_path)}"${f.rel_path === _defaultGguf ? ' selected' : ''}>${esc(_ggufFileLabel(f))}</option>`
).join('');
// Build save slots
const _allPresets = _loadPresets();
const _repoShort = repo.split('/').pop();
@@ -450,6 +508,13 @@ function _rerenderCachedModels() {
}
panelHtml += `<label>${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
panelHtml += `</div>`;
if (_ggufChoices.length > 1) {
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
panelHtml += `<label class="hwfit-backend-llamacpp">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
panelHtml += `</div>`;
} else if (_defaultGguf) {
panelHtml += `<input type="hidden" class="hwfit-sf" data-field="gguf_file" value="${esc(_defaultGguf)}" />`;
}
// Row 2: Core settings
panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp">`;
panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
@@ -559,6 +624,8 @@ function _rerenderCachedModels() {
const backend = f.backend || 'vllm';
const serveModel = m.is_local_dir && m.path ? `${m.path}/${repo}` : repo;
if (backend === 'llamacpp') {
const ggufChoices = _runnableGgufFiles(m);
const selectedGguf = ggufChoices.find(file => file.rel_path === f.gguf_file);
// For multi-part GGUFs, llama.cpp requires the first split
// (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
// before Q4_K_M/001 etc); fall back to any single GGUF sorted.
@@ -569,7 +636,9 @@ function _rerenderCachedModels() {
// search the HF snapshots dir, so serving a GGUF from a custom dir works
// instead of handing llama.cpp a directory (which fails).
const _ldir = `"${m.path}/${repo}"`;
f._gguf_path = m.is_local_dir && m.path
f._gguf_path = selectedGguf
? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
: m.is_local_dir && m.path
? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
: `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
}

View File

@@ -171,6 +171,11 @@ def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
plain = tmp_path / "Qwen3.6-27B"
plain.mkdir()
(plain / "Qwen3.6-27B-Q4_K_M.gguf").write_bytes(b"gguf")
(plain / "Qwen3.6-27B-Q5_K_M-00001-of-00003.gguf").write_bytes(b"part1")
(plain / "Qwen3.6-27B-Q5_K_M-00002-of-00003.gguf").write_bytes(b"part2")
(plain / "Qwen3.6-27B-Q5_K_M-00003-of-00003.gguf").write_bytes(b"part3")
(plain / "Qwen3.6-27B-Q6_K_XL.gguf").write_bytes(b"ggufgguf")
(plain / "mmproj-BF16.gguf").write_bytes(b"projector")
hf_internal = tmp_path / "models--Qwen--Qwen3.6-27B"
(hf_internal / "snapshots" / "abc").mkdir(parents=True)
@@ -189,3 +194,18 @@ def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
assert "models--Qwen--Qwen3.6-27B" not in by_repo
assert by_repo["Qwen3.6-27B"]["is_local_dir"] is True
assert by_repo["Qwen3.6-27B"]["is_gguf"] is True
ggufs = by_repo["Qwen3.6-27B"]["gguf_files"]
assert [f["rel_path"] for f in ggufs] == [
"Qwen3.6-27B-Q4_K_M.gguf",
"Qwen3.6-27B-Q5_K_M-00001-of-00003.gguf",
"Qwen3.6-27B-Q6_K_XL.gguf",
"mmproj-BF16.gguf",
]
assert [f["role"] for f in ggufs] == ["model", "model", "model", "projector"]
assert ggufs[0]["quant"] == "Q4_K_M"
assert ggufs[1]["quant"] == "Q5_K_M"
assert ggufs[1]["split"] is True
assert ggufs[1]["parts"] == 3
assert ggufs[1]["size_bytes"] == len(b"part1part2part3")
assert ggufs[2]["quant"] == "Q6_K_XL"
assert ggufs[3]["quant"] == "BF16"