feat: select cached gguf artifacts for serve (#891)
This commit is contained in:
@@ -191,6 +191,38 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
|
||||
" for root, dirs, fns in os.walk(top, followlinks=False):",
|
||||
" dirs[:] = [d for d in dirs if not os.path.islink(os.path.join(root, d)) and safe_path(os.path.join(root, d))]",
|
||||
" yield root, dirs, fns",
|
||||
"def gguf_role(name):",
|
||||
" n = name.lower()",
|
||||
" if n.startswith('mmproj') or 'mmproj' in n: return 'projector'",
|
||||
" return 'model'",
|
||||
"def gguf_quant(name):",
|
||||
" m = re.search(r'(?i)(UD-)?(IQ[0-9]_[A-Z0-9_]+|Q[0-9](?:_[A-Z0-9]+)+|BF16|F16|FP16|F32|Q8_0)', name)",
|
||||
" return m.group(0).upper() if m else ''",
|
||||
"def collect_ggufs(base):",
|
||||
" files = []",
|
||||
" split_groups = {}",
|
||||
" if not os.path.isdir(base) or not safe_path(base): return files",
|
||||
" for root, dirs, fns in safe_walk(base):",
|
||||
" for fn in sorted(fns):",
|
||||
" if not fn.lower().endswith('.gguf'): continue",
|
||||
" fp = os.path.join(root, fn)",
|
||||
" try: size = os.path.getsize(fp)",
|
||||
" except Exception: size = 0",
|
||||
" try: rel = os.path.relpath(fp, base).replace(os.sep, '/')",
|
||||
" except Exception: rel = fn",
|
||||
" sm = re.match(r'(?i)^(.+)-(\\d+)-of-(\\d+)\\.gguf$', fn)",
|
||||
" if sm:",
|
||||
" prefix, part_s, total_s = sm.group(1), sm.group(2), sm.group(3)",
|
||||
" key = (root, prefix, total_s)",
|
||||
" g = split_groups.setdefault(key, {'name':fn,'rel_path':rel,'size_bytes':0,'role':gguf_role(fn),'quant':gguf_quant(fn),'parts':int(total_s),'split':True})",
|
||||
" g['size_bytes'] += size",
|
||||
" if int(part_s) == 1:",
|
||||
" g.update({'name':fn,'rel_path':rel,'role':gguf_role(fn),'quant':gguf_quant(fn)})",
|
||||
" continue",
|
||||
" files.append({'name':fn,'rel_path':rel,'size_bytes':size,'role':gguf_role(fn),'quant':gguf_quant(fn)})",
|
||||
" files.extend(split_groups.values())",
|
||||
" files.sort(key=lambda f: (f.get('role') != 'model', f.get('rel_path', '')))",
|
||||
" return files",
|
||||
"def scan_hf(cache):",
|
||||
" if not os.path.isdir(cache): return",
|
||||
" for d in sorted(os.listdir(cache)):",
|
||||
@@ -205,16 +237,14 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
|
||||
" if f.is_file(): nf += 1; sz += f.stat().st_size",
|
||||
" if f.name.endswith('.incomplete'): ic = True",
|
||||
" snap = os.path.join(cache, d, 'snapshots')",
|
||||
" is_diffusion = False; is_gguf = False",
|
||||
" is_diffusion = False; gguf_files = []",
|
||||
" if os.path.isdir(snap):",
|
||||
" for sd in os.listdir(snap):",
|
||||
" sf = os.path.join(snap, sd)",
|
||||
" if not os.path.isdir(sf): continue",
|
||||
" if os.path.exists(os.path.join(sf, 'model_index.json')): is_diffusion = True",
|
||||
" try:",
|
||||
" if any(x.endswith('.gguf') for x in os.listdir(sf)): is_gguf = True",
|
||||
" except Exception: pass",
|
||||
" models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':is_gguf})",
|
||||
" for f in collect_ggufs(sf): f['rel_path'] = sd + '/' + f['rel_path']; gguf_files.append(f)",
|
||||
" models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':bool(gguf_files),'gguf_files':gguf_files})",
|
||||
"def scan_dir(p):",
|
||||
" if not os.path.isdir(p) or not safe_path(p): return",
|
||||
" for d in sorted(os.listdir(p)):",
|
||||
@@ -223,13 +253,14 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
|
||||
" fp = os.path.join(p, d)",
|
||||
" if not os.path.isdir(fp) or os.path.islink(fp) or not safe_path(fp): continue",
|
||||
" if d in seen: continue",
|
||||
" is_model = False; is_gguf = False",
|
||||
" is_model = False; gguf_files = []",
|
||||
" for root, dirs, fns in safe_walk(fp):",
|
||||
" for fn in fns:",
|
||||
" if fn.endswith('.gguf'): is_gguf = True; is_model = True",
|
||||
" if fn.lower().endswith('.gguf'): is_model = True",
|
||||
" elif fn == 'config.json' or fn.endswith('.safetensors') or fn.endswith('.bin'): is_model = True",
|
||||
" if is_model: break",
|
||||
" if not is_model: continue",
|
||||
" gguf_files = collect_ggufs(fp)",
|
||||
" seen.add(d)",
|
||||
" sz, nf = 0, 0",
|
||||
" for dp, _, fns in safe_walk(fp):",
|
||||
@@ -237,7 +268,7 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
|
||||
" try: nf += 1; sz += os.path.getsize(os.path.join(dp, fn))",
|
||||
" except Exception: pass",
|
||||
" is_diff = os.path.exists(os.path.join(fp, 'model_index.json'))",
|
||||
" models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':is_gguf})",
|
||||
" models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':bool(gguf_files),'gguf_files':gguf_files})",
|
||||
"def parse_size(num, unit):",
|
||||
" try: n = float(num)",
|
||||
" except Exception: return 0",
|
||||
|
||||
@@ -731,6 +731,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
entry["backend"] = m.get("backend")
|
||||
if m.get("is_ollama"):
|
||||
entry["is_ollama"] = True
|
||||
if isinstance(m.get("gguf_files"), list):
|
||||
entry["gguf_files"] = m["gguf_files"]
|
||||
models.append(entry)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse cached models: {e}")
|
||||
|
||||
@@ -141,6 +141,54 @@ function _isActivelyServing(repoId) {
|
||||
} catch { return false; }
|
||||
}
|
||||
|
||||
/**
 * Render a byte count as a short human-readable size for GGUF labels.
 *
 * Returns an empty string for missing, non-numeric, non-finite or
 * non-positive input. Otherwise the value is shown in GB (one decimal),
 * MB (rounded) or KB (rounded, never below 1), using 1024-based units.
 */
function _formatGgufSize(bytes) {
    const KIB = 1024;
    const MIB = KIB * KIB;
    const GIB = MIB * KIB;

    const value = Number(bytes || 0);
    const usable = Number.isFinite(value) && value > 0;
    if (!usable) {
        return '';
    }
    if (value >= GIB) {
        return (value / GIB).toFixed(1) + ' GB';
    }
    if (value >= MIB) {
        return Math.round(value / MIB) + ' MB';
    }
    // Tiny files still show as at least 1 KB rather than "0 KB".
    const kb = Math.round(value / KIB);
    return (kb < 1 ? 1 : kb) + ' KB';
}
|
||||
|
||||
/**
 * Return the usable GGUF file entries attached to a cached-model record.
 *
 * An entry is kept only when it is truthy and carries a non-empty string
 * `rel_path`. A missing model, or a `gguf_files` value that is not an
 * array, yields an empty array.
 */
function _ggufFilesForModel(model) {
    const candidates = model?.gguf_files;
    if (!Array.isArray(candidates)) {
        return [];
    }
    const hasRelPath = (entry) =>
        Boolean(entry) && typeof entry.rel_path === 'string' && entry.rel_path !== '';
    return candidates.filter(hasRelPath);
}
|
||||
|
||||
/**
 * Return the GGUF entries that can be offered as serve targets.
 *
 * Entries whose role is 'model' (or unset) are preferred. When no such
 * entry exists, every valid GGUF entry is returned so the caller still has
 * something to choose from.
 */
function _runnableGgufFiles(model) {
    const everything = _ggufFilesForModel(model);
    const modelsOnly = everything.filter((entry) => {
        const role = entry.role || 'model';
        return role === 'model';
    });
    if (modelsOnly.length > 0) {
        return modelsOnly;
    }
    return everything;
}
|
||||
|
||||
/**
 * Build the display label for one GGUF entry.
 *
 * Shape: "[QUANT ]<basename>[ (<size>[, <N> parts])][ <role>]".
 * The basename is the last '/'-separated segment of `name` (falling back
 * to `rel_path`). The part count appears only when `parts` is above 1, and
 * the role suffix only when the role is set and is not 'model'.
 */
function _ggufFileLabel(file) {
    const fileName = (file.name || file.rel_path || '').split('/').pop();
    const sizeText = _formatGgufSize(file.size_bytes);
    const partCount = Number(file.parts || 0);

    // Collect the parenthesised details; either one may be absent.
    const details = [];
    if (sizeText) {
        details.push(sizeText);
    }
    if (partCount > 1) {
        details.push(`${partCount} parts`);
    }

    let label = file.quant ? `${file.quant} ${fileName}` : `${fileName}`;
    if (details.length > 0) {
        label += ` (${details.join(', ')})`;
    }
    if (file.role && file.role !== 'model') {
        label += ` ${file.role}`;
    }
    return label;
}
|
||||
|
||||
/**
 * Turn a filesystem path into a shell expression.
 *
 * A leading '~' is rewritten to ${HOME} so the shell that runs the command
 * expands it; the rest of the path goes through _shellQuote. Paths without
 * a leading '~' or '~/' are passed to _shellQuote whole.
 *
 * NOTE(review): ${HOME} is emitted unquoted, so a home directory containing
 * whitespace would presumably be word-split by the shell — confirm against
 * how callers embed this expression.
 */
function _shellPathExpr(path) {
    const HOME_VAR = '${HOME}';
    const raw = String(path || '');
    if (raw === '~') {
        return HOME_VAR;
    }
    // '~/x' keeps the '/x' remainder (quoted) after the home variable.
    return raw.startsWith('~/')
        ? HOME_VAR + _shellQuote(raw.slice(1))
        : _shellQuote(raw);
}
|
||||
|
||||
/**
 * Build the shell command substitution that prints the path of the chosen
 * GGUF file.
 *
 * Leading slashes are stripped from `relPath`; an empty result returns ''.
 * For a local-dir model with a path, the file is addressed as
 * <model.path>/<repo>/<rel>. Otherwise it is addressed under
 * ${HOME}/.cache/huggingface/hub/models--<repo with '/' replaced by '--'>/snapshots/<rel>.
 *
 * NOTE(review): ${HOME} is emitted unquoted inside the substitution, so a
 * home directory containing whitespace would presumably be word-split —
 * confirm whether that matters for the targets this runs on.
 */
function _selectedGgufExpr(model, repo, relPath) {
    const rel = String(relPath || '').replace(/^\/+/, '');
    if (rel === '') {
        return '';
    }

    const isLocal = model.is_local_dir && model.path;
    if (!isLocal) {
        const cacheRepo = repo.split('/').join('--');
        const suffix = '/.cache/huggingface/hub/models--' + cacheRepo + '/snapshots/' + rel;
        return '$(printf %s ${HOME}' + _shellQuote(suffix) + ')';
    }

    // Drop trailing slashes so the join below never produces '//'.
    const base = String(model.path || '').replace(/\/+$/, '');
    const fullPath = base + '/' + repo + '/' + rel;
    return '$(printf %s ' + _shellPathExpr(fullPath) + ')';
}
|
||||
|
||||
function _rerenderCachedModels() {
|
||||
const list = document.getElementById('hwfit-cached-list');
|
||||
const tagContainer = document.getElementById('serve-tags');
|
||||
@@ -173,6 +221,8 @@ function _rerenderCachedModels() {
|
||||
if (m.path) {
|
||||
metaParts.push(`<span style="opacity:0.7;">${esc(m.path)}</span>`);
|
||||
}
|
||||
const ggufCount = _runnableGgufFiles(m).length;
|
||||
if (ggufCount > 1) metaParts.push(`${ggufCount} GGUFs`);
|
||||
if (m.status === 'downloading') {
|
||||
const _active = _isActivelyDownloading(m.repo_id);
|
||||
metaParts.push(`<span class="cookbook-dl-status" style="color:var(--accent,var(--red));">${_active ? 'downloading' : 'download stalled'}</span>`);
|
||||
@@ -404,6 +454,14 @@ function _rerenderCachedModels() {
|
||||
const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
|
||||
const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
|
||||
const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
|
||||
const _ggufChoices = _runnableGgufFiles(m);
|
||||
const _savedGguf = String(sv('gguf_file', '') || '');
|
||||
const _defaultGguf = _ggufChoices.some(f => f.rel_path === _savedGguf)
|
||||
? _savedGguf
|
||||
: (_ggufChoices[0]?.rel_path || '');
|
||||
const _ggufOptions = _ggufChoices.map(f =>
|
||||
`<option value="${esc(f.rel_path)}"${f.rel_path === _defaultGguf ? ' selected' : ''}>${esc(_ggufFileLabel(f))}</option>`
|
||||
).join('');
|
||||
// Build save slots
|
||||
const _allPresets = _loadPresets();
|
||||
const _repoShort = repo.split('/').pop();
|
||||
@@ -450,6 +508,13 @@ function _rerenderCachedModels() {
|
||||
}
|
||||
panelHtml += `<label>${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
|
||||
panelHtml += `</div>`;
|
||||
if (_ggufChoices.length > 1) {
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
|
||||
panelHtml += `<label class="hwfit-backend-llamacpp">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
|
||||
panelHtml += `</div>`;
|
||||
} else if (_defaultGguf) {
|
||||
panelHtml += `<input type="hidden" class="hwfit-sf" data-field="gguf_file" value="${esc(_defaultGguf)}" />`;
|
||||
}
|
||||
// Row 2: Core settings
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp">`;
|
||||
panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
|
||||
@@ -559,6 +624,8 @@ function _rerenderCachedModels() {
|
||||
const backend = f.backend || 'vllm';
|
||||
const serveModel = m.is_local_dir && m.path ? `${m.path}/${repo}` : repo;
|
||||
if (backend === 'llamacpp') {
|
||||
const ggufChoices = _runnableGgufFiles(m);
|
||||
const selectedGguf = ggufChoices.find(file => file.rel_path === f.gguf_file);
|
||||
// For multi-part GGUFs, llama.cpp requires the first split
|
||||
// (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
|
||||
// before Q4_K_M/001 etc); fall back to any single GGUF sorted.
|
||||
@@ -569,7 +636,9 @@ function _rerenderCachedModels() {
|
||||
// search the HF snapshots dir, so serving a GGUF from a custom dir works
|
||||
// instead of handing llama.cpp a directory (which fails).
|
||||
const _ldir = `"${m.path}/${repo}"`;
|
||||
f._gguf_path = m.is_local_dir && m.path
|
||||
f._gguf_path = selectedGguf
|
||||
? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
|
||||
: m.is_local_dir && m.path
|
||||
? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
|
||||
: `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
|
||||
}
|
||||
|
||||
@@ -171,6 +171,11 @@ def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
|
||||
plain = tmp_path / "Qwen3.6-27B"
|
||||
plain.mkdir()
|
||||
(plain / "Qwen3.6-27B-Q4_K_M.gguf").write_bytes(b"gguf")
|
||||
(plain / "Qwen3.6-27B-Q5_K_M-00001-of-00003.gguf").write_bytes(b"part1")
|
||||
(plain / "Qwen3.6-27B-Q5_K_M-00002-of-00003.gguf").write_bytes(b"part2")
|
||||
(plain / "Qwen3.6-27B-Q5_K_M-00003-of-00003.gguf").write_bytes(b"part3")
|
||||
(plain / "Qwen3.6-27B-Q6_K_XL.gguf").write_bytes(b"ggufgguf")
|
||||
(plain / "mmproj-BF16.gguf").write_bytes(b"projector")
|
||||
|
||||
hf_internal = tmp_path / "models--Qwen--Qwen3.6-27B"
|
||||
(hf_internal / "snapshots" / "abc").mkdir(parents=True)
|
||||
@@ -189,3 +194,18 @@ def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
|
||||
assert "models--Qwen--Qwen3.6-27B" not in by_repo
|
||||
assert by_repo["Qwen3.6-27B"]["is_local_dir"] is True
|
||||
assert by_repo["Qwen3.6-27B"]["is_gguf"] is True
|
||||
ggufs = by_repo["Qwen3.6-27B"]["gguf_files"]
|
||||
assert [f["rel_path"] for f in ggufs] == [
|
||||
"Qwen3.6-27B-Q4_K_M.gguf",
|
||||
"Qwen3.6-27B-Q5_K_M-00001-of-00003.gguf",
|
||||
"Qwen3.6-27B-Q6_K_XL.gguf",
|
||||
"mmproj-BF16.gguf",
|
||||
]
|
||||
assert [f["role"] for f in ggufs] == ["model", "model", "model", "projector"]
|
||||
assert ggufs[0]["quant"] == "Q4_K_M"
|
||||
assert ggufs[1]["quant"] == "Q5_K_M"
|
||||
assert ggufs[1]["split"] is True
|
||||
assert ggufs[1]["parts"] == 3
|
||||
assert ggufs[1]["size_bytes"] == len(b"part1part2part3")
|
||||
assert ggufs[2]["quant"] == "Q6_K_XL"
|
||||
assert ggufs[3]["quant"] == "BF16"
|
||||
|
||||
Reference in New Issue
Block a user