feat: select cached gguf artifacts for serve (#891)

2026-06-02 13:32:40 +10:00
parent 8455b88643
commit 8b3c0d8ad4
4 changed files with 131 additions and 9 deletions
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -191,6 +191,38 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
        "    for root, dirs, fns in os.walk(top, followlinks=False):",
        "        dirs[:] = [d for d in dirs if not os.path.islink(os.path.join(root, d)) and safe_path(os.path.join(root, d))]",
        "        yield root, dirs, fns",
+        "def gguf_role(name):",
+        "    n = name.lower()",
+        "    if n.startswith('mmproj') or 'mmproj' in n: return 'projector'",
+        "    return 'model'",
+        "def gguf_quant(name):",
+        "    m = re.search(r'(?i)(UD-)?(IQ[0-9]_[A-Z0-9_]+|Q[0-9](?:_[A-Z0-9]+)+|BF16|F16|FP16|F32|Q8_0)', name)",
+        "    return m.group(0).upper() if m else ''",
+        "def collect_ggufs(base):",
+        "    files = []",
+        "    split_groups = {}",
+        "    if not os.path.isdir(base) or not safe_path(base): return files",
+        "    for root, dirs, fns in safe_walk(base):",
+        "        for fn in sorted(fns):",
+        "            if not fn.lower().endswith('.gguf'): continue",
+        "            fp = os.path.join(root, fn)",
+        "            try: size = os.path.getsize(fp)",
+        "            except Exception: size = 0",
+        "            try: rel = os.path.relpath(fp, base).replace(os.sep, '/')",
+        "            except Exception: rel = fn",
+        "            sm = re.match(r'(?i)^(.+)-(\\d+)-of-(\\d+)\\.gguf$', fn)",
+        "            if sm:",
+        "                prefix, part_s, total_s = sm.group(1), sm.group(2), sm.group(3)",
+        "                key = (root, prefix, total_s)",
+        "                g = split_groups.setdefault(key, {'name':fn,'rel_path':rel,'size_bytes':0,'role':gguf_role(fn),'quant':gguf_quant(fn),'parts':int(total_s),'split':True})",
+        "                g['size_bytes'] += size",
+        "                if int(part_s) == 1:",
+        "                    g.update({'name':fn,'rel_path':rel,'role':gguf_role(fn),'quant':gguf_quant(fn)})",
+        "                continue",
+        "            files.append({'name':fn,'rel_path':rel,'size_bytes':size,'role':gguf_role(fn),'quant':gguf_quant(fn)})",
+        "    files.extend(split_groups.values())",
+        "    files.sort(key=lambda f: (f.get('role') != 'model', f.get('rel_path', '')))",
+        "    return files",
        "def scan_hf(cache):",
        "    if not os.path.isdir(cache): return",
        "    for d in sorted(os.listdir(cache)):",
@@ -205,16 +237,14 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
        "                if f.is_file(): nf += 1; sz += f.stat().st_size",
        "                if f.name.endswith('.incomplete'): ic = True",
        "        snap = os.path.join(cache, d, 'snapshots')",
-        "        is_diffusion = False; is_gguf = False",
+        "        is_diffusion = False; gguf_files = []",
        "        if os.path.isdir(snap):",
        "            for sd in os.listdir(snap):",
        "                sf = os.path.join(snap, sd)",
        "                if not os.path.isdir(sf): continue",
        "                if os.path.exists(os.path.join(sf, 'model_index.json')): is_diffusion = True",
-        "                try:",
-        "                    if any(x.endswith('.gguf') for x in os.listdir(sf)): is_gguf = True",
-        "                except Exception: pass",
-        "        models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':is_gguf})",
+        "                for f in collect_ggufs(sf): f['rel_path'] = sd + '/' + f['rel_path']; gguf_files.append(f)",
+        "        models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':bool(gguf_files),'gguf_files':gguf_files})",
        "def scan_dir(p):",
        "    if not os.path.isdir(p) or not safe_path(p): return",
        "    for d in sorted(os.listdir(p)):",
@@ -223,13 +253,14 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
        "        fp = os.path.join(p, d)",
        "        if not os.path.isdir(fp) or os.path.islink(fp) or not safe_path(fp): continue",
        "        if d in seen: continue",
-        "        is_model = False; is_gguf = False",
+        "        is_model = False; gguf_files = []",
        "        for root, dirs, fns in safe_walk(fp):",
        "            for fn in fns:",
-        "                if fn.endswith('.gguf'): is_gguf = True; is_model = True",
+        "                if fn.lower().endswith('.gguf'): is_model = True",
        "                elif fn == 'config.json' or fn.endswith('.safetensors') or fn.endswith('.bin'): is_model = True",
        "            if is_model: break",
        "        if not is_model: continue",
+        "        gguf_files = collect_ggufs(fp)",
        "        seen.add(d)",
        "        sz, nf = 0, 0",
        "        for dp, _, fns in safe_walk(fp):",
@@ -237,7 +268,7 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None) -> str:
        "                try: nf += 1; sz += os.path.getsize(os.path.join(dp, fn))",
        "                except Exception: pass",
        "        is_diff = os.path.exists(os.path.join(fp, 'model_index.json'))",
-        "        models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':is_gguf})",
+        "        models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':bool(gguf_files),'gguf_files':gguf_files})",
        "def parse_size(num, unit):",
        "    try: n = float(num)",
        "    except Exception: return 0",
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -731,6 +731,8 @@ def setup_cookbook_routes() -> APIRouter:
                    entry["backend"] = m.get("backend")
                if m.get("is_ollama"):
                    entry["is_ollama"] = True
+                if isinstance(m.get("gguf_files"), list):
+                    entry["gguf_files"] = m["gguf_files"]
                models.append(entry)
        except Exception as e:
            logger.warning(f"Failed to parse cached models: {e}")
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -141,6 +141,54 @@ function _isActivelyServing(repoId) {
  } catch { return false; }
 }

+function _formatGgufSize(bytes) {
+  const n = Number(bytes || 0);
+  if (!Number.isFinite(n) || n <= 0) return '';
+  if (n >= 1024 ** 3) return `${(n / (1024 ** 3)).toFixed(1)} GB`;
+  if (n >= 1024 ** 2) return `${Math.round(n / (1024 ** 2))} MB`;
+  return `${Math.max(1, Math.round(n / 1024))} KB`;
+}
+
+function _ggufFilesForModel(model) {
+  return Array.isArray(model?.gguf_files)
+    ? model.gguf_files.filter(f => f && typeof f.rel_path === 'string' && f.rel_path)
+    : [];
+}
+
+function _runnableGgufFiles(model) {
+  const files = _ggufFilesForModel(model);
+  const primary = files.filter(f => (f.role || 'model') === 'model');
+  return primary.length ? primary : files;
+}
+
+function _ggufFileLabel(file) {
+  const base = (file.name || file.rel_path || '').split('/').pop();
+  const size = _formatGgufSize(file.size_bytes);
+  const quant = file.quant ? `${file.quant} ` : '';
+  const parts = Number(file.parts || 0);
+  const split = parts > 1 ? `, ${parts} parts` : '';
+  const role = file.role && file.role !== 'model' ? ` ${file.role}` : '';
+  return `${quant}${base}${size || split ? ` (${[size, split.replace(/^, /, '')].filter(Boolean).join(', ')})` : ''}${role}`;
+}
+
+function _shellPathExpr(path) {
+  const s = String(path || '');
+  if (s === '~') return '${HOME}';
+  if (s.startsWith('~/')) return '${HOME}' + _shellQuote(s.slice(1));
+  return _shellQuote(s);
+}
+
+function _selectedGgufExpr(model, repo, relPath) {
+  const rel = String(relPath || '').replace(/^\/+/, '');
+  if (!rel) return '';
+  if (model.is_local_dir && model.path) {
+    const base = String(model.path || '').replace(/\/+$/, '');
+    return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
+  }
+  const cacheRepo = repo.replace(/\//g, '--');
+  return `$(printf %s \${HOME}${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`)})`;
+}
+
 function _rerenderCachedModels() {
  const list = document.getElementById('hwfit-cached-list');
  const tagContainer = document.getElementById('serve-tags');
@@ -173,6 +221,8 @@ function _rerenderCachedModels() {
    if (m.path) {
      metaParts.push(`<span style="opacity:0.7;">${esc(m.path)}</span>`);
    }
+    const ggufCount = _runnableGgufFiles(m).length;
+    if (ggufCount > 1) metaParts.push(`${ggufCount} GGUFs`);
    if (m.status === 'downloading') {
      const _active = _isActivelyDownloading(m.repo_id);
      metaParts.push(`<span class="cookbook-dl-status" style="color:var(--accent,var(--red));">${_active ? 'downloading' : 'download stalled'}</span>`);
@@ -404,6 +454,14 @@ function _rerenderCachedModels() {
      const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
      const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
      const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
+      const _ggufChoices = _runnableGgufFiles(m);
+      const _savedGguf = String(sv('gguf_file', '') || '');
+      const _defaultGguf = _ggufChoices.some(f => f.rel_path === _savedGguf)
+        ? _savedGguf
+        : (_ggufChoices[0]?.rel_path || '');
+      const _ggufOptions = _ggufChoices.map(f =>
+        `<option value="${esc(f.rel_path)}"${f.rel_path === _defaultGguf ? ' selected' : ''}>${esc(_ggufFileLabel(f))}</option>`
+      ).join('');
      // Build save slots
      const _allPresets = _loadPresets();
      const _repoShort = repo.split('/').pop();
@@ -450,6 +508,13 @@ function _rerenderCachedModels() {
      }
      panelHtml += `<label>${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
      panelHtml += `</div>`;
+      if (_ggufChoices.length > 1) {
+        panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
+        panelHtml += `<label class="hwfit-backend-llamacpp">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
+        panelHtml += `</div>`;
+      } else if (_defaultGguf) {
+        panelHtml += `<input type="hidden" class="hwfit-sf" data-field="gguf_file" value="${esc(_defaultGguf)}" />`;
+      }
      // Row 2: Core settings
      panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp">`;
      panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
@@ -559,6 +624,8 @@ function _rerenderCachedModels() {
        const backend = f.backend || 'vllm';
        const serveModel = m.is_local_dir && m.path ? `${m.path}/${repo}` : repo;
        if (backend === 'llamacpp') {
+          const ggufChoices = _runnableGgufFiles(m);
+          const selectedGguf = ggufChoices.find(file => file.rel_path === f.gguf_file);
          // For multi-part GGUFs, llama.cpp requires the first split
          // (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
          // before Q4_K_M/001 etc); fall back to any single GGUF sorted.
@@ -569,7 +636,9 @@ function _rerenderCachedModels() {
          // search the HF snapshots dir, so serving a GGUF from a custom dir works
          // instead of handing llama.cpp a directory (which fails).
          const _ldir = `"${m.path}/${repo}"`;
-          f._gguf_path = m.is_local_dir && m.path
+          f._gguf_path = selectedGguf
+            ? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
+            : m.is_local_dir && m.path
            ? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
            : `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
        }
--- a/tests/test_cookbook_helpers.py
+++ b/tests/test_cookbook_helpers.py
@@ -171,6 +171,11 @@ def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
    plain = tmp_path / "Qwen3.6-27B"
    plain.mkdir()
    (plain / "Qwen3.6-27B-Q4_K_M.gguf").write_bytes(b"gguf")
+    (plain / "Qwen3.6-27B-Q5_K_M-00001-of-00003.gguf").write_bytes(b"part1")
+    (plain / "Qwen3.6-27B-Q5_K_M-00002-of-00003.gguf").write_bytes(b"part2")
+    (plain / "Qwen3.6-27B-Q5_K_M-00003-of-00003.gguf").write_bytes(b"part3")
+    (plain / "Qwen3.6-27B-Q6_K_XL.gguf").write_bytes(b"ggufgguf")
+    (plain / "mmproj-BF16.gguf").write_bytes(b"projector")

    hf_internal = tmp_path / "models--Qwen--Qwen3.6-27B"
    (hf_internal / "snapshots" / "abc").mkdir(parents=True)
@@ -189,3 +194,18 @@ def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
    assert "models--Qwen--Qwen3.6-27B" not in by_repo
    assert by_repo["Qwen3.6-27B"]["is_local_dir"] is True
    assert by_repo["Qwen3.6-27B"]["is_gguf"] is True
+    ggufs = by_repo["Qwen3.6-27B"]["gguf_files"]
+    assert [f["rel_path"] for f in ggufs] == [
+        "Qwen3.6-27B-Q4_K_M.gguf",
+        "Qwen3.6-27B-Q5_K_M-00001-of-00003.gguf",
+        "Qwen3.6-27B-Q6_K_XL.gguf",
+        "mmproj-BF16.gguf",
+    ]
+    assert [f["role"] for f in ggufs] == ["model", "model", "model", "projector"]
+    assert ggufs[0]["quant"] == "Q4_K_M"
+    assert ggufs[1]["quant"] == "Q5_K_M"
+    assert ggufs[1]["split"] is True
+    assert ggufs[1]["parts"] == 3
+    assert ggufs[1]["size_bytes"] == len(b"part1part2part3")
+    assert ggufs[2]["quant"] == "Q6_K_XL"
+    assert ggufs[3]["quant"] == "BF16"