diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py
index 28a2897..106460f 100644
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -1401,9 +1401,16 @@ def setup_cookbook_routes() -> APIRouter:
total_mb = max(0, int(total_bytes / (1024 * 1024)))
used_mb = max(0, min(total_mb, int(used_bytes / (1024 * 1024))))
free_mb = max(0, total_mb - used_mb)
+ # GTT = the system-RAM pool the GPU pages into when VRAM is full.
+ # On a discrete card a large gtt_used means the model spilled past
+ # VRAM into RAM over PCIe — much slower. Surface it so the UI can
+ # warn "spilling to RAM" instead of the user wondering why it's slow.
+ gtt_used_raw = await _gpu_read_file(f"{base}/mem_info_gtt_used", host, ssh_port)
+ gtt_used_mb = max(0, int(int(gtt_used_raw) / (1024 * 1024))) if (gtt_used_raw and gtt_used_raw.isdigit()) else 0
gpus.append({
"index": len(gpus), "name": name, "uuid": entry,
"free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
+ "gtt_used_mb": gtt_used_mb,
"util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
"processes": [], "backend": "rocm", "source": "amd-sysfs",
"unified_memory": unified,
diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py
index 9a0a4e9..94ff90d 100644
--- a/routes/hwfit_routes.py
+++ b/routes/hwfit_routes.py
@@ -1,3 +1,4 @@
+import re
from copy import deepcopy
from fastapi import APIRouter
@@ -174,6 +175,64 @@ def setup_hwfit_routes():
results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None)
return {"system": system, "models": results}
+    @router.get("/profiles")
+    def get_serve_profiles(model: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""):
+        """Compute llama.cpp serve profiles (Quality/Balanced/Speed) for `model`
+        against the detected hardware on `host` (or local). Returns concrete
+        flags (n_gpu_layers, n_cpu_moe, cache_type, ctx) the serve UI can apply.
+
+        `model` is looked up in the catalog by exact name first, then by a
+        normalized name (org prefix, trailing GGUF marker and quant tag dropped).
+        If it is not in the catalog (e.g. an ad-hoc HF repo) there is nothing to
+        compute from, so the response carries an empty `profiles` list plus an
+        `error` string and the UI keeps its manual flags.
+
+        Response keys: `system`, `profiles`, and either `error` (detection
+        failed / model unknown) or `model_ctx_max` (0 when the catalog entry
+        carries no usable context-length field).
+        """
+        from services.hwfit.hardware import detect_system
+        from services.hwfit.models import get_models
+        from services.hwfit.profiles import compute_serve_profiles
+        system = detect_system(host=host, ssh_port=ssh_port, platform=platform, fresh=fresh)
+        if system.get("error"):
+            return {"system": system, "profiles": [], "error": system["error"]}
+        catalog = {m.get("name"): m for m in (get_models() or [])}
+
+        def _norm(s):
+            # Normalize for matching: drop org/ prefix, a trailing -GGUF/-gguf
+            # marker, and any quant tag, lowercase. So "DeepSeek-Coder-V2-Lite-
+            # Instruct-GGUF" (a local folder name) matches catalog entry
+            # "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct".
+            s = (s or "").lower().strip()
+            s = s.split("/")[-1]  # drop org prefix
+            s = re.sub(r"[-_.]?gguf$", "", s)  # drop trailing gguf marker
+            s = re.sub(r"[-_.](q\d[^/]*|iq\d[^/]*|fp8|bf16|f16|awq[^/]*|gptq[^/]*)$", "", s)
+            return s
+
+        m = catalog.get(model)
+        if m is None and model:
+            want = _norm(model)
+            # An empty normalized name (e.g. model="GGUF" or "org/") would make
+            # every endswith() test below true and silently select the first
+            # catalog entry, so only fuzzy-match when there is text to compare.
+            if want:
+                suffix_hit = None
+                for name, entry in catalog.items():
+                    nn = _norm(name)
+                    if not nn:
+                        continue
+                    if nn == want:
+                        # Exact normalized match always wins.
+                        m = entry
+                        break
+                    if suffix_hit is None and (want.endswith(nn) or nn.endswith(want)):
+                        # Remember the first loose match, but keep scanning in
+                        # case a later entry matches exactly.
+                        suffix_hit = entry
+                if m is None:
+                    m = suffix_hit
+        if m is None:
+            return {"system": system, "profiles": [], "error": "model not in catalog"}
+        # Surface the model's trained context limit so the serve UI can clamp a
+        # user-typed context down to it (asking for ctx > n_ctx_train overflows
+        # and, with a quantized KV cache, can crash the GPU).
+        model_ctx_max = 0
+        for k in ("context_length", "max_position_embeddings", "n_ctx_train", "context"):
+            v = m.get(k)
+            if isinstance(v, (int, float)) and v > 0:
+                model_ctx_max = int(v)
+                break
+        return {
+            "system": system,
+            "profiles": compute_serve_profiles(
+                system, m,
+                serve_weights_gb=(serve_weights_gb or None),
+                serve_quant=(serve_quant or None),
+            ),
+            "model_ctx_max": model_ctx_max,
+        }
+
+
@router.get("/image-models")
def get_image_models(sort: str = "fit", search: str = "", host: str = "", gpu_count: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False):
"""Rank image generation models against detected hardware."""
diff --git a/services/hwfit/profiles.py b/services/hwfit/profiles.py
new file mode 100644
index 0000000..87aa147
--- /dev/null
+++ b/services/hwfit/profiles.py
@@ -0,0 +1,229 @@
+"""Compute intelligent llama.cpp serve profiles from detected hardware.
+
+Given a system (VRAM/RAM/arch) and a model, produce up to three ready-to-launch
+profiles — Quality / Balanced / Speed — with concrete llama.cpp flags
+(n_gpu_layers, n_cpu_moe, cache-type, context). This turns the by-hand tuning
+(how many MoE layers fit on the GPU, when to spend VRAM on a q8 KV cache vs more
+context, how much headroom to leave for a vision encoder) into a formula.
+
+Pure/deterministic — no benchmarking, no I/O. Reuses the same VRAM math as
+fit.py/models.py so "what the Cookbook recommends" and "what it serves" agree.
+
+NOTE: token/s figures are NOT computed here — real speed on partial-offload MoE
+is CPU-bound and not reliably predictable from specs. The UI labels profiles by
+their tradeoff (Quality/Balanced/Speed), and the VRAM fit (the part that decides
+whether it even loads) is what's computed from real numbers.
+"""
+
+from services.hwfit.models import (
+ QUANT_BPP,
+ params_b,
+ _active_params_b,
+ is_prequantized,
+)
+
+# Relative GGUF KV-cache size by cache type, as a multiplier on the q8_0-ish
+# base cost (the 8e-6 GB per token per active-billion-param figure, the same
+# base used in estimate_memory_gb). q4_0 is ~half of q8_0, which is ~half of f16.
+_KV_FACTOR = {"q4_0": 0.5, "q8_0": 1.0, "f16": 2.0}
+
+# Quant ladder from highest quality/size down. A profile that wants "best quant
+# that fits fully on GPU" walks this until one fits.
+_QUANT_LADDER = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
+
+
+def _weights_gb(model, quant, fixed_gb=None):
+    """Return the VRAM needed for the model's full weights.
+
+    A positive `fixed_gb` means a specific GGUF file is already on disk; its
+    real size is used as-is because the quant is whatever that file contains.
+    Otherwise the size is params_b(model) scaled by the QUANT_BPP entry for
+    `quant` (0.58 when the quant is not in the table).
+    """
+    has_real_size = bool(fixed_gb and fixed_gb > 0)
+    if not has_real_size:
+        size_b = params_b(model)
+        return size_b * QUANT_BPP.get(quant, 0.58)
+    return float(fixed_gb)
+
+
+def _kv_gb(model, ctx, kv_type):
+    """Estimate KV-cache VRAM for `ctx` tokens stored as `kv_type`.
+
+    Scales the 8e-6 base by the model's active parameter count, the context
+    length and the _KV_FACTOR multiplier (1.0 for an unknown cache type).
+    """
+    active_b = _active_params_b(model)
+    per_token = 0.000008 * active_b
+    return per_token * ctx * _KV_FACTOR.get(kv_type, 1.0)
+
+
+def _n_layers(model):
+    """Best-effort transformer block count, used for the n-cpu-moe math.
+
+    Uses the first positive numeric value among the known layer-count keys;
+    when none is present, guesses from the parameter count (64 / 48 / 40 / 32).
+    """
+    layer_keys = ("num_hidden_layers", "n_layers", "num_layers", "block_count")
+    # Lazy generator: keys are read in order and lookup stops at the first hit.
+    for declared in (model.get(field) for field in layer_keys):
+        if isinstance(declared, (int, float)) and declared > 0:
+            return int(declared)
+    # Size-based fallback — most MoE/dense LLMs land in the 28-64 layer range.
+    size_b = params_b(model)
+    for min_size_b, guess in ((60, 64), (25, 48), (12, 40)):
+        if size_b >= min_size_b:
+            return guess
+    return 32
+
+
+def _cpu_moe_for_budget(model, quant, kv_gb, vram_budget_gb, fixed_gb=None):
+    """Work out how many MoE layers must move to CPU to fit `vram_budget_gb`.
+
+    Returns a tuple (n_cpu_moe, fits_fully). The requirement is weights + KV
+    cache + 0.6 GB of runtime/compute buffers. If that already fits, the result
+    is (0, True). Offload is only modelled for MoE models (where --n-cpu-moe
+    applies), assuming each offloaded layer frees about weights / n_layers of
+    VRAM; dense models that do not fit report (0, False).
+    """
+    import math
+
+    weight_gb = _weights_gb(model, quant, fixed_gb)
+    required = weight_gb + kv_gb + 0.6  # +0.6 GB runtime/compute buffers
+    if required <= vram_budget_gb:
+        return 0, True
+    if model.get("is_moe"):
+        total_layers = _n_layers(model)
+        gb_per_layer = weight_gb / max(total_layers, 1)
+        shortfall = required - vram_budget_gb
+        to_cpu = math.ceil(shortfall / max(gb_per_layer, 1e-6))
+        # Never report fewer than zero or more layers than the model has.
+        return max(0, min(to_cpu, total_layers)), False
+    # Dense: no per-expert offload knob; either it fits or it spills via -ngl.
+    return 0, False
+
+
+def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=None):
+    """Return a list of profile dicts for llama.cpp serving of `model` on `system`.
+
+    Each profile: {key, label, quant, n_gpu_layers, n_cpu_moe, cache_type, ctx,
+    est_vram_gb, fits, offloads, note}. Returns an empty list when `system`
+    reports no GPU VRAM (caller should fall back to manual flags).
+
+    DOWNLOAD mode (default): the quant isn't chosen yet, so profiles vary it
+    (Quality prefers Q6_K, Balanced Q4_K_M, Speed the largest quant from Q4_K_M
+    down that fits fully on the GPU) to show download options.
+
+    SERVE mode (serve_weights_gb set): a specific GGUF file already exists on
+    disk — its quant is FIXED. Profiles then keep that quant/size and differ only
+    in the actual serving knobs (n_cpu_moe, KV-cache type, context). serve_quant
+    is the file's quant label (e.g. "Q4_K_M") just for display.
+
+    Context handling: each profile starts at min(profile target, model limit).
+    For dense models that do not fit, the context is halved down to a floor of
+    8192 tokens (or the model's own limit when that is smaller). Every spec
+    yields exactly one profile before de-duplication; `fits` reports whether
+    the final configuration actually fit.
+    """
+    vram = float(system.get("gpu_vram_gb") or 0)
+    if vram <= 0:
+        return []
+
+    serve_mode = bool(serve_weights_gb and serve_weights_gb > 0)
+
+    # Never propose more context than the model was trained for — asking llama.cpp
+    # for ctx > n_ctx_train triggers a "training context overflow" and, with a
+    # quantized KV cache, an oversized allocation that can crash the GPU
+    # (radv/amdgpu ErrorDeviceLost). Cap every profile at the model's real limit.
+    model_ctx_max = 0
+    for k in ("context_length", "max_position_embeddings", "n_ctx_train", "context"):
+        v = model.get(k)
+        if isinstance(v, (int, float)) and v > 0:
+            model_ctx_max = int(v)
+            break
+    if model_ctx_max <= 0:
+        model_ctx_max = 131072  # default when the catalog omits it
+
+    # Vision models need headroom for the image encoder (~1 GB on top of weights).
+    is_vision = bool(
+        model.get("is_multimodal") or model.get("vision") or model.get("mmproj")
+        or "vl" in str(model.get("name", "")).lower()
+    )
+    headroom = 1.1 if is_vision else 0.4
+    budget = max(vram - headroom, 1.0)
+
+    # Prequantized (AWQ/GPTQ/FP8) served via GGUF fallback use a fixed ~Q4 quant;
+    # GGUF models can pick their quant. Pick a sensible per-profile quant.
+    fixed_quant = model.get("quantization") if is_prequantized(model) else None
+
+    is_moe = bool(model.get("is_moe"))
+
+    def _pick_quant(prefer, require_full_fit):
+        """Choose a quant for a profile.
+
+        - fixed_quant (AWQ/GPTQ/FP8 served via GGUF): always that.
+        - require_full_fit=True (Speed): walk DOWN from `prefer` to the best quant
+          whose weights fit fully on the GPU (no offload) — fastest.
+        - require_full_fit=False (Quality on MoE): keep `prefer` even if it must
+          offload experts to CPU; that's the whole point of n-cpu-moe on a card
+          too small to hold the weights. For dense models we can't offload
+          per-expert, so fall back to the largest fully-fitting quant.
+        """
+        if fixed_quant:
+            return fixed_quant
+        start = _QUANT_LADDER.index(prefer) if prefer in _QUANT_LADDER else 3
+        if require_full_fit or not is_moe:
+            for q in _QUANT_LADDER[start:]:
+                if _weights_gb(model, q) + 0.6 <= budget:
+                    return q
+            return _QUANT_LADDER[-1]
+        # MoE quality: keep the preferred (big) quant; offload handles overflow.
+        return prefer
+
+    if serve_mode:
+        # Fixed file on disk — quant can't change. Vary only the serving knobs.
+        fq = serve_quant or model.get("quantization") or "GGUF"
+        specs = [
+            # key, label, prefer_quant, full_fit, kv_type, ctx, note
+            ("quality", "Quality", fq, False, "q8_0", 131072,
+             "Sharp q8 KV cache + full context. Best long-context accuracy; offloads MoE layers to CPU if needed."),
+            ("balanced", "Balanced", fq, False, "q4_0", 131072,
+             "Compact q4 KV at full context — good speed/quality mix."),
+            ("speed", "Speed", fq, False, "q4_0", 32768,
+             "Trimmed context + light KV for the fastest tokens/s."),
+        ]
+    else:
+        specs = [
+            # key, label, prefer_quant, full_fit, kv_type, ctx, note
+            ("quality", "Quality", "Q6_K", False, "q8_0", 131072,
+             "Biggest quant + sharp q8 KV cache. Best answers; offloads MoE layers to CPU if needed."),
+            ("balanced", "Balanced", "Q4_K_M", False, "q4_0", 131072,
+             "Q4 weights + compact q4 KV. Good speed/quality mix at full context."),
+            ("speed", "Speed", "Q4_K_M", True, "q4_0", 32768,
+             "Smallest offload + trimmed context for the fastest tokens/s."),
+        ]
+
+    profiles = []
+    for key, label, prefer_q, full_fit, kv_type, ctx, note in specs:
+        # In serve mode the quant is fixed (the file's); in download mode we pick.
+        quant = prefer_q if serve_mode else _pick_quant(prefer_q, full_fit)
+        # Start from the smaller of the profile's target and the model's limit.
+        cur_ctx = min(ctx, model_ctx_max)
+        # Lowest context the shrink loop may reach. Normally 8192, but a model
+        # trained for less than that must still get a profile at its own limit
+        # instead of being skipped entirely.
+        ctx_floor = min(8192, cur_ctx)
+        while True:
+            kv = _kv_gb(model, cur_ctx, kv_type)
+            n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb)
+            # MoE handles overflow via expert offload, so it never needs to shrink
+            # context here. A dense model that can't fit tries less context until
+            # the floor, where it is reported as-is with fits=False.
+            if is_moe or fits or cur_ctx <= ctx_floor:
+                est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6
+                profiles.append({
+                    "key": key,
+                    "label": label,
+                    "quant": quant,
+                    "n_gpu_layers": 999,
+                    "n_cpu_moe": n_cpu_moe,
+                    "cache_type": kv_type,
+                    "ctx": cur_ctx,
+                    # When experts offload, GPU-resident VRAM tops out at the
+                    # budget (weights beyond it live in system RAM), so cap the
+                    # estimate at `budget`, not the full card — this also leaves
+                    # the vision-encoder headroom visible in the number.
+                    "est_vram_gb": round(min(est, budget), 1),
+                    # For MoE we treat it as fitting via offload; `offloads`
+                    # tells the caller whether expert layers moved to CPU.
+                    "fits": fits or is_moe,
+                    "offloads": n_cpu_moe > 0,
+                    "note": note,
+                })
+                break
+            # Clamp to the floor so contexts that don't halve onto exactly 8192
+            # (e.g. 40960 -> 20480 -> 10240 -> 5120) still end on a valid step
+            # rather than falling out of the loop with no profile.
+            cur_ctx = max(cur_ctx // 2, ctx_floor)
+
+    # De-dupe identical profiles (e.g. tiny model where all three collapse to the
+    # same all-GPU config) — keep the first/highest-quality label.
+    seen = set()
+    deduped = []
+    for p in profiles:
+        sig = (p["quant"], p["n_cpu_moe"], p["cache_type"], p["ctx"])
+        if sig in seen:
+            continue
+        seen.add(sig)
+        deduped.append(p)
+    return deduped
diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js
index 6ed895d..bd49a17 100644
--- a/static/js/cookbook-hwfit.js
+++ b/static/js/cookbook-hwfit.js
@@ -365,6 +365,17 @@ function _hwfitShowError(list, host, detail) {
if (rb) rb.addEventListener('click', () => { _resetGpuToggleState(); _hwfitFetch(true); });
}
+// Client-side "Engine" filter (llama.cpp / vLLM / SGLang). Empty = show all.
+// Uses the same _detectBackend() the serve commands use, so what you filter to
+// is exactly what would be launched. Pure view filter — no refetch needed.
+function _applyEngineFilter(models) {
+  // Read the selected engine from the #hwfit-engine control; '' means "all".
+  const selector = document.getElementById('hwfit-engine');
+  const wanted = (selector && selector.value) || '';
+  if (!wanted || !Array.isArray(models)) {
+    return models || [];
+  }
+  // Keep a model when its detected backend equals the selection. If detection
+  // throws, keep the model rather than hide it.
+  const keep = (model) => {
+    try {
+      return _detectBackend(model).backend === wanted;
+    } catch {
+      return true;
+    }
+  };
+  return models.filter(keep);
+}
+
export async function _hwfitFetch(fresh = false) {
const _tk = ++_hwfitFetchToken;
const useCase = document.getElementById('hwfit-usecase')?.value || '';
@@ -384,7 +395,7 @@ export async function _hwfitFetch(fresh = false) {
if (_cached) {
_hwfitCache = _cached;
_hwfitRenderHw(hw, _cached.system);
- _hwfitRenderList(list, _cached.models);
+ _hwfitRenderList(list, _applyEngineFilter(_cached.models));
} else {
// Show spinner while scanning — stack the spinner above a text label
// (the .hwfit-loading class is a centered flex ROW, so force column here).
@@ -530,7 +541,7 @@ export async function _hwfitFetch(fresh = false) {
return asc ? av - bv : bv - av;
});
}
- _hwfitRenderList(list, data.models);
+ _hwfitRenderList(list, _applyEngineFilter(data.models));
// Persist this result so the next page load can paint it instantly.
_writeScanCache(_sig, data);
// Render GPU toggles — only on first scan (no override active)
@@ -773,9 +784,10 @@ export function _hwfitRenderList(el, models) {
const hasHw = sys && ((sys.gpu_vram_gb || 0) > 0 || (sys.total_ram_gb || 0) > 8);
const hasFilters = !!(document.getElementById('hwfit-search')?.value?.trim()
|| document.getElementById('hwfit-usecase')?.value
- || document.getElementById('hwfit-quant')?.value);
+ || document.getElementById('hwfit-quant')?.value
+ || document.getElementById('hwfit-engine')?.value);
let msg;
- if (hasFilters) msg = 'No models match these filters — try clearing the search, use-case, or quant.';
+ if (hasFilters) msg = 'No models match these filters — try clearing the search, use-case, quant, or engine.';
else if (hasHw) msg = 'No models fit — the hardware probe may have under-reported. Try Rescan.';
else msg = 'No models fit your hardware';
el.innerHTML = `
${msg}
`;
@@ -1122,6 +1134,17 @@ export function _hwfitInit() {
if (uc) uc.addEventListener('change', () => _hwfitFetch());
if (sort) sort.addEventListener('change', () => _hwfitFetch());
if (qpref) qpref.addEventListener('change', () => _hwfitFetch());
+ // Engine filter is a pure client-side view filter over the already-fetched
+ // list, so just re-render from cache instead of re-probing hardware.
+ const engine = document.getElementById('hwfit-engine');
+ if (engine) engine.addEventListener('change', () => {
+ const list = document.getElementById('hwfit-list');
+ if (list && _hwfitCache && Array.isArray(_hwfitCache.models)) {
+ _hwfitRenderList(list, _applyEngineFilter(_hwfitCache.models));
+ } else {
+ _hwfitFetch();
+ }
+ });
// Rescan — force a fresh hardware probe (bypasses the per-host cache).
const rescan = document.getElementById('hwfit-rescan');
if (rescan && !rescan.dataset.bound) {
diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index af8d911..8c23a5a 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -417,11 +417,40 @@ export function _buildServeCmd(f, modelName, backend) {
// renders modern GGUF chat templates that the Python bindings' Jinja2
// rejects (do_tojson ensure_ascii). Fall back to llama_cpp.server.
// Don't suppress stderr — surface real errors (missing file, lib, OOM).
- const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}`;
+ // Optional perf/fit flags from a hardware profile (see services/hwfit/
+ // profiles.py). n_cpu_moe offloads MoE expert layers to CPU when the model
+ // is bigger than VRAM; flash-attn + a quantized KV cache cut KV memory and
+ // speed things up. Only emitted when set, so manual/older flows are unchanged.
+ const _ncm = (f.n_cpu_moe ?? '').toString().trim();
+ const _kv = (f.cache_type ?? '').toString().trim();
+ let _lcExtra = '';
+ let _lcpExtra = '';
+ if (_ncm !== '' && Number(_ncm) > 0) {
+ _lcExtra += ` --n-cpu-moe ${_ncm}`;
+ _lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
+ }
+ if (f.flash_attn) {
+ _lcExtra += ' --flash-attn on';
+ _lcpExtra += ' --flash_attn true';
+ }
+ if (_kv) {
+ _lcExtra += ` --cache-type-k ${_kv} --cache-type-v ${_kv}`;
+ // llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
+ _lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
+ }
+ // Vision: serve the multimodal projector so the model can read images. The
+ // mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
+ // only emitted when the Vision toggle is on AND a projector was found.
+ if (f.vision && f._mmproj_path) {
+ _lcExtra += ` --mmproj "${f._mmproj_path}" --image-max-tokens 1024`;
+ // llama-cpp-python takes the projector via --clip_model_path.
+ _lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
+ }
+ const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
if (_isWindows()) {
cmd += _lcpServer;
} else {
- cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}`;
+ cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
cmd += ` || ${_lcpServer}`;
}
} else if (backend === 'ollama') {
@@ -1460,6 +1489,16 @@ function _renderRecipes() {
html += '';
html += '';
html += '';
+ // Engine filter: show only models whose serve engine matches. "llama.cpp"
+ // (GGUF) runs everywhere incl. consumer AMD/Apple; vLLM/SGLang are CUDA /
+ // datacenter-ROCm. Filtering is client-side via _detectBackend() in the
+ // hwfit renderer, so it composes with the quant/type/search filters.
+ html += '';
html += '';
html += '
';
html += '
`;
+ // Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
+ const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => ``).join('');
+ panelHtml += `
`;
+ // Row 2d: Auto profiles — computed from detected hardware (see profiles.py).
+ // Buttons are injected after the panel mounts (needs an async fetch).
+ panelHtml += `
`;
+ // Live VRAM / RAM-spillover monitor for the serve target's GPU. Polls
+ // /api/cookbook/gpus while the panel is open so you can SEE whether the
+ // config fits VRAM (fast) or spills to system RAM (slow). Populated after mount.
+ panelHtml += `