"""Compute intelligent llama.cpp serve profiles from detected hardware. Given a system (VRAM/RAM/arch) and a model, produce 1-4 ready-to-launch profiles — Quality / Balanced / Speed — with concrete llama.cpp flags (n_gpu_layers, n_cpu_moe, cache-type, context). This turns the by-hand tuning (how many MoE layers fit on the GPU, when to spend VRAM on a q8 KV cache vs more context, how much headroom to leave for a vision encoder) into a formula. Pure/deterministic — no benchmarking, no I/O. Reuses the same VRAM math as fit.py/models.py so "what the Cookbook recommends" and "what it serves" agree. NOTE: token/s figures are NOT computed here — real speed on partial-offload MoE is CPU-bound and not reliably predictable from specs. The UI labels profiles by their tradeoff (Quality/Balanced/Speed), and the VRAM fit (the part that decides whether it even loads) is what's computed from real numbers. """ from services.hwfit.models import ( QUANT_BPP, params_b, _active_params_b, is_prequantized, ) # GGUF KV-cache cost per token, in bytes-per-active-billion-param, by cache type. # q4_0 is ~half of q8_0 is ~half of f16. The 8e-6 base in estimate_memory_gb is # the q8_0-ish figure; scale from there. _KV_FACTOR = {"q4_0": 0.5, "q8_0": 1.0, "f16": 2.0} # Quant ladder from highest quality/size down. A profile that wants "best quant # that fits fully on GPU" walks this until one fits. _QUANT_LADDER = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"] def _weights_gb(model, quant, fixed_gb=None): """VRAM for the full weights. When fixed_gb is given (serving a specific GGUF file already on disk), use its real size — the quant is whatever the file is, not something we get to pick.""" if fixed_gb and fixed_gb > 0: return float(fixed_gb) return params_b(model) * QUANT_BPP.get(quant, 0.58) def _kv_gb(model, ctx, kv_type): """KV-cache VRAM at a context length and cache type.""" kv_params = _active_params_b(model) return 0.000008 * kv_params * ctx * _KV_FACTOR.get(kv_type, 1.0) def _n_layers(model): """Best-effort total transformer block count (for n-cpu-moe math).""" for k in ("num_hidden_layers", "n_layers", "num_layers", "block_count"): v = model.get(k) if isinstance(v, (int, float)) and v > 0: return int(v) # Fallback heuristic by size — most MoE/dense LLMs land 28-64 layers. pb = params_b(model) if pb >= 60: return 64 if pb >= 25: return 48 if pb >= 12: return 40 return 32 def _cpu_moe_for_budget(model, quant, kv_gb, vram_budget_gb, fixed_gb=None): """How many MoE layers must move to CPU so weights+KV fit vram_budget_gb. Returns (n_cpu_moe, fits_fully). When the model already fits, n_cpu_moe=0. Each offloaded layer frees roughly weights/n_layers of VRAM. We only model this for MoE (where --n-cpu-moe applies); dense models just report whether they fit at the given n_gpu_layers=999. """ weights = _weights_gb(model, quant, fixed_gb) needed = weights + kv_gb + 0.6 # +0.6 GB runtime/compute buffers if needed <= vram_budget_gb: return 0, True if not model.get("is_moe"): # Dense: no per-expert offload knob; either it fits or it spills via -ngl. return 0, False layers = _n_layers(model) per_layer = weights / max(layers, 1) overflow = needed - vram_budget_gb import math n = math.ceil(overflow / max(per_layer, 1e-6)) n = max(0, min(n, layers)) # clamp return n, False def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=None): """Return a list of profile dicts for llama.cpp serving of `model` on `system`. Each profile: {key, label, quant, n_gpu_layers, n_cpu_moe, cache_type, ctx, est_vram_gb, fits, note}. Empty list if no GGUF path makes sense (caller should fall back to manual flags). DOWNLOAD mode (default): the quant isn't chosen yet, so profiles vary it (Quality=Q6, Balanced=Q4, Speed=Q2…) to show download options. SERVE mode (serve_weights_gb set): a specific GGUF file already exists on disk — its quant is FIXED. Profiles then keep that quant/size and differ only in the actual serving knobs (n_cpu_moe, KV-cache type, context). serve_quant is the file's quant label (e.g. "Q4_K_M") just for display. """ vram = float(system.get("gpu_vram_gb") or 0) if vram <= 0: return [] serve_mode = bool(serve_weights_gb and serve_weights_gb > 0) # Never propose more context than the model was trained for — asking llama.cpp # for ctx > n_ctx_train triggers a "training context overflow" and, with a # quantized KV cache, an oversized allocation that can crash the GPU # (radv/amdgpu ErrorDeviceLost). Cap every profile at the model's real limit. model_ctx_max = 0 for k in ("context_length", "max_position_embeddings", "n_ctx_train", "context"): v = model.get(k) if isinstance(v, (int, float)) and v > 0: model_ctx_max = int(v) break if model_ctx_max <= 0: model_ctx_max = 131072 # conservative default when the catalog omits it # Vision models need headroom for the image encoder (~1 GB on top of weights). is_vision = bool( model.get("is_multimodal") or model.get("vision") or model.get("mmproj") or "vl" in str(model.get("name", "")).lower() ) headroom = 1.1 if is_vision else 0.4 budget = max(vram - headroom, 1.0) # Prequantized (AWQ/GPTQ/FP8) served via GGUF fallback use a fixed ~Q4 quant; # GGUF models can pick their quant. Pick a sensible per-profile quant. fixed_quant = model.get("quantization") if is_prequantized(model) else None is_moe = bool(model.get("is_moe")) def _pick_quant(prefer, require_full_fit): """Choose a quant for a profile. - fixed_quant (AWQ/GPTQ/FP8 served via GGUF): always that. - require_full_fit=True (Speed): walk DOWN from `prefer` to the best quant whose weights fit fully on the GPU (no offload) — fastest. - require_full_fit=False (Quality on MoE): keep `prefer` even if it must offload experts to CPU; that's the whole point of n-cpu-moe on a card too small to hold the weights. For dense models we can't offload per-expert, so fall back to the largest fully-fitting quant. """ if fixed_quant: return fixed_quant start = _QUANT_LADDER.index(prefer) if prefer in _QUANT_LADDER else 3 if require_full_fit or not is_moe: for q in _QUANT_LADDER[start:]: if _weights_gb(model, q) + 0.6 <= budget: return q return _QUANT_LADDER[-1] # MoE quality: keep the preferred (big) quant; offload handles overflow. return prefer if serve_mode: # Fixed file on disk — quant can't change. Vary only the serving knobs. fq = serve_quant or model.get("quantization") or "GGUF" specs = [ # key, label, prefer_quant, full_fit, kv_type, ctx, note ("quality", "Quality", fq, False, "q8_0", 131072, "Sharp q8 KV cache + full context. Best long-context accuracy; offloads MoE layers to CPU if needed."), ("balanced", "Balanced", fq, False, "q4_0", 131072, "Compact q4 KV at full context — good speed/quality mix."), ("speed", "Speed", fq, False, "q4_0", 32768, "Trimmed context + light KV for the fastest tokens/s."), ] else: specs = [ # key, label, prefer_quant, full_fit, kv_type, ctx, note ("quality", "Quality", "Q6_K", False, "q8_0", 131072, "Biggest quant + sharp q8 KV cache. Best answers; offloads MoE layers to CPU if needed."), ("balanced", "Balanced", "Q4_K_M", False, "q4_0", 131072, "Q4 weights + compact q4 KV. Good speed/quality mix at full context."), ("speed", "Speed", "Q4_K_M", True, "q4_0", 32768, "Smallest offload + trimmed context for the fastest tokens/s."), ] profiles = [] for key, label, prefer_q, full_fit, kv_type, ctx, note in specs: # In serve mode the quant is fixed (the file's); in download mode we pick. quant = prefer_q if serve_mode else _pick_quant(prefer_q, full_fit) # Shrink context if even the chosen KV won't fit alongside weights. # Start from the smaller of the profile's target and the model's limit. cur_ctx = min(ctx, model_ctx_max) while cur_ctx >= 8192: kv = _kv_gb(model, cur_ctx, kv_type) n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb) est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6 # If a non-MoE model can't fit even fully offloaded, try less context. if model.get("is_moe") or fits or cur_ctx <= 8192: profiles.append({ "key": key, "label": label, "quant": quant, "n_gpu_layers": 999, "n_cpu_moe": n_cpu_moe, "cache_type": kv_type, "ctx": cur_ctx, # When experts offload, GPU-resident VRAM tops out at the # budget (weights beyond it live in system RAM), so cap the # estimate at `budget`, not the full card — this also leaves # the vision-encoder headroom visible in the number. "est_vram_gb": round(min(est, budget), 1), # For MoE we treat it as fitting via offload; report whether # it fit WITHOUT offload as the "clean" flag. "fits": fits or bool(model.get("is_moe")), "offloads": n_cpu_moe > 0, "note": note, }) break cur_ctx //= 2 # De-dupe identical profiles (e.g. tiny model where all three collapse to the # same all-GPU config) — keep the first/highest-quality label. seen = set() deduped = [] for p in profiles: sig = (p["quant"], p["n_cpu_moe"], p["cache_type"], p["ctx"]) if sig in seen: continue seen.add(sig) deduped.append(p) return deduped