import json
import os
import re

# GGUF quant tiers, ordered from highest quality (largest) to lowest.
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]

# Effective bytes per parameter used for memory estimation.
QUANT_BPP = {
    "F32": 4.0,
    "F16": 2.0,
    "BF16": 2.0,
    "FP8": 1.0,
    "FP4": 0.50,
    "NVFP4": 0.50,
    "MXFP4": 0.50,
    "NF4": 0.50,
    "INT4": 0.50,
    "INT8": 1.0,
    "W4A16": 0.50,
    "W8A8": 1.0,
    "W8A16": 1.0,
    "Q8_0": 1.05,
    "Q6_K": 0.80,
    "Q5_K_M": 0.68,
    "Q4_K_M": 0.58,
    "Q4_0": 0.58,
    "Q3_K_M": 0.48,
    "Q2_K": 0.37,
    "AWQ-4bit": 0.50,
    "AWQ-8bit": 1.0,
    "GPTQ-Int4": 0.50,
    "GPTQ-Int8": 1.0,
    "mlx-4bit": 0.55,
    "mlx-8bit": 1.0,
    "mlx-6bit": 0.75,
    # DeepSeek-V4-style mixed: MoE experts in FP4 (bulk), attention + non-
    # expert dense in FP8, embeddings/LM head in BF16. By weight count the
    # experts dominate so the effective BPP sits closer to FP4 than FP8.
    # Empirical: DeepSeek-V4-Flash 284B / 156 GB ≈ 0.55 B/param.
    "FP4-MoE-Mixed": 0.55,
    # FP8-Mixed = the *-Base variants (MoE experts also FP8, not FP4).
    "FP8-Mixed": 1.0,
}

# Relative speed multiplier per quantization.
QUANT_SPEED_MULT = {
    "F16": 0.6,
    "BF16": 0.6,
    "FP8": 0.85,
    "FP4": 1.15,
    "NVFP4": 1.15,
    "MXFP4": 1.15,
    "NF4": 1.10,
    "INT4": 1.15,
    "INT8": 0.85,
    "W4A16": 1.15,
    "W8A8": 0.85,
    "W8A16": 0.85,
    "Q8_0": 0.8,
    "Q6_K": 0.95,
    "Q5_K_M": 1.0,
    "Q4_K_M": 1.15,
    "Q4_0": 1.15,
    "Q3_K_M": 1.25,
    "Q2_K": 1.35,
    "AWQ-4bit": 1.2,
    "AWQ-8bit": 0.85,
    "GPTQ-Int4": 1.2,
    "GPTQ-Int8": 0.85,
    "mlx-4bit": 1.15,
    "mlx-8bit": 0.85,
    "mlx-6bit": 1.0,
    # Slightly slower than pure FP4 because of mixed-dtype dispatch.
    "FP4-MoE-Mixed": 1.10,
    "FP8-Mixed": 0.85,
}

# Quality penalty per quantization (0.0 = no loss, more negative = worse).
QUANT_QUALITY_PENALTY = {
    "F16": 0.0,
    "BF16": 0.0,
    "FP8": 0.0,
    "FP4": -3.0,
    "NVFP4": -3.0,
    "MXFP4": -3.0,
    "NF4": -4.0,
    "INT4": -4.0,
    "INT8": 0.0,
    "W4A16": -4.0,
    "W8A8": 0.0,
    "W8A16": 0.0,
    "Q8_0": 0.0,
    "Q6_K": -1.0,
    "Q5_K_M": -2.0,
    "Q4_K_M": -5.0,
    "Q4_0": -5.0,
    "Q3_K_M": -8.0,
    "Q2_K": -12.0,
    # Bare "AWQ" and "AWQ-8bit" used to be 0.0 (tied with FP8). In practice
    # AWQ-anything is a calibrated reconstruction, not raw 8-bit weights —
    # there's a small but real quality loss vs FP8. Give them a slight
    # penalty so FP8 wins when both fit. AWQ-4bit stays heavier.
    "AWQ": -1.0,
    "AWQ-4bit": -4.0,
    "AWQ-8bit": -1.0,
    "GPTQ": -1.0,
    "GPTQ-Int4": -4.0,
    "GPTQ-Int8": -1.0,
    "mlx-4bit": -4.0,
    "mlx-8bit": -0.5,
    "mlx-6bit": -1.5,
    # DeepSeek-V4 mixed: only MoE experts at FP4 (the rest is FP8/BF16),
    # so the realized quality is much closer to FP8 than to pure FP4 —
    # the activation-sensitive layers stay high-precision. ~0 penalty.
    "FP4-MoE-Mixed": -0.5,
    "FP8-Mixed": 0.0,
}

# Nominal bytes per parameter (no per-format overhead), kept separately
# from the empirical QUANT_BPP values above.
QUANT_BYTES_PER_PARAM = {
    "F16": 2.0,
    "BF16": 2.0,
    "FP8": 1.0,
    "FP4": 0.5,
    "NVFP4": 0.5,
    "MXFP4": 0.5,
    "NF4": 0.5,
    "INT4": 0.5,
    "INT8": 1.0,
    "W4A16": 0.5,
    "W8A8": 1.0,
    "W8A16": 1.0,
    "Q8_0": 1.0,
    "Q6_K": 0.75,
    "Q5_K_M": 0.625,
    "Q4_K_M": 0.5,
    "Q4_0": 0.5,
    "Q3_K_M": 0.375,
    "Q2_K": 0.25,
    "AWQ-4bit": 0.5,
    "AWQ-8bit": 1.0,
    "GPTQ-Int4": 0.5,
    "GPTQ-Int8": 1.0,
    "mlx-4bit": 0.5,
    "mlx-8bit": 1.0,
    "mlx-6bit": 0.75,
    "FP4-MoE-Mixed": 0.55,
    "FP8-Mixed": 1.0,
}

# Pre-quantized formats that should NOT go through the GGUF quant hierarchy.
# These are native HF/vLLM-style repos, not llama.cpp GGUF quant tiers.
PREQUANTIZED_PREFIXES = (
    "AWQ-",
    "GPTQ-",
    "mlx-",
    "FP8",
    "FP4",
    "NVFP4",
    "MXFP4",
    "NF4",
    "INT4",
    "INT8",
    "W4A16",
    "W8A8",
    "W8A16",
    "FP4-MoE-Mixed",
    "FP8-Mixed",
)

# Quant labels that only count when they appear as a whole token delimited
# by start/end of string or one of "-", "_", "/". Order matters: the first
# match wins.
_DELIMITED_QUANT_TOKENS = tuple(
    (label, re.compile(rf"(^|[-_/]){label.lower()}($|[-_/])"))
    for label in ("NF4", "FP4", "W4A16", "W8A8", "W8A16")
)

# Accepted shape of a "parameter_count" string after strip()/upper().
_PARAM_COUNT_RE = re.compile(r"^([\d.]+)\s*([BKMGT]?)$")

# Ordered (use_case, keywords) table; the first entry with a keyword present
# in the model's name/use_case text wins.
_USE_CASE_KEYWORDS = (
    ("embedding", ("embedding", "embed", "bge")),
    ("tts", ("tts", "text-to-speech", "speech-synthesis", "cosyvoice", "parler")),
    ("stt", ("stt", "speech-to-text", "whisper", "transcri", "asr")),
    ("coding", ("code",)),
    ("multimodal", ("vision", "multimodal", "vlm", "vl-")),
    ("reasoning", ("reason", "chain-of-thought", "deepseek-r1")),
    ("chat", ("chat", "instruction")),
)


def infer_quantization_from_name(name):
    """Guess a quantization label from a model/repo name.

    The name is lower-cased and checked against a fixed, ordered list of
    markers; the first match wins.

    Args:
        name: Model or repository name. ``None`` is treated as "".

    Returns:
        A quantization label such as ``"NVFP4"``, ``"AWQ-4bit"`` or
        ``"INT8"``, or ``""`` when no marker is found.
    """
    n = (name or "").lower()
    if "nvfp4" in n:
        return "NVFP4"
    if "mxfp4" in n:
        return "MXFP4"
    for label, pattern in _DELIMITED_QUANT_TOKENS:
        if pattern.search(n):
            return label

    is8 = "8bit" in n or "8-bit" in n or "int8" in n
    if "awq" in n:
        return "AWQ-8bit" if is8 else "AWQ-4bit"
    if "gptq" in n:
        return "GPTQ-Int8" if is8 else "GPTQ-Int4"
    if "mlx" in n:
        if "6bit" in n:
            return "mlx-6bit"
        return "mlx-8bit" if is8 else "mlx-4bit"
    if "fp8" in n:
        return "FP8"
    if "int4" in n or "4bit" in n or "4-bit" in n:
        return "INT4"
    if is8:
        return "INT8"
    return ""


def _normalize_model_entry(model):
    """Fill in a catalog entry's quantization from its name, in place.

    The inferred value replaces the stored one only when the stored value
    is missing/empty/``"Q4_K_M"`` or the entry is flagged ``_discovered``.
    Non-dict entries are returned untouched.

    Returns:
        The same ``model`` object that was passed in.
    """
    if not isinstance(model, dict):
        return model
    inferred = infer_quantization_from_name(model.get("name", ""))
    if not inferred:
        return model
    replaceable = model.get("quantization") in (None, "", "Q4_K_M")
    if replaceable or model.get("_discovered"):
        model["quantization"] = inferred
    return model


def is_prequantized(model):
    """Return True when the model ships in a fixed, pre-quantized format.

    Detection looks at the lower-cased ``name`` and ``format`` fields for
    NVFP4/FP8/8bit/AWQ/GPTQ/MLX markers (the 8bit marker is ignored for
    entries flagged ``is_gguf`` or carrying ``gguf_sources``), and at the
    ``quantization`` field for any of ``PREQUANTIZED_PREFIXES``.

    Args:
        model: Catalog entry dict.

    Returns:
        bool.
    """
    # "quantization" may be present but None; a .get() default would not
    # cover that case and startswith() would raise AttributeError.
    q = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    fmt = (model.get("format") or "").lower()
    text = f"{name} {fmt}"

    if "nvfp4" in text:
        return True
    if re.search(r"(^|[-_/])fp8($|[-_/\s])", text) is not None:
        return True
    has_gguf = model.get("is_gguf") or model.get("gguf_sources")
    if not has_gguf and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text) is not None:
        return True
    if any(marker in text for marker in ("awq", "gptq", "mlx")):
        return True
    return q.startswith(PREQUANTIZED_PREFIXES)


def params_b(model):
    """Return the model's total parameter count in billions.

    ``parameters_raw`` (an absolute count) is preferred when it is a
    positive number. Otherwise ``parameter_count`` is parsed as a number
    with an optional unit suffix:

    * ``B`` or ``G`` — billions
    * ``M`` — millions
    * ``K`` — thousands
    * ``T`` — trillions
    * no suffix — a raw parameter count when >= 1,000,000, otherwise a
      millions figure (``"355"`` means 355M)

    Returns:
        Size in billions as a float, or ``0.0`` when the size is unknown
        or malformed.
    """
    raw = model.get("parameters_raw")
    if isinstance(raw, (int, float)) and raw > 0:
        return raw / 1_000_000_000.0

    pc = model.get("parameter_count", "")
    if not pc:
        return 0.0
    # str() so a numeric parameter_count does not crash on .strip().
    m = _PARAM_COUNT_RE.match(str(pc).strip().upper())
    if not m:
        return 0.0
    try:
        val = float(m.group(1))
    except ValueError:
        # Malformed count like "1.5.3B" — [\d.]+ matches but float()
        # rejects it. One bad catalog row must not abort the whole
        # ranking pass, so treat it as unknown size.
        return 0.0

    suffix = m.group(2)
    if suffix in ("B", "G"):
        # "G" (giga) is accepted by the pattern and means the same as "B";
        # without this it would fall through and be read as millions.
        return val
    if suffix == "M":
        return val / 1000.0
    if suffix == "K":
        return val / 1_000_000.0
    if suffix == "T":
        return val * 1000.0

    # No unit. A bare number is conventionally a millions count
    # (e.g. "355" = 355M), NOT billions — otherwise a 355M model would
    # sort as 355B and leap above every 7B/70B model. Very large bare
    # values are raw parameter counts.
    if val >= 1_000_000:
        return val / 1_000_000_000.0
    return val / 1000.0


def estimate_memory_gb(model, quant, ctx):
    """Estimate the VRAM (GB) needed to serve a model.

    The estimate is ``total_params_b * bytes_per_param`` for the weights,
    plus a context-dependent term ``0.000008 * active_params_b * ctx``,
    plus a fixed 0.5 GB overhead. All weights count toward memory even for
    MoE models; the context term uses the active parameter count.

    Args:
        model: Catalog entry dict.
        quant: Quantization label; unknown labels use 0.58 bytes/param.
        ctx: Context length in tokens.

    Returns:
        Estimated memory in GB as a float.
    """
    weights_gb = params_b(model) * QUANT_BPP.get(quant, 0.58)
    kv_gb = 0.000008 * _active_params_b(model) * ctx
    return weights_gb + kv_gb + 0.5


def _active_params_b(model):
    """Return active parameters per token, in billions.

    For entries flagged ``is_moe`` with a truthy ``active_parameters``
    (an absolute count) that value is used; otherwise this is the same as
    ``params_b(model)``.
    """
    if model.get("is_moe") and model.get("active_parameters"):
        return model["active_parameters"] / 1_000_000_000.0
    return params_b(model)


def _context_candidates(ctx):
    """Yield ``ctx`` and then successive halvings that are still >= 1024."""
    yield ctx
    cur = ctx // 2
    while cur >= 1024:
        yield cur
        cur //= 2


def best_quant_for_budget(model, budget_gb, ctx):
    """Find the best quant (and context) that fits in ``budget_gb`` of VRAM.

    Pre-quantized models (AWQ/GPTQ/MLX/...) are only tried at their native
    quantization. Other models walk ``QUANT_HIERARCHY`` from best quality
    down. If nothing fits at the requested context, the context is halved
    repeatedly (never below 1024) and the search is repeated.

    Args:
        model: Catalog entry dict.
        budget_gb: Available memory in GB.
        ctx: Requested context length in tokens.

    Returns:
        ``(quant, ctx, mem_gb)`` for the first fit, or
        ``(None, None, None)`` when nothing fits.
    """
    if is_prequantized(model):
        # A None/empty stored quantization must not be returned as the
        # quant: callers could not tell it apart from "does not fit".
        quants = [model.get("quantization") or "Q4_K_M"]
    else:
        quants = QUANT_HIERARCHY

    for cur_ctx in _context_candidates(ctx):
        for q in quants:
            mem = estimate_memory_gb(model, q, cur_ctx)
            if mem <= budget_gb:
                return q, cur_ctx, mem
    return None, None, None


def infer_use_case(model):
    """Classify a model into a coarse use case from its name and use_case.

    The lower-cased ``name`` and ``use_case`` fields are searched for
    keyword substrings in a fixed priority order (embedding, tts, stt,
    coding, multimodal, reasoning, chat).

    Returns:
        The first matching use-case label, or ``"general"``.
    """
    # "or ''" so an explicit None in either field does not crash .lower().
    name = (model.get("name") or "").lower()
    uc = (model.get("use_case") or "").lower()
    combined = name + " " + uc
    for label, keywords in _USE_CASE_KEYWORDS:
        if any(k in combined for k in keywords):
            return label
    return "general"


# Lazily-populated catalog; None means "not loaded yet".
_models_cache = None


def get_models():
    """Return the model catalog, loading and caching it on first use.

    The catalog is read from the JSON file at ``model_catalog_path()`` and
    each entry is passed through ``_normalize_model_entry``. A missing or
    unparseable file yields an empty list, which is cached as well.

    Returns:
        The cached list of catalog entries.
    """
    global _models_cache
    if _models_cache is None:
        try:
            with open(model_catalog_path(), encoding="utf-8") as f:
                _models_cache = [_normalize_model_entry(m) for m in json.load(f)]
        except (FileNotFoundError, json.JSONDecodeError):
            _models_cache = []
    return _models_cache


def model_catalog_path():
    """Return the path of ``data/hf_models.json`` next to this module."""
    return os.path.join(os.path.dirname(__file__), "data", "hf_models.json")