Fix native Cookbook quant classification

2026-06-02 14:07:20 +10:00
parent 65b5d65059
commit cd4f496cb4
6 changed files with 201 additions and 44 deletions
--- a/scripts/add_hwfit_models.py
+++ b/scripts/add_hwfit_models.py
@@ -43,7 +43,8 @@ _GENERIC_TAGS = {
    "transformers", "safetensors", "conversational", "text-generation",
    "image-text-to-text", "text-generation-inference", "endpoints_compatible",
    "autotrain_compatible", "compressed-tensors", "gguf", "mlx", "vllm", "4-bit",
-    "8-bit", "awq", "gptq", "fp8", "quantized", "chat",
+    "8-bit", "awq", "gptq", "fp8", "fp4", "nvfp4", "mxfp4", "nf4",
+    "quantized", "chat",
 }

 api = HfApi()
@@ -79,6 +80,20 @@ def _base_model_tag(tags):

 def _quant_from_name(name):
    n = name.lower()
+    if "nvfp4" in n:
+        return "NVFP4"
+    if "mxfp4" in n:
+        return "MXFP4"
+    if re.search(r"(^|[-_/])nf4($|[-_/])", n):
+        return "NF4"
+    if re.search(r"(^|[-_/])fp4($|[-_/])", n):
+        return "FP4"
+    if re.search(r"(^|[-_/])w4a16($|[-_/])", n):
+        return "W4A16"
+    if re.search(r"(^|[-_/])w8a8($|[-_/])", n):
+        return "W8A8"
+    if re.search(r"(^|[-_/])w8a16($|[-_/])", n):
+        return "W8A16"
    is8 = "8bit" in n or "8-bit" in n or "int8" in n
    if "awq" in n:
        return "AWQ-8bit" if is8 else "AWQ-4bit"
@@ -93,7 +108,9 @@ def _quant_from_name(name):
    if "fp8" in n:
        return "FP8"
    if "int4" in n or "4bit" in n or "4-bit" in n:
-        return "AWQ-4bit"
+        return "INT4"
+    if "int8" in n or "8bit" in n or "8-bit" in n:
+        return "INT8"
    return "Q4_K_M"


@@ -160,7 +177,10 @@ def _entry_from_modelinfo(mi, overrides):
    rel = created.strftime("%Y-%m-%d") if created else datetime.utcnow().strftime("%Y-%m-%d")
    # Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant).
    _BPP = {"AWQ-4bit": 0.58, "GPTQ-Int4": 0.58, "mlx-4bit": 0.55, "mlx-6bit": 0.85,
-            "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "NVFP4": 0.6, "Q4_K_M": 0.6}
+            "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1,
+            "FP4": 0.58, "NVFP4": 0.58, "MXFP4": 0.58, "NF4": 0.58,
+            "INT4": 0.58, "INT8": 1.1, "W4A16": 0.58, "W8A8": 1.1, "W8A16": 1.1,
+            "Q4_K_M": 0.6}
    bpp = _BPP.get(quant, 0.6)
    vram = round(pb * bpp + 0.5, 1)
    entry = {
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -219,9 +219,9 @@ def _quant_bits(q):
    Returns 0 when unknown (caller treats unknown as "don't filter")."""
    qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "")
    # GGUF k-quants + float formats
-    if qu.startswith("Q8") or "FP8" in qu:
+    if qu.startswith("Q8") or "FP8" in qu or "INT8" in qu or qu.startswith("W8"):
        return 8
-    if qu.startswith("Q4") or qu.startswith("IQ4"):
+    if qu.startswith("Q4") or qu.startswith("IQ4") or "FP4" in qu or "NF4" in qu or "INT4" in qu or qu.startswith("W4"):
        return 4
    if qu.startswith("Q2") or qu.startswith("IQ2"):
        return 2
@@ -233,7 +233,7 @@ def _quant_bits(q):
        return 6
    if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"):
        return 16
-    # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 …)
+    # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...)
    m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu)
    if m:
        b = int(m.group(1))
@@ -282,15 +282,21 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
    else:
        effective_vram = gpu_vram

+    native_gpu_only = preq and not native_quant.startswith("mlx-")
+
    # Determine which quant to evaluate at
+    native_quant_prefixes = (
+        "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+        "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+    )
+
    if preq:
-        # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
-        # GGUF quant tier (Q4/Q8/etc.), do not treat a same-bit AWQ/GPTQ build
-        # as equivalent. "Q4" means llama.cpp/Ollama-style GGUF in this UI;
-        # AWQ/GPTQ/FP8 are separate GPU-serving formats and must only appear
-        # when explicitly selected or when no quant filter is applied.
+        # Native HF/vLLM quantized repos come at a fixed format. If the user
+        # picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit
+        # AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate
+        # serving paths and only appear when explicitly selected or unfiltered.
        if target_quant:
-            if not any(target_quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")):
+            if not any(target_quant.startswith(p) for p in native_quant_prefixes):
                return None
            _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
            if _tb and _nb and _tb != _nb:
@@ -303,16 +309,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
        # Default: Q4_K_M (user's stated preference)
        quant_to_try = "Q4_K_M"

-    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram)
-
-    # If target quant doesn't fit and it's not pre-quantized, try lower quants
-    if result is None and not preq and target_quant:
-        from services.hwfit.models import QUANT_HIERARCHY
-        idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1
-        for q in QUANT_HIERARCHY[idx + 1:]:
-            result = _try_quant_at(model, q, ctx, effective_vram, eff_ram)
-            if result:
-                break
+    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram)

    if result is None:
        # Model doesn't fit on the user's current hardware. Surface it
@@ -447,8 +444,11 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
            results.sort(key=sort_fn, reverse=(sort != "vram"))
            return results[:limit]

-    # If user picked a prequantized format (AWQ/FP8/GPTQ/NVFP4), filter to only those models
-    filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4"))
+    # If user picked a native prequantized format, filter to only those models.
+    filter_native = quant and any(quant.startswith(p) for p in (
+        "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+        "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+    ))

    system_backend = (system.get("backend") or "").lower()
    apple_silicon = system_backend in ("mps", "metal", "apple")
@@ -459,9 +459,9 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
        if "nvfp4" in (m.get("name") or "").lower():
            native_q = "NVFP4"

-        # MLX is Apple Silicon only. Hide MLX rows on non-Mac hardware scans,
-        # but leave them visible on Metal/MPS so Mac support is not broken.
-        if not apple_silicon and (native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()):
+        # MLX needs the mlx_lm runtime, which Odysseus does not generate serve
+        # commands for. Hide it on every backend, including Metal.
+        if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
            continue

        # ROCm support for vLLM/SGLang quantized safetensors is too brittle to
@@ -479,20 +479,23 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
        # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
        # this the Cookbook recommends models the Mac can't run; on CUDA these
        # stay visible because vLLM serves safetensors directly.
-        is_mlx = native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()
-        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources") or is_mlx):
+        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
            continue

-        # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
+        # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
        if filter_native:
            if quant == "FP8" and native_q != "FP8":
                continue
+            if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"):
+                continue
            if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
                continue
            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
                continue
            if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
                continue
+            if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant:
+                continue

        if search:
            name = m.get("name", "").lower()
--- a/services/hwfit/models.py
+++ b/services/hwfit/models.py
@@ -5,7 +5,9 @@ import re
 QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]

 QUANT_BPP = {
-    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
+    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "FP4": 0.50, "NVFP4": 0.50, "MXFP4": 0.50, "NF4": 0.50,
+    "INT4": 0.50, "INT8": 1.0, "W4A16": 0.50, "W8A8": 1.0, "W8A16": 1.0,
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
@@ -14,7 +16,9 @@ QUANT_BPP = {
 }

 QUANT_SPEED_MULT = {
-    "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1,
+    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
+    "FP4": 1.15, "NVFP4": 1.15, "MXFP4": 1.15, "NF4": 1.10,
+    "INT4": 1.15, "INT8": 0.85, "W4A16": 1.15, "W8A8": 0.85, "W8A16": 0.85,
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
@@ -23,7 +27,9 @@ QUANT_SPEED_MULT = {
 }

 QUANT_QUALITY_PENALTY = {
-    "F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0,
+    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
+    "FP4": -3.0, "NVFP4": -3.0, "MXFP4": -3.0, "NF4": -4.0,
+    "INT4": -4.0, "INT8": 0.0, "W4A16": -4.0, "W8A8": 0.0, "W8A16": 0.0,
    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
@@ -32,7 +38,9 @@ QUANT_QUALITY_PENALTY = {
 }

 QUANT_BYTES_PER_PARAM = {
-    "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
+    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "FP4": 0.5, "NVFP4": 0.5, "MXFP4": 0.5, "NF4": 0.5,
+    "INT4": 0.5, "INT8": 1.0, "W4A16": 0.5, "W8A8": 1.0, "W8A16": 1.0,
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
@@ -40,14 +48,60 @@ QUANT_BYTES_PER_PARAM = {
    "mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
 }

-# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
-PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "NVFP4")
+# Pre-quantized formats that should NOT go through the GGUF quant hierarchy.
+# These are native HF/vLLM-style repos, not llama.cpp GGUF quant tiers.
+PREQUANTIZED_PREFIXES = (
+    "AWQ-", "GPTQ-", "mlx-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+    "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+)
+
+
+def infer_quantization_from_name(name):  # Infer a native quant label from a model/repo name; returns "" when nothing matches.
+    n = (name or "").lower()  # None-safe; all matching below is case-insensitive
+    if "nvfp4" in n:  # plain substring match
+        return "NVFP4"
+    if "mxfp4" in n:  # plain substring match
+        return "MXFP4"
+    if re.search(r"(^|[-_/])nf4($|[-_/])", n):  # token match: bounded by start/end of string or one of - _ /
+        return "NF4"
+    if re.search(r"(^|[-_/])fp4($|[-_/])", n):  # standalone fp4 token only
+        return "FP4"
+    if re.search(r"(^|[-_/])w4a16($|[-_/])", n):  # same token-boundary rule for the WxAy labels
+        return "W4A16"
+    if re.search(r"(^|[-_/])w8a8($|[-_/])", n):
+        return "W8A8"
+    if re.search(r"(^|[-_/])w8a16($|[-_/])", n):
+        return "W8A16"
+    is8 = "8bit" in n or "8-bit" in n or "int8" in n  # 8-bit hint; picks the variant in the AWQ/GPTQ/MLX branches below
+    if "awq" in n:
+        return "AWQ-8bit" if is8 else "AWQ-4bit"  # 4-bit is assumed when no 8-bit hint is present
+    if "gptq" in n:
+        return "GPTQ-Int8" if is8 else "GPTQ-Int4"  # 4-bit is assumed when no 8-bit hint is present
+    if "mlx" in n:
+        if "6bit" in n:  # checked before the 8-bit/4-bit split
+            return "mlx-6bit"
+        return "mlx-8bit" if is8 else "mlx-4bit"
+    if "fp8" in n:
+        return "FP8"
+    if "int4" in n or "4bit" in n or "4-bit" in n:  # generic 4-bit marker with no AWQ/GPTQ/MLX/FP8 match above
+        return "INT4"
+    if "int8" in n or "8bit" in n or "8-bit" in n:  # generic 8-bit marker, reached only when no 4-bit marker matched
+        return "INT8"
+    return ""  # no recognizable quant marker in the name
+
+
+def _normalize_model_entry(model):  # Replace an entry's "quantization" with the label inferred from its name, when allowed.
+    if not isinstance(model, dict):  # non-dict entries are returned untouched
+        return model
+    inferred = infer_quantization_from_name(model.get("name", ""))  # "" when the name carries no quant marker
+    if inferred and (model.get("quantization") in (None, "", "Q4_K_M") or model.get("_discovered")):  # overwrite only a missing/empty/"Q4_K_M" value, or any value on entries with a truthy _discovered
+        model["quantization"] = inferred  # mutates the passed dict in place
+    return model


 def is_prequantized(model):
    q = model.get("quantization", "")
-    name = (model.get("name") or "").lower()
-    return "nvfp4" in name or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    return any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)  # decided from the quantization field alone; the name-based "nvfp4" check is removed


 def params_b(model):
@@ -168,7 +222,7 @@ def get_models():
        data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
        try:
            with open(data_path, encoding="utf-8") as f:
-                _models_cache = json.load(f)
+                _models_cache = [_normalize_model_entry(m) for m in json.load(f)]
        except (FileNotFoundError, json.JSONDecodeError):
            _models_cache = []
    return _models_cache
--- a/static/js/cookbook-hwfit.js
+++ b/static/js/cookbook-hwfit.js
@@ -827,7 +827,9 @@ export function _hwfitRenderList(el, models) {
    const pcount = m.parameter_count || '?';
    const ctx = m.context ? (m.context >= 1024 ? (m.context / 1024).toFixed(0) + 'k' : m.context) : '?';
    const fitLabel = (m.fit_level || '').replace('_', ' ');
-    const modeLabel = (m.run_mode || '').replace('_', '+');
+    const modeLabel = m.run_mode === 'cpu_offload'
+      ? 'cpu+offload'
+      : (m.run_mode || '').replace(/_/g, ' ');
    const vramLabel = m.required_gb ? m.required_gb.toFixed(1) + 'G' : '?';
    const moeBadge = m.is_moe ? '<span class="hwfit-badge hwfit-moe">MoE</span>' : '';
    const imgBadge = m.is_image_gen ? '<span class="hwfit-badge" style="background:color-mix(in srgb, var(--red) 20%, transparent);color:var(--red);font-size:8px;padding:1px 4px;border-radius:3px;margin-left:4px;">IMG</span>' : '';
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -262,10 +262,10 @@ export function _detectBackend(model) {
  const isRocm = sysBackend === 'rocm';
  const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend);
  const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
-  if (!isAppleSilicon && (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX'))) {
+  if (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX')) {
    return { backend: 'unsupported', label: 'Unsupported' };
  }
-  const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || q === 'FP8' || /\b(awq|gptq|fp8|nvfp4)\b/i.test(_nm);
+  const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || ['FP8', 'FP4', 'MXFP4', 'NF4', 'INT4', 'INT8', 'W4A16', 'W8A8', 'W8A16'].includes(q) || /\b(awq|gptq|fp8|fp4|nvfp4|mxfp4|nf4|int4|int8|w4a16|w8a8|w8a16)\b/i.test(_nm);
  const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf');

  // Image gen models → diffusers
@@ -291,7 +291,7 @@ export function _detectBackend(model) {
  }

  // Apple Silicon (Metal) → llama.cpp (GGUF). vLLM/SGLang are CUDA/ROCm-only and
-  // don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out
+  // don't run on macOS; vLLM-native quantized models are already filtered out
  // of metal Cookbook results, so llama.cpp is always the right engine here.
  if (['metal', 'mps', 'apple'].includes(sysBackend)) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
@@ -1516,7 +1516,7 @@ function _renderRecipes() {
  html += '<option value="Q4_K_M">Q4</option><option value="Q8_0">Q8</option>';
  html += '<option value="Q6_K">Q6</option><option value="Q5_K_M">Q5</option>';
  html += '<option value="Q3_K_M">Q3</option><option value="Q2_K">Q2</option>';
-  html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option>';
+  html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option><option value="FP4">FP4</option>';
  html += '<option value="">Native</option></select>';
  // Engine filter: show only models whose serve engine matches. "llama.cpp"
  // (GGUF) runs everywhere incl. consumer AMD/Apple; vLLM/SGLang are CUDA /
--- a/tests/test_hwfit_quant_formats.py
+++ b/tests/test_hwfit_quant_formats.py
@@ -0,0 +1,78 @@
+from services.hwfit.fit import analyze_model, rank_models
+from services.hwfit.models import (
+    get_models,
+    infer_quantization_from_name,
+    is_prequantized,
+)
+
+
+def _dual_5060ti_system():  # Fixture: system dict for a CUDA host with gpu_count 2, shared by the tests below.
+    return {
+        "has_gpu": True,
+        "backend": "cuda",
+        "gpu_name": "NVIDIA GeForce RTX 5060 Ti",
+        "gpu_vram_gb": 31.0,
+        "gpu_count": 2,
+        "available_ram_gb": 128.0,  # one test lowers both RAM fields to 80.0 on its own copy
+        "total_ram_gb": 128.0,
+    }
+
+
+def test_infers_native_hf_quant_formats_from_repo_names():  # Name-based inference returns the expected label for each native format.
+    cases = {  # repo name -> expected quant label
+        "txn545/Qwen3.5-122B-A10B-NVFP4": "NVFP4",
+        "some/model-MXFP4": "MXFP4",
+        "some/model-NF4": "NF4",
+        "some/model-FP4": "FP4",
+        "some/model-W4A16": "W4A16",
+        "some/model-W8A8": "W8A8",
+        "some/model-W8A16": "W8A16",
+        "some/model-INT4": "INT4",
+        "some/model-8bit": "INT8",  # a bare "8bit" marker maps to INT8
+    }
+    assert {name: infer_quantization_from_name(name) for name in cases} == cases  # one dict comparison, so a failure shows every mismatch
+
+
+def test_nvfp4_catalog_quant_is_preserved():  # The NVFP4 catalog entry keeps quantization "NVFP4" after loading via get_models().
+    catalog = {m["name"]: m for m in get_models()}  # index the loaded catalog by model name
+    model = catalog["txn545/Qwen3.5-122B-A10B-NVFP4"]  # KeyError here means the entry is missing from the catalog
+
+    assert model["quantization"] == "NVFP4"
+    assert is_prequantized(model)
+
+
+def test_nvfp4_search_result_is_not_gguf_or_cpu_offload():  # NVFP4 model is reported as NVFP4 and never as cpu_offload, both directly and via search.
+    catalog = {m["name"]: m for m in get_models()}  # index the loaded catalog by model name
+    model = catalog["txn545/Qwen3.5-122B-A10B-NVFP4"]
+
+    fit = analyze_model(model, _dual_5060ti_system())  # no target_quant passed
+    assert fit["quant"] == "NVFP4"
+    assert fit["run_mode"] != "cpu_offload"
+
+    results = rank_models(  # same check through the ranking/search path
+        _dual_5060ti_system(),
+        search="Qwen3.5-122B-A10B-NVFP4",
+        limit=10,
+    )
+    hit = next(r for r in results if r["name"] == "txn545/Qwen3.5-122B-A10B-NVFP4")  # StopIteration if the model is absent from the results
+    assert hit["quant"] == "NVFP4"
+    assert hit["run_mode"] != "cpu_offload"
+
+
+def test_selected_gguf_quant_is_strict_not_lower_quant_fallback():  # A requested Q8_0 that does not fit is reported as no_fit at Q8_0.
+    model = {  # hand-built entry, independent of the catalog file
+        "name": "local/Huge-GGUF",
+        "provider": "local",
+        "parameter_count": "100B",
+        "parameters_raw": 100_000_000_000,
+        "quantization": "Q4_K_M",
+        "context_length": 4096,
+    }
+
+    system = _dual_5060ti_system()
+    system["available_ram_gb"] = 80.0  # lower RAM on this copy only; the fixture returns a fresh dict per call
+    system["total_ram_gb"] = 80.0
+    fit = analyze_model(model, system, target_quant="Q8_0")
+
+    assert fit["quant"] == "Q8_0"  # result stays at the requested quant
+    assert fit["run_mode"] == "no_fit"