From 9955f5bc952fcd7968a91d50c13e8b12fa74f599 Mon Sep 17 00:00:00 2001 From: Sirsyorrz Date: Mon, 1 Jun 2026 19:32:58 +1000 Subject: [PATCH] Fix VRAM estimates for pre-quantized HF repos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Cookbook fit scanner was reporting impossibly low VRAM requirements for some pre-quantized models — e.g. cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit was shown as 7.1 GB ('perfect' on a 12 GB card) when the real load is ~40 GB. The root cause is in the catalog builder. When _entry_from_modelinfo fell back to safetensors metadata for the parameter count, it stored safetensors.total directly. For pre-quantized repos, that figure reflects *packed* element counts: AWQ/GPTQ-Int4 pack 8x 4-bit weights into one I32, AWQ-8bit/GPTQ-Int8/FP8 pack 4x. The catalog therefore recorded ~1/8 of the real parameter count, and computing min_vram_gb as packed * bpp applied the quantization reduction a second time. Fix the safetensors fallback: * prefer the per-dtype parameters dict when available and unpack only the I32/I64 entries (the F16/BF16 scale/zero tensors and embeddings are already at their real element counts) * fall back to total * pack_factor when only total is exposed Patch the catalog entries that were affected by the old fallback so the fit ratings reflect reality without waiting for a full catalog rebuild: * cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit 11.4B -> 79.7B (40.8 GB VRAM) * stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ 4.6B -> 30.5B * stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ 5.1B -> 30.5B * warshanks/Qwen3-8B-abliterated-AWQ 2.2B -> 8.2B * QuantTrio/sarvam-30b-AWQ 7B -> 30B * QuantTrio/sarvam-105b-AWQ 19B -> 105B Closes #377.
--- scripts/add_hwfit_models.py | 30 ++++++++++-- services/hwfit/data/hf_models.json | 77 +++++++++++++++++------------- 2 files changed, 68 insertions(+), 39 deletions(-) diff --git a/scripts/add_hwfit_models.py b/scripts/add_hwfit_models.py index 2d7129c..fa48de9 100644 --- a/scripts/add_hwfit_models.py +++ b/scripts/add_hwfit_models.py @@ -120,20 +120,40 @@ def _entry_from_modelinfo(mi, overrides): total = bt if ba and active is None: active = ba - # Last resort: read safetensors param count (note: for quantized repos this - # is the *packed* count, so it's only an approximation). + # Determine quant first — we need it to unpack the safetensors fallback. + quant = _quant_from_name(name) + # Last resort: read safetensors element counts. For pre-quantized repos + # (AWQ/GPTQ/MLX-Int4 etc.) the weights are packed: 8× 4-bit weights per + # I32 element, 4× 8-bit weights per I32. The bare safetensors total + # therefore undercounts real parameter count by the same factor, which + # then feeds a wrong `min_vram_gb` downstream. Sum per-dtype and unpack + # the packed I32 tensors so the catalog stores the true param count. if total is None: try: full = api.model_info(name, files_metadata=False) st = getattr(full, "safetensors", None) - if st and getattr(st, "total", None): - total = int(st.total) + if st: + params_by_dtype = getattr(st, "parameters", None) or {} + if quant.endswith("4bit") or quant.endswith("Int4"): + pack_factor = 8 + elif quant.endswith("8bit") or quant.endswith("Int8") or quant == "FP8": + pack_factor = 4 + else: + pack_factor = 1 + if params_by_dtype: + # I32/I64 hold the packed quantized weights; everything + # else (F16/BF16 scales, zeros, embeddings) is already at + # its real element count. 
+ packed = sum(c for d, c in params_by_dtype.items() if d in ("I32", "I64")) + rest = sum(c for d, c in params_by_dtype.items() if d not in ("I32", "I64")) + total = packed * pack_factor + rest + elif getattr(st, "total", None): + total = int(st.total) * pack_factor except Exception: pass if total is None: return None # can't size it — skip pb = total / 1e9 - quant = _quant_from_name(name) created = getattr(mi, "created_at", None) rel = created.strftime("%Y-%m-%d") if created else datetime.utcnow().strftime("%Y-%m-%d") # Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant). diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json index d4766fb..19ce4ef 100644 --- a/services/hwfit/data/hf_models.json +++ b/services/hwfit/data/hf_models.json @@ -3350,11 +3350,11 @@ { "name": "warshanks/Qwen3-8B-abliterated-AWQ", "provider": "warshanks", - "parameter_count": "2.2B", - "parameters_raw": 2174236152, - "min_ram_gb": 1.2, - "recommended_ram_gb": 2.0, - "min_vram_gb": 1.1, + "parameter_count": "8.2B", + "parameters_raw": 8190735872, + "min_ram_gb": 3.2, + "recommended_ram_gb": 6.4, + "min_vram_gb": 5.3, "quantization": "AWQ-4bit", "context_length": 40960, "use_case": "General purpose text generation", @@ -4564,11 +4564,11 @@ { "name": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ", "provider": "stelterlab", - "parameter_count": "4.6B", - "parameters_raw": 4605856128, - "min_ram_gb": 2.6, - "recommended_ram_gb": 4.3, - "min_vram_gb": 2.4, + "parameter_count": "30.5B", + "parameters_raw": 30532122624, + "min_ram_gb": 10.9, + "recommended_ram_gb": 21.8, + "min_vram_gb": 18.2, "quantization": "AWQ-4bit", "context_length": 262144, "use_case": "Code generation and completion", @@ -4583,7 +4583,7 @@ "is_moe": true, "num_experts": 128, "active_experts": 8, - "active_parameters": 503765510, + "active_parameters": 3300000000, "_discovered": true, "format": "awq" }, @@ -4697,11 +4697,11 @@ { "name": 
"stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ", "provider": "stelterlab", - "parameter_count": "5.1B", - "parameters_raw": 5053827112, - "min_ram_gb": 2.8, - "recommended_ram_gb": 4.7, - "min_vram_gb": 2.6, + "parameter_count": "30.5B", + "parameters_raw": 30532122624, + "min_ram_gb": 10.9, + "recommended_ram_gb": 21.8, + "min_vram_gb": 18.2, "quantization": "AWQ-4bit", "context_length": 262144, "use_case": "General purpose text generation", @@ -4712,7 +4712,11 @@ "hf_likes": 4, "release_date": "2026-01-31", "_discovered": true, - "format": "awq" + "format": "awq", + "is_moe": true, + "num_experts": 128, + "active_experts": 8, + "active_parameters": 3300000000 }, { "name": "lmstudio-community/Qwen3-32B-MLX-4bit", @@ -12586,11 +12590,11 @@ { "name": "QuantTrio/sarvam-30b-AWQ", "provider": "QuantTrio", - "parameter_count": "7.0B", - "parameters_raw": 7000000000, - "min_ram_gb": 4.0, - "recommended_ram_gb": 5.2, - "min_vram_gb": 4.0, + "parameter_count": "30.0B", + "parameters_raw": 30000000000, + "min_ram_gb": 10.7, + "recommended_ram_gb": 21.5, + "min_vram_gb": 17.9, "quantization": "AWQ-4bit", "context_length": 131072, "use_case": "Chat, multilingual", @@ -12605,11 +12609,11 @@ { "name": "QuantTrio/sarvam-105b-AWQ", "provider": "QuantTrio", - "parameter_count": "19.0B", - "parameters_raw": 19000000000, - "min_ram_gb": 10.0, - "recommended_ram_gb": 13.0, - "min_vram_gb": 10.0, + "parameter_count": "105.0B", + "parameters_raw": 105000000000, + "min_ram_gb": 36.8, + "recommended_ram_gb": 73.7, + "min_vram_gb": 61.4, "quantization": "AWQ-4bit", "context_length": 131072, "use_case": "Chat, multilingual", @@ -17884,21 +17888,26 @@ { "name": "cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit", "provider": "cyankiwi", - "parameter_count": "11.4B", - "parameters_raw": 11412204288, - "min_ram_gb": 4.3, - "recommended_ram_gb": 8.5, - "min_vram_gb": 7.1, + "parameter_count": "79.7B", + "parameters_raw": 79674391296, + "min_ram_gb": 22.3, + "recommended_ram_gb": 44.6, + "min_vram_gb": 
40.8, "quantization": "AWQ-4bit", "context_length": 32768, - "use_case": "General purpose", + "use_case": "Coding", "capabilities": [], "pipeline_tag": "text-generation", "architecture": "qwen3_next", "hf_downloads": 695, "hf_likes": 10, "release_date": "2026-02-19", - "_discovered": true + "is_moe": true, + "num_experts": 512, + "active_experts": 10, + "active_parameters": null, + "_discovered": true, + "format": "awq" }, { "name": "cyankiwi/INTELLECT-3.1-AWQ-8bit",