Fix VRAM estimates for pre-quantized HF repos
The Cookbook fit scanner was reporting impossibly low VRAM requirements
for some pre-quantized models — e.g. cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit
shown as 7.1 GB ('perfect' on a 12 GB card) when the real load is ~40 GB.
Root cause is in the catalog builder. When _entry_from_modelinfo fell
back to safetensors metadata for the parameter count, it stored
safetensors.total directly. For pre-quantized repos that figure reflects
*packed* element counts: AWQ/GPTQ-Int4 pack 8x 4-bit weights into one
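I32, AWQ-8bit/GPTQ-Int8/FP8 pack 4x. The catalog therefore recorded only
~1/8 (4-bit) to ~1/4 (8-bit) of the real parameter count, and
min_vram_gb = packed * bpp then applied the quantization a second time:
the count was already shrunk by packing before being scaled by bpp.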
Fix the safetensors fallback:
* prefer the per-dtype parameters dict when available and unpack only the
I32/I64 entries (the F16/BF16 scale/zero tensors and embeddings are
already at their real element counts)
* fall back to total * pack_factor when only total is exposed
Patch the catalog entries that were affected by the old fallback so the
fit ratings reflect reality without waiting for a full catalog rebuild:
* cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit 11.4B -> 79.7B (40.8 GB VRAM)
* stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ 4.6B -> 30.5B
* stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ 5.1B -> 30.5B
* warshanks/Qwen3-8B-abliterated-AWQ 2.2B -> 8.2B
* QuantTrio/sarvam-30b-AWQ 7B -> 30B
* QuantTrio/sarvam-105b-AWQ 19B -> 105B
Closes #377.
This commit is contained in:
@@ -120,20 +120,40 @@ def _entry_from_modelinfo(mi, overrides):
|
||||
total = bt
|
||||
if ba and active is None:
|
||||
active = ba
|
||||
# Last resort: read safetensors param count (note: for quantized repos this
|
||||
# is the *packed* count, so it's only an approximation).
|
||||
# Determine quant first — we need it to unpack the safetensors fallback.
|
||||
quant = _quant_from_name(name)
|
||||
# Last resort: read safetensors element counts. For pre-quantized repos
|
||||
# (AWQ/GPTQ/MLX-Int4 etc.) the weights are packed: 8× 4-bit weights per
|
||||
# I32 element, 4× 8-bit weights per I32. The bare safetensors total
|
||||
# therefore undercounts real parameter count by the same factor, which
|
||||
# then feeds a wrong `min_vram_gb` downstream. Sum per-dtype and unpack
|
||||
# the packed I32 tensors so the catalog stores the true param count.
|
||||
if total is None:
|
||||
try:
|
||||
full = api.model_info(name, files_metadata=False)
|
||||
st = getattr(full, "safetensors", None)
|
||||
if st and getattr(st, "total", None):
|
||||
total = int(st.total)
|
||||
if st:
|
||||
params_by_dtype = getattr(st, "parameters", None) or {}
|
||||
if quant.endswith("4bit") or quant.endswith("Int4"):
|
||||
pack_factor = 8
|
||||
elif quant.endswith("8bit") or quant.endswith("Int8") or quant == "FP8":
|
||||
pack_factor = 4
|
||||
else:
|
||||
pack_factor = 1
|
||||
if params_by_dtype:
|
||||
# I32/I64 hold the packed quantized weights; everything
|
||||
# else (F16/BF16 scales, zeros, embeddings) is already at
|
||||
# its real element count.
|
||||
packed = sum(c for d, c in params_by_dtype.items() if d in ("I32", "I64"))
|
||||
rest = sum(c for d, c in params_by_dtype.items() if d not in ("I32", "I64"))
|
||||
total = packed * pack_factor + rest
|
||||
elif getattr(st, "total", None):
|
||||
total = int(st.total) * pack_factor
|
||||
except Exception:
|
||||
pass
|
||||
if total is None:
|
||||
return None # can't size it — skip
|
||||
pb = total / 1e9
|
||||
quant = _quant_from_name(name)
|
||||
created = getattr(mi, "created_at", None)
|
||||
rel = created.strftime("%Y-%m-%d") if created else datetime.utcnow().strftime("%Y-%m-%d")
|
||||
# Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant).
|
||||
|
||||
@@ -3350,11 +3350,11 @@
|
||||
{
|
||||
"name": "warshanks/Qwen3-8B-abliterated-AWQ",
|
||||
"provider": "warshanks",
|
||||
"parameter_count": "2.2B",
|
||||
"parameters_raw": 2174236152,
|
||||
"min_ram_gb": 1.2,
|
||||
"recommended_ram_gb": 2.0,
|
||||
"min_vram_gb": 1.1,
|
||||
"parameter_count": "8.2B",
|
||||
"parameters_raw": 8190735872,
|
||||
"min_ram_gb": 3.2,
|
||||
"recommended_ram_gb": 6.4,
|
||||
"min_vram_gb": 5.3,
|
||||
"quantization": "AWQ-4bit",
|
||||
"context_length": 40960,
|
||||
"use_case": "General purpose text generation",
|
||||
@@ -4564,11 +4564,11 @@
|
||||
{
|
||||
"name": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ",
|
||||
"provider": "stelterlab",
|
||||
"parameter_count": "4.6B",
|
||||
"parameters_raw": 4605856128,
|
||||
"min_ram_gb": 2.6,
|
||||
"recommended_ram_gb": 4.3,
|
||||
"min_vram_gb": 2.4,
|
||||
"parameter_count": "30.5B",
|
||||
"parameters_raw": 30532122624,
|
||||
"min_ram_gb": 10.9,
|
||||
"recommended_ram_gb": 21.8,
|
||||
"min_vram_gb": 18.2,
|
||||
"quantization": "AWQ-4bit",
|
||||
"context_length": 262144,
|
||||
"use_case": "Code generation and completion",
|
||||
@@ -4583,7 +4583,7 @@
|
||||
"is_moe": true,
|
||||
"num_experts": 128,
|
||||
"active_experts": 8,
|
||||
"active_parameters": 503765510,
|
||||
"active_parameters": 3300000000,
|
||||
"_discovered": true,
|
||||
"format": "awq"
|
||||
},
|
||||
@@ -4697,11 +4697,11 @@
|
||||
{
|
||||
"name": "stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ",
|
||||
"provider": "stelterlab",
|
||||
"parameter_count": "5.1B",
|
||||
"parameters_raw": 5053827112,
|
||||
"min_ram_gb": 2.8,
|
||||
"recommended_ram_gb": 4.7,
|
||||
"min_vram_gb": 2.6,
|
||||
"parameter_count": "30.5B",
|
||||
"parameters_raw": 30532122624,
|
||||
"min_ram_gb": 10.9,
|
||||
"recommended_ram_gb": 21.8,
|
||||
"min_vram_gb": 18.2,
|
||||
"quantization": "AWQ-4bit",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose text generation",
|
||||
@@ -4712,7 +4712,11 @@
|
||||
"hf_likes": 4,
|
||||
"release_date": "2026-01-31",
|
||||
"_discovered": true,
|
||||
"format": "awq"
|
||||
"format": "awq",
|
||||
"is_moe": true,
|
||||
"num_experts": 128,
|
||||
"active_experts": 8,
|
||||
"active_parameters": 3300000000
|
||||
},
|
||||
{
|
||||
"name": "lmstudio-community/Qwen3-32B-MLX-4bit",
|
||||
@@ -12586,11 +12590,11 @@
|
||||
{
|
||||
"name": "QuantTrio/sarvam-30b-AWQ",
|
||||
"provider": "QuantTrio",
|
||||
"parameter_count": "7.0B",
|
||||
"parameters_raw": 7000000000,
|
||||
"min_ram_gb": 4.0,
|
||||
"recommended_ram_gb": 5.2,
|
||||
"min_vram_gb": 4.0,
|
||||
"parameter_count": "30.0B",
|
||||
"parameters_raw": 30000000000,
|
||||
"min_ram_gb": 10.7,
|
||||
"recommended_ram_gb": 21.5,
|
||||
"min_vram_gb": 17.9,
|
||||
"quantization": "AWQ-4bit",
|
||||
"context_length": 131072,
|
||||
"use_case": "Chat, multilingual",
|
||||
@@ -12605,11 +12609,11 @@
|
||||
{
|
||||
"name": "QuantTrio/sarvam-105b-AWQ",
|
||||
"provider": "QuantTrio",
|
||||
"parameter_count": "19.0B",
|
||||
"parameters_raw": 19000000000,
|
||||
"min_ram_gb": 10.0,
|
||||
"recommended_ram_gb": 13.0,
|
||||
"min_vram_gb": 10.0,
|
||||
"parameter_count": "105.0B",
|
||||
"parameters_raw": 105000000000,
|
||||
"min_ram_gb": 36.8,
|
||||
"recommended_ram_gb": 73.7,
|
||||
"min_vram_gb": 61.4,
|
||||
"quantization": "AWQ-4bit",
|
||||
"context_length": 131072,
|
||||
"use_case": "Chat, multilingual",
|
||||
@@ -17884,21 +17888,26 @@
|
||||
{
|
||||
"name": "cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit",
|
||||
"provider": "cyankiwi",
|
||||
"parameter_count": "11.4B",
|
||||
"parameters_raw": 11412204288,
|
||||
"min_ram_gb": 4.3,
|
||||
"recommended_ram_gb": 8.5,
|
||||
"min_vram_gb": 7.1,
|
||||
"parameter_count": "79.7B",
|
||||
"parameters_raw": 79674391296,
|
||||
"min_ram_gb": 22.3,
|
||||
"recommended_ram_gb": 44.6,
|
||||
"min_vram_gb": 40.8,
|
||||
"quantization": "AWQ-4bit",
|
||||
"context_length": 32768,
|
||||
"use_case": "General purpose",
|
||||
"use_case": "Coding",
|
||||
"capabilities": [],
|
||||
"pipeline_tag": "text-generation",
|
||||
"architecture": "qwen3_next",
|
||||
"hf_downloads": 695,
|
||||
"hf_likes": 10,
|
||||
"release_date": "2026-02-19",
|
||||
"_discovered": true
|
||||
"is_moe": true,
|
||||
"num_experts": 512,
|
||||
"active_experts": 10,
|
||||
"active_parameters": null,
|
||||
"_discovered": true,
|
||||
"format": "awq"
|
||||
},
|
||||
{
|
||||
"name": "cyankiwi/INTELLECT-3.1-AWQ-8bit",
|
||||
|
||||
Reference in New Issue
Block a user