Files
odysseus/scripts/add_hwfit_models.py
Sirsyorrz 9955f5bc95 Fix VRAM estimates for pre-quantized HF repos
The Cookbook fit scanner was reporting impossibly low VRAM requirements
for some pre-quantized models — e.g. cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit
shown as 7.1 GB ('perfect' on a 12 GB card) when the real load is ~40 GB.

Root cause is in the catalog builder. When _entry_from_modelinfo falls
back to safetensors metadata for the parameter count, it stored
safetensors.total directly. For pre-quantized repos that figure reflects
*packed* element counts: AWQ/GPTQ-Int4 pack 8x 4-bit weights into one
I32, AWQ-8bit/GPTQ-Int8/FP8 pack 4x. The catalog therefore recorded
~1/8 of the real parameter count, and min_vram_gb = packed * bpp
double-applied the quantization.

Fix the safetensors fallback:

* prefer the per-dtype parameters dict when available and unpack only the
  I32/I64 entries (the F16/BF16 scale/zero tensors and embeddings are
  already at their real element counts)
* fall back to total * pack_factor when only total is exposed

Patch the catalog entries that were affected by the old fallback so the
fit ratings reflect reality without waiting for a full catalog rebuild:

* cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit  11.4B -> 79.7B (40.8 GB VRAM)
* stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ  4.6B -> 30.5B
* stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ  5.1B -> 30.5B
* warshanks/Qwen3-8B-abliterated-AWQ  2.2B -> 8.2B
* QuantTrio/sarvam-30b-AWQ  7B -> 30B
* QuantTrio/sarvam-105b-AWQ  19B -> 105B

Closes #377.
2026-06-01 18:32:58 +09:00

255 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
add_hwfit_models.py — bulk-add Hugging Face models to the hwfit catalog
(services/hwfit/data/hf_models.json).
Adds:
* every model from one or more HF authors (e.g. cyankiwi's AWQ quants)
* any explicitly-listed repos
Metadata is taken from the HF Hub `list_models(full=True)` response plus the
repo name (which encodes the param size, e.g. "Qwen3.6-35B-A3B"). Names without
a size fall back, in order, to an explicit override, the `base_model:` tag, and
finally a single per-repo model_info() call to read the safetensors metadata.
Re-runnable: merges by `name`, leaving existing entries untouched unless
--overwrite is passed. Writes a .bak first.
Usage:
    python3 scripts/add_hwfit_models.py [--overwrite]
"""
import json
import os
import re
import sys
from datetime import datetime, timezone

from huggingface_hub import HfApi
# Absolute path of the catalog JSON, resolved relative to this script's
# directory (../services/hwfit/data/hf_models.json).
DATA_PATH = os.path.abspath(
    os.path.join(
        os.path.dirname(__file__),
        "..", "services", "hwfit", "data", "hf_models.json",
    )
)

# Hub authors whose repos are all scanned.
AUTHORS = ["cyankiwi"]

# Specific repos to add (in addition to the authors above), mapped to optional
# explicit overrides {field: value} for things the name/metadata can't convey.
EXTRA_REPOS = {
    "deepseek-ai/DeepSeek-V4-Flash": {
        "parameter_count": "168B",
        "quantization": "Q4_K_M",
    },
    "MiniMaxAI/MiniMax-M2.7": {
        "parameter_count": "228.7B",
        "quantization": "Q4_K_M",
    },
    "bullerwins/MiniMax-M2.7-REAP-172B-fp8": {
        "parameter_count": "172B",
        "quantization": "FP8",
    },
    "cyankiwi/MiniMax-M2.7-AWQ-4bit": {
        "parameter_count": "228.7B",
        "quantization": "AWQ-4bit",
    },
}

# Tags that are not architecture names (library, task, format and
# quantization markers); skipped when guessing the architecture from tags.
_GENERIC_TAGS = {
    "transformers",
    "safetensors",
    "conversational",
    "text-generation",
    "image-text-to-text",
    "text-generation-inference",
    "endpoints_compatible",
    "autotrain_compatible",
    "compressed-tensors",
    "gguf",
    "mlx",
    "vllm",
    "4-bit",
    "8-bit",
    "awq",
    "gptq",
    "fp8",
    "quantized",
    "chat",
}

# Shared Hub client used by every lookup in this script.
api = HfApi()
def _parse_params(name):
"""Return (parameters_raw, active_parameters_or_None) from a repo name.
Handles dense ("27B") and MoE ("235B-A22B") naming."""
base = name.split("/")[-1]
active = None
m_active = re.search(r"-[Aa](\d+\.?\d*)[Bb](?![a-zA-Z])", base)
if m_active:
active = int(float(m_active.group(1)) * 1e9)
base_wo = base[:m_active.start()] + base[m_active.end():]
else:
base_wo = base
# First "<num>B" token that is a plausible size. Case-insensitive b, but the
# negative lookahead means "8bit"/"4bit" are NOT treated as "8B"/"4B".
total = None
for m in re.finditer(r"(\d+\.?\d*)[Bb](?![a-zA-Z])", base_wo):
total = int(float(m.group(1)) * 1e9)
break
return total, active
def _base_model_tag(tags):
"""Return the `base_model:...` repo id from tags, if any."""
for t in (tags or []):
if t.startswith("base_model:"):
return t.split(":")[-1]
return None
def _quant_from_name(name):
n = name.lower()
is8 = "8bit" in n or "8-bit" in n or "int8" in n
if "awq" in n:
return "AWQ-8bit" if is8 else "AWQ-4bit"
if "gptq" in n:
return "GPTQ-Int8" if is8 else "GPTQ-Int4"
if "mlx" in n:
if "6bit" in n:
return "mlx-6bit"
return "mlx-8bit" if is8 else "mlx-4bit"
if "fp8" in n:
return "FP8"
if "int4" in n or "4bit" in n or "4-bit" in n:
return "AWQ-4bit"
return "Q4_K_M"
def _arch_from_tags(tags):
for t in (tags or []):
if ":" in t or t in _GENERIC_TAGS:
continue
if re.fullmatch(r"[a-z0-9_]+", t) and any(c.isalpha() for c in t):
return t
return ""
# Rough bytes-per-parameter figures used only for the RAM/VRAM hints in the
# entry; labels missing from this table fall back to 0.6.
_BYTES_PER_PARAM = {
    "AWQ-4bit": 0.58, "GPTQ-Int4": 0.58, "mlx-4bit": 0.55, "mlx-6bit": 0.85,
    "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1,
    "Q4_K_M": 0.6,
}


def _pack_factor(quant):
    """Return how many weights are assumed to share one packed integer element."""
    if quant.endswith(("4bit", "Int4")):
        return 8
    if quant.endswith(("8bit", "Int8")) or quant == "FP8":
        return 4
    # NOTE(review): every other label (including "mlx-6bit") is treated as
    # unpacked — confirm this is intended for 6-bit repos.
    return 1


def _params_from_safetensors(st, quant):
    """Return the unpacked parameter count from safetensors metadata, or None."""
    factor = _pack_factor(quant)
    by_dtype = getattr(st, "parameters", None) or {}
    if by_dtype:
        # I32/I64 hold the packed quantized weights; everything else
        # (F16/BF16 scales, zeros, embeddings) is already at its real
        # element count.
        packed = sum(c for d, c in by_dtype.items() if d in ("I32", "I64"))
        rest = sum(c for d, c in by_dtype.items() if d not in ("I32", "I64"))
        return packed * factor + rest
    if getattr(st, "total", None):
        # Only the grand total is exposed: scale all of it.
        return int(st.total) * factor
    return None


def _entry_from_modelinfo(mi, overrides):
    """Build a catalog entry for a Hub model, or return None if it can't be sized.

    Args:
        mi: model-info object from the HF Hub. ``id`` is required; ``tags``,
            ``created_at``, ``pipeline_tag``, ``downloads`` and ``likes`` are
            read with ``getattr`` and may be missing.
        overrides: optional ``{field: value}`` dict. It is merged over the
            generated entry last, and its ``parameter_count`` /
            ``quantization`` values (when present) also drive the derived
            ``parameters_raw`` and RAM/VRAM hints so the entry stays
            internally consistent.

    Returns:
        The entry dict, or ``None`` when no parameter count could be found
        from the overrides, the repo name, the ``base_model:`` tag, or the
        safetensors metadata.
    """
    overrides = overrides or {}
    name = mi.id
    provider = name.split("/")[0]
    tags = getattr(mi, "tags", None)

    total, active = _parse_params(name)

    # An explicit override size wins over whatever the name suggests; the
    # final entry reports the override, so the hints must be based on it too.
    ov_count = overrides.get("parameter_count")
    if ov_count:
        ov_total, _ = _parse_params("x/" + ov_count)
        if ov_total:
            total = ov_total

    # Next, try the base_model tag (the unquantized parent often names its size).
    if total is None:
        parent = _base_model_tag(tags)
        if parent:
            parent_total, parent_active = _parse_params(parent)
            if parent_total:
                total = parent_total
            if parent_active and active is None:
                active = parent_active

    # Determine quant before the safetensors fallback — it decides how packed
    # element counts are unpacked. An override label takes precedence.
    quant = overrides.get("quantization") or _quant_from_name(name)

    # Last resort: read safetensors element counts. For pre-quantized repos the
    # weights are packed (8x 4-bit or 4x 8-bit weights per I32 element), so the
    # bare total undercounts the real parameter count by that factor, which
    # would then feed a wrong `min_vram_gb`.
    if total is None:
        try:
            full = api.model_info(name, files_metadata=False)
            st = getattr(full, "safetensors", None)
            if st:
                total = _params_from_safetensors(st, quant)
        except Exception as e:
            # Best effort: report the failure instead of hiding it, then fall
            # through so the repo is skipped like any other unsizable one.
            print(f"  WARN {name}: safetensors lookup failed: {e}", flush=True)

    if total is None:
        return None  # can't size it — skip

    pb = total / 1e9
    created = getattr(mi, "created_at", None)
    # Fall back to today's UTC date when the Hub gives no creation time.
    rel = (created or datetime.now(timezone.utc)).strftime("%Y-%m-%d")

    # Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant).
    bpp = _BYTES_PER_PARAM.get(quant, 0.6)
    vram = round(pb * bpp + 0.5, 1)

    entry = {
        "name": name,
        "provider": provider,
        "parameter_count": f"{round(pb, 1)}B",
        "parameters_raw": total,
        "min_ram_gb": max(1.0, round(vram * 0.6, 1)),
        "recommended_ram_gb": max(2.0, round(vram * 1.2, 1)),
        "min_vram_gb": vram,
        "quantization": quant,
        "context_length": 32768,
        "use_case": "General purpose",
        "capabilities": [],
        "pipeline_tag": getattr(mi, "pipeline_tag", None) or "text-generation",
        "architecture": _arch_from_tags(tags),
        "hf_downloads": getattr(mi, "downloads", 0) or 0,
        "hf_likes": getattr(mi, "likes", 0) or 0,
        "release_date": rel,
        "_discovered": True,
    }
    if active:
        entry["is_moe"] = True
        entry["active_parameters"] = active

    # Explicit overrides always have the final word. `parameters_raw` already
    # reflects an override `parameter_count` (handled above), so it needs no
    # separate fix-up unless the override sets it directly here.
    entry.update(overrides)
    return entry
def main():
    """Scan the configured authors and extra repos and merge them into the catalog.

    Existing entries are kept unless ``--overwrite`` is on the command line.
    When there is something to add, the current catalog is first dumped to
    ``DATA_PATH + ".bak"`` and then the merged list is written to ``DATA_PATH``.
    """
    with open(DATA_PATH, encoding="utf-8") as fh:
        catalog = json.load(fh)
    by_name = {item["name"]: item for item in catalog}
    known = set(by_name)
    overwrite = "--overwrite" in sys.argv

    def wanted(repo_id):
        # Already-catalogued repos are only revisited with --overwrite.
        return overwrite or repo_id not in known

    to_add = {}

    # Authors
    for author in AUTHORS:
        print(f"Fetching author: {author} ...", flush=True)
        models = list(api.list_models(author=author, full=True, cardData=True))
        print(f"  {len(models)} repos", flush=True)
        for mi in models:
            if not wanted(mi.id):
                continue
            entry = _entry_from_modelinfo(mi, EXTRA_REPOS.get(mi.id))
            if entry:
                to_add[mi.id] = entry

    # Explicit extra repos (not covered by an author scan)
    for repo, ov in EXTRA_REPOS.items():
        if repo in to_add or not wanted(repo):
            continue
        try:
            mi = api.model_info(repo, files_metadata=False)
        except Exception as e:
            print(f"  SKIP {repo}: {e}", flush=True)
            continue
        entry = _entry_from_modelinfo(mi, ov)
        if entry:
            to_add[repo] = entry

    if not to_add:
        print("Nothing new to add.")
        return

    # Backup + merge
    with open(DATA_PATH + ".bak", "w", encoding="utf-8") as fh:
        json.dump(catalog, fh, indent=2)
    by_name.update(to_add)
    merged = list(by_name.values())
    with open(DATA_PATH, "w", encoding="utf-8") as fh:
        json.dump(merged, fh, indent=2)

    print(f"\nAdded/updated {len(to_add)} models. Catalog now {len(merged)} (was {len(catalog)}).")
    # Show at most the first 20 names, alphabetically.
    for repo_id in sorted(to_add)[:20]:
        added = to_add[repo_id]
        print(f"  + {repo_id}  [{added['parameter_count']}, {added['quantization']}]")
    if len(to_add) > 20:
        print(f"  ... and {len(to_add) - 20} more")
if __name__ == "__main__":
main()