diff --git a/scripts/add_hwfit_models.py b/scripts/add_hwfit_models.py index 6694735..f26288d 100644 --- a/scripts/add_hwfit_models.py +++ b/scripts/add_hwfit_models.py @@ -9,7 +9,9 @@ Adds: Metadata is taken from the HF Hub `list_models(full=True)` response plus the repo name (which encodes the param size, e.g. "Qwen3.6-35B-A3B"). Param-less -names fall back to a single per-repo model_info() call to read safetensors. +names fall back, in order, to the parent `base_model:` tag, the repo's +`config.json` (computed from `hidden_size` / `num_hidden_layers` / MoE +fields), and finally a per-repo `model_info()` call to read safetensors. Re-runnable: merges by `name`, leaving existing entries untouched unless --overwrite is passed. Writes a .bak first. @@ -23,7 +25,8 @@ import re import sys from datetime import datetime -from huggingface_hub import HfApi +from huggingface_hub import HfApi, hf_hub_download +from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError DATA_PATH = os.path.join(os.path.dirname(__file__), "..", "services", "hwfit", "data", "hf_models.json") DATA_PATH = os.path.abspath(DATA_PATH) @@ -70,6 +73,128 @@ def _parse_params(name): return total, active +def _params_from_config(cfg): + """Estimate (total, active) parameter counts from a HF config.json dict. + + Returns (None, None) when the architecture fields aren't usable. Covers: + * explicit ``num_parameters`` / ``n_params`` (rare but authoritative) + * dense transformers (LLaMA / Qwen / Mistral / GLM-dense / etc.) via + embeddings + per-layer attention + MLP + * MoE (Qwen3-MoE, GLM-4-MoE, DeepSeek-style) using ``num_experts`` or + ``n_routed_experts`` (+ ``n_shared_experts``). Active count assumes + ``num_experts_per_tok`` routed experts plus any shared experts. + + The estimate is intentionally coarse — within ~5-10% of the true count for + standard decoder-only architectures — which is fine for the downstream + ``min_vram_gb`` heuristic (it already buckets via ``parameter_count`` to + one decimal place of "B"). + """ + if not isinstance(cfg, dict): + return None, None + + # Authoritative fields first. Some custom configs embed the trained + # parameter count directly. + for key in ("num_parameters", "n_params", "total_params"): + v = cfg.get(key) + if isinstance(v, (int, float)) and v > 0: + return int(v), None + + def _i(key, default=None): + v = cfg.get(key, default) + try: + return int(v) if v is not None else None + except (TypeError, ValueError): + return None + + h = _i("hidden_size") + L = _i("num_hidden_layers") + if not h or not L: + return None, None + + vocab = _i("vocab_size") or 0 + ffn = _i("intermediate_size") or (4 * h) + n_heads = _i("num_attention_heads") or 0 + n_kv = _i("num_key_value_heads") or n_heads + head_dim = _i("head_dim") or (h // n_heads if n_heads else h) + + # Attention: Q is hidden_size wide, KV is grouped (GQA / MQA). + q_proj = h * (n_heads * head_dim if n_heads else h) + kv_proj = 2 * h * (n_kv * head_dim if n_kv else h) + o_proj = (n_heads * head_dim if n_heads else h) * h + per_layer_attn = q_proj + kv_proj + o_proj + + # Dense MLP: gate + up + down (SwiGLU / GeGLU). Configs without a gate + # (plain GELU) are within the noise floor of this estimate. + per_layer_dense_mlp = 3 * h * ffn + + # MoE routing. Both naming conventions are seen in the wild. + n_experts = _i("num_experts") or _i("n_routed_experts") or 0 + n_shared = _i("n_shared_experts") or 0 + n_active = _i("num_experts_per_tok") or 0 + moe_ffn = _i("moe_intermediate_size") or ffn + # Some configs (GLM-4-MoE, DeepSeek-V3) keep the first K layers dense. + first_dense = _i("first_k_dense_replace") or 0 + + if n_experts > 0 and n_active > 0: + moe_layers = max(0, L - first_dense) + dense_layers = L - moe_layers + per_expert = 3 * h * moe_ffn + total_mlp = ( + dense_layers * per_layer_dense_mlp + + moe_layers * (n_experts + n_shared) * per_expert + ) + active_mlp = ( + dense_layers * per_layer_dense_mlp + + moe_layers * (n_active + n_shared) * per_expert + ) + else: + total_mlp = L * per_layer_dense_mlp + active_mlp = total_mlp + + embed = vocab * h + # Untied output head doubles the embedding contribution. + head = 0 if cfg.get("tie_word_embeddings", True) else vocab * h + + total = embed + head + L * per_layer_attn + total_mlp + active = embed + head + L * per_layer_attn + active_mlp + if total <= 0: + return None, None + if active == total or n_experts == 0: + return int(total), None + return int(total), int(active) + + +_CONFIG_CACHE = {} + + +def _fetch_config_json(repo_id): + """Download and cache a repo's config.json. Returns a dict or None. + + Network / 404 / private-repo failures are swallowed — the caller already + has a safetensors fallback below this. We rely on huggingface_hub's own + on-disk cache so repeated script runs don't re-hit the Hub. + """ + if repo_id in _CONFIG_CACHE: + return _CONFIG_CACHE[repo_id] + try: + path = hf_hub_download(repo_id=repo_id, filename="config.json") + except (EntryNotFoundError, RepositoryNotFoundError): + _CONFIG_CACHE[repo_id] = None + return None + except Exception: + # Network hiccup, gated repo, etc. — don't crash the bulk run. + _CONFIG_CACHE[repo_id] = None + return None + try: + with open(path, encoding="utf-8") as f: + cfg = json.load(f) + except (OSError, ValueError): + _CONFIG_CACHE[repo_id] = None + return None + _CONFIG_CACHE[repo_id] = cfg + return cfg + + def _base_model_tag(tags): """Return the `base_model:...` repo id from tags, if any.""" for t in (tags or []): @@ -141,6 +266,27 @@ def _entry_from_modelinfo(mi, overrides): active = ba # Determine quant first — we need it to unpack the safetensors fallback. quant = _quant_from_name(name) + # Next-to-last resort: parse config.json. This is robust against + # parameter-less repo names (e.g. "GLM-4.5" with no "9B" suffix) where + # both the regex and the base_model tag come up empty. We try this + # before safetensors so non-standard names still resolve without a + # per-repo manual override in EXTRA_REPOS. Source repo first (works for + # unquantized models) then the quantized parent via base_model:. + if total is None: + config_targets = [name] + bm = _base_model_tag(getattr(mi, "tags", None)) + if bm and bm != name: + config_targets.append(bm) + for target in config_targets: + cfg = _fetch_config_json(target) + if not cfg: + continue + ct, ca = _params_from_config(cfg) + if ct: + total = ct + if ca and active is None: + active = ca + break # Last resort: read safetensors element counts. For pre-quantized repos # (AWQ/GPTQ/MLX-Int4 etc.) the weights are packed: 8× 4-bit weights per # I32 element, 4× 8-bit weights per I32. The bare safetensors total