Files
odysseus/services/hwfit/models.py
pewdiepie-archdaemon eb79b76432 Cookbook: scoring fixes, UI polish, false-finished + stale-state bug fixes
Backend (services/hwfit + routes):
- rank_models picks visible set by REQUESTED column, not always score —
  sorting by Param now shows highest-param models PERIOD (incl. too_tight).
- New fit_only param. Multi-GPU rigs filter GGUF Q*/IQ quants (vLLM/SGLang
  cannot serve them); default non-prequantized to BF16 on 2+ GPUs.
- AWQ / GPTQ-8bit get a -1.0 quality penalty (was 0.0, tied with FP8), so
  FP8 wins when both fit.
- Version-aware tiebreaker (parse Mn.n / Vn) — MiniMax-M2.7 ranks above
  M2.5 on equal composite score; >=100B integers not misread as versions.
- /api/cookbook/hf-latest no longer drops models without an "NB" pattern in
  the repo id (MiniMax-M2.7, DeepSeek-V4-Pro etc. were silently filtered).
- Cached-model scan: atexit flushes models JSON even if the script is
  killed mid-walk; each scan_dir wrapped in try/except; timeout 60s -> 180s.
- KB granularity for sub-MB sizes (was "0 MB" for 12 KB shells). New
  "stalled" status for shells <1 MB with no .incomplete files.
- /api/cookbook/state POST guard: rejects "done" download tasks lacking
  DOWNLOAD_OK / DOWNLOAD_FAILED / /snapshots/ when the last-mentioned
  shard is N<total — stops stale tabs from poisoning persisted state.
- hf_models.json: add zai-org/GLM-5.1; flip zai-org/GLM-5 quantization
  Q4_K_M -> BF16 (it is the native base, not a quant).

Frontend (static/js):
- Scan/Download toolbar: quant defaults to All; ctx slider (8k/16k/32k/
  50k/128k/Max) ported from origin/main with sort=fit on drag, sort=score
  on Max. GPU toggle commits _activeCount to maxGpu on initial render. Fit
  column header tagged with active budget (RAM / GPU / N GPU).
- Foldable Download admin-card: the Download h2 is the chevron trigger;
  state persists in localStorage.
- Download card surfaces destination dir (Dir: <path>). Same dir on running
  task row, font/color matched to uptime (9px Fira Code muted, opacity .4).
- Serve panel ctx text input always resets to model max on open. Sub-MB
  cached models show with red "download stalled" badge.
- Bulk-select Cancel + Delete reset the Select button label on exit.
- Cookbook running: false-finished bug fixed — DOWNLOAD_OK or /snapshots/
  required; bare "Download complete" no longer marks the task done after
  the first config file. Clear button now sends tmux kill-session too.
  True overall % for multi-shard downloads: ((N-1)+frac)/total instead of
  hf_transfer per-shard aggregate.
- Diagnosis card simplified: removed fold toggle, copy button, dismiss X.
  Suggestion font matches message body (12px).
- HF token field flashes green check + "Saved" on save.
- Cached scan no longer counts stalled rows as downloaded in Scan/Download.

CSS:
- dep Install button width pinned to 76px to match Installed split.
- task-sub row +1px; task-status badge gets margin-right 8px.
- Ctx slider styled like gallery editor sliders (thin pill rail, red thumb).
- Bulk-select cancel button top -3px -> -5px.
2026-06-03 16:32:20 +09:00

191 lines
6.9 KiB
Python

import json
import os
import re
# GGUF quant levels in the order best_quant_for_budget tries them
# (best quality first).
QUANT_HIERARCHY = [
    "Q8_0",
    "Q6_K",
    "Q5_K_M",
    "Q4_K_M",
    "Q3_K_M",
    "Q2_K",
]

# Memory multiplier per quant: estimate_memory_gb multiplies the parameter
# count (in billions) by this value to get the weight footprint in GB.
QUANT_BPP = {
    # unquantized / native low-precision
    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
    # GGUF
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    # AWQ
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
    # GPTQ
    "GPTQ-Int4": 0.50, "GPTQ-Int8": 1.0,
    # MLX
    "mlx-4bit": 0.55, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
}

# Speed multiplier per quant. Its consumers are not in this part of the
# module; presumably higher means faster -- confirm against the scorer.
QUANT_SPEED_MULT = {
    # native low-precision
    "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "INT8": 0.85, "NVFP4": 1.1,
    # GGUF
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    # AWQ
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
    # GPTQ
    "GPTQ-Int4": 1.2, "GPTQ-Int8": 0.85,
    # MLX
    "mlx-4bit": 1.15, "mlx-8bit": 0.85, "mlx-6bit": 1.0,
}

# Quality penalty per quant (0.0 = no penalty, more negative = worse).
QUANT_QUALITY_PENALTY = {
    # native low-precision
    "F16": 0.0, "BF16": 0.0, "FP8": 0.0, "INT8": 0.0, "NVFP4": -0.5,
    # GGUF
    "Q8_0": -0.5, "Q6_K": -1.5, "Q5_K_M": -2.5,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
    # Bare "AWQ"/"GPTQ" and their 8-bit variants were once 0.0, which tied
    # them with FP8. They are calibrated reconstructions rather than raw
    # 8-bit weights, so they carry a small penalty and FP8 wins when both
    # fit. The 4-bit variants keep a heavier penalty.
    "AWQ": -1.0, "AWQ-4bit": -4.0, "AWQ-8bit": -1.0,
    "GPTQ": -1.0, "GPTQ-Int4": -4.0, "GPTQ-Int8": -1.0,
    # MLX
    "mlx-4bit": -4.0, "mlx-8bit": -0.5, "mlx-6bit": -1.5,
}

# Bytes per parameter per quant. Not referenced in the visible part of this
# module; note the GGUF values differ from the ones in QUANT_BPP.
QUANT_BYTES_PER_PARAM = {
    # native low-precision
    "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
    # GGUF
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    # AWQ
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
    # GPTQ
    "GPTQ-Int4": 0.5, "GPTQ-Int8": 1.0,
    # MLX
    "mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
}
# Quantization-label prefixes that mark a model as pre-quantized; such
# models must NOT go through the GGUF quant hierarchy.
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "INT8", "NVFP4")


def is_prequantized(model):
    """Return True when *model* ships in a fixed, pre-quantized format.

    Detection uses the lower-cased ``name`` and ``format`` fields plus the
    ``quantization`` label. Any one of these signals is sufficient:

    * ``nvfp4`` anywhere in name/format;
    * an ``fp8`` token delimited by ``-``, ``_`` or ``/``;
    * an ``8bit`` / ``int8bit`` token, unless the model is flagged as GGUF
      via ``is_gguf`` or ``gguf_sources``;
    * ``awq``, ``gptq`` or ``mlx`` anywhere in name/format;
    * a ``quantization`` value starting with one of PREQUANTIZED_PREFIXES.

    Missing or ``None`` values for ``name``, ``format`` and ``quantization``
    are all treated as empty strings.

    Args:
        model: Mapping describing a catalog entry.

    Returns:
        bool: True if the model is considered pre-quantized.
    """
    # `.get(key, "")` still yields None when the key is present with a null
    # value, so normalise with `or ""` exactly like name/format below.
    quant = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    fmt = (model.get("format") or "").lower()
    text = f"{name} {fmt}"

    if "nvfp4" in text:
        return True

    # NOTE(review): the leading boundary excludes whitespace, so a bare
    # format value of "fp8" (preceded by the joining space) is not matched
    # here -- confirm whether that is intended.
    if re.search(r"(^|[-_/])fp8($|[-_/\s])", text) is not None:
        return True

    # An "8bit" tag only counts for non-GGUF models.
    is_gguf = model.get("is_gguf") or model.get("gguf_sources")
    if not is_gguf and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text) is not None:
        return True

    if any(tag in text for tag in ("awq", "gptq", "mlx")):
        return True

    # str.startswith accepts a tuple of candidate prefixes.
    return quant.startswith(PREQUANTIZED_PREFIXES)
def params_b(model):
    """Return the model's total parameter count in billions.

    ``parameters_raw`` (an absolute count) wins when it is a positive
    number. Otherwise ``parameter_count`` is parsed as ``<number><unit>``:

    * ``B`` -> billions, ``M`` -> millions, ``K`` -> thousands,
      ``T`` -> trillions;
    * no unit and value >= 1,000,000 -> raw parameter count;
    * no unit and a smaller value -> millions (``"355"`` means 355M, so a
      small model never sorts as if it were 355B).

    Args:
        model: Mapping describing a catalog entry.

    Returns:
        float: Parameters in billions, or ``0.0`` when nothing usable is
        present (missing fields, unparseable or malformed text).
    """
    raw = model.get("parameters_raw")
    # Only trust genuinely numeric values; anything else falls through to
    # the textual field instead of raising on the comparison.
    if isinstance(raw, (int, float)) and raw > 0:
        return raw / 1_000_000_000.0

    pc = model.get("parameter_count")
    if not pc:
        return 0.0

    # str() lets a numeric parameter_count take the same path as its text form.
    match = re.match(r"^([\d.]+)\s*([BKMGT]?)$", str(pc).strip().upper())
    if match is None:
        return 0.0
    try:
        # The pattern also admits strings such as "1.2.3" or "..".
        val = float(match.group(1))
    except ValueError:
        return 0.0

    suffix = match.group(2)
    if suffix == "B":
        return val
    if suffix == "M":
        return val / 1000.0
    if suffix == "K":
        return val / 1_000_000.0
    if suffix == "T":
        return val * 1000.0

    # No unit. NOTE(review): "G" is accepted by the pattern but has no
    # branch of its own, so it is handled like a bare number -- confirm.
    if val >= 1_000_000:
        return val / 1_000_000_000.0  # raw parameter count
    return val / 1000.0  # millions, e.g. "355" -> 0.355B
def estimate_memory_gb(model, quant, ctx):
    """Estimate the VRAM (GB) needed to serve *model* at *quant* and *ctx*.

    The weight term uses the total parameter count: every weight has to be
    resident, including all experts of a MoE model. The KV-cache term uses
    the active parameter count instead, so for MoE it scales with the
    experts that actually run per token. A constant 0.5 is added on top.

    Args:
        model: Mapping describing a catalog entry.
        quant: Quantization label; labels missing from QUANT_BPP fall back
            to a multiplier of 0.58.
        ctx: Context length in tokens.

    Returns:
        float: Estimated memory in GB.
    """
    total_b = params_b(model)
    weight_factor = QUANT_BPP.get(quant, 0.58)
    active_b = _active_params_b(model)

    weights_gb = total_b * weight_factor
    kv_cache_gb = 0.000008 * active_b * ctx
    return weights_gb + kv_cache_gb + 0.5
def _active_params_b(model):
"""For MoE: active params per token (affects KV cache and speed, not total VRAM).
For dense: same as total params."""
if model.get("is_moe") and model.get("active_parameters"):
return model["active_parameters"] / 1_000_000_000.0
return params_b(model)
def best_quant_for_budget(model, budget_gb, ctx):
    """Pick the best quant (and context) that fits in *budget_gb* of VRAM.

    Pre-quantized models (AWQ/GPTQ/MLX, ...) are only tried at their own
    ``quantization`` label (``"Q4_K_M"`` when the key is absent). All other
    models walk QUANT_HIERARCHY from best quality down.

    The requested context is tried first. If nothing fits, the context is
    halved repeatedly for as long as it stays at or above 1024 tokens, and
    every candidate quant is retried at each step.

    Returns:
        tuple: ``(quant, ctx, mem_gb)`` for the first combination that
        fits, or ``(None, None, None)`` when none does.
    """
    if is_prequantized(model):
        candidates = [model.get("quantization", "Q4_K_M")]
    else:
        candidates = QUANT_HIERARCHY

    # Context ladder: the requested size, then successive halvings >= 1024.
    windows = [ctx]
    shrunk = ctx // 2
    while shrunk >= 1024:
        windows.append(shrunk)
        shrunk //= 2

    for window in windows:
        for quant in candidates:
            needed = estimate_memory_gb(model, quant, window)
            if needed <= budget_gb:
                return quant, window, needed
    return None, None, None
def infer_use_case(model):
    """Classify *model* into a coarse use-case label.

    The lower-cased ``name`` and ``use_case`` fields are joined and scanned
    for keyword substrings. Rules are checked in the order listed below and
    the first one that matches wins.

    Args:
        model: Mapping describing a catalog entry. Missing or ``None``
            ``name`` / ``use_case`` values are treated as empty strings.

    Returns:
        str: One of ``"embedding"``, ``"tts"``, ``"stt"``, ``"coding"``,
        ``"multimodal"``, ``"reasoning"``, ``"chat"`` or ``"general"``.
    """
    # `or ""` guards against keys that exist with a null value, which
    # `.get(key, "")` alone would pass through as None.
    name = (model.get("name") or "").lower()
    uc = (model.get("use_case") or "").lower()
    combined = name + " " + uc

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("embedding", ("embedding", "embed", "bge")),
        ("tts", ("tts", "text-to-speech", "speech-synthesis", "cosyvoice", "parler")),
        ("stt", ("stt", "speech-to-text", "whisper", "transcri", "asr")),
        ("coding", ("code",)),
        ("multimodal", ("vision", "multimodal", "vlm", "vl-")),
        ("reasoning", ("reason", "chain-of-thought", "deepseek-r1")),
        ("chat", ("chat", "instruction")),
    )
    for label, keywords in rules:
        if any(keyword in combined for keyword in keywords):
            return label
    return "general"
# Parsed catalog, populated on the first get_models() call.
_models_cache = None


def get_models():
    """Return the parsed ``data/hf_models.json`` catalog, loading it once.

    The file is looked up relative to this module's directory. The parsed
    result is kept in the module-level ``_models_cache`` and returned as-is
    on later calls. A missing file or invalid JSON yields an empty list,
    which is cached as well.
    """
    global _models_cache
    if _models_cache is not None:
        return _models_cache

    here = os.path.dirname(__file__)
    catalog_file = os.path.join(here, "data", "hf_models.json")
    try:
        with open(catalog_file, encoding="utf-8") as handle:
            loaded = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        loaded = []

    _models_cache = loaded
    return _models_cache
def model_catalog_path():
    """Return the path to ``data/hf_models.json`` next to this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "data", "hf_models.json")