Cookbook: scoring fixes, UI polish, false-finished + stale-state bug fixes

Backend (services/hwfit + routes):
- rank_models picks visible set by REQUESTED column, not always score — sorting by Param now shows highest-param models PERIOD (incl. too_tight).
- New fit_only param. Multi-GPU rigs filter GGUF Q*/IQ quants (vLLM/SGLang cannot serve them); default non-prequantized to BF16 on 2+ GPUs.
- AWQ / GPTQ-8bit get a -1.0 quality penalty (was 0.0, tied with FP8), so FP8 wins when both fit.
- Version-aware tiebreaker (parse Mn.n / Vn) — MiniMax-M2.7 ranks above M2.5 on equal composite score; >=100B integers not misread as versions.
- /api/cookbook/hf-latest no longer drops models without an "NB" pattern in the repo id (MiniMax-M2.7, DeepSeek-V4-Pro etc. were silently filtered).
- Cached-model scan: atexit flushes models JSON even if the script is killed mid-walk; each scan_dir wrapped in try/except; timeout 60s -> 180s.
- KB granularity for sub-MB sizes (was "0 MB" for 12 KB shells). New "stalled" status for shells <1 MB with no .incomplete files.
- /api/cookbook/state POST guard: rejects "done" download tasks lacking DOWNLOAD_OK / DOWNLOAD_FAILED / /snapshots/ when the last-mentioned shard is N<total — stops stale tabs from poisoning persisted state.
- hf_models.json: add zai-org/GLM-5.1; flip zai-org/GLM-5 quantization Q4_K_M -> BF16 (it is the native base, not a quant).

Frontend (static/js):
- Scan/Download toolbar: quant defaults to All; ctx slider (8k/16k/32k/50k/128k/Max) ported from origin/main with sort=fit on drag, sort=score on Max. GPU toggle commits _activeCount to maxGpu on initial render. Fit column header tagged with active budget (RAM / GPU / N GPU).
- Foldable Download admin-card: the Download h2 is the chevron trigger; state persists in localStorage.
- Download card surfaces destination dir (Dir: <path>). Same dir on running task row, font/color matched to uptime (9px Fira Code muted, opacity .4).
- Serve panel ctx text input always resets to model max on open. Sub-MB cached models show with red "download stalled" badge.
- Bulk-select Cancel + Delete reset the Select button label on exit.
- Cookbook running: false-finished bug fixed — DOWNLOAD_OK or /snapshots/ required; bare "Download complete" no longer marks the task done after the first config file. Clear button now sends tmux kill-session too. True overall % for multi-shard downloads: ((N-1)+frac)/total instead of hf_transfer per-shard aggregate.
- Diagnosis card simplified: removed fold toggle, copy button, dismiss X. Suggestion font matches message body (12px).
- HF token field flashes green check + "Saved" on save.
- Cached scan no longer counts stalled rows as downloaded in Scan/Download.

CSS:
- dep Install button width pinned to 76px to match Installed split.
- task-sub row +1px; task-status badge gets margin-right 8px.
- Ctx slider styled like gallery editor sliders (thin pill rail, red thumb).
- Bulk-select cancel button top -3px -> -5px.
2026-06-03 16:32:20 +09:00
parent ab0a480f30
commit eb79b76432
15 changed files with 1175 additions and 198 deletions
--- a/services/hwfit/data/hf_models.json
+++ b/services/hwfit/data/hf_models.json
@@ -4375,7 +4375,14 @@
  "hf_downloads": 51135,
  "hf_likes": 2,
  "release_date": "2025-09-23",
-  "_discovered": true
+  "_discovered": true,
+  "gguf_sources": [
+   {
+    "repo": "typhoon-ai/typhoon2.5-qwen3-4b-gguf",
+    "file": "typhoon2.5-qwen3-4b-q4_k_m.gguf",
+    "quant": "Q4_K_M"
+   }
+  ]
 },
 {
  "name": "JunHowie/Qwen3-4B-Instruct-2507-GPTQ-Int4",
@@ -8994,7 +9001,14 @@
  "num_experts": 128,
  "active_experts": 8,
  "active_parameters": 3339450907,
-  "_discovered": true
+  "_discovered": true,
+  "gguf_sources": [
+   {
+    "repo": "typhoon-ai/typhoon2.5-qwen3-30b-a3b-gguf",
+    "file": "typhoon2.5-qwen3-30b-a3b-q4_k_m.gguf",
+    "quant": "Q4_K_M"
+   }
+  ]
 },
 {
  "name": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ",
@@ -12078,7 +12092,7 @@
  "min_ram_gb": 421.3,
  "recommended_ram_gb": 702.1,
  "min_vram_gb": 386.1,
-  "quantization": "Q4_K_M",
+  "quantization": "BF16",
  "context_length": 202752,
  "use_case": "General purpose text generation",
  "capabilities": [],
@@ -12088,6 +12102,24 @@
  "hf_likes": 1698,
  "release_date": "2026-02-11"
 },
+ {
+  "name": "zai-org/GLM-5.1",
+  "provider": "zai-org",
+  "parameter_count": "753.9B",
+  "parameters_raw": 753864139008,
+  "min_ram_gb": 421.3,
+  "recommended_ram_gb": 702.1,
+  "min_vram_gb": 386.1,
+  "quantization": "BF16",
+  "context_length": 202752,
+  "use_case": "General purpose text generation",
+  "capabilities": [],
+  "pipeline_tag": "text-generation",
+  "architecture": "glm_moe_dsa",
+  "hf_downloads": 141194,
+  "hf_likes": 0,
+  "release_date": "2026-04-03"
+ },
 {
  "name": "moonshotai/Kimi-K2-Instruct",
  "provider": "moonshotai",
@@ -13919,7 +13951,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
+   {
+    "repo": "unsloth/gemma-4-E2B-it-GGUF",
+    "provider": "unsloth"
+   }
+  ],
  "capabilities": [
   "vision"
  ]
@@ -13942,7 +13979,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
+   {
+    "repo": "unsloth/gemma-4-E4B-it-GGUF",
+    "provider": "unsloth"
+   }
+  ],
  "capabilities": [
   "vision"
  ]
@@ -13965,7 +14007,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
+   {
+    "repo": "unsloth/gemma-4-31B-it-GGUF",
+    "provider": "unsloth"
+   }
+  ],
  "capabilities": [
   "vision"
  ]
@@ -13988,7 +14035,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
+   {
+    "repo": "unsloth/gemma-4-26B-A4B-it-GGUF",
+    "provider": "unsloth"
+   }
+  ],
  "capabilities": [
   "vision"
  ]
@@ -18719,5 +18771,307 @@
  "hf_likes": 0,
  "release_date": "2026-04-19",
  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.6-27B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "27.8B",
+  "parameters_raw": 27781427952,
+  "min_ram_gb": 16.6,
+  "recommended_ram_gb": 21.6,
+  "min_vram_gb": 16.6,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, coding, MTP",
+  "is_moe": false,
+  "num_experts": null,
+  "active_experts": null,
+  "active_parameters": null,
+  "architecture": "qwen3",
+  "pipeline_tag": "text-generation",
+  "release_date": "2026-04-01",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.6-27B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "capabilities": [
+   "mtp"
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.6-35B-A3B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "36.0B",
+  "parameters_raw": 35951822704,
+  "min_ram_gb": 21.4,
+  "recommended_ram_gb": 27.8,
+  "min_vram_gb": 21.4,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose (MoE), MTP",
+  "is_moe": true,
+  "num_experts": null,
+  "active_experts": null,
+  "active_parameters": 3000000000,
+  "architecture": "qwen3_moe",
+  "pipeline_tag": "text-generation",
+  "release_date": "2026-04-01",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.6-35B-A3B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "capabilities": [
+   "mtp"
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-0.8B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "873M",
+  "parameters_raw": 873438784,
+  "min_ram_gb": 1.0,
+  "recommended_ram_gb": 2.0,
+  "min_vram_gb": 0.5,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5",
+  "hf_downloads": 93448,
+  "hf_likes": 208,
+  "release_date": "2026-02-28",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-0.8B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-2B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "2.3B",
+  "parameters_raw": 2274069824,
+  "min_ram_gb": 1.3,
+  "recommended_ram_gb": 2.1,
+  "min_vram_gb": 1.2,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5",
+  "hf_downloads": 46974,
+  "hf_likes": 115,
+  "release_date": "2026-02-28",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-2B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-4B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "4.7B",
+  "parameters_raw": 4659865088,
+  "min_ram_gb": 2.6,
+  "recommended_ram_gb": 4.3,
+  "min_vram_gb": 2.4,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5",
+  "hf_downloads": 99087,
+  "hf_likes": 202,
+  "release_date": "2026-02-27",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-4B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-9B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "9.7B",
+  "parameters_raw": 9653104368,
+  "min_ram_gb": 5.4,
+  "recommended_ram_gb": 9.0,
+  "min_vram_gb": 4.9,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5",
+  "hf_downloads": 172298,
+  "hf_likes": 345,
+  "release_date": "2026-02-27",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-9B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-27B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "27.8B",
+  "parameters_raw": 27781427952,
+  "min_ram_gb": 15.5,
+  "recommended_ram_gb": 25.9,
+  "min_vram_gb": 14.2,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5",
+  "hf_downloads": 406808,
+  "hf_likes": 565,
+  "release_date": "2026-02-24",
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-27B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-35B-A3B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "36.0B",
+  "parameters_raw": 35951822704,
+  "min_ram_gb": 20.1,
+  "recommended_ram_gb": 33.5,
+  "min_vram_gb": 18.4,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5_moe",
+  "hf_downloads": 769032,
+  "hf_likes": 905,
+  "release_date": "2026-02-24",
+  "is_moe": true,
+  "num_experts": 256,
+  "active_experts": 8,
+  "active_parameters": 3000000000,
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-35B-A3B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-122B-A10B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "125.1B",
+  "parameters_raw": 125086497008,
+  "min_ram_gb": 69.9,
+  "recommended_ram_gb": 116.5,
+  "min_vram_gb": 64.1,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5_moe",
+  "hf_downloads": 171055,
+  "hf_likes": 389,
+  "release_date": "2026-02-24",
+  "is_moe": true,
+  "num_experts": 256,
+  "active_experts": 8,
+  "active_parameters": 10000000000,
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-122B-A10B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
+ },
+ {
+  "name": "Qwen/Qwen3.5-397B-A17B-MTP",
+  "provider": "Qwen",
+  "parameter_count": "403.4B",
+  "parameters_raw": 403397928944,
+  "min_ram_gb": 225.4,
+  "recommended_ram_gb": 375.7,
+  "min_vram_gb": 206.6,
+  "quantization": "Q4_K_M",
+  "context_length": 262144,
+  "use_case": "General purpose, MTP",
+  "capabilities": [
+   "mtp",
+   "tool_use",
+   "vision"
+  ],
+  "pipeline_tag": "image-text-to-text",
+  "architecture": "qwen3_5_moe",
+  "hf_downloads": 1291825,
+  "hf_likes": 1214,
+  "release_date": "2026-02-16",
+  "is_moe": true,
+  "num_experts": 256,
+  "active_experts": 8,
+  "active_parameters": 17000000000,
+  "gguf_sources": [
+   {
+    "repo": "unsloth/Qwen3.5-397B-A17B-MTP-GGUF",
+    "provider": "unsloth"
+   }
+  ],
+  "_discovered": true
 }
-]
+]
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
    "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
-    "9070 xt": 624, "9070": 488,
+    "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
    # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
    # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
    # before the bare "m_" keys matters less than length-sorting (done below),
@@ -26,7 +26,8 @@ GPU_BANDWIDTH = {
    "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
    "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
    "m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100,
-    "m4 max": 410, "m4 pro": 273, "m4": 120,
+    "m4 max": 546, "m4 pro": 273, "m4": 120,
+    "m5 max": 546, "m5 pro": 273, "m5": 150,
 }

 # Pre-sort keys by length descending for correct substring matching
@@ -69,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
    return None


-def _estimate_speed(model, quant, run_mode, system):
-    """Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
+def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
+    """Estimate tok/s. Uses active params for MoE (only active experts run per token).
+
+    offload_frac (0..1): fraction of the model's weights that spill to system RAM
+    (CPU) because they don't fit VRAM. Generation reads every active weight per
+    token, so when part lives in CPU RAM the per-token time is dominated by the
+    slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
+    system-RAM bandwidth weighted by what's where — far more accurate than a flat
+    "halve it" for partial offload, which under/over-shoots depending on amount.
+    Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
+    light offload → ~59 t/s est vs 59.8 measured.
+    """
    pb = _active_params_b(model)
    is_moe = model.get("is_moe", False)
    bw = _lookup_bandwidth(system.get("gpu_name"))
@@ -82,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
        if model_gb <= 0:
            return 0.0
        efficiency = 0.55
-        raw_tps = (bw / model_gb) * efficiency
        if run_mode == "cpu_offload":
-            mode_factor = 0.5
-        elif is_moe:
-            mode_factor = 0.8
-        else:
-            mode_factor = 1.0
-        return raw_tps * mode_factor
+            # Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be
+            # conservative since offloaded MoE is also compute-bound on CPU.
+            cpu_bw = 55.0
+            frac = min(max(offload_frac, 0.0), 1.0)
+            # If we don't know the fraction (legacy callers pass 0 with
+            # cpu_offload), assume a meaningful spill so we don't overestimate.
+            if frac <= 0.0:
+                frac = 0.5
+            # Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
+            # slow CPU portion dominates as it grows (matches the steep real-world
+            # drop-off when more experts offload).
+            eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
+            raw_tps = (eff_bw / model_gb) * efficiency
+            return raw_tps * (0.8 if is_moe else 1.0)
+        # Fully on GPU.
+        raw_tps = (bw / model_gb) * efficiency
+        return raw_tps * (0.8 if is_moe else 1.0)

    k = FALLBACK_K.get(backend, 70)
    if pb <= 0:
@@ -98,6 +119,27 @@ def _estimate_speed(model, quant, run_mode, system):
    return k / pb * sm


+def _architecture_bonus(model):
+    name = (model.get("name") or "").lower()
+    arch = (model.get("architecture") or "").lower()
+    text = f"{name} {arch}"
+
+    # Keep this intentionally small: hardware fit and speed still matter, but
+    # current model families should not be scored the same as older Qwen2/Llama
+    # era entries just because the parameter count is similar.
+    if "qwen3.6" in text or "qwen3_6" in text:
+        return 9
+    if "qwen3.5" in text or "qwen3_5" in text:
+        return 8
+    if "qwen3-next" in text or "qwen3_next" in text:
+        return 6
+    if "qwen3" in text or arch.startswith("qwen3"):
+        return 4
+    if "qwen2.5" in text or "qwen2_5" in text:
+        return 2
+    return 0
+
+
 def _quality_score(model, quant, use_case):
    pb = params_b(model)
    if pb < 1:
@@ -127,13 +169,21 @@ def _quality_score(model, quant, use_case):
    if "gemma" in name_lower:
        base += 1

+    base += _architecture_bonus(model)
    base += QUANT_QUALITY_PENALTY.get(quant, 0)

    model_uc = infer_use_case(model)
    if model_uc == "coding" and use_case == "coding":
        base += 6
+    elif model_uc == "coding" and use_case in ("general", "chat"):
+        # Coder-specialized models are still useful generally, but they should
+        # not dominate the default scan. If the user wants code, the Coding
+        # filter gives them the boost above.
+        base -= 10
    if model_uc == "reasoning" and use_case == "reasoning" and pb >= 13:
        base += 5
+    elif model_uc == "reasoning" and use_case == "chat":
+        base -= 4
    if model_uc == "multimodal" and use_case == "multimodal":
        base += 6

@@ -196,9 +246,9 @@ def _quant_bits(q):
    Returns 0 when unknown (caller treats unknown as "don't filter")."""
    qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "")
    # GGUF k-quants + float formats
-    if qu.startswith("Q8") or "FP8" in qu:
+    if qu.startswith("Q8") or "FP8" in qu or "INT8" in qu or qu.startswith("W8"):
        return 8
-    if qu.startswith("Q4") or qu.startswith("IQ4"):
+    if qu.startswith("Q4") or qu.startswith("IQ4") or "FP4" in qu or "NF4" in qu or "INT4" in qu or qu.startswith("W4"):
        return 4
    if qu.startswith("Q2") or qu.startswith("IQ2"):
        return 2
@@ -210,7 +260,7 @@ def _quant_bits(q):
        return 6
    if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"):
        return 16
-    # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 …)
+    # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...)
    m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu)
    if m:
        b = int(m.group(1))
@@ -219,12 +269,36 @@ def _quant_bits(q):
    return 0


-def analyze_model(model, system, target_quant=None):
+def _native_quant(model):
+    native_quant = model.get("quantization", "Q4_K_M")
+    name = (model.get("name") or "").lower()
+    fmt = (model.get("format") or "").lower()
+    text = f"{name} {fmt}"
+    if "nvfp4" in text:
+        return "NVFP4"
+    if re.search(r"(^|[-_/])fp8($|[-_/\s])", text):
+        return "FP8"
+    if "gptq" in text:
+        m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
+        return f"GPTQ-{m.group(1)}bit" if m else "GPTQ"
+    if "awq" in text:
+        m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
+        return f"AWQ-{m.group(1)}bit" if m else "AWQ"
+    if "mlx" in text:
+        m = re.search(r"mlx[-_]?(\d{1,2})bit", text)
+        return f"mlx-{m.group(1)}bit" if m else native_quant
+    if not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text):
+        return "INT8"
+    return native_quant
+
+
+def analyze_model(model, system, target_quant=None, scoring_use_case=None, target_context=None):
    pb = params_b(model)
    if pb <= 0:
        return None

-    use_case = infer_use_case(model)
+    model_use_case = infer_use_case(model)
+    score_use_case = scoring_use_case or "general"
    has_gpu = system.get("has_gpu", False)
    gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
    gpu_count = system.get("gpu_count", 1) or 1
@@ -238,9 +312,14 @@ def analyze_model(model, system, target_quant=None):
    gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
    eff_ram = 0 if gpu_only else available_ram
    is_moe = model.get("is_moe", False)
-    ctx = model.get("context_length", 4096) or 4096
+    model_ctx = model.get("context_length", 4096) or 4096
+    try:
+        target_context = int(target_context or 0)
+    except (TypeError, ValueError):
+        target_context = 0
+    ctx = min(model_ctx, target_context) if target_context > 0 else model_ctx

-    native_quant = model.get("quantization", "Q4_K_M")
+    native_quant = _native_quant(model)
    preq = is_prequantized(model)

    # GGUF models can't be sharded across GPUs — use single GPU VRAM
@@ -256,13 +335,22 @@ def analyze_model(model, system, target_quant=None):
    else:
        effective_vram = gpu_vram

+    native_gpu_only = preq and not native_quant.startswith("mlx-")
+
    # Determine which quant to evaluate at
+    native_quant_prefixes = (
+        "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+        "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+    )
+
    if preq:
-        # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
-        # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
-        # native bit-width matches — otherwise selecting Q8 would still surface
-        # AWQ-4bit models, mixing 4- and 8-bit in one view.
+        # Native HF/vLLM quantized repos come at a fixed format. If the user
+        # picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit
+        # AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate
+        # serving paths and only appear when explicitly selected or unfiltered.
        if target_quant:
+            if not any(target_quant.startswith(p) for p in native_quant_prefixes):
+                return None
            _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
            if _tb and _nb and _tb != _nb:
                return None
@@ -270,20 +358,25 @@ def analyze_model(model, system, target_quant=None):
    elif target_quant:
        # User picked a specific quant
        quant_to_try = target_quant
+    elif gpu_count >= 2:
+        # Multi-GPU box: vLLM/SGLang can't serve GGUF Q* quants (those are
+        # llama.cpp-only). Default non-prequantized models to BF16 so the row
+        # is meaningful on a multi-GPU rig. If BF16 doesn't fit, the model
+        # surfaces as too_tight — better than showing a Q4 row the user
+        # can't actually serve with vLLM on >1 GPU.
+        quant_to_try = "BF16"
    else:
-        # Default: Q4_K_M (user's stated preference)
+        # Default: Q4_K_M (user's stated preference) — kept for single-GPU
+        # and RAM modes where llama.cpp serving is the natural path.
        quant_to_try = "Q4_K_M"

-    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram)
+    # Multi-GPU filter: skip the row if the resolved quant is a GGUF tier
+    # (Q*/IQ-prefixed) — vLLM/SGLang can't serve those, so showing them on
+    # a 2+ GPU rig just clutters the list with unservable candidates.
+    if gpu_count >= 2 and quant_to_try and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")):
+        return None

-    # If target quant doesn't fit and it's not pre-quantized, try lower quants
-    if result is None and not preq and target_quant:
-        from services.hwfit.models import QUANT_HIERARCHY
-        idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1
-        for q in QUANT_HIERARCHY[idx + 1:]:
-            result = _try_quant_at(model, q, ctx, effective_vram, eff_ram)
-            if result:
-                break
+    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram)

    if result is None:
        # Model doesn't fit on the user's current hardware. Surface it
@@ -299,7 +392,7 @@ def analyze_model(model, system, target_quant=None):
            "parameter_count": model.get("parameter_count"),
            "params_b": round(pb, 1),
            "is_moe": is_moe,
-            "use_case": use_case,
+            "use_case": model_use_case,
            "fit_level": "too_tight",
            "run_mode": "no_fit",
            "quant": quant_to_try,
@@ -309,7 +402,8 @@ def analyze_model(model, system, target_quant=None):
            "score": 0,
            "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
            "gguf_sources": model.get("gguf_sources", []),
-            "context_length": model.get("context_length", 4096),
+            "context_length": model_ctx,
+            "target_context": target_context or None,
        }

    run_mode, quant, fit_ctx, required_gb = result
@@ -331,14 +425,19 @@ def analyze_model(model, system, target_quant=None):
    else:
        fit_level = "marginal"

-    tps = _estimate_speed(model, quant, run_mode, system)
+    # Fraction of the model that spills to CPU RAM (drives the offload speed
+    # model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
+    offload_frac = 0.0
+    if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
+        offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
+    tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)

-    q_score = _quality_score(model, quant, use_case)
-    s_score = _speed_score(tps, use_case)
+    q_score = _quality_score(model, quant, score_use_case)
+    s_score = _speed_score(tps, score_use_case)
    f_score = _fit_score(required_gb, budget)
-    c_score = _context_score(fit_ctx, use_case)
+    c_score = _context_score(fit_ctx, score_use_case)

-    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
+    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10))
    composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc

    return {
@@ -347,7 +446,7 @@ def analyze_model(model, system, target_quant=None):
        "parameter_count": model.get("parameter_count"),
        "params_b": round(pb, 1),
        "is_moe": is_moe,
-        "use_case": use_case,
+        "use_case": model_use_case,
        "fit_level": fit_level,
        "run_mode": run_mode,
        "quant": quant,
@@ -362,21 +461,67 @@ def analyze_model(model, system, target_quant=None):
            "context": round(c_score, 1),
        },
        "gguf_sources": model.get("gguf_sources", []),
-        "context_length": model.get("context_length", 4096),
+        "context_length": model_ctx,
+        "release_date": model.get("release_date", ""),
+        "target_context": target_context or None,
    }


+def _version_key(name):
+    """Parse the model's version number from its display name so equal-score
+    rows can break ties in favor of the newer release (e.g. M2.7 > M2.5).
+    Returns a float; 0.0 for names with no recognizable version. The regex
+    takes the FIRST number directly following a letter (a bare '-3.1' after a
+    hyphen is not matched): 'MiniMax-M2.7' -> 2.7, 'Qwen3.6-35B' -> 3.6, 'M2' -> 2.0."""
+    import re as _re
+    if not name:
+        return 0.0
+    # Match the version-marker word: a letter followed by a number with
+    # optional decimal, e.g. M2.7, V4, Pro3. Take the first hit; ignore
+    # "B" param-count suffixes (Qwen3-235B should yield 3, not 235).
+    for m in _re.finditer(r"[A-Za-z](\d+(?:\.\d+)?)(?![A-Za-z])", name):
+        val = m.group(1)
+        # Param-count tokens such as "A3B" are rejected by the negative lookahead.
+        # NOTE(review): multi-digit ones can still backtrack ("x22B" matches "2").
+        try:
+            f = float(val)
+        except ValueError:
+            continue
+        # Heuristic: bare integers >= 100 are almost certainly param counts
+        # (1B/3B/8B/70B/235B…), not version numbers. Skip them.
+        if "." not in val and f >= 100:
+            continue
+        return f
+    return 0.0
+
+
 SORT_KEYS = {
-    "score": lambda r: r["score"],
+    # Score sort with version-aware tiebreaker — when two rows tie on
+    # composite score (a common case for the SAME base model in different
+    # versions, e.g. MiniMax-M2.5 vs M2.7 both at the same FP8 budget),
+    # prefer the newer version. Without this, ties resolved to whatever
+    # order they came out of the registry, which let older releases land
+    # above newer ones in user-facing lists.
+    "score": lambda r: (r["score"], _version_key(r.get("name") or "")),
    "speed": lambda r: r["speed_tps"],
    "vram": lambda r: r["required_gb"],
    "params": lambda r: r["params_b"],
    "context": lambda r: r["context"],
+    # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
+    # string sort is chronological. Missing dates sort last (empty < any date,
+    # and we sort reverse=True for newest, so "" lands at the bottom).
+    "newest": lambda r: r.get("release_date") or "",
 }


-def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None):
-    """Rank all models against detected hardware. Returns sorted list of fit results."""
+def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None, target_context=None, fit_only=False):
+    """Rank all models against detected hardware. Returns sorted list of fit results.
+
+    fit_only: when True, drop rows whose fit_level is "too_tight" (model doesn't
+    actually fit on the chosen budget). When False (default), every model is
+    shown — sorting by Param means highest-param PERIOD, even ones that won't
+    run, so the user can see the truth.
+    """
    models = get_models()
    results = []

@@ -418,21 +563,30 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
            results.sort(key=sort_fn, reverse=(sort != "vram"))
            return results[:limit]

-    # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
-    filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8"))
+    # If user picked a native prequantized format, filter to only those models.
+    filter_native = quant and any(quant.startswith(p) for p in (
+        "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+        "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+    ))

    system_backend = (system.get("backend") or "").lower()
    apple_silicon = system_backend in ("mps", "metal", "apple")
+    # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
+    # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
+    # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
+    # are largely unsupported there and FP8 needs out-of-tree patches. So treat
+    # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
+    # Unknown family (no rocminfo) is left untouched to avoid hiding models from
+    # a possibly-capable Instinct box on a misdetect.
+    gpu_family = (system.get("gpu_family") or "").lower()
+    consumer_amd = system_backend == "rocm" and gpu_family == "rdna"

    for m in models:
-        native_q = m.get("quantization", "")
+        native_q = _native_quant(m)

-        # MLX-quantized models need the MLX runtime (mlx_lm), which Odysseus
-        # doesn't generate serve commands for — only llama.cpp/Ollama (Metal)
-        # and vLLM/SGLang (CUDA). MLX repos ship no GGUF alternative, so they're
-        # unrunnable on every backend we support. Always drop them, on Apple
-        # Silicon too, so the Cookbook never recommends a model it can't serve.
-        if native_q.startswith("mlx-"):
+        # MLX needs the mlx_lm runtime, which Odysseus does not generate serve
+        # commands for. Hide it on every backend, including Metal.
+        if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
            continue

        # On Apple Silicon the only serving engines are llama.cpp and Ollama,
@@ -442,17 +596,28 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
        # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
        # this the Cookbook recommends models the Mac can't run; on CUDA these
        # stay visible because vLLM serves safetensors directly.
-        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
+        #
+        # Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
+        # servable path, so a model needs a real GGUF to be recommended.
+        # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
+        # Radeon that can't actually serve them.
+        if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
            continue

-        # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
+        # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
        if filter_native:
            if quant == "FP8" and native_q != "FP8":
                continue
+            if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"):
+                continue
            if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
                continue
            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
                continue
+            if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
+                continue
+            if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant:
+                continue

        if search:
            name = m.get("name", "").lower()
@@ -460,7 +625,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
            if search.lower() not in name and search.lower() not in provider:
                continue

-        result = analyze_model(m, system, target_quant=quant)
+        result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"), target_context=target_context)
        if result is None:
            continue

@@ -471,14 +636,18 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan

        results.append(result)

-    # Pick the visible SET by best fit (score) first, so it stays the same no
-    # matter which column the user sorts by — otherwise sorting by params would
-    # truncate to the N biggest models (huge ones that don't even fit) while
-    # sorting by vram showed the N smallest. Only AFTER choosing the set do we
-    # order it by the requested column.
-    results.sort(key=SORT_KEYS["score"], reverse=True)
-    results = results[:limit]
+    # Pick the visible SET by the REQUESTED column. Per-user feedback: sorting
+    # by Param should show the highest-param models PERIOD, not just those that
+    # already fit. Same for every other column. Models that don't fit are still
+    # in the list with their fit_level marking the constraint, so the user can
+    # see the truth instead of a quietly-truncated view. Score sort is unchanged
+    # (it's the default ranking and naturally pushes non-fits to the bottom).
+    if fit_only:
+        # Hide rows that definitely don't fit (the "too_tight" badge) — user
+        # explicitly asked for a Fit-only view.
+        results = [r for r in results if r.get("fit_level") != "too_tight"]
    sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
    # vram ascending (smallest first), everything else descending (biggest first)
    results.sort(key=sort_fn, reverse=(sort != "vram"))
+    results = results[:limit]
    return results
--- a/services/hwfit/models.py
+++ b/services/hwfit/models.py
@@ -5,7 +5,7 @@ import re
 QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]

 QUANT_BPP = {
-    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
@@ -14,7 +14,7 @@ QUANT_BPP = {
 }

 QUANT_SPEED_MULT = {
-    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
+    "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "INT8": 0.85, "NVFP4": 1.1,
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
@@ -23,16 +23,20 @@ QUANT_SPEED_MULT = {
 }

 QUANT_QUALITY_PENALTY = {
-    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
-    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
+    "F16": 0.0, "BF16": 0.0, "FP8": 0.0, "INT8": 0.0, "NVFP4": -0.5,
+    "Q8_0": -0.5, "Q6_K": -1.5, "Q5_K_M": -2.5,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
-    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
-    "GPTQ-Int4": -3.0, "GPTQ-Int8": 0.0,
-    "mlx-4bit": -4.0, "mlx-8bit": 0.0, "mlx-6bit": -1.0,
+    # "AWQ-8bit"/"GPTQ-Int8" used to be 0.0 (tied with FP8). In practice these
+    # are calibrated reconstructions, not raw 8-bit weights — there's a small
+    # but real quality loss vs FP8. Give them (and the new bare "AWQ"/"GPTQ"
+    # keys) -1.0 so FP8 wins when both fit; 4-bit variants move -3.0 -> -4.0.
+    "AWQ": -1.0, "AWQ-4bit": -4.0, "AWQ-8bit": -1.0,
+    "GPTQ": -1.0, "GPTQ-Int4": -4.0, "GPTQ-Int8": -1.0,
+    "mlx-4bit": -4.0, "mlx-8bit": -0.5, "mlx-6bit": -1.5,
 }

 QUANT_BYTES_PER_PARAM = {
-    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
@@ -41,12 +45,21 @@ QUANT_BYTES_PER_PARAM = {
 }

 # Pre-quantized formats that should NOT go through the GGUF quant hierarchy
-PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")
+PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "INT8", "NVFP4")


 def is_prequantized(model):
    q = model.get("quantization", "")
-    return any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    name = (model.get("name") or "").lower()
+    fmt = (model.get("format") or "").lower()
+    text = f"{name} {fmt}"
+    return (
+        "nvfp4" in text
+        or re.search(r"(^|[-_/])fp8($|[-_/\s])", text) is not None
+        or (not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text) is not None)
+        or any(x in text for x in ("awq", "gptq", "mlx"))
+        or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    )


 def params_b(model):