Files
odysseus/services/hwfit/fit.py
Zen0-99 7188737294 fix(hwfit): filter non-GGUF models on Windows (#2530)
Odysseus only supports llama.cpp on Windows (vLLM/SGLang are
explicitly blocked). llama.cpp requires GGUF, so AWQ/GPTQ/FP8
safetensors models without a GGUF alternate should not be
recommended in the Cookbook on Windows hosts.

Changes:
- hardware.py: add 'platform': 'windows' to _detect_windows()
  so downstream logic can identify Windows hosts.
- fit.py: include is_windows in the existing GGUF-only filter
  alongside apple_silicon and consumer_amd.
- tests: add test_hwfit_windows.py with regression tests.

Fixes #122, #614 (root cause: unservable models recommended).
2026-06-04 20:02:13 +02:00

676 lines
29 KiB
Python

import re
from services.hwfit.models import (
params_b, estimate_memory_gb, infer_use_case,
get_models, is_prequantized, _active_params_b, QUANT_BYTES_PER_PARAM,
QUANT_SPEED_MULT, QUANT_QUALITY_PENALTY,
)
GPU_BANDWIDTH = {
"5090": 1792, "5080": 960, "5070 ti": 896, "5070": 672, "5060 ti": 448, "5060": 256,
"4090": 1008, "4080 super": 736, "4080": 717, "4070 ti super": 672, "4070 ti": 504, "4070 super": 504, "4070": 504, "4060 ti": 288, "4060": 272,
"3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360,
"2080 ti": 616, "2080 super": 496, "2080": 448, "2070 super": 448, "2070": 448, "2060 super": 448, "2060": 336,
"1660 ti": 288, "1660 super": 336, "1660": 192, "1650 super": 192, "1650": 128,
"h100 sxm": 3350, "h100": 2039, "h200": 4800, "a100 sxm": 2039, "a100": 1555,
"l40s": 864, "l40": 864, "l4": 300, "a10g": 600, "a10": 600, "t4": 320,
"v100 sxm": 900, "v100": 897, "a6000": 768, "a5000": 768, "a4000": 448,
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
"9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
# before the bare "m_" keys matters less than length-sorting (done below),
# which guarantees "m4 max" is tried before "m4".
"m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
"m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
"m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100,
"m4 max": 546, "m4 pro": 273, "m4": 120,
"m5 max": 546, "m5 pro": 273, "m5": 150,
}
# Pre-sort keys by length descending for correct substring matching
_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
# metal: backstop for Apple Silicon chips not in GPU_BANDWIDTH (e.g. a future
# M5) — the named chips above take the accurate bandwidth path instead.
FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90}
USE_CASE_WEIGHTS = {
"general": (0.45, 0.30, 0.15, 0.10),
"coding": (0.50, 0.20, 0.15, 0.15),
"reasoning": (0.55, 0.15, 0.15, 0.15),
"chat": (0.40, 0.35, 0.15, 0.10),
"multimodal": (0.50, 0.20, 0.15, 0.15),
"embedding": (0.30, 0.40, 0.20, 0.10),
"tts": (0.40, 0.35, 0.15, 0.10),
"stt": (0.40, 0.35, 0.15, 0.10),
}
SPEED_TARGET = {
"general": 40, "coding": 40, "multimodal": 40, "chat": 40,
"reasoning": 25, "embedding": 200, "tts": 40, "stt": 40,
}
CONTEXT_TARGET = {
"general": 4096, "chat": 4096, "coding": 8192,
"reasoning": 8192, "multimodal": 4096, "embedding": 512,
"tts": 2048, "stt": 2048,
}
def _lookup_bandwidth(gpu_name):
if not isinstance(gpu_name, str) or not gpu_name:
return None
gn = gpu_name.lower()
for key in _BW_KEYS_SORTED:
if key in gn:
return GPU_BANDWIDTH[key]
return None
def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
"""Estimate tok/s. Uses active params for MoE (only active experts run per token).
offload_frac (0..1): fraction of the model's weights that spill to system RAM
(CPU) because they don't fit VRAM. Generation reads every active weight per
token, so when part lives in CPU RAM the per-token time is dominated by the
slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
system-RAM bandwidth weighted by what's where — far more accurate than a flat
"halve it" for partial offload, which under/over-shoots depending on amount.
Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
light offload → ~59 t/s est vs 59.8 measured.
"""
pb = _active_params_b(model)
is_moe = model.get("is_moe", False)
bw = _lookup_bandwidth(system.get("gpu_name"))
backend = system.get("backend", "cpu_x86")
if bw and run_mode in ("gpu", "cpu_offload"):
bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5)
model_gb = pb * bpp
if model_gb <= 0:
return 0.0
efficiency = 0.55
if run_mode == "cpu_offload":
# Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be
# conservative since offloaded MoE is also compute-bound on CPU.
cpu_bw = 55.0
frac = min(max(offload_frac, 0.0), 1.0)
# If we don't know the fraction (legacy callers pass 0 with
# cpu_offload), assume a meaningful spill so we don't overestimate.
if frac <= 0.0:
frac = 0.5
# Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
# slow CPU portion dominates as it grows (matches the steep real-world
# drop-off when more experts offload).
eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
raw_tps = (eff_bw / model_gb) * efficiency
return raw_tps * (0.8 if is_moe else 1.0)
# Fully on GPU.
raw_tps = (bw / model_gb) * efficiency
return raw_tps * (0.8 if is_moe else 1.0)
k = FALLBACK_K.get(backend, 70)
if pb <= 0:
return 0.0
sm = QUANT_SPEED_MULT.get(quant, 1.0)
return k / pb * sm
def _architecture_bonus(model):
name = (model.get("name") or "").lower()
arch = (model.get("architecture") or "").lower()
text = f"{name} {arch}"
# Keep this intentionally small: hardware fit and speed still matter, but
# current model families should not be scored the same as older Qwen2/LLama
# era entries just because the parameter count is similar.
if "qwen3.6" in text or "qwen3_6" in text:
return 9
if "qwen3.5" in text or "qwen3_5" in text:
return 8
if "qwen3-next" in text or "qwen3_next" in text:
return 6
if "qwen3" in text or arch.startswith("qwen3"):
return 4
if "qwen2.5" in text or "qwen2_5" in text:
return 2
return 0
def _quality_score(model, quant, use_case):
pb = params_b(model)
if pb < 1:
base = 30
elif pb < 3:
base = 45
elif pb < 7:
base = 60
elif pb < 10:
base = 75
elif pb < 20:
base = 82
elif pb < 40:
base = 89
else:
base = 95
name_lower = model.get("name", "").lower()
if "qwen" in name_lower:
base += 2
if "deepseek" in name_lower:
base += 3
if "llama" in name_lower:
base += 2
if "mistral" in name_lower or "mixtral" in name_lower:
base += 1
if "gemma" in name_lower:
base += 1
base += _architecture_bonus(model)
base += QUANT_QUALITY_PENALTY.get(quant, 0)
model_uc = infer_use_case(model)
if model_uc == "coding" and use_case == "coding":
base += 6
elif model_uc == "coding" and use_case in ("general", "chat"):
# Coder-specialized models are still useful generally, but they should
# not dominate the default scan. If the user wants code, the Coding
# filter gives them the boost above.
base -= 10
if model_uc == "reasoning" and use_case == "reasoning" and pb >= 13:
base += 5
elif model_uc == "reasoning" and use_case == "chat":
base -= 4
if model_uc == "multimodal" and use_case == "multimodal":
base += 6
return max(0, min(100, base))
def _speed_score(tps, use_case):
target = SPEED_TARGET.get(use_case, 40)
return max(0, min(100, (tps / target) * 100))
def _fit_score(required, available):
if required > available:
return 0
if available <= 0:
return 0
ratio = required / available
if ratio <= 0.5:
return 60 + (ratio / 0.5) * 40
if ratio <= 0.8:
return 100
if ratio <= 0.9:
return 70
return 50
def _context_score(ctx, use_case):
target = CONTEXT_TARGET.get(use_case, 4096)
if ctx >= target:
return 100
if ctx >= target / 2:
return 70
return 30
def _try_quant_at(model, quant, ctx, gpu_vram, available_ram):
"""Try a specific quant at a given context. Returns (run_mode, quant, ctx, mem) or None."""
mem = estimate_memory_gb(model, quant, ctx)
if gpu_vram > 0 and mem <= gpu_vram:
return "gpu", quant, ctx, mem
if gpu_vram > 0 and mem <= available_ram:
return "cpu_offload", quant, ctx, mem
if gpu_vram <= 0 and mem <= available_ram:
return "cpu_only", quant, ctx, mem
# Try halving context
cur_ctx = ctx // 2
while cur_ctx >= 1024:
mem = estimate_memory_gb(model, quant, cur_ctx)
if gpu_vram > 0 and mem <= gpu_vram:
return "gpu", quant, cur_ctx, mem
if mem <= available_ram:
return ("cpu_offload" if gpu_vram > 0 else "cpu_only"), quant, cur_ctx, mem
cur_ctx //= 2
return None
def _quant_bits(q):
"""Approximate bit-width of a quant label so GGUF quant tiers (Q4/Q8/…) can
be matched against prequantized formats (AWQ 4, AWQ-8bit, FP8, GPTQ-4bit…).
Returns 0 when unknown (caller treats unknown as "don't filter")."""
qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "")
# GGUF k-quants + float formats
if qu.startswith("Q8") or "FP8" in qu or "INT8" in qu or qu.startswith("W8"):
return 8
if qu.startswith("Q4") or qu.startswith("IQ4") or "FP4" in qu or "NF4" in qu or "INT4" in qu or qu.startswith("W4"):
return 4
if qu.startswith("Q2") or qu.startswith("IQ2"):
return 2
if qu.startswith("Q3") or qu.startswith("IQ3"):
return 3
if qu.startswith("Q5"):
return 5
if qu.startswith("Q6"):
return 6
if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"):
return 16
# Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...)
m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu)
if m:
b = int(m.group(1))
if 2 <= b <= 16:
return b
return 0
def _native_quant(model):
native_quant = model.get("quantization", "Q4_K_M")
name = (model.get("name") or "").lower()
fmt = (model.get("format") or "").lower()
text = f"{name} {fmt}"
if "nvfp4" in text:
return "NVFP4"
if re.search(r"(^|[-_/])fp8($|[-_/\s])", text):
return "FP8"
if "gptq" in text:
m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
# Canonical catalog label is "GPTQ-Int4"/"GPTQ-Int8" (see models.py
# QUANT_BPP / QUANT_QUALITY_PENALTY keys); "GPTQ-4bit" misses both
# maps, so BPP and the quality penalty silently fall to defaults.
return f"GPTQ-Int{m.group(1)}" if m else "GPTQ-Int4"
if "awq" in text:
m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
# Catalog keys are "AWQ-4bit"/"AWQ-8bit"; bare "AWQ" misses the maps.
return f"AWQ-{m.group(1)}bit" if m else "AWQ-4bit"
if "mlx" in text:
m = re.search(r"mlx[-_]?(\d{1,2})bit", text)
return f"mlx-{m.group(1)}bit" if m else native_quant
if not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text):
return "INT8"
return native_quant
def analyze_model(model, system, target_quant=None, scoring_use_case=None, target_context=None):
pb = params_b(model)
if pb <= 0:
return None
model_use_case = infer_use_case(model)
score_use_case = scoring_use_case or "general"
has_gpu = system.get("has_gpu", False)
gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
gpu_count = system.get("gpu_count", 1) or 1
single_gpu_vram = gpu_vram / gpu_count if gpu_count > 1 else gpu_vram
available_ram = system.get("available_ram_gb", 0)
# When the user has explicitly picked a GPU config (not RAM mode), they want
# to see what runs ON the GPU(s) — not big models that only "fit" by spilling
# most layers to system RAM. Zeroing the offload budget makes _try_quant_at
# take only its GPU branches (fit on VRAM, shrinking context if needed),
# otherwise return None. Fixes "96 GB GPU still lists a 175 GB model".
gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
eff_ram = 0 if gpu_only else available_ram
is_moe = model.get("is_moe", False)
model_ctx = model.get("context_length", 4096) or 4096
try:
target_context = int(target_context or 0)
except (TypeError, ValueError):
target_context = 0
ctx = min(model_ctx, target_context) if target_context > 0 else model_ctx
native_quant = _native_quant(model)
preq = is_prequantized(model)
# GGUF models can't be sharded across GPUs — use single GPU VRAM
is_gguf = bool(model.get("gguf_sources"))
quant_upper = (native_quant or "").upper()
is_gguf_quant = any(quant_upper.startswith(p) for p in ("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ", "F16", "F32"))
# Single-GPU VRAM only applies to GGUF/dense builds (llama.cpp can't shard
# across GPUs). Prequantized formats (AWQ/GPTQ/FP8) are served sharded by
# vLLM across all GPUs, so they get the FULL multi-GPU VRAM — even when the
# model also lists a GGUF alternate download (gguf_sources).
if (is_gguf or is_gguf_quant) and not preq:
effective_vram = single_gpu_vram
else:
effective_vram = gpu_vram
native_gpu_only = preq and not native_quant.startswith("mlx-")
# Determine which quant to evaluate at
native_quant_prefixes = (
"AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
"INT4", "INT8", "W4A16", "W8A8", "W8A16",
)
if preq:
# Native HF/vLLM quantized repos come at a fixed format. If the user
# picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit
# AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate
# serving paths and only appear when explicitly selected or unfiltered.
if target_quant:
if not any(target_quant.startswith(p) for p in native_quant_prefixes):
return None
_tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
if _tb and _nb and _tb != _nb:
return None
quant_to_try = native_quant
elif target_quant:
# User picked a specific quant
quant_to_try = target_quant
elif gpu_count >= 2:
# Multi-GPU box: vLLM/SGLang can't serve GGUF Q* quants (those are
# llama.cpp-only). Default non-prequantized models to BF16 so the row
# is meaningful on a multi-GPU rig. If BF16 doesn't fit, the model
# surfaces as too_tight — better than showing a Q4 row the user
# can't actually serve with vLLM on >1 GPU.
quant_to_try = "BF16"
else:
# Default: Q4_K_M (user's stated preference) — kept for single-GPU
# and RAM modes where llama.cpp serving is the natural path.
quant_to_try = "Q4_K_M"
# Multi-GPU filter: skip the row if the resolved quant is a GGUF tier
# (Q*/IQ-prefixed) — vLLM/SGLang can't serve those, so showing them on
# a 2+ GPU rig just clutters the list with unservable candidates.
if gpu_count >= 2 and quant_to_try and not target_quant and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")):
return None
result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram)
if result is None:
# Model doesn't fit on the user's current hardware. Surface it
# anyway with a "too_tight" badge instead of silently dropping
# it — without this, editing the hardware config to try LARGER
# tiers never revealed the bigger models, because they were
# filtered out before the user could see what would fit. The
# client already knows how to render too_tight (red row).
oversized_required = estimate_memory_gb(model, quant_to_try, ctx)
return {
"name": model.get("name"),
"provider": model.get("provider"),
"parameter_count": model.get("parameter_count"),
"params_b": round(pb, 1),
"is_moe": is_moe,
"use_case": model_use_case,
"fit_level": "too_tight",
"run_mode": "no_fit",
"quant": quant_to_try,
"context": ctx,
"required_gb": round(oversized_required, 1),
"speed_tps": 0,
"score": 0,
"scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
"gguf_sources": model.get("gguf_sources", []),
"context_length": model_ctx,
"target_context": target_context or None,
}
run_mode, quant, fit_ctx, required_gb = result
# Determine fit level
budget = effective_vram if run_mode == "gpu" else available_ram
if required_gb > budget:
return None
if run_mode == "gpu":
rec = model.get("recommended_ram_gb") or required_gb
if rec <= gpu_vram:
fit_level = "perfect"
elif gpu_vram >= required_gb * 1.2:
fit_level = "good"
else:
fit_level = "marginal"
elif run_mode == "cpu_offload":
fit_level = "good" if available_ram >= required_gb * 1.2 else "marginal"
else:
fit_level = "marginal"
# Fraction of the model that spills to CPU RAM (drives the offload speed
# model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
offload_frac = 0.0
if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
q_score = _quality_score(model, quant, score_use_case)
s_score = _speed_score(tps, score_use_case)
f_score = _fit_score(required_gb, budget)
c_score = _context_score(fit_ctx, score_use_case)
wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10))
composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc
return {
"name": model.get("name"),
"provider": model.get("provider"),
"parameter_count": model.get("parameter_count"),
"params_b": round(pb, 1),
"is_moe": is_moe,
"use_case": model_use_case,
"fit_level": fit_level,
"run_mode": run_mode,
"quant": quant,
"context": fit_ctx,
"required_gb": round(required_gb, 1),
"speed_tps": round(tps, 1),
"score": round(composite, 1),
"scores": {
"quality": round(q_score, 1),
"speed": round(s_score, 1),
"fit": round(f_score, 1),
"context": round(c_score, 1),
},
"gguf_sources": model.get("gguf_sources", []),
"context_length": model_ctx,
"release_date": model.get("release_date", ""),
"target_context": target_context or None,
}
def _version_key(name):
"""Parse the model's version number from its display name so equal-score
rows can break ties in favor of the newer release (e.g. M2.7 > M2.5).
Returns a float; 0.0 for names with no recognizable version. The regex
grabs the FIRST 'word-with-digits' pattern after a hyphen/underscore,
so e.g. 'MiniMax-M2.7' -> 2.7, 'Qwen3.6-35B' -> 3.6, 'M2' -> 2.0."""
import re as _re
if not name:
return 0.0
# Match the version-marker word: a letter followed by a number with
# optional decimal, e.g. M2.7, V4, Pro3. Take the first hit; ignore
# "B" param-count suffixes (Qwen3-235B should yield 3, not 235).
for m in _re.finditer(r"[A-Za-z](\d+(?:\.\d+)?)(?![A-Za-z])", name):
val = m.group(1)
# Skip param-count tokens (e.g. "235B" gives "235" but the next
# char would be "B" — already excluded by the negative lookahead).
try:
f = float(val)
except ValueError:
continue
# Heuristic: bare integers >= 100 are almost certainly param counts
# (1B/3B/8B/70B/235B…), not version numbers. Skip them.
if "." not in val and f >= 100:
continue
return f
return 0.0
SORT_KEYS = {
# Score sort with version-aware tiebreaker — when two rows tie on
# composite score (a common case for the SAME base model in different
# versions, e.g. MiniMax-M2.5 vs M2.7 both at the same FP8 budget),
# prefer the newer version. Without this, ties resolved to whatever
# order they came out of the registry, which let older releases land
# above newer ones in user-facing lists.
"score": lambda r: (r["score"], _version_key(r.get("name") or "")),
"speed": lambda r: r["speed_tps"],
"vram": lambda r: r["required_gb"],
"params": lambda r: r["params_b"],
"context": lambda r: r["context"],
# Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
# string sort is chronological. Missing dates sort last (empty < any date,
# and we sort reverse=True for newest, so "" lands at the bottom).
"newest": lambda r: r.get("release_date") or "",
}
def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None, target_context=None, fit_only=False):
"""Rank all models against detected hardware. Returns sorted list of fit results.
fit_only: when True, drop rows whose fit_level is "too_tight" (model doesn't
actually fit on the chosen budget). When False (default), every model is
shown — sorting by Param means highest-param PERIOD, even ones that won't
run, so the user can see the truth.
"""
models = get_models()
results = []
# Include image gen models only when explicitly filtered
if use_case == "image_gen":
try:
from services.hwfit.image_models import rank_image_models
except ImportError:
rank_image_models = None
if rank_image_models:
img_results = rank_image_models(system, search=search)
else:
img_results = []
for im in img_results:
fit_map = {"perfect": "perfect", "good": "good", "tight": "marginal", "no_fit": "too_tight", "no_gpu": "too_tight"}
results.append({
"name": im["id"],
"provider": im["provider"],
"parameter_count": f"{im['params_b']}B",
"params_b": im["params_b"],
"is_moe": False,
"use_case": "image_gen",
"fit_level": fit_map.get(im["fit"], "too_tight"),
"run_mode": "gpu" if im["fits"] else "no_fit",
"quant": im.get("quant", "BF16"),
"context": 0,
"context_length": 0,
"required_gb": round(im.get("vram_needed") or 0, 1),
"speed_tps": 0,
"score": float(im["score"]),
"scores": {"quality": float(im["quality"]), "speed": float(im["speed"]), "fit": 0, "context": 0},
"gguf_sources": [],
"is_image_gen": True,
"capabilities": im.get("capabilities", []),
"description": im.get("description", ""),
})
if use_case == "image_gen":
sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
results.sort(key=sort_fn, reverse=True) # see main path below
return results[:limit]
# If user picked a native prequantized format, filter to only those models.
filter_native = quant and any(quant.startswith(p) for p in (
"AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
"INT4", "INT8", "W4A16", "W8A8", "W8A16",
))
system_backend = (system.get("backend") or "").lower()
apple_silicon = system_backend in ("mps", "metal", "apple")
rocm = system_backend == "rocm"
is_windows = system.get("platform") == "windows"
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
# are largely unsupported there and FP8 needs out-of-tree patches. So treat
# consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
# Unknown family (no rocminfo) is left untouched to avoid hiding models from
# a possibly-capable Instinct box on a misdetect.
gpu_family = (system.get("gpu_family") or "").lower()
consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
for m in models:
native_q = _native_quant(m)
# MLX needs the mlx_lm runtime, which Odysseus does not generate serve
# commands for. Hide it on every backend, including Metal.
if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
continue
# ROCm support for vLLM/SGLang quantized safetensors is too brittle to
# recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
# only when the user explicitly picks that format from the quant filter;
# otherwise prefer GGUF/Q* entries that Odysseus can route through
# llama.cpp/Ollama without pretending "fits VRAM" means "servable".
if rocm and is_prequantized(m) and not filter_native:
continue
# On Apple Silicon the only serving engines are llama.cpp and Ollama,
# both GGUF-only (vLLM/SGLang are CUDA/ROCm and don't run on macOS). So
# a model is Metal-servable ONLY if it ships a real GGUF. Drop everything
# else — raw safetensors repos (which the catalog still tags with a
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
# this the Cookbook recommends models the Mac can't run; on CUDA these
# stay visible because vLLM serves safetensors directly.
#
# Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
# servable path, so a model needs a real GGUF to be recommended.
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
# Radeon that can't actually serve them.
#
# Windows is the same: Odysseus only supports llama.cpp on Windows,
# which requires GGUF. vLLM/SGLang are explicitly blocked, so AWQ/GPTQ
# models without a GGUF source are unservable there.
if (apple_silicon or consumer_amd or is_windows) and not (m.get("is_gguf") or m.get("gguf_sources")):
continue
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
if filter_native:
if quant == "FP8" and native_q != "FP8":
continue
if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"):
continue
if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
continue
if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
continue
if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
continue
if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant:
continue
if search:
name = m.get("name", "").lower()
provider = m.get("provider", "").lower()
if search.lower() not in name and search.lower() not in provider:
continue
result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"), target_context=target_context)
if result is None:
continue
if use_case:
model_uc = infer_use_case(m)
if use_case != model_uc and use_case != "general":
continue
results.append(result)
# Pick the visible SET by the REQUESTED column. Per-user feedback: sorting
# by Param should show the highest-param models PERIOD, not just those that
# already fit. Same for every other column. Models that don't fit are still
# in the list with their fit_level marking the constraint, so the user can
# see the truth instead of a quietly-truncated view. Score sort is unchanged
# (it's the default ranking and naturally pushes non-fits to the bottom).
if fit_only:
# Hide rows that definitely don't fit (the "too_tight" badge) — user
# explicitly asked for a Fit-only view.
results = [r for r in results if r.get("fit_level") != "too_tight"]
sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
# Always sort descending then truncate top-N so each column shows the
# global highest by that metric. Before, vram was special-cased
# ascending → truncate kept the 50 SMALLEST models and "highest VRAM"
# could never appear, breaking the column-click toggle.
results.sort(key=sort_fn, reverse=True)
results = results[:limit]
return results