From cf5c5118d869149ef239733ecdc8b300ed69666b Mon Sep 17 00:00:00 2001
From: raf <146721410+rafdog1222@users.noreply.github.com>
Date: Thu, 4 Jun 2026 21:25:36 +0800
Subject: [PATCH] fix(hwfit): return no_fit instead of None when target_quant
 is a GGUF tier on multi-GPU (#2375)

The multi-GPU GGUF filter at fit.py:380 returned None for Q*/IQ quants
on 2+ GPU systems even when the caller had passed target_quant
explicitly. A caller who passes target_quant is asking "what happens if
I try this?" and expects a structured no_fit response, not a silent
None.

Fix: skip the filter when target_quant is provided (truthy), so the call
falls through to _try_quant_at and the existing no_fit path.

Fixes #
---
 services/hwfit/fit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py
index 3136d7b..9a45b53 100644
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -377,7 +377,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None, targe
     # Multi-GPU filter: skip the row if the resolved quant is a GGUF tier
     # (Q*/IQ-prefixed) — vLLM/SGLang can't serve those, so showing them on
     # a 2+ GPU rig just clutters the list with unservable candidates.
-    if gpu_count >= 2 and quant_to_try and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")):
+    if gpu_count >= 2 and quant_to_try and not target_quant and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")):
         return None
 
     result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram)