79 lines
2.2 KiB
Python
79 lines
2.2 KiB
Python
from services.hwfit.fit import analyze_model, rank_models
|
|
from services.hwfit.models import (
|
|
get_models,
|
|
infer_quantization_from_name,
|
|
is_prequantized,
|
|
)
|
|
|
|
|
|
def _dual_5060ti_system():
|
|
return {
|
|
"has_gpu": True,
|
|
"backend": "cuda",
|
|
"gpu_name": "NVIDIA GeForce RTX 5060 Ti",
|
|
"gpu_vram_gb": 31.0,
|
|
"gpu_count": 2,
|
|
"available_ram_gb": 128.0,
|
|
"total_ram_gb": 128.0,
|
|
}
|
|
|
|
|
|
def test_infers_native_hf_quant_formats_from_repo_names():
    """Check infer_quantization_from_name() against known repo-name suffixes.

    Each key is a repo name and each value is the quantization label that
    must be inferred from it.
    """
    expected_by_repo = {
        "txn545/Qwen3.5-122B-A10B-NVFP4": "NVFP4",
        "some/model-MXFP4": "MXFP4",
        "some/model-NF4": "NF4",
        "some/model-FP4": "FP4",
        "some/model-W4A16": "W4A16",
        "some/model-W8A8": "W8A8",
        "some/model-W8A16": "W8A16",
        "some/model-INT4": "INT4",
        "some/model-8bit": "INT8",
    }
    # Infer every name first, then compare the whole mapping in one assert so
    # a failure reports all mismatching entries at once.
    inferred_by_repo = dict(
        zip(expected_by_repo, map(infer_quantization_from_name, expected_by_repo))
    )
    assert inferred_by_repo == expected_by_repo
|
|
|
|
|
|
def test_nvfp4_catalog_quant_is_preserved():
    """The NVFP4 catalog entry keeps its quantization and is pre-quantized.

    Looks up "txn545/Qwen3.5-122B-A10B-NVFP4" in get_models() and checks both
    its "quantization" field and the is_prequantized() verdict.
    """
    # Index the catalog by model name; a later duplicate name overwrites an
    # earlier one, and a missing name raises KeyError on lookup below.
    models_by_name = {}
    for entry in get_models():
        models_by_name[entry["name"]] = entry

    nvfp4_model = models_by_name["txn545/Qwen3.5-122B-A10B-NVFP4"]

    assert nvfp4_model["quantization"] == "NVFP4"
    assert is_prequantized(nvfp4_model)
|
|
|
|
|
|
def test_nvfp4_search_result_is_not_gguf_or_cpu_offload():
    """The NVFP4 model keeps its quant and avoids CPU offload on the dual-GPU host.

    Two paths are checked against the same expectations (quant "NVFP4",
    run_mode other than "cpu_offload"):

    * analyze_model() called directly on the catalog entry, and
    * the matching entry returned by rank_models() for a name search.
    """
    model_name = "txn545/Qwen3.5-122B-A10B-NVFP4"

    catalog = {m["name"]: m for m in get_models()}
    model = catalog[model_name]

    fit = analyze_model(model, _dual_5060ti_system())
    assert fit["quant"] == "NVFP4"
    assert fit["run_mode"] != "cpu_offload"

    results = rank_models(
        _dual_5060ti_system(),
        search="Qwen3.5-122B-A10B-NVFP4",
        limit=10,
    )
    # Use a default so a missing hit fails as a readable assertion instead of
    # an opaque StopIteration raised from a bare next().
    hit = next((r for r in results if r["name"] == model_name), None)
    assert hit is not None, f"{model_name!r} not found in rank_models() search results"
    assert hit["quant"] == "NVFP4"
    assert hit["run_mode"] != "cpu_offload"
|
|
|
|
|
|
def test_selected_gguf_quant_is_strict_not_lower_quant_fallback():
    """An explicit target_quant is reported as-is even when it does not fit.

    A 100B-parameter model is analyzed with target_quant="Q8_0" on the
    dual-GPU host with RAM reduced to 80 GB.  The result must keep quant
    "Q8_0" and report run_mode "no_fit".
    """
    huge_model = dict(
        name="local/Huge-GGUF",
        provider="local",
        parameter_count="100B",
        parameters_raw=100_000_000_000,
        quantization="Q4_K_M",
        context_length=4096,
    )

    # Same host as the shared fixture, but with both RAM figures cut to 80 GB.
    constrained_system = {
        **_dual_5060ti_system(),
        "available_ram_gb": 80.0,
        "total_ram_gb": 80.0,
    }

    result = analyze_model(huge_model, constrained_system, target_quant="Q8_0")

    assert result["quant"] == "Q8_0"
    assert result["run_mode"] == "no_fit"
|