odysseus/tests/test_cookbook_cpu_only_serve.py

"""Regression guard for issue #1291 — CPU-only serve still emitted GPU-only flags.

The llama.cpp serve command builder (static/js/cookbook.js) added
`--flash-attn on` and exported `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` from
independent toggles, so a CPU-only config (`-ngl 0`, often with flash-attn left
on by an Auto profile) produced a command that mixes "zero GPU layers" with
CUDA/flash-attn and fails to start. The builder now drops those GPU-only flags
when ngl == 0, per the maintainer's guidance.

cookbook.js pulls in browser globals so it can't run under node; guard the fix
at the source level: a `_cpuOnly` gate exists and is applied to flash-attn and
the CUDA unified-memory env.
"""
import re
from pathlib import Path

SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js"


def test_cpu_only_drops_gpu_only_flags():
    text = SRC.read_text(encoding="utf-8")
    # A CPU-only flag derived from ngl == 0.
    assert re.search(r"_cpuOnly\s*=\s*String\(f\.ngl\)\.trim\(\)\s*===\s*'0'", text), \
        "expected a _cpuOnly gate derived from ngl==0"
    # flash-attn must be suppressed for CPU-only.
    assert re.search(r"if\s*\(\s*f\.flash_attn\s*&&\s*!_cpuOnly\s*\)", text), \
        "flash-attn must be gated on !_cpuOnly"
    # The CUDA unified-memory env must be suppressed for CPU-only too.
    assert "f.unified_mem && !_cpuOnly" in text, \
        "GGML_CUDA_ENABLE_UNIFIED_MEMORY must be gated on !_cpuOnly"