# odysseus/routes/cookbook_routes.py

"""Cookbook routes — model download, serve, cache scanning, and cookbook state sync."""

import asyncio
import json
import logging
import os
import re
import shlex
import shutil
import uuid
from pathlib import Path

from fastapi import APIRouter, HTTPException, Request, Depends

from src.auth_helpers import require_user
from pydantic import BaseModel

from core.middleware import require_admin
from routes.shell_routes import TMUX_LOG_DIR

logger = logging.getLogger(__name__)

from routes.cookbook_helpers import (
    _SSH_PORT_RE, _REMOTE_HOST_RE, _SESSION_ID_RE,
    _validate_repo_id, _validate_include, _validate_remote_host, _validate_token,
    _validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
    _ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
    _safe_env_prefix,
    ModelDownloadRequest, ServeRequest,
)

# Bash snippet appended to generated download scripts. It prints whether the
# HF_TOKEN environment variable is non-empty at run time, without ever
# printing the token itself.
_HF_TOKEN_STATUS_SNIPPET = (
    'if [ -n "$HF_TOKEN" ]; then '
    'echo "[odysseus] HF token: applied"; '
    'else '
    'echo "[odysseus] HF token: NOT SET — gated/private models will be denied. '
    'Add one in Odysseus Settings -> Cookbook -> HuggingFace Token."; '
    'fi'
)

def setup_cookbook_routes() -> APIRouter:
    router = APIRouter(tags=["cookbook"])
    _cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"

    def _mask_secret(value: str) -> str:
        """Return a display-safe stand-in for a secret.

        Falsy input yields ``""``. Secrets of eight characters or fewer are
        reported only as ``"stored"``; longer ones keep their first and last
        four characters around an ellipsis.
        """
        if not value:
            return ""
        head, tail = value[:4], value[-4:]
        return f"{head}...{tail}" if len(value) > 8 else "stored"

    def _decrypt_secret(value: str | None) -> str:
        if not value:
            return ""
        from src.secret_storage import decrypt
        return decrypt(value)

    def _encrypt_secret(value: str) -> str:
        """Encrypt *value* with ``src.secret_storage.encrypt`` and return the result."""
        from src.secret_storage import encrypt as _encrypt
        ciphertext = _encrypt(value)
        return ciphertext

    def _strip_task_secrets(state):
        """Remove ``hf_token`` from every task payload in *state*, in place.

        Anything that is not shaped like ``{"tasks": [{"payload": {...}}]}`` is
        left untouched. The same *state* object is always returned.
        """
        if not isinstance(state, dict):
            return state
        task_list = state.get("tasks")
        if not isinstance(task_list, list):
            return state
        for entry in task_list:
            payload = entry.get("payload") if isinstance(entry, dict) else None
            if isinstance(payload, dict):
                payload.pop("hf_token", None)
        return state

    def _diagnose_serve_output(text: str) -> dict | None:
        """Server-side mirror of the Cookbook UI's common serve diagnoses.

        The browser uses cookbook-diagnosis.js for clickable fixes. This gives
        the agent/tool path the same structured signal so it can retry with an
        adjusted command instead of guessing from raw tmux output.
        """
        if not text:
            return None
        tail = text[-6000:]
        patterns = [
            (
                r"No available memory for the cache blocks|Available KV cache memory:.*-",
                "No GPU memory left for KV cache after loading model.",
                [
                    {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
                    {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
                ],
            ),
            (
                r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
                "GPU ran out of memory during startup or warmup.",
                [
                    {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
                    {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
                    {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
                ],
            ),
            (
                r"not divisib|must be divisible|attention heads.*divisible",
                "Tensor parallel size is incompatible with the model.",
                [
                    {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
                    {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
                ],
            ),
            (
                r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
                "Context length is too large for available GPU memory.",
                [
                    {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
                    {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
                ],
            ),
            (
                r"enable-auto-tool-choice requires --tool-call-parser",
                "Auto tool choice requires an explicit tool call parser.",
                [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
            ),
            (
                r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
                "Model requires custom code or newer model support.",
                [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
            ),
            (
                r"Address already in use|bind.*address.*in use",
                "Port is already in use.",
                [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
            ),
            (
                r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
                "No GPUs are visible to the serve process.",
                [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
            ),
            (
                r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
                "vLLM is not installed or not in PATH on this server.",
                [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
            ),
            (
                r"sglang.*command not found|No module named sglang|SGLang is not installed",
                "SGLang is not installed or not in PATH on this server.",
                [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
            ),
            (
                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
                "llama.cpp / llama-cpp-python dependencies are missing.",
                [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
            (
                r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
                "Model access is gated or unauthorized.",
                [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
            ),
        ]
        for pattern, message, suggestions in patterns:
            if re.search(pattern, tail, re.I):
                return {"message": message, "suggestions": suggestions}
        if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
            r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
        ):
            return {
                "message": "Python traceback detected during serve startup.",
                "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
            }
        return None

    def _state_for_client(state):
        """Prepare cookbook state for the browser by removing raw secrets.

        Task payload tokens are stripped. When ``state["env"]`` is a dict, its
        ``hfToken`` is replaced by ``hfTokenConfigured`` (bool) and
        ``hfTokenMasked`` (display string). *state* is mutated and returned.
        """
        _strip_task_secrets(state)
        if not isinstance(state, dict):
            return state
        env = state.get("env")
        if not isinstance(env, dict):
            return state
        plaintext = _decrypt_secret(env.get("hfToken"))
        env.pop("hfToken", None)
        env["hfTokenConfigured"] = bool(plaintext)
        env["hfTokenMasked"] = _mask_secret(plaintext)
        return state

    def _state_for_storage(state, on_disk=None):
        """Prepare cookbook state for writing to disk, encrypting the HF token.

        Task payload tokens are stripped. For ``state["env"]``:

        * a non-empty incoming ``hfToken`` is validated and stored encrypted;
        * otherwise the already-stored value from ``on_disk["env"]`` is kept;
        * otherwise the key is removed.

        The client-only ``hfTokenMasked`` / ``hfTokenConfigured`` keys are
        dropped. *state* is mutated and returned.
        """
        _strip_task_secrets(state)
        if not isinstance(state, dict):
            return state
        env = state.get("env")
        if not isinstance(env, dict):
            return state

        previous_env = on_disk.get("env") if isinstance(on_disk, dict) else None
        if not isinstance(previous_env, dict):
            previous_env = {}

        new_token = env.get("hfToken")
        if new_token:
            _validate_token(new_token)
            env["hfToken"] = _encrypt_secret(new_token)
        elif previous_env.get("hfToken"):
            env["hfToken"] = previous_env["hfToken"]
        else:
            env.pop("hfToken", None)

        for client_only_key in ("hfTokenMasked", "hfTokenConfigured"):
            env.pop(client_only_key, None)
        return state

    def _load_stored_hf_token() -> str:
        """Return the decrypted HF token from the cookbook state file.

        Yields ``""`` when the file is absent, or when reading, parsing, or
        decrypting fails for any reason.
        """
        if not _cookbook_state_path.exists():
            return ""
        try:
            parsed = json.loads(_cookbook_state_path.read_text())
            env = parsed.get("env") if isinstance(parsed, dict) else {}
            encrypted = env.get("hfToken") if isinstance(env, dict) else ""
            return _decrypt_secret(encrypted)
        except Exception:
            return ""

    def _cookbook_ssh_dir() -> Path:
        """Pick the SSH directory: ``/app/.ssh`` when ``/app`` exists, else ``~/.ssh``."""
        if not Path("/app").exists():
            return Path.home() / ".ssh"
        return Path("/app/.ssh")

    def _cookbook_ssh_key_path() -> Path:
        """Return the path of the Cookbook ed25519 private key inside the SSH directory."""
        ssh_dir = _cookbook_ssh_dir()
        return ssh_dir / "id_ed25519"

    def _read_cookbook_public_key() -> str:
        """Return the stripped text of the ``.pub`` file next to the private key, or ``""`` if absent."""
        pub_path = _cookbook_ssh_key_path().with_suffix(".pub")
        if pub_path.exists():
            contents = pub_path.read_text(encoding="utf-8", errors="replace")
            return contents.strip()
        return ""

    @router.get("/api/cookbook/ssh-key")
    async def get_cookbook_ssh_key(request: Request):
        """Report whether a Cookbook SSH public key exists and return its text.

        ``require_admin`` is called on the request before anything is read.
        """
        require_admin(request)
        pub_text = _read_cookbook_public_key()
        has_key = bool(pub_text)
        return {"configured": has_key, "public_key": pub_text}

    @router.post("/api/cookbook/ssh-key")
    async def generate_cookbook_ssh_key(request: Request):
        """Create the Cookbook ed25519 key pair if it is missing and return the public key.

        ``require_admin`` is called first. The SSH directory is created if
        needed and its permissions are tightened on a best-effort basis. An
        existing private key is never overwritten.

        Returns:
            ``{"ok": True, "public_key": str}`` on success, or
            ``{"ok": False, "error": str}`` when ``ssh-keygen`` cannot be
            started or exits with a non-zero status.
        """
        require_admin(request)
        ssh_dir = _cookbook_ssh_dir()
        key_path = _cookbook_ssh_key_path()
        ssh_dir.mkdir(parents=True, exist_ok=True)
        try:
            os.chmod(ssh_dir, 0o700)
        except OSError as exc:
            # Best-effort: the directory may be owned by someone else.
            logger.debug("Could not chmod %s: %s", ssh_dir, exc)
        if not key_path.exists():
            try:
                proc = await asyncio.create_subprocess_exec(
                    "ssh-keygen", "-t", "ed25519", "-N", "", "-C", "odysseus-cookbook", "-f", str(key_path),
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )
            except OSError as exc:
                # ssh-keygen missing or not executable: report it in the same
                # shape as other failures instead of surfacing a 500.
                return {"ok": False, "error": f"Could not run ssh-keygen: {exc}"}
            stdout, stderr = await proc.communicate()
            if proc.returncode != 0:
                detail = (stderr or stdout).decode("utf-8", errors="replace").strip()[-500:]
                return {"ok": False, "error": detail or "Failed to generate SSH key"}
        # Fix permissions independently so a failure on one file does not skip
        # the other.
        for path, mode in ((key_path, 0o600), (key_path.with_suffix(".pub"), 0o644)):
            try:
                os.chmod(path, mode)
            except OSError as exc:
                logger.debug("Could not chmod %s: %s", path, exc)
        return {"ok": True, "public_key": _read_cookbook_public_key()}

    def _user_shell_path_bootstrap() -> list[str]:
        """Return bash lines that prepend the user's shell PATH to ``PATH``.

        The generated script starts ``$SHELL`` with ``-ic``, has it print its
        PATH behind a marker, extracts the last marked line, and exports
        ``PATH`` with that value in front when it is non-empty.
        """
        # The marker lets sed pick the PATH line out of any other output the
        # shell prints on startup.
        probe_line = (
            '  ODYSSEUS_USER_PATH="$("$ODYSSEUS_USER_SHELL" -ic \'printf "__ODYSSEUS_PATH__%s\\n" "$PATH"\' 2>/dev/null | sed -n \'s/^__ODYSSEUS_PATH__//p\' | tail -n 1 || true)"'
        )
        script = ['ODYSSEUS_USER_SHELL="${SHELL:-}"']
        script.append('if [ -n "$ODYSSEUS_USER_SHELL" ] && [ -x "$ODYSSEUS_USER_SHELL" ]; then')
        script.append(probe_line)
        script.append('  if [ -n "$ODYSSEUS_USER_PATH" ]; then export PATH="$ODYSSEUS_USER_PATH:$PATH"; fi')
        script.append('fi')
        return script

    def _needs_binary(cmd: str, binary: str) -> bool:
        """Tell whether *binary* appears in *cmd* as a standalone shell word.

        A match must be bounded on each side by the start/end of the string,
        whitespace, or one of ``; & | ( )``. A falsy *cmd* is treated as empty.
        """
        boundary = r"[\s;&|()]"
        word_pattern = rf"(^|{boundary}){re.escape(binary)}($|{boundary})"
        return re.search(word_pattern, cmd or "") is not None

    def _missing_binary_message(binary: str, target: str) -> str:
        """Build the user-facing error for a required binary missing on *target*.

        ``tmux`` and ``docker`` get tailored installation hints; any other
        binary gets a generic message.
        """
        tailored = {
            "tmux": (
                f"tmux is required for Cookbook background downloads/serves on {target}. "
                "Install it with your OS package manager, or run Cookbook server setup for that server."
            ),
            "docker": (
                f"Docker is required by this Cookbook launch command on {target}, but the docker CLI was not found. "
                "Install Docker and make sure this user can run `docker`, then retry."
            ),
        }
        generic = f"{binary} is required on {target}, but it was not found."
        return tailored.get(binary, generic)

    async def _remote_binary_available(remote: str, ssh_port: str | None, binary: str, *, windows: bool = False) -> bool:
        """Check over ssh whether *binary* can be found on *remote*.

        Args:
            remote: ssh destination, passed to ``ssh`` as-is.
            ssh_port: port to connect to; omitted from the command when empty
                or ``"22"``.
            binary: command name to look up.
            windows: probe with PowerShell ``Get-Command`` instead of the
                POSIX ``command -v``.

        Returns:
            ``True`` only when the probe exits with status 0. Any failure
            (ssh missing, connection error, or the 10 second timeout) yields
            ``False``.
        """
        _port = ssh_port or ""
        _pf = ["-p", _port] if _port and _port != "22" else []
        if windows:
            check = f"powershell -NoProfile -Command \"if (Get-Command {binary} -ErrorAction SilentlyContinue) {{ exit 0 }} else {{ exit 127 }}\""
        else:
            check = f"command -v {shlex.quote(binary)} >/dev/null 2>&1"
        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(
                "ssh", "-o", "ConnectTimeout=6", "-o", "StrictHostKeyChecking=no",
                *_pf, remote, check,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            await asyncio.wait_for(proc.communicate(), timeout=10)
            return proc.returncode == 0
        except Exception:
            # wait_for only cancels communicate(); the ssh child keeps running
            # unless it is killed and reaped here.
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                    await proc.wait()
                except Exception:
                    pass  # the process already exited or cannot be signalled
            return False

    async def _binary_available(binary: str, remote: str | None, ssh_port: str | None, *, windows: bool = False) -> bool:
        if remote:
            return await _remote_binary_available(remote, ssh_port, binary, windows=windows)
        return shutil.which(binary) is not None

    @router.post("/api/model/download")
    async def model_download(request: Request, req: ModelDownloadRequest):
        """Start a HuggingFace model download in a detached background session.

        After the admin check and input validation, a runner script around
        ``hf download`` is generated and launched in one of three ways:

        * Windows remote: a ``.ps1`` copied with ``scp`` and started through
          ``Start-Process`` over ``ssh``.
        * Linux/Termux remote: a ``.sh`` copied with ``scp`` and run inside a
          ``tmux`` session created on the remote host.
        * Local: a wrapper script run inside a local ``tmux`` session.

        Remote runners fall back to Python ``snapshot_download`` when the
        ``hf`` CLI is unavailable. ``req.disable_hf_transfer`` switches every
        path to the plain downloader with fewer workers.

        Returns:
            ``{"ok": True, "session_id": ..., "remote": ...}`` when the launch
            command exits 0, otherwise ``{"ok": False, "error": ...,
            "session_id": ...}`` (also used when ``tmux`` is missing).
        """
        require_admin(request)
        # Defence-in-depth: even though this endpoint is admin-gated, refuse
        # values that would land in shell contexts with metacharacters.
        _validate_repo_id(req.repo_id)
        _validate_include(req.include)
        _validate_remote_host(req.remote_host)
        req.ssh_port = _validate_ssh_port(req.ssh_port)
        req.local_dir = _validate_local_dir(req.local_dir)
        req.hf_token = req.hf_token or _load_stored_hf_token()
        _validate_token(req.hf_token)
        TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
        session_id = f"cookbook-{uuid.uuid4().hex[:8]}"

        def _stage_script(path: Path, text: str, mode: int) -> None:
            # Runner scripts embed the HF token, so create them owner-only
            # from the start instead of tightening permissions afterwards.
            path.touch(mode=mode)
            path.write_text(text, encoding="utf-8")
            path.chmod(mode)

        # When a download directory is set, target a per-model subfolder under it
        # (<dir>/<name>) so the flat-directory cache scan lists it as its own
        # model. Without it, hf/snapshot_download falls back to the HF cache.
        _dl_short = req.repo_id.rsplit("/", 1)[-1]
        _dl_base = (req.local_dir.rstrip("/") + "/" + _dl_short) if req.local_dir else None
        _dl_shell = _shell_path(_dl_base) if _dl_base else None      # for hf CLI / bash
        _dl_pyarg = (", local_dir=os.path.expanduser(" + repr(_dl_base) + ")") if _dl_base else ""

        # Build the hf download command. Redirection to suppress the interactive
        # "update available? [Y/n]" prompt is added per-platform further down
        # (< /dev/null on bash, $null | on PowerShell).
        hf_cmd = f"hf download {req.repo_id}"
        if req.include:
            hf_cmd += f" --include '{req.include}'"
        if _dl_shell:
            hf_cmd += f" --local-dir {_dl_shell}"

        # hf_transfer (Rust parallel downloader) is fast but flaky on large
        # files — it tends to crash near the end at high throughput. Retries set
        # disable_hf_transfer to fall back to the plain, slower-but-reliable
        # downloader (resumes cleanly from the .incomplete files). The same
        # choice is applied to local and remote runners alike.
        use_hf_transfer = not req.disable_hf_transfer
        max_workers = 8 if use_hf_transfer else 4
        # Best-effort install of the hf CLI. --break-system-packages keeps pip
        # from bailing on PEP-668 systems (Arch, newer Debian).
        install_hf_cli = "command -v hf >/dev/null 2>&1 || pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || pip install -q -U huggingface_hub 2>/dev/null"
        if use_hf_transfer:
            transfer_lines = [
                "python3 -c 'import hf_transfer' 2>/dev/null || pip install --user --break-system-packages -q hf_transfer 2>/dev/null || pip install -q hf_transfer 2>/dev/null",
                "python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1",
                "export HF_HUB_DOWNLOAD_MAX_WORKERS=8",
            ]
        else:
            transfer_lines = [
                "export HF_HUB_ENABLE_HF_TRANSFER=0",
                "export HF_HUB_DOWNLOAD_MAX_WORKERS=4",
            ]
        # Capture the exit status immediately: inside the if/else, `$?` would
        # already have been overwritten by `[` and `echo`.
        report_status = 'rc=$?; if [ "$rc" -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $rc)"; fi'
        bash_snapshot = f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={max_workers})"'

        remote = req.remote_host  # None for local
        is_windows = req.platform == "windows"
        logger.info(f"Download request: repo={req.repo_id}, remote={remote}, ssh_port={req.ssh_port}, platform={req.platform}")

        if not is_windows and not await _binary_available("tmux", remote, req.ssh_port):
            return {
                "ok": False,
                "error": _missing_binary_message("tmux", remote or "local server"),
                "session_id": session_id,
            }

        # scp takes the port as -P, ssh as -p; both are omitted for the default port.
        _port = req.ssh_port
        _custom_port = bool(_port) and _port != "22"
        scp_port_flag = f"-P {_port} " if _custom_port else ""
        ssh_port_flag = f"-p {_port} " if _custom_port else ""

        if remote and is_windows:
            # ── Windows remote: generate .ps1 runner, use Start-Process for background ──
            remote_runner = f".{session_id}_run.ps1"
            ps_snapshot = f"      python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers={max_workers})\""
            ps_lines = []
            ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
            ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
            if req.hf_token:
                ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
            if req.env_prefix:
                ps_lines.append(_safe_env_prefix(req.env_prefix))
            if not use_hf_transfer:
                ps_lines.append('$env:HF_HUB_ENABLE_HF_TRANSFER = "0"')
            # Try hf CLI, fall back to Python huggingface_hub, then auto-install.
            # These are plain strings, so braces are written singly: a doubled
            # brace would reach PowerShell as a nested script-block literal
            # that is never executed.
            ps_lines.append('try {')
            ps_lines.append('  $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
            ps_lines.append('  if ($hfPath) {')
            # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
            ps_lines.append(f'    $null | {hf_cmd}')
            ps_lines.append('  } else {')
            ps_lines.append('    python -c "import huggingface_hub" 2>$null')
            ps_lines.append('    if ($LASTEXITCODE -eq 0) {')
            ps_lines.append('      Write-Host "hf CLI not found, using Python huggingface_hub..."')
            if use_hf_transfer:
                ps_lines.append('      python -m pip install -q hf_transfer 2>$null')
                ps_lines.append('      $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
            ps_lines.append(ps_snapshot)
            ps_lines.append('    } else {')
            ps_lines.append('      Write-Host "Installing huggingface-hub..."')
            if use_hf_transfer:
                ps_lines.append('      python -m pip install -q huggingface-hub hf_transfer')
                ps_lines.append('      $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
            else:
                ps_lines.append('      python -m pip install -q huggingface-hub')
            ps_lines.append(ps_snapshot)
            ps_lines.append('    }')
            ps_lines.append('  }')
            ps_lines.append('  if ($LASTEXITCODE -eq 0) { Write-Host ""; Write-Host "DOWNLOAD_OK" }')
            ps_lines.append('  else { Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }')
            ps_lines.append('} catch {')
            ps_lines.append('  Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
            ps_lines.append('}')
            ps_lines.append(f'Remove-Item -Force "$HOME\\{remote_runner}" -ErrorAction SilentlyContinue')
            staged_script = TMUX_LOG_DIR / f"{session_id}_run.ps1"
            _stage_script(staged_script, "\r\n".join(ps_lines) + "\r\n", 0o600)

            # scp the .ps1 script, then launch it as a detached process with log + pid files
            # Start-Process creates a fully detached process that survives SSH disconnect
            # NOTE(review): this command passes through the local shell inside
            # double quotes, so `$`-prefixed tokens may be expanded locally
            # before they reach PowerShell — confirm against a Windows host.
            launch_ps = (
                "$sd = \\\"$env:TEMP\\odysseus-sessions\\\"; "
                f"Start-Process powershell -ArgumentList '-ExecutionPolicy','Bypass','-File','$HOME\\{remote_runner}' "
                f"-RedirectStandardOutput \\\"$sd\\{session_id}.log\\\" "
                f"-RedirectStandardError \\\"$sd\\{session_id}.err.log\\\" "
                f"-NoNewWindow -PassThru | ForEach-Object {{ $_.Id | Out-File \\\"$sd\\{session_id}.pid\\\" }}"
            )
            setup_cmd = (
                f"scp -O {scp_port_flag}-q '{staged_script}' {remote}:{remote_runner} && "
                f'ssh {ssh_port_flag}{remote} "powershell -Command \\"{launch_ps}\\""'
            )

        elif remote:
            # ── Linux/Termux remote: create tmux session ON the remote host ──
            remote_runner = f".{session_id}_run.sh"
            runner_lines = ["#!/bin/bash"]
            runner_lines.extend(_user_shell_path_bootstrap())
            runner_lines.append("# Auto-detect environment")
            runner_lines.append("deactivate 2>/dev/null; hash -r")
            if req.hf_token:
                runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
            if req.env_prefix:
                runner_lines.append(_safe_env_prefix(req.env_prefix))
            else:
                # Fallback: find a venv with hf CLI, or install huggingface-hub
                runner_lines.append(
                    'for p in ~/vllm-env ~/venv ~/.venv; do '
                    'if [ -f "$p/bin/activate" ]; then source "$p/bin/activate"; break; fi; '
                    'done'
                )
            # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
            runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
            # Install hf CLI (+ hf_transfer unless disabled) best-effort so
            # future runs get the fast path.
            runner_lines.append(install_hf_cli)
            runner_lines.extend(transfer_lines)
            # Surface whether the HF token actually reached THIS server, so a gated
            # download's "not authorized" failure can be told apart from a missing
            # token (the token is masked — we only print applied / not-set).
            runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
            # Try hf CLI first, fall back to Python huggingface_hub, then auto-install
            runner_lines.append('if command -v hf &>/dev/null; then')
            # < /dev/null suppresses interactive "update available? [Y/n]" prompt
            runner_lines.append(f'  {hf_cmd} < /dev/null')
            runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
            runner_lines.append('  echo "hf CLI not found, using Python huggingface_hub..."')
            runner_lines.append(bash_snapshot)
            runner_lines.append('else')
            runner_lines.append('  echo "Installing huggingface-hub and dependencies..."')
            runner_lines.append('  pip install --no-deps -q huggingface-hub 2>/dev/null')
            runner_lines.append('  pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
            if use_hf_transfer:
                runner_lines.append("  python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
            runner_lines.append(bash_snapshot)
            runner_lines.append('fi')
            runner_lines.append(report_status)
            runner_lines.append(f"rm -f {remote_runner}")
            runner_lines.append('exec "${SHELL:-/bin/bash}"')
            staged_script = TMUX_LOG_DIR / f"{session_id}_run.sh"
            _stage_script(staged_script, "\n".join(runner_lines) + "\n", 0o700)

            # scp the runner script, then create tmux session on the remote
            setup_cmd = (
                f"scp -O {scp_port_flag}-q '{staged_script}' {remote}:{remote_runner} && "
                f"ssh {ssh_port_flag}{remote} 'chmod +x {remote_runner} && tmux new-session -d -s {session_id} \"./{remote_runner}\"'"
            )
        else:
            # Local: run hf download directly in a local tmux session (which is
            # a TTY). No script/tee needed — output is read via tmux capture-pane.
            staged_script = TMUX_LOG_DIR / f"{session_id}.sh"
            lines = ["#!/bin/bash"]
            lines.extend(_user_shell_path_bootstrap())
            if req.hf_token:
                lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
            # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
            lines.append('export PATH="$HOME/.local/bin:$PATH"')
            lines.append(install_hf_cli)
            lines.extend(transfer_lines)
            if req.env_prefix:
                lines.append(_safe_env_prefix(req.env_prefix))
            else:
                lines.append("deactivate 2>/dev/null; hash -r")
            # Show whether the HF token reached this run (masked) — tells a gated
            # "not authorized" failure apart from a missing token.
            lines.append(_HF_TOKEN_STATUS_SNIPPET)
            # < /dev/null suppresses interactive "update available? [Y/n]" prompt
            lines.append(f"{hf_cmd} < /dev/null")
            lines.append(report_status)
            lines.append(f"rm -f '{staged_script}'")
            lines.append('exec "${SHELL:-/bin/bash}"')
            _stage_script(staged_script, "\n".join(lines) + "\n", 0o700)
            setup_cmd = f"tmux new-session -d -s {session_id} {shlex.quote(str(staged_script))}"

        logger.info(f"Model download: {req.repo_id} (include={req.include}, session={session_id}, remote={remote})")
        logger.info(f"Download setup_cmd: {setup_cmd}")

        launched = False
        stderr_bytes = b""
        try:
            proc = await asyncio.create_subprocess_shell(
                setup_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            # communicate() drains both pipes; wait() alone can deadlock once
            # the child fills a pipe buffer.
            _stdout, stderr_bytes = await proc.communicate()
            launched = proc.returncode == 0
        finally:
            # The staged script contains the HF token. A remote runner's local
            # copy is useless once scp has run; a local wrapper is still needed
            # by tmux unless the launch failed (it removes itself when done).
            if remote or not launched:
                try:
                    staged_script.unlink(missing_ok=True)
                except OSError as exc:
                    logger.debug("Could not remove staged script %s: %s", staged_script, exc)

        if not launched:
            stderr = stderr_bytes.decode(errors="replace")
            logger.error(f"Download failed (rc={proc.returncode}): {stderr}")
            return {"ok": False, "error": stderr, "session_id": session_id}

        # Log to assistant (best-effort; never fails the request)
        try:
            from src.assistant_log import log_to_assistant
            from src.auth_helpers import get_current_user
            owner = get_current_user(request)
            log_to_assistant(
                owner,
                f"Started downloading {req.repo_id} to {remote or 'local'}",
                category="Download",
            )
        except Exception:
            logger.debug("Could not write download start to assistant log", exc_info=True)

        return {"ok": True, "session_id": session_id, "remote": remote or "local"}

    @router.get("/api/model/cached")
    async def model_cached(request: Request, host: str | None = None, model_dir: str | None = None, ssh_port: str | None = None, platform: str | None = None):
        """List cached models. Scans HF cache + optional model directory."""
        require_admin(request)
        # Validate shell-bound inputs, matching the sibling list_gpus endpoint —
        # `host`/`ssh_port` are interpolated into an ssh command below, so an
        # unvalidated value (e.g. "x'; rm -rf ~ #") would be command injection.
        host = _validate_remote_host(host)
        if ssh_port is not None and ssh_port != "" and not _SSH_PORT_RE.fullmatch(ssh_port):
            raise HTTPException(400, "Invalid ssh_port")
        TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)

        paths_code = "import json, os\n"
        paths_code += "models = []\n"
        paths_code += "seen = set()\n"
        paths_code += "BLOCKED_ROOTS = ('/sys', '/proc', '/dev', '/run', '/var/run')\n"
        paths_code += "def safe_path(p):\n"
        paths_code += "    try:\n"
        paths_code += "        rp = os.path.realpath(os.path.expanduser(p))\n"
        paths_code += "        return not any(rp == b or rp.startswith(b + os.sep) for b in BLOCKED_ROOTS)\n"
        paths_code += "    except Exception:\n"
        paths_code += "        return False\n"
        paths_code += "def safe_walk(top):\n"
        paths_code += "    if not safe_path(top): return\n"
        paths_code += "    for root, dirs, fns in os.walk(top, followlinks=False):\n"
        paths_code += "        dirs[:] = [d for d in dirs if not os.path.islink(os.path.join(root, d)) and safe_path(os.path.join(root, d))]\n"
        paths_code += "        yield root, dirs, fns\n"
        # Scan HF cache format (models-- directories with blobs/)
        paths_code += "def scan_hf(cache):\n"
        paths_code += "    if not os.path.isdir(cache): return\n"
        paths_code += "    for d in sorted(os.listdir(cache)):\n"
        paths_code += "        if not d.startswith('models--'): continue\n"
        paths_code += "        rid = d.replace('models--','').replace('--','/')\n"
        paths_code += "        if rid in seen: continue\n"
        paths_code += "        seen.add(rid)\n"
        paths_code += "        blobs = os.path.join(cache, d, 'blobs')\n"
        paths_code += "        sz, nf, ic = 0, 0, False\n"
        paths_code += "        if os.path.isdir(blobs):\n"
        paths_code += "            for f in os.scandir(blobs):\n"
        paths_code += "                if f.is_file(): nf += 1; sz += f.stat().st_size\n"
        paths_code += "                if f.name.endswith('.incomplete'): ic = True\n"
        paths_code += "        # Check if it's an LLM (has config.json with model_type) vs diffusion (has model_index.json)\n"
        paths_code += "        snap = os.path.join(cache, d, 'snapshots')\n"
        paths_code += "        is_diffusion = False; is_gguf = False\n"
        paths_code += "        if os.path.isdir(snap):\n"
        paths_code += "            for sd in os.listdir(snap):\n"
        paths_code += "                sf = os.path.join(snap, sd)\n"
        paths_code += "                if not os.path.isdir(sf): continue\n"
        paths_code += "                if os.path.exists(os.path.join(sf, 'model_index.json')): is_diffusion = True\n"
        paths_code += "                try:\n"
        paths_code += "                    if any(x.endswith('.gguf') for x in os.listdir(sf)): is_gguf = True\n"
        paths_code += "                except Exception: pass\n"
        paths_code += "        models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':is_gguf})\n"
        # Scan plain directory (each subdirectory = a model if it has model files)
        paths_code += "def scan_dir(p):\n"
        paths_code += "    if not os.path.isdir(p) or not safe_path(p): return\n"
        paths_code += "    for d in sorted(os.listdir(p)):\n"
        paths_code += "        if d.startswith('.'): continue\n"
        paths_code += "        fp = os.path.join(p, d)\n"
        paths_code += "        if not os.path.isdir(fp) or os.path.islink(fp) or not safe_path(fp): continue\n"
        paths_code += "        if d in seen: continue\n"
        paths_code += "        # Check if it looks like a model (has config.json, safetensors, bin, or gguf)\n"
        paths_code += "        is_model = False; is_gguf = False\n"
        paths_code += "        for root, dirs, fns in safe_walk(fp):\n"
        paths_code += "            for fn in fns:\n"
        paths_code += "                if fn.endswith('.gguf'): is_gguf = True; is_model = True\n"
        paths_code += "                elif fn == 'config.json' or fn.endswith('.safetensors') or fn.endswith('.bin'): is_model = True\n"
        paths_code += "            if is_model: break\n"
        paths_code += "        if not is_model: continue\n"
        paths_code += "        seen.add(d)\n"
        paths_code += "        sz, nf = 0, 0\n"
        paths_code += "        for dp, _, fns in safe_walk(fp):\n"
        paths_code += "            for fn in fns:\n"
        paths_code += "                try: nf += 1; sz += os.path.getsize(os.path.join(dp, fn))\n"
        paths_code += "                except Exception: pass\n"
        paths_code += "        is_diff = os.path.exists(os.path.join(fp, 'model_index.json'))\n"
        paths_code += "        models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':is_gguf})\n"
        # Always scan HF cache
        paths_code += "scan_hf(os.path.expanduser('~/.cache/huggingface/hub'))\n"
        # Also scan custom model dirs (comma-separated) if specified
        if model_dir:
            for d in model_dir.split(','):
                d = d.strip()
                if d and d != '~/.cache/huggingface/hub':
                    # repr() encodes the dir as a properly-escaped Python string
                    # literal. The old f"...'{d}'..." broke out of the quotes on
                    # any `'` in the value, injecting arbitrary Python that then
                    # ran locally or over ssh.
                    paths_code += f"scan_dir(os.path.expanduser({d!r}))\n"
        paths_code += "print(json.dumps(models))\n"

        scan_py = TMUX_LOG_DIR / "scan_cache.py"
        scan_py.write_text(paths_code)

        if host:
            _pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
            if platform == "windows":
                # Windows: use 'python' and pipe via stdin with double-quote wrapping
                cmd = f'ssh {_pf}{host} "python -" < \'{scan_py}\''
            else:
                cmd = f"ssh {_pf}{host} 'python3 -' < '{scan_py}'"
        else:
            cmd = f"python3 '{scan_py}'"

        proc = await asyncio.create_subprocess_shell(
            cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=str(Path.home()),
        )
        stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)

        models = []
        try:
            raw = json.loads(stdout_b.decode(errors="replace").strip())
            for m in raw:
                size_gb = m["size_bytes"] / (1024 ** 3)
                if size_gb >= 1:
                    size_str = f"{size_gb:.1f} GB"
                else:
                    size_str = f"{m['size_bytes'] / (1024**2):.0f} MB"
                entry = {
                    "repo_id": m["repo_id"],
                    "size": size_str,
                    "nb_files": m["nb_files"],
                    "has_incomplete": m["has_incomplete"],
                    "status": "downloading" if m["has_incomplete"] else "ready",
                    "path": m.get("path", ""),
                    "is_diffusion": m.get("is_diffusion", False),
                }
                if m.get("is_local_dir"):
                    entry["is_local_dir"] = True
                models.append(entry)
        except Exception as e:
            logger.warning(f"Failed to parse cached models: {e}")
            logger.warning(f"stderr: {stderr_b.decode(errors='replace')[:500]}")

        return {"models": models, "host": host or "local"}

    def _auto_register_image_endpoint(req: ServeRequest, remote: str | None) -> str | None:
        """Register a diffusion model as an image endpoint so it appears in the model selector.

        The endpoint URL is ``http://<host>:<port>/v1``. The port is parsed
        from ``req.cmd`` (``--port NNNN`` or ``--port=NNNN``) and defaults to
        8100; the host is the part of ``remote`` after any ``user@`` prefix, or
        ``localhost`` when ``remote`` is empty.

        If an endpoint with the same ``base_url`` already exists it is
        re-enabled and renamed instead of creating a duplicate.

        Returns:
            The endpoint id (existing or newly created), or ``None`` when the
            database operation fails (the error is logged and rolled back).
        """
        from core.database import SessionLocal, ModelEndpoint

        # Accept both argparse spellings: "--port 8100" and "--port=8100".
        # Default 8100 matches diffusion_server's default.
        port_match = re.search(r'--port(?:\s+|=)(\d+)', req.cmd)
        port = int(port_match.group(1)) if port_match else 8100

        # SSH alias — use as hostname (Tailscale resolves it later). Splitting
        # on "@" is a no-op when there is no user prefix.
        host = remote.split("@")[-1] if remote else "localhost"

        base_url = f"http://{host}:{port}/v1"

        # Friendly display name from the last path segment of repo_id.
        short_name = req.repo_id.split("/")[-1]
        display_name = f"{short_name} (image)"

        db = SessionLocal()
        try:
            # Check for existing endpoint with same base_url — update it
            existing = db.query(ModelEndpoint).filter(ModelEndpoint.base_url == base_url).first()
            if existing:
                existing.is_enabled = True
                existing.model_type = "image"
                existing.name = display_name
                db.commit()
                logger.info(f"Updated existing image endpoint: {base_url}")
                return existing.id

            ep_id = f"img-{uuid.uuid4().hex[:8]}"
            ep = ModelEndpoint(
                id=ep_id,
                name=display_name,
                base_url=base_url,
                api_key=None,
                is_enabled=True,
                model_type="image",
            )
            db.add(ep)
            db.commit()
            logger.info(f"Auto-registered image endpoint: {display_name} @ {base_url}")
            return ep_id
        except Exception as e:
            # Best-effort: a registration failure must not fail the serve call.
            logger.error(f"Failed to auto-register image endpoint: {e}")
            db.rollback()
            return None
        finally:
            db.close()

    @router.post("/api/model/serve")
    async def model_serve(request: Request, req: ServeRequest):
        """Launch a model server in a tmux session (or PowerShell background process on Windows).

        `repo_id` is dual-purpose: a HuggingFace repo (`<org>/<name>`) for
        model-serve commands, OR a bare pip package name when the cmd is a
        `python -m pip install …`. We only enforce the strict HF format on
        the model paths.

        Flow: validate the request fields, check that the required binaries
        (tmux, and docker when the command needs it) exist on the target,
        write a runner script under ``TMUX_LOG_DIR``, then start it locally or
        copy it to the remote host and start it there.

        Returns:
            ``{"ok": False, "error": ..., "session_id": ...}`` when a binary is
            missing or the launch command exits non-zero; otherwise
            ``{"ok": True, "session_id": ..., "remote": ..., "endpoint_id": ...}``
            where ``endpoint_id`` is set only for diffusion servers.

        Raises:
            HTTPException: 400 for an invalid pip package name; the
                ``_validate_*`` helpers are presumably the source of the other
                validation errors.
        """
        require_admin(request)
        # Defence-in-depth: reject values that could break out of shell contexts.
        _validate_remote_host(req.remote_host)
        req.ssh_port = _validate_ssh_port(req.ssh_port)
        req.gpus = _validate_gpus(req.gpus)
        req.hf_token = req.hf_token or _load_stored_hf_token()
        _validate_token(req.hf_token)
        # Normalize away backslash-newline continuations (multi-line pasted
        # serve commands) so the cleaned single-line command is what gets
        # written into the runner script and used for engine auto-detection.
        # `_validate_serve_cmd` returns None for empty input; coerce to "" so the
        # many downstream `"engine" in req.cmd` membership checks can't hit
        # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
        req.cmd = _validate_serve_cmd(req.cmd) or ""
        is_pip_install = bool(req.cmd and "pip install" in req.cmd)
        if is_pip_install:
            # PEP-508-style package spec — letters, digits, `.-_` for the
            # name; `[` `]` for extras; `<>=!~,` for version specifiers.
            # v2 review HIGH-14: tightened from the previous regex which
            # also allowed spaces and `+`, both of which can be abused to
            # introduce extra shell tokens once interpolated into the
            # serve command. We now use `re.fullmatch` and drop space/`+`.
            if not req.repo_id or not re.fullmatch(
                r"[A-Za-z0-9][A-Za-z0-9._\-\[\]<>=!,~]{0,200}", req.repo_id
            ):
                raise HTTPException(400, "Invalid pip package name")
        else:
            _validate_repo_id(req.repo_id)
        TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
        session_id = f"serve-{uuid.uuid4().hex[:8]}"
        remote = req.remote_host
        is_windows = req.platform == "windows"

        if not is_windows and not await _binary_available("tmux", remote, req.ssh_port):
            return {
                "ok": False,
                "error": _missing_binary_message("tmux", remote or "local server"),
                "session_id": session_id,
            }
        if _needs_binary(req.cmd, "docker") and not await _binary_available("docker", remote, req.ssh_port, windows=is_windows):
            return {
                "ok": False,
                "error": _missing_binary_message("docker", remote or "local server"),
                "session_id": session_id,
            }

        if is_windows and remote:
            # ── Windows remote: generate .ps1 serve runner ──
            remote_runner = f".{session_id}_run.ps1"
            ps_lines = []
            ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
            ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
            if req.hf_token:
                ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
            if req.gpus:
                ps_lines.append(f"$env:CUDA_VISIBLE_DEVICES = '{req.gpus}'")
            if req.env_prefix:
                ps_lines.append(_safe_env_prefix(req.env_prefix))
            # Auto-install ollama if the command uses it
            if "ollama" in req.cmd:
                ps_lines.append('# Check if ollama is available')
                ps_lines.append('if (-not (Get-Command ollama -ErrorAction SilentlyContinue)) {')
                ps_lines.append('  Write-Host "Ollama not found. Please install from https://ollama.com/download/windows"')
                ps_lines.append('  exit 1')
                ps_lines.append('}')
            elif "llama_cpp" in req.cmd or "llama-server" in req.cmd:
                ps_lines.append('# Auto-install llama-cpp-python if missing')
                ps_lines.append('try { python -c "import llama_cpp" 2>$null } catch {}')
                ps_lines.append('if ($LASTEXITCODE -ne 0) {')
                ps_lines.append('  Write-Host "Installing llama-cpp-python..."')
                ps_lines.append('  python -m pip install llama-cpp-python[server]')
                ps_lines.append('}')
            elif "vllm" in req.cmd:
                ps_lines.append('Write-Host "ERROR: vLLM is not supported on Windows. Use Ollama or llama.cpp instead."')
                ps_lines.append('exit 1')
            ps_lines.append(req.cmd)
            ps_lines.append('Write-Host ""')
            ps_lines.append('Write-Host "=== Process exited with code $LASTEXITCODE ==="')
            runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1"
            runner_path.write_text("\r\n".join(ps_lines) + "\r\n")

            # scp takes the port as -P, ssh as -p; both are omitted for port 22.
            _port = req.ssh_port
            _Pf = f"-P {_port} " if _port and _port != "22" else ""
            _pf = f"-p {_port} " if _port and _port != "22" else ""
            # NOTE(review): launch_ps is embedded below in a double-quoted
            # argument that runs through the local shell. On a POSIX host
            # `$sd`, `$env`, `$HOME` and `$_` look subject to local expansion
            # before ssh ever sees them — verify against a real Windows target.
            launch_ps = (
                "$sd = \\\"$env:TEMP\\odysseus-sessions\\\"; "
                f"Start-Process powershell -ArgumentList '-ExecutionPolicy','Bypass','-File','$HOME\\{remote_runner}' "
                f"-RedirectStandardOutput \\\"$sd\\{session_id}.log\\\" "
                f"-RedirectStandardError \\\"$sd\\{session_id}.err.log\\\" "
                f"-NoNewWindow -PassThru | ForEach-Object {{ $_.Id | Out-File \\\"$sd\\{session_id}.pid\\\" }}"
            )
            setup_cmd = (
                f"scp -O {_Pf}-q '{runner_path}' {remote}:{remote_runner} && "
                f'ssh {_pf}{remote} "powershell -Command \\"{launch_ps}\\""'
            )
        else:
            # ── Linux/Termux: bash + tmux (existing flow) ──
            runner_lines = ["#!/bin/bash"]
            runner_lines.extend(_user_shell_path_bootstrap())
            runner_lines.append("export FLASHINFER_DISABLE_VERSION_CHECK=1")
            if req.hf_token:
                runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
            if req.gpus:
                runner_lines.append(f"export CUDA_VISIBLE_DEVICES='{req.gpus}'")
            if req.env_prefix:
                runner_lines.append(_safe_env_prefix(req.env_prefix))
            else:
                runner_lines.append("deactivate 2>/dev/null; hash -r")
            # Show whether the HF token reached this server (masked) — a gated
            # model vLLM has to download will be denied without it.
            runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
            # Auto-install inference engine if missing
            if "llama_cpp" in req.cmd or "llama-server" in req.cmd:
                # Prefer the NATIVE llama-server binary — its minja templating
                # renders modern GGUF chat templates that the Python bindings'
                # Jinja2 rejects (do_tojson ensure_ascii). Build it once from
                # source if missing; keep llama-cpp-python only as a fallback.
                runner_lines.append('# Ensure a llama.cpp server (prefer native llama-server)')
                runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:$HOME/llama.cpp/build/bin:$PATH"')
                runner_lines.append('if [ -d /data/data/com.termux ]; then')
                runner_lines.append('  # Termux: no native build — use the Python bindings (CPU).')
                runner_lines.append('  if ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    pkg install -y cmake 2>/dev/null')
                runner_lines.append('    pip install numpy diskcache jinja2 2>/dev/null')
                runner_lines.append('    CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install llama-cpp-python --no-build-isolation --no-cache-dir 2>&1 || true')
                runner_lines.append('  fi')
                runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
                runner_lines.append('  echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
                runner_lines.append('  mkdir -p ~/bin')
                runner_lines.append('  cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
                # GPU build if CUDA is present; fall back to a plain (CPU) build.
                runner_lines.append('  cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\')
                runner_lines.append('    && cmake --build build -j"$(nproc)" --target llama-server \\')
                runner_lines.append('    && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
                runner_lines.append('  # If the native build failed, fall back to the Python bindings.')
                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
                runner_lines.append('    pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true')
                runner_lines.append('  fi')
                runner_lines.append('fi')
            elif "vllm serve" in req.cmd:
                # Put ~/.local/bin on PATH first — without a venv, vllm installs
                # there via --user and the non-login serve shell otherwise can't
                # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                runner_lines.append('if ! command -v vllm &>/dev/null; then')
                runner_lines.append('  echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
                runner_lines.append('  exit 127')
                runner_lines.append('fi')
            elif "sglang.launch_server" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
                runner_lines.append('  echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
                runner_lines.append('  exit 127')
                runner_lines.append('fi')

            runner_lines.append(req.cmd)
            # Keep shell open after exit so user can see errors. The exit
            # status is captured FIRST: `$?` read after the blank `echo` would
            # report that echo's status (always 0), not the serve command's.
            runner_lines.append('_ody_rc=$?; echo ""; echo "=== Process exited with code $_ody_rc ==="; exec "${SHELL:-/bin/bash}"')

            runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
            runner_path.write_text("\n".join(runner_lines) + "\n")
            # NOTE(review): this script may embed HF_TOKEN (see export above);
            # 0o755 leaves it readable by other local users — consider 0o700.
            runner_path.chmod(0o755)

            if remote:
                remote_runner = f".{session_id}_run.sh"
                # If command references scripts/, scp those too
                scp_extras = ""
                # scp takes the port as -P, ssh as -p; both are omitted for port 22.
                _port = req.ssh_port
                _Pf = f"-P {_port} " if _port and _port != "22" else ""
                _pf = f"-p {_port} " if _port and _port != "22" else ""
                if "scripts/diffusion_server.py" in req.cmd:
                    from core.constants import BASE_DIR
                    diff_script = Path(BASE_DIR) / "scripts" / "diffusion_server.py"
                    if diff_script.exists():
                        scp_extras = f"scp -O {_Pf}-q '{diff_script}' {remote}:.diffusion_server.py && "
                        # The script lands in the remote home dir as a dotfile,
                        # so point the runner at the copied location.
                        runner_path.write_text(
                            runner_path.read_text().replace(
                                "scripts/diffusion_server.py", ".diffusion_server.py"
                            )
                        )
                setup_cmd = (
                    f"{scp_extras}"
                    f"scp -O {_Pf}-q '{runner_path}' {remote}:{remote_runner} && "
                    f"ssh {_pf}{remote} 'chmod +x {remote_runner} && tmux new-session -d -s {session_id} \"./{remote_runner}\"'"
                )
            else:
                setup_cmd = f"tmux new-session -d -s {session_id} {shlex.quote(str(runner_path))}"

        proc = await asyncio.create_subprocess_shell(
            setup_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        # communicate() drains both pipes while waiting; a bare wait() with
        # PIPE'd stdout/stderr can deadlock once the child fills a pipe buffer.
        _, stderr_b = await proc.communicate()

        if proc.returncode != 0:
            stderr = stderr_b.decode(errors="replace")
            return {"ok": False, "error": stderr, "session_id": session_id}

        # Auto-register as model endpoint if serving a diffusion model
        endpoint_id = None
        is_diffusion = "diffusion_server.py" in req.cmd
        if is_diffusion:
            endpoint_id = _auto_register_image_endpoint(req, remote)

        # Log to assistant (best-effort: never fail the serve call over it)
        try:
            from src.assistant_log import log_to_assistant
            from src.auth_helpers import get_current_user
            owner = get_current_user(request)
            short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
            log_to_assistant(
                owner,
                f"Started serving {short} on {remote or 'local'}",
                category="Serve",
            )
        except Exception as e:
            logger.debug("Could not log serve start to assistant: %s", e)

        return {"ok": True, "session_id": session_id, "remote": remote or "local",
                "endpoint_id": endpoint_id}

    # ── Server setup (install deps on remote) ──

    class SetupRequest(BaseModel):
        # Request body for POST /api/cookbook/setup.

        # SSH target to provision; the handler passes it through
        # _validate_remote_host and rejects an empty result with a 400.
        host: str
        # Optional SSH port as a string of 1-5 digits. None, "" or "22" means
        # no explicit -p flag is added to the ssh command.
        ssh_port: str | None = None

    @router.post("/api/cookbook/setup")
    async def server_setup(request: Request, req: SetupRequest):
        """Install required dependencies on a remote server via SSH.

        Detects the remote platform (Windows, Termux, or Linux), then runs a
        platform-specific one-liner that installs huggingface-hub (plus tmux
        where applicable) and prints ``OK`` when the import succeeds.

        Returns:
            ``{"ok": bool, "output": str, "platform": str}`` when the setup
            command ran (``ok`` is True when the combined output contains
            ``OK``), or ``{"ok": False, "error": str, "platform": str}`` on
            timeout or any other failure. ``platform`` is always one of
            ``"windows"``, ``"termux"`` or ``"linux"``.

        Raises:
            HTTPException: 400 when the host is missing or ``ssh_port`` is not
                1-5 digits.
        """
        require_admin(request)
        host = _validate_remote_host(req.host)
        if not host:
            raise HTTPException(400, "host is required")
        port = req.ssh_port
        if port is not None and port != "" and not re.fullmatch(r"\d{1,5}", port):
            raise HTTPException(400, "Invalid ssh_port")
        pf = f"-p {port} " if port and port != "22" else ""

        async def _run(shell_cmd: str, timeout: int) -> tuple[str, str]:
            """Run shell_cmd, returning decoded (stdout, stderr); kill it on timeout."""
            proc = await asyncio.create_subprocess_shell(
                shell_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            try:
                out_b, err_b = await asyncio.wait_for(proc.communicate(), timeout=timeout)
            except asyncio.TimeoutError:
                # Don't leave a hung ssh client behind after giving up on it.
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # already exited
                raise
            # errors="replace": remote output (e.g. Windows code pages) is not
            # guaranteed to be valid UTF-8.
            return out_b.decode(errors="replace"), err_b.decode(errors="replace")

        # Detect platform: Windows first (echo %OS% → Windows_NT), then Termux, then Linux
        platform = "linux"
        try:
            out, _ = await _run(f'ssh {pf}{host} "echo %OS%"', 10)
            if "Windows_NT" in out:
                platform = "windows"
            else:
                # Check for Termux
                out2, _ = await _run(
                    f"ssh {pf}{host} 'test -d /data/data/com.termux && echo termux || echo linux'",
                    10,
                )
                # Only trust the probe's own (last) output line, so login
                # banners or empty output can't leak into `platform`.
                probe_lines = out2.strip().splitlines()
                if probe_lines and probe_lines[-1].strip() == "termux":
                    platform = "termux"
        except Exception:
            # Detection is best-effort; fall back to the Linux setup path.
            platform = "linux"

        if platform == "windows":
            # Windows setup: ensure Python + pip + huggingface-hub via PowerShell
            # Also create the session directory for background tasks
            # NOTE(review): this script is double-quoted and passed through the
            # local shell; on a POSIX host `$env` and `$null` look subject to
            # local expansion before ssh sees them — verify on a real target.
            setup_script = (
                'powershell -Command "'
                "New-Item -ItemType Directory -Force -Path $env:TEMP\\odysseus-sessions | Out-Null; "
                "try { python --version } catch { Write-Host 'ERROR: Python not found — install from python.org'; exit 1 }; "
                "python -m pip install -q huggingface-hub 2>$null; "
                "python -c \\\"from huggingface_hub import snapshot_download; print('OK')\\\""
                '"'
            )
            cmd = f'ssh {pf}{host} {setup_script}'
        elif platform == "termux":
            setup_script = (
                "pkg install -y python tmux 2>/dev/null; "
                "pip install --no-deps -q huggingface-hub 2>/dev/null; "
                "pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null; "
                "python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
            )
            cmd = f"ssh {pf}{host} '{setup_script}'"
        else:
            # Linux: auto-install tmux (via whichever package manager is available)
            # and huggingface_hub + hf_transfer (falling back to --user/--break-system-packages
            # on PEP-668 locked distros like Arch / newer Debian).
            setup_script = (
                # Install tmux if missing — try common package managers; skip if no sudo
                "if ! command -v tmux >/dev/null 2>&1; then "
                "  if command -v apt-get >/dev/null 2>&1; then sudo -n apt-get install -y tmux 2>/dev/null; "
                "  elif command -v pacman >/dev/null 2>&1; then sudo -n pacman -S --noconfirm tmux 2>/dev/null; "
                "  elif command -v dnf >/dev/null 2>&1; then sudo -n dnf install -y tmux 2>/dev/null; "
                "  elif command -v apk >/dev/null 2>&1; then sudo -n apk add --no-interactive tmux 2>/dev/null; "
                "  elif command -v zypper >/dev/null 2>&1; then sudo -n zypper --non-interactive install tmux 2>/dev/null; "
                "  fi; "
                "fi; "
                "command -v tmux >/dev/null 2>&1 || echo 'WARNING: tmux missing and auto-install failed (need passwordless sudo). Install manually.'; "
                # Install Python bits. Try system install first; fall back to --user --break-system-packages on PEP 668 systems.
                "pip install -q huggingface_hub hf_transfer 2>/dev/null || "
                "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null || "
                "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null; "
                "python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
            )
            cmd = f"ssh {pf}{host} '{setup_script}'"

        try:
            out_text, err_text = await _run(cmd, 120)
            output = out_text + err_text
            # Every setup script ends by printing OK once huggingface_hub imports.
            ok = "OK" in output
            return {"ok": ok, "output": output.strip(), "platform": platform}
        except asyncio.TimeoutError:
            return {"ok": False, "error": "Setup timed out (120s)", "platform": platform}
        except Exception as e:
            return {"ok": False, "error": str(e), "platform": platform}

    # ── GPU availability probe ──

    async def _run_nvidia_smi(query: str, host: str | None, ssh_port: str | None, timeout: int = 8):
        """Run nvidia-smi locally or over SSH. Returns (stdout, error_or_None)."""
        if host:
            pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
            cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{query}'"
            proc = await asyncio.create_subprocess_shell(
                cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
        else:
            proc = await asyncio.create_subprocess_exec(
                *shlex.split(query),
                stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
            )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        except asyncio.TimeoutError:
            proc.kill()
            return None, "nvidia-smi timed out"
        if proc.returncode != 0:
            err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
            return None, err or "nvidia-smi failed"
        return stdout.decode("utf-8", errors="replace"), None

    async def _run_gpu_shell(cmd_text: str, host: str | None, ssh_port: str | None, timeout: int = 8):
        """Run a small GPU probe shell command locally or over SSH."""
        if host:
            pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
            quoted_cmd = shlex.quote(cmd_text)
            remote_cmd = (
                f"if command -v sh >/dev/null 2>&1; then sh -lc {quoted_cmd}; "
                f"elif command -v bash >/dev/null 2>&1; then bash -lc {quoted_cmd}; "
                f"elif command -v zsh >/dev/null 2>&1; then zsh -lc {quoted_cmd}; "
                "else echo 'No POSIX shell found for GPU probe' >&2; exit 127; fi"
            )
            cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} {shlex.quote(remote_cmd)}"
            proc = await asyncio.create_subprocess_shell(
                cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
        else:
            proc = await asyncio.create_subprocess_shell(
                cmd_text, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        except asyncio.TimeoutError:
            proc.kill()
            return None, "GPU probe timed out"
        if proc.returncode != 0:
            err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
            return None, err or f"GPU probe failed ({proc.returncode})"
        return stdout.decode("utf-8", errors="replace"), None

    async def _gpu_read_file(path: str, host: str | None, ssh_port: str | None) -> str | None:
        """Return the stripped contents of ``path`` (read with ``cat``), or None if the read fails."""
        read_cmd = f"cat {shlex.quote(path)} 2>/dev/null"
        contents, failure = await _run_gpu_shell(read_cmd, host, ssh_port, timeout=4)
        if failure is None and contents is not None:
            return contents.strip()
        return None

    async def _probe_gpu_device_processes(host: str | None, ssh_port: str | None) -> list[dict]:
        """List processes holding ``/dev/kfd`` or ``/dev/dri/renderD*`` open.

        PIDs are collected with ``lsof`` and/or ``fuser`` (whichever exists),
        then their command names are resolved with a single batched ``ps``
        call rather than one probe (and one SSH round trip) per PID.

        Returns:
            A list of ``{"pid": int, "name": str, "used_mb": 0}`` dicts in
            ascending PID order; empty when the probe fails or finds nothing.
            ``name`` falls back to ``"process"`` when ``ps`` has no entry for
            the PID, and is truncated to 80 characters.
        """
        pid_cmd = (
            "{ command -v lsof >/dev/null 2>&1 && "
            "lsof -w -t /dev/kfd /dev/dri/renderD* 2>/dev/null || true; "
            "command -v fuser >/dev/null 2>&1 && "
            "fuser /dev/kfd /dev/dri/renderD* 2>/dev/null || true; } "
            "| tr ' ' '\\n' | sed '/^[0-9][0-9]*$/!d' | sort -n -u"
        )
        out, err = await _run_gpu_shell(pid_cmd, host, ssh_port, timeout=5)
        if err is not None or not out:
            return []

        # Parse and de-duplicate PIDs, preserving the probe's (sorted) order.
        pids: list[int] = []
        seen = set()
        for raw in out.splitlines():
            try:
                pid = int(raw.strip())
            except ValueError:
                continue
            if pid in seen:
                continue
            seen.add(pid)
            pids.append(pid)
        if not pids:
            return []

        # One ps call for all PIDs. `|| true` keeps partial output usable if
        # ps exits non-zero because some PIDs have already gone away.
        pid_list = ",".join(str(pid) for pid in pids)
        ps_out, _ = await _run_gpu_shell(
            f"ps -o pid= -o comm= -p {pid_list} 2>/dev/null || true",
            host, ssh_port, timeout=5,
        )
        names: dict[int, str] = {}
        for line in (ps_out or "").splitlines():
            parts = line.split(None, 1)
            if len(parts) == 2 and parts[0].isdigit():
                names.setdefault(int(parts[0]), parts[1].strip())

        return [
            {"pid": pid, "name": (names.get(pid) or "process")[:80], "used_mb": 0}
            for pid in pids
        ]

    async def _probe_amd_sysfs(host: str | None, ssh_port: str | None) -> list[dict]:
        """Enumerate AMD GPUs by reading ``/sys/class/drm/card*/device`` attributes.

        Only ``cardN`` entries (no ``-`` in the name) whose vendor id is
        ``0x1002`` are reported. Memory totals come from the VRAM / visible-VRAM
        counters, falling back to GTT when both are zero. Any device-holding
        processes found are attached to the first GPU, which is then marked busy.

        Returns:
            A list of GPU dicts (index, name, uuid, free/total/used MB, busy,
            processes, backend, source, unified_memory); empty when the sysfs
            listing fails or no matching card is found.
        """
        listing, failure = await _run_gpu_shell("ls -1 /sys/class/drm 2>/dev/null", host, ssh_port, timeout=4)
        if failure is not None or not listing:
            return []

        mib = 1024 * 1024
        found: list[dict] = []

        for card in listing.split():
            # Keep plain cardN nodes; connector entries contain a dash.
            if "-" in card or not card.startswith("card"):
                continue
            dev_dir = f"/sys/class/drm/{card}/device"

            async def _read_int(attr: str) -> int:
                """Read a sysfs attribute as a non-negative int (0 when missing or non-numeric)."""
                text = await _gpu_read_file(f"{dev_dir}/{attr}", host, ssh_port)
                return int(text) if text and text.isdigit() else 0

            if await _gpu_read_file(f"{dev_dir}/vendor", host, ssh_port) != "0x1002":
                continue

            vram_total = await _read_int("mem_info_vram_total")
            visible_total = await _read_int("mem_info_vis_vram_total")
            gtt_total = await _read_int("mem_info_gtt_total")

            visible_covers_vram = bool(visible_total and visible_total >= vram_total)
            capacity = max(vram_total, visible_total)
            if capacity > 0:
                usage_attr = "mem_info_vis_vram_used" if visible_covers_vram else "mem_info_vram_used"
                is_unified = visible_covers_vram
            else:
                # No VRAM counters: fall back to the GTT pool.
                capacity = gtt_total
                usage_attr = "mem_info_gtt_used"
                is_unified = True
            if capacity <= 0:
                continue

            in_use = await _read_int(usage_attr)

            label = await _gpu_read_file(f"{dev_dir}/product_name", host, ssh_port)
            if not label:
                device_id = await _gpu_read_file(f"{dev_dir}/device", host, ssh_port)
                label = f"AMD GPU {device_id or card}"

            total_mb = max(0, int(capacity / mib))
            used_mb = max(0, min(total_mb, int(in_use / mib)))
            free_mb = max(0, total_mb - used_mb)
            is_busy = bool(total_mb and (free_mb / total_mb) < 0.85)

            found.append({
                "index": len(found), "name": label, "uuid": card,
                "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
                "util_pct": 0, "busy": is_busy,
                "processes": [], "backend": "rocm", "source": "amd-sysfs",
                "unified_memory": is_unified,
            })

        if not found:
            return found

        holders = await _probe_gpu_device_processes(host, ssh_port)
        if holders:
            found[0]["processes"] = holders
            found[0]["busy"] = True
        return found

    @router.get("/api/cookbook/gpus")
    async def list_gpus(request: Request, host: str | None = None, ssh_port: str | None = None):
        """Probe GPU memory/process state locally or via SSH.

        Probe order:
            1. NVIDIA via nvidia-smi
            2. AMD/ROCm and unified-memory APUs via /sys/class/drm
            3. Generic GPU device holders via /dev/kfd and /dev/dri/renderD*

        Returned shape:
            { "ok": True, "gpus": [
                {"index": 0, "name": "...", "free_mb": int, "total_mb": int,
                 "used_mb": int, "util_pct": int, "busy": bool,
                 "uuid": "GPU-...",
                 "processes": [{"pid": int, "name": str, "used_mb": int}, ...]
                }, ...
            ]}
        On the NVIDIA path `busy` is True when free_mb/total_mb < 0.5; the
        fallback probes set `busy` by their own rules.  When every probe
        comes back empty the response is {"ok": False, "error": ..., "gpus": []}.

        Raises:
            HTTPException(400): `ssh_port` is present but not a valid port.
            Whatever `require_admin` / `_validate_remote_host` raise.
        """
        require_admin(request)
        host = _validate_remote_host(host)
        if ssh_port is not None and ssh_port != "" and not _SSH_PORT_RE.fullmatch(ssh_port):
            raise HTTPException(400, "Invalid ssh_port")

        gpu_query = "nvidia-smi --query-gpu=index,name,memory.free,memory.total,memory.used,utilization.gpu,uuid --format=csv,noheader,nounits"
        nvidia_error = None
        gpu_out = ""
        try:
            out, err = await _run_nvidia_smi(gpu_query, host, ssh_port)
            if err is not None:
                nvidia_error = err
            else:
                gpu_out = out or ""
        except FileNotFoundError:
            nvidia_error = "nvidia-smi not found"
        except Exception as e:
            nvidia_error = str(e)[:200]

        gpus = []
        gpus_by_uuid: dict[str, dict] = {}
        for line in gpu_out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 7:
                continue
            try:
                idx = int(parts[0])
                name = parts[1]
                free_mb = int(float(parts[2]))
                total_mb = int(float(parts[3]))
                used_mb = int(float(parts[4]))
                util_pct = int(float(parts[5]))
                gpu_uuid = parts[6]
            except (ValueError, OverflowError, IndexError):
                # Rows with "[N/A]" / non-finite figures are not usable.
                continue
            gpu = {
                "index": idx, "name": name, "uuid": gpu_uuid,
                "free_mb": free_mb, "total_mb": total_mb,
                "used_mb": used_mb, "util_pct": util_pct,
                "busy": total_mb > 0 and (free_mb / total_mb) < 0.5,
                "processes": [],
            }
            gpus_by_uuid[gpu_uuid] = gpu
            gpus.append(gpu)

        if gpus:
            # Best-effort process listing. Only attempted once NVIDIA GPUs were
            # actually found: without them there is nothing to attach the
            # processes to, and a second failing nvidia-smi/SSH round-trip
            # would only delay the AMD fallback below.
            proc_query = "nvidia-smi --query-compute-apps=pid,gpu_uuid,process_name,used_memory --format=csv,noheader,nounits"
            proc_out = ""
            try:
                out, proc_err = await _run_nvidia_smi(proc_query, host, ssh_port, timeout=5)
                if proc_err is None and out:
                    proc_out = out
            except Exception as e:
                logger.debug("nvidia-smi process query failed: %s", e)
            for line in proc_out.strip().splitlines():
                parts = [p.strip() for p in line.split(",")]
                if len(parts) < 4:
                    continue
                try:
                    pid = int(parts[0])
                except ValueError:
                    continue
                try:
                    pmem = int(float(parts[3]))
                except (ValueError, OverflowError):
                    # nvidia-smi prints "[N/A]" when it cannot attribute memory
                    # to a process; still list the holder, with 0 MB.
                    pmem = 0
                gpu = gpus_by_uuid.get(parts[1])
                if gpu is None:
                    continue
                gpu["processes"].append({
                    "pid": pid, "name": parts[2], "used_mb": pmem,
                })
            return {"ok": True, "gpus": gpus, "backend": "cuda", "source": "nvidia-smi"}

        amd_gpus = await _probe_amd_sysfs(host, ssh_port)
        if amd_gpus:
            return {
                "ok": True,
                "gpus": amd_gpus,
                "backend": "rocm",
                "source": "amd-sysfs",
                "fallback_from": "nvidia-smi",
                "nvidia_error": nvidia_error,
            }

        processes = await _probe_gpu_device_processes(host, ssh_port)
        if processes:
            return {
                "ok": True,
                "gpus": [{
                    "index": 0, "name": "GPU device holders", "uuid": "dev-dri",
                    "free_mb": 0, "total_mb": 0, "used_mb": 0, "util_pct": 0,
                    "busy": True, "processes": processes,
                    "backend": "generic", "source": "gpu-devices",
                }],
                "backend": "generic",
                "source": "gpu-devices",
                "fallback_from": "nvidia-smi",
                "nvidia_error": nvidia_error,
            }

        return {"ok": False, "error": nvidia_error or "No GPU memory probe available", "gpus": []}

    class KillPidRequest(BaseModel):
        """Request body for POST /api/cookbook/kill-pid."""
        # PID to signal; the handler refuses values below 100.
        pid: int
        # Remote host to run the kill on over SSH; empty/None means run locally.
        host: str | None = None
        # SSH port as a string; only consulted when `host` is set.
        ssh_port: str | None = None
        signal: str = "TERM"  # TERM (graceful), KILL (force) or INT; upper-cased by the handler

    @router.post("/api/cookbook/kill-pid")
    async def kill_pid(request: Request, req: KillPidRequest):
        """Kill a PID that's holding GPU memory.

        Admin-gated. Validates that the signal is TERM, KILL or INT and
        forbids low PIDs (<100) to avoid accidentally signalling init/system
        daemons. Runs `kill -<sig> <pid>` locally, or as the remote command of
        an `ssh` invocation when `host` is set.

        Returns:
            {"ok": True, "pid": ..., "signal": ...} on success, otherwise
            {"ok": False, "error": ...} (non-zero exit, timeout, spawn failure).

        Raises:
            HTTPException(400): low PID, unknown signal or malformed ssh_port.
            Whatever `require_admin` / `_validate_remote_host` raise.
        """
        require_admin(request)
        if req.pid < 100:
            raise HTTPException(400, f"Refusing to signal PID {req.pid} (<100, likely system process)")
        sig = (req.signal or "TERM").upper()
        if sig not in ("TERM", "KILL", "INT"):
            raise HTTPException(400, "signal must be TERM, KILL, or INT")
        host = _validate_remote_host(req.host)
        if req.ssh_port and not _SSH_PORT_RE.fullmatch(req.ssh_port):
            raise HTTPException(400, "Invalid ssh_port")

        # Build an argv list for both paths so no local shell ever parses the
        # host/port values; ssh still hands the final argument to the remote
        # shell as the command to run.
        if host:
            argv = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no"]
            if req.ssh_port and req.ssh_port != "22":
                argv += ["-p", req.ssh_port]
            argv += [host, f"kill -{sig} {req.pid}"]
        else:
            argv = ["kill", f"-{sig}", str(req.pid)]

        try:
            proc = await asyncio.create_subprocess_exec(
                *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
            )
            try:
                _stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=5)
            except asyncio.TimeoutError:
                # wait_for only cancels communicate(); the child itself keeps
                # running unless we terminate and reap it here.
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited between the timeout and the kill
                await proc.wait()
                return {"ok": False, "error": "kill command timed out"}
            if proc.returncode != 0:
                err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
                return {"ok": False, "error": err or f"kill returned {proc.returncode}"}
            return {"ok": True, "pid": req.pid, "signal": sig}
        except Exception as e:
            return {"ok": False, "error": str(e)[:200]}

    # ── Cookbook state persistence (cross-device sync) ──

    @router.get("/api/cookbook/state")
    async def get_cookbook_state(request: Request):
        """Return the persisted cookbook state, filtered for the client.

        Admin-gated. Yields an empty dict when no state file exists or when
        reading, parsing or filtering it fails for any reason.
        """
        require_admin(request)
        if not _cookbook_state_path.exists():
            return {}
        try:
            saved = json.loads(_cookbook_state_path.read_text())
            return _state_for_client(saved)
        except Exception:
            # A corrupt or unreadable file is treated the same as "no state".
            return {}

    @router.post("/api/cookbook/state")
    async def save_cookbook_state(request: Request):
        """Save cookbook state for cross-device sync.

        Admin-gated because cookbook state is read back into shell-quoting
        contexts when polling tmux session status (see status handler).

        Merge guard: the UI debounces a `_syncToServer` POST every few
        seconds with whatever localStorage has. The agent's tool layer
        writes server-side tasks (e.g. `download_model` registering a
        task). Without a merge, every UI sync wipes the agent's recent
        additions. We preserve any on-disk task that the incoming body
        omits but whose `ts` lies within the last RACE_WINDOW_MS
        milliseconds (60 s) — that's a race, not an intentional delete.

        Returns:
            {"ok": True, "preserved": <count>} on success, or
            {"ok": False, "error": <message>} when anything fails; the
            failure is also written to the server log.
        """
        require_admin(request)
        RACE_WINDOW_MS = 60_000
        try:
            import time as _t
            from core.atomic_io import atomic_write_json
            data = await request.json()
            if not isinstance(data, dict):
                data = {}
            try:
                if _cookbook_state_path.exists():
                    on_disk = json.loads(_cookbook_state_path.read_text())
                else:
                    on_disk = {}
            except Exception:
                on_disk = {}
            # Dict view of the stored state for the guards below; the raw value
            # is still what gets handed to _state_for_storage.
            disk_state = on_disk if isinstance(on_disk, dict) else {}

            # Anti-wipe guard for env servers. The UI debounces a
            # sync of whatever is in memory; if it fires before the state has
            # hydrated from GET /state (a load-time race) or during a render
            # glitch, `env.servers` would be empty and silently overwrite the
            # saved servers on disk. Never let an empty/absent incoming
            # env.servers clobber a populated on-disk one — preserve the disk
            # values while still accepting the rest of the incoming env.
            disk_env = disk_state.get("env") if isinstance(disk_state.get("env"), dict) else None
            if disk_env:
                inc_env = data.get("env") if isinstance(data.get("env"), dict) else None
                if inc_env is None:
                    data["env"] = disk_env
                    logger.warning("cookbook state POST: incoming body had no env; preserved on-disk env (anti-wipe guard)")
                elif disk_env.get("servers") and not inc_env.get("servers"):
                    inc_env["servers"] = disk_env["servers"]
                    logger.warning("cookbook state POST: incoming env.servers empty; preserved on-disk servers (anti-wipe guard)")

            disk_tasks = disk_state.get("tasks")
            if not isinstance(disk_tasks, list):
                # Only list-shaped task stores take part in the race guard; any
                # other shape must not be able to break the save.
                disk_tasks = []
            incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
            incoming_ids = {t.get("sessionId") for t in incoming_tasks if isinstance(t, dict) and t.get("sessionId")}
            now_ms = int(_t.time() * 1000)
            preserved = []
            for t in disk_tasks:
                if not isinstance(t, dict):
                    continue
                sid = t.get("sessionId")
                if not sid or sid in incoming_ids:
                    continue  # client's version wins
                ts = t.get("ts") or 0
                if isinstance(ts, (int, float)) and (now_ms - ts) <= RACE_WINDOW_MS:
                    preserved.append(t)
            if preserved:
                logger.info(f"cookbook state POST: preserving {len(preserved)} recent task(s) "
                            f"not in incoming body (race guard): "
                            f"{[t.get('sessionId') for t in preserved]}")
                data["tasks"] = incoming_tasks + preserved
            atomic_write_json(str(_cookbook_state_path), _state_for_storage(data, on_disk), indent=2)
            return {"ok": True, "preserved": len(preserved)}
        except Exception as e:
            # The client gets the message; keep a server-side trace as well so
            # a failing sync is visible without access to the browser.
            logger.exception("cookbook state POST failed")
            return {"ok": False, "error": str(e)}

    @router.get("/api/cookbook/hf-latest")
    async def hf_latest(vram_gb: float = 0, limit: int = 10, pipeline: str = "text-generation", owner: str = Depends(require_user)):
        """Fetch latest HuggingFace models, filtered by what fits in available VRAM.

        vram_gb: total available VRAM in GB. 0 = no filter (return everything).
        limit:   how many models to return (default 10). Values <= 0 yield an
                 empty list without contacting the HF API.
        pipeline: HF pipeline_tag filter (text-generation, text-to-image, etc.).

        Returns {"models": [...]}; on an upstream failure the list is empty
        and an "error" key carries the reason.
        """
        import httpx

        if limit <= 0:
            return {"models": []}

        # Fetch a larger pool so we have enough to filter from (we drop ~80%)
        pool_size = max(limit * 15, 100)
        try:
            async with httpx.AsyncClient(timeout=15) as client:
                # Pass the query as `params` so httpx percent-encodes the
                # caller-supplied `pipeline` instead of letting it inject
                # extra query parameters into the upstream URL.
                resp = await client.get(
                    "https://huggingface.co/api/models",
                    params={
                        "sort": "trendingScore",
                        "direction": "-1",
                        "limit": pool_size,
                        "filter": pipeline,
                    },
                )
                if resp.status_code != 200:
                    return {"models": [], "error": f"HF API HTTP {resp.status_code}"}
                raw = resp.json()
        except Exception as e:
            return {"models": [], "error": str(e)}
        if not isinstance(raw, list):
            return {"models": [], "error": "Unexpected HF API response"}

        # Estimate VRAM from the model id. Looks for patterns like "7B", "70B", "1.5B" etc.
        # Returns approx VRAM in GB at fp16 (params*2). Caller adjusts for quant.
        def _est_vram_fp16(repo_id: str) -> float | None:
            m = re.search(r'[-_/](\d+(?:\.\d+)?)\s*[Bb](?![a-zA-Z])', repo_id)
            if not m:
                return None
            return float(m.group(1)) * 2.0  # fp16 baseline

        FOUR_BIT_MARKERS = ("fp4", "nf4", "int4", "4bit", "q4", "awq", "gptq")
        EIGHT_BIT_MARKERS = ("int8", "8bit", "q8", "fp8")

        # Detect quantization from repo_id / tags. Returns a multiplier on fp16 size.
        def _quant_factor(repo_id: str, tags: list) -> float:
            text = (repo_id + " " + " ".join(tags)).lower()
            if any(marker in text for marker in FOUR_BIT_MARKERS):
                return 0.25
            if any(marker in text for marker in EIGHT_BIT_MARKERS):
                return 0.5
            return 1.0  # bf16/fp16 or unknown: assume fp16

        # Exclude adapters, LoRAs, datasets, GGUF-only repos, and other non-runnable artifacts
        EXCLUDE_TAG_SUBSTRINGS = (
            "lora", "adapter", "peft", "qlora",
            "dataset", "embeddings",
            "merge", "control-lora",
            "diffusion-lora", "stable-diffusion-lora",
            "text-classification", "token-classification",
            "feature-extraction", "sentence-similarity",
        )
        EXCLUDE_NAME_SUBSTRINGS = (
            "lora", "adapter", "peft", "qlora",
            "embedding", "embed-",
            "dataset",
        )

        def _is_excluded(repo_id: str, tags: list) -> bool:
            name_text = repo_id.lower()
            if any(s in name_text for s in EXCLUDE_NAME_SUBSTRINGS):
                return True
            tag_text = " ".join(t.lower() for t in tags)
            return any(s in tag_text for s in EXCLUDE_TAG_SUBSTRINGS)

        out = []
        for entry in raw:
            if not isinstance(entry, dict):
                continue
            repo_id = entry.get("modelId") or entry.get("id") or ""
            if not repo_id or not isinstance(repo_id, str):
                continue
            raw_tags = entry.get("tags")
            # Keep only string tags so the text matching below cannot raise.
            tags = [t for t in raw_tags if isinstance(t, str)] if isinstance(raw_tags, list) else []
            pipeline_tag = entry.get("pipeline_tag") or ""

            # Hard filter: only the requested pipeline (HF's filter param is loose)
            if pipeline and pipeline_tag and pipeline_tag != pipeline:
                continue
            # Skip adapters, LoRAs, datasets, etc.
            if _is_excluded(repo_id, tags):
                continue

            est_fp16 = _est_vram_fp16(repo_id)
            quant_mult = _quant_factor(repo_id, tags)
            est_vram = (est_fp16 * quant_mult) if est_fp16 else None
            # Add 30% headroom for KV cache, activations, etc.
            needed_vram = (est_vram * 1.3) if est_vram else None

            if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb:
                continue
            # Skip if no size info — without a size we can't tell if it's a real
            # full-weight model or a tiny adapter, so we'd rather drop it
            if est_vram is None:
                continue

            out.append({
                "repo_id": repo_id,
                "downloads": entry.get("downloads", 0),
                "likes": entry.get("likes", 0),
                "createdAt": entry.get("createdAt", ""),
                "tags": tags[:5],  # trim
                "pipeline_tag": pipeline_tag,
                "est_vram_gb": round(est_vram, 1) if est_vram else None,
                "needed_vram_gb": round(needed_vram, 1) if needed_vram else None,
            })
            if len(out) >= limit:
                break

        return {"models": out}

    @router.get("/api/cookbook/tasks/status")
    async def cookbook_tasks_status(request: Request):
        """Report the status of every saved cookbook tmux session.

        The probing is synchronous (it shells out with subprocess.run), so it
        is handed to a worker thread through asyncio.to_thread; running it
        directly in this coroutine would block the event loop and stall every
        other request while the subprocesses finish."""
        require_admin(request)
        status_report = await asyncio.to_thread(_cookbook_tasks_status_sync)
        return status_report

    def _cookbook_tasks_status_sync():
        """Poll every task saved in the cookbook state file and classify it.

        Blocking: uses subprocess.run (tmux locally, ssh for remote hosts,
        PowerShell over ssh for remote Windows hosts), so it is meant to run
        in a worker thread. For each task it checks whether the session is
        alive, captures recent output, and derives a status string from that
        output.

        Returns:
            {"tasks": [ {session_id, type, model, status, progress, phase,
            diagnosis, output_tail, cmd, tps, reqs, pct, remote}, ... ]}.
            Malformed task entries and entries whose session id / host / port
            fail validation are skipped.
        """
        import subprocess

        # Load saved tasks from cookbook state
        tasks = []
        if _cookbook_state_path.exists():
            try:
                state = json.loads(_cookbook_state_path.read_text())
                saved_tasks = state.get("tasks", [])
                if isinstance(saved_tasks, list):
                    tasks = saved_tasks
                elif isinstance(saved_tasks, dict):
                    tasks = list(saved_tasks.values())
            except Exception:
                # Unreadable state simply means there is nothing to poll.
                logger.debug("Could not read cookbook state for status poll", exc_info=True)

        results = []
        for task in tasks:
            # One malformed entry must not take down the whole poll.
            if not isinstance(task, dict):
                continue
            session_id = task.get("sessionId", "")
            if not session_id or not isinstance(session_id, str):
                continue
            remote = task.get("remoteHost", "")
            task_type = task.get("type", "download")  # "download" or "serve"
            # Field name varies depending on whether the task was added
            # via the download flow (`repoId`), the serve flow (`modelId`),
            # or the UI-side serve preset (which uses `name` + `payload.repo_id`).
            _payload = task.get("payload") or {}
            if not isinstance(_payload, dict):
                _payload = {}
            model = (
                task.get("modelId")
                or task.get("repoId")
                or task.get("name")
                or _payload.get("repo_id")
                or _payload.get("modelId")
                or ""
            )
            if not isinstance(model, str):
                model = str(model)
            task_platform = task.get("platform", "")
            # Normalised to a string so an int 22 and "22" behave the same.
            _tport = str(task.get("sshPort") or "")

            # Defense-in-depth: cookbook state is admin-writable but the values
            # land in shell-interpolated commands below. Reject anything that
            # isn't a benign session-id / hostname / port.
            if not _SESSION_ID_RE.match(session_id):
                logger.warning(f"Skipping task with unsafe session_id: {session_id!r}")
                continue
            if remote and (not isinstance(remote, str) or not _REMOTE_HOST_RE.match(remote)):
                logger.warning(f"Skipping task with unsafe remoteHost: {remote!r}")
                continue
            if _tport and not _SSH_PORT_RE.match(_tport):
                logger.warning(f"Skipping task with unsafe sshPort: {_tport!r}")
                continue

            ssh_base = ["ssh"]
            if _tport and _tport != "22":
                ssh_base.extend(["-p", _tport])

            # Build the "is it alive" and "capture output" commands.
            if task_platform == "windows" and remote:
                # Windows: check PID file + Get-Process, read log tail
                sd = "$env:TEMP\\odysseus-sessions"
                # Two pitfalls this script has to avoid:
                #  * only the first fragment is an f-string, so the braces in
                #    the plain fragments must be single -- doubled braces reach
                #    PowerShell as a nested script block that never executes;
                #  * `$pid` is PowerShell's read-only automatic variable (the
                #    shell's own process id), so the value read from the pid
                #    file needs a different variable name.
                check_script = (
                    f"$odyPid = Get-Content \"{sd}\\{session_id}.pid\" -ErrorAction SilentlyContinue; "
                    "if ($odyPid) { Get-Process -Id $odyPid -ErrorAction SilentlyContinue | Out-Null; "
                    "if ($?) { exit 0 } else { exit 1 } } else { exit 1 }"
                )
                check_cmd = ssh_base + [remote, "powershell", "-Command", check_script]
                capture_cmd = ssh_base + [
                    remote,
                    "powershell",
                    "-Command",
                    f"Get-Content \"{sd}\\{session_id}.log\" -Tail 10 -ErrorAction SilentlyContinue",
                ]
            elif remote:
                check_cmd = ssh_base + [remote, "tmux", "has-session", "-t", session_id]
                capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
            else:
                check_cmd = ["tmux", "has-session", "-t", session_id]
                capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]

            try:
                alive = subprocess.run(check_cmd, timeout=10, capture_output=True)
                is_alive = alive.returncode == 0
            except Exception:
                is_alive = False

            # Capture last lines for progress. Prefer the "Downloading" line
            # (real aggregate bytes) over "Fetching N files" (whole-file count that
            # lags with hf_transfer). Falls back to the true last line otherwise.
            progress_text = ""
            full_snapshot = ""
            if is_alive:
                try:
                    cap = subprocess.run(capture_cmd, timeout=10, capture_output=True, text=True)
                    if cap.returncode == 0:
                        full_snapshot = cap.stdout.strip()
                        lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
                        downloading_lines = [l for l in lines if l.startswith("Downloading")]
                        if downloading_lines:
                            progress_text = downloading_lines[-1]
                        elif lines:
                            progress_text = lines[-1]
                except Exception:
                    pass  # best-effort: an empty snapshot just means "running"

            # Determine status
            status = "unknown"
            if is_alive:
                lower = full_snapshot.lower()
                has_exit = "=== process exited with code" in lower
                has_error = "error" in lower or "failed" in lower or "traceback" in lower
                started = "application startup complete" in lower
                if has_exit and task_type == "serve":
                    # Serve tasks that exit are always errors — they should run indefinitely
                    status = "error"
                elif has_exit and "unrecognized arguments" in lower:
                    status = "error"
                elif has_error and not started:
                    status = "error"
                elif task_type == "download" and ("100%" in full_snapshot or "DOWNLOAD_OK" in full_snapshot):
                    # Only download tasks treat 100% as "completed".
                    # Serve tasks log 100%|██████| during inference progress
                    # (diffusion sampling, etc.) — that's "running", not done.
                    status = "completed"
                elif started:
                    status = "ready"
                else:
                    status = "running"
            else:
                # Session is gone; no output is captured in that case, so a
                # clean finish and a crash both report as "stopped".
                status = "stopped"

            # Parse structured phase info — single source of truth for the UI
            phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
            if phase_info.get("status") == "ready":
                status = "ready"
            serve_phase = phase_info.get("phase", "")
            diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
            if diagnosis and status in {"running", "unknown", "stopped"}:
                status = "error"
            output_tail = "\n".join(full_snapshot.splitlines()[-12:]) if full_snapshot else ""

            results.append({
                "session_id": session_id,
                "type": task_type,
                "model": model.split("/")[-1] if "/" in model else model,
                "status": status,
                "progress": serve_phase if task_type == "serve" else progress_text[:120],
                "phase": serve_phase,
                "diagnosis": diagnosis,
                "output_tail": output_tail,
                "cmd": _payload.get("_cmd") or "",
                "tps": phase_info.get("tps"),
                "reqs": phase_info.get("reqs"),
                "pct": phase_info.get("pct"),
                "remote": remote or "local",
            })

        return {"tasks": results}

    return router