1729 lines
88 KiB
Python
1729 lines
88 KiB
Python
"""Cookbook routes — model download, serve, cache scanning, and cookbook state sync."""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import shlex
|
|
import shutil
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, HTTPException, Request, Depends
|
|
|
|
from src.auth_helpers import require_user
|
|
from pydantic import BaseModel
|
|
|
|
from core.middleware import require_admin
|
|
from routes.shell_routes import TMUX_LOG_DIR
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
from routes.cookbook_helpers import (
|
|
_SSH_PORT_RE, _REMOTE_HOST_RE, _SESSION_ID_RE,
|
|
_validate_repo_id, _validate_include, _validate_remote_host, _validate_token,
|
|
_validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
|
|
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
|
|
_safe_env_prefix,
|
|
ModelDownloadRequest, ServeRequest,
|
|
)
|
|
|
|
# Bash one-liner added to download runner scripts. It reports whether HF_TOKEN
# is set in the runner's environment; the token value itself is never echoed.
_HF_TOKEN_STATUS_SNIPPET = "".join([
    'if [ -n "$HF_TOKEN" ]; then ',
    'echo "[odysseus] HF token: applied"; ',
    'else ',
    'echo "[odysseus] HF token: NOT SET — gated/private models will be denied. ',
    'Add one in Odysseus Settings -> Cookbook -> HuggingFace Token."; ',
    'fi',
])
|
|
|
|
def setup_cookbook_routes() -> APIRouter:
|
|
router = APIRouter(tags=["cookbook"])
|
|
_cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
|
|
|
|
def _mask_secret(value: str) -> str:
    """Return a display-safe stand-in for a secret.

    Empty values give ``""``. Values of eight characters or fewer give the
    fixed word ``"stored"``. Longer values keep only their first and last
    four characters, joined by ``...``.
    """
    if value and len(value) > 8:
        return value[:4] + "..." + value[-4:]
    return "stored" if value else ""
|
|
|
|
def _decrypt_secret(value: str | None) -> str:
|
|
if not value:
|
|
return ""
|
|
from src.secret_storage import decrypt
|
|
return decrypt(value)
|
|
|
|
def _encrypt_secret(value: str) -> str:
    """Encrypt ``value`` with ``src.secret_storage.encrypt`` and return the result."""
    from src.secret_storage import encrypt as _encrypt

    ciphertext = _encrypt(value)
    return ciphertext
|
|
|
|
def _strip_task_secrets(state):
    """Remove ``hf_token`` from every task payload in ``state``, in place.

    ``state`` is returned unchanged when it is not a dict or when its
    ``"tasks"`` entry is not a list. Tasks that are not dicts, or whose
    ``"payload"`` is not a dict, are skipped. The same ``state`` object is
    returned.
    """
    if not isinstance(state, dict):
        return state
    task_list = state.get("tasks")
    if not isinstance(task_list, list):
        return state
    for entry in task_list:
        if not isinstance(entry, dict):
            continue
        payload = entry.get("payload")
        if isinstance(payload, dict):
            payload.pop("hf_token", None)
    return state
|
|
|
|
def _diagnose_serve_output(text: str) -> dict | None:
|
|
"""Server-side mirror of the Cookbook UI's common serve diagnoses.
|
|
|
|
The browser uses cookbook-diagnosis.js for clickable fixes. This gives
|
|
the agent/tool path the same structured signal so it can retry with an
|
|
adjusted command instead of guessing from raw tmux output.
|
|
"""
|
|
if not text:
|
|
return None
|
|
tail = text[-6000:]
|
|
patterns = [
|
|
(
|
|
r"No available memory for the cache blocks|Available KV cache memory:.*-",
|
|
"No GPU memory left for KV cache after loading model.",
|
|
[
|
|
{"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
|
|
{"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
|
|
],
|
|
),
|
|
(
|
|
r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
|
|
"GPU ran out of memory during startup or warmup.",
|
|
[
|
|
{"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
|
|
{"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
|
|
{"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
|
|
],
|
|
),
|
|
(
|
|
r"not divisib|must be divisible|attention heads.*divisible",
|
|
"Tensor parallel size is incompatible with the model.",
|
|
[
|
|
{"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
|
|
{"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
|
|
],
|
|
),
|
|
(
|
|
r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
|
|
"Context length is too large for available GPU memory.",
|
|
[
|
|
{"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
|
|
{"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
|
|
],
|
|
),
|
|
(
|
|
r"enable-auto-tool-choice requires --tool-call-parser",
|
|
"Auto tool choice requires an explicit tool call parser.",
|
|
[{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
|
|
),
|
|
(
|
|
r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
|
|
"Model requires custom code or newer model support.",
|
|
[{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
|
|
),
|
|
(
|
|
r"Address already in use|bind.*address.*in use",
|
|
"Port is already in use.",
|
|
[{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
|
|
),
|
|
(
|
|
r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
|
|
"No GPUs are visible to the serve process.",
|
|
[{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
|
|
),
|
|
(
|
|
r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
|
|
"vLLM is not installed or not in PATH on this server.",
|
|
[{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
|
|
),
|
|
(
|
|
r"sglang.*command not found|No module named sglang|SGLang is not installed",
|
|
"SGLang is not installed or not in PATH on this server.",
|
|
[{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
|
|
),
|
|
(
|
|
r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
|
|
"llama.cpp / llama-cpp-python dependencies are missing.",
|
|
[{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
|
|
),
|
|
(
|
|
r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
|
|
"Model access is gated or unauthorized.",
|
|
[{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
|
|
),
|
|
]
|
|
for pattern, message, suggestions in patterns:
|
|
if re.search(pattern, tail, re.I):
|
|
return {"message": message, "suggestions": suggestions}
|
|
if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
|
|
r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
|
|
):
|
|
return {
|
|
"message": "Python traceback detected during serve startup.",
|
|
"suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
|
|
}
|
|
return None
|
|
|
|
def _state_for_client(state):
    """Return cookbook state with raw secrets removed, for browser clients.

    Task payload tokens are stripped. When ``state["env"]`` is a dict, its
    ``hfToken`` entry is replaced by ``hfTokenConfigured`` (bool) and
    ``hfTokenMasked`` (masked string). ``state`` is modified in place and
    returned.
    """
    _strip_task_secrets(state)
    if not isinstance(state, dict):
        return state
    env = state.get("env")
    if not isinstance(env, dict):
        return state
    # Decrypt before removing the key, so a decryption error leaves env untouched.
    plaintext = _decrypt_secret(env.get("hfToken"))
    env.pop("hfToken", None)
    env["hfTokenConfigured"] = bool(plaintext)
    env["hfTokenMasked"] = _mask_secret(plaintext)
    return state
|
|
|
|
def _state_for_storage(state, on_disk=None):
    """Prepare cookbook state for writing to disk, encrypting its secrets.

    Task payload tokens are stripped. For ``state["env"]``:

    * a non-empty incoming ``hfToken`` is validated and stored encrypted;
    * otherwise the ``hfToken`` already present in ``on_disk["env"]`` is
      carried over as-is;
    * otherwise the key is removed.

    The client-only keys ``hfTokenMasked`` and ``hfTokenConfigured`` are
    always dropped. ``state`` is modified in place and returned.
    """
    _strip_task_secrets(state)
    env = state.get("env") if isinstance(state, dict) else None
    if not isinstance(env, dict):
        return state

    previous_env = {}
    if isinstance(on_disk, dict) and isinstance(on_disk.get("env"), dict):
        previous_env = on_disk.get("env")

    new_token = env.get("hfToken")
    if new_token:
        _validate_token(new_token)
        env["hfToken"] = _encrypt_secret(new_token)
    else:
        kept = previous_env.get("hfToken")
        if kept:
            env["hfToken"] = kept
        else:
            env.pop("hfToken", None)

    for client_only_key in ("hfTokenMasked", "hfTokenConfigured"):
        env.pop(client_only_key, None)
    return state
|
|
|
|
def _load_stored_hf_token() -> str:
    """Return the decrypted HF token saved in the cookbook state file.

    Best-effort: returns ``""`` when the state file is missing, unreadable,
    not valid JSON, has no ``env.hfToken`` entry, or cannot be decrypted.
    Every failure except a missing file is logged, so a corrupt state file
    can be told apart from "no token configured". The token value is never
    logged.
    """
    try:
        raw = _cookbook_state_path.read_text()
    except FileNotFoundError:
        # No state saved yet: the normal "no token" case.
        return ""
    except OSError as exc:
        logger.warning("Could not read cookbook state file %s: %s", _cookbook_state_path, exc)
        return ""

    try:
        state = json.loads(raw)
        env = state.get("env") if isinstance(state, dict) else None
        return _decrypt_secret(env.get("hfToken") if isinstance(env, dict) else "")
    except Exception as exc:
        # Log only the exception type, to avoid echoing secret material that
        # an error message might contain.
        logger.warning("Stored HF token could not be loaded (%s)", type(exc).__name__)
        return ""
|
|
|
|
def _cookbook_ssh_dir() -> Path:
    """Return the directory holding the Cookbook SSH key pair.

    ``/app/.ssh`` is used whenever ``/app`` exists; otherwise the current
    user's ``~/.ssh``.
    """
    app_root = Path("/app")
    return app_root / ".ssh" if app_root.exists() else Path.home() / ".ssh"
|
|
|
|
def _cookbook_ssh_key_path() -> Path:
    """Return the path of the ``id_ed25519`` private key in the Cookbook SSH directory."""
    return _cookbook_ssh_dir().joinpath("id_ed25519")
|
|
|
|
def _read_cookbook_public_key() -> str:
    """Return the Cookbook public key text, stripped, or ``""`` if the ``.pub`` file is absent."""
    pub_path = _cookbook_ssh_key_path().with_suffix(".pub")
    if pub_path.exists():
        contents = pub_path.read_text(encoding="utf-8", errors="replace")
        return contents.strip()
    return ""
|
|
|
|
@router.get("/api/cookbook/ssh-key")
async def get_cookbook_ssh_key(request: Request):
    """Admin-only: report whether a Cookbook SSH public key exists and return its text."""
    require_admin(request)
    pub = _read_cookbook_public_key()
    return dict(configured=bool(pub), public_key=pub)
|
|
|
|
@router.post("/api/cookbook/ssh-key")
async def generate_cookbook_ssh_key(request: Request):
    """Admin-only: make sure a Cookbook ed25519 key pair exists, then return the public key.

    An existing private key is left untouched. Otherwise ``ssh-keygen`` is
    run with an empty passphrase and the comment ``odysseus-cookbook``.
    Permissions are tightened on a best-effort basis (directory 0700,
    private key 0600, public key 0644); failures there are logged and do not
    fail the request.

    Returns ``{"ok": True, "public_key": ...}`` on success, or
    ``{"ok": False, "error": ...}`` when ``ssh-keygen`` cannot be started or
    exits non-zero.
    """
    require_admin(request)
    ssh_dir = _cookbook_ssh_dir()
    key_path = _cookbook_ssh_key_path()
    ssh_dir.mkdir(parents=True, exist_ok=True)
    try:
        os.chmod(ssh_dir, 0o700)
    except OSError as exc:
        logger.warning("Could not restrict permissions on %s: %s", ssh_dir, exc)

    if not key_path.exists():
        try:
            proc = await asyncio.create_subprocess_exec(
                "ssh-keygen", "-t", "ed25519", "-N", "", "-C", "odysseus-cookbook", "-f", str(key_path),
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            # ssh-keygen is missing or not executable: report it in the same
            # shape as a failed run instead of raising out of the handler.
            logger.error("Could not start ssh-keygen: %s", exc)
            return {"ok": False, "error": f"Could not run ssh-keygen: {exc}"}
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            detail = (stderr or stdout).decode("utf-8", errors="replace").strip()[-500:]
            return {"ok": False, "error": detail or "Failed to generate SSH key"}

    try:
        os.chmod(key_path, 0o600)
        os.chmod(key_path.with_suffix(".pub"), 0o644)
    except OSError as exc:
        logger.warning("Could not set permissions on Cookbook SSH key files: %s", exc)
    return {"ok": True, "public_key": _read_cookbook_public_key()}
|
|
|
|
def _user_shell_path_bootstrap() -> list[str]:
    """Return bash lines that prepend the user's interactive-shell PATH to PATH.

    The generated snippet takes ``$SHELL``, and if it is an executable, runs
    it with ``-ic`` to print its ``PATH`` behind a marker prefix. The last
    marked line is extracted with ``sed``/``tail`` and, when non-empty, put
    in front of the current ``PATH``.
    """
    probe_user_path = (
        ' ODYSSEUS_USER_PATH="$("$ODYSSEUS_USER_SHELL" -ic '
        '\'printf "__ODYSSEUS_PATH__%s\\n" "$PATH"\' 2>/dev/null '
        '| sed -n \'s/^__ODYSSEUS_PATH__//p\' | tail -n 1 || true)"'
    )
    snippet = ['ODYSSEUS_USER_SHELL="${SHELL:-}"']
    snippet.append('if [ -n "$ODYSSEUS_USER_SHELL" ] && [ -x "$ODYSSEUS_USER_SHELL" ]; then')
    snippet.append(probe_user_path)
    snippet.append(' if [ -n "$ODYSSEUS_USER_PATH" ]; then export PATH="$ODYSSEUS_USER_PATH:$PATH"; fi')
    snippet.append('fi')
    return snippet
|
|
|
|
def _needs_binary(cmd: str, binary: str) -> bool:
    """Return True when ``binary`` appears in ``cmd`` as a standalone shell word.

    A match must be bounded on each side by the start/end of the string,
    whitespace, or one of ``; & | ( )``. A falsy ``cmd`` never matches.
    """
    boundary_chars = r"[\s;&|()]"
    word_pattern = "(^|" + boundary_chars + ")" + re.escape(binary) + "($|" + boundary_chars + ")"
    return re.search(word_pattern, cmd or "") is not None
|
|
|
|
def _missing_binary_message(binary: str, target: str) -> str:
    """Build the user-facing error for a required binary missing on ``target``.

    ``tmux`` and ``docker`` get dedicated messages with an install hint; any
    other binary gets a generic "not found" sentence.
    """
    if binary == "tmux":
        sentences = [
            f"tmux is required for Cookbook background downloads/serves on {target}.",
            "Install it with your OS package manager, or run Cookbook server setup for that server.",
        ]
    elif binary == "docker":
        sentences = [
            f"Docker is required by this Cookbook launch command on {target}, but the docker CLI was not found.",
            "Install Docker and make sure this user can run `docker`, then retry.",
        ]
    else:
        sentences = [f"{binary} is required on {target}, but it was not found."]
    return " ".join(sentences)
|
|
|
|
async def _remote_binary_available(remote: str, ssh_port: str | None, binary: str, *, windows: bool = False) -> bool:
    """Check over ssh whether ``binary`` can be found on ``remote``.

    Runs ``command -v`` on POSIX hosts, or ``Get-Command`` through PowerShell
    when ``windows`` is true. Returns True only if the ssh command exits with
    status 0. Any failure returns False: ssh missing, connection error,
    non-zero exit, or no answer within 10 seconds.

    ssh runs with ``StrictHostKeyChecking=no`` and a 6 second connect
    timeout. ``-p`` is passed only for a non-empty port other than "22".

    NOTE: ``binary`` is shell-quoted for the POSIX check but interpolated
    unquoted into the PowerShell check, so callers must pass a trusted
    literal name.
    """
    port = ssh_port or ""
    port_args = ["-p", port] if port and port != "22" else []
    if windows:
        check = f"powershell -NoProfile -Command \"if (Get-Command {binary} -ErrorAction SilentlyContinue) {{ exit 0 }} else {{ exit 127 }}\""
    else:
        check = f"command -v {shlex.quote(binary)} >/dev/null 2>&1"

    try:
        proc = await asyncio.create_subprocess_exec(
            "ssh", "-o", "ConnectTimeout=6", "-o", "StrictHostKeyChecking=no",
            *port_args, remote, check,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except Exception:
        return False

    try:
        await asyncio.wait_for(proc.communicate(), timeout=10)
    except asyncio.TimeoutError:
        # wait_for only cancels the read; the ssh child would keep running.
        # Kill and reap it so timed-out probes do not pile up.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited between the timeout and the kill
        try:
            await proc.wait()
        except Exception:
            pass
        return False
    except Exception:
        return False
    return proc.returncode == 0
|
|
|
|
async def _binary_available(binary: str, remote: str | None, ssh_port: str | None, *, windows: bool = False) -> bool:
|
|
if remote:
|
|
return await _remote_binary_available(remote, ssh_port, binary, windows=windows)
|
|
return shutil.which(binary) is not None
|
|
|
|
@router.post("/api/model/download")
async def model_download(request: Request, req: ModelDownloadRequest):
    """Start a HuggingFace model download in a detached background session.

    A runner script is generated around ``hf download``. Remote runners fall
    back to ``huggingface_hub.snapshot_download`` when the CLI is missing.
    The script is launched in one of three ways:

    * local: a tmux session on this machine;
    * remote Linux/Termux: copied with scp, then started in a tmux session
      over ssh;
    * remote Windows: a ``.ps1`` runner copied with scp and started with
      ``Start-Process``, writing log/pid files under
      ``%TEMP%/odysseus-sessions``.

    The runner prints ``DOWNLOAD_OK`` or ``DOWNLOAD_FAILED (...)`` when it
    finishes. When ``req.disable_hf_transfer`` is set, hf_transfer is turned
    off on every target and the worker count is lowered.

    Runner scripts may embed the HF token, so they are written owner-only and
    the local copy of a remote runner is deleted once it has been shipped.

    Returns ``{"ok": True, "session_id": ..., "remote": ...}`` on a
    successful launch, or ``{"ok": False, "error": ..., "session_id": ...}``
    when tmux is missing or the launch command fails.
    """
    require_admin(request)
    # Defence-in-depth: even though this endpoint is admin-gated, refuse
    # values that would land in shell contexts with metacharacters.
    _validate_repo_id(req.repo_id)
    _validate_include(req.include)
    _validate_remote_host(req.remote_host)
    req.ssh_port = _validate_ssh_port(req.ssh_port)
    req.local_dir = _validate_local_dir(req.local_dir)
    req.hf_token = req.hf_token or _load_stored_hf_token()
    _validate_token(req.hf_token)
    TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
    session_id = f"cookbook-{uuid.uuid4().hex[:8]}"
    wrapper_script = TMUX_LOG_DIR / f"{session_id}.sh"

    def _write_private(path: Path, text: str, mode: int) -> None:
        # Create the file with restrictive permissions before any content
        # (possibly the HF token) is written to it.
        path.touch(mode=mode, exist_ok=True)
        path.write_text(text)
        path.chmod(mode)

    def _discard(path: Path) -> None:
        # Best-effort removal of a local script that may contain the token.
        try:
            path.unlink(missing_ok=True)
        except OSError as exc:
            logger.warning("Could not remove runner script %s: %s", path, exc)

    # When a download directory is set, target a per-model subfolder under it
    # (<dir>/<name>) so the flat-directory cache scan lists it as its own
    # model. Without it, hf/snapshot_download falls back to the HF cache.
    _dl_short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
    _dl_base = (req.local_dir.rstrip("/") + "/" + _dl_short) if req.local_dir else None
    _dl_shell = _shell_path(_dl_base) if _dl_base else None  # for hf CLI / bash
    _dl_pyarg = (", local_dir=os.path.expanduser(" + repr(_dl_base) + ")") if _dl_base else ""

    # Build the hf download command. Redirection to suppress the interactive
    # "update available? [Y/n]" prompt is added per-platform further down
    # (< /dev/null on bash, $null | on PowerShell).
    hf_cmd = f"hf download {req.repo_id}"
    if req.include:
        hf_cmd += f" --include '{req.include}'"
    if _dl_shell:
        hf_cmd += f" --local-dir {_dl_shell}"

    # hf_transfer (Rust parallel downloader) is fast but flaky on large files:
    # it tends to crash near the end at high throughput. Retries set
    # disable_hf_transfer to fall back to the plain, slower-but-reliable
    # downloader (resumes cleanly from the .incomplete files).
    use_hf_transfer = not req.disable_hf_transfer
    max_workers = 8 if use_hf_transfer else 4
    # Best-effort install of the hf CLI. --break-system-packages keeps pip
    # from bailing on PEP-668 systems (Arch, newer Debian).
    install_hf_cli = "command -v hf >/dev/null 2>&1 || pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || pip install -q -U huggingface_hub 2>/dev/null"
    if use_hf_transfer:
        transfer_setup = [
            "python3 -c 'import hf_transfer' 2>/dev/null || pip install --user --break-system-packages -q hf_transfer 2>/dev/null || pip install -q hf_transfer 2>/dev/null",
            "python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1",
            "export HF_HUB_DOWNLOAD_MAX_WORKERS=8",
        ]
    else:
        transfer_setup = [
            "export HF_HUB_ENABLE_HF_TRANSFER=0",
            "export HF_HUB_DOWNLOAD_MAX_WORKERS=4",
        ]
    # Python fallback used when the hf CLI is not available on a remote host.
    snapshot_py = (
        "import os; from huggingface_hub import snapshot_download; "
        f"snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers={max_workers})"
    )
    # Capture $? first: the `[` test and `echo` overwrite it, so reading it
    # later in the else branch would not report the download's exit code.
    report_status = 'rc=$?; if [ $rc -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $rc)"; fi'

    remote = req.remote_host  # None for local
    is_windows = req.platform == "windows"
    logger.info(
        "Download request: repo=%s, remote=%s, ssh_port=%s, platform=%s",
        req.repo_id, remote, req.ssh_port, req.platform,
    )

    if not is_windows and not await _binary_available("tmux", remote, req.ssh_port):
        return {
            "ok": False,
            "error": _missing_binary_message("tmux", remote or "local server"),
            "session_id": session_id,
        }

    # Local copy of a runner that is shipped to a remote host; removed after launch.
    local_runner_copy = None

    if remote and is_windows:
        # ── Windows remote: generate .ps1 runner, use Start-Process for background ──
        remote_runner = f".{session_id}_run.ps1"
        ps_lines = [
            '$sessionDir = "$env:TEMP\\odysseus-sessions"',
            'New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null',
        ]
        if req.hf_token:
            ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
        if req.env_prefix:
            ps_lines.append(_safe_env_prefix(req.env_prefix))
        if not use_hf_transfer:
            ps_lines.append('$env:HF_HUB_ENABLE_HF_TRANSFER = "0"')
        # Try hf CLI, fall back to Python huggingface_hub, then auto-install.
        # These are plain strings, so braces are written once: a doubled
        # brace would reach PowerShell as a nested script block that is never
        # invoked.
        ps_lines.append('try {')
        ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
        ps_lines.append(' if ($hfPath) {')
        # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
        ps_lines.append(f' $null | {hf_cmd}')
        ps_lines.append(' } else {')
        ps_lines.append(' python -c "import huggingface_hub" 2>$null')
        ps_lines.append(' if ($LASTEXITCODE -eq 0) {')
        ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."')
        if use_hf_transfer:
            ps_lines.append(' python -m pip install -q hf_transfer 2>$null')
            ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
        ps_lines.append(f" python -c \"{snapshot_py}\"")
        ps_lines.append(' } else {')
        ps_lines.append(' Write-Host "Installing huggingface-hub..."')
        if use_hf_transfer:
            ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer')
            ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
        else:
            ps_lines.append(' python -m pip install -q huggingface-hub')
        ps_lines.append(f" python -c \"{snapshot_py}\"")
        ps_lines.append(' }')
        ps_lines.append(' }')
        ps_lines.append(' if ($LASTEXITCODE -eq 0) { Write-Host ""; Write-Host "DOWNLOAD_OK" }')
        ps_lines.append(' else { Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }')
        ps_lines.append('} catch {')
        ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
        ps_lines.append('}')
        ps_lines.append(f'Remove-Item -Force "$HOME\\{remote_runner}" -ErrorAction SilentlyContinue')
        runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1"
        _write_private(runner_path, "\r\n".join(ps_lines) + "\r\n", 0o600)
        local_runner_copy = runner_path

        # scp the .ps1 script, then launch it as a detached process with log + pid files
        _port = req.ssh_port
        _Pf = f"-P {_port} " if _port and _port != "22" else ""
        _pf = f"-p {_port} " if _port and _port != "22" else ""
        # Start-Process creates a fully detached process that survives SSH disconnect
        launch_ps = (
            "$sd = \\\"$env:TEMP\\odysseus-sessions\\\"; "
            f"Start-Process powershell -ArgumentList '-ExecutionPolicy','Bypass','-File','$HOME\\{remote_runner}' "
            f"-RedirectStandardOutput \\\"$sd\\{session_id}.log\\\" "
            f"-RedirectStandardError \\\"$sd\\{session_id}.err.log\\\" "
            f"-NoNewWindow -PassThru | ForEach-Object {{ $_.Id | Out-File \\\"$sd\\{session_id}.pid\\\" }}"
        )
        # NOTE(review): this command runs through the local shell inside
        # double quotes, so `$`-prefixed PowerShell variables may be expanded
        # locally before ssh sees them. Left unchanged; confirm the quoting
        # against a real Windows target.
        setup_cmd = (
            f"scp -O {_Pf}-q '{runner_path}' {remote}:{remote_runner} && "
            f'ssh {_pf}{remote} "powershell -Command \\"{launch_ps}\\""'
        )

    elif remote:
        # ── Linux/Termux remote: create tmux session ON the remote host ──
        remote_runner = f".{session_id}_run.sh"
        runner_lines = ["#!/bin/bash"]
        runner_lines.extend(_user_shell_path_bootstrap())
        runner_lines.append("# Auto-detect environment")
        runner_lines.append("deactivate 2>/dev/null; hash -r")
        if req.hf_token:
            runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
        if req.env_prefix:
            runner_lines.append(_safe_env_prefix(req.env_prefix))
        else:
            # Fallback: find a venv with hf CLI, or install huggingface-hub
            runner_lines.append(
                'for p in ~/vllm-env ~/venv ~/.venv; do '
                'if [ -f "$p/bin/activate" ]; then source "$p/bin/activate"; break; fi; '
                'done'
            )
        # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
        runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
        # Install hf CLI (+ hf_transfer unless disabled) best-effort so future
        # runs get the fast path.
        runner_lines.append(install_hf_cli)
        runner_lines.extend(transfer_setup)
        # Surface whether the HF token actually reached THIS server, so a gated
        # download's "not authorized" failure can be told apart from a missing
        # token (the token is masked: we only print applied / not-set).
        runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
        # Try hf CLI first, fall back to Python huggingface_hub, then auto-install
        runner_lines.append('if command -v hf &>/dev/null; then')
        # < /dev/null suppresses interactive "update available? [Y/n]" prompt
        runner_lines.append(f' {hf_cmd} < /dev/null')
        runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
        runner_lines.append(' echo "hf CLI not found, using Python huggingface_hub..."')
        runner_lines.append(f' python3 -c "{snapshot_py}"')
        runner_lines.append('else')
        runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
        runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
        if use_hf_transfer:
            runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
            runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
        else:
            runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
        runner_lines.append(f' python3 -c "{snapshot_py}"')
        runner_lines.append('fi')
        runner_lines.append(report_status)
        runner_lines.append(f"rm -f {remote_runner}")
        runner_lines.append('exec "${SHELL:-/bin/bash}"')
        runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
        _write_private(runner_path, "\n".join(runner_lines) + "\n", 0o700)
        local_runner_copy = runner_path

        # scp the runner script, then create tmux session on the remote
        _port = req.ssh_port
        _pf = f"-P {_port} " if _port and _port != "22" else ""
        _spf = f"-p {_port} " if _port and _port != "22" else ""
        setup_cmd = (
            f"scp -O {_pf}-q '{runner_path}' {remote}:{remote_runner} && "
            f"ssh {_spf}{remote} 'chmod 700 {remote_runner} && tmux new-session -d -s {session_id} \"./{remote_runner}\"'"
        )
    else:
        # Local: run hf download in a local tmux session (which is a TTY).
        # No script/tee needed; output is read with tmux capture-pane.
        lines = ["#!/bin/bash"]
        lines.extend(_user_shell_path_bootstrap())
        if req.hf_token:
            lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
        # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
        lines.append('export PATH="$HOME/.local/bin:$PATH"')
        lines.append(install_hf_cli)
        lines.extend(transfer_setup)
        if req.env_prefix:
            lines.append(_safe_env_prefix(req.env_prefix))
        else:
            lines.append("deactivate 2>/dev/null; hash -r")
        # Show whether the HF token reached this run (masked): tells a gated
        # "not authorized" failure apart from a missing token.
        lines.append(_HF_TOKEN_STATUS_SNIPPET)
        # < /dev/null suppresses interactive "update available? [Y/n]" prompt
        lines.append(f"{hf_cmd} < /dev/null")
        lines.append(report_status)
        lines.append(f"rm -f '{wrapper_script}'")
        lines.append('exec "${SHELL:-/bin/bash}"')
        _write_private(wrapper_script, "\n".join(lines) + "\n", 0o700)
        setup_cmd = f"tmux new-session -d -s {session_id} {shlex.quote(str(wrapper_script))}"

    logger.info(
        "Model download: %s (include=%s, session=%s, remote=%s)",
        req.repo_id, req.include, session_id, remote,
    )
    logger.info("Download setup_cmd: %s", setup_cmd)

    proc = await asyncio.create_subprocess_shell(
        setup_cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    # communicate() drains both pipes; a bare wait() can block forever once a
    # pipe buffer fills up.
    _stdout_bytes, stderr_bytes = await proc.communicate()

    if local_runner_copy is not None:
        # The remote host has its own copy by now (or the copy failed).
        # Either way, do not leave a token-bearing script in the log directory.
        _discard(local_runner_copy)

    if proc.returncode != 0:
        if local_runner_copy is None:
            # The local wrapper deletes itself only when it runs; it never ran.
            _discard(wrapper_script)
        stderr = stderr_bytes.decode(errors="replace")
        logger.error("Download failed (rc=%s): %s", proc.returncode, stderr)
        return {"ok": False, "error": stderr, "session_id": session_id}

    # Log to assistant (best-effort; must never fail the request)
    try:
        from src.assistant_log import log_to_assistant
        from src.auth_helpers import get_current_user
        owner = get_current_user(request)
        log_to_assistant(
            owner,
            f"Started downloading {req.repo_id} to {remote or 'local'}",
            category="Download",
        )
    except Exception:
        logger.debug("Could not log download start to assistant", exc_info=True)

    return {"ok": True, "session_id": session_id, "remote": remote or "local"}
|
|
|
|
@router.get("/api/model/cached")
async def model_cached(request: Request, host: str | None = None, model_dir: str | None = None, ssh_port: str | None = None, platform: str | None = None):
    """List cached models. Scans HF cache + optional model directory.

    A small Python scanner script is generated and run locally with
    ``python3``, or on ``host`` over ssh (``python`` when ``platform`` is
    "windows"). The script is fed over stdin, so concurrent requests do not
    share a file on disk. It always scans ``~/.cache/huggingface/hub``.
    ``model_dir`` may add further comma-separated directories, where each
    subdirectory containing model files counts as one model.

    Returns ``{"models": [...], "host": ...}``. When the scanner cannot be
    started or does not finish within 60 seconds, ``models`` is empty and an
    ``"error"`` key describes the problem. Unparseable scanner output is
    logged and also yields an empty or partial list.

    Raises:
        HTTPException: 400 when ``ssh_port`` is malformed (and whatever
            ``_validate_remote_host`` raises for a bad ``host``).
    """
    require_admin(request)
    # Validate shell-bound inputs, matching the sibling list_gpus endpoint.
    # `host`/`ssh_port` become ssh arguments below.
    host = _validate_remote_host(host)
    if ssh_port is not None and ssh_port != "" and not _SSH_PORT_RE.fullmatch(ssh_port):
        raise HTTPException(400, "Invalid ssh_port")

    # Scanner source, one line per entry. The indentation inside these
    # strings is significant: this is the Python program run on the target.
    scanner_lines = [
        "import json, os",
        "models = []",
        "seen = set()",
        "BLOCKED_ROOTS = ('/sys', '/proc', '/dev', '/run', '/var/run')",
        "def safe_path(p):",
        "    try:",
        "        rp = os.path.realpath(os.path.expanduser(p))",
        "        return not any(rp == b or rp.startswith(b + os.sep) for b in BLOCKED_ROOTS)",
        "    except Exception:",
        "        return False",
        "def safe_walk(top):",
        "    if not safe_path(top): return",
        "    for root, dirs, fns in os.walk(top, followlinks=False):",
        "        dirs[:] = [d for d in dirs if not os.path.islink(os.path.join(root, d)) and safe_path(os.path.join(root, d))]",
        "        yield root, dirs, fns",
        # Scan HF cache format (models-- directories with blobs/)
        "def scan_hf(cache):",
        "    if not os.path.isdir(cache): return",
        "    for d in sorted(os.listdir(cache)):",
        "        if not d.startswith('models--'): continue",
        "        rid = d.replace('models--','').replace('--','/')",
        "        if rid in seen: continue",
        "        seen.add(rid)",
        "        blobs = os.path.join(cache, d, 'blobs')",
        "        sz, nf, ic = 0, 0, False",
        "        if os.path.isdir(blobs):",
        "            for f in os.scandir(blobs):",
        "                if f.is_file(): nf += 1; sz += f.stat().st_size",
        "                if f.name.endswith('.incomplete'): ic = True",
        "        # Check if it's an LLM (has config.json with model_type) vs diffusion (has model_index.json)",
        "        snap = os.path.join(cache, d, 'snapshots')",
        "        is_diffusion = False; is_gguf = False",
        "        if os.path.isdir(snap):",
        "            for sd in os.listdir(snap):",
        "                sf = os.path.join(snap, sd)",
        "                if not os.path.isdir(sf): continue",
        "                if os.path.exists(os.path.join(sf, 'model_index.json')): is_diffusion = True",
        "                try:",
        "                    if any(x.endswith('.gguf') for x in os.listdir(sf)): is_gguf = True",
        "                except Exception: pass",
        "        models.append({'repo_id':rid,'size_bytes':sz,'nb_files':nf,'has_incomplete':ic,'path':cache,'is_diffusion':is_diffusion,'is_gguf':is_gguf})",
        # Scan plain directory (each subdirectory = a model if it has model files)
        "def scan_dir(p):",
        "    if not os.path.isdir(p) or not safe_path(p): return",
        "    for d in sorted(os.listdir(p)):",
        "        if d.startswith('.'): continue",
        "        fp = os.path.join(p, d)",
        "        if not os.path.isdir(fp) or os.path.islink(fp) or not safe_path(fp): continue",
        "        if d in seen: continue",
        "        # Check if it looks like a model (has config.json, safetensors, bin, or gguf)",
        "        is_model = False; is_gguf = False",
        "        for root, dirs, fns in safe_walk(fp):",
        "            for fn in fns:",
        "                if fn.endswith('.gguf'): is_gguf = True; is_model = True",
        "                elif fn == 'config.json' or fn.endswith('.safetensors') or fn.endswith('.bin'): is_model = True",
        "            if is_model: break",
        "        if not is_model: continue",
        "        seen.add(d)",
        "        sz, nf = 0, 0",
        "        for dp, _, fns in safe_walk(fp):",
        "            for fn in fns:",
        "                try: nf += 1; sz += os.path.getsize(os.path.join(dp, fn))",
        "                except Exception: pass",
        "        is_diff = os.path.exists(os.path.join(fp, 'model_index.json'))",
        "        models.append({'repo_id':d,'size_bytes':sz,'nb_files':nf,'has_incomplete':False,'path':p,'is_local_dir':True,'is_diffusion':is_diff,'is_gguf':is_gguf})",
        # Always scan HF cache
        "scan_hf(os.path.expanduser('~/.cache/huggingface/hub'))",
    ]
    # Also scan custom model dirs (comma-separated) if specified
    if model_dir:
        for d in model_dir.split(','):
            d = d.strip()
            if d and d != '~/.cache/huggingface/hub':
                # repr() encodes the dir as a properly-escaped Python string
                # literal, so a quote in the value cannot break out of the
                # literal and inject code into the scanner.
                scanner_lines.append(f"scan_dir(os.path.expanduser({d!r}))")
    scanner_lines.append("print(json.dumps(models))")
    scanner_src = "\n".join(scanner_lines) + "\n"

    # argv-based exec: no local shell parses host/port, and the scanner is
    # piped over stdin ("python -" reads the program from stdin).
    if host:
        port_args = ["-p", ssh_port] if ssh_port and ssh_port != "22" else []
        remote_python = "python -" if platform == "windows" else "python3 -"
        argv = ["ssh", *port_args, host, remote_python]
    else:
        argv = ["python3", "-"]

    target = host or "local"
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=str(Path.home()),
        )
    except OSError as exc:
        logger.warning("Failed to start cached-model scan (%s): %s", argv[0], exc)
        return {"models": [], "host": target, "error": f"Could not start {argv[0]}: {exc}"}

    try:
        stdout_b, stderr_b = await asyncio.wait_for(
            proc.communicate(input=scanner_src.encode("utf-8")), timeout=60
        )
    except asyncio.TimeoutError:
        # Do not leave the scanner (or the ssh session) running after giving up.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited
        await proc.wait()
        logger.warning("Cached-model scan timed out on %s", target)
        return {"models": [], "host": target, "error": "Model scan timed out"}

    models = []
    try:
        raw = json.loads(stdout_b.decode(errors="replace").strip())
        for m in raw:
            size_gb = m["size_bytes"] / (1024 ** 3)
            if size_gb >= 1:
                size_str = f"{size_gb:.1f} GB"
            else:
                size_str = f"{m['size_bytes'] / (1024**2):.0f} MB"
            entry = {
                "repo_id": m["repo_id"],
                "size": size_str,
                "nb_files": m["nb_files"],
                "has_incomplete": m["has_incomplete"],
                "status": "downloading" if m["has_incomplete"] else "ready",
                "path": m.get("path", ""),
                "is_diffusion": m.get("is_diffusion", False),
                # The scanner already detects GGUF files; pass the flag on
                # instead of dropping it.
                "is_gguf": m.get("is_gguf", False),
            }
            if m.get("is_local_dir"):
                entry["is_local_dir"] = True
            models.append(entry)
    except Exception as e:
        logger.warning(f"Failed to parse cached models: {e}")
        logger.warning(f"stderr: {stderr_b.decode(errors='replace')[:500]}")

    return {"models": models, "host": target}
|
|
|
|
def _auto_register_image_endpoint(req: ServeRequest, remote: str | None) -> str | None:
    """Register a diffusion model as an image endpoint so it appears in the model selector.

    The endpoint base URL is ``http://<host>:<port>/v1`` where:

    * ``port`` comes from the serve command's ``--port`` option, written
      either as ``--port N`` or ``--port=N`` (8100 when the option is absent);
    * ``host`` is ``remote`` with any ``user@`` prefix removed, or
      ``localhost`` when ``remote`` is empty.

    If an endpoint with the same base URL already exists it is re-enabled,
    retyped as ``"image"`` and renamed instead of being duplicated.

    Returns:
        The id of the updated or newly created endpoint, or ``None`` when
        the database operation failed (the failure is logged and rolled back).
    """
    from core.database import SessionLocal, ModelEndpoint

    # Accept both `--port 8100` and `--port=8100`. Matching only the
    # whitespace form would register the default port for `--port=N`,
    # leaving the selector pointing at the wrong URL.
    port_match = re.search(r'--port(?:\s+|=)(\d+)', req.cmd)
    port = int(port_match.group(1)) if port_match else 8100

    # Determine host
    if remote:
        # SSH alias — use as hostname (Tailscale resolves it later)
        host = remote.split("@")[-1]
    else:
        host = "localhost"

    base_url = f"http://{host}:{port}/v1"

    # Friendly display name from repo_id
    short_name = req.repo_id.split("/")[-1]
    display_name = f"{short_name} (image)"

    db = SessionLocal()
    try:
        # Check for existing endpoint with same base_url — update it
        existing = db.query(ModelEndpoint).filter(ModelEndpoint.base_url == base_url).first()
        if existing:
            existing.is_enabled = True
            existing.model_type = "image"
            existing.name = display_name
            db.commit()
            logger.info(f"Updated existing image endpoint: {base_url}")
            return existing.id

        ep_id = f"img-{uuid.uuid4().hex[:8]}"
        ep = ModelEndpoint(
            id=ep_id,
            name=display_name,
            base_url=base_url,
            api_key=None,
            is_enabled=True,
            model_type="image",
        )
        db.add(ep)
        db.commit()
        logger.info(f"Auto-registered image endpoint: {display_name} @ {base_url}")
        return ep_id
    except Exception as e:
        logger.error(f"Failed to auto-register image endpoint: {e}")
        db.rollback()
        return None
    finally:
        db.close()
|
|
|
|
@router.post("/api/model/serve")
async def model_serve(request: Request, req: ServeRequest):
    """Launch a model server in a tmux session (or PowerShell background process on Windows).

    `repo_id` is dual-purpose: a HuggingFace repo (`<org>/<name>`) for
    model-serve commands, OR a bare pip package name when the cmd is a
    `python -m pip install …`. We only enforce the strict HF format on
    the model paths.

    A runner script is written under ``TMUX_LOG_DIR`` and then started
    locally in tmux, or copied with ``scp`` and started over ``ssh``.
    Because that script can embed the HuggingFace token in plaintext, it is
    created and kept owner-only instead of world-readable.

    Returns:
        ``{"ok": True, "session_id", "remote", "endpoint_id"}`` on success;
        ``{"ok": False, "error", "session_id"}`` when a required binary is
        missing or the launch command exits non-zero.

    Raises:
        HTTPException: 400 for an invalid pip package name (the imported
            validators are expected to raise for their own inputs).
    """
    require_admin(request)
    # Defence-in-depth: reject values that could break out of shell contexts.
    _validate_remote_host(req.remote_host)
    req.ssh_port = _validate_ssh_port(req.ssh_port)
    req.gpus = _validate_gpus(req.gpus)
    req.hf_token = req.hf_token or _load_stored_hf_token()
    _validate_token(req.hf_token)
    # Normalize away backslash-newline continuations (multi-line pasted
    # serve commands) so the cleaned single-line command is what gets
    # written into the runner script and used for engine auto-detection.
    # `_validate_serve_cmd` returns None for empty input; coerce to "" so the
    # many downstream `"engine" in req.cmd` membership checks can't hit
    # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
    req.cmd = _validate_serve_cmd(req.cmd) or ""
    is_pip_install = bool(req.cmd and "pip install" in req.cmd)
    if is_pip_install:
        # PEP-508-style package spec — letters, digits, `.-_` for the
        # name; `[` `]` for extras; `<>=!~,` for version specifiers.
        # v2 review HIGH-14: tightened from the previous regex which
        # also allowed spaces and `+`, both of which can be abused to
        # introduce extra shell tokens once interpolated into the
        # serve command. We now use `re.fullmatch` and drop space/`+`.
        if not req.repo_id or not re.fullmatch(
            r"[A-Za-z0-9][A-Za-z0-9._\-\[\]<>=!,~]{0,200}", req.repo_id
        ):
            raise HTTPException(400, "Invalid pip package name")
    else:
        _validate_repo_id(req.repo_id)
    TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
    session_id = f"serve-{uuid.uuid4().hex[:8]}"
    remote = req.remote_host
    is_windows = req.platform == "windows"

    if not is_windows and not await _binary_available("tmux", remote, req.ssh_port):
        return {
            "ok": False,
            "error": _missing_binary_message("tmux", remote or "local server"),
            "session_id": session_id,
        }
    if _needs_binary(req.cmd, "docker") and not await _binary_available("docker", remote, req.ssh_port, windows=is_windows):
        return {
            "ok": False,
            "error": _missing_binary_message("docker", remote or "local server"),
            "session_id": session_id,
        }

    # scp takes the port as -P, ssh as -p; both are omitted for the default port.
    _port = req.ssh_port
    _Pf = f"-P {_port} " if _port and _port != "22" else ""
    _pf = f"-p {_port} " if _port and _port != "22" else ""

    if is_windows and remote:
        # ── Windows remote: generate .ps1 serve runner ──
        remote_runner = f".{session_id}_run.ps1"
        ps_lines = []
        ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
        ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
        if req.hf_token:
            ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
        if req.gpus:
            ps_lines.append(f"$env:CUDA_VISIBLE_DEVICES = '{req.gpus}'")
        if req.env_prefix:
            ps_lines.append(_safe_env_prefix(req.env_prefix))
        # Auto-install ollama if the command uses it
        if "ollama" in req.cmd:
            ps_lines.append('# Check if ollama is available')
            ps_lines.append('if (-not (Get-Command ollama -ErrorAction SilentlyContinue)) {')
            ps_lines.append(' Write-Host "Ollama not found. Please install from https://ollama.com/download/windows"')
            ps_lines.append(' exit 1')
            ps_lines.append('}')
        elif "llama_cpp" in req.cmd or "llama-server" in req.cmd:
            ps_lines.append('# Auto-install llama-cpp-python if missing')
            ps_lines.append('try { python -c "import llama_cpp" 2>$null } catch {}')
            ps_lines.append('if ($LASTEXITCODE -ne 0) {')
            ps_lines.append(' Write-Host "Installing llama-cpp-python..."')
            ps_lines.append(' python -m pip install llama-cpp-python[server]')
            ps_lines.append('}')
        elif "vllm" in req.cmd:
            ps_lines.append('Write-Host "ERROR: vLLM is not supported on Windows. Use Ollama or llama.cpp instead."')
            ps_lines.append('exit 1')
        ps_lines.append(req.cmd)
        ps_lines.append('Write-Host ""')
        ps_lines.append('Write-Host "=== Process exited with code $LASTEXITCODE ==="')
        runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1"
        # The script may contain HF_TOKEN: create it owner-only before any
        # content is written, and re-assert the mode afterwards.
        runner_path.touch(mode=0o600)
        runner_path.write_text("\r\n".join(ps_lines) + "\r\n")
        runner_path.chmod(0o600)

        launch_ps = (
            "$sd = \\\"$env:TEMP\\odysseus-sessions\\\"; "
            f"Start-Process powershell -ArgumentList '-ExecutionPolicy','Bypass','-File','$HOME\\{remote_runner}' "
            f"-RedirectStandardOutput \\\"$sd\\{session_id}.log\\\" "
            f"-RedirectStandardError \\\"$sd\\{session_id}.err.log\\\" "
            f"-NoNewWindow -PassThru | ForEach-Object {{ $_.Id | Out-File \\\"$sd\\{session_id}.pid\\\" }}"
        )
        setup_cmd = (
            f"scp -O {_Pf}-q '{runner_path}' {remote}:{remote_runner} && "
            f'ssh {_pf}{remote} "powershell -Command \\"{launch_ps}\\""'
        )
    else:
        # ── Linux/Termux: bash + tmux (existing flow) ──
        runner_lines = ["#!/bin/bash"]
        runner_lines.extend(_user_shell_path_bootstrap())
        runner_lines.append("export FLASHINFER_DISABLE_VERSION_CHECK=1")
        if req.hf_token:
            runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
        if req.gpus:
            runner_lines.append(f"export CUDA_VISIBLE_DEVICES='{req.gpus}'")
        if req.env_prefix:
            runner_lines.append(_safe_env_prefix(req.env_prefix))
        else:
            runner_lines.append("deactivate 2>/dev/null; hash -r")
        # Show whether the HF token reached this server (masked) — a gated
        # model vLLM has to download will be denied without it.
        runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
        # Auto-install inference engine if missing
        if "llama_cpp" in req.cmd or "llama-server" in req.cmd:
            # Prefer the NATIVE llama-server binary — its minja templating
            # renders modern GGUF chat templates that the Python bindings'
            # Jinja2 rejects (do_tojson ensure_ascii). Build it once from
            # source if missing; keep llama-cpp-python only as a fallback.
            runner_lines.append('# Ensure a llama.cpp server (prefer native llama-server)')
            runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:$HOME/llama.cpp/build/bin:$PATH"')
            runner_lines.append('if [ -d /data/data/com.termux ]; then')
            runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).')
            runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
            runner_lines.append(' pkg install -y cmake 2>/dev/null')
            runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
            runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install llama-cpp-python --no-build-isolation --no-cache-dir 2>&1 || true')
            runner_lines.append(' fi')
            runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
            runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
            runner_lines.append(' mkdir -p ~/bin')
            runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
            # GPU build if CUDA is present; fall back to a plain (CPU) build.
            runner_lines.append(' cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\')
            runner_lines.append(' && cmake --build build -j"$(nproc)" --target llama-server \\')
            runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
            runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
            runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
            runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
            runner_lines.append(' pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true')
            runner_lines.append(' fi')
            runner_lines.append('fi')
        elif "vllm serve" in req.cmd:
            # Put ~/.local/bin on PATH first — without a venv, vllm installs
            # there via --user and the non-login serve shell otherwise can't
            # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
            runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
            runner_lines.append('if ! command -v vllm &>/dev/null; then')
            runner_lines.append(' echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
            runner_lines.append(' exit 127')
            runner_lines.append('fi')
        elif "sglang.launch_server" in req.cmd:
            runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
            runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
            runner_lines.append(' echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
            runner_lines.append(' exit 127')
            runner_lines.append('fi')

        runner_lines.append(req.cmd)
        # Keep shell open after exit so user can see errors
        runner_lines.append('echo ""; echo "=== Process exited with code $? ==="; exec "${SHELL:-/bin/bash}"')

        runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
        # The script may contain HF_TOKEN: owner-only (rwx) rather than the
        # previous world-readable 0o755. tmux and scp run as the same user.
        runner_path.touch(mode=0o700)
        runner_path.write_text("\n".join(runner_lines) + "\n")
        runner_path.chmod(0o700)

        if remote:
            remote_runner = f".{session_id}_run.sh"
            # If command references scripts/, scp those too
            scp_extras = ""
            if "scripts/diffusion_server.py" in req.cmd:
                from core.constants import BASE_DIR
                diff_script = Path(BASE_DIR) / "scripts" / "diffusion_server.py"
                if diff_script.exists():
                    scp_extras = f"scp -O {_Pf}-q '{diff_script}' {remote}:.diffusion_server.py && "
                    # The script lands in the remote home directory under a
                    # dotted name, so point the runner at that copy.
                    runner_path.write_text(
                        runner_path.read_text().replace(
                            "scripts/diffusion_server.py", ".diffusion_server.py"
                        )
                    )
            setup_cmd = (
                f"{scp_extras}"
                f"scp -O {_Pf}-q '{runner_path}' {remote}:{remote_runner} && "
                # chmod 700 (not +x) keeps the token-bearing script private on the remote too.
                f"ssh {_pf}{remote} 'chmod 700 {remote_runner} && tmux new-session -d -s {session_id} \"./{remote_runner}\"'"
            )
        else:
            setup_cmd = f"tmux new-session -d -s {session_id} {shlex.quote(str(runner_path))}"

    proc = await asyncio.create_subprocess_shell(
        setup_cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    await proc.wait()

    if proc.returncode != 0:
        stderr = (await proc.stderr.read()).decode(errors="replace")
        return {"ok": False, "error": stderr, "session_id": session_id}

    # Auto-register as model endpoint if serving a diffusion model
    endpoint_id = None
    is_diffusion = "diffusion_server.py" in req.cmd
    if is_diffusion:
        endpoint_id = _auto_register_image_endpoint(req, remote)

    # Log to assistant
    try:
        from src.assistant_log import log_to_assistant
        from src.auth_helpers import get_current_user
        owner = get_current_user(request)
        short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
        log_to_assistant(
            owner,
            f"Started serving {short} on {remote or 'local'}",
            category="Serve",
        )
    except Exception:
        # Best-effort: the server is already running, so a logging failure
        # must not fail the request — but leave a trace for debugging.
        logger.debug("Could not log serve start to assistant", exc_info=True)

    return {"ok": True, "session_id": session_id, "remote": remote or "local",
            "endpoint_id": endpoint_id}
|
|
|
|
# ── Server setup (install deps on remote) ──
|
|
|
|
class SetupRequest(BaseModel):
    """Request body for the remote dependency-setup endpoint."""

    # SSH destination of the server to prepare (required).
    host: str
    # SSH port as a string; None or "" leaves the port flag off the ssh command.
    ssh_port: str | None = None
|
|
|
|
@router.post("/api/cookbook/setup")
async def server_setup(request: Request, req: SetupRequest):
    """Install required dependencies on a remote server via SSH.

    Steps:
      1. Validate ``host`` / ``ssh_port``.
      2. Detect the remote platform: ``echo %OS%`` identifies Windows,
         then a check for ``/data/data/com.termux`` identifies Termux;
         anything else (including a failed probe) is treated as Linux.
      3. Run the platform-specific setup script over ssh (120 s limit).

    Returns:
        ``{"ok": bool, "output": str, "platform": str}`` once the setup
        command finished (``ok`` is True when the output contains ``OK``),
        or ``{"ok": False, "error": str, "platform": str}`` on timeout or
        any other failure. ``platform`` is always one of ``"windows"``,
        ``"termux"`` or ``"linux"``.

    Raises:
        HTTPException: 400 when ``host`` is missing or ``ssh_port`` is malformed.
    """
    require_admin(request)
    host = _validate_remote_host(req.host)
    if not host:
        raise HTTPException(400, "host is required")
    port = req.ssh_port
    if port is not None and port != "" and not re.fullmatch(r"\d{1,5}", port):
        raise HTTPException(400, "Invalid ssh_port")
    pf = f"-p {port} " if port and port != "22" else ""

    def _terminate(child) -> None:
        # Stop a child that outlived its timeout so ssh processes do not
        # pile up; it may have exited in the meantime.
        if child.returncode is None:
            try:
                child.kill()
            except ProcessLookupError:
                pass

    async def _probe(command: str) -> str:
        # Run a short detection command and return its stripped stdout.
        child = await asyncio.create_subprocess_shell(
            command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            out_b, _ = await asyncio.wait_for(child.communicate(), timeout=10)
        except asyncio.TimeoutError:
            _terminate(child)
            raise
        return out_b.decode(errors="replace").strip()

    # Detect platform: Windows first (echo %OS% → Windows_NT), then Termux, then Linux
    platform = "linux"
    try:
        if "Windows_NT" in await _probe(f'ssh {pf}{host} "echo %OS%"'):
            platform = "windows"
        else:
            # Check for Termux. Only the final output line is trusted, and
            # anything other than "termux" (empty output from a failed ssh,
            # stray shell-startup text) is normalized to "linux" so the
            # returned platform is never an arbitrary string.
            probe_out = await _probe(
                f"ssh {pf}{host} 'test -d /data/data/com.termux && echo termux || echo linux'"
            )
            probe_lines = probe_out.splitlines()
            if probe_lines and probe_lines[-1].strip() == "termux":
                platform = "termux"
    except Exception:
        platform = "linux"

    if platform == "windows":
        # Windows setup: ensure Python + pip + huggingface-hub via PowerShell
        # Also create the session directory for background tasks
        setup_script = (
            'powershell -Command "'
            "New-Item -ItemType Directory -Force -Path $env:TEMP\\odysseus-sessions | Out-Null; "
            "try { python --version } catch { Write-Host 'ERROR: Python not found — install from python.org'; exit 1 }; "
            "python -m pip install -q huggingface-hub 2>$null; "
            "python -c \\\"from huggingface_hub import snapshot_download; print('OK')\\\""
            '"'
        )
        cmd = f'ssh {pf}{host} {setup_script}'
    elif platform == "termux":
        setup_script = (
            "pkg install -y python tmux 2>/dev/null; "
            "pip install --no-deps -q huggingface-hub 2>/dev/null; "
            "pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null; "
            "python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
        )
        cmd = f"ssh {pf}{host} '{setup_script}'"
    else:
        # Linux: auto-install tmux (via whichever package manager is available)
        # and huggingface_hub + hf_transfer (falling back to --user/--break-system-packages
        # on PEP-668 locked distros like Arch / newer Debian).
        setup_script = (
            # Install tmux if missing — try common package managers; skip if no sudo
            "if ! command -v tmux >/dev/null 2>&1; then "
            " if command -v apt-get >/dev/null 2>&1; then sudo -n apt-get install -y tmux 2>/dev/null; "
            " elif command -v pacman >/dev/null 2>&1; then sudo -n pacman -S --noconfirm tmux 2>/dev/null; "
            " elif command -v dnf >/dev/null 2>&1; then sudo -n dnf install -y tmux 2>/dev/null; "
            " elif command -v apk >/dev/null 2>&1; then sudo -n apk add --no-interactive tmux 2>/dev/null; "
            " elif command -v zypper >/dev/null 2>&1; then sudo -n zypper --non-interactive install tmux 2>/dev/null; "
            " fi; "
            "fi; "
            "command -v tmux >/dev/null 2>&1 || echo 'WARNING: tmux missing and auto-install failed (need passwordless sudo). Install manually.'; "
            # Install Python bits. Try system install first; fall back to --user --break-system-packages on PEP 668 systems.
            "pip install -q huggingface_hub hf_transfer 2>/dev/null || "
            "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null || "
            "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null; "
            "python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
        )
        cmd = f"ssh {pf}{host} '{setup_script}'"

    try:
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=120)
        except asyncio.TimeoutError:
            _terminate(proc)
            return {"ok": False, "error": "Setup timed out (120s)", "platform": platform}
        # errors="replace": remote tools may emit bytes that are not valid
        # UTF-8; that must not turn a finished setup into a decode failure.
        output = stdout.decode(errors="replace") + stderr.decode(errors="replace")
        ok = "OK" in output
        return {"ok": ok, "output": output.strip(), "platform": platform}
    except Exception as e:
        return {"ok": False, "error": str(e), "platform": platform}
|
|
|
|
# ── GPU availability probe ──
|
|
|
|
async def _run_nvidia_smi(query: str, host: str | None, ssh_port: str | None, timeout: int = 8):
|
|
"""Run nvidia-smi locally or over SSH. Returns (stdout, error_or_None)."""
|
|
if host:
|
|
pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
|
|
cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{query}'"
|
|
proc = await asyncio.create_subprocess_shell(
|
|
cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
|
)
|
|
else:
|
|
proc = await asyncio.create_subprocess_exec(
|
|
*shlex.split(query),
|
|
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
|
|
)
|
|
try:
|
|
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
|
except asyncio.TimeoutError:
|
|
proc.kill()
|
|
return None, "nvidia-smi timed out"
|
|
if proc.returncode != 0:
|
|
err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
|
|
return None, err or "nvidia-smi failed"
|
|
return stdout.decode("utf-8", errors="replace"), None
|
|
|
|
async def _run_gpu_shell(cmd_text: str, host: str | None, ssh_port: str | None, timeout: int = 8):
|
|
"""Run a small GPU probe shell command locally or over SSH."""
|
|
if host:
|
|
pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
|
|
quoted_cmd = shlex.quote(cmd_text)
|
|
remote_cmd = (
|
|
f"if command -v sh >/dev/null 2>&1; then sh -lc {quoted_cmd}; "
|
|
f"elif command -v bash >/dev/null 2>&1; then bash -lc {quoted_cmd}; "
|
|
f"elif command -v zsh >/dev/null 2>&1; then zsh -lc {quoted_cmd}; "
|
|
"else echo 'No POSIX shell found for GPU probe' >&2; exit 127; fi"
|
|
)
|
|
cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} {shlex.quote(remote_cmd)}"
|
|
proc = await asyncio.create_subprocess_shell(
|
|
cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
|
)
|
|
else:
|
|
proc = await asyncio.create_subprocess_shell(
|
|
cmd_text, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
|
)
|
|
try:
|
|
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
|
except asyncio.TimeoutError:
|
|
proc.kill()
|
|
return None, "GPU probe timed out"
|
|
if proc.returncode != 0:
|
|
err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
|
|
return None, err or f"GPU probe failed ({proc.returncode})"
|
|
return stdout.decode("utf-8", errors="replace"), None
|
|
|
|
async def _gpu_read_file(path: str, host: str | None, ssh_port: str | None) -> str | None:
    """Return the stripped contents of ``path`` (read with ``cat``), or None if the read failed."""
    text, failure = await _run_gpu_shell(
        f"cat {shlex.quote(path)} 2>/dev/null", host, ssh_port, timeout=4
    )
    if failure is None and text is not None:
        return text.strip()
    return None
|
|
|
|
async def _probe_gpu_device_processes(host: str | None, ssh_port: str | None) -> list[dict]:
    """List processes that hold ``/dev/kfd`` or ``/dev/dri/renderD*`` open.

    PIDs are gathered with ``lsof`` and ``fuser`` (whichever exist), then
    each PID's command name is looked up with ``ps``. Every entry is
    ``{"pid", "name", "used_mb"}`` with ``used_mb`` fixed at 0, since these
    tools do not report memory use. Returns ``[]`` when the probe fails or
    finds nothing.
    """
    pid_cmd = (
        "{ command -v lsof >/dev/null 2>&1 && "
        "lsof -w -t /dev/kfd /dev/dri/renderD* 2>/dev/null || true; "
        "command -v fuser >/dev/null 2>&1 && "
        "fuser /dev/kfd /dev/dri/renderD* 2>/dev/null || true; } "
        "| tr ' ' '\\n' | sed '/^[0-9][0-9]*$/!d' | sort -n -u"
    )
    listing, failure = await _run_gpu_shell(pid_cmd, host, ssh_port, timeout=5)
    if failure is not None or not listing:
        return []

    holders = []
    visited = set()
    for token in listing.splitlines():
        try:
            pid = int(token.strip())
        except ValueError:
            continue
        if pid in visited:
            continue
        visited.add(pid)
        name_out, _ = await _run_gpu_shell(f"ps -p {pid} -o comm= 2>/dev/null", host, ssh_port, timeout=3)
        comm = (name_out or "").strip()
        # Fall back to a placeholder when ps printed nothing for this PID.
        label = comm.splitlines()[0] if comm else "process"
        holders.append({"pid": pid, "name": label[:80], "used_mb": 0})
    return holders
|
|
|
|
async def _probe_amd_sysfs(host: str | None, ssh_port: str | None) -> list[dict]:
    """Build GPU entries from ``/sys/class/drm/card*/device`` for vendor ``0x1002``.

    The memory total is the larger of ``mem_info_vram_total`` and
    ``mem_info_vis_vram_total``; when both are zero the GTT total is used
    instead and the device is flagged as unified memory. Cards with no
    usable total are skipped. Device-holder processes, if any, are attached
    to the first entry only, which is then marked busy.
    """
    listing, failure = await _run_gpu_shell("ls -1 /sys/class/drm 2>/dev/null", host, ssh_port, timeout=4)
    if failure is not None or not listing:
        return []

    def _digits(raw):
        # sysfs counters are plain decimal; anything else counts as 0.
        return int(raw) if raw and raw.isdigit() else 0

    mib = 1024 * 1024
    found = []
    for card in listing.split():
        # Keep "cardN" only; connector entries such as "card0-DP-1" contain "-".
        if "-" in card or not card.startswith("card"):
            continue
        sysfs = f"/sys/class/drm/{card}/device"
        if await _gpu_read_file(f"{sysfs}/vendor", host, ssh_port) != "0x1002":
            continue

        vram_bytes = _digits(await _gpu_read_file(f"{sysfs}/mem_info_vram_total", host, ssh_port))
        vis_bytes = _digits(await _gpu_read_file(f"{sysfs}/mem_info_vis_vram_total", host, ssh_port))
        gtt_bytes = _digits(await _gpu_read_file(f"{sysfs}/mem_info_gtt_total", host, ssh_port))

        vis_dominates = bool(vis_bytes and vis_bytes >= vram_bytes)
        total_bytes = max(vram_bytes, vis_bytes)
        if total_bytes > 0:
            used_attr = "mem_info_vis_vram_used" if vis_dominates else "mem_info_vram_used"
            unified = vis_dominates
        else:
            total_bytes = gtt_bytes
            used_attr = "mem_info_gtt_used"
            unified = True
        if total_bytes <= 0:
            continue

        used_bytes = _digits(await _gpu_read_file(f"{sysfs}/{used_attr}", host, ssh_port))
        label = await _gpu_read_file(f"{sysfs}/product_name", host, ssh_port)
        if not label:
            device = await _gpu_read_file(f"{sysfs}/device", host, ssh_port)
            label = f"AMD GPU {device or card}"

        total_mb = max(0, int(total_bytes / mib))
        used_mb = max(0, min(total_mb, int(used_bytes / mib)))
        free_mb = max(0, total_mb - used_mb)
        found.append({
            "index": len(found), "name": label, "uuid": card,
            "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
            "util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
            "processes": [], "backend": "rocm", "source": "amd-sysfs",
            "unified_memory": unified,
        })

    if found:
        holders = await _probe_gpu_device_processes(host, ssh_port)
        if holders:
            found[0]["processes"] = holders
            found[0]["busy"] = True
    return found
|
|
|
|
@router.get("/api/cookbook/gpus")
async def list_gpus(request: Request, host: str | None = None, ssh_port: str | None = None):
    """Report GPU memory and process state for this machine or an SSH host.

    Probes are tried in order and the first one that yields data wins:
      1. NVIDIA via nvidia-smi
      2. AMD/ROCm and unified-memory APUs via /sys/class/drm
      3. Generic GPU device holders via /dev/kfd and /dev/dri/renderD*

    Success shape:
      { "ok": True, "gpus": [
          {"index": 0, "name": "...", "free_mb": int, "total_mb": int,
           "used_mb": int, "util_pct": int, "busy": bool,
           "uuid": "GPU-...",
           "processes": [{"pid": int, "name": str, "used_mb": int}, ...]
          }, ...
        ], "backend": str, "source": str }
    Fallback results additionally carry "fallback_from" and "nvidia_error".
    When nothing works: { "ok": False, "error": str, "gpus": [] }.

    For nvidia-smi results `busy` is True when free_mb/total_mb < 0.5; the
    fallback probes set `busy` themselves.
    """
    require_admin(request)
    host = _validate_remote_host(host)
    if ssh_port is not None and ssh_port != "" and not _SSH_PORT_RE.fullmatch(ssh_port):
        raise HTTPException(400, "Invalid ssh_port")

    nvidia_error = None
    gpu_out = ""
    try:
        smi_text, smi_err = await _run_nvidia_smi(
            "nvidia-smi --query-gpu=index,name,memory.free,memory.total,memory.used,utilization.gpu,uuid --format=csv,noheader,nounits",
            host,
            ssh_port,
        )
        if smi_err is None:
            gpu_out = smi_text
        else:
            nvidia_error = smi_err
    except FileNotFoundError:
        nvidia_error = "nvidia-smi not found"
    except Exception as exc:
        nvidia_error = str(exc)[:200]

    gpus = []
    uuid_to_idx: dict[str, int] = {}
    for row in (gpu_out or "").strip().splitlines():
        cols = [c.strip() for c in row.split(",")]
        if len(cols) < 7:
            continue
        try:
            idx = int(cols[0])
            # Values are parsed via float first, then truncated to int.
            free_mb, total_mb, used_mb, util_pct = (int(float(v)) for v in cols[2:6])
        except (ValueError, IndexError):
            continue
        gpu_uuid = cols[6]
        uuid_to_idx[gpu_uuid] = idx
        gpus.append({
            "index": idx, "name": cols[1], "uuid": gpu_uuid,
            "free_mb": free_mb, "total_mb": total_mb,
            "used_mb": used_mb, "util_pct": util_pct,
            "busy": total_mb > 0 and (free_mb / total_mb) < 0.5,
            "processes": [],
        })

    # Best-effort process listing — skip silently if it fails
    try:
        proc_out, proc_err = await _run_nvidia_smi(
            "nvidia-smi --query-compute-apps=pid,gpu_uuid,process_name,used_memory --format=csv,noheader,nounits",
            host,
            ssh_port,
            timeout=5,
        )
        if proc_err is None and proc_out:
            gpus_by_idx = {g["index"]: g for g in gpus}
            for row in proc_out.strip().splitlines():
                cols = [c.strip() for c in row.split(",")]
                if len(cols) < 4:
                    continue
                try:
                    pid = int(cols[0])
                    pmem = int(float(cols[3]))
                except (ValueError, IndexError):
                    continue
                # Processes are matched to their GPU through the UUID column.
                idx = uuid_to_idx.get(cols[1])
                if idx is None or idx not in gpus_by_idx:
                    continue
                gpus_by_idx[idx]["processes"].append({
                    "pid": pid, "name": cols[2], "used_mb": pmem,
                })
    except Exception:
        pass

    if gpus:
        return {"ok": True, "gpus": gpus, "backend": "cuda", "source": "nvidia-smi"}

    amd_gpus = await _probe_amd_sysfs(host, ssh_port)
    if amd_gpus:
        return {
            "ok": True,
            "gpus": amd_gpus,
            "backend": "rocm",
            "source": "amd-sysfs",
            "fallback_from": "nvidia-smi",
            "nvidia_error": nvidia_error,
        }

    holders = await _probe_gpu_device_processes(host, ssh_port)
    if not holders:
        return {"ok": False, "error": nvidia_error or "No GPU memory probe available", "gpus": []}

    # No memory figures available: report one placeholder device that
    # carries the holder processes.
    placeholder = {
        "index": 0, "name": "GPU device holders", "uuid": "dev-dri",
        "free_mb": 0, "total_mb": 0, "used_mb": 0, "util_pct": 0,
        "busy": True, "processes": holders,
        "backend": "generic", "source": "gpu-devices",
    }
    return {
        "ok": True,
        "gpus": [placeholder],
        "backend": "generic",
        "source": "gpu-devices",
        "fallback_from": "nvidia-smi",
        "nvidia_error": nvidia_error,
    }
|
|
|
|
class KillPidRequest(BaseModel):
    """Request body for signalling a process by PID, locally or over SSH."""

    # Target process id.
    pid: int
    # SSH destination; None targets the local machine.
    host: str | None = None
    # SSH port as a string; None leaves the port flag off the ssh command.
    ssh_port: str | None = None
    # Signal name without the "SIG" prefix.
    signal: str = "TERM"  # TERM (graceful) or KILL (force)
|
|
|
|
@router.post("/api/cookbook/kill-pid")
async def kill_pid(request: Request, req: KillPidRequest):
    """Kill a PID that's holding GPU memory.

    Admin-gated. Refuses PIDs below 100 (which also rejects zero and
    negative values) to avoid accidentally signalling init/system daemons,
    and accepts only the signals TERM, KILL and INT (case-insensitive).
    Uses `kill -<sig> <pid>` locally or over SSH, with a 5 second limit.

    Returns:
        ``{"ok": True, "pid", "signal"}`` when ``kill`` exited with 0,
        otherwise ``{"ok": False, "error"}``.

    Raises:
        HTTPException: 400 for a rejected PID, signal or ssh_port.
    """
    require_admin(request)
    if req.pid < 100:
        raise HTTPException(400, f"Refusing to signal PID {req.pid} (<100, likely system process)")
    sig = (req.signal or "TERM").upper()
    if sig not in ("TERM", "KILL", "INT"):
        raise HTTPException(400, "signal must be TERM, KILL, or INT")
    host = _validate_remote_host(req.host)
    if req.ssh_port and not _SSH_PORT_RE.fullmatch(req.ssh_port):
        raise HTTPException(400, "Invalid ssh_port")
    kill_cmd = f"kill -{sig} {req.pid}"
    proc = None
    try:
        if host:
            pf = f"-p {req.ssh_port} " if req.ssh_port and req.ssh_port != "22" else ""
            cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{kill_cmd}'"
            proc = await asyncio.create_subprocess_shell(
                cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
        else:
            proc = await asyncio.create_subprocess_exec(
                "kill", f"-{sig}", str(req.pid),
                stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
            )
        _, stderr = await asyncio.wait_for(proc.communicate(), timeout=5)
        if proc.returncode != 0:
            err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
            return {"ok": False, "error": err or f"kill returned {proc.returncode}"}
        return {"ok": True, "pid": req.pid, "signal": sig}
    except asyncio.TimeoutError:
        # Do not leave a hung ssh/kill child behind once we give up on it.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass
        return {"ok": False, "error": "kill command timed out"}
    except Exception as e:
        return {"ok": False, "error": str(e)[:200]}
|
|
|
|
# ── Cookbook state persistence (cross-device sync) ──
|
|
|
|
@router.get("/api/cookbook/state")
async def get_cookbook_state(request: Request):
    """Return the persisted cookbook state (tasks, servers, presets, settings).

    Responds with ``{}`` when no state file exists or when it cannot be
    read, parsed or converted for the client.
    """
    require_admin(request)
    if not _cookbook_state_path.exists():
        return {}
    try:
        saved = json.loads(_cookbook_state_path.read_text())
        return _state_for_client(saved)
    except Exception:
        return {}
|
|
|
|
@router.post("/api/cookbook/state")
async def save_cookbook_state(request: Request):
    """Persist cookbook state so other devices can load it.

    Admin-gated because cookbook state is read back into shell-quoting
    contexts when polling tmux session status (see status handler).

    Two guards protect on-disk data from a stale client sync:

    * Env guard — a missing ``env``, or an empty ``env.servers``, in the
      incoming body never replaces a populated on-disk value.
    * Task race guard — the UI periodically POSTs whatever localStorage
      holds, while the agent's tool layer writes tasks server-side (e.g.
      `download_model` registering a task). An on-disk task that the
      incoming body omits is kept if its ``ts`` is within the last
      RACE_WINDOW_MS milliseconds — that's a race, not an intentional
      delete.

    Returns ``{"ok": True, "preserved": n}`` or ``{"ok": False, "error": str}``.
    """
    require_admin(request)
    RACE_WINDOW_MS = 60_000
    try:
        from core.atomic_io import atomic_write_json
        import time as _t

        data = await request.json()
        if not isinstance(data, dict):
            data = {}

        # An unreadable or corrupt state file is treated as empty.
        on_disk = {}
        try:
            if _cookbook_state_path.exists():
                on_disk = json.loads(_cookbook_state_path.read_text())
        except Exception:
            on_disk = {}
        disk_is_dict = isinstance(on_disk, dict)

        # Anti-wipe guard for env servers. The UI debounces a sync of
        # whatever is in memory; if that fires before the state has hydrated
        # from GET /state (a load-time race) or during a render glitch,
        # `env.servers` would be empty and silently overwrite the saved
        # servers on disk. Keep the disk values in that case while still
        # accepting the rest of the incoming env.
        disk_env = None
        if disk_is_dict and isinstance(on_disk.get("env"), dict):
            disk_env = on_disk.get("env")
        if disk_env:
            inc_env = data.get("env") if isinstance(data.get("env"), dict) else None
            if inc_env is None:
                data["env"] = disk_env
                logger.warning("cookbook state POST: incoming body had no env; preserved on-disk env (anti-wipe guard)")
            elif disk_env.get("servers") and not inc_env.get("servers"):
                inc_env["servers"] = disk_env["servers"]
                logger.warning("cookbook state POST: incoming env.servers empty; preserved on-disk servers (anti-wipe guard)")

        disk_tasks = (on_disk.get("tasks") or []) if disk_is_dict else []
        incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
        incoming_ids = {
            task.get("sessionId")
            for task in incoming_tasks
            if isinstance(task, dict) and task.get("sessionId")
        }
        now_ms = int(_t.time() * 1000)

        preserved = []
        for task in disk_tasks:
            if not isinstance(task, dict):
                continue
            sid = task.get("sessionId")
            if not sid or sid in incoming_ids:
                continue  # client's version wins
            ts = task.get("ts") or 0
            if isinstance(ts, (int, float)) and (now_ms - ts) <= RACE_WINDOW_MS:
                preserved.append(task)

        if preserved:
            logger.info(f"cookbook state POST: preserving {len(preserved)} recent task(s) "
                        f"not in incoming body (race guard): "
                        f"{[t.get('sessionId') for t in preserved]}")
        data["tasks"] = incoming_tasks + preserved
        atomic_write_json(str(_cookbook_state_path), _state_for_storage(data, on_disk), indent=2)
        return {"ok": True, "preserved": len(preserved)}
    except Exception as e:
        return {"ok": False, "error": str(e)}
|
|
|
|
@router.get("/api/cookbook/hf-latest")
async def hf_latest(vram_gb: float = 0, limit: int = 10, pipeline: str = "text-generation", owner: str = Depends(require_user)):
    """Fetch trending HuggingFace models, filtered by what fits in available VRAM.

    The HF model listing is requested sorted by ``trendingScore`` (descending),
    then filtered locally: wrong pipeline, adapters/LoRAs/datasets, repos with
    no parseable parameter count, and models whose estimated footprint exceeds
    ``vram_gb`` are all dropped.

    vram_gb: total available VRAM in GB. 0 = no VRAM filter.
    limit: how many models to return (default 10). Values <= 0 return no models.
    pipeline: HF pipeline_tag filter (text-generation, text-to-image, etc.).

    Returns ``{"models": [...]}``; on a failed or malformed HF response returns
    ``{"models": [], "error": "<reason>"}`` instead of raising.
    """
    import re
    import httpx

    # Nothing was asked for; skip the network round trip entirely.
    if limit <= 0:
        return {"models": []}

    # Fetch a larger pool so we have enough to filter from (we drop ~80%)
    pool_size = max(limit * 15, 100)
    # Pass the query as params so httpx URL-encodes `pipeline`; interpolating
    # it into the URL let a caller smuggle extra query parameters (`&x=y`).
    query = {
        "sort": "trendingScore",
        "direction": "-1",
        "limit": pool_size,
        "filter": pipeline,
    }
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.get("https://huggingface.co/api/models", params=query)
            if resp.status_code != 200:
                return {"models": [], "error": f"HF API HTTP {resp.status_code}"}
            raw = resp.json()
    except Exception as e:
        return {"models": [], "error": str(e)}

    # The loop below assumes a list of dicts; anything else (e.g. an error
    # object) would otherwise raise AttributeError and surface as a 500.
    if not isinstance(raw, list):
        return {"models": [], "error": "HF API returned an unexpected payload"}

    # Matches a parameter count such as "7B", "70B", "1.5B" that follows a
    # separator and is not the start of a longer word (so "8bit" is ignored).
    size_re = re.compile(r'[-_/](\d+(?:\.\d+)?)\s*[Bb](?![a-zA-Z])')

    def _est_vram_fp16(repo_id: str) -> float | None:
        """Approximate fp16 VRAM in GB (params_in_billions * 2), or None if no size is found."""
        m = size_re.search(repo_id)
        if not m:
            return None
        return float(m.group(1)) * 2.0  # fp16 baseline: 2 bytes per parameter

    four_bit_markers = ("fp4", "nf4", "int4", "4bit", "q4", "awq", "gptq")
    eight_bit_markers = ("int8", "8bit", "q8", "fp8")

    def _quant_factor(repo_id: str, tags: list) -> float:
        """Multiplier on the fp16 size, detected from quantization hints in the id/tags."""
        text = (repo_id + " " + " ".join(tags or [])).lower()
        if any(marker in text for marker in four_bit_markers):
            return 0.25
        if any(marker in text for marker in eight_bit_markers):
            return 0.5
        return 1.0  # bf16/fp16 or unknown: assume full fp16 size

    # Exclude adapters, LoRAs, datasets, and other non-runnable artifacts
    EXCLUDE_TAG_SUBSTRINGS = (
        "lora", "adapter", "peft", "qlora",
        "dataset", "embeddings",
        "merge", "control-lora",
        "diffusion-lora", "stable-diffusion-lora",
        "text-classification", "token-classification",
        "feature-extraction", "sentence-similarity",
    )
    EXCLUDE_NAME_SUBSTRINGS = (
        "lora", "adapter", "peft", "qlora",
        "embedding", "embed-",
        "dataset",
    )

    def _is_excluded(repo_id: str, tags: list) -> bool:
        """True when the repo name or its tags mark it as a non-runnable artifact."""
        name_text = repo_id.lower()
        if any(s in name_text for s in EXCLUDE_NAME_SUBSTRINGS):
            return True
        tag_text = " ".join(t.lower() for t in (tags or []))
        return any(s in tag_text for s in EXCLUDE_TAG_SUBSTRINGS)

    out = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        repo_id = entry.get("modelId") or entry.get("id") or ""
        if not repo_id or not isinstance(repo_id, str):
            continue
        raw_tags = entry.get("tags")
        # Keep only string tags so the joins in the helpers cannot raise.
        tags = [t for t in raw_tags if isinstance(t, str)] if isinstance(raw_tags, list) else []
        pipeline_tag = entry.get("pipeline_tag") or ""

        # Hard filter: only the requested pipeline (HF's filter param is loose)
        if pipeline and pipeline_tag and pipeline_tag != pipeline:
            continue
        # Skip adapters, LoRAs, datasets, etc.
        if _is_excluded(repo_id, tags):
            continue

        est_fp16 = _est_vram_fp16(repo_id)
        # Skip if no size info — without a size we can't tell if it's a real
        # full-weight model or a tiny adapter, so we'd rather drop it
        if not est_fp16:
            continue
        est_vram = est_fp16 * _quant_factor(repo_id, tags)
        # Add 30% headroom for KV cache, activations, etc.
        needed_vram = est_vram * 1.3

        if vram_gb > 0 and needed_vram > vram_gb:
            continue

        out.append({
            "repo_id": repo_id,
            "downloads": entry.get("downloads", 0),
            "likes": entry.get("likes", 0),
            "createdAt": entry.get("createdAt", ""),
            "tags": tags[:5],  # trim
            "pipeline_tag": pipeline_tag,
            "est_vram_gb": round(est_vram, 1),
            "needed_vram_gb": round(needed_vram, 1),
        })
        if len(out) >= limit:
            break

    return {"models": out}
|
|
|
|
@router.get("/api/cookbook/tasks/status")
async def cookbook_tasks_status(request: Request):
    """Report the status of every saved cookbook session (admin only).

    The actual probing is done by `_cookbook_tasks_status_sync`, which issues
    blocking `subprocess.run` calls. Running those directly in this coroutine
    would stall the event loop for every other request, so the whole body is
    handed to a worker thread via `asyncio.to_thread`.
    """
    require_admin(request)
    status_payload = await asyncio.to_thread(_cookbook_tasks_status_sync)
    return status_payload
|
|
|
|
def _cookbook_tasks_status_sync():
    """Blocking implementation behind GET /api/cookbook/tasks/status.

    Loads the saved task list from the cookbook state file, then for each task
    with a valid session id:
      1. probes whether the session is alive (local tmux, tmux over ssh, or a
         PID file + Get-Process over ssh for Windows remotes),
      2. captures the tail of its output, and
      3. classifies it as running / ready / completed / error / stopped.

    Returns ``{"tasks": [...]}`` with one dict per probed task. Malformed task
    entries are skipped rather than failing the whole request.
    """
    import subprocess

    # Load saved tasks from cookbook state
    tasks = []
    if _cookbook_state_path.exists():
        try:
            state = json.loads(_cookbook_state_path.read_text())
            saved_tasks = state.get("tasks", [])
            if isinstance(saved_tasks, list):
                tasks = saved_tasks
            elif isinstance(saved_tasks, dict):
                tasks = list(saved_tasks.values())
        except Exception as e:
            # Best effort: an unreadable state file means "no tasks", but say so.
            logger.warning(f"cookbook tasks status: could not read saved tasks: {e}")

    results = []
    for task in tasks:
        # State is free-form JSON; a non-dict entry must not abort the loop.
        if not isinstance(task, dict):
            continue
        session_id = task.get("sessionId", "")
        if not session_id:
            continue
        remote = task.get("remoteHost", "")
        task_type = task.get("type", "download")  # "download" or "serve"
        # Field name varies depending on whether the task was added
        # via the download flow (`repoId`), the serve flow (`modelId`),
        # or the UI-side serve preset (which uses `name` + `payload.repo_id`).
        _payload = task.get("payload") or {}
        if not isinstance(_payload, dict):
            _payload = {}
        model = str(
            task.get("modelId")
            or task.get("repoId")
            or task.get("name")
            or _payload.get("repo_id")
            or _payload.get("modelId")
            or ""
        )
        task_platform = task.get("platform", "")

        # Normalise the port to a string so an int 22 compares equal to "22".
        _tport = task.get("sshPort", "")
        port = str(_tport) if _tport else ""

        # Defense-in-depth: cookbook state is admin-writable but the values
        # land in shell-interpolated commands below. Reject anything that
        # isn't a benign session-id / hostname / port.
        if not isinstance(session_id, str) or not _SESSION_ID_RE.match(session_id):
            logger.warning(f"Skipping task with unsafe session_id: {session_id!r}")
            continue
        if remote and (not isinstance(remote, str) or not _REMOTE_HOST_RE.match(remote)):
            logger.warning(f"Skipping task with unsafe remoteHost: {remote!r}")
            continue
        if port and not _SSH_PORT_RE.match(port):
            logger.warning(f"Skipping task with unsafe sshPort: {_tport!r}")
            continue

        ssh_base = ["ssh"]
        if port and port != "22":
            ssh_base.extend(["-p", port])

        if task_platform == "windows" and remote:
            # Windows: check PID file + Get-Process, read log tail
            sd = "$env:TEMP\\odysseus-sessions"
            # Two fixes relative to the earlier command text:
            #  * `$pid` is PowerShell's read-only automatic variable (the
            #    shell's own process id), so a different name is used.
            #  * The braces are single. This fragment is a plain string, not
            #    an f-string, so doubled braces were sent verbatim and became
            #    nested script-block literals that PowerShell never invoked.
            check_cmd = ssh_base + [
                remote,
                "powershell",
                "-Command",
                f"$odyPid = Get-Content \"{sd}\\{session_id}.pid\" -ErrorAction SilentlyContinue; "
                "if ($odyPid) { Get-Process -Id $odyPid -ErrorAction SilentlyContinue | Out-Null; if ($?) { exit 0 } else { exit 1 } } else { exit 1 }"
            ]
            capture_cmd = ssh_base + [
                remote,
                "powershell",
                "-Command",
                f"Get-Content \"{sd}\\{session_id}.log\" -Tail 10 -ErrorAction SilentlyContinue",
            ]
        elif remote:
            check_cmd = ssh_base + [remote, "tmux", "has-session", "-t", session_id]
            capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
        else:
            check_cmd = ["tmux", "has-session", "-t", session_id]
            capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]

        # Check if session is alive; any failure to probe counts as "not alive".
        try:
            alive = subprocess.run(check_cmd, timeout=10, capture_output=True)
            is_alive = alive.returncode == 0
        except Exception:
            is_alive = False

        # Capture last lines for progress. Prefer the "Downloading" line
        # (real aggregate bytes) over "Fetching N files" (whole-file count that
        # lags with hf_transfer). Falls back to the true last line otherwise.
        progress_text = ""
        full_snapshot = ""
        if is_alive:
            try:
                cap = subprocess.run(capture_cmd, timeout=10, capture_output=True, text=True)
                if cap.returncode == 0:
                    full_snapshot = cap.stdout.strip()
                    lines = [ln.strip() for ln in full_snapshot.split('\n') if ln.strip()]
                    downloading_lines = [ln for ln in lines if ln.startswith("Downloading")]
                    if downloading_lines:
                        progress_text = downloading_lines[-1]
                    elif lines:
                        progress_text = lines[-1]
            except Exception as e:
                # Output capture is best effort; status falls back to "running".
                logger.debug(f"cookbook tasks status: capture failed for {session_id}: {e}")

        # Determine status
        status = "unknown"
        if is_alive:
            lower = full_snapshot.lower()
            has_exit = "=== process exited with code" in lower
            has_error = "error" in lower or "failed" in lower or "traceback" in lower
            startup_done = "application startup complete" in lower
            if has_exit and task_type == "serve":
                # Serve tasks that exit are always errors — they should run indefinitely
                status = "error"
            elif has_exit and "unrecognized arguments" in lower:
                status = "error"
            elif has_error and not startup_done:
                status = "error"
            elif task_type == "download" and ("100%" in full_snapshot or "DOWNLOAD_OK" in full_snapshot):
                # Only download tasks treat 100% as "completed".
                # Serve tasks log 100%|██████| during inference progress
                # (diffusion sampling, etc.) — that's "running", not done.
                status = "completed"
            elif startup_done:
                status = "ready"
            else:
                status = "running"
        else:
            # Session is dead — no output is captured, so it is only reported as stopped
            status = "stopped"

        # Parse structured phase info — single source of truth for the UI
        phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
        if phase_info.get("status") == "ready":
            status = "ready"
        serve_phase = phase_info.get("phase", "")
        diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
        # A recognised failure pattern overrides any non-terminal status.
        if diagnosis and status in {"running", "unknown", "stopped"}:
            status = "error"
        output_tail = "\n".join(full_snapshot.splitlines()[-12:]) if full_snapshot else ""

        results.append({
            "session_id": session_id,
            "type": task_type,
            "model": model.split("/")[-1] if "/" in model else model,
            "status": status,
            "progress": serve_phase if task_type == "serve" else progress_text[:120],
            "phase": serve_phase,
            "diagnosis": diagnosis,
            "output_tail": output_tail,
            "cmd": _payload.get("_cmd") or "",
            "tps": phase_info.get("tps"),
            "reqs": phase_info.get("reqs"),
            "pct": phase_info.get("pct"),
            "remote": remote or "local",
        })

    return {"tasks": results}
|
|
|
|
return router
|