odysseus/scripts/odysseus-cookbook

#!/usr/bin/env python3
"""odysseus-cookbook — shell wrapper for the cookbook feature.

The web UI orchestrates HuggingFace model downloads + local serving
through tmux sessions and writes its bookkeeping to
`data/cookbook_state.json`. This CLI exposes the same operations on
the shell so they can be cron'd, piped, or scripted:

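    odysseus-cookbook list                       # active downloads + servers
    odysseus-cookbook gpus                       # nvidia-smi per-GPU JSON
    odysseus-cookbook cached                     # local HF cache snapshot
    odysseus-cookbook hf-latest --vram-gb 24     # trending HF models that fit
    odysseus-cookbook download Qwen/Qwen3-8B     # fire off `hf download` in tmux
    odysseus-cookbook serve NAME --cmd '...'     # run a serve command in tmux
    odysseus-cookbook kill cookbook-dl-abc12345  # tmux kill-session
    odysseus-cookbook state                      # dump cookbook_state.json
    odysseus-cookbook state-set < state.json     # replace cookbook_state.json from stdin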

Reads/writes the same `data/cookbook_state.json` the web UI uses, so
state stays in sync. Output is JSON on stdout, errors on stderr,
non-zero exit on failure.
"""


from __future__ import annotations
import sys
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "_lib"))
from cli import quiet_logs, emit, fail, common_parser, run, REPO_ROOT as _REPO_ROOT
quiet_logs()


import argparse
import json
import logging
import os
import re
import subprocess
import sys
import urllib.request
import urllib.parse
import uuid
from pathlib import Path

# Bookkeeping directory: $DATA_DIR when that variable is set, otherwise
# `<REPO_ROOT>/data`, where REPO_ROOT is imported from the `cli` helper module.
_DATA_DIR = Path(os.environ.get("DATA_DIR", str(_REPO_ROOT / "data")))
# State file read by `_read_state` and rewritten by `cmd_state_set`.
_STATE_PATH = _DATA_DIR / "cookbook_state.json"

# Mirror routes/shell_routes.TMUX_LOG_DIR — don't import it because that pulls
# the whole web app into the process. Match its definition instead.
import tempfile
# Locally launched tmux sessions tee their output to `<session_id>.log` here.
_TMUX_LOG_DIR = Path(tempfile.gettempdir()) / "odysseus-tmux"


def fail(msg: str, code: int = 1) -> None:
    """Print `error: <msg>` on stderr and terminate the process.

    Never returns: raises SystemExit carrying `code` (default 1).
    """
    print(f"error: {msg}", file=sys.stderr)
    raise SystemExit(code)


def _tmux_sessions() -> list[str]:
    """Return active tmux session names, or [] if tmux isn't installed."""
    try:
        out = subprocess.run(
            ["tmux", "list-sessions", "-F", "#S"],
            capture_output=True, text=True, timeout=5,
        )
        if out.returncode != 0:
            return []
        return [s.strip() for s in out.stdout.splitlines() if s.strip()]
    except FileNotFoundError:
        return []
    except Exception:
        return []


def _read_state() -> dict:
    """Load the cookbook state file as parsed JSON.

    Returns `{}` when the file does not exist, or when reading or parsing
    it raises.
    """
    if _STATE_PATH.exists():
        try:
            raw = _STATE_PATH.read_text()
            return json.loads(raw)
        except Exception:
            # Unreadable or corrupt state is treated the same as no state.
            pass
    return {}


# ─── list ────────────────────────────────────────────────────────────

def cmd_list(args) -> None:
    """Emit the cookbook state together with the live tmux sessions.

    Output: {state, all_tmux_sessions, cookbook_sessions} where
    cookbook_sessions is the subset of tmux sessions whose name starts
    with `cookbook-`."""
    all_sessions = _tmux_sessions()
    payload = {
        "state": _read_state(),
        "all_tmux_sessions": all_sessions,
        "cookbook_sessions": [name for name in all_sessions if name.startswith("cookbook-")],
    }
    emit(payload, args)


# ─── gpus ────────────────────────────────────────────────────────────

def _macos_metal_gpu() -> list | None:
    """Apple Silicon has no discrete VRAM — report total unified memory as the
    GPU budget so the web UI's picker shows the Mac's Metal GPU instead of
    'no GPU'. `free` is approximated from vm_stat (page-granular); macOS doesn't
    expose Metal utilization to the shell, so util is 0. Returns None off macOS."""
    if sys.platform != "darwin":
        return None

    def _sysctl(key: str) -> str | None:
        try:
            r = subprocess.run(["sysctl", "-n", key], capture_output=True, text=True, timeout=5)
            return r.stdout.strip() if r.returncode == 0 else None
        except Exception:
            return None

    memsize = _sysctl("hw.memsize")
    if not memsize or not memsize.isdigit():
        return None
    total_mb = int(memsize) // (1024 * 1024)
    name = _sysctl("machdep.cpu.brand_string") or "Apple Silicon"

    free_mb = total_mb
    try:
        vm = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
        if vm.returncode == 0:
            page_size, pages = 4096, {}
            for line in vm.stdout.splitlines():
                if "page size of" in line:
                    m = re.search(r"page size of (\d+)", line)
                    if m:
                        page_size = int(m.group(1))
                elif ":" in line:
                    k, v = line.split(":", 1)
                    v = v.strip().rstrip(".")
                    if v.isdigit():
                        pages[k.strip()] = int(v)
            free_pages = (pages.get("Pages free", 0) + pages.get("Pages inactive", 0)
                          + pages.get("Pages speculative", 0))
            if free_pages:
                free_mb = (free_pages * page_size) // (1024 * 1024)
    except Exception:
        pass

    return [{
        "index": 0,
        "name": name,
        "free_mb": free_mb,
        "total_mb": total_mb,
        "used_mb": max(0, total_mb - free_mb),
        "util_pct": 0,
        "uuid": "apple-metal-0",
        "unified_memory": True,
        "busy": (free_mb / total_mb) < 0.5 if total_mb else False,
    }]


def cmd_gpus(args) -> None:
    """Emit per-GPU memory/utilisation as JSON.

    Same shape the web UI gets — index/name/free_mb/total_mb/used_mb/
    util_pct/uuid, plus a derived `busy` flag (less than half the memory
    free). On Apple Silicon (no nvidia-smi) reports the Metal GPU's
    unified memory instead, tagged `backend: metal`. Pass `--host user@box`
    to run nvidia-smi over SSH; the Metal fallback is local-only.

    Failures are reported in-band as `{ok: false, error, gpus: []}`:
    missing binary, non-zero exit status, or the command exceeding the
    15 second timeout.
    """
    query = "nvidia-smi --query-gpu=index,name,memory.free,memory.total,memory.used,utilization.gpu,uuid --format=csv,noheader,nounits"
    prefix = _ssh_prefix(args.host, args.ssh_port)
    # Locally nvidia-smi is exec'd directly (argv list); over SSH the whole
    # query travels as one string for the remote shell to split.
    cmd = prefix + ([query] if prefix else query.split())

    def _emit_metal_fallback() -> bool:
        # Emit the Metal report if we're local and on macOS; True when emitted.
        if prefix:
            return False
        mac = _macos_metal_gpu()
        if mac is None:
            return False
        emit({"ok": True, "gpus": mac, "backend": "metal"}, args)
        return True

    try:
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
    except FileNotFoundError:
        # No nvidia-smi locally → try the Metal fallback before giving up.
        if _emit_metal_fallback():
            return
        msg = "ssh not found" if prefix else "nvidia-smi not found"
        emit({"ok": False, "error": msg, "gpus": []}, args)
        return
    except subprocess.TimeoutExpired:
        # A hung ssh / nvidia-smi must not surface as a traceback.
        tool = "ssh" if prefix else "nvidia-smi"
        emit({"ok": False, "error": f"{tool} timed out after 15s", "gpus": []}, args)
        return

    if out.returncode != 0:
        # nvidia-smi present but errored (or no NVIDIA GPU) — fall back to Metal.
        if _emit_metal_fallback():
            return
        emit({"ok": False, "error": out.stderr.strip()[:200], "gpus": []}, args)
        return

    gpus = []
    for line in out.stdout.strip().splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 7:
            continue
        idx, name, free_mb, total_mb, used_mb, util, gpu_uuid = parts[:7]
        try:
            total_i, free_i = int(total_mb), int(free_mb)
            gpus.append({
                "index": int(idx),
                "name": name,
                "free_mb": free_i,
                "total_mb": total_i,
                "used_mb": int(used_mb),
                "util_pct": int(util),
                "uuid": gpu_uuid,
                "busy": (free_i / total_i) < 0.5 if total_i else False,
            })
        except ValueError:
            # Row with a non-numeric field — skip it.
            continue
    emit({"ok": True, "gpus": gpus}, args)


# ─── cached ──────────────────────────────────────────────────────────

def cmd_cached(args) -> None:
    """List cached HuggingFace repos with their on-disk size.

    Walks `$HF_HOME/hub` (default `~/.cache/huggingface/hub`) and emits
    `{models: [{repo, path, size_bytes}], hub_path}`. Cheap version of the
    route's full-scan helper — good enough for a `which models do I
    already have` glance.

    Hidden entries such as the cache's `.locks` directory are bookkeeping,
    not repos, and are skipped. Sizes count regular files only (symlinks
    are skipped so each file is counted once) and are best-effort: a
    filesystem error leaves the partial total in place.
    """
    hf_home = Path(os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface"))
    hub = hf_home / "hub"
    if not hub.is_dir():
        emit({"models": [], "hub_path": str(hub), "note": "no hub cache yet"}, args)
        return

    # Hub layout: `models--<org>--<repo>` becomes `<org>/<repo>`; datasets
    # keep a `datasets/` prefix so they can't be confused with models.
    kind_prefixes = (("models--", ""), ("datasets--", "datasets/"))

    models = []
    for entry in sorted(hub.iterdir()):
        if entry.name.startswith(".") or not entry.is_dir():
            continue
        repo = entry.name
        for dir_prefix, repo_prefix in kind_prefixes:
            if entry.name.startswith(dir_prefix):
                repo = repo_prefix + entry.name[len(dir_prefix):].replace("--", "/")
                break
        size = 0
        try:
            for f in entry.rglob("*"):
                if f.is_file() and not f.is_symlink():
                    size += f.stat().st_size
        except OSError:
            # Best-effort: report whatever was summed before the error.
            pass
        models.append({"repo": repo, "path": str(entry), "size_bytes": size})
    emit({"models": models, "hub_path": str(hub)}, args)


# ─── hf-latest ───────────────────────────────────────────────────────

def cmd_hf_latest(args) -> None:
    """Emit trending HF models, optionally filtered by VRAM-at-fp16 fit.

    Mirrors `/api/cookbook/hf-latest` so cron jobs that pre-pull
    "models that fit on my box this week" can use the same filter.

    Fetches a pool larger than `--limit` (filtering discards entries),
    estimates fp16 VRAM from the parameter count embedded in the repo id
    (e.g. `-8B` → 16 GB) and drops models whose estimate exceeds
    `--vram-gb`. Models with no recognisable size are kept. At most
    `--limit` models are emitted; a limit of 0 or less emits none.

    Exits via `fail()` when the request fails or the response is not a
    JSON list.
    """
    pool_size = max(args.limit * 15, 100)
    url = (
        "https://huggingface.co/api/models"
        f"?sort=trendingScore&direction=-1&limit={pool_size}&filter={urllib.parse.quote(args.pipeline)}"
    )
    try:
        with urllib.request.urlopen(url, timeout=15) as resp:
            raw = json.loads(resp.read().decode("utf-8"))
    except Exception as e:
        fail(f"HF API request failed: {e}")
    if not isinstance(raw, list):
        # e.g. an error object — iterating it would crash on `.get`.
        fail(f"HF API returned unexpected payload ({type(raw).__name__}, expected list)")

    # "<sep><number>B" not followed by a letter, e.g. "Qwen3-8B", "x_1.5b".
    size_re = re.compile(r'[-_/](\d+(?:\.\d+)?)\s*[Bb](?![a-zA-Z])')

    def _est_vram_fp16(repo_id: str) -> float | None:
        # Billions of params × 2 bytes/param (fp16) → GB; None if no size in the id.
        found = size_re.search(repo_id)
        if not found:
            return None
        return float(found.group(1)) * 2

    out = []
    for m in raw:
        # Check before appending so limit <= 0 yields an empty list.
        if len(out) >= args.limit:
            break
        if not isinstance(m, dict):
            continue
        rid = m.get("id") or m.get("modelId") or ""
        if not rid:
            continue
        vram = _est_vram_fp16(rid)
        if args.vram_gb > 0 and vram is not None and vram > args.vram_gb:
            continue
        out.append({
            "id": rid,
            "downloads": m.get("downloads", 0),
            "likes": m.get("likes", 0),
            "trendingScore": m.get("trendingScore"),
            "pipeline_tag": m.get("pipeline_tag", ""),
            "est_vram_fp16_gb": vram,
        })
    emit({"models": out, "vram_gb_filter": args.vram_gb}, args)


# ─── download ────────────────────────────────────────────────────────

def cmd_download(args) -> None:
    """Start `hf download <repo>` in a detached tmux session.

    Emits the session id, the log path and ready-to-run `tail_cmd` /
    `kill_cmd` strings so callers can follow or stop the download later.

    Pass `--host user@box` to run the download on a remote machine over
    SSH. The remote needs `tmux` and `hf` installed; its log goes to
    `/tmp/odysseus-tmux/<session>.log` on that machine. When `--ssh-port`
    is given, the emitted `tail_cmd` / `kill_cmd` carry the port as well so
    they reach the same endpoint the session was started on.

    Exits via `fail()` on an invalid repo id, a missing `ssh`/`tmux`
    binary, a launch timeout, or a non-zero launch status.
    """
    if not re.fullmatch(r"[\w.-]+/[\w.-]+", args.repo):
        fail(f"invalid repo id {args.repo!r} — expected `org/name`")

    session_id = f"cookbook-dl-{uuid.uuid4().hex[:8]}"
    cmd_parts = ["hf", "download", args.repo]
    if args.include:
        cmd_parts += ["--include", args.include]
    if args.revision:
        cmd_parts += ["--revision", args.revision]
    # Every argument is single-quoted because the command is run by `bash -lc`.
    hf_cmd = " ".join(map(_shell_quote, cmd_parts))

    if args.host:
        # Remote — the log path is fixed under /tmp on the remote machine.
        remote_log = f"/tmp/odysseus-tmux/{session_id}.log"
        pipeline = f"{hf_cmd} 2>&1 | tee {remote_log}; echo DONE"
        remote_shell_cmd = (
            f"mkdir -p /tmp/odysseus-tmux && "
            f"tmux new-session -d -s {_shell_quote(session_id)} "
            f"bash -lc {_shell_quote(pipeline)}"
        )
        ssh_argv = _ssh_prefix(args.host, args.ssh_port) + [remote_shell_cmd]
        try:
            out = subprocess.run(ssh_argv, capture_output=True, text=True, timeout=20)
        except FileNotFoundError:
            fail("ssh not installed")
        except subprocess.TimeoutExpired:
            fail("ssh timed out while launching the remote tmux session")
        if out.returncode != 0:
            fail(f"remote tmux launch failed: {out.stderr.strip() or out.stdout.strip()}")
        # Propagate a non-default SSH port into the follow-up commands.
        tail_port = f"-p {args.ssh_port} " if args.ssh_port else ""
        kill_port = f" --ssh-port {args.ssh_port}" if args.ssh_port else ""
        emit({
            "ok": True,
            "session_id": session_id,
            "repo": args.repo,
            "host": args.host,
            "remote_log_path": remote_log,
            "tail_cmd": f"ssh {tail_port}{args.host} tail -f {remote_log}",
            "kill_cmd": f"odysseus-cookbook kill {session_id} --host {args.host}{kill_port}",
        }, args)
        return

    _TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_path = _TMUX_LOG_DIR / f"{session_id}.log"
    shell_cmd = f"{hf_cmd} 2>&1 | tee {_shell_quote(str(log_path))}; echo DONE"
    try:
        subprocess.run(
            ["tmux", "new-session", "-d", "-s", session_id, "bash", "-lc", shell_cmd],
            check=True, capture_output=True, text=True, timeout=10,
        )
    except FileNotFoundError:
        fail("tmux not installed — can't run background sessions from CLI")
    except subprocess.TimeoutExpired:
        fail("tmux timed out while creating the session")
    except subprocess.CalledProcessError as e:
        fail(f"tmux failed: {e.stderr or e.stdout}")

    emit({
        "ok": True,
        "session_id": session_id,
        "repo": args.repo,
        "log_path": str(log_path),
        "tail_cmd": f"tail -f {log_path}",
        "kill_cmd": f"odysseus-cookbook kill {session_id}",
    }, args)


def _shell_quote(s: str) -> str:
    """Minimal POSIX-shell quoting — wraps `s` in single quotes and
    escapes any embedded single quotes."""
    return "'" + s.replace("'", "'\\''") + "'"


# ─── serve ───────────────────────────────────────────────────────────

def cmd_serve(args) -> None:
    """Run an arbitrary serve command in a detached tmux session.

    Deliberately not opinionated about flags — the web UI handles
    platform-specific template generation. For the CLI you pass the full
    serve command via `--cmd`. Common patterns:

        odysseus-cookbook serve qwen3-8b --cmd 'python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen3-8B --port 8000'
        odysseus-cookbook serve sdxl --cmd 'python scripts/diffusion_server.py --model stabilityai/sdxl --port 8006'

    The session is named `serve-<name>-<6 hex>`. `<name>` is reduced to
    word characters and `-` (max 32 chars): tmux does not allow `.` or `:`
    in session names, so a label such as `qwen2.5` must be sanitised or the
    emitted `session_id` / `kill_cmd` would not match the real session.

    `--cmd` is run verbatim by `bash -lc`; output is tee'd to a log file
    under the tmux log directory. Exits via `fail()` on an empty command,
    a missing tmux binary, a launch timeout, or a tmux error.
    """
    if not args.cmd or not args.cmd.strip():
        fail("--cmd is required and must be a non-empty serve command")
    _TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
    # No "." here: tmux would rewrite/reject it and the name we report
    # would stop matching the session that actually exists.
    safe_name = re.sub(r"[^\w-]", "-", args.name)[:32] or "anon"
    session_id = f"serve-{safe_name}-{uuid.uuid4().hex[:6]}"
    log_path = _TMUX_LOG_DIR / f"{session_id}.log"
    shell_cmd = f"{args.cmd} 2>&1 | tee {_shell_quote(str(log_path))}; echo DONE"
    try:
        subprocess.run(
            ["tmux", "new-session", "-d", "-s", session_id, "bash", "-lc", shell_cmd],
            check=True, capture_output=True, text=True, timeout=10,
        )
    except FileNotFoundError:
        fail("tmux not installed")
    except subprocess.TimeoutExpired:
        fail("tmux timed out while creating the session")
    except subprocess.CalledProcessError as e:
        fail(f"tmux failed: {e.stderr or e.stdout}")
    emit({
        "ok": True,
        "session_id": session_id,
        "log_path": str(log_path),
        "cmd": args.cmd,
        "tail_cmd": f"tail -f {log_path}",
        "kill_cmd": f"odysseus-cookbook kill {session_id}",
    }, args)


# ─── state set ───────────────────────────────────────────────────────

def cmd_state_set(args) -> None:
    """Write JSON from stdin to data/cookbook_state.json.

    The payload is parsed first (so invalid JSON never reaches disk),
    then re-serialised with 2-space indentation into a sibling `.tmp`
    file which is renamed over the state file, so a partial write can't
    corrupt it.

    Before overwriting we copy the previous state to .bak — if you ever
    nuke your live state by piping the wrong thing into stdin, restore
    with `cp data/cookbook_state.json.bak data/cookbook_state.json`.
    The backup is best-effort: if it cannot be written a warning goes to
    stderr and the update still proceeds.

    Emits `{ok, path, bytes}` where `bytes` is the length of the text read
    from stdin. Exits via `fail()` on empty or invalid input.
    """
    data = sys.stdin.read()
    if not data.strip():
        fail("expected JSON on stdin")
    try:
        obj = json.loads(data)
    except json.JSONDecodeError as e:
        fail(f"invalid JSON on stdin: {e}")
    _STATE_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Backup the existing state — undo button if a bad pipe clobbers it.
    if _STATE_PATH.exists():
        bak = _STATE_PATH.with_suffix(_STATE_PATH.suffix + ".bak")
        try:
            bak.write_bytes(_STATE_PATH.read_bytes())
        except OSError as e:
            # Don't block the update, but don't let the caller assume an
            # undo copy exists either. stderr keeps stdout pure JSON.
            sys.stderr.write(f"warning: could not back up state to {bak}: {e}\n")
    tmp = _STATE_PATH.with_suffix(_STATE_PATH.suffix + ".tmp")
    tmp.write_text(json.dumps(obj, indent=2, ensure_ascii=False))
    tmp.replace(_STATE_PATH)
    emit({"ok": True, "path": str(_STATE_PATH), "bytes": len(data)}, args)


# ─── remote helpers ──────────────────────────────────────────────────

def _ssh_prefix(host: str | None, port: str | None) -> list[str]:
    """Return the ssh argv prefix when --host is given, else []."""
    if not host:
        return []
    cmd = ["ssh"]
    if port:
        cmd += ["-p", str(port)]
    cmd += ["-o", "BatchMode=yes", "-o", "ConnectTimeout=5", host]
    return cmd


# ─── kill ────────────────────────────────────────────────────────────

def cmd_kill(args) -> None:
    """Terminate a tmux session by name.

    Idempotent — a session that is already gone (tmux reports it can't
    find the session, or no tmux server is running) still counts as
    success. Pass `--host user@box` to kill a remote session created via
    `download --host`.

    Emits `{ok, session, host, was_running, already_gone}`. `ok` is true
    when the session was killed or was already gone; any other failure
    (e.g. the SSH connection failing) yields `ok: false` plus an `error`
    field holding the first 200 characters of stderr.

    Exits via `fail()` when the `tmux`/`ssh` binary is missing or the
    command times out.
    """
    prefix = _ssh_prefix(args.host, args.ssh_port)
    # Over SSH the argv is re-joined and parsed by the remote shell, so the
    # session name must be quoted there; locally it is passed as-is.
    target = _shell_quote(args.session) if prefix else args.session
    cmd = prefix + ["tmux", "kill-session", "-t", target]
    try:
        out = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    except FileNotFoundError:
        fail("tmux not installed" if not args.host else "ssh not installed")
    except subprocess.TimeoutExpired:
        fail("ssh timed out" if args.host else "tmux timed out")

    stderr = (out.stderr or "").strip()
    killed = out.returncode == 0
    lowered = stderr.lower()
    already_gone = not killed and (
        "can't find session" in lowered or "no server running" in lowered
    )
    result = {
        "ok": killed or already_gone,
        "session": args.session,
        "host": args.host or "local",
        "was_running": killed,
        "already_gone": already_gone,
    }
    if not result["ok"]:
        result["error"] = stderr[:200]
    emit(result, args)


# ─── state ───────────────────────────────────────────────────────────

def cmd_state(args) -> None:
    """Emit the cookbook state file as-is (the web UI's localStorage-y
    JSON for active downloads/servers); `{}` when it is missing or
    unreadable."""
    state = _read_state()
    emit(state, args)


# ─── argparse ────────────────────────────────────────────────────────

def _build_parser() -> argparse.ArgumentParser:
    """Build the `odysseus-cookbook` argument parser.

    One sub-command per `cmd_*` handler; the handler is stored on the
    parsed namespace as `func`. `--pretty` is accepted both before and
    after the sub-command name and ends up as `args.pretty` either way.
    """

    def _pretty_parent(default) -> argparse.ArgumentParser:
        # Parent parser that contributes only the --pretty flag.
        parent = argparse.ArgumentParser(add_help=False)
        parent.add_argument("--pretty", action="store_true", default=default,
                            help="Pretty-print JSON")
        return parent

    # A sub-command parses into its own namespace and every attribute it
    # sets — defaults included — is copied over the top-level one. With a
    # plain False default, `odysseus-cookbook --pretty list` would be reset
    # to pretty=False. SUPPRESS makes the sub-command touch `pretty` only
    # when the flag is actually given there.
    common = _pretty_parent(argparse.SUPPRESS)

    p = argparse.ArgumentParser(
        prog="odysseus-cookbook",
        description="Shell-friendly wrapper around the Odysseus cookbook (model download + serve).",
        # The top-level copy owns the real default so args.pretty always exists.
        parents=[_pretty_parent(False)],
    )
    sub = p.add_subparsers(dest="cmd", required=True)

    pl = sub.add_parser("list", help="active tmux sessions + cookbook state", parents=[common])
    pl.set_defaults(func=cmd_list)

    pg = sub.add_parser("gpus", help="per-GPU free/used VRAM (nvidia-smi)", parents=[common])
    pg.add_argument("--host", help="run nvidia-smi over SSH against this host")
    pg.add_argument("--ssh-port", help="SSH port (default: 22)")
    pg.set_defaults(func=cmd_gpus)

    pc = sub.add_parser("cached", help="HuggingFace local cache snapshot", parents=[common])
    pc.set_defaults(func=cmd_cached)

    ph = sub.add_parser("hf-latest", help="trending HF models, VRAM-filtered", parents=[common])
    ph.add_argument("--vram-gb", type=float, default=0, help="filter to models that fit (0 = all)")
    ph.add_argument("--limit", type=int, default=10)
    ph.add_argument("--pipeline", default="text-generation",
                    help="HF pipeline_tag (text-generation, text-to-image, etc.)")
    ph.set_defaults(func=cmd_hf_latest)

    pd = sub.add_parser("download", help="`hf download <repo>` in a tmux session", parents=[common])
    pd.add_argument("repo", help="HF repo id, e.g. 'Qwen/Qwen3-8B'")
    pd.add_argument("--include", help="glob filter for specific files")
    pd.add_argument("--revision", help="git ref / branch / tag")
    pd.add_argument("--host", help="run on a remote machine over SSH")
    pd.add_argument("--ssh-port", help="SSH port (default: 22)")
    pd.set_defaults(func=cmd_download)

    pse = sub.add_parser("serve", help="run an arbitrary serve cmd in tmux", parents=[common])
    pse.add_argument("name", help="short label for the session (e.g. 'qwen3-8b')")
    # NOTE: `--cmd` shares the `cmd` dest with the sub-command selector above;
    # for `serve`, args.cmd therefore holds the command string, which is what
    # cmd_serve reads.
    pse.add_argument("--cmd", required=True, help="full shell command to run")
    pse.set_defaults(func=cmd_serve)

    pk = sub.add_parser("kill", help="tmux kill-session by name", parents=[common])
    pk.add_argument("session", help="session name, e.g. 'cookbook-dl-abc123'")
    pk.add_argument("--host", help="kill a remote session")
    pk.add_argument("--ssh-port", help="SSH port")
    pk.set_defaults(func=cmd_kill)

    pst = sub.add_parser("state", help="dump cookbook_state.json", parents=[common])
    pst.set_defaults(func=cmd_state)

    pss = sub.add_parser("state-set", help="write JSON from stdin into cookbook_state.json", parents=[common])
    pss.set_defaults(func=cmd_state_set)

    return p


if __name__ == "__main__":
    # Hand the parser to the shared CLI runner; its return value becomes
    # the process exit status.
    parser = _build_parser()
    sys.exit(run(parser))