From e2f449f4ef7069c9e643916c7fba88d489464bbb Mon Sep 17 00:00:00 2001 From: pewdiepie-archdaemon Date: Fri, 5 Jun 2026 14:41:43 +0900 Subject: [PATCH] Cookbook scheduler + serve: schedule via Tasks, Stop verifies kill, Ollama auto port-pick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Schedule cookbook serves through the existing ScheduledTask system: the serve preset gets a ^ button next to Launch that opens a daily/hourly/ weekly form mirroring the admin-switch style; the schedule action runs action_cookbook_serve, which delegates to /api/model/serve and stamps the resulting task with _scheduledStopAtMs. A background cookbook_serve_lifecycle loop ticks every 60s and kills any serve whose window has ended, also dropping the auto-registered endpoint so the model picker doesn't keep pointing at a dead server. - Stop and remove on a Running serve now awaits the SSH/tmux kill, re-checks tmux has-session, and surfaces an error toast (leaving the row) when the kill failed. Previously fire-and-forget, so a failed SSH/tmux call silently left the live serve running while the row vanished from the UI. - Cookbook tasks/status orphan-adoption sweep no longer requires the serve-/cookbook- session-id prefix; any tmux session whose pane is running a known model-server process gets auto-pulled into Running. Without this loosening, a cookbook-launched serve whose tmux id fell back to a bare number was invisible — you couldn't see it, let alone stop it. - Ollama serve always launches a fresh process under cookbook's tmux (no more monitor-mode reattach to a systemd/Docker ollama Stop can't reach). The handler pre-picks a free port by probing the target host over SSH and mutates req.cmd's OLLAMA_HOST so the runner script AND the auto-registered endpoint agree on the same bind port. - Auto-register uses host.docker.internal (when running inside Docker) instead of localhost, matching the URL /setup adds for Ollama by hand. Local cookbook serves now produce a chat-reachable endpoint on first launch. - Cascade-delete: removing a scheduled cookbook task also deletes any linked calendar event (cookbook_task_id marker in the description). - Tasks list groups cookbook_serve under a "Cookbook" category that sorts above the rest, so scheduler-launched serves are easy to find. --- app.py | 10 + routes/cookbook_routes.py | 136 ++++++++--- routes/task_routes.py | 119 ++++++++++ src/builtin_actions.py | 192 ++++++++++++++++ src/cookbook_serve_lifecycle.py | 193 ++++++++++++++++ src/task_scheduler.py | 4 + static/index.html | 1 + static/js/cookbookRunning.js | 82 +++++-- static/js/cookbookSchedule.js | 386 ++++++++++++++++++++++++++++++++ static/js/cookbookServe.js | 131 ++++++++++- static/js/tasks.js | 19 +- static/style.css | 228 ++++++++++++++++++- 12 files changed, 1434 insertions(+), 67 deletions(-) create mode 100644 src/cookbook_serve_lifecycle.py create mode 100644 static/js/cookbookSchedule.js diff --git a/app.py b/app.py index b34b818..f02fb6c 100644 --- a/app.py +++ b/app.py @@ -1067,6 +1067,16 @@ async def _startup_event(): logger.warning(f"Nightly skill audit failed: {e}") _startup_tasks.append(asyncio.create_task(_skill_audit_nightly_loop())) + + # Cookbook serve lifecycle — kills scheduler-launched serves whose + # window-end has passed. Paired with the cookbook_serve builtin + # action; both are no-ops unless a scheduled task actually launches + # something with end_after_min set. Removing this line + the + # cookbook_serve entry in BUILTIN_ACTIONS + src/cookbook_serve_lifecycle.py + # removes the feature. + from src.cookbook_serve_lifecycle import cookbook_serve_lifecycle_loop + _startup_tasks.append(asyncio.create_task(cookbook_serve_lifecycle_loop())) + logger.info("Application startup complete") async def _shutdown_event(): diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index bc15899..bf2365b 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -801,6 +801,55 @@ def setup_cookbook_routes() -> APIRouter: finally: db.close() + def _pick_free_port_for_ollama( + remote: str | None, ssh_port: str | None, start_port: int, max_offset: int + ) -> int | None: + """Return the first free port in [start_port, start_port+max_offset] on + the target host. Used to pick a real bind for `ollama serve` so we + don't reattach to an external systemd ollama (or other listener) the + Cookbook Stop button can't kill.""" + import socket + if remote: + # Probe over SSH. Bash's /dev/tcp gives a portable "is anything + # listening" check without requiring ss/netstat/nmap. + ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"] + if ssh_port and str(ssh_port) != "22": + if not _SSH_PORT_RE.match(str(ssh_port)): + return None + ssh_base.extend(["-p", str(ssh_port)]) + host_arg = remote + if not _REMOTE_HOST_RE.match(host_arg): + return None + probe_ports = " ".join(str(start_port + i) for i in range(max_offset + 1)) + script = ( + f"for p in {probe_ports}; do " + "if ! (exec 3<>/dev/tcp/127.0.0.1/$p) 2>/dev/null; then " + "echo $p; exit 0; fi; exec 3<&-; exec 3>&-; done; exit 1" + ) + try: + import subprocess + r = subprocess.run( + ssh_base + [host_arg, script], + capture_output=True, text=True, timeout=8, + ) + if r.returncode == 0: + out = (r.stdout or "").strip().splitlines() + if out and out[0].isdigit(): + return int(out[0]) + except Exception: + return None + return None + # Local: just try to connect. + for off in range(max_offset + 1): + p = start_port + off + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(0.25) + try: + s.connect(("127.0.0.1", p)) + except (ConnectionRefusedError, socket.timeout, OSError): + return p + return None + def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None: """Register a freshly-served LLM as a model endpoint so it appears in the model picker without a manual /setup step — the text-model sibling of @@ -815,21 +864,37 @@ def setup_cookbook_routes() -> APIRouter: import re from core.database import SessionLocal, ModelEndpoint - # Port: an explicit --port wins. Otherwise fall back by backend — Ollama - # is the only server in our generated commands that omits --port. + # Port: ordered fallbacks so we match whatever the user actually + # asked for, not a hardcoded default: + # 1. explicit `--port N` (vllm / sglang / llama-server) + # 2. `OLLAMA_HOST=host:port` (the way Ollama specifies its bind) + # 3. fallback by backend (11434 ollama / 8080 llama.cpp) + # Previously the OLLAMA_HOST form was silently ignored and we + # registered every Ollama endpoint at 11434 — even if the user + # set OLLAMA_HOST=0.0.0.0:11435 to avoid colliding with an + # existing systemd Ollama, the registered endpoint pointed at + # the OLD port and showed as offline. port_match = re.search(r'--port\s+(\d+)', req.cmd) + ollama_host_match = re.search(r'OLLAMA_HOST=[^\s]*?:(\d+)', req.cmd) if port_match: port = int(port_match.group(1)) + elif ollama_host_match: + port = int(ollama_host_match.group(1)) elif "ollama" in req.cmd: port = 11434 else: port = 8080 # llama.cpp's llama-server default — the Apple Silicon path # Determine host (mirrors the image path: SSH alias for remote serves). + # For local serves while Odysseus runs inside Docker, "localhost" + # resolves to the container itself — useless. Use host.docker.internal + # which compose maps to the actual host, matching what /setup adds + # for Ollama by hand. if remote: host = remote.split("@")[-1] if "@" in remote else remote else: - host = "localhost" + from routes.model_routes import _docker_host_gateway_reachable + host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost" base_url = f"http://{host}:{port}/v1" @@ -927,6 +992,19 @@ def setup_cookbook_routes() -> APIRouter: session_id = f"serve-{uuid.uuid4().hex[:8]}" remote = req.remote_host is_windows = req.platform == "windows" + + # Ollama: if the user didn't pin a port, resolve the actual port we'll + # bind to here (before runner construction) by probing the target host. + # Otherwise the runner script picks one at runtime and `_auto_register` + # below still registers the stale 11434 default — which on a host with + # a systemd ollama lands on the wrong (unreachable-from-docker) service. + if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd: + _ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1" + _ollama_chosen_port = _pick_free_port_for_ollama( + remote, req.ssh_port, start_port=11434, max_offset=10, + ) + if _ollama_chosen_port: + req.cmd = f"OLLAMA_HOST={_ollama_bind_host}:{_ollama_chosen_port} {req.cmd}" # LOCAL execution on a native-Windows host never uses tmux (detached # process path below), regardless of the UI-supplied platform. local_windows = IS_WINDOWS and not remote @@ -1089,38 +1167,24 @@ def setup_cookbook_routes() -> APIRouter: req.cmd, default_host=_ollama_default_host, ) - # Ollama can be a host binary, a system service, or a Docker - # container. If the HTTP API is already reachable, the model is - # already served and we should not require a host `ollama` CLI. + # Always launch a fresh ollama under tmux so Stop reliably + # kills it. If the requested port is busy (e.g. a systemd + # ollama on 11434), scan upward for a free one rather than + # silently reattaching to an external service that Stop + # can't reach. runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}') runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"') - runner_lines.append('ODYSSEUS_OLLAMA_URL=""') - runner_lines.append('for _ody_ollama_try in $(seq 1 20); do') - runner_lines.append(' for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do') - runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue') - runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do') - runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"') - runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then') - runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"') - runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"') - runner_lines.append(' break 3') - runner_lines.append(' fi') - runner_lines.append(' done') - runner_lines.append(' done') - runner_lines.append(' [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."') - runner_lines.append(' sleep 1') - runner_lines.append('done') - runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then') - runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then') - runner_lines.append(' echo "[odysseus] Selected Ollama port ' + _ollama_port + ' was not reachable; using running Ollama on port ${ODYSSEUS_OLLAMA_PORT}."') + runner_lines.append('for _ody_off in 0 1 2 3 4 5 6 7 8 9; do') + runner_lines.append(' _ody_try_port=$((ODYSSEUS_OLLAMA_PORT + _ody_off))') + runner_lines.append(' if ! (exec 3<>/dev/tcp/127.0.0.1/$_ody_try_port) 2>/dev/null; then') + runner_lines.append(' exec 3<&-; exec 3>&-') + runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_try_port"') + runner_lines.append(' break') runner_lines.append(' fi') - runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"') - runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."') - runner_lines.append(' exec bash -i') - runner_lines.append('fi') + runner_lines.append(' exec 3<&-; exec 3>&-') + runner_lines.append('done') runner_lines.append('if ! command -v ollama &>/dev/null; then') - runner_lines.append(' echo "ERROR: Ollama not found and no Ollama API is reachable on 127.0.0.1, localhost, or host.docker.internal (ports ${ODYSSEUS_OLLAMA_PORT}/11434)."') - runner_lines.append(' echo "Install Ollama, start an Ollama service/container on this server, or pick the port where it is already listening."') + runner_lines.append(' echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."') runner_lines.append(' echo') runner_lines.append(' echo "=== Process exited with code 127 ==="') runner_lines.append(' exec bash -i') @@ -2017,12 +2081,14 @@ def setup_cookbook_routes() -> APIRouter: sid = line.split(":", 1)[0].strip() if not sid or not _SESSION_ID_RE.match(sid): continue - # Only adopt sessions that LOOK like model serves; ignore - # bare numeric tmux sessions and unrelated work. - if not (sid.startswith("serve-") or sid.startswith("cookbook-")): - continue if sid in known_sids: continue + # Adopt any session whose pane is currently running a + # known model-server process (checked below). The earlier + # prefix gate (serve-/cookbook-) dropped legitimate + # serves whenever tmux fell back to numeric IDs, leaving + # them invisible in the Cookbook UI — so the user could + # neither see nor stop them. # Skip zombie / idle-shell sessions. A tmux session left # over from a crashed vllm just shows a bash prompt — # adopting it would pollute the UI with "running" tasks diff --git a/routes/task_routes.py b/routes/task_routes.py index a5c49ad..6604923 100644 --- a/routes/task_routes.py +++ b/routes/task_routes.py @@ -18,6 +18,119 @@ from routes.prefs_routes import _load_for_user, _save_for_user logger = logging.getLogger(__name__) +def _maybe_cascade_calendar_event(task) -> None: + """Delete the linked calendar event when a cookbook_serve task is + removed. Two lookup strategies: + + 1. PRIMARY — `cookbook_event_uid` marker stashed in task.prompt + by cookbookSchedule.js right after creating the event. Direct + UID match, no ambiguity. + + 2. FALLBACK — for tasks created before the marker was wired up + (or when the PATCH to add the marker failed silently), scan + the Cookbook calendar for events whose summary equals the + task name and delete the matches. + + Best-effort throughout: errors are logged but never block the task + deletion itself.""" + if not task or task.task_type != "action" or task.action != "cookbook_serve": + return + + import httpx + from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN + headers = {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN} + if task.owner: + headers["X-Odysseus-Owner"] = task.owner + + # Strategy 1: explicit UID marker in prompt. + event_uid = "" + if task.prompt: + try: + cfg = json.loads(task.prompt) + if isinstance(cfg, dict): + event_uid = (cfg.get("cookbook_event_uid") or "").strip() + except Exception: + pass + + def _try_delete(uid: str) -> bool: + try: + with httpx.Client(timeout=10) as client: + r = client.delete( + f"http://localhost:7000/api/calendar/events/{uid}", + headers=headers, + ) + if r.status_code >= 400: + logger.info( + f"task delete: cascade calendar event {uid} returned " + f"HTTP {r.status_code}" + ) + return False + return True + except Exception as e: + logger.warning(f"task delete: cascade calendar event {uid} failed: {e}") + return False + + if event_uid: + _try_delete(event_uid) + return + + # Strategy 2: scan the Cookbook calendar for matching summaries. + # Only runs for tasks missing the marker (old tasks or PATCH failures). + if not task.name: + return + try: + with httpx.Client(timeout=10) as client: + # Find the Cookbook calendar. + cal_r = client.get("http://localhost:7000/api/calendar/calendars", headers=headers) + if cal_r.status_code >= 400: + return + cals = (cal_r.json() or {}).get("calendars", []) + cookbook_cal = next( + (c for c in cals if (c.get("name") or "").lower() == "cookbook"), + None, + ) + if not cookbook_cal: + return + cal_href = cookbook_cal.get("href") or cookbook_cal.get("id") or "" + # List events in a wide window to catch recurring + upcoming. + from datetime import datetime as _dt, timedelta as _td, timezone as _tz + now = _dt.now(_tz.utc) + start = (now - _td(days=30)).isoformat() + end = (now + _td(days=365)).isoformat() + ev_r = client.get( + "http://localhost:7000/api/calendar/events", + params={"start": start, "end": end, "calendar": cal_href}, + headers=headers, + ) + if ev_r.status_code >= 400: + return + events = (ev_r.json() or {}).get("events", []) + # Match by exact summary. Tasks named "Serve: " are + # created from the schedule modal; the event's summary mirrors + # the task name 1:1 by design. + target = (task.name or "").strip() + uids_to_delete = set() + for ev in events: + if (ev.get("summary") or "").strip() != target: + continue + uid = ev.get("uid") or ev.get("id") or "" + # Strip the "::occurrence" suffix on recurring expansions — + # we want to delete the MASTER once, not each instance. + if "::" in uid: + uid = uid.split("::", 1)[0] + if uid: + uids_to_delete.add(uid) + for uid in uids_to_delete: + _try_delete(uid) + if uids_to_delete: + logger.info( + f"task delete: cascade matched {len(uids_to_delete)} calendar event(s) " + f"by summary fallback for task {task.id} ({target!r})" + ) + except Exception as e: + logger.warning(f"task delete: cascade fallback scan failed: {e}") + + class TaskCreate(BaseModel): name: Optional[str] = None prompt: Optional[str] = None @@ -616,6 +729,12 @@ def setup_task_routes(task_scheduler) -> APIRouter: raise HTTPException(404, "Task not found") if user and task.owner != user: raise HTTPException(403, "Access denied") + # Cascade: cookbook_serve tasks may have a linked calendar + # event (created via the "Create event in calendar" toggle + # in the schedule modal). If so, delete the calendar event + # too so the calendar doesn't end up holding a phantom event + # for a task that no longer exists. + _maybe_cascade_calendar_event(task) db.delete(task) db.commit() return {"ok": True} diff --git a/src/builtin_actions.py b/src/builtin_actions.py index 6b96e31..d532603 100644 --- a/src/builtin_actions.py +++ b/src/builtin_actions.py @@ -2001,6 +2001,197 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]: return str(e), False +async def action_cookbook_serve( + owner: str, + task_name: str = "", + progress_cb=None, + command: str = "", + **kwargs, +) -> Tuple[str, bool]: + """Launch a Cookbook model serve as a scheduled task. + + `command` is the JSON config string the task carries in `prompt`, + of shape: {"preset": "name"} OR {"repo_id": "...", "cmd": "...", "host": "..."}. + Optional `end_after_min: N` schedules a hard-stop N minutes after launch + (handled by cookbook_serve_lifecycle_loop in src/cookbook_serve_lifecycle.py). + """ + import json + import time as _time + import httpx + from pathlib import Path + from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN + from core.atomic_io import atomic_write_json + + headers = {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN} + try: + cfg = json.loads(command or "{}") + except Exception: + return f"Invalid JSON config: {command!r}", False + if not isinstance(cfg, dict): + return "Config must be a JSON object", False + + # Resolve the preset (if named) OR fall through with explicit fields. + preset_name = (cfg.get("preset") or "").strip() + repo_id = (cfg.get("repo_id") or "").strip() + cmd = (cfg.get("cmd") or "").strip() + host = (cfg.get("host") or cfg.get("remote_host") or "").strip() + try: + end_after_min = int(cfg.get("end_after_min") or 0) + except Exception: + end_after_min = 0 + + state_path = Path("/app/data/cookbook_state.json") + try: + state = json.loads(state_path.read_text(encoding="utf-8")) if state_path.exists() else {} + except Exception: + state = {} + + # Preset lookup. Try three matching strategies in order so the + # schedule still works even when the user's preset is named + # differently from the model's short name: + # + # 1. Exact preset.name == preset_name (case-insensitive) + # 2. preset.model / preset.modelId == repo_id (caller knows the repo) + # 3. preset.model's short name (after final /) == preset_name + # + # Without #2 and #3, scheduling "Qwen3.5-397B-A17B-AWQ" failed when + # the saved preset was named "vllm-qwen-397b" or had the model field + # populated with the full HF repo path. Either should resolve. + def _short(name: str) -> str: + return (name or "").rsplit("/", 1)[-1].lower() + + if not cmd or not repo_id: + presets = state.get("presets") or [] + chosen = None + # Strategy 1: exact name match. + if preset_name: + chosen = next( + (p for p in presets if isinstance(p, dict) + and (p.get("name") or "").lower() == preset_name.lower()), + None, + ) + # Strategy 2: repo_id matches the preset's model field. + if chosen is None and repo_id: + chosen = next( + (p for p in presets if isinstance(p, dict) + and (p.get("model") or p.get("modelId") or "").lower() == repo_id.lower()), + None, + ) + # Strategy 3: model's short name matches the preset_name. + if chosen is None and preset_name: + chosen = next( + (p for p in presets if isinstance(p, dict) + and _short(p.get("model") or p.get("modelId") or "") == preset_name.lower()), + None, + ) + if chosen is not None: + repo_id = repo_id or chosen.get("model") or chosen.get("modelId") or "" + cmd = cmd or (chosen.get("cmd") or "").strip() + host = host or chosen.get("host") or chosen.get("remoteHost") or "" + if not repo_id or not cmd or cmd.startswith("(adopted"): + # Surface what we tried so the user can name their preset to match. + preset_names = [(p.get("name") or "") for p in (state.get("presets") or []) if isinstance(p, dict)] + hint = f" Saved presets: {preset_names!r}" if preset_names else "" + return (f"No launchable config for {preset_name!r} (repo_id={repo_id!r}). " + f"Check Cookbook → Presets has a real cmd, not 'adopted'.{hint}", False) + + # Resolve env_prefix etc. from the host's saved cookbook server entry, + # matching the chat agent's serve_model path. + body = {"repo_id": repo_id, "cmd": cmd} + if host: + body["remote_host"] = host + env = (state.get("env") or {}) + srv = next( + (s for s in (env.get("servers") or []) + if isinstance(s, dict) and (s.get("host") == host or s.get("name") == host)), + {}, + ) + if srv.get("env") == "venv" and srv.get("envPath"): + body["env_prefix"] = f"source {srv['envPath']}/bin/activate" + elif srv.get("env") == "conda" and srv.get("envPath"): + body["env_prefix"] = f"conda activate {srv['envPath']}" + if srv.get("hfToken"): body["hf_token"] = srv["hfToken"] + if srv.get("port"): body["ssh_port"] = str(srv["port"]) + if srv.get("platform"): body["platform"] = srv["platform"] + + try: + async with httpx.AsyncClient(timeout=30) as client: + r = await client.post("http://localhost:7000/api/model/serve", + json=body, headers=headers) + data = r.json() if r.content else {} + except Exception as e: + return f"Launch HTTP failed: {e}", False + if not data.get("ok"): + return f"Launch rejected: {data.get('error') or data.get('detail') or 'unknown'}", False + + sid = data.get("session_id") or "" + # Register the new task in cookbook_state.json + stamp it with our + # scheduler-owner markers. /api/model/serve spawns the tmux session + # but leaves the state-write to the UI — when a scheduled action + # launches a serve from server-side, NOBODY writes the task into + # state, so the Cookbook tab never shows it. We do the write here. + if sid: + try: + # Re-read fresh (the route may have updated state already). + try: + fresh = json.loads(state_path.read_text(encoding="utf-8")) + except Exception: + fresh = {} + if not isinstance(fresh, dict): + fresh = {} + tasks = fresh.get("tasks") if isinstance(fresh.get("tasks"), list) else [] + existing = next( + (t for t in tasks if isinstance(t, dict) and t.get("sessionId") == sid), + None, + ) + if existing is None: + display_name = repo_id.split("/")[-1] if "/" in repo_id else repo_id + placeholder = ( + f"Launched by scheduled task {task_name!r} — waiting for tmux output…\n" + f" session: {sid}\n" + f" target: {host or 'local'}\n" + f" cmd: {cmd[:200]}{'…' if len(cmd) > 200 else ''}" + ) + existing = { + "id": sid, + "sessionId": sid, + "name": display_name, + "modelId": repo_id, + "type": "serve", + "status": "running", + "output": placeholder, + "ts": int(_time.time() * 1000), + "payload": {"repo_id": repo_id, "remote_host": host or "", "_cmd": cmd}, + "remoteHost": host or "", + "sshPort": "", + "platform": "linux", + "_serveReady": False, + "_endpointAdded": False, + } + tasks.append(existing) + # Stamp ownership + end-at on the task entry. + existing["_scheduledByTask"] = task_name or "" + existing["_scheduledByOwner"] = owner or "" + if end_after_min > 0: + existing["_scheduledStopAtMs"] = int(_time.time() * 1000) + end_after_min * 60 * 1000 + fresh["tasks"] = tasks + atomic_write_json(state_path, fresh) + except Exception as e: + logger.warning(f"cookbook_serve: state register/stamp failed: {e}") + # Don't try to render absolute clock time in the message — the + # server runs in UTC (Docker default), the user reads it as local, + # and the offset depends on the user's TZ which the action doesn't + # have a reliable handle on. The Tasks UI already shows the RUN + # timestamp in the user's local time right above this message, so + # "stops 8 min after that" gives the user everything they need. + if end_after_min: + return ( + f"Launched {repo_id} (session {sid}); stops {end_after_min} min after this ran", + True, + ) + return f"Launched {repo_id} (session {sid})", True + + BUILTIN_ACTIONS = { "tidy_sessions": action_tidy_sessions, "tidy_documents": action_tidy_documents, @@ -2020,6 +2211,7 @@ BUILTIN_ACTIONS = { "test_skills": action_test_skills, "audit_skills": action_audit_skills, "check_email_urgency": action_check_email_urgency, + "cookbook_serve": action_cookbook_serve, # ping_notes removed from the registry — runs only inside `_note_pings_loop`. } diff --git a/src/cookbook_serve_lifecycle.py b/src/cookbook_serve_lifecycle.py new file mode 100644 index 0000000..58d4242 --- /dev/null +++ b/src/cookbook_serve_lifecycle.py @@ -0,0 +1,193 @@ +"""Cookbook serve lifecycle: kills scheduler-owned serves whose end-of- +window has passed. + +Pairs with action_cookbook_serve in builtin_actions.py — that action +stamps the task it launches with `_scheduledStopAtMs`, this loop ticks +every 60s and kills any serve whose stamp is in the past. + +Single small module. Delete this file + the registration line in app.py +and the feature stops doing anything; scheduler-launched serves just +stay up until the user kills them manually. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import time +from pathlib import Path + +import httpx + +logger = logging.getLogger(__name__) + + +def _internal_headers() -> dict: + from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN + return {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN} + + +async def _delete_endpoint_for_task(task: dict) -> None: + """Drop the auto-registered model endpoint for a scheduled-stop serve. + + Without this, killing the tmux session leaves the endpoint sitting in + the picker (probe goes offline; chats still try to route there) and + the user has to delete it by hand in Settings -> Endpoints. + """ + import re as _re + payload = task.get("payload") or {} + cmd = str(payload.get("_cmd") or "") + remote = task.get("remoteHost") or "" + # Build host the same way _auto_register_llm_endpoint does so URL match wins. + if remote: + host = remote.split("@")[-1] if "@" in remote else remote + else: + host = "host.docker.internal" + port_match = _re.search(r"--port\s+(\d+)", cmd) + ollama_host_match = _re.search(r"OLLAMA_HOST=[^\s]*?:(\d+)", cmd) + if port_match: + port = int(port_match.group(1)) + elif ollama_host_match: + port = int(ollama_host_match.group(1)) + elif "ollama" in cmd: + port = 11434 + else: + port = 8080 + base_url = f"http://{host}:{port}/v1" + try: + async with httpx.AsyncClient(timeout=8) as client: + r = await client.get( + "http://localhost:7000/api/model-endpoints", + headers=_internal_headers(), + ) + if r.status_code >= 400: + return + eps = r.json() if r.content else [] + # Prefer exact URL match; fall back to host:port substring so we + # still catch the case where 0.0.0.0 vs the registered host + # representation diverged. + ep = next((e for e in eps if e.get("base_url") == base_url), None) + if not ep: + hostport = f"{host}:{port}" + ep = next((e for e in eps if hostport in (e.get("base_url") or "")), None) + if ep: + await client.delete( + f"http://localhost:7000/api/model-endpoints/{ep['id']}", + headers=_internal_headers(), + ) + logger.info( + f"cookbook_serve_lifecycle: deleted endpoint {ep.get('id')} " + f"({ep.get('base_url')}) after scheduled stop" + ) + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle: endpoint delete failed: {e}") + + +async def _stop_serve(session_id: str, remote_host: str = "", ssh_port: str = "") -> bool: + """Kill the tmux session that hosts the serve. + + There's no `/api/model/stop` route — the cookbook UI and the chat + agent both kill via `/api/shell/exec` running a `tmux kill-session` + (wrapped in ssh for remote hosts). Mirror that here so the + lifecycle loop can actually stop scheduler-launched serves at + window-end. Without this, the action stamped `_scheduledStopAtMs` + correctly but every kill attempt failed silently (the route + returned 404 and the result was logged as "failed"). + """ + import shlex + if remote_host: + port_flag = f"-p {shlex.quote(str(ssh_port))} " if ssh_port and str(ssh_port) != "22" else "" + cmd = ( + f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no " + f"{port_flag}{shlex.quote(remote_host)} " + f"'tmux kill-session -t {shlex.quote(session_id)}'" + ) + else: + cmd = f"tmux kill-session -t {shlex.quote(session_id)}" + try: + async with httpx.AsyncClient(timeout=15) as client: + r = await client.post( + "http://localhost:7000/api/shell/exec", + json={"command": cmd}, + headers=_internal_headers(), + ) + if r.status_code >= 400: + return False + data = r.json() if r.content else {} + ec = data.get("exit_code") + # tmux returns non-zero when the session is already gone + # ("can't find session: ..."). That's still "stop succeeded" + # from our POV — the goal is no live session at the end. + if ec in (None, 0): + return True + stderr = (data.get("stderr") or "").lower() + return "no server" in stderr or "can't find session" in stderr or "session not found" in stderr + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle: stop {session_id} failed: {e}") + return False + + +async def _tick() -> None: + state_path = Path("/app/data/cookbook_state.json") + if not state_path.exists(): + return + try: + state = json.loads(state_path.read_text(encoding="utf-8")) + except Exception: + return + tasks = state.get("tasks") or [] + now_ms = int(time.time() * 1000) + to_stop = [] + for t in tasks: + if not isinstance(t, dict): + continue + stop_at = t.get("_scheduledStopAtMs") + if not isinstance(stop_at, (int, float)): + continue + if stop_at > now_ms: + continue + if (t.get("status") or "").lower() in {"stopped", "ended", "killed", "crashed"}: + continue + sid = t.get("sessionId") or t.get("id") + if not sid: + continue + to_stop.append((sid, t.get("remoteHost") or "", t.get("sshPort") or "")) + if not to_stop: + return + # Re-read state once before writing so we capture any updates from + # concurrent UI syncs. + stopped_any = False + for sid, host, port in to_stop: + ok = await _stop_serve(sid, host, port) + logger.info(f"cookbook_serve_lifecycle: stop {sid} (host={host or 'local'}): {'ok' if ok else 'failed'}") + if ok: + stopped_any = True + # Drop the auto-registered endpoint so the model picker and + # the chat router don't keep pointing at a dead server. + for t in tasks: + if isinstance(t, dict) and (t.get("sessionId") == sid or t.get("id") == sid): + if t.get("type") == "serve": + await _delete_endpoint_for_task(t) + t["status"] = "stopped" + t["_scheduledStopAtMs"] = None + t["_lastStatusFlipAt"] = now_ms + break + if stopped_any: + try: + from core.atomic_io import atomic_write_json + state["tasks"] = tasks + atomic_write_json(state_path, state) + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle: state write failed: {e}") + + +async def cookbook_serve_lifecycle_loop() -> None: + """Forever-loop. Registered as a startup task in app.py.""" + await asyncio.sleep(20) # let the rest of startup settle + while True: + try: + await _tick() + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle tick failed: {e}") + await asyncio.sleep(60) diff --git a/src/task_scheduler.py b/src/task_scheduler.py index 65fc451..2fcb5dc 100644 --- a/src/task_scheduler.py +++ b/src/task_scheduler.py @@ -1018,6 +1018,10 @@ class TaskScheduler: kwargs = {"owner": task.owner, "task_name": task.name, "progress_cb": _progress} if task.action in ("run_script", "run_local", "ssh_command") and task.prompt: kwargs["script" if task.action in ("run_script", "run_local") else "command"] = task.prompt + # cookbook_serve carries its JSON config in task.prompt — feed it + # through as `command` so action_cookbook_serve can json.loads it. + elif task.action == "cookbook_serve" and task.prompt: + kwargs["command"] = task.prompt result, success = await action_fn(**kwargs) return result, success except TaskNoop: diff --git a/static/index.html b/static/index.html index c5f3828..a4637d3 100644 --- a/static/index.html +++ b/static/index.html @@ -2283,6 +2283,7 @@ + diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index 64ddd93..7f3cedd 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -2094,7 +2094,7 @@ export function _renderRunningTab() { // Edit serve — open the full serve panel (same as the edit icon), // switching to this task's server first so the model is found. if (task.type === 'serve' && task.payload?.repo_id) { - items.push({ label: 'Edit serve', action: 'edit-panel', custom: () => _openEdit() }); + items.push({ label: 'Edit in serve panel', action: 'edit-panel', tooltip: 'Open the full Serve config panel pre-filled with this task — pick a different backend, change GPUs, edit env vars, then Launch from there', custom: () => _openEdit() }); } // Save serve — save current launch config as a preset. if (task.type === 'serve' && task.payload?._cmd) { @@ -2107,7 +2107,7 @@ export function _renderRunningTab() { // Edit command — only meaningful for serve tasks that aren't running. // Lets the user tweak flags after a crash/error and relaunch. if (task.type === 'serve' && task.status !== 'running' && task.payload?._cmd) { - items.push({ label: 'Edit command', action: 'edit', custom: async () => { + items.push({ label: 'Edit cmd & relaunch', action: 'edit', tooltip: 'Edit the raw vllm/llama-server cmd string in a dialog and relaunch immediately on the same host', custom: async () => { const newCmd = await _promptEditServeCmd(task.payload._cmd); if (newCmd == null) return; // cancelled try { @@ -2201,7 +2201,19 @@ export function _renderRunningTab() { _copyText(last); uiModule.showToast('Copied last 50 lines'); }}); - items.push({ label: 'Remove', action: 'kill', danger: true }); + // Label matches behavior — the kill handler ALWAYS first kills + // the live tmux session and (for serve tasks) deletes the + // matching model-endpoint, THEN animates the task card out. + // Just "Remove" hid that it stops the live serve too. + const _isLive = task.type === 'serve' && ['running', 'ready', 'loading', 'warming', 'starting'].includes(task.status || ''); + items.push({ + label: _isLive ? 'Stop and remove' : 'Remove', + action: 'kill', + tooltip: _isLive + ? 'Kill the live tmux session, deregister the chat endpoint, and remove this row' + : 'Remove this row', + danger: true, + }); // Cancel = mobile-only dismiss item. Same pattern as the email kebab: // the `dropdown-cancel-mobile` class is hidden on desktop and styled // as a separated bottom row on mobile (border-top + extra padding). @@ -2228,6 +2240,7 @@ export function _renderRunningTab() { + (item.danger ? ' cookbook-dropdown-danger' : '') + (item.mobileOnly ? ' dropdown-cancel-mobile' : ''); div.style.cssText = 'display:flex;align-items:center;gap:8px;'; + if (item.tooltip) div.title = item.tooltip; const ic = _MENU_ICONS[item.action] || ''; div.innerHTML = `${ic}${item.label}`; div.addEventListener('click', () => { @@ -2347,22 +2360,57 @@ export function _renderRunningTab() { _animateOutThenRemove(el, task.sessionId); }); - // Wire kill - el.querySelector('.cookbook-task-action-kill').addEventListener('click', () => { + // Wire kill — awaits the SSH/tmux kill and verifies the session is + // actually gone before removing the row. Previously fire-and-forget, + // which meant a failed kill (wrong remoteHost, SSH error, tmux server + // already exited) silently left the live serve running while the + // row disappeared from the UI. + el.querySelector('.cookbook-task-action-kill').addEventListener('click', async () => { const outputText = el.querySelector('.cookbook-output-pre')?.textContent || task.output || ''; + const isLive = task.type === 'serve' && ['running', 'ready', 'loading', 'warming', 'starting'].includes(task.status || ''); const ollamaUnload = _ollamaUnloadCommand(task, outputText); if (ollamaUnload) { - fetch('/api/shell/exec', { + try { + await fetch('/api/shell/exec', { + method: 'POST', credentials: 'same-origin', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ command: ollamaUnload }), + }); + } catch (_) { /* unload best-effort */ } + } + let killOk = true; + try { + const r = await fetch('/api/shell/exec', { method: 'POST', credentials: 'same-origin', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ command: ollamaUnload }), - }).catch(() => {}); + body: JSON.stringify({ command: _tmuxGracefulKill(task) }), + }); + if (r.ok) { + const out = await r.json(); + // Don't trust exit_code alone — tmux kill returns 0 even when + // there was nothing to kill. Verify the session is actually gone. + if (task.sessionId && isLive) { + try { + const probe = await fetch('/api/shell/exec', { + method: 'POST', credentials: 'same-origin', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ command: _tmuxCmd(task, `has-session -t ${task.sessionId}`) }), + }); + if (probe.ok) { + const pj = await probe.json(); + // has-session exits 0 when session STILL exists; non-zero = gone. + if ((pj.exit_code || 0) === 0) killOk = false; + } + } catch (_) { /* probe best-effort; trust kill */ } + } + } else { + killOk = false; + } + } catch (_) { killOk = false; } + if (!killOk) { + try { uiModule.showToast('Kill failed — session may still be running. Check `tmux ls` on the server.', 'error'); } catch (_) {} + return; // leave the row so the user can retry } - fetch('/api/shell/exec', { - method: 'POST', credentials: 'same-origin', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ command: _tmuxGracefulKill(task) }), - }).catch(() => {}); if (task.type === 'serve' && task.payload) { const endpointUrl = _endpointUrlForTask(task, outputText); _removeEndpointByUrl(endpointUrl); @@ -2401,7 +2449,13 @@ export function _renderRunningTab() { if (targetBody) targetBody.appendChild(el); else group.appendChild(el); - if (task.status === 'running') { + // Auto-attach the tmux output stream for any task whose underlying + // session could still be alive — not just 'running'. Scheduler- + // launched serves transition to 'ready' as soon as /v1/models + // responds; without this, the user opens the Running tab and sees + // only the placeholder ("Launched by scheduled task …") because + // _reconnectTask never fires for status 'ready'/'loading'/'warming'. + if (['running', 'ready', 'loading', 'warming', 'starting'].includes(task.status)) { _reconnectTask(el, task); } } diff --git a/static/js/cookbookSchedule.js b/static/js/cookbookSchedule.js new file mode 100644 index 0000000..a26de5d --- /dev/null +++ b/static/js/cookbookSchedule.js @@ -0,0 +1,386 @@ +// Cookbook Schedule — opens a small inline form (styled with the app's +// existing .cookbook-* classes) that creates a ScheduledTask with +// action=cookbook_serve. Mounted from two places: +// +// 1. The ^ button next to Launch in a serve panel. +// 2. The "Schedule…" entry in the cached-model ⋯ dropdown menu (which +// programmatically clicks the ^ button so this module owns the +// single source of truth). +// +// Feedback uses uiModule.showToast() — the same toast the rest of the +// app uses for "Saved", "Favorited", etc. — so the success message +// doesn't introduce a parallel notification style. +// +// To remove: delete this file + the