diff --git a/app.py b/app.py index 4f3dff0..87ef1ae 100644 --- a/app.py +++ b/app.py @@ -1062,6 +1062,16 @@ async def _startup_event(): logger.warning(f"Nightly skill audit failed: {e}") _startup_tasks.append(asyncio.create_task(_skill_audit_nightly_loop())) + + # Cookbook serve lifecycle — kills scheduler-launched serves whose + # window-end has passed. Paired with the cookbook_serve builtin + # action; both are no-ops unless a scheduled task actually launches + # something with end_after_min set. Removing this line + the + # cookbook_serve entry in BUILTIN_ACTIONS + src/cookbook_serve_lifecycle.py + # removes the feature. + from src.cookbook_serve_lifecycle import cookbook_serve_lifecycle_loop + _startup_tasks.append(asyncio.create_task(cookbook_serve_lifecycle_loop())) + logger.info("Application startup complete") async def _shutdown_event(): diff --git a/integrations/claude/skills/odysseus/SKILL.md b/integrations/claude/skills/odysseus/SKILL.md index 3877f9c..d3b55b3 100644 --- a/integrations/claude/skills/odysseus/SKILL.md +++ b/integrations/claude/skills/odysseus/SKILL.md @@ -1,6 +1,6 @@ --- name: odysseus -description: Use when the user asks Claude Code to read or write Odysseus data (todos, email, calendar, memory, documents) through the scoped Claude Agent API. Requires ODYSSEUS_URL and ODYSSEUS_API_TOKEN. +description: Use when the user asks Claude Code to read or write Odysseus data (todos, email, calendar, memory, documents) or to launch/monitor/stop a Cookbook model-serve task through the scoped Claude Agent API. Requires ODYSSEUS_URL and ODYSSEUS_API_TOKEN. --- # Odysseus @@ -105,6 +105,49 @@ python3 ~/.claude/skills/odysseus/scripts/odysseus_api.py POST /api/codex/memory - `POST /api/codex/emails/draft` — body matches `SendEmailRequest` (`to`, `cc`, `bcc`, `subject`, `body`, `body_html`, `attachments`, `account_id`, `in_reply_to`, `references`). Requires `email:draft` (or `email:send`). - `POST /api/codex/emails/send` — same body. Requires `email:send`. Never send without explicit user instruction. +## Cookbook serve (debug a failing model launch) + +The Cookbook surface lets you reproduce what a human would do in Odysseus → Cookbook: read which serves are running, tail their tmux output to see why they crashed, edit the launch command, relaunch, kill a stuck one. Use this when the user is debugging a model server that won't come up (compute-capability errors, OOM, missing kernels, wrong attention backend, etc.). + +- `GET /api/codex/cookbook/tasks` — list active serve/download/install tasks (sessionId, type, status, repo_id, remoteHost, payload._cmd). Requires `cookbook:read`. +- `GET /api/codex/cookbook/servers` — list configured servers (name, host, port, env type + path, model dirs). Requires `cookbook:read`. +- `GET /api/codex/cookbook/cached?host=` — list models already cached on the named server (HF cache + Ollama + extra modelDirs). Call BEFORE `serve` to see what's already on disk. Requires `cookbook:read`. +- `GET /api/codex/cookbook/presets` — list saved serve presets (model + host + port + cmd). The user's saved preset usually has a working cmd — try `preset NAME` before composing your own. Requires `cookbook:read`. +- `GET /api/codex/cookbook/output/{session_id}?tail=400` — read the last N lines of the task's persistent log file (preferred) or tmux pane (fallback). The log file persists across vllm crashes, so this returns the actual Python traceback even after the bash prompt + neofetch banner overwrites the pane. Default tail=400. Requires `cookbook:read`. +- `POST /api/codex/cookbook/serve` — launch a serve task. Body matches `ServeRequest`: `{ repo_id, cmd, remote_host?, ssh_port?, env_prefix?, gpus?, platform? }`. The `cmd` is validated: leading binary must be `vllm`/`python3`/`sglang`/`llama-server`/`ollama`/`node`/`npx`. NEVER prefix with `cd …`, `source …`, or chain with `&&`/`||`/`;`/`$(...)` — the validator rejects shell metacharacters. The venv activation (`env_prefix`) is added automatically from the host's saved settings, so pass the bare binary + args. Requires `cookbook:launch`. +- `POST /api/codex/cookbook/preset/{name}` — launch a saved preset by name. Reuses the working cmd + host the user already saved. Requires `cookbook:launch`. +- `POST /api/codex/cookbook/adopt` — register an externally-launched tmux session into cookbook tracking. Body: `{ tmux_session, model, host?, port? }`. Use this when serve_model rejected a cmd and you fell back to direct ssh+tmux — without adoption, the session is invisible to the UI. Requires `cookbook:launch`. +- `POST /api/codex/cookbook/stop/{session_id}` — kill the tmux session for that task. Requires `cookbook:launch`. + +```bash +# Survey what's running +python3 ~/.claude/skills/odysseus/scripts/odysseus_api.py cookbook tasks + +# Tail the failing one (sessionId from `cookbook tasks`) +python3 ~/.claude/skills/odysseus/scripts/odysseus_api.py cookbook output serve-abc12345 400 + +# Stop the previous attempt before you try a new flag set +python3 ~/.claude/skills/odysseus/scripts/odysseus_api.py cookbook stop serve-abc12345 + +# Relaunch with new flags. cmd MUST begin with one of the allowlisted binaries. +python3 ~/.claude/skills/odysseus/scripts/odysseus_api.py cookbook serve \ + /mnt/HADES/models/Qwen3.5-397B-A17B-AWQ \ + "vllm serve /mnt/HADES/models/Qwen3.5-397B-A17B-AWQ --host 0.0.0.0 --port 8001 --tensor-parallel-size 8 --max-model-len 262144 --gpu-memory-utilization 0.90 --dtype auto --max-num-seqs 8 --trust-remote-code --enable-expert-parallel --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3" \ + pewds@192.168.1.12 +``` + +**Debug loop pattern:** when a serve is failing, the productive sequence is + +1. `cookbook tasks` → find the failing sessionId. +2. `cookbook output SID 600` → read the last 600 lines, find the actual root-cause line (often above the visible tail because tmux scrollback rolled — request a larger `tail` if the error references "above"). +3. `cookbook stop SID` — kill the previous attempt before relaunching; two serves on the same `--port` collide. +4. `cookbook serve repo "new cmd"` — try the next variation. Wait ~20s, then `cookbook output` on the new sessionId. + +**Hard limits this surface enforces:** +- `cookbook serve` cmd allowlist + shell-metacharacter rejection — you cannot run arbitrary shell, only model-server binaries. +- `cookbook stop` only targets task sessionIds matching `[a-zA-Z0-9_-]+`. +- The agent CAN spawn GPU-pinning long-lived processes — always `cookbook stop` your previous attempt before relaunching, and check `cookbook tasks` for collisions on the same `--port` before launching. + ## Forbidden Bypass Pattern If you are about to reach the Odysseus host/container, import app internals, query the database, or call MCP helper modules directly, stop. Those paths bypass Odysseus Settings and token scopes. Ask the user to enable the relevant Claude Agent tool toggle instead. diff --git a/integrations/claude/skills/odysseus/scripts/odysseus_api.py b/integrations/claude/skills/odysseus/scripts/odysseus_api.py index 5fdd632..fcef8a7 100755 --- a/integrations/claude/skills/odysseus/scripts/odysseus_api.py +++ b/integrations/claude/skills/odysseus/scripts/odysseus_api.py @@ -17,6 +17,15 @@ def _usage() -> int: print(" odysseus_api.py todos add TITLE", file=sys.stderr) print(" odysseus_api.py emails list [limit]", file=sys.stderr) print(" odysseus_api.py emails read UID", file=sys.stderr) + print(" odysseus_api.py cookbook tasks", file=sys.stderr) + print(" odysseus_api.py cookbook servers", file=sys.stderr) + print(" odysseus_api.py cookbook cached [HOST]", file=sys.stderr) + print(" odysseus_api.py cookbook presets", file=sys.stderr) + print(" odysseus_api.py cookbook output SESSION_ID [tail]", file=sys.stderr) + print(" odysseus_api.py cookbook serve REPO_ID 'CMD' [REMOTE_HOST]", file=sys.stderr) + print(" odysseus_api.py cookbook preset NAME", file=sys.stderr) + print(" odysseus_api.py cookbook adopt SESSION_ID MODEL [HOST] [PORT]", file=sys.stderr) + print(" odysseus_api.py cookbook stop SESSION_ID", file=sys.stderr) print(" odysseus_api.py METHOD /api/codex/path [json-body]", file=sys.stderr) return 2 @@ -72,6 +81,61 @@ def main() -> int: body = None else: return _usage() + elif command == "cookbook": + if len(sys.argv) < 3: + return _usage() + action = sys.argv[2].lower() + if action == "tasks": + method = "GET" + path = "/api/codex/cookbook/tasks" + body = None + elif action == "servers": + method = "GET" + path = "/api/codex/cookbook/servers" + body = None + elif action == "output" and len(sys.argv) >= 4: + method = "GET" + sid = sys.argv[3] + tail = sys.argv[4] if len(sys.argv) >= 5 else "400" + path = f"/api/codex/cookbook/output/{sid}?tail={tail}" + body = None + elif action == "cached": + method = "GET" + if len(sys.argv) >= 4: + from urllib.parse import quote + path = f"/api/codex/cookbook/cached?host={quote(sys.argv[3])}" + else: + path = "/api/codex/cookbook/cached" + body = None + elif action == "presets": + method = "GET" + path = "/api/codex/cookbook/presets" + body = None + elif action == "preset" and len(sys.argv) >= 4: + from urllib.parse import quote + method = "POST" + path = f"/api/codex/cookbook/preset/{quote(sys.argv[3])}" + body = None + elif action == "adopt" and len(sys.argv) >= 5: + method = "POST" + path = "/api/codex/cookbook/adopt" + payload = {"tmux_session": sys.argv[3], "model": sys.argv[4]} + if len(sys.argv) >= 6: payload["host"] = sys.argv[5] + if len(sys.argv) >= 7: payload["port"] = int(sys.argv[6]) + body = json.dumps(payload) + elif action == "serve" and len(sys.argv) >= 5: + method = "POST" + path = "/api/codex/cookbook/serve" + payload = {"repo_id": sys.argv[3], "cmd": sys.argv[4]} + if len(sys.argv) >= 6: + payload["remote_host"] = sys.argv[5] + body = json.dumps(payload) + elif action == "stop" and len(sys.argv) >= 4: + method = "POST" + path = f"/api/codex/cookbook/stop/{sys.argv[3]}" + body = None + else: + return _usage() else: if len(sys.argv) < 3: return _usage() diff --git a/integrations/codex/scripts/odysseus_api.py b/integrations/codex/scripts/odysseus_api.py index 5fdd632..fcef8a7 100755 --- a/integrations/codex/scripts/odysseus_api.py +++ b/integrations/codex/scripts/odysseus_api.py @@ -17,6 +17,15 @@ def _usage() -> int: print(" odysseus_api.py todos add TITLE", file=sys.stderr) print(" odysseus_api.py emails list [limit]", file=sys.stderr) print(" odysseus_api.py emails read UID", file=sys.stderr) + print(" odysseus_api.py cookbook tasks", file=sys.stderr) + print(" odysseus_api.py cookbook servers", file=sys.stderr) + print(" odysseus_api.py cookbook cached [HOST]", file=sys.stderr) + print(" odysseus_api.py cookbook presets", file=sys.stderr) + print(" odysseus_api.py cookbook output SESSION_ID [tail]", file=sys.stderr) + print(" odysseus_api.py cookbook serve REPO_ID 'CMD' [REMOTE_HOST]", file=sys.stderr) + print(" odysseus_api.py cookbook preset NAME", file=sys.stderr) + print(" odysseus_api.py cookbook adopt SESSION_ID MODEL [HOST] [PORT]", file=sys.stderr) + print(" odysseus_api.py cookbook stop SESSION_ID", file=sys.stderr) print(" odysseus_api.py METHOD /api/codex/path [json-body]", file=sys.stderr) return 2 @@ -72,6 +81,61 @@ def main() -> int: body = None else: return _usage() + elif command == "cookbook": + if len(sys.argv) < 3: + return _usage() + action = sys.argv[2].lower() + if action == "tasks": + method = "GET" + path = "/api/codex/cookbook/tasks" + body = None + elif action == "servers": + method = "GET" + path = "/api/codex/cookbook/servers" + body = None + elif action == "output" and len(sys.argv) >= 4: + method = "GET" + sid = sys.argv[3] + tail = sys.argv[4] if len(sys.argv) >= 5 else "400" + path = f"/api/codex/cookbook/output/{sid}?tail={tail}" + body = None + elif action == "cached": + method = "GET" + if len(sys.argv) >= 4: + from urllib.parse import quote + path = f"/api/codex/cookbook/cached?host={quote(sys.argv[3])}" + else: + path = "/api/codex/cookbook/cached" + body = None + elif action == "presets": + method = "GET" + path = "/api/codex/cookbook/presets" + body = None + elif action == "preset" and len(sys.argv) >= 4: + from urllib.parse import quote + method = "POST" + path = f"/api/codex/cookbook/preset/{quote(sys.argv[3])}" + body = None + elif action == "adopt" and len(sys.argv) >= 5: + method = "POST" + path = "/api/codex/cookbook/adopt" + payload = {"tmux_session": sys.argv[3], "model": sys.argv[4]} + if len(sys.argv) >= 6: payload["host"] = sys.argv[5] + if len(sys.argv) >= 7: payload["port"] = int(sys.argv[6]) + body = json.dumps(payload) + elif action == "serve" and len(sys.argv) >= 5: + method = "POST" + path = "/api/codex/cookbook/serve" + payload = {"repo_id": sys.argv[3], "cmd": sys.argv[4]} + if len(sys.argv) >= 6: + payload["remote_host"] = sys.argv[5] + body = json.dumps(payload) + elif action == "stop" and len(sys.argv) >= 4: + method = "POST" + path = f"/api/codex/cookbook/stop/{sys.argv[3]}" + body = None + else: + return _usage() else: if len(sys.argv) < 3: return _usage() diff --git a/integrations/codex/skills/odysseus/SKILL.md b/integrations/codex/skills/odysseus/SKILL.md index 1e2be01..4cff140 100644 --- a/integrations/codex/skills/odysseus/SKILL.md +++ b/integrations/codex/skills/odysseus/SKILL.md @@ -1,6 +1,6 @@ --- name: odysseus -description: Use when the user asks Codex to read or write Odysseus data from a terminal Codex session through the scoped Codex Agent API. Requires ODYSSEUS_URL and ODYSSEUS_API_TOKEN. +description: Use when the user asks Codex to read or write Odysseus data (todos, email, calendar, memory, documents) or to launch/monitor/stop a Cookbook model-serve task through the scoped Codex Agent API. Requires ODYSSEUS_URL and ODYSSEUS_API_TOKEN. --- # Odysseus @@ -105,6 +105,37 @@ python3 integrations/codex/scripts/odysseus_api.py POST /api/codex/memory '{"tex - `POST /api/codex/emails/draft` — body matches `SendEmailRequest` (`to`, `cc`, `bcc`, `subject`, `body`, `body_html`, `attachments`, `account_id`, `in_reply_to`, `references`). Requires `email:draft` (or `email:send`). - `POST /api/codex/emails/send` — same body. Requires `email:send`. Never send without explicit user instruction. +## Cookbook serve (debug a failing model launch) + +The Cookbook surface lets you reproduce what a human would do in Odysseus → Cookbook: read which serves are running, tail their tmux output to see why they crashed, edit the launch command, relaunch, kill a stuck one. Use this when the user is debugging a model server that won't come up (compute-capability errors, OOM, missing kernels, wrong attention backend, etc.). + +- `GET /api/codex/cookbook/tasks` — list active serve/download/install tasks (sessionId, type, status, repo_id, remoteHost, payload._cmd). Requires `cookbook:read`. +- `GET /api/codex/cookbook/servers` — list configured servers (name, host, port, env type + path, model dirs). Requires `cookbook:read`. +- `GET /api/codex/cookbook/cached?host=` — list models already cached on the named server (HF cache + Ollama + extra modelDirs). Call BEFORE `serve` to see what's already on disk. Requires `cookbook:read`. +- `GET /api/codex/cookbook/presets` — list saved serve presets (model + host + port + cmd). The user's saved preset usually has a working cmd — try `preset NAME` before composing your own. Requires `cookbook:read`. +- `GET /api/codex/cookbook/output/{session_id}?tail=400` — read the last N lines of the task's persistent log file (preferred) or tmux pane (fallback). The log file persists across vllm crashes, so this returns the actual Python traceback even after the bash prompt + neofetch banner overwrites the pane. Default tail=400. Requires `cookbook:read`. +- `POST /api/codex/cookbook/serve` — launch a serve task. Body matches `ServeRequest`: `{ repo_id, cmd, remote_host?, ssh_port?, env_prefix?, gpus?, platform? }`. The `cmd` is validated: leading binary must be `vllm`/`python3`/`sglang`/`llama-server`/`ollama`/`node`/`npx`. NEVER prefix with `cd …`, `source …`, or chain with `&&`/`||`/`;`/`$(...)` — the validator rejects shell metacharacters. The venv activation (`env_prefix`) is added automatically from the host's saved settings, so pass the bare binary + args. Requires `cookbook:launch`. +- `POST /api/codex/cookbook/preset/{name}` — launch a saved preset by name. Reuses the working cmd + host the user already saved. Requires `cookbook:launch`. +- `POST /api/codex/cookbook/adopt` — register an externally-launched tmux session into cookbook tracking. Body: `{ tmux_session, model, host?, port? }`. Use this when serve_model rejected a cmd and you fell back to direct ssh+tmux — without adoption, the session is invisible to the UI. Requires `cookbook:launch`. +- `POST /api/codex/cookbook/stop/{session_id}` — kill the tmux session. Requires `cookbook:launch`. + +```bash +python3 ~/plugins/odysseus/scripts/odysseus_api.py cookbook tasks +python3 ~/plugins/odysseus/scripts/odysseus_api.py cookbook output serve-abc12345 400 +python3 ~/plugins/odysseus/scripts/odysseus_api.py cookbook stop serve-abc12345 +python3 ~/plugins/odysseus/scripts/odysseus_api.py cookbook serve \ + /mnt/HADES/models/Qwen3.5-397B-A17B-AWQ \ + "vllm serve /mnt/HADES/models/Qwen3.5-397B-A17B-AWQ --host 0.0.0.0 --port 8001 --tensor-parallel-size 8 --max-model-len 262144 --gpu-memory-utilization 0.90 --dtype auto --max-num-seqs 8 --trust-remote-code --enable-expert-parallel --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3" \ + pewds@192.168.1.12 +``` + +**Debug loop pattern:** `tasks` → `output SID 600` (find root cause; request larger `tail` if it references "above") → `stop SID` → `serve repo "new cmd"` → wait ~20s → `output` on the new sessionId. + +**Hard limits this surface enforces:** +- `cookbook serve` cmd allowlist + shell-metacharacter rejection. +- `cookbook stop` requires sessionIds matching `[a-zA-Z0-9_-]+`. +- Agent CAN spawn GPU-pinning long-lived processes — always `cookbook stop` your previous attempt before relaunching. + ## Forbidden Bypass Pattern If you are about to reach the Odysseus host/container, import app internals, query the database, or call MCP helper modules directly, stop. Those paths bypass Odysseus Settings and token scopes. Ask the user to enable the relevant Codex Agent tool toggle instead. diff --git a/routes/codex_routes.py b/routes/codex_routes.py index 8c59ee5..9898dae 100644 --- a/routes/codex_routes.py +++ b/routes/codex_routes.py @@ -19,6 +19,8 @@ from src.auth_helpers import require_user from src.tool_implementations import do_manage_notes +COOKBOOK_READ_SCOPES = {"cookbook:read", "cookbook:launch"} +COOKBOOK_LAUNCH_SCOPES = {"cookbook:launch"} TODO_READ_SCOPES = {"todos:read", "todos:write"} TODO_WRITE_SCOPES = {"todos:write"} EMAIL_READ_SCOPES = {"email:read", "email:draft", "email:send"} @@ -130,6 +132,11 @@ def setup_codex_routes( "actions": ["library", "read", "create", "delete"], "available": documents_library_endpoint is not None, }, + "cookbook": { + "read": scoped(COOKBOOK_READ_SCOPES), + "launch": scoped(COOKBOOK_LAUNCH_SCOPES), + "actions": ["tasks", "servers", "output", "serve", "stop"], + }, }, "safety": { "email_send_requires_confirmation": True, @@ -373,6 +380,374 @@ def setup_codex_routes( raise HTTPException(400, f"Invalid document payload: {exc}") return await _as_owner(request, owner, documents_create_endpoint, request, req) + # ── Cookbook surface ── + # Lets the agent run the same launch / monitor / kill loop the user + # would do by hand in the Cookbook UI: read the current task list + + # tmux output, launch a serve task, stop one. Two scopes: + # cookbook:read — list tasks + tail output + list servers + # cookbook:launch — also start/stop serves (host shell exec) + # `cookbook:launch` is genuinely powerful: /api/model/serve runs SSH'd + # commands on the user's hosts. The existing _validate_serve_cmd + # allowlist (vllm/python3/sglang/llama-server/etc., no shell metachars) + # keeps the agent inside the same sandbox the UI uses. + + async def _run_shell(cmd: str, timeout: float = 15.0) -> dict: + """Run a shell command, return {exit_code, stdout, stderr}.""" + import asyncio as _asyncio + try: + proc = await _asyncio.create_subprocess_shell( + cmd, + stdout=_asyncio.subprocess.PIPE, + stderr=_asyncio.subprocess.PIPE, + ) + try: + stdout_b, stderr_b = await _asyncio.wait_for(proc.communicate(), timeout=timeout) + except _asyncio.TimeoutError: + proc.kill() + return {"exit_code": -1, "stdout": "", "stderr": "timed out"} + return { + "exit_code": proc.returncode, + "stdout": stdout_b.decode(errors="replace"), + "stderr": stderr_b.decode(errors="replace"), + } + except Exception as exc: + return {"exit_code": -1, "stdout": "", "stderr": str(exc)} + + def _read_cookbook_state() -> dict: + from pathlib import Path as _Path + import os as _os, json as _json + p = _Path(_os.environ.get("DATA_DIR", "data")) / "cookbook_state.json" + if not p.exists(): + return {} + try: + return _json.loads(p.read_text(encoding="utf-8")) + except Exception: + return {} + + def _redact_task(t: dict) -> dict: + """Strip secrets before returning to the agent.""" + clean = {k: v for k, v in t.items() if k not in ("hf_token", "_secrets")} + if isinstance(clean.get("payload"), dict): + pl = clean["payload"] + clean["payload"] = {k: v for k, v in pl.items() + if k not in ("hf_token", "_secrets")} + return clean + + @router.get("/cookbook/tasks") + async def codex_cookbook_tasks(request: Request): + _scope_owner(request, COOKBOOK_READ_SCOPES) + state = _read_cookbook_state() + tasks = state.get("tasks") or [] + return {"tasks": [_redact_task(t) for t in tasks]} + + @router.get("/cookbook/servers") + async def codex_cookbook_servers(request: Request): + _scope_owner(request, COOKBOOK_READ_SCOPES) + state = _read_cookbook_state() + servers = state.get("env", {}).get("servers") or [] + # Strip ssh creds / passwords; keep only what's needed to pick a host. + cleaned = [] + for s in servers: + cleaned.append({ + "name": s.get("name"), + "host": s.get("host"), + "port": s.get("port"), + "env": s.get("env"), + "envPath": s.get("envPath"), + "platform": s.get("platform"), + "modelDirs": s.get("modelDirs"), + }) + return {"servers": cleaned} + + @router.get("/cookbook/output/{session_id}") + async def codex_cookbook_output(request: Request, session_id: str, tail: int = 400): + _scope_owner(request, COOKBOOK_READ_SCOPES) + # Defensive: session_id must be the tmux-style id we issue + # (`serve-XXXX` / `cookbook-XXXX` / `queue-XXXX`); anything else + # would let the agent run arbitrary `tmux capture-pane` targets. + import re as _re + if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id): + raise HTTPException(400, "Invalid session id") + tail = max(20, min(int(tail or 400), 4000)) + # Resolve the task's host (if any) from cookbook state so we can + # ssh to the right box, exactly as the UI does in _reconnectTask. + state = _read_cookbook_state() + tasks = state.get("tasks") or [] + task = next((t for t in tasks if t.get("sessionId") == session_id), None) + if task is None: + raise HTTPException(404, "task not found") + host = (task.get("remoteHost") or "").strip() + ssh_port = (task.get("sshPort") or "").strip() + # Prefer the persisted log file over the tmux pane. The pane gets + # overwritten by the post-crash neofetch banner + bash prompt the + # moment vllm exits; the log file is the raw stdout/stderr and + # survives unchanged. Falls back to pane for older tasks predating + # the tee-to-log runner change. + log_path = f"/tmp/odysseus-tmux/{session_id}.log" + inner = ( + f"if [ -s {log_path} ]; then tail -n {tail} {log_path}; " + f"else tmux capture-pane -t {session_id} -p -S -{tail}; fi" + ) + if host: + port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else "" + import shlex + cmd = f"ssh {port_flag}{host} {shlex.quote(inner)}" + else: + cmd = inner + result = await _run_shell(cmd, timeout=15) + return { + "session_id": session_id, + "host": host or "local", + "exit_code": result.get("exit_code"), + "output": result.get("stdout", ""), + "task": _redact_task(task), + } + + @router.post("/cookbook/serve") + async def codex_cookbook_serve(request: Request, body: dict[str, Any] = Body(default_factory=dict)): + _scope_owner(request, COOKBOOK_LAUNCH_SCOPES) + # Wraps /api/model/serve with the SAME validation the UI uses. + # _validate_serve_cmd (called inside model_serve) rejects shell + # metachars and requires the leading binary to be in the + # cookbook allowlist (vllm / python3 / sglang / llama-server / ...). + from routes.cookbook_helpers import ServeRequest + # Accept friendly aliases agents naturally reach for. Without these, + # passing `host` silently maps to nothing and the serve runs LOCAL + # instead of on the intended remote — exactly the bug an agent + # would never debug on its own. + norm = dict(body or {}) + if "host" in norm and "remote_host" not in norm: + norm["remote_host"] = norm.pop("host") + if "model" in norm and "repo_id" not in norm: + norm["repo_id"] = norm.pop("model") + if "ssh_port" not in norm and "port" in norm and (str(norm.get("port") or "").isdigit() and int(norm["port"]) >= 1000): + # Heuristic: if `port` looks like an SSH port (≥1000) and there's + # no explicit ssh_port, treat it as such. UI ports (8000, 8001, + # 30000) belong inside the cmd string, not here. + pass # leave as-is — user's `port` here is ambiguous; skip remap. + try: + req = ServeRequest(**norm) + except Exception as exc: + raise HTTPException(400, f"Invalid serve payload: {exc}") + serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve") + # Fall back to importing from the cookbook router registered on app. + if serve_endpoint is None: + from fastapi import FastAPI + app: FastAPI = request.app + for route in app.routes: + if getattr(route, "path", None) == "/api/model/serve" and "POST" in getattr(route, "methods", set()): + serve_endpoint = route.endpoint + break + if serve_endpoint is None: + raise HTTPException(503, "model serve endpoint unavailable") + return await serve_endpoint(request, req) + + @router.post("/cookbook/stop/{session_id}") + async def codex_cookbook_stop(request: Request, session_id: str): + _scope_owner(request, COOKBOOK_LAUNCH_SCOPES) + import re as _re + if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id): + raise HTTPException(400, "Invalid session id") + state = _read_cookbook_state() + tasks = state.get("tasks") or [] + task = next((t for t in tasks if t.get("sessionId") == session_id), None) + host = ((task or {}).get("remoteHost") or "").strip() + ssh_port = ((task or {}).get("sshPort") or "").strip() + if host: + port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else "" + cmd = f"ssh {port_flag}{host} \"tmux kill-session -t {session_id}\"" + else: + cmd = f"tmux kill-session -t {session_id}" + result = await _run_shell(cmd, timeout=10) + return {"session_id": session_id, "exit_code": result.get("exit_code"), "host": host or "local"} + + @router.get("/cookbook/cached") + async def codex_cookbook_cached(request: Request, host: str | None = None): + """List cached models on a configured server (or local if host is omitted). + Mirrors `list_cached_models` from the chat agent so external agents have + the same inventory view before deciding what to serve/download.""" + _scope_owner(request, COOKBOOK_READ_SCOPES) + # Hit /api/model/cached internally, with the same modelDirs the chat + # agent's list_cached_models would resolve from cookbook state. + state = _read_cookbook_state() + env = state.get("env") if isinstance(state, dict) else {} + servers = (env.get("servers") if isinstance(env, dict) else None) or [] + HF_DEFAULTS = {"~/.cache/huggingface/hub", "~/.cache/huggingface"} + def _dirs_for(srv: dict) -> str: + mds = srv.get("modelDirs") if isinstance(srv, dict) else None + if isinstance(mds, list): + extras = [d for d in mds if isinstance(d, str) and d.strip() and d.strip() not in HF_DEFAULTS] + return ",".join(extras) + if isinstance(mds, str) and mds.strip() not in HF_DEFAULTS: + return mds + return "" + # Resolve friendly host name → real host (matches list_cached_models flow). + resolved_host = host or "" + srv: dict[str, Any] = {} + if host: + srv = next( + (s for s in servers if isinstance(s, dict) + and (s.get("name") == host or s.get("host") == host)), + {}, + ) + if srv and srv.get("host"): + resolved_host = srv["host"] + else: + srv = next((s for s in servers if isinstance(s, dict) and not (s.get("host") or "").strip()), {}) + params: dict[str, str] = {} + if resolved_host: + params["host"] = resolved_host + md = _dirs_for(srv) + if md: + params["model_dir"] = md + if srv.get("port"): + params["ssh_port"] = str(srv["port"]) + if srv.get("platform"): + params["platform"] = srv["platform"] + cached_endpoint = _find_endpoint(None, "GET", "/api/model/cached") + if cached_endpoint is None: + from fastapi import FastAPI + app: FastAPI = request.app + for route in app.routes: + if getattr(route, "path", None) == "/api/model/cached" and "GET" in getattr(route, "methods", set()): + cached_endpoint = route.endpoint + break + if cached_endpoint is None: + raise HTTPException(503, "model cached endpoint unavailable") + # The endpoint reads host/model_dir/ssh_port/platform as kwargs. + return await cached_endpoint( + request, + host=params.get("host") or None, + model_dir=params.get("model_dir") or None, + ssh_port=params.get("ssh_port") or None, + platform=params.get("platform") or None, + ) + + @router.get("/cookbook/presets") + async def codex_cookbook_presets(request: Request): + """List saved serve presets (model + host + port + launch cmd). + Counterpart to `list_serve_presets`. Use BEFORE composing a `serve` + body — the user's saved preset usually has the working cmd already.""" + _scope_owner(request, COOKBOOK_READ_SCOPES) + state = _read_cookbook_state() + presets = state.get("presets") or [] + out = [] + for p in presets: + if not isinstance(p, dict): + continue + out.append({ + "name": p.get("name"), + "model": p.get("model") or p.get("modelId"), + "host": p.get("host") or p.get("remoteHost"), + "port": p.get("port"), + "cmd": p.get("cmd"), + }) + return {"presets": out, "default_host": (state.get("env") or {}).get("defaultServer", "")} + + @router.post("/cookbook/preset/{name}") + async def codex_cookbook_serve_preset(request: Request, name: str): + """Launch a saved preset by name. Reuses the working cmd + host the + user already saved, avoiding the cmd-allowlist trial-and-error loop.""" + _scope_owner(request, COOKBOOK_LAUNCH_SCOPES) + import re as _re + if not _re.fullmatch(r"[A-Za-z0-9 _.:@\-]+", name): + raise HTTPException(400, "Invalid preset name") + state = _read_cookbook_state() + presets = state.get("presets") or [] + lname = name.lower().strip() + chosen = next( + (p for p in presets if isinstance(p, dict) and (p.get("name") or "").lower() == lname), + None, + ) + if chosen is None: + chosen = next( + (p for p in presets if isinstance(p, dict) and lname in (p.get("name") or "").lower()), + None, + ) + if chosen is None: + raise HTTPException(404, f"No preset matching {name!r}") + repo_id = chosen.get("model") or chosen.get("modelId") or "" + cmd = (chosen.get("cmd") or "").strip() + host = chosen.get("host") or chosen.get("remoteHost") or "" + if not repo_id or not cmd or cmd.startswith("(adopted"): + raise HTTPException(400, f"Preset {chosen.get('name')!r} has no launchable cmd " + "(adopted from external launch). Use POST /cookbook/serve " + "with the actual cmd instead.") + # Reuse the serve handler we already validated. + from routes.cookbook_helpers import ServeRequest + body = {"repo_id": repo_id, "cmd": cmd} + if host: + body["remote_host"] = host + try: + req = ServeRequest(**body) + except Exception as exc: + raise HTTPException(400, f"Preset payload invalid: {exc}") + serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve") + if serve_endpoint is None: + from fastapi import FastAPI + app: FastAPI = request.app + for route in app.routes: + if getattr(route, "path", None) == "/api/model/serve" and "POST" in getattr(route, "methods", set()): + serve_endpoint = route.endpoint + break + if serve_endpoint is None: + raise HTTPException(503, "model serve endpoint unavailable") + return await serve_endpoint(request, req) + + @router.post("/cookbook/adopt") + async def codex_cookbook_adopt(request: Request, body: dict[str, Any] = Body(default_factory=dict)): + """Adopt an existing tmux session (one started via raw ssh+tmux) into + cookbook tracking. Needed when serve_model rejects a cmd and the + agent falls back to direct ssh — without adoption the session is + invisible to the UI. Body: {tmux_session, model, host?, port?}.""" + _scope_owner(request, COOKBOOK_LAUNCH_SCOPES) + norm = dict(body or {}) + sess = (norm.get("tmux_session") or norm.get("session_id") or "").strip() + model = (norm.get("model") or norm.get("repo_id") or "").strip() + host = (norm.get("host") or norm.get("remote_host") or "").strip() + port = norm.get("port") or 8000 + import re as _re + if not sess or not _re.fullmatch(r"[a-zA-Z0-9_-]+", sess): + raise HTTPException(400, "tmux_session required, [a-zA-Z0-9_-]+ only") + if not model: + raise HTTPException(400, "model required") + # Verify the tmux session exists on the target host before adopting. + import shlex + if host: + check = f"ssh {shlex.quote(host)} 'tmux has-session -t {shlex.quote(sess)}'" + else: + check = f"tmux has-session -t {shlex.quote(sess)}" + chk = await _run_shell(check, timeout=8) + if chk.get("exit_code") not in (0, None): + raise HTTPException(404, f"tmux session {sess!r} not found on {host or 'local'}") + # Write into cookbook_state.json. + import time as _t, json as _json + from core.atomic_io import atomic_write_json + from pathlib import Path as _Path + cookbook_state_path = _Path("/app/data/cookbook_state.json") + try: + state = _json.loads(cookbook_state_path.read_text(encoding="utf-8")) + except Exception: + state = {} + tasks = state.setdefault("tasks", []) + if any(isinstance(t, dict) and t.get("sessionId") == sess for t in tasks): + return {"ok": True, "already_tracked": True, "session_id": sess} + tasks.append({ + "id": sess, "sessionId": sess, + "name": model.split("/")[-1] if "/" in model else model, + "type": "serve", "status": "running", + "output": f"Adopted externally-launched session {sess!r} on {host or 'local'}.", + "ts": int(_t.time() * 1000), + "payload": {"repo_id": model, "remote_host": host, "_cmd": "(adopted — launched outside cookbook)", "port": int(port)}, + "remoteHost": host, "sshPort": "", "platform": "linux", + "_serveReady": False, "_endpointAdded": False, "_adoptedExternally": True, + }) + try: + atomic_write_json(cookbook_state_path, state) + except Exception as exc: + raise HTTPException(500, f"state write failed: {exc}") + return {"ok": True, "session_id": sess, "host": host or "local"} + return router diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index 9efb30d..454c67b 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -546,6 +546,13 @@ def _append_serve_preflight_exit_lines(runner_lines: list[str], *, keep_shell_op runner_lines.append('if [ -n "$ODYSSEUS_PREFLIGHT_EXIT" ]; then') runner_lines.append(' echo ""; echo "=== Process exited with code $ODYSSEUS_PREFLIGHT_EXIT ==="') if keep_shell_open: + # Decouple the post-crash interactive shell from the persistent log + # file. fds 3/4 were saved BEFORE the tee redirect at the top of + # the runner; restoring them here means the neofetch banner the + # user's .zshrc prints lands on the tmux pane only, not in the + # log file the agent's tail_serve_output reads. + runner_lines.append(' exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true') + runner_lines.append(' sleep 0.2 # let tee child flush + exit') runner_lines.append(' exec "${SHELL:-/bin/bash}"') else: runner_lines.append(' exit "$ODYSSEUS_PREFLIGHT_EXIT"') @@ -563,7 +570,11 @@ def _append_serve_exit_code_lines( if is_pip_install: runner_lines.append('if [ $ODYSSEUS_CMD_EXIT -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; fi') if keep_shell_open: - runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"') + runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="') + # See preflight branch above for the rationale on restoring fds 3/4. + runner_lines.append('exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true') + runner_lines.append('sleep 0.2 # let tee child flush + exit') + runner_lines.append('exec "${SHELL:-/bin/bash}"') else: runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="') runner_lines.append('exit "$ODYSSEUS_CMD_EXIT"') diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index 56b95d6..bf2365b 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -801,6 +801,55 @@ def setup_cookbook_routes() -> APIRouter: finally: db.close() + def _pick_free_port_for_ollama( + remote: str | None, ssh_port: str | None, start_port: int, max_offset: int + ) -> int | None: + """Return the first free port in [start_port, start_port+max_offset] on + the target host. Used to pick a real bind for `ollama serve` so we + don't reattach to an external systemd ollama (or other listener) the + Cookbook Stop button can't kill.""" + import socket + if remote: + # Probe over SSH. Bash's /dev/tcp gives a portable "is anything + # listening" check without requiring ss/netstat/nmap. + ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"] + if ssh_port and str(ssh_port) != "22": + if not _SSH_PORT_RE.match(str(ssh_port)): + return None + ssh_base.extend(["-p", str(ssh_port)]) + host_arg = remote + if not _REMOTE_HOST_RE.match(host_arg): + return None + probe_ports = " ".join(str(start_port + i) for i in range(max_offset + 1)) + script = ( + f"for p in {probe_ports}; do " + "if ! (exec 3<>/dev/tcp/127.0.0.1/$p) 2>/dev/null; then " + "echo $p; exit 0; fi; exec 3<&-; exec 3>&-; done; exit 1" + ) + try: + import subprocess + r = subprocess.run( + ssh_base + [host_arg, script], + capture_output=True, text=True, timeout=8, + ) + if r.returncode == 0: + out = (r.stdout or "").strip().splitlines() + if out and out[0].isdigit(): + return int(out[0]) + except Exception: + return None + return None + # Local: just try to connect. + for off in range(max_offset + 1): + p = start_port + off + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(0.25) + try: + s.connect(("127.0.0.1", p)) + except (ConnectionRefusedError, socket.timeout, OSError): + return p + return None + def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None: """Register a freshly-served LLM as a model endpoint so it appears in the model picker without a manual /setup step — the text-model sibling of @@ -815,21 +864,37 @@ def setup_cookbook_routes() -> APIRouter: import re from core.database import SessionLocal, ModelEndpoint - # Port: an explicit --port wins. Otherwise fall back by backend — Ollama - # is the only server in our generated commands that omits --port. + # Port: ordered fallbacks so we match whatever the user actually + # asked for, not a hardcoded default: + # 1. explicit `--port N` (vllm / sglang / llama-server) + # 2. `OLLAMA_HOST=host:port` (the way Ollama specifies its bind) + # 3. fallback by backend (11434 ollama / 8080 llama.cpp) + # Previously the OLLAMA_HOST form was silently ignored and we + # registered every Ollama endpoint at 11434 — even if the user + # set OLLAMA_HOST=0.0.0.0:11435 to avoid colliding with an + # existing systemd Ollama, the registered endpoint pointed at + # the OLD port and showed as offline. port_match = re.search(r'--port\s+(\d+)', req.cmd) + ollama_host_match = re.search(r'OLLAMA_HOST=[^\s]*?:(\d+)', req.cmd) if port_match: port = int(port_match.group(1)) + elif ollama_host_match: + port = int(ollama_host_match.group(1)) elif "ollama" in req.cmd: port = 11434 else: port = 8080 # llama.cpp's llama-server default — the Apple Silicon path # Determine host (mirrors the image path: SSH alias for remote serves). + # For local serves while Odysseus runs inside Docker, "localhost" + # resolves to the container itself — useless. Use host.docker.internal + # which compose maps to the actual host, matching what /setup adds + # for Ollama by hand. if remote: host = remote.split("@")[-1] if "@" in remote else remote else: - host = "localhost" + from routes.model_routes import _docker_host_gateway_reachable + host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost" base_url = f"http://{host}:{port}/v1" @@ -927,6 +992,19 @@ def setup_cookbook_routes() -> APIRouter: session_id = f"serve-{uuid.uuid4().hex[:8]}" remote = req.remote_host is_windows = req.platform == "windows" + + # Ollama: if the user didn't pin a port, resolve the actual port we'll + # bind to here (before runner construction) by probing the target host. + # Otherwise the runner script picks one at runtime and `_auto_register` + # below still registers the stale 11434 default — which on a host with + # a systemd ollama lands on the wrong (unreachable-from-docker) service. + if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd: + _ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1" + _ollama_chosen_port = _pick_free_port_for_ollama( + remote, req.ssh_port, start_port=11434, max_offset=10, + ) + if _ollama_chosen_port: + req.cmd = f"OLLAMA_HOST={_ollama_bind_host}:{_ollama_chosen_port} {req.cmd}" # LOCAL execution on a native-Windows host never uses tmux (detached # process path below), regardless of the UI-supplied platform. local_windows = IS_WINDOWS and not remote @@ -998,6 +1076,21 @@ def setup_cookbook_routes() -> APIRouter: else: # ── Linux/Termux: bash + tmux (existing flow) ── runner_lines = ["#!/bin/bash"] + # Mirror every line of stdout+stderr into a persistent log file + # on the host running the serve. This is the file tail_serve_output + # reads when the tmux pane has been overwritten by the post-crash + # bash prompt — without it, the agent's diagnostic tool sees the + # neofetch banner instead of the actual Python traceback. + # We save the original fds to 3/4 so we can RESTORE them before + # `exec ${SHELL}` at the end of the script. Without that restore, + # the post-crash interactive shell's neofetch banner ALSO gets + # teed into the log file and `tail -N` returns ONLY the banner — + # the actual traceback ends up earlier than the tail window. + runner_lines.append("mkdir -p /tmp/odysseus-tmux 2>/dev/null || true") + runner_lines.append("exec 3>&1 4>&2") + runner_lines.append( + f"exec > >(tee -a /tmp/odysseus-tmux/{session_id}.log) 2>&1" + ) runner_lines.extend(_user_shell_path_bootstrap()) runner_lines.append('ODYSSEUS_PREFLIGHT_EXIT=""') # Put Odysseus's own venv bin on PATH (local runs only) so the serve @@ -1074,38 +1167,24 @@ def setup_cookbook_routes() -> APIRouter: req.cmd, default_host=_ollama_default_host, ) - # Ollama can be a host binary, a system service, or a Docker - # container. If the HTTP API is already reachable, the model is - # already served and we should not require a host `ollama` CLI. + # Always launch a fresh ollama under tmux so Stop reliably + # kills it. If the requested port is busy (e.g. a systemd + # ollama on 11434), scan upward for a free one rather than + # silently reattaching to an external service that Stop + # can't reach. runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}') runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"') - runner_lines.append('ODYSSEUS_OLLAMA_URL=""') - runner_lines.append('for _ody_ollama_try in $(seq 1 20); do') - runner_lines.append(' for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do') - runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue') - runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do') - runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"') - runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then') - runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"') - runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"') - runner_lines.append(' break 3') - runner_lines.append(' fi') - runner_lines.append(' done') - runner_lines.append(' done') - runner_lines.append(' [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."') - runner_lines.append(' sleep 1') - runner_lines.append('done') - runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then') - runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then') - runner_lines.append(' echo "[odysseus] Selected Ollama port ' + _ollama_port + ' was not reachable; using running Ollama on port ${ODYSSEUS_OLLAMA_PORT}."') + runner_lines.append('for _ody_off in 0 1 2 3 4 5 6 7 8 9; do') + runner_lines.append(' _ody_try_port=$((ODYSSEUS_OLLAMA_PORT + _ody_off))') + runner_lines.append(' if ! (exec 3<>/dev/tcp/127.0.0.1/$_ody_try_port) 2>/dev/null; then') + runner_lines.append(' exec 3<&-; exec 3>&-') + runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_try_port"') + runner_lines.append(' break') runner_lines.append(' fi') - runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"') - runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."') - runner_lines.append(' exec bash -i') - runner_lines.append('fi') + runner_lines.append(' exec 3<&-; exec 3>&-') + runner_lines.append('done') runner_lines.append('if ! command -v ollama &>/dev/null; then') - runner_lines.append(' echo "ERROR: Ollama not found and no Ollama API is reachable on 127.0.0.1, localhost, or host.docker.internal (ports ${ODYSSEUS_OLLAMA_PORT}/11434)."') - runner_lines.append(' echo "Install Ollama, start an Ollama service/container on this server, or pick the port where it is already listening."') + runner_lines.append(' echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."') runner_lines.append(' echo') runner_lines.append(' echo "=== Process exited with code 127 ==="') runner_lines.append(' exec bash -i') @@ -1940,6 +2019,153 @@ def setup_cookbook_routes() -> APIRouter: return {"models": out} + # Rate-limit for the orphan-tmux adoption sweep. The UI polls + # tasks/status every ~3s; we don't want to SSH every host on every + # poll. 20s is fast enough that a model the agent launched in the + # background shows up "almost immediately" in the UI without being + # wasteful. + _last_orphan_sweep_ts = [0.0] + _ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0 + + def _maybe_sweep_orphans(tasks: list, state: dict) -> None: + """Scan each configured cookbook server for `serve-*` tmux sessions + the cookbook doesn't know about and adopt them into state.tasks. + + Writes are conditional: if no orphans are found, nothing is touched. + Rate-limited so polling UIs don't trigger SSH on every refresh. + """ + import time as _time + import subprocess + logger.info(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}") + now = _time.monotonic() + if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S: + logger.info(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last") + return + _last_orphan_sweep_ts[0] = now + + env = state.get("env") if isinstance(state, dict) else {} + servers = env.get("servers") if isinstance(env, dict) else [] + logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}") + if not isinstance(servers, list): + return + + known_sids = { + t.get("sessionId") for t in tasks + if isinstance(t, dict) and t.get("sessionId") + } + + adopted_any = False + for srv in servers: + if not isinstance(srv, dict): + continue + host = (srv.get("host") or "").strip() + if not host: + continue # local-only entry; the /proc scan handles it + if not _REMOTE_HOST_RE.match(host): + continue + sport = str(srv.get("port") or "").strip() + ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"] + if sport and sport != "22": + if not _SSH_PORT_RE.match(sport): + continue + ssh_base.extend(["-p", sport]) + + try: + ls = subprocess.run( + ssh_base + [host, "tmux ls 2>/dev/null"], + timeout=6, capture_output=True, text=True, + ) + except Exception: + continue + for line in (ls.stdout or "").splitlines(): + sid = line.split(":", 1)[0].strip() + if not sid or not _SESSION_ID_RE.match(sid): + continue + if sid in known_sids: + continue + # Adopt any session whose pane is currently running a + # known model-server process (checked below). The earlier + # prefix gate (serve-/cookbook-) dropped legitimate + # serves whenever tmux fell back to numeric IDs, leaving + # them invisible in the Cookbook UI — so the user could + # neither see nor stop them. + # Skip zombie / idle-shell sessions. A tmux session left + # over from a crashed vllm just shows a bash prompt — + # adopting it would pollute the UI with "running" tasks + # that aren't actually serving anything. pane_current_command + # is the foreground process in the pane right now; only + # real model serves leave a python/vllm/etc. process there. + try: + pc = subprocess.run( + ssh_base + [host, "tmux", "list-panes", "-t", sid, + "-F", "#{pane_current_command}"], + timeout=4, capture_output=True, text=True, + ) + cur = (pc.stdout or "").strip().splitlines() + except Exception: + cur = [] + LIVE_PROCS = {"python", "python3", "vllm", "llama-server", + "llama_cpp_main", "sglang", "lmdeploy", + "ollama", "node", "uvicorn"} + if not any(c in LIVE_PROCS for c in cur): + continue + # Try to recover a plausible repo_id + port from the + # pane buffer. Cheap heuristic — if we can't, register + # with placeholder fields; the UI still shows it. + try: + cap = subprocess.run( + ssh_base + [host, "tmux", "capture-pane", "-t", sid, "-p", "-S", "-300"], + timeout=6, capture_output=True, text=True, + ) + pane = cap.stdout or "" + except Exception: + pane = "" + import re as _re_orphan + # vLLM banner: "model /path/...". Falls back to the + # raw vllm-serve command if the banner already scrolled. + m_model = _re_orphan.search(r"model\s+(\S+)", pane) + model = m_model.group(1) if m_model else "" + if not model: + m_serve = _re_orphan.search(r"vllm\s+serve\s+(\S+)", pane) + model = m_serve.group(1) if m_serve else f"adopted:{sid}" + m_port = _re_orphan.search(r"--port\s+(\d+)", pane) + port = int(m_port.group(1)) if m_port else 0 + + import time as _t2 + tasks.append({ + "id": sid, + "sessionId": sid, + "name": model.split("/")[-1] if "/" in model else model, + "type": "serve", + "status": "running", + "output": f"Auto-adopted from orphan tmux session on {host}. " + "Open the task to see live output.", + "ts": int(_t2.time() * 1000), + "payload": { + "repo_id": model, + "remote_host": host, + "_cmd": "(orphan tmux session — original launch cmd unknown)", + "port": port, + }, + "remoteHost": host, + "sshPort": sport, + "platform": "linux", + "_serveReady": False, + "_endpointAdded": False, + "_adoptedExternally": True, + }) + known_sids.add(sid) + adopted_any = True + logger.info(f"auto-adopted orphan tmux session {sid!r} on {host}") + + if adopted_any: + try: + from core.atomic_io import atomic_write_json + state["tasks"] = tasks + atomic_write_json(_cookbook_state_path, state) + except Exception as e: + logger.warning(f"orphan sweep: state write failed: {e}") + @router.get("/api/cookbook/tasks/status") async def cookbook_tasks_status(request: Request): """Check status of all active cookbook tmux sessions. @@ -1993,6 +2219,7 @@ def setup_cookbook_routes() -> APIRouter: # Load saved tasks from cookbook state tasks = [] + state = {} if _cookbook_state_path.exists(): try: state = json.loads(_cookbook_state_path.read_text(encoding="utf-8")) @@ -2004,6 +2231,21 @@ def setup_cookbook_routes() -> APIRouter: except Exception: pass + # Orphan-tmux auto-adoption sweep. When the agent (or anyone) + # SSH-launches a `serve-*` tmux session — usually because + # serve_model rejected `source ... && vllm ...` or because of a + # manual relaunch via tmux send-keys — that session is invisible + # to the cookbook UI even though it's a live model server. The + # sweep finds those orphans on each configured remote host and + # writes them into state.tasks with _adoptedExternally=True, so + # they show up in the UI on the next poll without anyone having + # to remember to call adopt_served_model. Rate-limited via the + # module-level _last_orphan_sweep so we don't SSH every 3s. + try: + _maybe_sweep_orphans(tasks, state) + except Exception as _sweep_e: + logger.warning(f"orphan sweep failed (non-fatal): {_sweep_e!r}") + results = [] for task in tasks: session_id = task.get("sessionId", "") @@ -2063,7 +2305,12 @@ def setup_cookbook_routes() -> APIRouter: if _tport and _tport != "22": ssh_base.extend(["-p", str(_tport)]) check_cmd = ssh_base + [remote, "tmux", "has-session", "-t", session_id] - capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"] + # Capture 500 lines (was 50) so a Python traceback survives + # the post-crash neofetch banner + bash prompt that otherwise + # fills the visible tail. Without this, output_tail ends up + # as just "Locale: C / Ubuntu_Odysseus ❯" and the agent + # can't diagnose the actual error. + capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"] elif IS_WINDOWS: # LOCAL Windows task: launched as a detached process (no tmux). # Liveness comes from the .pid file, output from the @@ -2072,7 +2319,7 @@ def setup_cookbook_routes() -> APIRouter: capture_cmd = None else: check_cmd = ["tmux", "has-session", "-t", session_id] - capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"] + capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"] local_win_task = (not remote) and IS_WINDOWS diff --git a/routes/task_routes.py b/routes/task_routes.py index a5c49ad..6604923 100644 --- a/routes/task_routes.py +++ b/routes/task_routes.py @@ -18,6 +18,119 @@ from routes.prefs_routes import _load_for_user, _save_for_user logger = logging.getLogger(__name__) +def _maybe_cascade_calendar_event(task) -> None: + """Delete the linked calendar event when a cookbook_serve task is + removed. Two lookup strategies: + + 1. PRIMARY — `cookbook_event_uid` marker stashed in task.prompt + by cookbookSchedule.js right after creating the event. Direct + UID match, no ambiguity. + + 2. FALLBACK — for tasks created before the marker was wired up + (or when the PATCH to add the marker failed silently), scan + the Cookbook calendar for events whose summary equals the + task name and delete the matches. + + Best-effort throughout: errors are logged but never block the task + deletion itself.""" + if not task or task.task_type != "action" or task.action != "cookbook_serve": + return + + import httpx + from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN + headers = {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN} + if task.owner: + headers["X-Odysseus-Owner"] = task.owner + + # Strategy 1: explicit UID marker in prompt. + event_uid = "" + if task.prompt: + try: + cfg = json.loads(task.prompt) + if isinstance(cfg, dict): + event_uid = (cfg.get("cookbook_event_uid") or "").strip() + except Exception: + pass + + def _try_delete(uid: str) -> bool: + try: + with httpx.Client(timeout=10) as client: + r = client.delete( + f"http://localhost:7000/api/calendar/events/{uid}", + headers=headers, + ) + if r.status_code >= 400: + logger.info( + f"task delete: cascade calendar event {uid} returned " + f"HTTP {r.status_code}" + ) + return False + return True + except Exception as e: + logger.warning(f"task delete: cascade calendar event {uid} failed: {e}") + return False + + if event_uid: + _try_delete(event_uid) + return + + # Strategy 2: scan the Cookbook calendar for matching summaries. + # Only runs for tasks missing the marker (old tasks or PATCH failures). + if not task.name: + return + try: + with httpx.Client(timeout=10) as client: + # Find the Cookbook calendar. + cal_r = client.get("http://localhost:7000/api/calendar/calendars", headers=headers) + if cal_r.status_code >= 400: + return + cals = (cal_r.json() or {}).get("calendars", []) + cookbook_cal = next( + (c for c in cals if (c.get("name") or "").lower() == "cookbook"), + None, + ) + if not cookbook_cal: + return + cal_href = cookbook_cal.get("href") or cookbook_cal.get("id") or "" + # List events in a wide window to catch recurring + upcoming. + from datetime import datetime as _dt, timedelta as _td, timezone as _tz + now = _dt.now(_tz.utc) + start = (now - _td(days=30)).isoformat() + end = (now + _td(days=365)).isoformat() + ev_r = client.get( + "http://localhost:7000/api/calendar/events", + params={"start": start, "end": end, "calendar": cal_href}, + headers=headers, + ) + if ev_r.status_code >= 400: + return + events = (ev_r.json() or {}).get("events", []) + # Match by exact summary. Tasks named "Serve: " are + # created from the schedule modal; the event's summary mirrors + # the task name 1:1 by design. + target = (task.name or "").strip() + uids_to_delete = set() + for ev in events: + if (ev.get("summary") or "").strip() != target: + continue + uid = ev.get("uid") or ev.get("id") or "" + # Strip the "::occurrence" suffix on recurring expansions — + # we want to delete the MASTER once, not each instance. + if "::" in uid: + uid = uid.split("::", 1)[0] + if uid: + uids_to_delete.add(uid) + for uid in uids_to_delete: + _try_delete(uid) + if uids_to_delete: + logger.info( + f"task delete: cascade matched {len(uids_to_delete)} calendar event(s) " + f"by summary fallback for task {task.id} ({target!r})" + ) + except Exception as e: + logger.warning(f"task delete: cascade fallback scan failed: {e}") + + class TaskCreate(BaseModel): name: Optional[str] = None prompt: Optional[str] = None @@ -616,6 +729,12 @@ def setup_task_routes(task_scheduler) -> APIRouter: raise HTTPException(404, "Task not found") if user and task.owner != user: raise HTTPException(403, "Access denied") + # Cascade: cookbook_serve tasks may have a linked calendar + # event (created via the "Create event in calendar" toggle + # in the schedule modal). If so, delete the calendar event + # too so the calendar doesn't end up holding a phantom event + # for a task that no longer exists. + _maybe_cascade_calendar_event(task) db.delete(task) db.commit() return {"ok": True} diff --git a/src/agent_loop.py b/src/agent_loop.py index eabc340..6bd9ba8 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -337,6 +337,7 @@ If the user asks for a reminder/alarm before the event, pass `reminder_minutes` "ui_control": "- ```ui_control``` — Control the UI: toggle tools on/off, OPEN PANELS, open email reply drafts, switch models, change themes. Commands: `toggle on/off` (names: bash/shell, web/search, research, incognito, document_editor/documents), `open_panel ` (panels: documents, gallery, email, sessions, notes, memories/brain, skills, settings, cookbook), `open_email_reply ` (opens an email compose document, does NOT send), `set_mode agent/chat`, `switch_model `, `set_theme `, `create_theme ` (optional key=val for advanced colors AND background effects: bgPattern=, bgEffectColor=#RRGGBB, bgEffectIntensity=, bgEffectSize=, frosted=true|false). \"open documents\" / \"open library\" / \"show gallery\" / \"open inbox\" / \"open notes\" / \"open cookbook\" all map to `open_panel `. Theme presets: dark, light, midnight, paper, cyberpunk, retrowave, forest, ocean, ume, copper, terminal, organs, lavender, gpt, claude, cute.", "list_served_models": "- ```list_served_models``` — Show what the Cookbook (LLM-serving subsystem) is currently running. NO args. Use this for ANY 'what's running' / 'what's serving' / 'show my cookbook' / 'is anything up' query. DO NOT shell out (`ps aux`, `docker ps`, etc.) — this tool is the source of truth. Failed serve tasks include recent logs plus diagnosis/retry suggestions; use those suggestions to call `serve_model` again with an adjusted command when appropriate.", "stop_served_model": "- ```stop_served_model``` — Stop a running model server. Args (JSON): {\"session_id\": \"\"}. Use for 'kill my cookbook' / 'stop the model' / 'shut down vLLM'.", + "tail_serve_output": "- ```tail_serve_output``` — Read the actual tmux stderr/traceback of a CURRENTLY failing cookbook task. Args (JSON): {\"session_id\": \"\", \"tail\": 150?}. **Use ONLY after** you just launched something via `serve_model` AND `list_served_models` reports YOUR new task as `crashed`/`error`. DO NOT use it on old stopped/completed download tasks (they're historical noise — won't predict whether a new launch succeeds). DO NOT call it before launching a fresh attempt. When you do call it, bump `tail` to 400+ only if the visible error references 'see root cause above'.", "download_model": "- ```download_model``` — Download a HuggingFace model. Args (JSON): {\"repo_id\": \"Qwen/Qwen3-8B\", \"host\": \"user@gpu-box\"?, \"include\": \"*Q4_K_M*\"?}.", "serve_model": "- ```serve_model``` — Start serving a model with vLLM / SGLang / llama.cpp / Ollama / Diffusers. Args (JSON): {\"repo_id\": \"...\", \"cmd\": \"vllm serve ... --port 8000\" or \"python3 -m sglang.launch_server ... --port 30000\" or \"python3 scripts/diffusion_server.py --model diffusers/stable-diffusion-xl-1.0-inpainting-0.1 --port 8100\", \"host\": \"user@gpu-box\"?}. For image/inpaint/diffusion models, use the `scripts/diffusion_server.py` command exactly. After launch, call `list_served_models`; if it returns a diagnosis with an adjusted command, retry with that command.", "list_downloads": "- ```list_downloads``` — Show in-progress HuggingFace model downloads (filters Cookbook tasks/status to downloads only). NO args. Use for 'what's downloading' / 'show my downloads' / 'check download progress'.", @@ -1659,6 +1660,28 @@ async def stream_agent_loop( _tool_type_counts: collections.Counter = collections.Counter() _THINK_RE = re.compile(r'.*?', re.DOTALL | re.IGNORECASE) _force_answer = False # set by loop-breaker → next round runs with NO tools + # Supervisor: how many times we've nudged the model after it announced + # an action without emitting the tool call. Capped to prevent a model + # that *can't* call the tool from looping forever. + _intent_nudge_count = 0 + _MAX_INTENT_NUDGES = 2 + + # "I said I would, then didn't" detector. The pattern that breaks debug + # loops on weak models (deepseek-v4-flash mid-2026): the model writes + # "Let me tail the output to see the error" and then ends the turn with + # no tool_calls. The intent is sincere but the function call gets dropped. + # Match the common phrasings + an action verb that maps to an available + # tool, so we don't nudge on harmless transitional text like "let me + # know what you think". + _INTENT_RE = re.compile( + r"(?:^|\n)\s*(?:let me|i'?ll|i will|going to|let's)\s+" + r"(?:tail|check|investigate|look at|see|tail|read|fetch|inspect|" + r"verify|diagnose|examine|debug|capture|grab|pull|view|run|call|" + r"trigger|launch|start|kick off|stop|kill|restart|adopt|serve|" + r"register|adopt|list|search|find|query|hit|ping|test)" + r"\b[^.\n]{0,140}", + re.IGNORECASE, + ) # Document streaming state (persists across rounds) _doc_acc = "" # accumulated tool-call JSON arguments @@ -2010,6 +2033,46 @@ async def stream_agent_loop( # never re-verify an unchanged state in a loop. _effectful_used = False continue + # ── Intent-without-action supervisor ───────────────────── + # Catch "Let me tail the output" / "I'll check the logs" / + # "Let me investigate" patterns where the model announces an + # action but emits no tool_call. The bug shows up most on + # smaller models trained to verbalize plans before acting. + # We inject one sharp nudge ("you said you would X — call the + # actual tool now") and loop again. Capped at + # _MAX_INTENT_NUDGES so a model that genuinely cannot use the + # tool doesn't pin us in a forever loop. + _intent_text = _THINK_RE.sub("", cleaned_round).strip() + _intent_match = _INTENT_RE.search(_intent_text) if _intent_text else None + # Only nudge when the round REALLY looks like an unfinished + # promise: short response (<400 chars), no fenced code/answer, + # and an action-intent phrase was matched. Long answers that + # happen to contain "let me know" are not stalls. + _looks_like_promise = ( + _intent_match is not None + and len(_intent_text) < 400 + and "```" not in _intent_text + and _intent_nudge_count < _MAX_INTENT_NUDGES + ) + if _looks_like_promise: + _intent_nudge_count += 1 + _matched_phrase = _intent_match.group(0).strip() + logger.info(f"[agent] intent-without-action nudge #{_intent_nudge_count} on round {round_num}: {_matched_phrase!r}") + messages.append({ + "role": "system", + "content": ( + f"You just wrote: \"{_matched_phrase}\" — but ended the " + "turn without making the actual tool call. The user can " + "see you announced the action but didn't run it, which " + "is the most frustrating thing you can do. " + "DO IT NOW: emit the actual function call this turn. " + "If you decided not to do it after all, say so plainly in " + "one sentence instead of restating the plan." + ), + }) + # Visible signal in the stream so the user knows we caught it. + yield f'data: {json.dumps({"type": "agent_step", "round": round_num + 1})}\n\n' + continue break # no tools — done # ── Loop-breaker (Terminus-style stall detector) ────────────── @@ -2285,6 +2348,19 @@ async def stream_agent_loop( _anchor = f"\n\n[Open in Deep Research](#research-{_rsid})\n" yield 'data: ' + json.dumps({"delta": _anchor}) + '\n\n' + # Same pattern for notes: when manage_notes creates a note + # and returns note_id, drop a `[View note](#note-)` link + # into the stream so chatRenderer's click handler routes to + # the new openNote() in notes.js — opens the notes panel and + # scrolls/flashes the matching card. Without this, the agent + # would write "View note" as a phrase with no target. + _nid = result.get("note_id") + if _nid and block.tool_type == "manage_notes": + _title = (result.get("note_title") or "").strip() + _label = f"View note: {_title}" if _title else "View note" + _anchor = f"\n\n[{_label}](#note-{_nid})\n" + yield 'data: ' + json.dumps({"delta": _anchor}) + '\n\n' + # Save for history persistence tool_event = { "round": round_num, diff --git a/src/agent_tools.py b/src/agent_tools.py index 578b943..b86bd48 100644 --- a/src/agent_tools.py +++ b/src/agent_tools.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Constants (kept here — sub-modules import from here) # --------------------------------------------------------------------------- -MAX_AGENT_ROUNDS = 20 +MAX_AGENT_ROUNDS = 50 SHELL_TIMEOUT = 60 PYTHON_TIMEOUT = 30 MAX_OUTPUT_CHARS = 10_000 diff --git a/src/builtin_actions.py b/src/builtin_actions.py index 6b96e31..d532603 100644 --- a/src/builtin_actions.py +++ b/src/builtin_actions.py @@ -2001,6 +2001,197 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]: return str(e), False +async def action_cookbook_serve( + owner: str, + task_name: str = "", + progress_cb=None, + command: str = "", + **kwargs, +) -> Tuple[str, bool]: + """Launch a Cookbook model serve as a scheduled task. + + `command` is the JSON config string the task carries in `prompt`, + of shape: {"preset": "name"} OR {"repo_id": "...", "cmd": "...", "host": "..."}. + Optional `end_after_min: N` schedules a hard-stop N minutes after launch + (handled by cookbook_serve_lifecycle_loop in src/cookbook_serve_lifecycle.py). + """ + import json + import time as _time + import httpx + from pathlib import Path + from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN + from core.atomic_io import atomic_write_json + + headers = {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN} + try: + cfg = json.loads(command or "{}") + except Exception: + return f"Invalid JSON config: {command!r}", False + if not isinstance(cfg, dict): + return "Config must be a JSON object", False + + # Resolve the preset (if named) OR fall through with explicit fields. + preset_name = (cfg.get("preset") or "").strip() + repo_id = (cfg.get("repo_id") or "").strip() + cmd = (cfg.get("cmd") or "").strip() + host = (cfg.get("host") or cfg.get("remote_host") or "").strip() + try: + end_after_min = int(cfg.get("end_after_min") or 0) + except Exception: + end_after_min = 0 + + state_path = Path("/app/data/cookbook_state.json") + try: + state = json.loads(state_path.read_text(encoding="utf-8")) if state_path.exists() else {} + except Exception: + state = {} + + # Preset lookup. Try three matching strategies in order so the + # schedule still works even when the user's preset is named + # differently from the model's short name: + # + # 1. Exact preset.name == preset_name (case-insensitive) + # 2. preset.model / preset.modelId == repo_id (caller knows the repo) + # 3. preset.model's short name (after final /) == preset_name + # + # Without #2 and #3, scheduling "Qwen3.5-397B-A17B-AWQ" failed when + # the saved preset was named "vllm-qwen-397b" or had the model field + # populated with the full HF repo path. Either should resolve. + def _short(name: str) -> str: + return (name or "").rsplit("/", 1)[-1].lower() + + if not cmd or not repo_id: + presets = state.get("presets") or [] + chosen = None + # Strategy 1: exact name match. + if preset_name: + chosen = next( + (p for p in presets if isinstance(p, dict) + and (p.get("name") or "").lower() == preset_name.lower()), + None, + ) + # Strategy 2: repo_id matches the preset's model field. + if chosen is None and repo_id: + chosen = next( + (p for p in presets if isinstance(p, dict) + and (p.get("model") or p.get("modelId") or "").lower() == repo_id.lower()), + None, + ) + # Strategy 3: model's short name matches the preset_name. + if chosen is None and preset_name: + chosen = next( + (p for p in presets if isinstance(p, dict) + and _short(p.get("model") or p.get("modelId") or "") == preset_name.lower()), + None, + ) + if chosen is not None: + repo_id = repo_id or chosen.get("model") or chosen.get("modelId") or "" + cmd = cmd or (chosen.get("cmd") or "").strip() + host = host or chosen.get("host") or chosen.get("remoteHost") or "" + if not repo_id or not cmd or cmd.startswith("(adopted"): + # Surface what we tried so the user can name their preset to match. + preset_names = [(p.get("name") or "") for p in (state.get("presets") or []) if isinstance(p, dict)] + hint = f" Saved presets: {preset_names!r}" if preset_names else "" + return (f"No launchable config for {preset_name!r} (repo_id={repo_id!r}). " + f"Check Cookbook → Presets has a real cmd, not 'adopted'.{hint}", False) + + # Resolve env_prefix etc. from the host's saved cookbook server entry, + # matching the chat agent's serve_model path. + body = {"repo_id": repo_id, "cmd": cmd} + if host: + body["remote_host"] = host + env = (state.get("env") or {}) + srv = next( + (s for s in (env.get("servers") or []) + if isinstance(s, dict) and (s.get("host") == host or s.get("name") == host)), + {}, + ) + if srv.get("env") == "venv" and srv.get("envPath"): + body["env_prefix"] = f"source {srv['envPath']}/bin/activate" + elif srv.get("env") == "conda" and srv.get("envPath"): + body["env_prefix"] = f"conda activate {srv['envPath']}" + if srv.get("hfToken"): body["hf_token"] = srv["hfToken"] + if srv.get("port"): body["ssh_port"] = str(srv["port"]) + if srv.get("platform"): body["platform"] = srv["platform"] + + try: + async with httpx.AsyncClient(timeout=30) as client: + r = await client.post("http://localhost:7000/api/model/serve", + json=body, headers=headers) + data = r.json() if r.content else {} + except Exception as e: + return f"Launch HTTP failed: {e}", False + if not data.get("ok"): + return f"Launch rejected: {data.get('error') or data.get('detail') or 'unknown'}", False + + sid = data.get("session_id") or "" + # Register the new task in cookbook_state.json + stamp it with our + # scheduler-owner markers. /api/model/serve spawns the tmux session + # but leaves the state-write to the UI — when a scheduled action + # launches a serve from server-side, NOBODY writes the task into + # state, so the Cookbook tab never shows it. We do the write here. + if sid: + try: + # Re-read fresh (the route may have updated state already). + try: + fresh = json.loads(state_path.read_text(encoding="utf-8")) + except Exception: + fresh = {} + if not isinstance(fresh, dict): + fresh = {} + tasks = fresh.get("tasks") if isinstance(fresh.get("tasks"), list) else [] + existing = next( + (t for t in tasks if isinstance(t, dict) and t.get("sessionId") == sid), + None, + ) + if existing is None: + display_name = repo_id.split("/")[-1] if "/" in repo_id else repo_id + placeholder = ( + f"Launched by scheduled task {task_name!r} — waiting for tmux output…\n" + f" session: {sid}\n" + f" target: {host or 'local'}\n" + f" cmd: {cmd[:200]}{'…' if len(cmd) > 200 else ''}" + ) + existing = { + "id": sid, + "sessionId": sid, + "name": display_name, + "modelId": repo_id, + "type": "serve", + "status": "running", + "output": placeholder, + "ts": int(_time.time() * 1000), + "payload": {"repo_id": repo_id, "remote_host": host or "", "_cmd": cmd}, + "remoteHost": host or "", + "sshPort": "", + "platform": "linux", + "_serveReady": False, + "_endpointAdded": False, + } + tasks.append(existing) + # Stamp ownership + end-at on the task entry. + existing["_scheduledByTask"] = task_name or "" + existing["_scheduledByOwner"] = owner or "" + if end_after_min > 0: + existing["_scheduledStopAtMs"] = int(_time.time() * 1000) + end_after_min * 60 * 1000 + fresh["tasks"] = tasks + atomic_write_json(state_path, fresh) + except Exception as e: + logger.warning(f"cookbook_serve: state register/stamp failed: {e}") + # Don't try to render absolute clock time in the message — the + # server runs in UTC (Docker default), the user reads it as local, + # and the offset depends on the user's TZ which the action doesn't + # have a reliable handle on. The Tasks UI already shows the RUN + # timestamp in the user's local time right above this message, so + # "stops 8 min after that" gives the user everything they need. + if end_after_min: + return ( + f"Launched {repo_id} (session {sid}); stops {end_after_min} min after this ran", + True, + ) + return f"Launched {repo_id} (session {sid})", True + + BUILTIN_ACTIONS = { "tidy_sessions": action_tidy_sessions, "tidy_documents": action_tidy_documents, @@ -2020,6 +2211,7 @@ BUILTIN_ACTIONS = { "test_skills": action_test_skills, "audit_skills": action_audit_skills, "check_email_urgency": action_check_email_urgency, + "cookbook_serve": action_cookbook_serve, # ping_notes removed from the registry — runs only inside `_note_pings_loop`. } diff --git a/src/cookbook_serve_lifecycle.py b/src/cookbook_serve_lifecycle.py new file mode 100644 index 0000000..58d4242 --- /dev/null +++ b/src/cookbook_serve_lifecycle.py @@ -0,0 +1,193 @@ +"""Cookbook serve lifecycle: kills scheduler-owned serves whose end-of- +window has passed. + +Pairs with action_cookbook_serve in builtin_actions.py — that action +stamps the task it launches with `_scheduledStopAtMs`, this loop ticks +every 60s and kills any serve whose stamp is in the past. + +Single small module. Delete this file + the registration line in app.py +and the feature stops doing anything; scheduler-launched serves just +stay up until the user kills them manually. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import time +from pathlib import Path + +import httpx + +logger = logging.getLogger(__name__) + + +def _internal_headers() -> dict: + from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN + return {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN} + + +async def _delete_endpoint_for_task(task: dict) -> None: + """Drop the auto-registered model endpoint for a scheduled-stop serve. + + Without this, killing the tmux session leaves the endpoint sitting in + the picker (probe goes offline; chats still try to route there) and + the user has to delete it by hand in Settings -> Endpoints. + """ + import re as _re + payload = task.get("payload") or {} + cmd = str(payload.get("_cmd") or "") + remote = task.get("remoteHost") or "" + # Build host the same way _auto_register_llm_endpoint does so URL match wins. + if remote: + host = remote.split("@")[-1] if "@" in remote else remote + else: + host = "host.docker.internal" + port_match = _re.search(r"--port\s+(\d+)", cmd) + ollama_host_match = _re.search(r"OLLAMA_HOST=[^\s]*?:(\d+)", cmd) + if port_match: + port = int(port_match.group(1)) + elif ollama_host_match: + port = int(ollama_host_match.group(1)) + elif "ollama" in cmd: + port = 11434 + else: + port = 8080 + base_url = f"http://{host}:{port}/v1" + try: + async with httpx.AsyncClient(timeout=8) as client: + r = await client.get( + "http://localhost:7000/api/model-endpoints", + headers=_internal_headers(), + ) + if r.status_code >= 400: + return + eps = r.json() if r.content else [] + # Prefer exact URL match; fall back to host:port substring so we + # still catch the case where 0.0.0.0 vs the registered host + # representation diverged. + ep = next((e for e in eps if e.get("base_url") == base_url), None) + if not ep: + hostport = f"{host}:{port}" + ep = next((e for e in eps if hostport in (e.get("base_url") or "")), None) + if ep: + await client.delete( + f"http://localhost:7000/api/model-endpoints/{ep['id']}", + headers=_internal_headers(), + ) + logger.info( + f"cookbook_serve_lifecycle: deleted endpoint {ep.get('id')} " + f"({ep.get('base_url')}) after scheduled stop" + ) + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle: endpoint delete failed: {e}") + + +async def _stop_serve(session_id: str, remote_host: str = "", ssh_port: str = "") -> bool: + """Kill the tmux session that hosts the serve. + + There's no `/api/model/stop` route — the cookbook UI and the chat + agent both kill via `/api/shell/exec` running a `tmux kill-session` + (wrapped in ssh for remote hosts). Mirror that here so the + lifecycle loop can actually stop scheduler-launched serves at + window-end. Without this, the action stamped `_scheduledStopAtMs` + correctly but every kill attempt failed silently (the route + returned 404 and the result was logged as "failed"). + """ + import shlex + if remote_host: + port_flag = f"-p {shlex.quote(str(ssh_port))} " if ssh_port and str(ssh_port) != "22" else "" + cmd = ( + f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no " + f"{port_flag}{shlex.quote(remote_host)} " + f"'tmux kill-session -t {shlex.quote(session_id)}'" + ) + else: + cmd = f"tmux kill-session -t {shlex.quote(session_id)}" + try: + async with httpx.AsyncClient(timeout=15) as client: + r = await client.post( + "http://localhost:7000/api/shell/exec", + json={"command": cmd}, + headers=_internal_headers(), + ) + if r.status_code >= 400: + return False + data = r.json() if r.content else {} + ec = data.get("exit_code") + # tmux returns non-zero when the session is already gone + # ("can't find session: ..."). That's still "stop succeeded" + # from our POV — the goal is no live session at the end. + if ec in (None, 0): + return True + stderr = (data.get("stderr") or "").lower() + return "no server" in stderr or "can't find session" in stderr or "session not found" in stderr + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle: stop {session_id} failed: {e}") + return False + + +async def _tick() -> None: + state_path = Path("/app/data/cookbook_state.json") + if not state_path.exists(): + return + try: + state = json.loads(state_path.read_text(encoding="utf-8")) + except Exception: + return + tasks = state.get("tasks") or [] + now_ms = int(time.time() * 1000) + to_stop = [] + for t in tasks: + if not isinstance(t, dict): + continue + stop_at = t.get("_scheduledStopAtMs") + if not isinstance(stop_at, (int, float)): + continue + if stop_at > now_ms: + continue + if (t.get("status") or "").lower() in {"stopped", "ended", "killed", "crashed"}: + continue + sid = t.get("sessionId") or t.get("id") + if not sid: + continue + to_stop.append((sid, t.get("remoteHost") or "", t.get("sshPort") or "")) + if not to_stop: + return + # Re-read state once before writing so we capture any updates from + # concurrent UI syncs. + stopped_any = False + for sid, host, port in to_stop: + ok = await _stop_serve(sid, host, port) + logger.info(f"cookbook_serve_lifecycle: stop {sid} (host={host or 'local'}): {'ok' if ok else 'failed'}") + if ok: + stopped_any = True + # Drop the auto-registered endpoint so the model picker and + # the chat router don't keep pointing at a dead server. + for t in tasks: + if isinstance(t, dict) and (t.get("sessionId") == sid or t.get("id") == sid): + if t.get("type") == "serve": + await _delete_endpoint_for_task(t) + t["status"] = "stopped" + t["_scheduledStopAtMs"] = None + t["_lastStatusFlipAt"] = now_ms + break + if stopped_any: + try: + from core.atomic_io import atomic_write_json + state["tasks"] = tasks + atomic_write_json(state_path, state) + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle: state write failed: {e}") + + +async def cookbook_serve_lifecycle_loop() -> None: + """Forever-loop. Registered as a startup task in app.py.""" + await asyncio.sleep(20) # let the rest of startup settle + while True: + try: + await _tick() + except Exception as e: + logger.warning(f"cookbook_serve_lifecycle tick failed: {e}") + await asyncio.sleep(60) diff --git a/src/task_scheduler.py b/src/task_scheduler.py index 65fc451..2fcb5dc 100644 --- a/src/task_scheduler.py +++ b/src/task_scheduler.py @@ -1018,6 +1018,10 @@ class TaskScheduler: kwargs = {"owner": task.owner, "task_name": task.name, "progress_cb": _progress} if task.action in ("run_script", "run_local", "ssh_command") and task.prompt: kwargs["script" if task.action in ("run_script", "run_local") else "command"] = task.prompt + # cookbook_serve carries its JSON config in task.prompt — feed it + # through as `command` so action_cookbook_serve can json.loads it. + elif task.action == "cookbook_serve" and task.prompt: + kwargs["command"] = task.prompt result, success = await action_fn(**kwargs) return result, success except TaskNoop: diff --git a/src/tool_execution.py b/src/tool_execution.py index a667266..e84a414 100644 --- a/src/tool_execution.py +++ b/src/tool_execution.py @@ -1119,6 +1119,7 @@ async def execute_tool_block( do_manage_documents, do_manage_settings, do_manage_notes, do_manage_calendar, do_download_model, do_serve_model, do_list_served_models, do_stop_served_model, + do_tail_serve_output, do_list_downloads, do_cancel_download, do_search_hf_models, do_list_cached_models, do_list_serve_presets, do_serve_preset, do_adopt_served_model, do_list_cookbook_servers, @@ -1290,6 +1291,9 @@ async def execute_tool_block( elif tool == "stop_served_model": desc = "stop_served_model" result = await do_stop_served_model(content, owner=owner) + elif tool == "tail_serve_output": + desc = "tail_serve_output" + result = await do_tail_serve_output(content, owner=owner) elif tool == "list_downloads": desc = "list_downloads" result = await do_list_downloads(content, owner=owner) diff --git a/src/tool_implementations.py b/src/tool_implementations.py index c7b2649..dbaf50c 100644 --- a/src/tool_implementations.py +++ b/src/tool_implementations.py @@ -5,6 +5,7 @@ Extracted tool implementation functions (do_* and helpers) from agent_tools.py. These handle the actual execution logic for each tool type. """ +import asyncio import json import logging import os @@ -1956,7 +1957,19 @@ async def do_manage_notes(content: str, owner: Optional[str] = None) -> Dict: ) db.add(note) db.commit() - return {"response": f"Note created: \"{title or '(untitled)'}\" (id: {note.id[:8]})", "exit_code": 0} + # Return note_id so the chat-side renderer can build a real + # "View note" button that opens the notes modal at this id. + # Previously the create response only included a prose + # confirmation; the model would type "View note" as a markdown + # link with no target, leaving the user with a click that + # did nothing and uncertainty about whether the note was made. + return { + "response": f"Note created: \"{title or '(untitled)'}\" (id: {note.id[:8]})", + "note_id": note.id, + "note_title": title or "", + "open_url": f"/#open=notes¬e={note.id}", + "exit_code": 0, + } elif action == "update": note_id = args.get("id", "") @@ -2603,6 +2616,8 @@ async def _cookbook_env_for_host(host: str) -> Dict[str, Any]: return { "env_prefix": env_prefix, + "env_type": env_kind, + "env_path": env_path, "gpus": env_root.get("gpus") or "", "platform": platform, "hf_token": env_root.get("hfToken") or "", @@ -3041,6 +3056,31 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict: # the UI uses. Without env_prefix, `vllm serve …` lands in a shell # without the user's venv and fails 'command not found'. env_cfg = await _cookbook_env_for_host(host) + # Rewrite bare `vllm` / `python3` leading tokens to the venv's absolute + # binary path when the target host has a venv configured. SSH non- + # interactive shells often leave ~/.local/bin ahead of the venv bin on + # PATH even with the venv activated, so `vllm serve` finds the wrong + # binary and crashes early (e.g. compute_89 torch ABI errors on an old + # user-site torch). This mirrors what static/js/cookbook.js does in + # _buildServeCmd for the UI launch path. + env_path = (env_cfg.get("env_path") or "").rstrip("/") + env_type = (env_cfg.get("env_type") or env_cfg.get("env") or "").lower() + if env_type == "venv" and env_path: + venv_bin = f"{env_path}/bin" + # Match the FIRST shell-token: skip leading KEY=VAL env-var prefixes + # (CUDA_VISIBLE_DEVICES=… VLLM_USE_FLASHINFER_SAMPLER=…) before the binary. + import re as _re3 + tokens = cmd.split() + idx = 0 + env_re = _re3.compile(r"^[A-Za-z_][A-Za-z0-9_]*=") + while idx < len(tokens) and env_re.match(tokens[idx]): + idx += 1 + if idx < len(tokens): + head = tokens[idx] + if head in ("vllm", "python3", "python"): + tokens[idx] = f"{venv_bin}/{head}" + cmd = " ".join(tokens) + payload["cmd"] = cmd if env_cfg.get("env_prefix"): payload["env_prefix"] = env_cfg["env_prefix"] if env_cfg.get("gpus"): payload["gpus"] = env_cfg["gpus"] if env_cfg.get("hf_token"): payload["hf_token"] = env_cfg["hf_token"] @@ -3059,7 +3099,19 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict: ) note = "" if registered else " (state-write failed — task may not show in UI)" return {"output": f"Serving {repo_id} (session: {sid}){note}", "session_id": sid, "exit_code": 0} - return {"error": data.get("error", "Serve failed"), "exit_code": 1} + # FastAPI HTTPException puts the message under `detail`, not `error`. + # Surface BOTH so the agent sees "Invalid characters in cmd" (from + # _validate_serve_cmd rejecting `&&`/`source`/`cd`) instead of + # the generic "Serve failed", which leaves it with nothing to act on. + err_msg = data.get("error") or data.get("detail") or "Serve failed" + hint = "" + if isinstance(err_msg, str) and "cmd" in err_msg.lower(): + hint = (" — the cmd must START with an allowlisted binary " + "(vllm, python3, llama-server, ollama, sglang, lmdeploy, node, npx). " + "Do NOT prefix with `cd …`, `source …`, or chain with `&&`. " + "env_prefix (e.g. `source ~/qwen35-env/bin/activate`) is added " + "automatically from the host's saved venv settings.") + return {"error": f"{err_msg}{hint}", "exit_code": 1} except Exception as e: return {"error": str(e), "exit_code": 1} @@ -3103,13 +3155,31 @@ async def do_list_served_models(content: str, owner: Optional[str] = None) -> Di "exit_code": 0, } + # Sort so the agent sees what's actually LIVE first. Stopped/error/ + # completed tasks are mostly historical noise — they shouldn't lead + # the list when something is genuinely serving. + _ORDER = { + "ready": 0, "running": 1, "loading": 1, "warming": 1, + "queued": 2, "starting": 2, + "error": 5, "crashed": 5, "failed": 5, + "stopped": 6, "killed": 6, "cancelled": 6, "canceled": 6, + "done": 7, "completed": 7, "finished": 7, + } + def _rank(t: Dict[str, Any]) -> int: + phase = (t.get("phase") or t.get("status") or "unknown").lower() + return _ORDER.get(phase, 3) + merged.sort(key=_rank) + cb_n = len(cookbook_tasks) ext_n = len(external) + live_n = sum(1 for t in merged if _rank(t) <= 2) header = [] if cb_n: header.append(f"{cb_n} cookbook-tracked") if ext_n: header.append(f"{ext_n} external") + if live_n: + header.insert(0, f"{live_n} LIVE") lines = [f"Running: {', '.join(header)}."] for t in merged: phase = t.get("phase") or t.get("status", "unknown") @@ -3136,8 +3206,20 @@ async def do_list_served_models(content: str, owner: Optional[str] = None) -> Di if t.get("status") == "error" and t.get("output_tail"): tail = str(t.get("output_tail") or "").strip() if tail: + # Prefer a window around a Python traceback if one exists, + # falling back to the last 30 lines. The previous 6-line + # tail showed only the post-crash bash prompt / neofetch + # banner ("Locale: C / Ubuntu_Odysseus ❯") — useless for + # diagnosis. The traceback we want is usually 50-200 lines + # earlier in the buffer. + _tail_lines = tail.splitlines() + _shown = _tail_lines[-30:] + for _i, _ln in enumerate(_tail_lines): + if "Traceback (most recent call last)" in _ln or "ERROR" in _ln or "Error:" in _ln: + _shown = _tail_lines[_i:_i + 40] + break lines.append(" recent log:") - for line in tail.splitlines()[-6:]: + for line in _shown: lines.append(f" {line[:220]}") if t.get("external") and t.get("cmdline_preview"): lines.append(f" cmd: {t['cmdline_preview']}") @@ -3243,6 +3325,125 @@ async def do_stop_served_model(content: str, owner: Optional[str] = None) -> Dic ) +async def do_tail_serve_output(content: str, owner: Optional[str] = None) -> Dict: + """Capture the last N lines of a cookbook task's tmux pane — remote-aware. + + Used by the agent to debug a failed/stuck serve: list_served_models tells + you the task is `crashed`, this tool returns the actual stderr/traceback + so the agent can match it against a known fix (compute_89 nvcc mismatch, + flashinfer version mismatch, OOM, missing kernels, etc.) and decide + whether to relaunch via serve_model with new flags. + """ + import httpx + import shlex + try: + args = _parse_tool_args(content) + except ValueError: + return {"error": "Invalid JSON arguments", "exit_code": 1} + session_id = (args.get("session_id") or "").strip() + if not session_id: + return {"error": "session_id is required (from list_served_models)", "exit_code": 1} + import re as _re + if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id): + return {"error": "Invalid session_id format", "exit_code": 1} + try: + tail = int(args.get("tail") or 400) + except (TypeError, ValueError): + tail = 400 + tail = max(20, min(tail, 4000)) + headers = _internal_headers() + remote = (args.get("remote_host") or args.get("host") or "").strip() + sport = (args.get("ssh_port") or "").strip() + # Resolve host from cookbook state if caller didn't pass one — same + # lookup _cookbook_kill_session uses. + if not remote: + state: Dict[str, Any] = {} + try: + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{_COOKBOOK_BASE}/api/cookbook/state", headers=headers) + state = resp.json() or {} + except Exception as e: + logger.debug(f"cookbook state lookup failed for {session_id}: {e}") + if isinstance(state, dict): + for t in (state.get("tasks") or []): + if isinstance(t, dict) and (t.get("sessionId") == session_id or t.get("id") == session_id): + remote = t.get("remoteHost") or "" + if not sport: + sport = t.get("sshPort") or "" + break + # Prefer the persisted /tmp/odysseus-tmux/SESSION.log file over the + # live tmux pane. The pane is what the user would see scrolling on + # their screen — including the post-crash neofetch banner and the + # idle bash prompt that overwrites the actual traceback the moment + # vllm exits. The log file is the raw stdout/stderr of the wrapped + # process and survives the crash unchanged. We only fall back to + # the pane when the log file doesn't exist (older sessions launched + # before the tmux+tee wrapper was added). + log_path = f"/tmp/odysseus-tmux/{session_id}.log" + pane_inner = f"tmux capture-pane -t {shlex.quote(session_id)} -p -S -{tail} 2>/dev/null" + file_inner = f"tail -n {tail} {shlex.quote(log_path)} 2>/dev/null" + inner = ( + f"if [ -s {shlex.quote(log_path)} ]; then {file_inner}; " + f"else {pane_inner}; fi" + ) + if remote: + _pf = f"-p {shlex.quote(str(sport))} " if sport and str(sport) != "22" else "" + cmd = ( + f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no " + f"{_pf}{shlex.quote(remote)} {shlex.quote(inner)}" + ) + host_label = remote + else: + cmd = inner + host_label = "local" + try: + async with httpx.AsyncClient(timeout=20) as client: + resp = await client.post(f"{_COOKBOOK_BASE}/api/shell/exec", + json={"command": cmd}, headers=headers) + if resp.status_code >= 400: + return {"error": f"shell/exec returned HTTP {resp.status_code}: {resp.text[:200]}", "exit_code": 1} + data = resp.json() if resp.content else {} + output_text = (data.get("stdout") or "").strip() + stderr_text = (data.get("stderr") or "").strip() + rc = data.get("exit_code") + if rc not in (None, 0) and not output_text: + already_gone = any(s in (stderr_text or "").lower() for s in ("no server running", "can't find session", "session not found")) + if already_gone: + return {"output": f"Tmux session {session_id} on {host_label} is gone (task already exited).", "exit_code": 0, "session_id": session_id, "host": host_label} + return {"error": f"capture-pane failed on {host_label}: {stderr_text or f'exit {rc}'}", "exit_code": 1} + # Dedupe download-progress noise. A 100-shard HF download produces + # tens of thousands of `model-NN-of-MM.safetensors: 91%|...` lines + # that all look the same to the agent and drown the actual error. + # Keep only one sample per (file, decile-percent) bucket. + import re as _re2 + lines = output_text.splitlines() + dedup_lines = [] + seen_progress = set() + progress_re = _re2.compile(r"^([\w./\-]+):\s+(\d+)%") + for ln in lines: + m = progress_re.match(ln.strip()) + if m: + key = (m.group(1), int(m.group(2)) // 10) # bucket by 10% + if key in seen_progress: + continue + seen_progress.add(key) + dedup_lines.append(ln) + output_text = "\n".join(dedup_lines) + # Hard cap so the agent doesn't blow its token budget. + MAX_CHARS = 8000 + if len(output_text) > MAX_CHARS: + output_text = "…(earlier output truncated)…\n" + output_text[-MAX_CHARS:] + return { + "output": output_text or "(empty pane)", + "session_id": session_id, + "host": host_label, + "tail_lines": tail, + "exit_code": 0, + } + except Exception as e: + return {"error": str(e), "exit_code": 1} + + async def do_list_downloads(content: str, owner: Optional[str] = None) -> Dict: """List in-flight model downloads (filters /api/cookbook/tasks/status to type=download).""" import httpx @@ -3615,38 +3816,133 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict: async def do_list_cached_models(content: str, owner: Optional[str] = None) -> Dict: - """List models already cached locally (or on a remote host).""" + """List models already cached locally and/or on remote hosts. + + With no `host` arg, scans EVERY configured Cookbook server (and local) + and aggregates — so the agent sees the full inventory in one call + instead of having to query each server individually. + """ import httpx try: args = _parse_tool_args(content) if content.strip() else {} except ValueError: return {"error": "Invalid JSON arguments", "exit_code": 1} - params: Dict[str, str] = {} raw_host = (args.get("host") or "").strip() - host = await _resolve_cookbook_host(raw_host) if raw_host else "" - if host: - params["host"] = host - if args.get("model_dir"): - params["model_dir"] = args["model_dir"] - if args.get("ssh_port"): - params["ssh_port"] = str(args["ssh_port"]) - if args.get("platform"): - params["platform"] = args["platform"] + headers = _internal_headers() + + async def _scan_one(host_label: str, host_val: str, ssh_port: str = "", + platform: str = "", model_dir: str = "") -> list: + """Hit /api/model/cached for one host; tag each returned model with its source.""" + p: Dict[str, str] = {} + if host_val: + p["host"] = host_val + # Caller-provided override beats per-server config beats nothing. + if args.get("model_dir"): + p["model_dir"] = args["model_dir"] + elif model_dir: + p["model_dir"] = model_dir + if ssh_port: + p["ssh_port"] = ssh_port + elif args.get("ssh_port"): + p["ssh_port"] = str(args["ssh_port"]) + if platform: + p["platform"] = platform + elif args.get("platform"): + p["platform"] = args["platform"] + try: + async with httpx.AsyncClient(timeout=60) as client: + resp = await client.get(f"{_COOKBOOK_BASE}/api/model/cached", + params=p, headers=headers) + data = resp.json() + ms = data.get("models", []) if isinstance(data, dict) else (data or []) + for m in ms: + m["host"] = host_label or "local" + return ms or [] + except Exception as e: + logger.debug(f"list_cached_models scan({host_label}) failed: {e}") + return [] + + # When the caller specifies a host explicitly, scan only that one (old behaviour). + # Otherwise iterate every configured server + local so the agent doesn't + # have to repeat the call per server. try: - async with httpx.AsyncClient(timeout=60) as client: - resp = await client.get(f"{_COOKBOOK_BASE}/api/model/cached", - params=params, headers=_internal_headers()) - data = resp.json() - models = data.get("models", []) if isinstance(data, dict) else data + # Pull configured servers from cookbook state (used for resolving + # modelDirs both when caller specifies a host and when we scan all). + servers: list = [] + try: + async with httpx.AsyncClient(timeout=10) as client: + st = await client.get(f"{_COOKBOOK_BASE}/api/cookbook/state", headers=headers) + st_data = st.json() if st.headers.get("content-type", "").startswith("application/json") else {} + servers = (st_data.get("env", {}) or {}).get("servers") or [] + except Exception as e: + logger.debug(f"server list fetch failed: {e}") + st_data = {} + + def _dirs_for(server_record: Dict[str, Any]) -> str: + """Comma-joined modelDirs from a saved server record (Settings). + + Filters out the HF cache (~/.cache/huggingface/hub) — the backend + scan script always scans it by default, so re-passing it as an + extra model_dir is redundant AND confuses some path-handling + edge cases where the extra dir suppresses the deeper scan. + We only need to forward the NON-default dirs (e.g. /mnt/HADES/models). + """ + mds = server_record.get("modelDirs") if isinstance(server_record, dict) else None + HF_DEFAULTS = {"~/.cache/huggingface/hub", "~/.cache/huggingface"} + if isinstance(mds, list): + extras = [d for d in mds if isinstance(d, str) and d.strip() and d.strip() not in HF_DEFAULTS] + return ",".join(extras) + if isinstance(mds, str) and mds.strip() not in HF_DEFAULTS: + return mds + return "" + + if raw_host: + host = await _resolve_cookbook_host(raw_host) + # Find this host's saved record so its modelDirs apply too. + srv = next( + (s for s in servers if isinstance(s, dict) + and (s.get("name") == raw_host or s.get("host") == host or s.get("host") == raw_host)), + {}, + ) + models = await _scan_one(raw_host, host, model_dir=_dirs_for(srv)) + else: + # Always include local. Local's saved record is the one with no host. + local_srv = next((s for s in servers if isinstance(s, dict) and not (s.get("host") or "").strip()), {}) + scans: list = [_scan_one("local", "", model_dir=_dirs_for(local_srv))] + for s in servers: + if not isinstance(s, dict): + continue + name = s.get("name") or s.get("host") + host_val = s.get("host") or "" + if not host_val: + continue + scans.append(_scan_one( + name, + host_val, + ssh_port=str(s.get("port") or ""), + platform=s.get("platform") or "", + model_dir=_dirs_for(s), + )) + results = await asyncio.gather(*scans, return_exceptions=False) + # Dedupe by (host, repo_id) — same model could appear in both HF cache + Ollama list. + seen = set() + models: list = [] + for batch in results: + for m in batch: + key = (m.get("host", ""), m.get("repo_id", "")) + if key in seen: + continue + seen.add(key) + models.append(m) if not models: - # Filesystem cache scans can miss models downloaded into the HF - # default cache when the server has no explicit model_dir configured. - # Still surface completed Cookbook downloads so the agent doesn't - # incorrectly assume a model is absent and re-download it. + # Cache scans can miss models downloaded into the HF default cache + # when the server has no explicit model_dir configured. Surface + # completed Cookbook download tasks so the agent doesn't conclude + # a model is absent and re-download it. downloaded = [] try: async with httpx.AsyncClient(timeout=10) as client: - st = await client.get(f"{_COOKBOOK_BASE}/api/cookbook/state", headers=_internal_headers()) + st = await client.get(f"{_COOKBOOK_BASE}/api/cookbook/state", headers=headers) state = st.json() if st.headers.get("content-type", "").startswith("application/json") else {} for t in (state.get("tasks") or []): if not isinstance(t, dict) or t.get("type") != "download": @@ -3654,27 +3950,44 @@ async def do_list_cached_models(content: str, owner: Optional[str] = None) -> Di if (t.get("status") or "").lower() not in {"done", "completed"}: continue task_host = t.get("remoteHost") or (t.get("payload") or {}).get("remote_host") or "" - if host and task_host != host: + if raw_host and task_host != raw_host: continue repo = t.get("modelId") or t.get("repoId") or (t.get("payload") or {}).get("repo_id") or t.get("name") if repo and repo not in downloaded: downloaded.append(repo) except Exception: downloaded = [] + host_str = f" on {raw_host}" if raw_host else "" if downloaded: - host_str = f" on {raw_host or host}" if (raw_host or host) else "" lines = [f"No cache paths were detected{host_str}, but Cookbook has completed download task(s):"] lines.extend(f"- {repo} — downloaded via Cookbook task" for repo in downloaded) return {"output": "\n".join(lines), "models": [{"repo_id": repo, "source": "cookbook_task"} for repo in downloaded], "exit_code": 0} - host_str = f" on {raw_host or host}" if (raw_host or host) else "" return {"output": f"No cached models found{host_str}.", "exit_code": 0} - lines = [f"{len(models)} cached model(s):"] - for m in models: - name = m.get("repo_id", "?") - sz = m.get("size") or (f"{m.get('size_bytes', 0) / (1024**3):.1f}GB" if m.get("size_bytes") else "") - inc = " (incomplete)" if m.get("has_incomplete") else "" - kind = " [diffusion]" if m.get("is_diffusion") else "" - lines.append(f"- {name}{kind} — {sz}{inc}") + # Multi-host scan: group by host so the agent sees inventory per server. + # Single-host scan: flat list (matches old output shape). + if raw_host: + lines = [f"{len(models)} cached model(s) on {raw_host}:"] + for m in models: + name = m.get("repo_id", "?") + sz = m.get("size") or (f"{m.get('size_bytes', 0) / (1024**3):.1f}GB" if m.get("size_bytes") else "") + inc = " (incomplete)" if m.get("has_incomplete") else "" + kind = " [diffusion]" if m.get("is_diffusion") else "" + lines.append(f"- {name}{kind} — {sz}{inc}") + else: + from collections import defaultdict as _dd + by_host = _dd(list) + for m in models: + by_host[m.get("host", "local")].append(m) + lines = [f"{len(models)} cached model(s) across {len(by_host)} server(s):"] + for host_name in sorted(by_host.keys()): + lines.append(f"\n[{host_name}]") + for m in by_host[host_name]: + name = m.get("repo_id", "?") + sz = m.get("size") or (f"{m.get('size_bytes', 0) / (1024**3):.1f}GB" if m.get("size_bytes") else "") + inc = " (incomplete)" if m.get("has_incomplete") else "" + kind = " [diffusion]" if m.get("is_diffusion") else "" + backend = f" ({m.get('backend')})" if m.get("backend") else "" + lines.append(f"- {name}{kind}{backend} — {sz}{inc}") return {"output": "\n".join(lines), "models": models, "exit_code": 0} except Exception as e: return {"error": str(e), "exit_code": 1} diff --git a/src/tool_index.py b/src/tool_index.py index 2464a81..09e2dcf 100644 --- a/src/tool_index.py +++ b/src/tool_index.py @@ -37,7 +37,15 @@ ALWAYS_AVAILABLE = frozenset({ # keyword hints when the user is actually talking about cookbook. # Keeping the always-on set small leaves room in the ~16-tool # budget for manage_tasks / manage_calendar / etc. - "list_served_models", "stop_served_model", + "list_served_models", "stop_served_model", "tail_serve_output", + # Serving is a core agent capability — keep these always available so + # the router doesn't lose them on phrasings like "servic" / "fire up" / "boot". + "serve_model", "serve_preset", "list_serve_presets", + "list_cached_models", "list_cookbook_servers", + # Fallback when serve_model's allowlist rejects a cmd or when the + # model was launched out-of-band via bash+tmux — without this the + # session is invisible to the cookbook UI even though it's running. + "adopt_served_model", # Generic API loopback — the catch-all when no named tool fits. "app_api", # Memory is ambient — "remember this" can follow any message regardless @@ -118,9 +126,10 @@ BUILTIN_TOOL_DESCRIPTIONS: Dict[str, str] = { "manage_notes": "Create and manage notes and checklists (Google Keep-style). ALWAYS use this for note/todo/checklist/reminder creation — NEVER hit /api/notes via app_api. Accepts natural-language `due_date` like 'tomorrow at 9am' or '11pm today' (parsed in the USER'S timezone). The due_date IS the reminder — it fires a notification at that time, so do NOT also create a calendar event for the same reminder. Set colors, labels, pin, archive. Do NOT use manage_memory for note content.", "manage_calendar": "Calendar event management: list, create, update, delete. Each event can carry a tag/category (event_type — work/personal/health/travel/meal/social/admin/other) and importance (low/normal/high/critical). Resolve today/tomorrow using the Current date and time context, then use ISO datetimes in the user's local wall time; supports all-day events. For event reminders/alarms, pass reminder_minutes; this creates the Notes reminder, so do not also call manage_notes for the same reminder.", "download_model": "Download a HuggingFace model to a local or remote server. Specify repo_id (e.g. 'Qwen/Qwen3-8B'), optional server host, and optional include filter for specific files.", - "serve_model": "Start serving a model with vLLM, SGLang, llama.cpp, Ollama, or Diffusers. For image/inpainting/diffusion use python3 scripts/diffusion_server.py --model --port 8100. After launch, call list_served_models for readiness/errors and retry suggestions.", + "serve_model": "Start serving a model with vLLM, SGLang, llama.cpp, Ollama, or Diffusers. cmd MUST start with the binary directly — e.g. `vllm serve /mnt/HADES/models/Qwen3.5-397B-A17B-AWQ --port 8003 --tensor-parallel-size 8 …`. NEVER prefix with `cd …`, `source …`, or chain with `&&`/`||` — those get rejected by the validator. The venv activation (env_prefix) and CUDA env are added automatically from the target host's saved settings. For image/inpainting/diffusion use python3 scripts/diffusion_server.py --model --port 8100. After launch, call list_served_models for readiness/errors and retry suggestions. If serve_model fails with 'Invalid characters in cmd', simplify to the bare binary + args.", "list_served_models": "List currently running model servers in the Cookbook — shows status (loading, ready, idle, error), model name, port, throughput, and serve failure diagnosis/retry suggestions. Use when the user asks 'what's running', 'show my cookbook', 'which models are up', 'what's serving'.", "stop_served_model": "Stop a running model server in the Cookbook by session ID or model name. Use when the user says 'kill my cookbook', 'stop the model', 'kill the serve', 'shut down vLLM', 'cancel the running model'.", + "tail_serve_output": "Read the actual tmux stderr/traceback of a cookbook serve/download task. Use to debug WHY a task is `crashed`/`error` (compute_89 nvcc mismatch, OOM, missing kernels, wrong attention backend, etc.) so you can call serve_model with adjusted flags. Pass session_id from list_served_models; tail defaults to 300, bump if the error references 'see root cause above'.", "list_downloads": "List in-progress HuggingFace model downloads in the Cookbook. Shows model name, phase, percent, session ID. Use for 'what's downloading', 'show my downloads', 'check download progress'.", "cancel_download": "Cancel an in-progress model download by tmux session ID. Use for 'cancel the download', 'stop downloading X', 'kill the download'. Call list_downloads first to get the session_id.", "search_hf_models": "Search HuggingFace for models matching a query (e.g. 'qwen 8B', 'flux', 'llama-3 instruct'). Returns ranked repo IDs with sizes and download counts. Use for 'find a model', 'search huggingface for X', 'what models are there for Y'.", diff --git a/src/tool_schemas.py b/src/tool_schemas.py index 10874ae..0db5ab1 100644 --- a/src/tool_schemas.py +++ b/src/tool_schemas.py @@ -787,6 +787,21 @@ FUNCTION_TOOL_SCHEMAS = [ } } }, + { + "type": "function", + "function": { + "name": "tail_serve_output", + "description": "Read the last N lines of a cookbook serve/download task's tmux pane. Use ONLY in this exact sequence: (1) the user asked to serve a model, (2) you launched it via serve_model, (3) list_served_models reports the NEW task as crashed/error, (4) call tail_serve_output on the new sessionId to find the root cause, (5) call serve_model again with adjusted flags. DO NOT call this on old stopped/completed download tasks — they are historical and won't tell you anything about the current attempt. DO NOT investigate past failures before launching; the environment may have changed since.", + "parameters": { + "type": "object", + "properties": { + "session_id": {"type": "string", "description": "Tmux session id from list_served_models (e.g. 'serve-abc12345', 'cookbook-a1b2c3d4')."}, + "tail": {"type": "integer", "description": "How many lines of pane scrollback to fetch (default 300, max 4000). Bump this if the error in the visible tail references an earlier line ('see root cause above')."}, + }, + "required": ["session_id"] + } + } + }, { "type": "function", "function": { diff --git a/static/index.html b/static/index.html index c5f3828..a4637d3 100644 --- a/static/index.html +++ b/static/index.html @@ -2283,6 +2283,7 @@ + diff --git a/static/js/admin.js b/static/js/admin.js index 25e3faa..5019096 100644 --- a/static/js/admin.js +++ b/static/js/admin.js @@ -731,14 +731,14 @@ function initEndpointForm() { urlInput.addEventListener('input', () => { if (provider.value && urlInput.value.trim() !== provider.value) { provider.value = ''; - if (kindSel) kindSel.value = 'proxy'; + if (kindSel) kindSel.value = 'api'; _renderPickerMenu(); _syncPickerCurrent(); } }); - if (kindSel) kindSel.value = provider.value ? 'api' : (kindSel.value || 'proxy'); + if (kindSel) kindSel.value = kindSel.value || 'api'; function _apiEndpointKind() { - return (kindSel && kindSel.value) ? kindSel.value : (provider.value ? 'api' : 'proxy'); + return (kindSel && kindSel.value) ? kindSel.value : 'api'; } function _normalizeBaseUrl(raw) { let u = raw.trim(); diff --git a/static/js/calendar.js b/static/js/calendar.js index ebd6bfc..fec9f82 100644 --- a/static/js/calendar.js +++ b/static/js/calendar.js @@ -299,13 +299,40 @@ async function _updateEvent(uid, data) { } async function _deleteEvent(uid) { - const backup = _allEvents[uid]; - delete _allEvents[uid]; + // Multiple "sibling" UIDs may need to vanish optimistically: + // 1. The exact uid the user clicked. + // 2. If the user clicked a RECURRING occurrence (uid contains "::"), + // the server deletes the master + every occurrence — so we strip + // the master uid AND every "master::*" expansion from the + // client-side caches too. Without this, deleting one day of a + // multi-day recurring task only removed THAT day visually; the + // other days kept rendering until the next full refresh. + // 3. If the user clicked the master, strip every "master::*" + // expansion (same prefix scan). + const masterUid = uid.includes('::') ? uid.split('::')[0] : uid; + const backups = {}; + const _matches = (k) => k === uid || k === masterUid || k.startsWith(masterUid + '::'); + + for (const k of Object.keys(_allEvents)) { + if (_matches(k)) { + backups[k] = _allEvents[k]; + delete _allEvents[k]; + } + } + if (Array.isArray(_events)) { + _events = _events.filter(e => !(e && _matches(e.uid || ''))); + } + if (_open) _render(); + _updateBadge && _updateBadge(); const isRecurring = uid.includes('::'); fetch(`${API_BASE}/api/calendar/events/${encodeURIComponent(uid)}`, { method: 'DELETE', credentials: 'same-origin', }).then(r => { - if (!r.ok) throw new Error('HTTP ' + r.status); + // 404 = the event was already deleted by another session/device. That's + // exactly the state we want, so treat it as success — don't restore the + // row, otherwise the user can never clear stale cached events that were + // deleted from desktop while mobile was open (and vice versa). + if (!r.ok && r.status !== 404) throw new Error('HTTP ' + r.status); if (isRecurring) { _fetchedRanges = []; localStorage.removeItem(LS_KEY); @@ -313,7 +340,11 @@ async function _deleteEvent(uid) { _saveCache && _saveCache(); } }).catch((e) => { - if (backup) _allEvents[uid] = backup; + // Server rejected — restore every uid we optimistically stripped. + for (const [k, ev] of Object.entries(backups)) { + _allEvents[k] = ev; + if (Array.isArray(_events)) _events.push(ev); + } if (window.uiModule) window.uiModule.showError('Failed to delete event: ' + (e?.message || 'unknown')); if (_open) _render(); }); @@ -980,7 +1011,39 @@ async function _renderMonth() { const startColInt = Math.round(startCol); const endColInt = Math.round(endCol); const span = endColInt - startColInt + 1; - h += `
${_e(md.summary)}
`; + // Proportional offsets for timed events that span across midnight + // (e.g. 8 PM Mon → 5 AM Tue). Without this, an overnight serve + // window visually fills the ENTIRE next day even when it only + // covers a few hours. All-day events keep the full-day shape. + // Bar visually spans from column (col+startFrac) to (col+span-1+endFrac), + // so a 8 PM→5 AM run shows ~17% of day 1 + ~21% of day 2, not 200%. + let startFrac = 0; + let endFrac = 1; + if (!md.all_day) { + try { + const sIso = md.dtstart || ''; + const eIso = md.dtend || ''; + const sDate = sIso ? new Date(sIso) : null; + const eDate = eIso ? new Date(eIso) : null; + // First-visible-day fraction (0 = midnight start). Clamp to 0 + // when the event started before this row, so the bar still + // starts at the row's left edge. + if (sDate && !isNaN(sDate) && mdStart >= rowStart) { + const midnight = new Date(sDate); midnight.setHours(0, 0, 0, 0); + startFrac = Math.max(0, Math.min(1, (sDate - midnight) / 86400000)); + } + if (eDate && !isNaN(eDate) && mdEnd <= rowEnd) { + const midnight = new Date(eDate); midnight.setHours(0, 0, 0, 0); + endFrac = Math.max(0, Math.min(1, (eDate - midnight) / 86400000)); + // CalDAV end-times are exclusive: an event ending at exactly + // 00:00 on day N really ended at end-of-day N-1, so endFrac=0 + // would visually paint a zero-width slice. Snap to a small + // visible minimum (5% of a day) so the bar still registers. + if (endFrac === 0) endFrac = 1; + } + } catch (_) { startFrac = 0; endFrac = 1; } + } + h += `
${_e(md.summary)}
`; barSlot++; } h += ''; @@ -2688,6 +2751,28 @@ function _showEventForm(existing, defaultDate, defaultEndDate) { + ${(() => { + // Cookbook-task back-link. When the description carries a + // "cookbook_task_id: " marker (set by cookbookSchedule.js + // when the user ticks "Create event in calendar"), render an + // Open-task button so the user can jump straight to the + // source task in the Tasks tab. + const _ct = (existing?.description || '').match(/cookbook_task_id:\s*([A-Za-z0-9_-]+)/); + if (!_ct) return ''; + return ``; + })()}