#!/usr/bin/env python3 """odysseus-backup — snapshot + restore of the data directory. Backs up everything the app keeps under `data/`: the SQLite DB, the Fernet key, JSON state files, RAG indexes, personal docs, attachments, WhatsApp session, etc. Output is a gzip tarball — composable with cron + scp + s3cmd. The backup uses `sqlite3 .backup` for the DB so the app can keep running during the snapshot. odysseus-backup snapshot # → backups/YYYY-MM-DD-HHMMSS.tar.gz odysseus-backup snapshot --out /mnt/nas/x.tgz odysseus-backup list # entries in backups/ odysseus-backup restore PATH [--yes] # overwrite current data/ from a tarball odysseus-backup verify PATH # tarball integrity check (no extract) Restore is destructive: it overwrites `data/` in place. Always pass `--yes` so a typo can't nuke your live state. """ from __future__ import annotations import sys import os, sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "_lib")) from cli import quiet_logs, emit, fail, common_parser, run, REPO_ROOT as _REPO_ROOT quiet_logs() import argparse, json, logging, os, shutil, sqlite3, subprocess, sys, tarfile, tempfile from datetime import datetime from pathlib import Path, PurePosixPath _DATA_DIR = _REPO_ROOT / "data" _BACKUP_DIR = _REPO_ROOT / "backups" # Stuff inside data/ that we explicitly skip — anything we can re-derive # from the SQLite DB + JSON state. Keeps the tarball small. _SKIP_PATTERNS = { "mail-attachments", # cached IMAP attachment extractions "deep_research", # research runs are large; back up explicitly via --include-research "personal_uploads", # uploaded files; usually wanted, included by default actually } def _sqlite_safe_copy(src: Path, dst: Path) -> None: """Use SQLite's `.backup` API instead of a file copy so a write in-flight doesn't corrupt the snapshot. Falls back to plain copy if the file isn't a SQLite DB.""" try: src_conn = sqlite3.connect(str(src)) dst_conn = sqlite3.connect(str(dst)) with dst_conn: src_conn.backup(dst_conn) src_conn.close() dst_conn.close() except Exception: # Not a SQLite DB or backup unsupported — fall back. dst.write_bytes(src.read_bytes()) def cmd_snapshot(args): """Write a tar.gz of the entire data/ directory. SQLite databases are dumped via .backup() into a temp file before being added to the tar, so a running web app can't corrupt the snapshot. Everything else is copied as-is.""" if not _DATA_DIR.is_dir(): fail(f"no data dir at {_DATA_DIR}") _BACKUP_DIR.mkdir(parents=True, exist_ok=True) out_path = Path(args.out) if args.out else ( _BACKUP_DIR / f"odysseus-backup-{datetime.now().strftime('%Y%m%d-%H%M%S')}.tar.gz" ) out_path.parent.mkdir(parents=True, exist_ok=True) sqlite_dbs = [p for p in _DATA_DIR.rglob("*.db") if p.is_file() and not p.is_symlink()] files_added = 0 total_bytes = 0 with tempfile.TemporaryDirectory() as tmp_str: tmp = Path(tmp_str) # Snapshot SQLite DBs to the temp dir first. db_map: dict[Path, Path] = {} for db in sqlite_dbs: rel = db.relative_to(_DATA_DIR) staged = tmp / rel staged.parent.mkdir(parents=True, exist_ok=True) _sqlite_safe_copy(db, staged) db_map[db] = staged with tarfile.open(out_path, "w:gz") as tar: for p in sorted(_DATA_DIR.rglob("*")): if not p.is_file() or p.is_symlink(): continue rel = p.relative_to(_DATA_DIR.parent) # Skip user-asked-to-skip categories if not args.include_research and "deep_research" in rel.parts: continue if not args.include_attachments and "mail-attachments" in rel.parts: continue # Substitute SQLite snapshots for the live files source = db_map.get(p, p) tar.add(source, arcname=str(rel)) files_added += 1 try: total_bytes += source.stat().st_size except Exception: pass emit({ "ok": True, "path": str(out_path), "files": files_added, "uncompressed_bytes": total_bytes, "compressed_bytes": out_path.stat().st_size, "ratio": round(out_path.stat().st_size / max(total_bytes, 1), 4), "included_research": bool(args.include_research), "included_attachments": bool(args.include_attachments), }, args) def cmd_list(args): """List entries in `backups/`. Most recent first.""" if not _BACKUP_DIR.is_dir(): emit([], args) return entries = [] for p in sorted(_BACKUP_DIR.iterdir(), key=lambda x: x.stat().st_mtime, reverse=True): if not p.is_file(): continue entries.append({ "path": str(p), "name": p.name, "bytes": p.stat().st_size, "modified": datetime.fromtimestamp(p.stat().st_mtime).isoformat(), }) emit(entries, args) def cmd_verify(args): """Open the tarball read-only and walk its members — confirms integrity without extracting anything.""" path = Path(args.path) if not path.exists(): fail(f"no file at {path}") try: with tarfile.open(path, "r:gz") as tar: members = tar.getmembers() _validate_restore_members(members) except (tarfile.TarError, OSError) as e: fail(f"tarball is corrupt: {e}") emit({ "ok": True, "path": str(path), "members": len(members), "first": members[0].name if members else None, "last": members[-1].name if members else None, }, args) def _validate_restore_members(members): """Reject archive entries that can escape data/ during restore.""" for m in members: rel = PurePosixPath(m.name) if rel.is_absolute() or ".." in rel.parts: fail(f"refusing tarball with absolute/parent path: {m.name!r}") if not rel.parts or rel.parts[0] != "data": fail(f"refusing tarball with entry outside data/: {m.name!r}") if m.issym() or m.islnk(): fail(f"refusing tarball with link entry: {m.name!r}") if not (m.isdir() or m.isfile()): fail(f"refusing tarball with special file entry: {m.name!r}") def _extract_restore_members(tar, members, root: Path) -> None: """Extract only regular files/directories after validation.""" for m in members: target = root.joinpath(*PurePosixPath(m.name).parts) if m.isdir(): target.mkdir(parents=True, exist_ok=True) continue target.parent.mkdir(parents=True, exist_ok=True) src = tar.extractfile(m) if src is None: fail(f"extract failed: could not read {m.name!r}") with src, open(target, "wb") as dst: shutil.copyfileobj(src, dst) def cmd_restore(args): """Overwrite `data/` from a tarball. Destructive; requires --yes.""" path = Path(args.path) if not path.exists(): fail(f"no file at {path}") if not args.yes: fail("restore is destructive — pass --yes to confirm overwriting data/") # Sanity check: tarball entries must all be safe, regular files/dirs under # `data/`. Avoid extractall() so symlink/hardlink entries can't redirect a # later write outside the repo. stash = None with tarfile.open(path, "r:gz") as tar: members = tar.getmembers() _validate_restore_members(members) # Save a safety copy of current data/ before extracting. if _DATA_DIR.exists() or _DATA_DIR.is_symlink(): stash = _REPO_ROOT / f"data.before-restore-{datetime.now().strftime('%Y%m%d-%H%M%S')}" os.rename(_DATA_DIR, stash) try: _extract_restore_members(tar, members, _REPO_ROOT) except Exception as e: fail(f"extract failed: {e}") emit({ "ok": True, "restored_from": str(path), "previous_data_stashed_at": str(stash) if stash else None, }, args) def _build_parser(): common = argparse.ArgumentParser(add_help=False) common.add_argument("--pretty", action="store_true") p = argparse.ArgumentParser(prog="odysseus-backup", parents=[common]) sub = p.add_subparsers(dest="cmd", required=True) ps = sub.add_parser("snapshot", parents=[common]) ps.add_argument("--out", help="output path (default: backups/.tar.gz)") ps.add_argument("--include-research", action="store_true", help="include data/deep_research/ (skipped by default; large)") ps.add_argument("--include-attachments", action="store_true", help="include data/mail-attachments/ (skipped by default; re-derivable)") ps.set_defaults(func=cmd_snapshot) pl = sub.add_parser("list", parents=[common]) pl.set_defaults(func=cmd_list) pv = sub.add_parser("verify", parents=[common]) pv.add_argument("path") pv.set_defaults(func=cmd_verify) pr = sub.add_parser("restore", parents=[common]) pr.add_argument("path") pr.add_argument("--yes", action="store_true", help="confirm overwriting current data/") pr.set_defaults(func=cmd_restore) return p if __name__ == "__main__": sys.exit(run(_build_parser()))