219 lines
8.2 KiB
Python
Executable File
219 lines
8.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""odysseus-backup — snapshot + restore of the data directory.
|
|
|
|
Backs up everything the app keeps under `data/`: the SQLite DB, the
|
|
Fernet key, JSON state files, RAG indexes, personal docs, attachments,
|
|
WhatsApp session, etc. Output is a gzip tarball — composable with cron
|
|
+ scp + s3cmd. The backup uses `sqlite3 .backup` for the DB so the
|
|
app can keep running during the snapshot.
|
|
|
|
odysseus-backup snapshot # → backups/odysseus-backup-YYYYMMDD-HHMMSS.tar.gz
|
|
odysseus-backup snapshot --out /mnt/nas/x.tgz
|
|
odysseus-backup list # entries in backups/
|
|
odysseus-backup restore PATH [--yes] # overwrite current data/ from a tarball
|
|
odysseus-backup verify PATH # tarball integrity check (no extract)
|
|
|
|
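Restore is destructive: it replaces `data/` with the tarball's contents
(the previous `data/` is renamed to `data.before-restore-<timestamp>`).
It refuses to run unless you pass `--yes`, so a typo can't nuke your
live state.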
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
import sys
|
|
import os, sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "_lib"))
|
|
from cli import quiet_logs, emit, fail, common_parser, run, REPO_ROOT as _REPO_ROOT
|
|
quiet_logs()
|
|
|
|
import argparse, json, logging, os, sqlite3, subprocess, sys, tarfile, tempfile
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Directory that gets snapshotted / restored, and the default output
# directory for snapshots. Both live directly under the repo root.
_DATA_DIR = _REPO_ROOT / "data"
_BACKUP_DIR = _REPO_ROOT / "backups"

# Stuff inside data/ that we explicitly skip — anything we can re-derive
# from the SQLite DB + JSON state. Keeps the tarball small.
#
# NOTE(review): nothing in this file reads _SKIP_PATTERNS. cmd_snapshot
# hard-codes its own checks for "deep_research" and "mail-attachments" and
# never skips "personal_uploads", so the last entry below does not match
# what the snapshot actually does — confirm intent before wiring this set in.
_SKIP_PATTERNS = {
    "mail-attachments",  # cached IMAP attachment extractions
    "deep_research",     # research runs are large; back up explicitly via --include-research
    "personal_uploads",  # uploaded files; listed here, but cmd_snapshot includes them
}
|
|
|
|
|
|
def _sqlite_safe_copy(src: Path, dst: Path) -> None:
|
|
"""Use SQLite's `.backup` API instead of a file copy so a write
|
|
in-flight doesn't corrupt the snapshot. Falls back to plain copy
|
|
if the file isn't a SQLite DB."""
|
|
try:
|
|
src_conn = sqlite3.connect(str(src))
|
|
dst_conn = sqlite3.connect(str(dst))
|
|
with dst_conn:
|
|
src_conn.backup(dst_conn)
|
|
src_conn.close()
|
|
dst_conn.close()
|
|
except Exception:
|
|
# Not a SQLite DB or backup unsupported — fall back.
|
|
dst.write_bytes(src.read_bytes())
|
|
|
|
|
|
def cmd_snapshot(args):
    """Write a tar.gz of the data/ directory.

    Every ``*.db`` file under ``data/`` is first copied into a temporary
    directory with ``_sqlite_safe_copy``; the staged copy is archived in
    place of the live file, so a running web app can't corrupt the
    snapshot. The ``-wal`` / ``-shm`` / ``-journal`` sidecar files of those
    databases are left out: the staged copy is already self-consistent, and
    restoring a stale journal next to it could be replayed into it.
    Everything else is archived as-is, under ``data/...`` member names.

    Reads ``args.out`` (optional output path), ``args.include_research``
    and ``args.include_attachments``; reports a summary through ``emit``.
    Calls ``fail`` when there is no data directory (``fail`` is assumed
    not to return).
    """
    if not _DATA_DIR.is_dir():
        fail(f"no data dir at {_DATA_DIR}")
    _BACKUP_DIR.mkdir(parents=True, exist_ok=True)
    out_path = Path(args.out) if args.out else (
        _BACKUP_DIR / f"odysseus-backup-{datetime.now().strftime('%Y%m%d-%H%M%S')}.tar.gz"
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Used to keep the archive from swallowing itself when --out points
    # somewhere inside data/.
    out_resolved = out_path.resolve()

    sqlite_dbs = [p for p in _DATA_DIR.rglob("*.db") if p.is_file()]
    # Journal/WAL files that belong to the live databases, not to the
    # staged snapshots that replace them in the archive.
    sidecars = {
        db.with_name(db.name + suffix)
        for db in sqlite_dbs
        for suffix in ("-wal", "-shm", "-journal")
    }
    files_added = 0
    total_bytes = 0

    with tempfile.TemporaryDirectory() as tmp_str:
        tmp = Path(tmp_str)
        # Snapshot SQLite DBs to the temp dir first. The staged files must
        # outlive the tar loop below, hence the nesting.
        db_map: dict[Path, Path] = {}
        for db in sqlite_dbs:
            staged = tmp / db.relative_to(_DATA_DIR)
            staged.parent.mkdir(parents=True, exist_ok=True)
            _sqlite_safe_copy(db, staged)
            db_map[db] = staged

        with tarfile.open(out_path, "w:gz") as tar:
            for p in sorted(_DATA_DIR.rglob("*")):
                if not p.is_file():
                    continue
                # Member names keep the leading "data/" component.
                rel = p.relative_to(_DATA_DIR.parent)
                # Skip user-asked-to-skip categories
                if not args.include_research and "deep_research" in rel.parts:
                    continue
                if not args.include_attachments and "mail-attachments" in rel.parts:
                    continue
                if p in sidecars:
                    continue
                if p.resolve() == out_resolved:
                    continue
                # Substitute SQLite snapshots for the live files
                source = db_map.get(p, p)
                tar.add(source, arcname=str(rel))
                files_added += 1
                try:
                    total_bytes += source.stat().st_size
                except OSError:
                    # Size accounting is best-effort; the file is already
                    # in the archive at this point.
                    pass

    compressed_bytes = out_path.stat().st_size
    emit({
        "ok": True,
        "path": str(out_path),
        "files": files_added,
        "uncompressed_bytes": total_bytes,
        "compressed_bytes": compressed_bytes,
        # max(..., 1) avoids dividing by zero for an empty snapshot.
        "ratio": round(compressed_bytes / max(total_bytes, 1), 4),
        "included_research": bool(args.include_research),
        "included_attachments": bool(args.include_attachments),
    }, args)
|
|
|
|
|
|
def cmd_list(args):
    """Emit one record per regular file in `backups/`, newest first.

    Each record carries the file's full path, its name, its size in bytes
    and its modification time as an ISO-8601 string. An empty list is
    emitted when the backups directory does not exist.
    """
    if not _BACKUP_DIR.is_dir():
        emit([], args)
        return

    def _mtime(entry):
        return entry.stat().st_mtime

    # Order every directory entry by modification time first; anything
    # that is not a regular file is dropped afterwards.
    newest_first = sorted(_BACKUP_DIR.iterdir(), key=_mtime, reverse=True)
    listing = [
        {
            "path": str(item),
            "name": item.name,
            "bytes": item.stat().st_size,
            "modified": datetime.fromtimestamp(item.stat().st_mtime).isoformat(),
        }
        for item in newest_first
        if item.is_file()
    ]
    emit(listing, args)
|
|
|
|
|
|
def cmd_verify(args):
    """Check a backup tarball's integrity without extracting anything.

    The tar member headers are walked through a gzip reader, and the gzip
    stream is then drained to its end so the gzip trailer (CRC-32 and
    length of the decompressed data) is checked as well — walking the
    headers alone stops at the tar end-of-archive marker and never reaches
    that trailer.

    Reads ``args.path``. Emits the member count plus the first and last
    member names on success; calls ``fail`` when the file is missing or
    the archive cannot be read to the end (``fail`` is assumed not to
    return).
    """
    import gzip
    import zlib

    path = Path(args.path)
    if not path.exists():
        fail(f"no file at {path}")
    try:
        with gzip.open(path, "rb") as gz:
            # mode "r:" = plain tar; decompression is handled by `gz`.
            with tarfile.open(fileobj=gz, mode="r:") as tar:
                members = tar.getmembers()
            # Read whatever follows the last header so gzip validates its
            # trailer; a truncated or bit-flipped stream raises here.
            while gz.read(1 << 20):
                pass
    except (tarfile.TarError, OSError, EOFError, zlib.error) as e:
        # EOFError: truncated gzip stream; zlib.error: corrupt deflate data.
        fail(f"tarball is corrupt: {e}")
    emit({
        "ok": True,
        "path": str(path),
        "members": len(members),
        "first": members[0].name if members else None,
        "last": members[-1].name if members else None,
    }, args)
|
|
|
|
|
|
def _restore_member_problem(member):
    """Return a refusal message if tar ``member`` is unsafe to restore, else ``None``.

    A member is accepted only when its name is a relative path under
    ``data/`` and it is a regular file, a directory, or a link whose target
    also stays under ``data/``.
    """
    import posixpath  # tar member names always use "/" separators

    name = member.name
    if name.startswith("/") or ".." in Path(name).parts:
        return f"refusing tarball with absolute/parent path: {name!r}"
    if not name.startswith("data/") and name != "data":
        return f"refusing tarball with entry outside data/: {name!r}"
    if member.issym() or member.islnk():
        # Symlink targets are relative to the link's own directory; hard
        # link targets are relative to the archive root.
        base = posixpath.dirname(name) if member.issym() else ""
        target = posixpath.normpath(posixpath.join(base, member.linkname))
        if target != "data" and not target.startswith("data/"):
            return (
                "refusing tarball with link pointing outside data/: "
                f"{name!r} -> {member.linkname!r}"
            )
    elif not (member.isfile() or member.isdir()):
        # Devices, FIFOs, etc. — cmd_snapshot only ever archives files.
        return f"refusing tarball with special-file entry: {name!r}"
    return None


def cmd_restore(args):
    """Overwrite `data/` from a tarball. Destructive; requires --yes.

    Steps:
      1. Refuse unless ``args.path`` exists and ``args.yes`` is set.
      2. Scan every member and refuse the whole tarball if any entry could
         land outside ``data/`` (absolute/parent paths, escaping links,
         special files). This happens before anything on disk is touched.
      3. Rename the current ``data/`` (if any) to
         ``data.before-restore-<timestamp>`` as a safety copy.
      4. Extract into the repo root, using tarfile's ``data`` extraction
         filter when the running Python provides it.

    Emits the tarball path and the stash location (``None`` when there was
    no previous ``data/``). ``fail`` is assumed not to return.
    """
    path = Path(args.path)
    if not path.exists():
        fail(f"no file at {path}")
    if not args.yes:
        fail("restore is destructive — pass --yes to confirm overwriting data/")

    # Stays None when there was no data/ to move aside; the final report
    # must not depend on data/ existing *after* the extraction.
    stash = None
    with tarfile.open(path, "r:gz") as tar:
        # Sanity check: tarball entries must all be under `data/`. If anyone
        # crafted a malicious tarball with `../etc/passwd`, refuse.
        for m in tar.getmembers():
            problem = _restore_member_problem(m)
            if problem is not None:
                fail(problem)
        # Save a safety copy of current data/ before extracting.
        if _DATA_DIR.exists():
            stash = _REPO_ROOT / f"data.before-restore-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            os.rename(_DATA_DIR, stash)
        # Extraction filters are not available on every Python release the
        # script may run on, so only request one when tarfile has it.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        try:
            tar.extractall(path=_REPO_ROOT, **extract_kwargs)
        except Exception as e:
            hint = f" (previous data/ kept at {stash})" if stash is not None else ""
            fail(f"extract failed: {e}{hint}")
    emit({
        "ok": True,
        "restored_from": str(path),
        "previous_data_stashed_at": str(stash) if stash is not None else None,
    }, args)
|
|
|
|
|
|
def _build_parser():
    """Assemble the `odysseus-backup` argument parser.

    `--pretty` is defined on a shared parent parser so it is accepted both
    before and after the subcommand name. Each subcommand stores its
    handler in `func`.
    """
    shared = argparse.ArgumentParser(add_help=False)
    shared.add_argument("--pretty", action="store_true")

    root = argparse.ArgumentParser(prog="odysseus-backup", parents=[shared])
    commands = root.add_subparsers(dest="cmd", required=True)

    def add_command(name, handler):
        # Every subcommand inherits --pretty and records its handler.
        command = commands.add_parser(name, parents=[shared])
        command.set_defaults(func=handler)
        return command

    snapshot = add_command("snapshot", cmd_snapshot)
    snapshot.add_argument("--out", help="output path (default: backups/<timestamp>.tar.gz)")
    snapshot.add_argument(
        "--include-research",
        action="store_true",
        help="include data/deep_research/ (skipped by default; large)",
    )
    snapshot.add_argument(
        "--include-attachments",
        action="store_true",
        help="include data/mail-attachments/ (skipped by default; re-derivable)",
    )

    add_command("list", cmd_list)

    verify = add_command("verify", cmd_verify)
    verify.add_argument("path")

    restore = add_command("restore", cmd_restore)
    restore.add_argument("path")
    restore.add_argument(
        "--yes",
        action="store_true",
        help="confirm overwriting current data/",
    )

    return root
|
|
|
|
|
|
# Script entry point: build the CLI parser and hand it to run() (imported
# from the local `cli` helper module); whatever run() returns is passed to
# sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(run(_build_parser()))
|