fix: backups resilience improvements (#5555)

* fix: backups

* fix: stability
This commit is contained in:
Calum H.
2026-03-13 19:10:13 +00:00
committed by GitHub
parent 455a4f527d
commit c44cc38b3a
9 changed files with 60 additions and 144 deletions

View File

@@ -483,7 +483,6 @@ const isServerRunning = computed(() => serverPowerState.value === 'running')
const serverPowerState = ref<Archon.Websocket.v0.PowerState>('stopped') const serverPowerState = ref<Archon.Websocket.v0.PowerState>('stopped')
const powerStateDetails = ref<{ oom_killed?: boolean; exit_code?: number }>() const powerStateDetails = ref<{ oom_killed?: boolean; exit_code?: number }>()
const backupsState = reactive(new Map()) const backupsState = reactive(new Map())
const completedBackupTasks = new Set<string>()
const cancelledBackups = new Set<string>() const cancelledBackups = new Set<string>()
const markBackupCancelled = (backupId: string) => { const markBackupCancelled = (backupId: string) => {
@@ -853,28 +852,22 @@ const handleAuthIncorrect = () => {
} }
const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) => { const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) => {
// Ignore 'file' task events - these are per-file progress updates sent continuously if (data.task === 'file') return
if (data.task === 'file') {
return
}
const backupId = data.id const backupId = data.id
const taskKey = `${backupId}:${data.task}`
if (completedBackupTasks.has(taskKey)) { if (cancelledBackups.has(backupId)) return
return
}
if (cancelledBackups.has(backupId)) {
return
}
const current = backupsState.get(backupId) ?? {} const current = backupsState.get(backupId) ?? {}
const previousState = current[data.task]?.state const currentTaskState = current[data.task]?.state
const previousProgress = current[data.task]?.progress const isIncomingTerminal =
data.state === 'done' || data.state === 'failed' || data.state === 'cancelled'
if (previousState !== data.state || previousProgress !== data.progress) { // Skip duplicate terminal events, but allow retries (terminal → ongoing)
// (mutating same reference doesn't work) if (currentTaskState === data.state && isIncomingTerminal) return
const previousProgress = current[data.task]?.progress
if (currentTaskState !== data.state || previousProgress !== data.progress) {
backupsState.set(backupId, { backupsState.set(backupId, {
...current, ...current,
[data.task]: { [data.task]: {
@@ -884,11 +877,7 @@ const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) =
}) })
} }
const isTerminalState = if (isIncomingTerminal) {
data.state === 'done' || data.state === 'failed' || data.state === 'cancelled'
if (isTerminalState) {
completedBackupTasks.add(taskKey)
const attemptCleanup = (attempt: number = 1) => { const attemptCleanup = (attempt: number = 1) => {
queryClient.invalidateQueries({ queryKey: ['backups', 'list', serverId] }).then(() => { queryClient.invalidateQueries({ queryKey: ['backups', 'list', serverId] }).then(() => {
const backupData = queryClient.getQueryData<Archon.Backups.v1.Backup[]>([ const backupData = queryClient.getQueryData<Archon.Backups.v1.Backup[]>([
@@ -897,12 +886,31 @@ const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) =
serverId, serverId,
]) ])
const backup = backupData?.find((b) => b.id === backupId) const backup = backupData?.find((b) => b.id === backupId)
const isStillActive =
backup && (backup.status === 'in_progress' || backup.status === 'pending')
if (backup?.ongoing && attempt < 3) { if (isStillActive && attempt < 6) {
// retry 3 times max, archon is slow compared to ws state setTimeout(() => attemptCleanup(attempt + 1), 1000 * Math.pow(2, attempt - 1))
setTimeout(() => attemptCleanup(attempt + 1), 1000)
return return
} }
if (isStillActive) {
queryClient.setQueryData<Archon.Backups.v1.Backup[]>(
['backups', 'list', serverId],
(old) =>
old?.map((b) => {
if (b.id !== backupId) return b
return {
...b,
status: data.state === 'done' ? ('done' as const) : ('error' as const),
ongoing: false,
interrupted: data.state === 'failed',
}
}),
)
}
backupsState.delete(backupId)
}) })
} }
@@ -1325,7 +1333,6 @@ const cleanup = () => {
isReconnecting.value = false isReconnecting.value = false
isLoading.value = true isLoading.value = true
completedBackupTasks.clear()
cancelledBackups.clear() cancelledBackups.clear()
clearNodeAuthState() clearNodeAuthState()

View File

@@ -426,6 +426,7 @@ export namespace Archon {
export namespace v1 { export namespace v1 {
export type BackupState = 'ongoing' | 'done' | 'failed' | 'cancelled' | 'unchanged' export type BackupState = 'ongoing' | 'done' | 'failed' | 'cancelled' | 'unchanged'
export type BackupTask = 'file' | 'create' | 'restore' export type BackupTask = 'file' | 'create' | 'restore'
export type BackupStatus = 'pending' | 'in_progress' | 'timed_out' | 'error' | 'done'
export type BackupTaskProgress = { export type BackupTaskProgress = {
progress: number // 0.0 to 1.0 progress: number // 0.0 to 1.0
@@ -438,6 +439,7 @@ export namespace Archon {
name: string name: string
created_at: string created_at: string
automated: boolean automated: boolean
status: BackupStatus
interrupted: boolean interrupted: boolean
ongoing: boolean ongoing: boolean
locked: boolean locked: boolean

View File

@@ -50,11 +50,13 @@ const props = withDefaults(
const backupQueued = computed( const backupQueued = computed(
() => () =>
props.backup.status === 'pending' ||
props.backup.task?.create?.progress === 0 || props.backup.task?.create?.progress === 0 ||
(props.backup.ongoing && !props.backup.task?.create), (props.backup.status === 'in_progress' && !props.backup.task?.create),
)
const failedToCreate = computed(
() => props.backup.status === 'error' || props.backup.status === 'timed_out',
) )
// const automated = computed(() => props.backup.automated)
const failedToCreate = computed(() => props.backup.interrupted)
const inactiveStates = ['failed', 'cancelled', 'done'] const inactiveStates = ['failed', 'cancelled', 'done']
@@ -64,11 +66,11 @@ const creating = computed(() => {
return task return task
} }
if (props.backup.ongoing && !props.backup.task?.restore) { if (
return { (props.backup.status === 'in_progress' || props.backup.status === 'pending') &&
progress: 0, !props.backup.task?.restore
state: 'ongoing', ) {
} return { progress: 0, state: 'ongoing' as const }
} }
return undefined return undefined
}) })
@@ -78,13 +80,6 @@ const restoring = computed(() => {
if (task && task.progress < 1 && !inactiveStates.includes(task.state)) { if (task && task.progress < 1 && !inactiveStates.includes(task.state)) {
return task return task
} }
if (props.backup.ongoing && props.backup.task?.restore) {
return {
progress: 0,
state: 'ongoing',
}
}
return undefined return undefined
}) })

View File

@@ -86,13 +86,16 @@ export function useInlineBackup(backupName: string | (() => string)) {
if (!entry?.create) return if (!entry?.create) return
if (entry.create.state === 'done') { if (entry.create.state === 'done') {
stopPolling()
isBackingUp.value = false isBackingUp.value = false
backupComplete.value = true backupComplete.value = true
} else if (entry.create.state === 'cancelled') { } else if (entry.create.state === 'cancelled') {
stopPolling()
isBackingUp.value = false isBackingUp.value = false
isCancelling.value = false isCancelling.value = false
backupCancelled.value = true backupCancelled.value = true
} else if (entry.create.state === 'failed') { } else if (entry.create.state === 'failed') {
stopPolling()
isBackingUp.value = false isBackingUp.value = false
backupFailed.value = true backupFailed.value = true
} }
@@ -118,11 +121,13 @@ export function useInlineBackup(backupName: string | (() => string)) {
try { try {
const backup = await client.archon.backups_v1.get(serverId, worldId.value!, backupId) const backup = await client.archon.backups_v1.get(serverId, worldId.value!, backupId)
const isTerminal =
backup.status === 'done' || backup.status === 'error' || backup.status === 'timed_out'
if (!backup.ongoing) { if (isTerminal) {
stopPolling() stopPolling()
if (!isBackingUp.value) return
if (backup.interrupted) { if (backup.status === 'error' || backup.status === 'timed_out') {
isBackingUp.value = false isBackingUp.value = false
backupFailed.value = true backupFailed.value = true
} else { } else {

View File

@@ -221,7 +221,11 @@ const backups = computed(() => {
...backup.task, ...backup.task,
...progressState, ...progressState,
}, },
status: hasOngoingTask
? ('in_progress' as const)
: hasCompletedTask
? ('done' as const)
: backup.status,
ongoing: hasOngoingTask || (backup.ongoing && !hasCompletedTask), ongoing: hasOngoingTask || (backup.ongoing && !hasCompletedTask),
} }
} }
@@ -304,8 +308,8 @@ const backupCreationDisabled = computed(() => {
if (busyReasons.value.length > 0) { if (busyReasons.value.length > 0) {
return formatMessage(busyReasons.value[0].reason) return formatMessage(busyReasons.value[0].reason)
} }
// also check API data for ongoing backups (before ws fires) // also check for active backups, combining REST data with WS overlay
if (backupsData.value?.some((backup) => backup.ongoing)) { if (backups.value.some((b) => b.status === 'in_progress' || b.status === 'pending')) {
return 'A backup is already in progress' return 'A backup is already in progress'
} }
return undefined return undefined

View File

@@ -1,27 +0,0 @@
import type { WSBackupState, WSBackupTask } from './websocket'
/** Progress snapshot recorded for a single backup task. */
type BackupTaskEntry = {
  progress: number
  state: WSBackupState
}

/**
 * A backup record plus optional per-task progress.
 *
 * `task` may hold one entry per `WSBackupTask` key; every key is optional, so
 * an empty object is a valid value.
 */
export interface Backup {
  id: string
  name: string
  created_at: string
  automated: boolean
  interrupted: boolean
  ongoing: boolean
  task: Partial<Record<WSBackupTask, BackupTaskEntry>>
}
/**
 * Settings object for automatic backups.
 *
 * NOTE(review): the unit of `interval` is not visible here — confirm against the API.
 */
export interface AutoBackupSettings {
  interval: number
  enabled: boolean
}
/**
 * Minimal backup descriptor: identifier, display name and creation timestamp.
 *
 * NOTE(review): `created_at` is typed as a plain string; its format is not
 * shown here — confirm before parsing it.
 */
export interface ServerBackup {
  id: string
  created_at: string
  name: string
}

View File

@@ -1,8 +1,3 @@
import type { Project } from '../../types'
import type { ServerBackup } from './backup'
import type { Mod } from './content'
import type { Allocation } from './server'
export type ServerNotice = { export type ServerNotice = {
id: number id: number
message: string message: string
@@ -21,42 +16,3 @@ export type ServerNotice = {
dismissed_on: string dismissed_on: string
}[] }[]
} }
/** Address details of a server, including its list of allocations. */
type ServerNetwork = {
  ip: string
  port: number
  domain: string
  allocations: Allocation[]
}

/** Upstream reference of a server; `'modpack'` is the only declared kind. */
type ServerUpstream = {
  kind: 'modpack'
  project_id: string
  version_id: string
}

/** Flow flags of a server; `intro` may be absent. */
type ServerFlows = {
  intro?: boolean
}

/**
 * Full server record.
 *
 * The loader, version, project, suspension and image fields are nullable.
 * `upstream`, `is_medal` and `medal_expires` may be omitted entirely.
 */
export interface Server {
  server_id: string
  name: string
  status: string
  net: ServerNetwork
  game: string
  loader: string | null
  loader_version: string | null
  mc_version: string | null
  backup_quota: number
  used_backup_quota: number
  backups: ServerBackup[]
  mods: Mod[]
  project: Project | null
  suspension_reason: string | null
  image: string | null
  upstream?: ServerUpstream
  motd: string
  flows: ServerFlows
  is_medal?: boolean
  medal_expires?: string
}
/** Wrapper object that carries a list of `Server` records under `servers`. */
export interface Servers {
servers: Server[]
}

View File

@@ -1,5 +1,4 @@
export * from './api' export * from './api'
export * from './backup'
export * from './common' export * from './common'
export * from './content' export * from './content'
export * from './filesystem' export * from './filesystem'

View File

@@ -59,18 +59,6 @@ export interface WSNewModEvent {
event: 'new-mod' event: 'new-mod'
} }
/** Backup operations that a progress event can refer to. */
export type WSBackupTask = 'create' | 'restore'

/** States a backup task can be reported in. */
export type WSBackupState =
  | 'ongoing'
  | 'done'
  | 'failed'
  | 'cancelled'
  | 'unchanged'

/**
 * Websocket message tagged `backup-progress`.
 *
 * Carries the task kind, an id, the current state, a numeric progress value
 * and a `ready` flag.
 *
 * NOTE(review): `id` is presumably the backup's id — confirm against the sender.
 */
export interface WSBackupProgressEvent {
  event: 'backup-progress'
  id: string
  task: WSBackupTask
  state: WSBackupState
  progress: number // percentage
  ready: boolean
}
export type FSQueuedOpUnarchive = { export type FSQueuedOpUnarchive = {
op: 'unarchive' op: 'unarchive'
src: string src: string
@@ -109,16 +97,3 @@ export interface WSFilesystemOpsEvent {
event: 'filesystem-ops' event: 'filesystem-ops'
all: FilesystemOp[] all: FilesystemOp[]
} }
/**
 * Union of the websocket event types listed below.
 *
 * NOTE(review): presumably meant to cover every message the socket can
 * deliver — confirm no event type is missing from the list.
 */
export type WSEvent =
| WSLogEvent
| WSStatsEvent
| WSPowerStateEvent
| WSAuthExpiringEvent
| WSAuthIncorrectEvent
| WSInstallationResultEvent
| WSAuthOkEvent
| WSUptimeEvent
| WSNewModEvent
| WSBackupProgressEvent
| WSFilesystemOpsEvent