fix: backups resilience improvements (#5555)
* fix: backups * fix: stability
This commit is contained in:
@@ -483,7 +483,6 @@ const isServerRunning = computed(() => serverPowerState.value === 'running')
|
||||
const serverPowerState = ref<Archon.Websocket.v0.PowerState>('stopped')
|
||||
const powerStateDetails = ref<{ oom_killed?: boolean; exit_code?: number }>()
|
||||
const backupsState = reactive(new Map())
|
||||
const completedBackupTasks = new Set<string>()
|
||||
const cancelledBackups = new Set<string>()
|
||||
|
||||
const markBackupCancelled = (backupId: string) => {
|
||||
@@ -853,28 +852,22 @@ const handleAuthIncorrect = () => {
|
||||
}
|
||||
|
||||
const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) => {
|
||||
// Ignore 'file' task events - these are per-file progress updates sent continuously
|
||||
if (data.task === 'file') {
|
||||
return
|
||||
}
|
||||
if (data.task === 'file') return
|
||||
|
||||
const backupId = data.id
|
||||
const taskKey = `${backupId}:${data.task}`
|
||||
|
||||
if (completedBackupTasks.has(taskKey)) {
|
||||
return
|
||||
}
|
||||
|
||||
if (cancelledBackups.has(backupId)) {
|
||||
return
|
||||
}
|
||||
if (cancelledBackups.has(backupId)) return
|
||||
|
||||
const current = backupsState.get(backupId) ?? {}
|
||||
const previousState = current[data.task]?.state
|
||||
const previousProgress = current[data.task]?.progress
|
||||
const currentTaskState = current[data.task]?.state
|
||||
const isIncomingTerminal =
|
||||
data.state === 'done' || data.state === 'failed' || data.state === 'cancelled'
|
||||
|
||||
if (previousState !== data.state || previousProgress !== data.progress) {
|
||||
// (mutating same reference doesn't work)
|
||||
// Skip duplicate terminal events, but allow retries (terminal → ongoing)
|
||||
if (currentTaskState === data.state && isIncomingTerminal) return
|
||||
|
||||
const previousProgress = current[data.task]?.progress
|
||||
if (currentTaskState !== data.state || previousProgress !== data.progress) {
|
||||
backupsState.set(backupId, {
|
||||
...current,
|
||||
[data.task]: {
|
||||
@@ -884,11 +877,7 @@ const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) =
|
||||
})
|
||||
}
|
||||
|
||||
const isTerminalState =
|
||||
data.state === 'done' || data.state === 'failed' || data.state === 'cancelled'
|
||||
if (isTerminalState) {
|
||||
completedBackupTasks.add(taskKey)
|
||||
|
||||
if (isIncomingTerminal) {
|
||||
const attemptCleanup = (attempt: number = 1) => {
|
||||
queryClient.invalidateQueries({ queryKey: ['backups', 'list', serverId] }).then(() => {
|
||||
const backupData = queryClient.getQueryData<Archon.Backups.v1.Backup[]>([
|
||||
@@ -897,12 +886,31 @@ const handleBackupProgress = (data: Archon.Websocket.v0.WSBackupProgressEvent) =
|
||||
serverId,
|
||||
])
|
||||
const backup = backupData?.find((b) => b.id === backupId)
|
||||
const isStillActive =
|
||||
backup && (backup.status === 'in_progress' || backup.status === 'pending')
|
||||
|
||||
if (backup?.ongoing && attempt < 3) {
|
||||
// retry 3 times max, archon is slow compared to ws state
|
||||
setTimeout(() => attemptCleanup(attempt + 1), 1000)
|
||||
if (isStillActive && attempt < 6) {
|
||||
setTimeout(() => attemptCleanup(attempt + 1), 1000 * Math.pow(2, attempt - 1))
|
||||
return
|
||||
}
|
||||
|
||||
if (isStillActive) {
|
||||
queryClient.setQueryData<Archon.Backups.v1.Backup[]>(
|
||||
['backups', 'list', serverId],
|
||||
(old) =>
|
||||
old?.map((b) => {
|
||||
if (b.id !== backupId) return b
|
||||
return {
|
||||
...b,
|
||||
status: data.state === 'done' ? ('done' as const) : ('error' as const),
|
||||
ongoing: false,
|
||||
interrupted: data.state === 'failed',
|
||||
}
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
backupsState.delete(backupId)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1325,7 +1333,6 @@ const cleanup = () => {
|
||||
isReconnecting.value = false
|
||||
isLoading.value = true
|
||||
|
||||
completedBackupTasks.clear()
|
||||
cancelledBackups.clear()
|
||||
|
||||
clearNodeAuthState()
|
||||
|
||||
@@ -426,6 +426,7 @@ export namespace Archon {
|
||||
export namespace v1 {
|
||||
export type BackupState = 'ongoing' | 'done' | 'failed' | 'cancelled' | 'unchanged'
|
||||
export type BackupTask = 'file' | 'create' | 'restore'
|
||||
export type BackupStatus = 'pending' | 'in_progress' | 'timed_out' | 'error' | 'done'
|
||||
|
||||
export type BackupTaskProgress = {
|
||||
progress: number // 0.0 to 1.0
|
||||
@@ -438,6 +439,7 @@ export namespace Archon {
|
||||
name: string
|
||||
created_at: string
|
||||
automated: boolean
|
||||
status: BackupStatus
|
||||
interrupted: boolean
|
||||
ongoing: boolean
|
||||
locked: boolean
|
||||
|
||||
@@ -50,11 +50,13 @@ const props = withDefaults(
|
||||
|
||||
const backupQueued = computed(
|
||||
() =>
|
||||
props.backup.status === 'pending' ||
|
||||
props.backup.task?.create?.progress === 0 ||
|
||||
(props.backup.ongoing && !props.backup.task?.create),
|
||||
(props.backup.status === 'in_progress' && !props.backup.task?.create),
|
||||
)
|
||||
const failedToCreate = computed(
|
||||
() => props.backup.status === 'error' || props.backup.status === 'timed_out',
|
||||
)
|
||||
// const automated = computed(() => props.backup.automated)
|
||||
const failedToCreate = computed(() => props.backup.interrupted)
|
||||
|
||||
const inactiveStates = ['failed', 'cancelled', 'done']
|
||||
|
||||
@@ -64,11 +66,11 @@ const creating = computed(() => {
|
||||
return task
|
||||
}
|
||||
|
||||
if (props.backup.ongoing && !props.backup.task?.restore) {
|
||||
return {
|
||||
progress: 0,
|
||||
state: 'ongoing',
|
||||
}
|
||||
if (
|
||||
(props.backup.status === 'in_progress' || props.backup.status === 'pending') &&
|
||||
!props.backup.task?.restore
|
||||
) {
|
||||
return { progress: 0, state: 'ongoing' as const }
|
||||
}
|
||||
return undefined
|
||||
})
|
||||
@@ -78,13 +80,6 @@ const restoring = computed(() => {
|
||||
if (task && task.progress < 1 && !inactiveStates.includes(task.state)) {
|
||||
return task
|
||||
}
|
||||
|
||||
if (props.backup.ongoing && props.backup.task?.restore) {
|
||||
return {
|
||||
progress: 0,
|
||||
state: 'ongoing',
|
||||
}
|
||||
}
|
||||
return undefined
|
||||
})
|
||||
|
||||
|
||||
@@ -86,13 +86,16 @@ export function useInlineBackup(backupName: string | (() => string)) {
|
||||
if (!entry?.create) return
|
||||
|
||||
if (entry.create.state === 'done') {
|
||||
stopPolling()
|
||||
isBackingUp.value = false
|
||||
backupComplete.value = true
|
||||
} else if (entry.create.state === 'cancelled') {
|
||||
stopPolling()
|
||||
isBackingUp.value = false
|
||||
isCancelling.value = false
|
||||
backupCancelled.value = true
|
||||
} else if (entry.create.state === 'failed') {
|
||||
stopPolling()
|
||||
isBackingUp.value = false
|
||||
backupFailed.value = true
|
||||
}
|
||||
@@ -118,11 +121,13 @@ export function useInlineBackup(backupName: string | (() => string)) {
|
||||
|
||||
try {
|
||||
const backup = await client.archon.backups_v1.get(serverId, worldId.value!, backupId)
|
||||
const isTerminal =
|
||||
backup.status === 'done' || backup.status === 'error' || backup.status === 'timed_out'
|
||||
|
||||
if (!backup.ongoing) {
|
||||
if (isTerminal) {
|
||||
stopPolling()
|
||||
|
||||
if (backup.interrupted) {
|
||||
if (!isBackingUp.value) return
|
||||
if (backup.status === 'error' || backup.status === 'timed_out') {
|
||||
isBackingUp.value = false
|
||||
backupFailed.value = true
|
||||
} else {
|
||||
|
||||
@@ -221,7 +221,11 @@ const backups = computed(() => {
|
||||
...backup.task,
|
||||
...progressState,
|
||||
},
|
||||
|
||||
status: hasOngoingTask
|
||||
? ('in_progress' as const)
|
||||
: hasCompletedTask
|
||||
? ('done' as const)
|
||||
: backup.status,
|
||||
ongoing: hasOngoingTask || (backup.ongoing && !hasCompletedTask),
|
||||
}
|
||||
}
|
||||
@@ -304,8 +308,8 @@ const backupCreationDisabled = computed(() => {
|
||||
if (busyReasons.value.length > 0) {
|
||||
return formatMessage(busyReasons.value[0].reason)
|
||||
}
|
||||
// also check API data for ongoing backups (before ws fires)
|
||||
if (backupsData.value?.some((backup) => backup.ongoing)) {
|
||||
// also check for active backups, combining REST data with WS overlay
|
||||
if (backups.value.some((b) => b.status === 'in_progress' || b.status === 'pending')) {
|
||||
return 'A backup is already in progress'
|
||||
}
|
||||
return undefined
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
import type { WSBackupState, WSBackupTask } from './websocket'
|
||||
|
||||
export interface Backup {
|
||||
id: string
|
||||
name: string
|
||||
created_at: string
|
||||
automated: boolean
|
||||
interrupted: boolean
|
||||
ongoing: boolean
|
||||
task: {
|
||||
[K in WSBackupTask]?: {
|
||||
progress: number
|
||||
state: WSBackupState
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export interface AutoBackupSettings {
|
||||
enabled: boolean
|
||||
interval: number
|
||||
}
|
||||
|
||||
export interface ServerBackup {
|
||||
id: string
|
||||
name: string
|
||||
created_at: string
|
||||
}
|
||||
@@ -1,8 +1,3 @@
|
||||
import type { Project } from '../../types'
|
||||
import type { ServerBackup } from './backup'
|
||||
import type { Mod } from './content'
|
||||
import type { Allocation } from './server'
|
||||
|
||||
export type ServerNotice = {
|
||||
id: number
|
||||
message: string
|
||||
@@ -21,42 +16,3 @@ export type ServerNotice = {
|
||||
dismissed_on: string
|
||||
}[]
|
||||
}
|
||||
|
||||
export interface Server {
|
||||
server_id: string
|
||||
name: string
|
||||
status: string
|
||||
net: {
|
||||
ip: string
|
||||
port: number
|
||||
domain: string
|
||||
allocations: Allocation[]
|
||||
}
|
||||
game: string
|
||||
loader: string | null
|
||||
loader_version: string | null
|
||||
mc_version: string | null
|
||||
backup_quota: number
|
||||
used_backup_quota: number
|
||||
backups: ServerBackup[]
|
||||
mods: Mod[]
|
||||
project: Project | null
|
||||
suspension_reason: string | null
|
||||
image: string | null
|
||||
upstream?: {
|
||||
kind: 'modpack'
|
||||
project_id: string
|
||||
version_id: string
|
||||
}
|
||||
motd: string
|
||||
flows: {
|
||||
intro?: boolean
|
||||
}
|
||||
|
||||
is_medal?: boolean
|
||||
medal_expires?: string
|
||||
}
|
||||
|
||||
export interface Servers {
|
||||
servers: Server[]
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
export * from './api'
|
||||
export * from './backup'
|
||||
export * from './common'
|
||||
export * from './content'
|
||||
export * from './filesystem'
|
||||
|
||||
@@ -59,18 +59,6 @@ export interface WSNewModEvent {
|
||||
event: 'new-mod'
|
||||
}
|
||||
|
||||
export type WSBackupTask = 'create' | 'restore'
|
||||
export type WSBackupState = 'ongoing' | 'done' | 'failed' | 'cancelled' | 'unchanged'
|
||||
|
||||
export interface WSBackupProgressEvent {
|
||||
event: 'backup-progress'
|
||||
task: WSBackupTask
|
||||
id: string
|
||||
progress: number // percentage
|
||||
state: WSBackupState
|
||||
ready: boolean
|
||||
}
|
||||
|
||||
export type FSQueuedOpUnarchive = {
|
||||
op: 'unarchive'
|
||||
src: string
|
||||
@@ -109,16 +97,3 @@ export interface WSFilesystemOpsEvent {
|
||||
event: 'filesystem-ops'
|
||||
all: FilesystemOp[]
|
||||
}
|
||||
|
||||
export type WSEvent =
|
||||
| WSLogEvent
|
||||
| WSStatsEvent
|
||||
| WSPowerStateEvent
|
||||
| WSAuthExpiringEvent
|
||||
| WSAuthIncorrectEvent
|
||||
| WSInstallationResultEvent
|
||||
| WSAuthOkEvent
|
||||
| WSUptimeEvent
|
||||
| WSNewModEvent
|
||||
| WSBackupProgressEvent
|
||||
| WSFilesystemOpsEvent
|
||||
|
||||
Reference in New Issue
Block a user