#!/bin/bash
# ==========================================================
# ProxMenux - Apply Pending Restore On Boot
# ==========================================================
# Applies a restore that was staged under "$PENDING_BASE/current".
#
# For every relative path listed in <pending>/apply-on-boot.list:
#   * cluster paths (etc/pve*, var/lib/pve-cluster*) are copied to a
#     recovery directory and handed off to a post-boot unit;
#   * etc/zfs is applied only when HB_RESTORE_INCLUDE_ZFS=1;
#   * everything else is backed up from the destination and then
#     overwritten from <pending>/rootfs/.
#
# Environment overrides (all optional):
#   PMX_RESTORE_PENDING_BASE     staging root         (default /var/lib/proxmenux/restore-pending)
#   PMX_RESTORE_LOG_DIR          log directory        (default /var/log/proxmenux)
#   PMX_RESTORE_DEST_PREFIX      restore target root  (default /)
#   PMX_RESTORE_PRE_BACKUP_BASE  pre-restore backups  (default /var/lib/proxmenux/pre-restore)
#   PMX_RESTORE_RECOVERY_BASE    cluster recovery dir (default /var/lib/proxmenux/recovery)
#
# Exit status: 1 when the apply list is missing, 0 otherwise (per-entry
# failures are reported through the log and the state file, not the
# exit code).
# ==========================================================

PENDING_BASE="${PMX_RESTORE_PENDING_BASE:-/var/lib/proxmenux/restore-pending}"
CURRENT_LINK="${PENDING_BASE}/current"
LOG_DIR="${PMX_RESTORE_LOG_DIR:-/var/log/proxmenux}"
DEST_PREFIX="${PMX_RESTORE_DEST_PREFIX:-/}"
PRE_BACKUP_BASE="${PMX_RESTORE_PRE_BACKUP_BASE:-/var/lib/proxmenux/pre-restore}"
RECOVERY_BASE="${PMX_RESTORE_RECOVERY_BASE:-/var/lib/proxmenux/recovery}"

mkdir -p "$LOG_DIR" "$PENDING_BASE/completed" >/dev/null 2>&1 || true
LOG_FILE="${LOG_DIR}/proxmenux-restore-onboot-$(date +%Y%m%d_%H%M%S).log"

# From here on everything the script prints lands in the log file.
exec >>"$LOG_FILE" 2>&1

echo "=== ProxMenux pending restore started at $(date -Iseconds) ==="

# `-e` follows symlinks, so a dangling link would look like "no link" and
# never be cleaned up. Checking `-L` as well lets a dangling link fall
# through to the "directory not found" branch below, which removes it.
if [[ ! -e "$CURRENT_LINK" && ! -L "$CURRENT_LINK" ]]; then
    echo "No pending restore link found. Nothing to do."
    exit 0
fi

PENDING_DIR="$(readlink -f "$CURRENT_LINK" 2>/dev/null || echo "$CURRENT_LINK")"
if [[ ! -d "$PENDING_DIR" ]]; then
    echo "Pending restore directory not found: $PENDING_DIR"
    rm -f "$CURRENT_LINK" >/dev/null 2>&1 || true
    exit 0
fi

APPLY_LIST="${PENDING_DIR}/apply-on-boot.list"
PLAN_ENV="${PENDING_DIR}/plan.env"
STATE_FILE="${PENDING_DIR}/state"

# Optional per-restore settings (e.g. HB_RESTORE_INCLUDE_ZFS).
if [[ -f "$PLAN_ENV" ]]; then
    # shellcheck source=/dev/null
    source "$PLAN_ENV"
fi
: "${HB_RESTORE_INCLUDE_ZFS:=0}"

if [[ ! -f "$APPLY_LIST" ]]; then
    echo "Apply list missing: $APPLY_LIST"
    echo "failed" >"$STATE_FILE"
    exit 1
fi

echo "Pending dir: $PENDING_DIR"
echo "Apply list: $APPLY_LIST"
echo "Include ZFS: $HB_RESTORE_INCLUDE_ZFS"
echo "running" >"$STATE_FILE"

backup_root="${PRE_BACKUP_BASE}/$(date +%Y%m%d_%H%M%S)-onboot"
mkdir -p "$backup_root" >/dev/null 2>&1 || true

cluster_recovery_root=""
# Internal flag; initialised explicitly (after plan.env was sourced) so a
# stray value from the environment cannot trigger the post-boot hand-off.
cluster_live_apply=0
applied=0
skipped=0
failed=0

# `|| [[ -n "$rel" ]]` keeps the final entry when the list has no
# trailing newline (read returns non-zero but still fills the variable).
while IFS= read -r rel || [[ -n "$rel" ]]; do
    [[ -z "$rel" ]] && continue

    src="${PENDING_DIR}/rootfs/${rel}"
    dst="${DEST_PREFIX%/}/${rel}"

    if [[ ! -e "$src" ]]; then
        skipped=$((skipped + 1))
        continue
    fi

    # Cluster data (/etc/pve, /var/lib/pve-cluster) is NOT written to its
    # final location in this loop. It is only staged into a recovery
    # directory (for forensics/rollback) and counted as skipped; the
    # actual apply is delegated to the post-boot unit installed further
    # down, once for the whole run rather than once per file.
    if [[ "$rel" == etc/pve* ]] || [[ "$rel" == var/lib/pve-cluster* ]]; then
        if [[ -z "$cluster_recovery_root" ]]; then
            cluster_recovery_root="${RECOVERY_BASE}/$(date +%Y%m%d_%H%M%S)-onboot"
            mkdir -p "$cluster_recovery_root" >/dev/null 2>&1 || true
        fi
        mkdir -p "$cluster_recovery_root/$(dirname "$rel")" >/dev/null 2>&1 || true
        cp -a "$src" "$cluster_recovery_root/$rel" >/dev/null 2>&1 || true
        cluster_live_apply=1
        skipped=$((skipped + 1))
        continue
    fi

    # /etc/zfs is opt-in.
    if [[ "$rel" == etc/zfs || "$rel" == etc/zfs/* ]]; then
        if [[ "$HB_RESTORE_INCLUDE_ZFS" != "1" ]]; then
            skipped=$((skipped + 1))
            continue
        fi
    fi

    # Best-effort safety copy of whatever is about to be overwritten.
    # A failed backup does not block the restore, but it is logged so the
    # operator knows there is no rollback copy for this entry.
    if [[ -e "$dst" ]]; then
        mkdir -p "$backup_root/$(dirname "$rel")" >/dev/null 2>&1 || true
        if ! cp -a "$dst" "$backup_root/$rel" >/dev/null 2>&1; then
            echo "WARNING: pre-restore backup failed for: $rel"
        fi
    fi

    if [[ -d "$src" ]]; then
        # Directories are mirrored; --delete removes destination entries
        # that are absent from the staged copy.
        mkdir -p "$dst" >/dev/null 2>&1 || true
        if rsync -aAXH --delete "$src/" "$dst/" >/dev/null 2>&1; then
            applied=$((applied + 1))
        else
            failed=$((failed + 1))
            echo "FAILED to apply directory: $rel"
        fi
    else
        mkdir -p "$(dirname "$dst")" >/dev/null 2>&1 || true
        if cp -a "$src" "$dst" >/dev/null 2>&1; then
            applied=$((applied + 1))
        else
            failed=$((failed + 1))
            echo "FAILED to apply file: $rel"
        fi
    fi
done <"$APPLY_LIST"

systemctl daemon-reload >/dev/null 2>&1 || true

# `update-initramfs -u -k all` and `update-grub` used to live here
# but: (a) they take 5-10 minutes for 3 kernels, hanging early-boot
# for that long, and (b) ifupdown2 was waiting on us. They now run
# AFTER pve-cluster is up via the apply_cluster_postboot.sh script
# we hook below, in the background where the user is already on the
# login prompt and using the system. Zero manual steps needed.

echo "Applied: $applied"
echo "Skipped: $skipped"
echo "Failed: $failed"
echo "Backup before restore: $backup_root"

if [[ -n "$cluster_recovery_root" ]]; then
    # Always write the manual-helper script first — that's the
    # rollback path if the post-boot apply below blows up.
    helper="${cluster_recovery_root}/apply-cluster-restore.sh"
    # NOTE(review): this looks like it was a here-document whose body is
    # not visible in this view. As written, the redirect from /dev/null
    # creates an EMPTY helper file — confirm against the upstream script.
    cat > "$helper" </dev/null 2>&1 || true
    echo "Cluster paths extracted to: $cluster_recovery_root"
    echo "Cluster recovery helper: $helper"

    # We DON'T auto-apply /etc/pve here at boot because early-boot
    # pve-cluster start blocks the unit (corosync etc. not ready).
    # Instead we hand off to a SECOND oneshot unit that fires
    # AFTER pve-cluster.service is up, when /etc/pve is the live
    # pmxcfs FUSE mount and we can write individual files to it
    # without restarting anything. That second unit is gated by
    # ConditionPathExists on the marker file we drop here, so on
    # a normal boot (no marker) it's a no-op.
    if [[ "${cluster_live_apply:-0}" == "1" ]]; then
        echo "Installing post-boot cluster apply unit..."

        # Decide whether the post-boot script needs to run
        # update-initramfs and/or update-grub by inspecting the
        # apply list. Skipping them when nothing relevant was
        # restored saves the operator 5-10 minutes of background
        # initramfs rebuilds on EVERY restore — only do it when
        # the backup actually touched paths that affect those
        # tools' inputs.
        NEEDS_INITRAMFS=0
        NEEDS_GRUB=0
        # Same trailing-newline guard as the main loop above.
        while IFS= read -r _rel || [[ -n "$_rel" ]]; do
            case "$_rel" in
                etc/modules|etc/modules/*|\
                etc/modules-load.d|etc/modules-load.d/*|\
                etc/modprobe.d|etc/modprobe.d/*|\
                etc/initramfs-tools|etc/initramfs-tools/*|\
                etc/crypttab|\
                etc/cryptsetup-initramfs|etc/cryptsetup-initramfs/*)
                    NEEDS_INITRAMFS=1
                    ;;
                etc/default/grub|\
                etc/kernel|etc/kernel/*|\
                etc/grub.d|etc/grub.d/*)
                    NEEDS_GRUB=1
                    ;;
            esac
        done < "$APPLY_LIST"
        echo "Post-boot maintenance flags: initramfs=$NEEDS_INITRAMFS grub=$NEEDS_GRUB"

        # Marker as env-style key=value so the post-boot script
        # can `source` it and read structured fields.
        mkdir -p /var/lib/proxmenux >/dev/null 2>&1 || true
        {
            printf 'RECOVERY_ROOT=%s\n' "$cluster_recovery_root"
            printf 'PENDING_DIR=%s\n' "$PENDING_DIR"
            printf 'NEEDS_INITRAMFS=%s\n' "$NEEDS_INITRAMFS"
            printf 'NEEDS_GRUB=%s\n' "$NEEDS_GRUB"
        } > /var/lib/proxmenux/cluster-apply-pending
        chmod 600 /var/lib/proxmenux/cluster-apply-pending

        # Install the systemd unit. Idempotent: overwrite if it
        # already exists (so script changes get picked up).
        # NOTE(review): as with the helper above, the unit body appears to
        # have been a here-document that is missing from this view; as
        # written this creates an EMPTY unit file — confirm upstream.
        cat > /etc/systemd/system/proxmenux-apply-cluster-postboot.service </dev/null 2>&1 || true
        systemctl enable proxmenux-apply-cluster-postboot.service >/dev/null 2>&1 || true

        # `systemctl enable` only adds the unit to multi-user.target.wants/.
        # It does NOT pull the unit into the currently-running boot
        # transaction — by the time we run, multi-user.target may have
        # already collected its wants. `start --no-block` schedules the
        # unit for activation respecting its After= ordering (pve-cluster
        # comes up first), without blocking apply_pending_restore.sh
        # itself. Without this, the postboot unit only fires on the
        # NEXT reboot, defeating the "single reboot, zero manual steps"
        # promise.
        systemctl start --no-block proxmenux-apply-cluster-postboot.service >/dev/null 2>&1 || true

        echo "Cluster apply will run automatically after pve-cluster comes up."
        echo "Fallback manual: bash $helper"
    fi
fi

# Record the outcome inside the pending dir before it is archived.
if [[ "$failed" -eq 0 ]]; then
    echo "completed" >"$STATE_FILE"
else
    echo "completed_with_errors" >"$STATE_FILE"
fi

# Archive the processed restore, drop the "current" pointer and disable
# the on-boot unit so the restore is not attempted again on later boots.
restore_id="$(basename "$PENDING_DIR")"
mv "$PENDING_DIR" "${PENDING_BASE}/completed/${restore_id}" >/dev/null 2>&1 || true
rm -f "$CURRENT_LINK" >/dev/null 2>&1 || true

systemctl disable proxmenux-restore-onboot.service >/dev/null 2>&1 || true

echo "=== ProxMenux pending restore finished at $(date -Iseconds) ==="
echo "Log file: $LOG_FILE"
exit 0