From 6094ab8e1c2256c06499a35f6845d1541a1666b7 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Thu, 11 Jun 2026 17:24:20 +0200 Subject: [PATCH] update beta 1.2.2.2 --- scripts/backup_restore/backup_host.sh | 79 +++++++ .../backup_restore/lib_host_backup_common.sh | 200 ++++++++++++++++++ 2 files changed, 279 insertions(+) diff --git a/scripts/backup_restore/backup_host.sh b/scripts/backup_restore/backup_host.sh index ff3ce148..93dbbf8d 100644 --- a/scripts/backup_restore/backup_host.sh +++ b/scripts/backup_restore/backup_host.sh @@ -1278,6 +1278,23 @@ _rs_apply() { dst="/$rel" [[ -e "$src" ]] || { ((skipped++)); continue; } + # Smart-restore hardware-drift skip list (populated by + # _rs_run_complete_guided when hb_assess_hardware_drift flags + # paths that would break on this host's hardware). Each path + # in $RS_SKIP_PATHS is one absolute path per line. Matching is + # exact-or-descendant so "/etc/zfs/zpool.cache" listed in the + # skip set covers itself when rel == "etc/zfs/zpool.cache". + if [[ -n "${RS_SKIP_PATHS:-}" ]]; then + local _abs="/$rel" _skip="" + while IFS= read -r _skip; do + [[ -z "$_skip" ]] && continue + if [[ "$_abs" == "$_skip" || "$_abs" == "$_skip"/* ]]; then + ((skipped++)) + continue 2 + fi + done <<<"$RS_SKIP_PATHS" + fi + # Never restore cluster virtual filesystem data live. # Extract it for manual recovery in maintenance mode. # Path note: this used to live under /root/proxmenux-recovery/, @@ -1589,6 +1606,19 @@ _rs_collect_pending_paths() { ;; esac [[ -z "$rel" || -n "${seen[$rel]}" ]] && continue + + # Drop hardware-drift skips (see RS_SKIP_PATHS comment in _rs_apply). 
+ if [[ -n "${RS_SKIP_PATHS:-}" ]]; then + local _abs="/$rel" _skip="" _drop=0 + while IFS= read -r _skip; do + [[ -z "$_skip" ]] && continue + if [[ "$_abs" == "$_skip" || "$_abs" == "$_skip"/* ]]; then + _drop=1; break + fi + done <<<"$RS_SKIP_PATHS" + (( _drop )) && continue + fi + seen["$rel"]=1 out+=("$rel") done @@ -1745,6 +1775,55 @@ _rs_run_complete_guided() { local -a all_paths=() hb_load_restore_paths "$staging_root" all_paths + # ── Smart restore plan ────────────────────────────────── + # Compare the backup metadata against the live host and surface + # anything that would be unsafe to restore as-is (ZFS pool GUID + # changed, fstab UUIDs gone, NVIDIA driver state for a host with + # no NVIDIA card, ...). Only opens an extra dialog when there's + # actually drift — same-hardware/same-host restores skip it. + export RS_SKIP_PATHS="" + local -a drift_lines=() + mapfile -t drift_lines < <(hb_assess_hardware_drift "$staging_root" 2>/dev/null) + if (( ${#drift_lines[@]} > 0 )); then + local skip_paths="" + local skip_components="" + local plan_body + plan_body="\Zb$(translate "Smart restore plan — hardware compatibility check")\ZB"$'\n\n' + plan_body+="$(translate "The backup metadata was compared against this host. The following items will be SKIPPED to keep the boot safe:")"$'\n\n' + + local line key action reason + for line in "${drift_lines[@]}"; do + IFS=$'\t' read -r key action reason <<<"$line" + [[ "$action" != "skip" ]] && continue + if [[ "$key" == component:* ]]; then + local cname="${key#component:}" + skip_components+="${cname} " + plan_body+=" \Z1•\Zn $(translate "Component:") \Zb${cname}\ZB"$'\n' + plan_body+=" ${reason}"$'\n\n' + else + skip_paths+="${key}"$'\n' + plan_body+=" \Z1•\Zn $(translate "Path:") \Zb${key}\ZB"$'\n' + plan_body+=" ${reason}"$'\n\n' + fi + done + + plan_body+="$(translate "PVE will regenerate these files automatically for the current hardware. 
The rest of the backup will be applied normally.")"$'\n\n' + plan_body+="\Zb$(translate "Continue with safe restore?")\ZB" + + if ! dialog --backtitle "ProxMenux" --colors \ + --title "$(translate "Restore plan — compatibility check")" \ + --yesno "$plan_body" 24 90; then + return 1 + fi + + # Persist for _rs_apply / _rs_collect_pending_paths to honor. + # We only store paths (not component:* entries) — component + # auto-reinstall already self-skips when the GPU/TPU isn't on + # this host, so we just surfaced it in the dialog for clarity. + RS_SKIP_PATHS="${skip_paths%$'\n'}" + export RS_SKIP_PATHS + fi + # Build the rich confirmation body. Replaces the previous 4-strategy # menu — by design a Proxmox host restore always requires a reboot # for predictable end state (pmxcfs live writes + initramfs + driver diff --git a/scripts/backup_restore/lib_host_backup_common.sh b/scripts/backup_restore/lib_host_backup_common.sh index 5a6828e0..e4a5a7b6 100644 --- a/scripts/backup_restore/lib_host_backup_common.sh +++ b/scripts/backup_restore/lib_host_backup_common.sh @@ -103,7 +103,10 @@ hb_default_profile_paths() { # ── Common Proxmox tooling (skipped if not present) ── "/etc/systemd/system" # custom units (including log2ram.service if installed) + "/etc/systemd/journald.conf" # journal retention tuning from post-install "/etc/log2ram.conf" + "/etc/logrotate.conf" + "/etc/logrotate.d" # post-install drops log2ram + custom logrotate here "/etc/lm-sensors" "/etc/sensors3.conf" "/etc/fail2ban" @@ -174,6 +177,189 @@ hb_path_warning() { esac } +# ========================================================== +# HARDWARE DRIFT ASSESSMENT (smart restore) +# ========================================================== +# Compares the backup metadata captured by hb_prepare_staging +# against the live target host to detect when applying certain +# paths would break the boot (orphan ZFS pool GUID, stale fstab +# UUIDs, ...) 
# or pointlessly reinstall components for hardware that's no longer
# present (NVIDIA driver on a host with no NVIDIA card).
#
# Output format on stdout — one line per assessment, tab-separated:
#
#   PATH_OR_KEY \t ACTION \t REASON
#
# Where ACTION is one of:
#   skip → the restore flow should EXCLUDE this from apply
#   warn → restore but surface the warning in the dialog
#   ok   → no drift detected (omitted from output)
#
# PATH_OR_KEY is either an absolute path or "component:<name>".
# The checks below currently emit "skip" lines only.
#
# Callers consume this to build the "Restore plan" dialog and to
# filter the hot/pending path lists. The function never modifies
# state, never prompts — pure analysis.

# Print the block-device identifiers referenced by a fstab file, one
# per line: the value of every UUID= / PARTUUID= source (prefix and
# any double quotes removed) and every /dev/* source verbatim.
# Comments, blank lines and all other sources (proc, tmpfs, LABEL=,
# ...) produce no output. A missing file is not an error.
#   $1 - path to the fstab file
_hb_fstab_uuids() {
    local fstab="$1"
    [[ -f "$fstab" ]] || return 0
    awk '
        /^[[:space:]]*#/ { next }
        /^[[:space:]]*$/ { next }
        {
            src = $1
            if (src ~ /^(PART)?UUID=/) {
                sub(/^(PART)?UUID=/, "", src)
                gsub(/"/, "", src)      # tolerate UUID="..." spelling
                print src
            } else if (src ~ /^\/dev\//) {
                print src
            }
        }
    ' "$fstab"
}

# Print the filesystem UUIDs of the live block devices, one per line.
# Prints nothing (status 0) when blkid is not installed.
_hb_live_uuids() {
    command -v blkid >/dev/null 2>&1 || return 0
    blkid -s UUID -o value 2>/dev/null
}

# Print the partition-table UUIDs (PARTUUID) of the live block
# devices, one per line. Needed because fstab may reference
# PARTUUID=, which never appears in the filesystem-UUID list.
# Prints nothing (status 0) when blkid is not installed.
_hb_live_partuuids() {
    command -v blkid >/dev/null 2>&1 || return 0
    blkid -s PARTUUID -o value 2>/dev/null
}

# Print "name<TAB>guid" for every live ZFS pool.
# Prints nothing (status 0) when zpool is not installed.
_hb_live_zpool_guids() {
    command -v zpool >/dev/null 2>&1 || return 0
    zpool list -H -o name,guid 2>/dev/null
}

# Compare a staged backup against the live host and report paths /
# components that should not be restored as-is.
#   $1      - staging root; reads $1/metadata and $1/rootfs
#   stdout  - zero or more "KEY<TAB>skip<TAB>REASON" lines
# Reason texts are passed through hb_translate (defined elsewhere).
hb_assess_hardware_drift() {
    local staging_root="$1"
    local meta="$staging_root/metadata"
    local rootfs="$staging_root/rootfs"

    # ── ZFS pool GUID drift ──────────────────────────────
    # If the backup had ZFS pools, compare each (name, guid) pair
    # against what's on this host. A pool with the same NAME but a
    # different GUID is the "fresh PVE install with same pool name"
    # case — restoring /etc/zfs/zpool.cache would point ZFS at a
    # ghost pool and drop boot to emergency.
    if [[ -f "$meta/zpool.guids" ]] && [[ -s "$meta/zpool.guids" ]]; then
        local bk_name bk_guid live_guid live_map
        local pool_mismatch="" pool_missing=""
        live_map=$(_hb_live_zpool_guids)
        # "|| [[ -n ... ]]" keeps a last line without trailing newline.
        while IFS=$'\t ' read -r bk_name bk_guid || [[ -n "$bk_name" ]]; do
            [[ -z "$bk_name" ]] && continue
            # The metadata file is captured with stderr merged in, so
            # it may hold zpool error text; real rows have a numeric GUID.
            [[ "$bk_guid" =~ ^[0-9]+$ ]] || continue
            live_guid=$(awk -v n="$bk_name" '$1==n {print $2; exit}' <<<"$live_map")
            if [[ -z "$live_guid" ]]; then
                pool_missing+="$bk_name "
            elif [[ "$live_guid" != "$bk_guid" ]]; then
                pool_mismatch+="$bk_name(${bk_guid:0:8}…→${live_guid:0:8}…) "
            fi
        done < "$meta/zpool.guids"
        if [[ -n "$pool_missing" ]]; then
            printf '%s\t%s\t%s\n' "/etc/zfs/zpool.cache" "skip" \
                "$(hb_translate "Backup pools not present on this host:") ${pool_missing% }"
        elif [[ -n "$pool_mismatch" ]]; then
            printf '%s\t%s\t%s\n' "/etc/zfs/zpool.cache" "skip" \
                "$(hb_translate "Pool name matches but GUID differs (fresh ZFS install):") ${pool_mismatch% }"
        fi
    fi

    # ── Boot partition UUID drift ────────────────────────
    # /etc/kernel/proxmox-boot-uuids lists the EFI vfat UUIDs that
    # proxmox-boot-tool replicates the bootloader onto. If those
    # UUIDs don't exist on this host, applying the file makes
    # subsequent `proxmox-boot-tool refresh` fail.
    local boot_uuid_file="$rootfs/etc/kernel/proxmox-boot-uuids"
    if [[ -f "$boot_uuid_file" ]] && [[ -s "$boot_uuid_file" ]]; then
        local live_uuids missing_boot="" u
        live_uuids=$(_hb_live_uuids)
        while IFS= read -r u || [[ -n "$u" ]]; do
            u="${u//[[:space:]]/}"      # drop blanks, tabs and stray CRs
            [[ -z "$u" || "$u" == "#"* ]] && continue
            grep -Fxq -- "$u" <<<"$live_uuids" || missing_boot+="$u "
        done < "$boot_uuid_file"
        if [[ -n "$missing_boot" ]]; then
            printf '%s\t%s\t%s\n' "/etc/kernel/proxmox-boot-uuids" "skip" \
                "$(hb_translate "Boot partition UUIDs from backup not found on this host:") ${missing_boot% }"
        fi
    fi

    # ── fstab UUID drift ─────────────────────────────────
    # Skip ONLY if at least one UUID/PARTUUID/dev in the backup's
    # fstab can't be resolved on the live host. A clean PVE+ZFS root
    # install typically has just `proc /proc proc defaults 0 0`, no
    # UUIDs — that yields zero references and the check is a no-op.
    local fstab="$rootfs/etc/fstab"
    if [[ -f "$fstab" ]]; then
        local live_ids missing_fstab="" ref
        # _hb_fstab_uuids strips the UUID=/PARTUUID= prefix, so match
        # against the union of both identifier kinds.
        live_ids=$(_hb_live_uuids; _hb_live_partuuids)
        while IFS= read -r ref; do
            [[ -z "$ref" ]] && continue
            if [[ "$ref" == /dev/* ]]; then
                [[ -b "$ref" ]] || missing_fstab+="$ref "
            else
                grep -Fxq -- "$ref" <<<"$live_ids" || missing_fstab+="$ref "
            fi
        done < <(_hb_fstab_uuids "$fstab")
        if [[ -n "$missing_fstab" ]]; then
            printf '%s\t%s\t%s\n' "/etc/fstab" "skip" \
                "$(hb_translate "fstab references UUIDs/devices not present on this host:") ${missing_fstab% }"
        fi
    fi

    # ── Component reinstall drift (GPU / TPU presence) ───
    # components_status.json declares what proxmenux installed on
    # the source (nvidia_driver, amdgpu_top, intel_gpu_tools,
    # coral_driver). If the target hardware no longer has the
    # corresponding device, the post-boot dispatcher would try to
    # reinstall a driver for a card that isn't there. The installer
    # itself short-circuits in that case (detect_*_gpus), but
    # surfacing this in the dialog is cleaner than letting the user
    # discover it from the postboot log.
    # The whole check is silently skipped when jq or lspci is missing.
    local comp_file="$rootfs/usr/local/share/proxmenux/components_status.json"
    if [[ -f "$comp_file" ]] && command -v jq >/dev/null 2>&1 && command -v lspci >/dev/null 2>&1; then
        local live_pci installed_components comp pattern
        live_pci=$(lspci -nn 2>/dev/null)
        installed_components=$(jq -r 'to_entries[] | select(.value.status=="installed") | .key' "$comp_file" 2>/dev/null)
        while IFS= read -r comp; do
            [[ -z "$comp" ]] && continue
            # Case-insensitive ERE matched against `lspci -nn` output.
            case "$comp" in
                nvidia_driver)   pattern='NVIDIA' ;;
                amdgpu_top)      pattern='Advanced Micro Devices.*\[AMD/ATI\]' ;;
                intel_gpu_tools) pattern='Intel.*(VGA|Display|Graphics)' ;;
                coral_driver)    pattern='Global Unichip|Google.*Edge TPU' ;;
                *) continue ;;
            esac
            if ! grep -qiE "$pattern" <<<"$live_pci"; then
                printf 'component:%s\t%s\t%s\n' "$comp" "skip" \
                    "$(hb_translate "Component was installed on the backup source but no matching hardware was found on this host.")"
            fi
        done <<<"$installed_components"
    fi
}

# Returns 0 (true) if hb_assess_hardware_drift produced any skip
# entries — i.e. there is something for the operator to look at in
# the smart-restore dialog. Returns 1 otherwise. Used by the
# restore flow to decide whether to show the smart-restore dialog
# at all (no drift → skip the extra prompt).
+hb_has_hardware_drift() { + local staging_root="$1" + local out + out=$(hb_assess_hardware_drift "$staging_root" 2>/dev/null | grep $'\tskip\t' || true) + [[ -n "$out" ]] +} + # ========================================================== # PROFILE PATH SELECTION # ========================================================== @@ -405,6 +591,20 @@ hb_prepare_staging() { command -v pct >/dev/null 2>&1 && pct list > "$meta/pct-list.txt" 2>&1 || true command -v zpool >/dev/null 2>&1 && zpool status > "$meta/zpool.txt" 2>&1 || true + # Extra hardware fingerprints used by hb_compat_check on restore to + # detect drift that would make some paths unsafe to apply: + # * zpool.guids → pool name + GUID. Same pool name on a fresh + # install gets a NEW GUID; restoring /etc/zfs/zpool.cache with + # the old GUID then drops the boot into emergency mode. + # * blkid.txt → all block-device UUIDs, used to verify + # /etc/fstab and /etc/kernel/proxmox-boot-uuids still resolve. + # * lspci.txt → presence test for GPUs / TPUs / NICs referenced + # by components_status.json (so we don't try to reinstall an + # NVIDIA driver on a host with no NVIDIA card any more). + command -v zpool >/dev/null 2>&1 && zpool list -H -o name,guid > "$meta/zpool.guids" 2>&1 || true + command -v blkid >/dev/null 2>&1 && blkid -s UUID -s TYPE > "$meta/blkid.txt" 2>&1 || true + command -v lspci >/dev/null 2>&1 && lspci -nn > "$meta/lspci.txt" 2>&1 || true + # Package inventory — captures what's installed on the source # host so the restore flow can offer to reinstall missing user # packages on the target. Solves the "config restored but the