diff --git a/scripts/backup_restore/apply_cluster_postboot.sh b/scripts/backup_restore/apply_cluster_postboot.sh
index a53e59e6..e38f0c06 100755
--- a/scripts/backup_restore/apply_cluster_postboot.sh
+++ b/scripts/backup_restore/apply_cluster_postboot.sh
@@ -180,6 +180,86 @@ else
     echo " (no source node dir to copy from)"
 fi
 
+# ── LXC bind-mount stub directories ───────────────────────
+# LXC containers with `mpN: /path,mp=...` bind-mount entries fail the
+# pre-start hook (status 2) if `/path` doesn't exist on the host. After a
+# cross-host restore the source's bind-mount paths (custom NAS mounts, second
+# disk paths, etc.) generally don't exist on the target's fresh install yet.
+# We create empty stubs so `onboot: 1` containers start; the operator wires
+# the real data source afterwards. PVE-managed storages (`/mnt/pve/*`) and
+# /dev/* are skipped — PVE handles the first, kernel handles the second.
+# Counters use x=$((x + 1)) instead of ((x++)): the latter returns status 1
+# when x was 0, which aborts the script if it runs under `set -e`.
+echo ""
+echo "── LXC bind-mount stubs ──"
+stub_created=0
+stub_skipped=0
+if compgen -G "/etc/pve/nodes/$CUR_NODE/lxc/*.conf" >/dev/null 2>&1; then
+    for conf in /etc/pve/nodes/"$CUR_NODE"/lxc/*.conf; do
+        [[ -f "$conf" ]] || continue
+        # `|| [[ -n "$line" ]]` keeps a last line that has no trailing newline.
+        while IFS= read -r line || [[ -n "$line" ]]; do
+            # mpN entries whose value starts with "/" (a host path); capture up to the first comma.
+            [[ "$line" =~ ^mp[0-9]+:[[:space:]]*(/[^,]+), ]] || continue
+            src="${BASH_REMATCH[1]}"
+            [[ "$src" == /mnt/pve/* || "$src" == /dev/* ]] && continue
+            if [[ -e "$src" ]]; then
+                stub_skipped=$((stub_skipped + 1))
+            elif mkdir -p -- "$src" 2>/dev/null; then
+                echo " + stub $src (from $(basename "$conf"))"
+                stub_created=$((stub_created + 1))
+            else
+                # Say so instead of failing silently — the path is still missing.
+                echo " ✗ could not create stub $src (from $(basename "$conf"))"
+            fi
+        done < "$conf"
+    done
+fi
+echo "Stubs: created=$stub_created, already-present=$stub_skipped"
+
+# ── Stale node-dir cleanup ────────────────────────────────
+# Fresh PVE install creates /etc/pve/nodes/<install-hostname>/. After our
+# restore changes the hostname back to the source's, pve-cluster boots into
+# the source's node dir but leaves the install-hostname dir orphaned. The
+# web UI then shows a phantom offline node. Only remove dirs whose lxc/ and
+# qemu-server/ are empty — never trample a real second cluster member.
+# A node that /etc/pve/corosync.conf lists by name is kept even when it has
+# no guests, and nothing is removed while CUR_NODE is empty (every dir would
+# then look foreign, including the live one).
+echo ""
+echo "── Stale node-dir cleanup ──"
+removed_nodes=0
+if [[ -z "${CUR_NODE:-}" ]]; then
+    echo " ⚠ CUR_NODE is empty — skipping stale node-dir cleanup"
+else
+    for nodedir in /etc/pve/nodes/*/; do
+        # An unmatched glob stays literal ("/etc/pve/nodes/*/"); skip it.
+        [[ -d "$nodedir" ]] || continue
+        n=$(basename "$nodedir")
+        [[ "$n" == "$CUR_NODE" ]] && continue
+        # Exact-match the nodelist `name:` field; an unreadable file counts as "not listed".
+        if awk -v n="$n" '$1 == "name:" && $2 == n { hit = 1 } END { exit !hit }' /etc/pve/corosync.conf 2>/dev/null; then
+            echo " ⚠ kept $n (listed in corosync.conf — real cluster member)"
+            continue
+        fi
+        lxc_empty=1; qemu_empty=1
+        [[ -d "$nodedir/lxc" ]] && [[ -n "$(ls -A "$nodedir/lxc" 2>/dev/null)" ]] && lxc_empty=0
+        [[ -d "$nodedir/qemu-server" ]] && [[ -n "$(ls -A "$nodedir/qemu-server" 2>/dev/null)" ]] && qemu_empty=0
+        if (( lxc_empty && qemu_empty )); then
+            # ${nodedir:?} refuses to expand to an empty path; -- ends option parsing.
+            if rm -rf -- "${nodedir:?}" 2>/dev/null; then
+                echo " ✓ removed stale node dir: $n"
+                removed_nodes=$((removed_nodes + 1))
+            else
+                echo " ✗ rm failed for $n (pmxcfs may have it busy)"
+            fi
+        else
+            echo " ⚠ kept $n (has guest configs — looks like a real cluster member)"
+        fi
+    done
+fi
+echo "Stale node dirs removed: $removed_nodes"
+
 # ── Done with cluster config apply ─────────────────────────
 echo ""
 echo "Cluster summary: globals=$copied_global, subdirs=$copied_subdirs, guests=$copied_guests, guest-clashes-skipped=$skipped_guests"
diff --git a/scripts/backup_restore/lib_host_backup_common.sh b/scripts/backup_restore/lib_host_backup_common.sh
index 444b3bb8..e7dd35db 100644
--- a/scripts/backup_restore/lib_host_backup_common.sh
+++ b/scripts/backup_restore/lib_host_backup_common.sh
@@ -257,10 +257,24 @@ hb_prepare_staging() {
         )
     fi
 
-    # Runtime pending-restore data belongs in /var/lib/proxmenux, never in app code tree.
+    # /usr/local/share/proxmenux: ship USER STATE only (components_status.json,
+    # user prefs, post-install cache). NEVER ship code (scripts/, utils.sh, web/,
+    # AppImage/, monitor-app/) — destination has its own installed proxmenux which
+    # may be newer than the backup. Hot-applying the backup's old /scripts/ over
+    # the destination's fresh install silently regresses the apply_cluster_postboot
+    # dispatcher and the *_installer.sh --auto-reinstall hooks, breaking the
+    # "user reinstalls nothing" promise.
if [[ "$rel" == "usr/local/share/proxmenux" || "$rel" == "usr/local/share/proxmenux/"* ]]; then rsync_opts+=( --exclude "restore-pending/" + --exclude "scripts/" + --exclude "web/" + --exclude "monitor-app/" + --exclude "AppImage/" + --exclude "images/" + --exclude "json/" + --exclude "utils.sh" + --exclude "helpers_cache.json" ) fi diff --git a/scripts/gpu_tpu/nvidia_installer.sh b/scripts/gpu_tpu/nvidia_installer.sh index c827aa78..d1a5ae86 100644 --- a/scripts/gpu_tpu/nvidia_installer.sh +++ b/scripts/gpu_tpu/nvidia_installer.sh @@ -1633,11 +1633,14 @@ auto_reinstall_from_state() { complete_nvidia_uninstall >>"$LOG_FILE" 2>&1 fi - if ! download_nvidia_installer >>"$LOG_FILE" 2>&1; then + local installer + installer=$(download_nvidia_installer "$DRIVER_VERSION" 2>>"$LOG_FILE") + if [[ -z "$installer" || ! -f "$installer" ]]; then echo "Download failed — see $LOG_FILE" | tee -a "$LOG_FILE" return 2 fi - if ! run_nvidia_installer >>"$LOG_FILE" 2>&1; then + echo "Installer ready: $installer" >>"$LOG_FILE" + if ! run_nvidia_installer "$installer" >>"$LOG_FILE" 2>&1; then echo "Install failed — see $LOG_FILE" | tee -a "$LOG_FILE" return 2 fi