From f0b84743506050239efd745f5ed6b751c1809985 Mon Sep 17 00:00:00 2001
From: MacRimi
Date: Tue, 9 Jun 2026 17:42:51 +0200
Subject: [PATCH] Update 1.2.2.1 beta

---
 .../backup_restore/apply_cluster_postboot.sh  | 61 ++++++++++++
 scripts/gpu_tpu/amd_gpu_tools.sh              | 38 +++++++
 scripts/gpu_tpu/install_coral.sh              | 65 +++++++++++++
 scripts/gpu_tpu/intel_gpu_tools.sh            | 33 ++++++-
 scripts/gpu_tpu/nvidia_installer.sh           | 94 ++++++++++++++++++-
 5 files changed, 289 insertions(+), 2 deletions(-)

diff --git a/scripts/backup_restore/apply_cluster_postboot.sh b/scripts/backup_restore/apply_cluster_postboot.sh
index d81b1632..a53e59e6 100755
--- a/scripts/backup_restore/apply_cluster_postboot.sh
+++ b/scripts/backup_restore/apply_cluster_postboot.sh
@@ -244,6 +244,67 @@ fi
 # Clean up the maintenance marker now that we're done.
 rm -f "$MAINT_MARKER"
 
+# ── Component auto-reinstall (driven by components_status.json) ──
+# The host-config restore brings back ProxMenux state (including
+# components_status.json) but NOT the binary artifacts those
+# components installed outside of apt — driver modules under
+# /lib/modules/$(uname -r)/, binaries in /usr/bin/, downloaded
+# .deb files, DKMS source trees, etc. For each component the
+# restore state says was installed, we kick off its native
+# installer in `--auto-reinstall` mode so it replays the install
+# without dialogs. The installer's own logic handles "already
+# present → no-op", so this is idempotent.
+#
+# Apt-only components are still handled by the
+# packages.manual.list pass done earlier in the restore flow
+# (they're in `apt-mark showmanual`). Running the installer here
+# for them is harmless overhead (the installer just sees the
+# package is present and exits 0), so we don't try to filter.
+#
+# To register a NEW component for auto-reinstall: add it to the
+# COMPONENT_INSTALLERS array below as "component_key:relative
+# script path". The script must accept `--auto-reinstall` and
+# read its own state from components_status.json.
+COMPONENTS_STATUS="/usr/local/share/proxmenux/components_status.json"
+COMPONENT_INSTALLERS=(
+    "nvidia_driver:gpu_tpu/nvidia_installer.sh"
+    "amdgpu_top:gpu_tpu/amd_gpu_tools.sh"
+    "intel_gpu_tools:gpu_tpu/intel_gpu_tools.sh"
+    "coral_driver:gpu_tpu/install_coral.sh"
+)
+
+if command -v jq >/dev/null 2>&1 && [[ -f "$COMPONENTS_STATUS" ]]; then
+    echo ""
+    echo "── Component auto-reinstall ──"
+    SCRIPTS_BASE="/usr/local/share/proxmenux/scripts"
+    for entry in "${COMPONENT_INSTALLERS[@]}"; do
+        comp="${entry%%:*}"
+        installer="$SCRIPTS_BASE/${entry#*:}"
+
+        comp_status=$(jq -r --arg c "$comp" '.[$c].status // ""' "$COMPONENTS_STATUS" 2>/dev/null)  # key passed as data, so any key name is safe
+        if [[ "$comp_status" != "installed" ]]; then
+            continue  # Was never installed on the source, or was uninstalled — skip.
+        fi
+
+        if [[ ! -f "$installer" ]]; then
+            echo " ✗ $comp: installer missing at $installer — skipping"
+            continue
+        fi
+
+        echo ""
+        echo " → $comp (running $installer --auto-reinstall)"
+        # Run with limited output capture. The installer logs in full to
+        # its own log file; we only echo a tail here for the operator.
+        bash "$installer" --auto-reinstall 2>&1 | sed -e 's/^/ /' | tail -15
+        rc=${PIPESTATUS[0]}  # status of the installer itself, not of sed/tail
+        if (( rc == 0 )); then
+            echo " ✓ $comp ok"
+        else
+            echo " ✗ $comp installer exited $rc — see its own log"
+        fi
+    done
+fi
+
 echo ""
 echo "=== Apply finished at $(date -Iseconds) ==="
 echo "Log: $LOG_FILE"
diff --git a/scripts/gpu_tpu/amd_gpu_tools.sh b/scripts/gpu_tpu/amd_gpu_tools.sh
index 144caf7e..6e0a2632 100644
--- a/scripts/gpu_tpu/amd_gpu_tools.sh
+++ b/scripts/gpu_tpu/amd_gpu_tools.sh
@@ -257,7 +257,45 @@ main() {
     fi
 }
 
+# ==========================================================
+# Non-interactive auto-reinstall (post-restore hook)
+# ==========================================================
+# Called from apply_cluster_postboot.sh when components_status
+# says amdgpu_top was installed on the source but its binary is
+# missing on the target (typical for a fresh PVE install + host
+# restore — the .deb downloaded from GitHub is not in
+# packages.manual.list, so we re-fetch and install it). No
+# dialogs.
+#
+# Exit codes:
+#   0  nothing to do (not marked installed, or already present)
+#   1  jq or components_status.json unavailable
+#   2  latest-release lookup failed
+#   *  otherwise the exit status of install_amdgpu_top
+auto_reinstall_from_state() {
+    : >"$LOG_FILE"
+    echo "=== amd_gpu_tools auto_reinstall $(date -Iseconds) ===" >>"$LOG_FILE"
+    command -v jq >/dev/null 2>&1 || return 1
+    [[ -f "$COMPONENTS_STATUS_FILE" ]] || return 1
+    local s
+    s=$(jq -r '.amdgpu_top.status // ""' "$COMPONENTS_STATUS_FILE" 2>/dev/null)
+    [[ "$s" == "installed" ]] || { echo "not installed in state ($s)" >>"$LOG_FILE"; return 0; }
+    if command -v amdgpu_top >/dev/null 2>&1 || dpkg -s amdgpu-top >/dev/null 2>&1; then
+        echo "already present — no-op" >>"$LOG_FILE"; return 0
+    fi
+    export DEBIAN_FRONTEND=noninteractive
+    install_dependencies >>"$LOG_FILE" 2>&1
+    if ! get_latest_release >>"$LOG_FILE" 2>&1; then
+        echo "Failed to fetch latest release info" >>"$LOG_FILE"; return 2
+    fi
+    install_amdgpu_top
+}
+
 # Run main function
 if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
+    if [[ "${1:-}" == "--auto-reinstall" ]]; then
+        auto_reinstall_from_state
+        exit $?
+    fi
     main
 fi
\ No newline at end of file
diff --git a/scripts/gpu_tpu/install_coral.sh b/scripts/gpu_tpu/install_coral.sh
index ac29b154..8c44364c 100644
--- a/scripts/gpu_tpu/install_coral.sh
+++ b/scripts/gpu_tpu/install_coral.sh
@@ -749,4 +749,69 @@ main() {
     esac
 }
 
+# ==========================================================
+# Non-interactive auto-reinstall (post-restore hook)
+# ==========================================================
+# Called from apply_cluster_postboot.sh when components_status
+# says coral_driver was installed on the source. Coral has two
+# install branches that are independent: the PCIe/M.2 gasket+apex
+# DKMS modules (kernel-level) and the USB libedgetpu user-space
+# runtime. We replay both if either was previously installed and
+# the corresponding hardware is now present — the hardware
+# detection in detect_coral_hardware naturally short-circuits if
+# the user moved the card to a different host or it's not in
+# this slot any more.
+#
+# Exit codes:
+#   0  nothing to do, or every attempted install branch succeeded
+#   1  jq or components_status.json unavailable
+#   2  at least one install branch failed (see $LOG_FILE)
+auto_reinstall_from_state() {
+    : >"$LOG_FILE"
+    echo "=== install_coral auto_reinstall $(date -Iseconds) ===" >>"$LOG_FILE"
+    command -v jq >/dev/null 2>&1 || return 1
+    [[ -f "$COMPONENTS_STATUS_FILE" ]] || return 1
+    local s rc=0
+    s=$(jq -r '.coral_driver.status // ""' "$COMPONENTS_STATUS_FILE" 2>/dev/null)
+    [[ "$s" == "installed" ]] || { echo "not installed in state ($s)" >>"$LOG_FILE"; return 0; }
+
+    detect_coral_hardware
+    detect_coral_install_state
+
+    # No Coral hardware on this host? Skip — nothing to install.
+    if (( CORAL_PCIE_COUNT == 0 && CORAL_USB_COUNT == 0 )); then
+        echo "no Coral hardware on this host — skipping" >>"$LOG_FILE"
+        return 0
+    fi
+    # Already healthy on every branch that has matching hardware → bail out.
+    if { (( CORAL_PCIE_COUNT == 0 )) || $CORAL_PCIE_INSTALLED; } \
+        && { (( CORAL_USB_COUNT == 0 )) || $CORAL_USB_INSTALLED; }; then
+        echo "already healthy — no-op" >>"$LOG_FILE"
+        return 0
+    fi
+
+    export DEBIAN_FRONTEND=noninteractive
+
+    if (( CORAL_PCIE_COUNT > 0 )) && ! $CORAL_PCIE_INSTALLED; then
+        echo "Installing gasket+apex DKMS modules..." >>"$LOG_FILE"
+        if ! install_gasket_apex_dkms >>"$LOG_FILE" 2>&1; then
+            echo "PCIe branch failed" >>"$LOG_FILE"
+            rc=2  # keep going so the USB branch still gets its chance
+        fi
+    fi
+    if (( CORAL_USB_COUNT > 0 )) && ! $CORAL_USB_INSTALLED; then
+        echo "Installing libedgetpu USB runtime..." >>"$LOG_FILE"
+        if ! install_libedgetpu_runtime >>"$LOG_FILE" 2>&1; then
+            echo "USB branch failed" >>"$LOG_FILE"
+            rc=2
+        fi
+    fi
+    return "$rc"
+}
+
+if [[ "${1:-}" == "--auto-reinstall" ]]; then
+    auto_reinstall_from_state
+    exit $?
+fi
+
 main
diff --git a/scripts/gpu_tpu/intel_gpu_tools.sh b/scripts/gpu_tpu/intel_gpu_tools.sh
index a0d56106..38e716cf 100644
--- a/scripts/gpu_tpu/intel_gpu_tools.sh
+++ b/scripts/gpu_tpu/intel_gpu_tools.sh
@@ -194,7 +194,38 @@ main() {
     fi
 }
 
-# Run main function
+# ==========================================================
+# Non-interactive auto-reinstall (post-restore hook)
+# ==========================================================
+# Called from apply_cluster_postboot.sh (--auto-reinstall) when
+# components_status.json marks intel_gpu_tools as installed.
+# Reinstalls the intel-gpu-tools package only if dpkg no longer
+# knows it.
+#
+# Exit codes:
+#   0  nothing to do (not marked installed, or already present)
+#   1  jq or components_status.json unavailable
+#   *  otherwise the exit status of install_intel_gpu_tools
+auto_reinstall_from_state() {
+    : >"$LOG_FILE"
+    echo "=== intel_gpu_tools auto_reinstall $(date -Iseconds) ===" >>"$LOG_FILE"
+    command -v jq >/dev/null 2>&1 || return 1
+    [[ -f "$COMPONENTS_STATUS_FILE" ]] || return 1
+    local s
+    s=$(jq -r '.intel_gpu_tools.status // ""' "$COMPONENTS_STATUS_FILE" 2>/dev/null)
+    [[ "$s" == "installed" ]] || { echo "not installed in state ($s)" >>"$LOG_FILE"; return 0; }
+    if dpkg -s intel-gpu-tools >/dev/null 2>&1; then
+        echo "already present — no-op" >>"$LOG_FILE"; return 0
+    fi
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update -qq >>"$LOG_FILE" 2>&1
+    install_intel_gpu_tools
+}
+
 if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
+    if [[ "${1:-}" == "--auto-reinstall" ]]; then
+        auto_reinstall_from_state
+        exit $?
+    fi
     main
 fi
\ No newline at end of file
diff --git a/scripts/gpu_tpu/nvidia_installer.sh b/scripts/gpu_tpu/nvidia_installer.sh
index 7241383f..c827aa78 100644
--- a/scripts/gpu_tpu/nvidia_installer.sh
+++ b/scripts/gpu_tpu/nvidia_installer.sh
@@ -1565,6 +1565,98 @@ main() {
     esac
 }
 
+# ==========================================================
+# Non-interactive auto-reinstall entry point
+# ==========================================================
+# Invoked after a host-config restore by apply_cluster_postboot.sh
+# when components_status.json reports nvidia_driver as installed
+# but the kernel module isn't loaded on the live system (i.e. the
+# restore brought back the configs but not the binary driver from
+# /lib/modules/$(uname -r)/). Replays the install path the user
+# originally ran via `menu → 2`, using the recorded version, with
+# no dialogs.
+#
+# Exit codes:
+#   0  installed, or no-op (not marked installed / GPU absent / driver already present)
+#   1  jq or state file unavailable, or no driver version recorded
+#   2  download or install failed
+auto_reinstall_from_state() {
+    : >"$LOG_FILE"
+    echo "=== auto_reinstall_from_state started $(date -Iseconds) ===" >>"$LOG_FILE"
+
+    if ! command -v jq >/dev/null 2>&1; then
+        echo "jq not available — cannot read components_status.json" | tee -a "$LOG_FILE"
+        return 1
+    fi
+    if [[ ! -f "$COMPONENTS_STATUS_FILE" ]]; then
+        echo "No components_status.json at $COMPONENTS_STATUS_FILE" | tee -a "$LOG_FILE"
+        return 1
+    fi
+
+    local recorded_status recorded_version
+    recorded_status=$(jq -r '.nvidia_driver.status // ""' "$COMPONENTS_STATUS_FILE" 2>/dev/null)
+    recorded_version=$(jq -r '.nvidia_driver.version // ""' "$COMPONENTS_STATUS_FILE" 2>/dev/null)
+
+    if [[ "$recorded_status" != "installed" ]]; then
+        echo "nvidia_driver not marked installed in state ($recorded_status) — nothing to do" | tee -a "$LOG_FILE"
+        return 0
+    fi
+    if [[ -z "$recorded_version" || "$recorded_version" == "null" ]]; then
+        echo "nvidia_driver marked installed but no version recorded — aborting" | tee -a "$LOG_FILE"
+        return 1
+    fi
+    echo "Recorded driver: $recorded_version" >>"$LOG_FILE"
+
+    detect_nvidia_gpus
+    if ! $NVIDIA_GPU_PRESENT; then
+        echo "No NVIDIA GPU detected on this host — skipping reinstall" | tee -a "$LOG_FILE"
+        return 0
+    fi
+    detect_driver_status
+    if $CURRENT_DRIVER_INSTALLED && [[ "$CURRENT_DRIVER_VERSION" == "$recorded_version" ]]; then
+        echo "Driver $recorded_version already installed and matches state — no-op" | tee -a "$LOG_FILE"
+        return 0
+    fi
+
+    DRIVER_VERSION="$recorded_version"
+
+    # Same install path as the interactive main() flow, minus all
+    # dialogs and confirmations.
+    echo "Reinstalling NVIDIA driver $DRIVER_VERSION non-interactively..." | tee -a "$LOG_FILE"
+    ensure_workdir
+    ensure_repos_and_headers >>"$LOG_FILE" 2>&1
+    blacklist_nouveau >>"$LOG_FILE" 2>&1
+    ensure_modules_config >>"$LOG_FILE" 2>&1
+
+    if $CURRENT_DRIVER_INSTALLED; then
+        echo "Different version currently installed; cleaning up first..." | tee -a "$LOG_FILE"
+        complete_nvidia_uninstall >>"$LOG_FILE" 2>&1
+    fi
+
+    if ! download_nvidia_installer >>"$LOG_FILE" 2>&1; then
+        echo "Download failed — see $LOG_FILE" | tee -a "$LOG_FILE"
+        return 2
+    fi
+    if ! run_nvidia_installer >>"$LOG_FILE" 2>&1; then
+        echo "Install failed — see $LOG_FILE" | tee -a "$LOG_FILE"
+        return 2
+    fi
+    install_udev_rules_and_persistenced >>"$LOG_FILE" 2>&1
+
+    # Record success — overwrites whatever the restore put there
+    # (same version key, fresh timestamp).
+    if declare -F update_component_status >/dev/null 2>&1; then
+        update_component_status "nvidia_driver" "installed" "$DRIVER_VERSION" "gpu" '{"patched":false}' >>"$LOG_FILE" 2>&1
+    fi
+
+    echo "✓ NVIDIA driver $DRIVER_VERSION reinstalled" | tee -a "$LOG_FILE"
+    return 0
+}
+
 if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
+    if [[ "${1:-}" == "--auto-reinstall" ]]; then
+        auto_reinstall_from_state
+        exit $?
+    fi
     main
-fi
+fi
\ No newline at end of file