From 415bc439bb23487b6fa3043aaa03dbf650f3c0c4 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Fri, 17 Apr 2026 18:24:07 +0200 Subject: [PATCH] Update nvidia_update.sh --- scripts/gpu_tpu/nvidia_update.sh | 930 ++++++++++++++++++++++--------- 1 file changed, 663 insertions(+), 267 deletions(-) diff --git a/scripts/gpu_tpu/nvidia_update.sh b/scripts/gpu_tpu/nvidia_update.sh index c54baea6..09d10e4e 100644 --- a/scripts/gpu_tpu/nvidia_update.sh +++ b/scripts/gpu_tpu/nvidia_update.sh @@ -3,22 +3,37 @@ # ================================================ # Author : MacRimi # License : MIT -# Version : 1.0 -# Last Updated: 01/04/2026 +# Version : 2.0 +# Last Updated: 17/04/2026 # ================================================ +# +# Aligned with nvidia_installer.sh (host install flow & kernel filter) +# and add_gpu_lxc.sh (LXC userspace install flow with distro + memory +# awareness and visible progress output). + +SCRIPT_TITLE="NVIDIA Driver Update (Host + LXC)" LOCAL_SCRIPTS="/usr/local/share/proxmenux/scripts" BASE_DIR="/usr/local/share/proxmenux" UTILS_FILE="$BASE_DIR/utils.sh" +COMPONENTS_STATUS_FILE="$BASE_DIR/components_status.json" LOG_FILE="/tmp/nvidia_update.log" +screen_capture="/tmp/proxmenux_nvidia_update_screen_capture_$$.txt" NVIDIA_BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64" NVIDIA_WORKDIR="/opt/nvidia" +export BASE_DIR +export COMPONENTS_STATUS_FILE + if [[ -f "$UTILS_FILE" ]]; then source "$UTILS_FILE" fi +if [[ ! -f "$COMPONENTS_STATUS_FILE" ]]; then + echo "{}" > "$COMPONENTS_STATUS_FILE" +fi + load_language initialize_cache @@ -77,6 +92,94 @@ detect_host_nvidia() { } +# ============================================================ +# Kernel compatibility (mirror of nvidia_installer.sh) +# ============================================================ +get_kernel_compatibility_info() { + local kernel_version + kernel_version=$(uname -r) + + KERNEL_MAJOR=$(echo "$kernel_version" | cut -d. -f1) + KERNEL_MINOR=$(echo "$kernel_version" | cut -d. -f2) + + # Minimum driver version matrix (keyed to kernel series) — based on + # https://docs.nvidia.com/datacenter/tesla/drivers/index.html + if [[ "$KERNEL_MAJOR" -ge 6 ]] && [[ "$KERNEL_MINOR" -ge 17 ]]; then + MIN_DRIVER_VERSION="580.82.07" # PVE 9.x + elif [[ "$KERNEL_MAJOR" -ge 6 ]] && [[ "$KERNEL_MINOR" -ge 8 ]]; then + MIN_DRIVER_VERSION="550" # PVE 8.2+ + elif [[ "$KERNEL_MAJOR" -ge 6 ]]; then + MIN_DRIVER_VERSION="535" # PVE 8.x initial + elif [[ "$KERNEL_MAJOR" -eq 5 ]] && [[ "$KERNEL_MINOR" -ge 15 ]]; then + MIN_DRIVER_VERSION="470" # PVE 7.x / 8.x legacy + else + MIN_DRIVER_VERSION="450" # Old kernels + fi +} + +is_version_compatible() { + local version="$1" + local ver_major ver_minor ver_patch + + ver_major=$(echo "$version" | cut -d. -f1) + ver_minor=$(echo "$version" | cut -d. -f2) + ver_patch=$(echo "$version" | cut -d. -f3) + + if [[ "$MIN_DRIVER_VERSION" == "580.82.07" ]]; then + if [[ ${ver_major} -gt 580 ]]; then + return 0 + elif [[ ${ver_major} -eq 580 ]]; then + if [[ $((10#${ver_minor})) -gt 82 ]]; then + return 0 + elif [[ $((10#${ver_minor})) -eq 82 ]]; then + if [[ $((10#${ver_patch:-0})) -ge 7 ]]; then + return 0 + fi + fi + fi + return 1 + fi + + if [[ ${ver_major} -ge ${MIN_DRIVER_VERSION} ]]; then + return 0 + else + return 1 + fi +} + +version_le() { + local v1="$1" + local v2="$2" + + IFS='.' read -r a1 b1 c1 <<<"$v1" + IFS='.' read -r a2 b2 c2 <<<"$v2" + + a1=${a1:-0}; b1=${b1:-0}; c1=${c1:-0} + a2=${a2:-0}; b2=${b2:-0}; c2=${c2:-0} + + a1=$((10#$a1)); b1=$((10#$b1)); c1=$((10#$c1)) + a2=$((10#$a2)); b2=$((10#$b2)); c2=$((10#$c2)) + + if (( a1 < a2 )); then + return 0 + elif (( a1 > a2 )); then + return 1 + fi + + if (( b1 < b2 )); then + return 0 + elif (( b1 > b2 )); then + return 1 + fi + + if (( c1 <= c2 )); then + return 0 + else + return 1 + fi +} + + # ============================================================ # LXC containers with NVIDIA passthrough # ============================================================ @@ -144,7 +247,7 @@ get_latest_version() { # ============================================================ -# Version selection menu +# Version selection menu (filtered by kernel compatibility) # ============================================================ select_target_version() { local latest versions_list @@ -163,23 +266,54 @@ select_target_version() { [[ -z "$versions_list" ]] && versions_list="$latest" latest=$(echo "$latest" | tr -d '[:space:]') + # Kernel compatibility filter + local current_list="$versions_list" + if [[ -n "$MIN_DRIVER_VERSION" ]]; then + local filtered_list="" + while IFS= read -r ver; do + [[ -z "$ver" ]] && continue + if is_version_compatible "$ver"; then + filtered_list+="$ver"$'\n' + fi + done <<< "$current_list" + current_list="$filtered_list" + fi + + # Cap at latest (avoid showing beta branches newer than published latest) + if [[ -n "$latest" ]]; then + local filtered_max_list="" + while IFS= read -r ver; do + [[ -z "$ver" ]] && continue + if version_le "$ver" "$latest"; then + filtered_max_list+="$ver"$'\n' + fi + done <<< "$current_list" + current_list="$filtered_max_list" + fi + + local menu_text + menu_text="\n$(translate 'Current host version:') ${HOST_NVIDIA_VERSION}\n" + menu_text+="$(translate 'Kernel:') $(uname -r)\n\n" + menu_text+="$(translate 'Select the target version to install on host and all affected LXCs:')\n" + menu_text+="$(translate 'Versions shown are compatible with your running kernel.')" + local choices=() choices+=("latest" "$(translate 'Latest available') (${latest:-?})") choices+=("" "") - while IFS= read -r ver; do - ver=$(echo "$ver" | tr -d '[:space:]') - [[ -z "$ver" ]] && continue - choices+=("$ver" "$ver") - done <<< "$versions_list" - - local menu_text - menu_text="\n$(translate 'Current host version:') ${HOST_NVIDIA_VERSION}\n" - menu_text+="$(translate 'Select the target version to install on host and all affected LXCs:')" + if [[ -n "$current_list" ]]; then + while IFS= read -r ver; do + ver=$(echo "$ver" | tr -d '[:space:]') + [[ -z "$ver" ]] && continue + choices+=("$ver" "$ver") + done <<< "$current_list" + else + choices+=("" "$(translate 'No compatible versions found for your kernel')") + fi TARGET_VERSION=$(dialog --backtitle "ProxMenux" \ --title "$(translate 'NVIDIA Driver Version')" \ - --menu "$menu_text" 26 80 16 \ + --menu "$menu_text" 28 80 16 \ "${choices[@]}" \ 2>&1 >/dev/tty) || exit 0 @@ -192,243 +326,6 @@ select_target_version() { } -# ============================================================ -# Update NVIDIA userspace libs inside a single LXC -# ============================================================ -update_lxc_nvidia() { - local ctid="$1" - local version="$2" - local was_running=false - - # Capture old version before update - local old_version - old_version=$(get_lxc_nvidia_version "$ctid") - - if pct status "$ctid" 2>/dev/null | grep -q "running"; then - was_running=true - else - msg_info "$(translate 'Starting container') ${ctid}..." - pct start "$ctid" >>"$LOG_FILE" 2>&1 || true - local ready=false - for _ in {1..15}; do - sleep 2 - pct exec "$ctid" -- true >/dev/null 2>&1 && ready=true && break - done - if ! $ready; then - msg_warn "$(translate 'Container') ${ctid} $(translate 'did not start. Skipping.')" - return 1 - fi - msg_ok "$(translate 'Container') ${ctid} $(translate 'started.')" - fi - - msg_info "$(translate 'Updating NVIDIA libs in container') ${ctid}..." - - local run_file="${NVIDIA_WORKDIR}/NVIDIA-Linux-x86_64-${version}.run" - - if [[ ! -f "$run_file" ]]; then - msg_warn "$(translate 'Installer not found:') ${run_file} — $(translate 'skipping container') ${ctid}" - if [[ "$was_running" == "false" ]]; then pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true; fi - return 1 - fi - - # Extract .run on the host to avoid decompression failures inside the container - local extract_dir="${NVIDIA_WORKDIR}/extracted_${version}" - local archive="/tmp/nvidia_lxc_${version}.tar.gz" - - msg_info "$(translate 'Extracting NVIDIA installer on host...')" - rm -rf "$extract_dir" - if ! sh "$run_file" --extract-only --target "$extract_dir" >>"$LOG_FILE" 2>&1; then - msg_warn "$(translate 'Extraction failed. Check log:') ${LOG_FILE}" - if [[ "$was_running" == "false" ]]; then pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true; fi - return 1 - fi - msg_ok "$(translate 'Extracted.')" - - msg_info "$(translate 'Packing and copying installer to container') ${ctid}..." - tar -czf "$archive" -C "$extract_dir" . >>"$LOG_FILE" 2>&1 - if ! pct push "$ctid" "$archive" /tmp/nvidia_lxc.tar.gz >>"$LOG_FILE" 2>&1; then - msg_warn "$(translate 'pct push failed. Check log:') ${LOG_FILE}" - rm -f "$archive" - if [[ "$was_running" == "false" ]]; then pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true; fi - return 1 - fi - rm -f "$archive" - msg_ok "$(translate 'Installer copied to container.')" - - msg_info2 "$(translate 'Starting NVIDIA installer in container') ${ctid}. $(translate 'This may take several minutes...')" - echo "" >>"$LOG_FILE" - pct exec "$ctid" -- bash -c " - mkdir -p /tmp/nvidia_lxc_install - tar -xzf /tmp/nvidia_lxc.tar.gz -C /tmp/nvidia_lxc_install 2>&1 - /tmp/nvidia_lxc_install/nvidia-installer \ - --no-kernel-modules \ - --no-questions \ - --ui=none \ - --no-nouveau-check \ - --no-dkms - EXIT=\$? - rm -rf /tmp/nvidia_lxc_install /tmp/nvidia_lxc.tar.gz - exit \$EXIT - " 2>&1 | tee -a "$LOG_FILE" - local rc=${PIPESTATUS[0]} - - rm -rf "$extract_dir" - - if [[ $rc -ne 0 ]]; then - msg_warn "$(translate 'NVIDIA installer returned error') ${rc}. $(translate 'Check log:') ${LOG_FILE}" - if [[ "$was_running" == "false" ]]; then pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true; fi - return 1 - fi - - msg_ok "$(translate 'Container') ${ctid}: ${old_version} → ${version}" - msg_info2 "$(translate 'NVIDIA driver verification in container') ${ctid}:" - pct exec "$ctid" -- nvidia-smi 2>/dev/null || true - - if [[ "$was_running" == "false" ]]; then - msg_info "$(translate 'Stopping container') ${ctid}..." - pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true - msg_ok "$(translate 'Container stopped.')" - fi -} - - -# ============================================================ -# Host NVIDIA update -# ============================================================ -_stop_nvidia_services() { - for svc in nvidia-persistenced.service nvidia-powerd.service; do - systemctl is-active --quiet "$svc" 2>/dev/null && systemctl stop "$svc" >/dev/null 2>&1 || true - systemctl is-enabled --quiet "$svc" 2>/dev/null && systemctl disable "$svc" >/dev/null 2>&1 || true - done -} - -_unload_nvidia_modules() { - for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do - modprobe -r "$mod" >/dev/null 2>&1 || true - done - # Second pass for stubborn modules - for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do - modprobe -r --force "$mod" >/dev/null 2>&1 || true - done -} - -_purge_nvidia_host() { - msg_info "$(translate 'Uninstalling current NVIDIA driver from host...')" - - _stop_nvidia_services - _unload_nvidia_modules - - command -v nvidia-uninstall >/dev/null 2>&1 \ - && nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true - - # Remove DKMS entries - local dkms_versions - dkms_versions=$(dkms status 2>/dev/null | awk -F, '/nvidia/ {gsub(/ /,"",$2); print $2}' || true) - while IFS= read -r ver; do - [[ -z "$ver" ]] && continue - dkms remove -m nvidia -v "$ver" --all >/dev/null 2>&1 || true - done <<< "$dkms_versions" - - apt-get -y purge 'nvidia-*' 'libnvidia-*' 'cuda-*' >>"$LOG_FILE" 2>&1 || true - apt-get -y autoremove --purge >>"$LOG_FILE" 2>&1 || true - - rm -f /etc/udev/rules.d/70-nvidia.rules - rm -f /etc/modprobe.d/nvidia*.conf /usr/lib/modprobe.d/nvidia*.conf - - msg_ok "$(translate 'Current NVIDIA driver removed from host.')" -} - -_download_installer() { - local version="$1" - local run_file="${NVIDIA_WORKDIR}/NVIDIA-Linux-x86_64-${version}.run" - - mkdir -p "$NVIDIA_WORKDIR" - - # Reuse cached file if valid - local existing_size - existing_size=$(stat -c%s "$run_file" 2>/dev/null || echo "0") - if [[ -f "$run_file" ]] && [[ "$existing_size" -gt 40000000 ]]; then - if file "$run_file" 2>/dev/null | grep -q "executable"; then - msg_ok "$(translate 'Installer already cached.')" - echo "$run_file" - return 0 - fi - fi - rm -f "$run_file" - - msg_info "$(translate 'Downloading NVIDIA driver') ${version}..." - - local urls=( - "${NVIDIA_BASE_URL}/${version}/NVIDIA-Linux-x86_64-${version}.run" - "${NVIDIA_BASE_URL}/${version}/NVIDIA-Linux-x86_64-${version}-no-compat32.run" - ) - - local ok=false - for url in "${urls[@]}"; do - if curl -fL --connect-timeout 30 --max-time 600 "$url" -o "$run_file" >>"$LOG_FILE" 2>&1; then - local sz - sz=$(stat -c%s "$run_file" 2>/dev/null || echo "0") - if [[ "$sz" -gt 40000000 ]] && file "$run_file" 2>/dev/null | grep -q "executable"; then - ok=true - break - fi - fi - rm -f "$run_file" - done - - if ! $ok; then - msg_error "$(translate 'Download failed. Check /tmp/nvidia_update.log')" - exit 1 - fi - - chmod +x "$run_file" - msg_ok "$(translate 'Download complete.')" - echo "$run_file" -} - -_run_installer() { - local installer="$1" - local tmp_dir="${NVIDIA_WORKDIR}/tmp_extract" - mkdir -p "$tmp_dir" - - msg_info "$(translate 'Installing NVIDIA driver on host. This may take several minutes...')" - - sh "$installer" \ - --tmpdir="$tmp_dir" \ - --no-questions \ - --ui=none \ - --disable-nouveau \ - --no-nouveau-check \ - --dkms \ - >>"$LOG_FILE" 2>&1 - local rc=$? - - rm -rf "$tmp_dir" - - if [[ $rc -ne 0 ]]; then - msg_error "$(translate 'NVIDIA installer failed. Check /tmp/nvidia_update.log')" - exit 1 - fi - - msg_ok "$(translate 'NVIDIA driver installed on host.')" -} - -update_host_nvidia() { - local version="$1" - - _purge_nvidia_host - - local installer - installer=$(_download_installer "$version") - - _run_installer "$installer" - - msg_info "$(translate 'Updating initramfs...')" - update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true - msg_ok "$(translate 'initramfs updated.')" -} - - # ============================================================ # Overview dialog (current state) # ============================================================ @@ -436,18 +333,18 @@ show_current_state_dialog() { find_nvidia_containers local info - info="\n$(translate 'Host NVIDIA driver:') ${HOST_NVIDIA_VERSION}\n\n" + info="\n$(translate 'Host NVIDIA driver:') ${HOST_NVIDIA_VERSION}\n" + info+="$(translate 'Kernel:') $(uname -r)\n\n" if [[ ${#NVIDIA_CONTAINERS[@]} -eq 0 ]]; then info+="$(translate 'No LXC containers with NVIDIA passthrough found.')\n" else info+="$(translate 'LXC containers with NVIDIA passthrough:')\n\n" for ctid in "${NVIDIA_CONTAINERS[@]}"; do - local lxc_ver + local lxc_ver ct_name lxc_ver=$(get_lxc_nvidia_version "$ctid") - local ct_name ct_name=$(pct config "$ctid" 2>/dev/null | grep "^hostname:" | awk '{print $2}') - info+=" CT ${ctid} ${ct_name:+(${ct_name})} — libcuda1: ${lxc_ver}\n" + info+=" CT ${ctid} ${ct_name:+(${ct_name})} — $(translate 'driver:') ${lxc_ver}\n" done fi @@ -461,30 +358,488 @@ show_current_state_dialog() { } +# ============================================================ +# System preparation (mirror of nvidia_installer.sh) +# ============================================================ +ensure_repos_and_headers() { + msg_info "$(translate 'Checking kernel headers and build tools...')" + + local kver + kver=$(uname -r) + + apt-get update -qq >>"$LOG_FILE" 2>&1 + + if ! dpkg -s "pve-headers-$kver" >/dev/null 2>&1 && \ + ! dpkg -s "proxmox-headers-$kver" >/dev/null 2>&1; then + apt-get install -y "pve-headers-$kver" "proxmox-headers-$kver" build-essential dkms >>"$LOG_FILE" 2>&1 || true + else + apt-get install -y build-essential dkms >>"$LOG_FILE" 2>&1 || true + fi + + msg_ok "$(translate 'Kernel headers and build tools verified.')" | tee -a "$screen_capture" +} + + +# ============================================================ +# Host NVIDIA cleanup (before update) +# ============================================================ +_stop_nvidia_services() { + local services=( + "nvidia-persistenced.service" + "nvidia-persistenced" + "nvidia-powerd.service" + ) + + local services_detected=0 + for service in "${services[@]}"; do + if systemctl is-active --quiet "$service" 2>/dev/null || \ + systemctl is-enabled --quiet "$service" 2>/dev/null; then + services_detected=1 + break + fi + done + + if [ "$services_detected" -eq 1 ]; then + msg_info "$(translate 'Stopping and disabling NVIDIA services...')" + for service in "${services[@]}"; do + systemctl is-active --quiet "$service" 2>/dev/null && systemctl stop "$service" >/dev/null 2>&1 || true + systemctl is-enabled --quiet "$service" 2>/dev/null && systemctl disable "$service" >/dev/null 2>&1 || true + done + sleep 2 + msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture" + fi +} + +_unload_nvidia_modules() { + msg_info "$(translate 'Unloading NVIDIA kernel modules...')" + + for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do + modprobe -r "$mod" >/dev/null 2>&1 || true + done + + if lsmod | grep -qi '\bnvidia'; then + for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do + modprobe -r --force "$mod" >/dev/null 2>&1 || true + done + fi + + if lsmod | grep -qi '\bnvidia'; then + msg_warn "$(translate 'Some NVIDIA modules could not be unloaded. Update may fail. Ensure no processes are using the GPU.')" + else + msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture" + fi +} + +cleanup_nvidia_dkms() { + local versions + versions=$(dkms status 2>/dev/null | awk -F, '/nvidia/ {gsub(/ /,"",$2); print $2}' || true) + [[ -z "$versions" ]] && return 0 + + msg_info "$(translate 'Removing NVIDIA DKMS entries...')" + while IFS= read -r ver; do + [[ -z "$ver" ]] && continue + dkms remove -m nvidia -v "$ver" --all >/dev/null 2>&1 || true + done <<< "$versions" + msg_ok "$(translate 'NVIDIA DKMS entries removed.')" | tee -a "$screen_capture" +} + +_purge_nvidia_host() { + msg_info2 "$(translate 'Preparing host for driver update...')" + + _stop_nvidia_services + _unload_nvidia_modules + + if command -v nvidia-uninstall >/dev/null 2>&1; then + msg_info "$(translate 'Running nvidia-uninstall...')" + nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true + msg_ok "$(translate 'nvidia-uninstall completed.')" | tee -a "$screen_capture" + fi + + cleanup_nvidia_dkms + + msg_info "$(translate 'Purging NVIDIA packages...')" + apt-get -y purge 'nvidia-*' 'libnvidia-*' 'cuda-*' 'libcudnn*' >>"$LOG_FILE" 2>&1 || true + apt-get -y autoremove --purge >>"$LOG_FILE" 2>&1 || true + msg_ok "$(translate 'NVIDIA packages purged.')" | tee -a "$screen_capture" + + # Remove stale udev / modprobe files so the new installer can write fresh ones + rm -f /etc/udev/rules.d/70-nvidia.rules + rm -f /etc/modprobe.d/nvidia*.conf /usr/lib/modprobe.d/nvidia*.conf +} + + +# ============================================================ +# Download installer (with integrity check — mirror of installer) +# ============================================================ +ensure_workdir() { + mkdir -p "$NVIDIA_WORKDIR" +} + +verify_version_exists() { + local version="$1" + local url="${NVIDIA_BASE_URL}/${version}/" + if curl -fsSL --head "$url" >/dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +download_nvidia_installer() { + ensure_workdir + local version="$1" + version=$(echo "$version" | tr -d '[:space:]' | tr -d '\n' | tr -d '\r') + + if [[ ! "$version" =~ ^[0-9]+\.[0-9]+(\.[0-9]+)?$ ]]; then + msg_error "Invalid version format: $version" >&2 + echo "ERROR: Invalid version format: '$version'" >> "$LOG_FILE" + return 1 + fi + + local run_file="$NVIDIA_WORKDIR/NVIDIA-Linux-x86_64-${version}.run" + + if [[ -f "$run_file" ]]; then + local existing_size file_type + existing_size=$(stat -c%s "$run_file" 2>/dev/null || echo "0") + file_type=$(file "$run_file" 2>/dev/null || echo "unknown") + + if [[ $existing_size -gt 40000000 ]] && echo "$file_type" | grep -q "executable"; then + if sh "$run_file" --check 2>&1 | tee -a "$LOG_FILE" | grep -q "OK"; then + msg_ok "$(translate 'Installer already downloaded and verified.')" | tee -a "$screen_capture" >&2 + printf '%s\n' "$run_file" + return 0 + else + msg_warn "$(translate 'Existing file failed verification, re-downloading...')" >&2 + rm -f "$run_file" + fi + else + msg_warn "$(translate 'Removing invalid existing file...')" >&2 + rm -f "$run_file" + fi + fi + + if ! verify_version_exists "$version"; then + msg_error "Version $version does not exist on NVIDIA servers" >&2 + return 1 + fi + + msg_info "$(translate 'Downloading NVIDIA driver') ${version}..." >&2 + + local urls=( + "${NVIDIA_BASE_URL}/${version}/NVIDIA-Linux-x86_64-${version}.run" + "${NVIDIA_BASE_URL}/${version}/NVIDIA-Linux-x86_64-${version}-no-compat32.run" + ) + + local success=false + for url in "${urls[@]}"; do + rm -f "$run_file" + if curl -fL --connect-timeout 30 --max-time 600 "$url" -o "$run_file" >> "$LOG_FILE" 2>&1; then + [[ ! -f "$run_file" ]] && continue + local file_size file_type + file_size=$(stat -c%s "$run_file" 2>/dev/null || echo "0") + file_type=$(file "$run_file" 2>/dev/null) + if [[ $file_size -gt 40000000 ]] && echo "$file_type" | grep -q "executable"; then + success=true + break + fi + rm -f "$run_file" + fi + done + + if ! $success; then + msg_error "$(translate 'Download failed. Check') ${LOG_FILE}" >&2 + return 1 + fi + + chmod +x "$run_file" + msg_ok "$(translate 'Download complete.')" | tee -a "$screen_capture" >&2 + printf '%s\n' "$run_file" +} + + +# ============================================================ +# Host installer run (visible output — mirror of installer) +# ============================================================ +run_host_installer() { + local installer="$1" + local tmp_extract_dir="$NVIDIA_WORKDIR/tmp_extract" + mkdir -p "$tmp_extract_dir" + + msg_info2 "$(translate 'Starting NVIDIA installer on host. This may take several minutes...')" + echo "" >>"$LOG_FILE" + echo "=== Running NVIDIA installer: $installer ===" >>"$LOG_FILE" + + sh "$installer" \ + --tmpdir="$tmp_extract_dir" \ + --no-questions \ + --ui=none \ + --disable-nouveau \ + --no-nouveau-check \ + --dkms \ + 2>&1 | tee -a "$LOG_FILE" + local rc=${PIPESTATUS[0]} + echo "" >>"$LOG_FILE" + + rm -rf "$tmp_extract_dir" + + if [[ $rc -ne 0 ]]; then + msg_error "$(translate 'NVIDIA installer reported an error. Check') ${LOG_FILE}" + update_component_status "nvidia_driver" "failed" "" "gpu" '{"patched":false}' + return 1 + fi + + msg_ok "$(translate 'NVIDIA driver installed on host.')" | tee -a "$screen_capture" + return 0 +} + + +# ============================================================ +# LXC NVIDIA update — aligned with add_gpu_lxc.sh::_install_nvidia_drivers +# ============================================================ +CT_ORIG_MEM="" +NVIDIA_INSTALL_MIN_MB=2048 +CT_WAS_STARTED_FOR_UPDATE=false + +_detect_container_distro() { + local distro + distro=$(pct exec "$1" -- grep "^ID=" /etc/os-release 2>/dev/null \ + | cut -d= -f2 | tr -d '[:space:]"') + echo "${distro:-unknown}" +} + +_ensure_container_memory() { + local ctid="$1" + local cur_mem + cur_mem=$(pct config "$ctid" 2>/dev/null | awk '/^memory:/{print $2}') + [[ -z "$cur_mem" ]] && cur_mem=512 + + if [[ "$cur_mem" -lt "$NVIDIA_INSTALL_MIN_MB" ]]; then + if whiptail --title "$(translate 'Low Container Memory')" --yesno \ + "$(translate 'Container') ${ctid} $(translate 'has') ${cur_mem}MB RAM.\n\n$(translate 'The NVIDIA installer needs at least') ${NVIDIA_INSTALL_MIN_MB}MB $(translate 'to run without being killed by the OOM killer.')\n\n$(translate 'Increase container RAM temporarily to') ${NVIDIA_INSTALL_MIN_MB}MB?" \ + 13 72; then + CT_ORIG_MEM="$cur_mem" + pct set "$ctid" -memory "$NVIDIA_INSTALL_MIN_MB" >>"$LOG_FILE" 2>&1 || true + else + msg_warn "$(translate 'Insufficient memory. Skipping LXC') ${ctid}." + return 1 + fi + fi + return 0 +} + +_restore_container_memory() { + local ctid="$1" + if [[ -n "$CT_ORIG_MEM" ]]; then + msg_info "$(translate 'Restoring container memory to') ${CT_ORIG_MEM}MB..." + pct set "$ctid" -memory "$CT_ORIG_MEM" >>"$LOG_FILE" 2>&1 || true + msg_ok "$(translate 'Memory restored.')" + CT_ORIG_MEM="" + fi +} + +start_container_and_wait() { + local ctid="$1" + msg_info "$(translate 'Starting container') ${ctid}..." + pct start "$ctid" >>"$LOG_FILE" 2>&1 || true + + local ready=false + for _ in {1..15}; do + sleep 2 + if pct exec "$ctid" -- true >/dev/null 2>&1; then + ready=true + break + fi + done + + if ! $ready; then + msg_warn "$(translate 'Container') ${ctid} $(translate 'did not become ready. Skipping.')" + return 1 + fi + msg_ok "$(translate 'Container') ${ctid} $(translate 'started.')" | tee -a "$screen_capture" + return 0 +} + +update_lxc_nvidia() { + local ctid="$1" + local version="$2" + CT_WAS_STARTED_FOR_UPDATE=false + + local old_version + old_version=$(get_lxc_nvidia_version "$ctid") + + msg_info2 "$(translate 'Container') ${ctid}: $(translate 'updating NVIDIA userspace libs') (${old_version} → ${version})" + + # Start the container if stopped (required for pct exec based install) + if ! pct status "$ctid" 2>/dev/null | grep -q "running"; then + CT_WAS_STARTED_FOR_UPDATE=true + if ! start_container_and_wait "$ctid"; then + return 1 + fi + fi + + # Detect distro (alpine / arch / debian-like) + msg_info "$(translate 'Detecting container OS...')" + local distro + distro=$(_detect_container_distro "$ctid") + msg_ok "$(translate 'Container OS:') ${distro}" | tee -a "$screen_capture" + + local install_rc=0 + + case "$distro" in + alpine) + # Alpine: musl — use apk nvidia-utils (repo-managed, no .run) + msg_info2 "$(translate 'Upgrading NVIDIA utils (Alpine)...')" + pct exec "$ctid" -- sh -c \ + "apk update && apk add --no-cache --upgrade nvidia-utils" \ + 2>&1 | tee -a "$LOG_FILE" + install_rc=${PIPESTATUS[0]} + ;; + + arch|manjaro|endeavouros) + msg_info2 "$(translate 'Upgrading NVIDIA utils (Arch)...')" + pct exec "$ctid" -- bash -c \ + "pacman -Syu --noconfirm nvidia-utils" \ + 2>&1 | tee -a "$LOG_FILE" + install_rc=${PIPESTATUS[0]} + ;; + + *) + # Debian / Ubuntu / generic glibc: use the host-cached .run binary + local run_file="${NVIDIA_WORKDIR}/NVIDIA-Linux-x86_64-${version}.run" + + if [[ ! -f "$run_file" ]]; then + msg_warn "$(translate 'Installer not found:') ${run_file}. $(translate 'Skipping LXC') ${ctid}." + install_rc=1 + else + # Memory check — nvidia-installer needs ~2GB during install + if ! _ensure_container_memory "$ctid"; then + install_rc=1 + else + # Disk space check — NVIDIA libs need ~1.5 GB free in the container + local free_mb + free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0) + if [[ "$free_mb" -lt 1500 ]]; then + _restore_container_memory "$ctid" + dialog --backtitle "ProxMenux" \ + --title "$(translate 'Insufficient Disk Space')" \ + --msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')\n\n$(translate 'Please expand the container disk and run this option again.')" \ + 12 72 + msg_warn "$(translate 'Insufficient disk space. Skipping LXC') ${ctid}." + install_rc=1 + else + # Extract .run on the host (avoids decompression OOM inside container) + local extract_dir="${NVIDIA_WORKDIR}/extracted_${version}" + local archive="/tmp/nvidia_lxc_${version}.tar.gz" + + msg_info2 "$(translate 'Extracting NVIDIA installer on host...')" + rm -rf "$extract_dir" + sh "$run_file" --extract-only --target "$extract_dir" 2>&1 | tee -a "$LOG_FILE" + if [[ ${PIPESTATUS[0]} -ne 0 ]]; then + msg_warn "$(translate 'Extraction failed. Check log:') ${LOG_FILE}" + _restore_container_memory "$ctid" + install_rc=1 + else + msg_ok "$(translate 'NVIDIA installer extracted.')" | tee -a "$screen_capture" + + msg_info2 "$(translate 'Packing installer archive...')" + tar --checkpoint=5000 --checkpoint-action=dot \ + -czf "$archive" -C "$extract_dir" . 2>&1 | tee -a "$LOG_FILE" + echo "" + local archive_size + archive_size=$(du -sh "$archive" 2>/dev/null | cut -f1) + msg_ok "$(translate 'Archive ready') (${archive_size})." | tee -a "$screen_capture" + + msg_info "$(translate 'Copying installer to container') ${ctid}..." + if ! pct push "$ctid" "$archive" /tmp/nvidia_lxc.tar.gz >>"$LOG_FILE" 2>&1; then + msg_warn "$(translate 'pct push failed. Check log:') ${LOG_FILE}" + rm -f "$archive" + rm -rf "$extract_dir" + _restore_container_memory "$ctid" + install_rc=1 + else + rm -f "$archive" + msg_ok "$(translate 'Installer copied to container.')" | tee -a "$screen_capture" + + msg_info2 "$(translate 'Running NVIDIA installer in container. This may take several minutes...')" + echo "" >>"$LOG_FILE" + pct exec "$ctid" -- bash -c " + mkdir -p /tmp/nvidia_lxc_install + tar -xzf /tmp/nvidia_lxc.tar.gz -C /tmp/nvidia_lxc_install 2>&1 + /tmp/nvidia_lxc_install/nvidia-installer \ + --no-kernel-modules \ + --no-questions \ + --ui=none \ + --no-nouveau-check \ + --no-dkms \ + --no-install-compat32-libs + EXIT=\$? + rm -rf /tmp/nvidia_lxc_install /tmp/nvidia_lxc.tar.gz + exit \$EXIT + " 2>&1 | tee -a "$LOG_FILE" + install_rc=${PIPESTATUS[0]} + + rm -rf "$extract_dir" + _restore_container_memory "$ctid" + fi + fi + fi + fi + fi + ;; + esac + + if [[ $install_rc -ne 0 ]]; then + msg_warn "$(translate 'NVIDIA update failed for LXC') ${ctid} ($(translate 'rc='))${install_rc}. $(translate 'Check log:') ${LOG_FILE}" + if [[ "$CT_WAS_STARTED_FOR_UPDATE" == "true" ]]; then + msg_info "$(translate 'Stopping container') ${ctid}..." + pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true + msg_ok "$(translate 'Container stopped.')" | tee -a "$screen_capture" + fi + return 1 + fi + + # Verify nvidia-smi inside the container + if pct exec "$ctid" -- sh -c "which nvidia-smi" >/dev/null 2>&1; then + local new_ver + new_ver=$(pct exec "$ctid" -- nvidia-smi \ + --query-gpu=driver_version --format=csv,noheader 2>/dev/null \ + | head -1 | tr -d '[:space:]' || true) + msg_ok "$(translate 'Container') ${ctid}: ${old_version} → ${new_ver:-$version}" | tee -a "$screen_capture" + else + msg_warn "$(translate 'nvidia-smi not found in container') ${ctid} $(translate 'after update.')" + fi + + if [[ "$CT_WAS_STARTED_FOR_UPDATE" == "true" ]]; then + msg_info "$(translate 'Stopping container') ${ctid}..." + pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true + msg_ok "$(translate 'Container stopped.')" | tee -a "$screen_capture" + fi + return 0 +} + + # ============================================================ # Restart prompt # ============================================================ restart_prompt() { - echo - msg_success "$(translate 'NVIDIA driver update completed.')" echo msg_info "$(translate 'Removing no longer required packages and purging old cached updates...')" apt-get -y autoremove >/dev/null 2>&1 apt-get -y autoclean >/dev/null 2>&1 - msg_ok "$(translate 'Cleanup finished.')" - echo -e "${TAB}${BL}Log: ${LOG_FILE}${CL}" - echo + msg_ok "$(translate 'Cleanup finished.')" | tee -a "$screen_capture" if whiptail --title "$(translate 'Reboot Required')" \ --yesno "$(translate 'The host driver update requires a reboot to take effect. Do you want to restart now?')" 10 70; then msg_success "$(translate 'Press Enter to continue...')" read -r msg_warn "$(translate 'Rebooting the system...')" + rm -f "$screen_capture" reboot else msg_info2 "$(translate 'You can reboot later manually.')" msg_success "$(translate 'Press Enter to continue...')" read -r + rm -f "$screen_capture" fi } @@ -494,10 +849,12 @@ restart_prompt() { # ============================================================ main() { : >"$LOG_FILE" + : >"$screen_capture" # ---- Phase 1: dialogs ---- check_gpu_not_in_vm_passthrough detect_host_nvidia + get_kernel_compatibility_info show_current_state_dialog select_target_version @@ -513,23 +870,62 @@ main() { # ---- Phase 2: processing ---- show_proxmenux_logo - msg_title "$(translate 'NVIDIA Driver Update')" + msg_title "$(translate "$SCRIPT_TITLE")" - # Download installer once — reused by both LXC containers and host - local run_file - run_file=$(_download_installer "$TARGET_VERSION") + ensure_repos_and_headers - # Update LXC containers first (no reboot needed for userspace libs) + # Download installer once — shared between LXC and host updates + local installer + installer=$(download_nvidia_installer "$TARGET_VERSION" 2>>"$LOG_FILE") + local download_result=$? + + if [[ $download_result -ne 0 || -z "$installer" || ! -f "$installer" ]]; then + msg_error "$(translate 'Failed to obtain NVIDIA installer. Check') ${LOG_FILE}" + rm -f "$screen_capture" + exit 1 + fi + + # Update LXCs first (userspace libs only — doesn't need a reboot) if [[ ${#NVIDIA_CONTAINERS[@]} -gt 0 ]]; then msg_info2 "$(translate 'Updating LXC containers...')" for ctid in "${NVIDIA_CONTAINERS[@]}"; do - update_lxc_nvidia "$ctid" "$TARGET_VERSION" + update_lxc_nvidia "$ctid" "$TARGET_VERSION" || true done fi - # Update host kernel module + drivers (reuses the already-downloaded installer) - update_host_nvidia "$TARGET_VERSION" + # Purge and reinstall host driver + _purge_nvidia_host + if ! run_host_installer "$installer"; then + rm -f "$screen_capture" + exit 1 + fi + + msg_info "$(translate 'Updating initramfs for all kernels...')" + update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true + msg_ok "$(translate 'initramfs updated.')" | tee -a "$screen_capture" + + # ---- Phase 3: summary ---- + sleep 2 + show_proxmenux_logo + msg_title "$(translate "$SCRIPT_TITLE")" + cat "$screen_capture" + echo -e "${TAB}${GN}📄 $(translate "Log file")${CL}: ${BL}$LOG_FILE${CL}" + + msg_info2 "$(translate 'Checking NVIDIA driver status with nvidia-smi')" + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi || true + local NEW_HOST_VERSION + NEW_HOST_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n1) + if [[ -n "$NEW_HOST_VERSION" ]]; then + msg_ok "$(translate 'NVIDIA driver') $NEW_HOST_VERSION $(translate 'installed successfully on host.')" + update_component_status "nvidia_driver" "installed" "$NEW_HOST_VERSION" "gpu" '{"patched":false}' + fi + else + msg_warn "$(translate 'nvidia-smi not found in PATH. Verify the update manually after reboot.')" + fi + + msg_success "$(translate 'NVIDIA driver update completed.')" restart_prompt }