Merge pull request #178 from MacRimi/develop

update notification_events.py
2026-06-15 04:47:00 +00:00 · 2026-04-19 02:03:53 +02:00
parent 44e92c8bf0 3e0b907138
commit 4e849d5309
2 changed files with 84 additions and 19 deletions
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -882,7 +882,15 @@ class JournalWatcher:
            smart_health = self._quick_smart_health(resolved)
            if smart_health != 'FAILED':
                return
-            
+
            # ── Persist observation (before the cooldown gate) ──
            # The 24h cooldown below only suppresses RE-notification; the
            # per-disk observations history must reflect every genuine
            # detection. The DB UPSERT dedups same-signature events via
            # occurrence_count, so calling this on every match is safe.
            # Aligns with the parallel path in HealthMonitor._check_disks_optimized.
            self._record_disk_io_observation(resolved, msg)
            # ── Gate 2: 24-hour dedup per device ──
            # Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
            # If user dismissed the error, _clear_disk_io_cooldown() removed the DB
@@ -986,6 +994,55 @@ class JournalWatcher:
        except Exception:
            return 'UNKNOWN'
    def _record_disk_io_observation(self, resolved: str, msg: str):
        """Persist a kernel-journal I/O error as a disk observation.

        The message is sorted into a coarse error family and stored under the
        signature ``io_<resolved>_<family>``. Signature classification mirrors
        HealthMonitor._make_io_obs_signature so observations from the
        real-time journal watcher and the periodic dmesg scan dedup into the
        same row (via the UPSERT on
        disk_registry_id + error_type + error_signature).

        Args:
            resolved: Device name as it appears under ``/dev`` (no prefix).
            msg: Raw kernel journal line that triggered the detection.

        Returns:
            None. Exceptions are caught and printed rather than propagated,
            so a persistence failure cannot break the caller.
        """
        try:
            from health_persistence import health_persistence

            lowered = msg.lower()

            # Ordered: the first family whose pattern matches wins, so the
            # sequence below is part of the signature contract.
            family_patterns = (
                ('ata_connection_error',
                 r'exception\s+emask|emask\s+0x|revalidation failed|'
                 r'hard resetting link|serror.*badcrc|comreset|'
                 r'link is slow|status.*drdy'),
                ('block_io_error',
                 r'i/o error|blk_update_request|medium error|sense key'),
                ('ata_failed_command',
                 r'failed command|fpdma queued'),
            )
            family = next(
                (name for name, pattern in family_patterns
                 if re.search(pattern, lowered)),
                'generic',
            )

            # Best-effort serial lookup so the observation survives device
            # renames (ata8 -> sdh, USB reconnects, etc.).
            serial = None
            try:
                probe = subprocess.run(
                    ['smartctl', '-i', f'/dev/{resolved}'],
                    capture_output=True, text=True, timeout=3)
                # Exit codes 0 and 4 are the only ones whose output is parsed.
                if probe.returncode in (0, 4):
                    for line in probe.stdout.splitlines():
                        if 'Serial Number' in line or 'Serial number' in line:
                            # Take everything after the FIRST colon so a value
                            # that itself contains ':' is kept whole; a line
                            # without a colon or with an empty value counts as
                            # "serial unknown" (None) rather than '' or the
                            # label text.
                            serial = line.partition(':')[2].strip() or None
                            break
            except Exception:
                # Deliberately swallowed: smartctl may be missing, time out or
                # fail on this device; the observation is still recorded
                # without a serial.
                pass

            health_persistence.record_disk_observation(
                device_name=resolved,
                serial=serial,
                error_type='io_error',
                error_signature=f'io_{resolved}_{family}',
                # Cap the stored message so one noisy line cannot bloat the row.
                raw_message=f'/dev/{resolved}: {msg.strip()[:200]}',
                severity='critical',
            )
        except Exception as e:
            print(f"[JournalWatcher] Error recording disk io observation: {e}")
    def _record_smartd_observation(self, title: str, message: str):
        """Extract device info from a smartd system-mail and record as disk observation."""
        try:
--- a/scripts/gpu_tpu/nvidia_installer.sh
+++ b/scripts/gpu_tpu/nvidia_installer.sh
@@ -95,7 +95,7 @@ detect_driver_status() {
  CURRENT_DRIVER_VERSION=""
  # First check if nvidia kernel module is actually loaded
-  if lsmod | grep -q "^nvidia "; then
+  if grep -q "^nvidia " /proc/modules 2>/dev/null; then
    modprobe nvidia-uvm 2>/dev/null || true
    sleep 1
@@ -273,7 +273,7 @@ update_lxc_nvidia() {
        free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0)
        if [[ "$free_mb" -lt 1500 ]]; then
          _restore_container_memory "$ctid"
-          dialog --backtitle "ProxMenux" \
+          whiptail --backtitle "ProxMenux" \
            --title "$(translate 'Insufficient Disk Space')" \
            --msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')" \
            11 72
@@ -381,7 +381,7 @@ offer_lxc_updates_if_any() {
  done
  info+="\n$(translate 'Do you want to update the NVIDIA userspace libraries inside these containers to match the host?')"
-  if ! hybrid_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
+  if ! hybrid_whiptail_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
    msg_info2 "$(translate 'LXC update skipped by user.')"
    return 0
  fi
@@ -427,13 +427,14 @@ options nouveau modeset=0
 EOF
  # Attempt to unload nouveau if currently loaded
-  if lsmod | grep -q "^nouveau "; then
+  if grep -q "^nouveau " /proc/modules 2>/dev/null; then
-    stop_spinner
+
    msg_info "$(translate 'Nouveau module is loaded, attempting to unload...')"
    modprobe -r nouveau 2>/dev/null || true
    sleep 1
    # Check if unload succeeded
-    if lsmod | grep -q "^nouveau "; then
+    if grep -q "^nouveau " /proc/modules 2>/dev/null; then
      NOUVEAU_STILL_LOADED=true
      msg_warn "$(translate 'Could not unload nouveau module (may be in use). The blacklist will take effect after reboot. Installation will continue but a reboot will be required.')"
      echo "WARNING: nouveau module still loaded after unload attempt" >> "$LOG_FILE"
@@ -445,6 +446,7 @@ EOF
    NOUVEAU_STILL_LOADED=false
    msg_ok "$(translate 'nouveau driver has been blacklisted.')" | tee -a "$screen_capture"
  fi
 }
 ensure_modules_config() {
@@ -488,7 +490,7 @@ stop_and_disable_nvidia_services() {
        systemctl disable "$service" >/dev/null 2>&1 || true
      fi
    done
-    
+
    sleep 2
    msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture"
@@ -496,36 +498,41 @@ stop_and_disable_nvidia_services() {
 }
 unload_nvidia_modules() {
  for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
    modprobe -r "$mod" >/dev/null 2>&1 || true
  done
  # Give the kernel a moment to finalize sysfs teardown before re-checking.
  # Reading /proc/modules directly (instead of lsmod) avoids the
  # "could not open /sys/module/<mod>/holders" race when a module has just
  # been removed from /proc/modules but its sysfs dir hasn't been reaped yet.
  sleep 1
-  if lsmod | grep -qi '\bnvidia'; then
+  if grep -q "^nvidia" /proc/modules 2>/dev/null; then
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
      modprobe -r --force "$mod" >/dev/null 2>&1 || true
    done
    sleep 1
  fi
-  if lsmod | grep -qi '\bnvidia'; then
+  if grep -q "^nvidia" /proc/modules 2>/dev/null; then
-    msg_warn "$(translate 'Some NVIDIA modules could not be unloaded. Installation may fail. Ensure no processes are using the GPU.')"
+
    if command -v lsof >/dev/null 2>&1; then
      echo "$(translate 'Processes using NVIDIA:'):" >> "$LOG_FILE"
      lsof /dev/nvidia* 2>/dev/null >> "$LOG_FILE" || true
    fi
  else
    msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture"
  fi
 }
 complete_nvidia_uninstall() {
  msg_info "$(translate 'Completing NVIDIA uninstallation...')"
  stop_and_disable_nvidia_services
  unload_nvidia_modules
  if command -v nvidia-uninstall >/dev/null 2>&1; then
-    #msg_info "$(translate 'Running NVIDIA uninstaller...')"
+    msg_info "$(translate 'Running NVIDIA uninstaller...')"
    nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true
    msg_ok "$(translate 'NVIDIA uninstaller completed.')"
  fi
@@ -546,11 +553,11 @@ complete_nvidia_uninstall() {
    find "$NVIDIA_WORKDIR" -type d -name "nvidia-persistenced" -exec rm -rf {} + 2>/dev/null || true
    find "$NVIDIA_WORKDIR" -type d -name "nvidia-patch" -exec rm -rf {} + 2>/dev/null || true
  fi
-  
+
  update_component_status "nvidia_driver" "removed" "" "gpu" '{}'
  msg_ok "$(translate 'Complete NVIDIA uninstallation finished.')" | tee -a "$screen_capture"
-  stop_spinner
+  
 }
 cleanup_nvidia_dkms() {
@@ -786,7 +793,7 @@ download_nvidia_installer() {
        return 0
      else
        echo "Existing file FAILED integrity check, removing..." >> "$LOG_FILE"
-        msg_warn "$(translate 'Existing file failed verification, re-downloading...')" >&2
+        msg_warn "$(translate 'Existing file, re-downloading...')" >&2
        rm -f "$run_file"
      fi
    else
@@ -917,7 +924,8 @@ run_nvidia_installer() {
    update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true
    # Try one more time to unload nouveau after initramfs rebuild
    modprobe -r nouveau 2>/dev/null || true
-    if lsmod | grep -q "^nouveau "; then
+    sleep 1
    if grep -q "^nouveau " /proc/modules 2>/dev/null; then
      echo "WARNING: nouveau still loaded after initramfs rebuild, proceeding with --no-nouveau-check" >> "$LOG_FILE"
      msg_warn "$(translate 'nouveau still active. Proceeding with installation. A reboot will be required for the driver to work.')"
    else
@@ -1227,7 +1235,7 @@ main() {
        show_proxmenux_logo
        msg_title "$(translate "$SCRIPT_TITLE")"
-        msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version...')"
+        msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version')"
        complete_nvidia_uninstall
        sleep 2