Merge pull request #178 from MacRimi/develop

update notification_events.py
This commit is contained in:
MacRimi
2026-04-19 02:03:53 +02:00
committed by GitHub
2 changed files with 84 additions and 19 deletions

View File

@@ -882,7 +882,15 @@ class JournalWatcher:
smart_health = self._quick_smart_health(resolved) smart_health = self._quick_smart_health(resolved)
if smart_health != 'FAILED': if smart_health != 'FAILED':
return return
# ── Persist observation (before the cooldown gate) ──
# The 24h cooldown below only suppresses RE-notification; the
# per-disk observations history must reflect every genuine
# detection. The DB UPSERT dedups same-signature events via
# occurrence_count, so calling this on every match is safe.
# Aligns with the parallel path in HealthMonitor._check_disks_optimized.
self._record_disk_io_observation(resolved, msg)
# ── Gate 2: 24-hour dedup per device ── # ── Gate 2: 24-hour dedup per device ──
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns). # Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB # If user dismissed the error, _clear_disk_io_cooldown() removed the DB
@@ -986,6 +994,55 @@ class JournalWatcher:
except Exception: except Exception:
return 'UNKNOWN' return 'UNKNOWN'
def _record_disk_io_observation(self, resolved: str, msg: str):
    """Persist a kernel-journal I/O error as a disk observation.

    Signature classification mirrors HealthMonitor._make_io_obs_signature
    so observations from the real-time journal watcher and the periodic
    dmesg scan dedup into the same row (via the UPSERT on
    disk_registry_id + error_type + error_signature).

    Args:
        resolved: Block device name without the ``/dev/`` prefix (it is
            interpolated as ``/dev/{resolved}``).
        msg: Raw kernel journal message that triggered the detection.

    Returns:
        None. Every failure is caught and printed; this method never
        raises, so a persistence problem cannot break the caller.
    """
    try:
        # Imported lazily inside the try so a missing/broken persistence
        # module is reported below instead of propagating to the caller.
        from health_persistence import health_persistence

        m = msg.lower()
        # Order matters: the first matching family wins.
        if re.search(r'exception\s+emask|emask\s+0x|revalidation failed|'
                     r'hard resetting link|serror.*badcrc|comreset|'
                     r'link is slow|status.*drdy', m):
            family = 'ata_connection_error'
        elif re.search(r'i/o error|blk_update_request|medium error|sense key', m):
            family = 'block_io_error'
        elif re.search(r'failed command|fpdma queued', m):
            family = 'ata_failed_command'
        else:
            family = 'generic'

        # Best-effort serial lookup so the observation survives device
        # renames (ata8 -> sdh, USB reconnects, etc.).
        serial = None
        try:
            sm = subprocess.run(
                ['smartctl', '-i', f'/dev/{resolved}'],
                capture_output=True, text=True, timeout=3)
            # Only exit statuses 0 and 4 are trusted to carry usable
            # identity output (see the smartctl exit-status bitmask).
            if sm.returncode in (0, 4):
                for line in sm.stdout.splitlines():
                    # Split on the FIRST colon only: the label never
                    # contains one, but the value might, and taking the
                    # last colon-separated field would truncate it.
                    label, sep, value = line.partition(':')
                    if sep and 'serial number' in label.lower():
                        # An empty value carries no identity; keep None.
                        serial = value.strip() or None
                        break
        except Exception:
            # Deliberately swallowed: smartctl may be missing, time out,
            # or fail on this device. The observation is still recorded,
            # just without a serial.
            pass

        health_persistence.record_disk_observation(
            device_name=resolved,
            serial=serial,
            error_type='io_error',
            error_signature=f'io_{resolved}_{family}',
            # Cap the stored message at 200 characters.
            raw_message=f'/dev/{resolved}: {msg.strip()[:200]}',
            severity='critical',
        )
    except Exception as e:
        print(f"[JournalWatcher] Error recording disk io observation: {e}")
def _record_smartd_observation(self, title: str, message: str): def _record_smartd_observation(self, title: str, message: str):
"""Extract device info from a smartd system-mail and record as disk observation.""" """Extract device info from a smartd system-mail and record as disk observation."""
try: try:

View File

@@ -95,7 +95,7 @@ detect_driver_status() {
CURRENT_DRIVER_VERSION="" CURRENT_DRIVER_VERSION=""
# First check if nvidia kernel module is actually loaded # First check if nvidia kernel module is actually loaded
if lsmod | grep -q "^nvidia "; then if grep -q "^nvidia " /proc/modules 2>/dev/null; then
modprobe nvidia-uvm 2>/dev/null || true modprobe nvidia-uvm 2>/dev/null || true
sleep 1 sleep 1
@@ -273,7 +273,7 @@ update_lxc_nvidia() {
free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0) free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0)
if [[ "$free_mb" -lt 1500 ]]; then if [[ "$free_mb" -lt 1500 ]]; then
_restore_container_memory "$ctid" _restore_container_memory "$ctid"
dialog --backtitle "ProxMenux" \ whiptail --backtitle "ProxMenux" \
--title "$(translate 'Insufficient Disk Space')" \ --title "$(translate 'Insufficient Disk Space')" \
--msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')" \ --msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')" \
11 72 11 72
@@ -381,7 +381,7 @@ offer_lxc_updates_if_any() {
done done
info+="\n$(translate 'Do you want to update the NVIDIA userspace libraries inside these containers to match the host?')" info+="\n$(translate 'Do you want to update the NVIDIA userspace libraries inside these containers to match the host?')"
if ! hybrid_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then if ! hybrid_whiptail_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
msg_info2 "$(translate 'LXC update skipped by user.')" msg_info2 "$(translate 'LXC update skipped by user.')"
return 0 return 0
fi fi
@@ -427,13 +427,14 @@ options nouveau modeset=0
EOF EOF
# Attempt to unload nouveau if currently loaded # Attempt to unload nouveau if currently loaded
if lsmod | grep -q "^nouveau "; then if grep -q "^nouveau " /proc/modules 2>/dev/null; then
stop_spinner
msg_info "$(translate 'Nouveau module is loaded, attempting to unload...')" msg_info "$(translate 'Nouveau module is loaded, attempting to unload...')"
modprobe -r nouveau 2>/dev/null || true modprobe -r nouveau 2>/dev/null || true
sleep 1
# Check if unload succeeded # Check if unload succeeded
if lsmod | grep -q "^nouveau "; then if grep -q "^nouveau " /proc/modules 2>/dev/null; then
NOUVEAU_STILL_LOADED=true NOUVEAU_STILL_LOADED=true
msg_warn "$(translate 'Could not unload nouveau module (may be in use). The blacklist will take effect after reboot. Installation will continue but a reboot will be required.')" msg_warn "$(translate 'Could not unload nouveau module (may be in use). The blacklist will take effect after reboot. Installation will continue but a reboot will be required.')"
echo "WARNING: nouveau module still loaded after unload attempt" >> "$LOG_FILE" echo "WARNING: nouveau module still loaded after unload attempt" >> "$LOG_FILE"
@@ -445,6 +446,7 @@ EOF
NOUVEAU_STILL_LOADED=false NOUVEAU_STILL_LOADED=false
msg_ok "$(translate 'nouveau driver has been blacklisted.')" | tee -a "$screen_capture" msg_ok "$(translate 'nouveau driver has been blacklisted.')" | tee -a "$screen_capture"
fi fi
} }
ensure_modules_config() { ensure_modules_config() {
@@ -488,7 +490,7 @@ stop_and_disable_nvidia_services() {
systemctl disable "$service" >/dev/null 2>&1 || true systemctl disable "$service" >/dev/null 2>&1 || true
fi fi
done done
sleep 2 sleep 2
msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture" msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture"
@@ -496,36 +498,41 @@ stop_and_disable_nvidia_services() {
} }
unload_nvidia_modules() { unload_nvidia_modules() {
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
modprobe -r "$mod" >/dev/null 2>&1 || true modprobe -r "$mod" >/dev/null 2>&1 || true
done done
# Give the kernel a moment to finalize sysfs teardown before re-checking.
# Reading /proc/modules directly (instead of lsmod) avoids the
# "could not open /sys/module/<mod>/holders" race when a module has just
# been removed from /proc/modules but its sysfs dir hasn't been reaped yet.
sleep 1
if lsmod | grep -qi '\bnvidia'; then if grep -q "^nvidia" /proc/modules 2>/dev/null; then
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
modprobe -r --force "$mod" >/dev/null 2>&1 || true modprobe -r --force "$mod" >/dev/null 2>&1 || true
done done
sleep 1
fi fi
if lsmod | grep -qi '\bnvidia'; then if grep -q "^nvidia" /proc/modules 2>/dev/null; then
msg_warn "$(translate 'Some NVIDIA modules could not be unloaded. Installation may fail. Ensure no processes are using the GPU.')"
if command -v lsof >/dev/null 2>&1; then if command -v lsof >/dev/null 2>&1; then
echo "$(translate 'Processes using NVIDIA:'):" >> "$LOG_FILE" echo "$(translate 'Processes using NVIDIA:'):" >> "$LOG_FILE"
lsof /dev/nvidia* 2>/dev/null >> "$LOG_FILE" || true lsof /dev/nvidia* 2>/dev/null >> "$LOG_FILE" || true
fi fi
else else
msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture" msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture"
fi fi
} }
complete_nvidia_uninstall() { complete_nvidia_uninstall() {
msg_info "$(translate 'Completing NVIDIA uninstallation...')"
stop_and_disable_nvidia_services stop_and_disable_nvidia_services
unload_nvidia_modules unload_nvidia_modules
if command -v nvidia-uninstall >/dev/null 2>&1; then if command -v nvidia-uninstall >/dev/null 2>&1; then
#msg_info "$(translate 'Running NVIDIA uninstaller...')" msg_info "$(translate 'Running NVIDIA uninstaller...')"
nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true
msg_ok "$(translate 'NVIDIA uninstaller completed.')" msg_ok "$(translate 'NVIDIA uninstaller completed.')"
fi fi
@@ -546,11 +553,11 @@ complete_nvidia_uninstall() {
find "$NVIDIA_WORKDIR" -type d -name "nvidia-persistenced" -exec rm -rf {} + 2>/dev/null || true find "$NVIDIA_WORKDIR" -type d -name "nvidia-persistenced" -exec rm -rf {} + 2>/dev/null || true
find "$NVIDIA_WORKDIR" -type d -name "nvidia-patch" -exec rm -rf {} + 2>/dev/null || true find "$NVIDIA_WORKDIR" -type d -name "nvidia-patch" -exec rm -rf {} + 2>/dev/null || true
fi fi
update_component_status "nvidia_driver" "removed" "" "gpu" '{}' update_component_status "nvidia_driver" "removed" "" "gpu" '{}'
msg_ok "$(translate 'Complete NVIDIA uninstallation finished.')" | tee -a "$screen_capture" msg_ok "$(translate 'Complete NVIDIA uninstallation finished.')" | tee -a "$screen_capture"
stop_spinner
} }
cleanup_nvidia_dkms() { cleanup_nvidia_dkms() {
@@ -786,7 +793,7 @@ download_nvidia_installer() {
return 0 return 0
else else
echo "Existing file FAILED integrity check, removing..." >> "$LOG_FILE" echo "Existing file FAILED integrity check, removing..." >> "$LOG_FILE"
msg_warn "$(translate 'Existing file failed verification, re-downloading...')" >&2 msg_warn "$(translate 'Existing file, re-downloading...')" >&2
rm -f "$run_file" rm -f "$run_file"
fi fi
else else
@@ -917,7 +924,8 @@ run_nvidia_installer() {
update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true
# Try one more time to unload nouveau after initramfs rebuild # Try one more time to unload nouveau after initramfs rebuild
modprobe -r nouveau 2>/dev/null || true modprobe -r nouveau 2>/dev/null || true
if lsmod | grep -q "^nouveau "; then sleep 1
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
echo "WARNING: nouveau still loaded after initramfs rebuild, proceeding with --no-nouveau-check" >> "$LOG_FILE" echo "WARNING: nouveau still loaded after initramfs rebuild, proceeding with --no-nouveau-check" >> "$LOG_FILE"
msg_warn "$(translate 'nouveau still active. Proceeding with installation. A reboot will be required for the driver to work.')" msg_warn "$(translate 'nouveau still active. Proceeding with installation. A reboot will be required for the driver to work.')"
else else
@@ -1227,7 +1235,7 @@ main() {
show_proxmenux_logo show_proxmenux_logo
msg_title "$(translate "$SCRIPT_TITLE")" msg_title "$(translate "$SCRIPT_TITLE")"
msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version...')" msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version')"
complete_nvidia_uninstall complete_nvidia_uninstall
sleep 2 sleep 2