mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 08:56:21 +00:00
Merge pull request #178 from MacRimi/develop
update notification_events.py
This commit is contained in:
@@ -882,7 +882,15 @@ class JournalWatcher:
|
|||||||
smart_health = self._quick_smart_health(resolved)
|
smart_health = self._quick_smart_health(resolved)
|
||||||
if smart_health != 'FAILED':
|
if smart_health != 'FAILED':
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# ── Persist observation (before the cooldown gate) ──
|
||||||
|
# The 24h cooldown below only suppresses RE-notification; the
|
||||||
|
# per-disk observations history must reflect every genuine
|
||||||
|
# detection. The DB UPSERT dedups same-signature events via
|
||||||
|
# occurrence_count, so calling this on every match is safe.
|
||||||
|
# Aligns with the parallel path in HealthMonitor._check_disks_optimized.
|
||||||
|
self._record_disk_io_observation(resolved, msg)
|
||||||
|
|
||||||
# ── Gate 2: 24-hour dedup per device ──
|
# ── Gate 2: 24-hour dedup per device ──
|
||||||
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
|
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
|
||||||
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB
|
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB
|
||||||
@@ -986,6 +994,55 @@ class JournalWatcher:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return 'UNKNOWN'
|
return 'UNKNOWN'
|
||||||
|
|
||||||
|
def _record_disk_io_observation(self, resolved: str, msg: str):
|
||||||
|
"""Persist a kernel-journal I/O error as a disk observation.
|
||||||
|
|
||||||
|
Signature classification mirrors HealthMonitor._make_io_obs_signature
|
||||||
|
so observations from the real-time journal watcher and the periodic
|
||||||
|
dmesg scan dedup into the same row (via the UPSERT on
|
||||||
|
disk_registry_id + error_type + error_signature).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from health_persistence import health_persistence
|
||||||
|
|
||||||
|
m = msg.lower()
|
||||||
|
if re.search(r'exception\s+emask|emask\s+0x|revalidation failed|'
|
||||||
|
r'hard resetting link|serror.*badcrc|comreset|'
|
||||||
|
r'link is slow|status.*drdy', m):
|
||||||
|
family = 'ata_connection_error'
|
||||||
|
elif re.search(r'i/o error|blk_update_request|medium error|sense key', m):
|
||||||
|
family = 'block_io_error'
|
||||||
|
elif re.search(r'failed command|fpdma queued', m):
|
||||||
|
family = 'ata_failed_command'
|
||||||
|
else:
|
||||||
|
family = 'generic'
|
||||||
|
|
||||||
|
# Best-effort serial lookup so the observation survives device
|
||||||
|
# renames (ata8 -> sdh, USB reconnects, etc.).
|
||||||
|
serial = None
|
||||||
|
try:
|
||||||
|
sm = subprocess.run(
|
||||||
|
['smartctl', '-i', f'/dev/{resolved}'],
|
||||||
|
capture_output=True, text=True, timeout=3)
|
||||||
|
if sm.returncode in (0, 4):
|
||||||
|
for line in sm.stdout.split('\n'):
|
||||||
|
if 'Serial Number' in line or 'Serial number' in line:
|
||||||
|
serial = line.split(':')[-1].strip()
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
health_persistence.record_disk_observation(
|
||||||
|
device_name=resolved,
|
||||||
|
serial=serial,
|
||||||
|
error_type='io_error',
|
||||||
|
error_signature=f'io_{resolved}_{family}',
|
||||||
|
raw_message=f'/dev/{resolved}: {msg.strip()[:200]}',
|
||||||
|
severity='critical',
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[JournalWatcher] Error recording disk io observation: {e}")
|
||||||
|
|
||||||
def _record_smartd_observation(self, title: str, message: str):
|
def _record_smartd_observation(self, title: str, message: str):
|
||||||
"""Extract device info from a smartd system-mail and record as disk observation."""
|
"""Extract device info from a smartd system-mail and record as disk observation."""
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ detect_driver_status() {
|
|||||||
CURRENT_DRIVER_VERSION=""
|
CURRENT_DRIVER_VERSION=""
|
||||||
|
|
||||||
# First check if nvidia kernel module is actually loaded
|
# First check if nvidia kernel module is actually loaded
|
||||||
if lsmod | grep -q "^nvidia "; then
|
if grep -q "^nvidia " /proc/modules 2>/dev/null; then
|
||||||
|
|
||||||
modprobe nvidia-uvm 2>/dev/null || true
|
modprobe nvidia-uvm 2>/dev/null || true
|
||||||
sleep 1
|
sleep 1
|
||||||
@@ -273,7 +273,7 @@ update_lxc_nvidia() {
|
|||||||
free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0)
|
free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0)
|
||||||
if [[ "$free_mb" -lt 1500 ]]; then
|
if [[ "$free_mb" -lt 1500 ]]; then
|
||||||
_restore_container_memory "$ctid"
|
_restore_container_memory "$ctid"
|
||||||
dialog --backtitle "ProxMenux" \
|
whiptail --backtitle "ProxMenux" \
|
||||||
--title "$(translate 'Insufficient Disk Space')" \
|
--title "$(translate 'Insufficient Disk Space')" \
|
||||||
--msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')" \
|
--msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')" \
|
||||||
11 72
|
11 72
|
||||||
@@ -381,7 +381,7 @@ offer_lxc_updates_if_any() {
|
|||||||
done
|
done
|
||||||
info+="\n$(translate 'Do you want to update the NVIDIA userspace libraries inside these containers to match the host?')"
|
info+="\n$(translate 'Do you want to update the NVIDIA userspace libraries inside these containers to match the host?')"
|
||||||
|
|
||||||
if ! hybrid_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
|
if ! hybrid_whiptail_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
|
||||||
msg_info2 "$(translate 'LXC update skipped by user.')"
|
msg_info2 "$(translate 'LXC update skipped by user.')"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
@@ -427,13 +427,14 @@ options nouveau modeset=0
|
|||||||
EOF
|
EOF
|
||||||
|
|
||||||
# Attempt to unload nouveau if currently loaded
|
# Attempt to unload nouveau if currently loaded
|
||||||
if lsmod | grep -q "^nouveau "; then
|
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
|
||||||
stop_spinner
|
|
||||||
msg_info "$(translate 'Nouveau module is loaded, attempting to unload...')"
|
msg_info "$(translate 'Nouveau module is loaded, attempting to unload...')"
|
||||||
modprobe -r nouveau 2>/dev/null || true
|
modprobe -r nouveau 2>/dev/null || true
|
||||||
|
sleep 1
|
||||||
|
|
||||||
# Check if unload succeeded
|
# Check if unload succeeded
|
||||||
if lsmod | grep -q "^nouveau "; then
|
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
|
||||||
NOUVEAU_STILL_LOADED=true
|
NOUVEAU_STILL_LOADED=true
|
||||||
msg_warn "$(translate 'Could not unload nouveau module (may be in use). The blacklist will take effect after reboot. Installation will continue but a reboot will be required.')"
|
msg_warn "$(translate 'Could not unload nouveau module (may be in use). The blacklist will take effect after reboot. Installation will continue but a reboot will be required.')"
|
||||||
echo "WARNING: nouveau module still loaded after unload attempt" >> "$LOG_FILE"
|
echo "WARNING: nouveau module still loaded after unload attempt" >> "$LOG_FILE"
|
||||||
@@ -445,6 +446,7 @@ EOF
|
|||||||
NOUVEAU_STILL_LOADED=false
|
NOUVEAU_STILL_LOADED=false
|
||||||
msg_ok "$(translate 'nouveau driver has been blacklisted.')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'nouveau driver has been blacklisted.')" | tee -a "$screen_capture"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ensure_modules_config() {
|
ensure_modules_config() {
|
||||||
@@ -488,7 +490,7 @@ stop_and_disable_nvidia_services() {
|
|||||||
systemctl disable "$service" >/dev/null 2>&1 || true
|
systemctl disable "$service" >/dev/null 2>&1 || true
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture"
|
||||||
@@ -496,36 +498,41 @@ stop_and_disable_nvidia_services() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
unload_nvidia_modules() {
|
unload_nvidia_modules() {
|
||||||
|
|
||||||
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
||||||
modprobe -r "$mod" >/dev/null 2>&1 || true
|
modprobe -r "$mod" >/dev/null 2>&1 || true
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Give the kernel a moment to finalize sysfs teardown before re-checking.
|
||||||
|
# Reading /proc/modules directly (instead of lsmod) avoids the
|
||||||
|
# "could not open /sys/module/<mod>/holders" race when a module has just
|
||||||
|
# been removed from /proc/modules but its sysfs dir hasn't been reaped yet.
|
||||||
|
sleep 1
|
||||||
|
|
||||||
if lsmod | grep -qi '\bnvidia'; then
|
if grep -q "^nvidia" /proc/modules 2>/dev/null; then
|
||||||
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
||||||
modprobe -r --force "$mod" >/dev/null 2>&1 || true
|
modprobe -r --force "$mod" >/dev/null 2>&1 || true
|
||||||
done
|
done
|
||||||
|
sleep 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if lsmod | grep -qi '\bnvidia'; then
|
if grep -q "^nvidia" /proc/modules 2>/dev/null; then
|
||||||
msg_warn "$(translate 'Some NVIDIA modules could not be unloaded. Installation may fail. Ensure no processes are using the GPU.')"
|
|
||||||
if command -v lsof >/dev/null 2>&1; then
|
if command -v lsof >/dev/null 2>&1; then
|
||||||
echo "$(translate 'Processes using NVIDIA:'):" >> "$LOG_FILE"
|
echo "$(translate 'Processes using NVIDIA:'):" >> "$LOG_FILE"
|
||||||
lsof /dev/nvidia* 2>/dev/null >> "$LOG_FILE" || true
|
lsof /dev/nvidia* 2>/dev/null >> "$LOG_FILE" || true
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
|
||||||
msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
complete_nvidia_uninstall() {
|
complete_nvidia_uninstall() {
|
||||||
msg_info "$(translate 'Completing NVIDIA uninstallation...')"
|
|
||||||
stop_and_disable_nvidia_services
|
stop_and_disable_nvidia_services
|
||||||
unload_nvidia_modules
|
unload_nvidia_modules
|
||||||
|
|
||||||
if command -v nvidia-uninstall >/dev/null 2>&1; then
|
if command -v nvidia-uninstall >/dev/null 2>&1; then
|
||||||
#msg_info "$(translate 'Running NVIDIA uninstaller...')"
|
msg_info "$(translate 'Running NVIDIA uninstaller...')"
|
||||||
nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true
|
nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true
|
||||||
msg_ok "$(translate 'NVIDIA uninstaller completed.')"
|
msg_ok "$(translate 'NVIDIA uninstaller completed.')"
|
||||||
fi
|
fi
|
||||||
@@ -546,11 +553,11 @@ complete_nvidia_uninstall() {
|
|||||||
find "$NVIDIA_WORKDIR" -type d -name "nvidia-persistenced" -exec rm -rf {} + 2>/dev/null || true
|
find "$NVIDIA_WORKDIR" -type d -name "nvidia-persistenced" -exec rm -rf {} + 2>/dev/null || true
|
||||||
find "$NVIDIA_WORKDIR" -type d -name "nvidia-patch" -exec rm -rf {} + 2>/dev/null || true
|
find "$NVIDIA_WORKDIR" -type d -name "nvidia-patch" -exec rm -rf {} + 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
update_component_status "nvidia_driver" "removed" "" "gpu" '{}'
|
update_component_status "nvidia_driver" "removed" "" "gpu" '{}'
|
||||||
|
|
||||||
msg_ok "$(translate 'Complete NVIDIA uninstallation finished.')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'Complete NVIDIA uninstallation finished.')" | tee -a "$screen_capture"
|
||||||
stop_spinner
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_nvidia_dkms() {
|
cleanup_nvidia_dkms() {
|
||||||
@@ -786,7 +793,7 @@ download_nvidia_installer() {
|
|||||||
return 0
|
return 0
|
||||||
else
|
else
|
||||||
echo "Existing file FAILED integrity check, removing..." >> "$LOG_FILE"
|
echo "Existing file FAILED integrity check, removing..." >> "$LOG_FILE"
|
||||||
msg_warn "$(translate 'Existing file failed verification, re-downloading...')" >&2
|
msg_warn "$(translate 'Existing file, re-downloading...')" >&2
|
||||||
rm -f "$run_file"
|
rm -f "$run_file"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
@@ -917,7 +924,8 @@ run_nvidia_installer() {
|
|||||||
update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true
|
update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true
|
||||||
# Try one more time to unload nouveau after initramfs rebuild
|
# Try one more time to unload nouveau after initramfs rebuild
|
||||||
modprobe -r nouveau 2>/dev/null || true
|
modprobe -r nouveau 2>/dev/null || true
|
||||||
if lsmod | grep -q "^nouveau "; then
|
sleep 1
|
||||||
|
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
|
||||||
echo "WARNING: nouveau still loaded after initramfs rebuild, proceeding with --no-nouveau-check" >> "$LOG_FILE"
|
echo "WARNING: nouveau still loaded after initramfs rebuild, proceeding with --no-nouveau-check" >> "$LOG_FILE"
|
||||||
msg_warn "$(translate 'nouveau still active. Proceeding with installation. A reboot will be required for the driver to work.')"
|
msg_warn "$(translate 'nouveau still active. Proceeding with installation. A reboot will be required for the driver to work.')"
|
||||||
else
|
else
|
||||||
@@ -1227,7 +1235,7 @@ main() {
|
|||||||
|
|
||||||
show_proxmenux_logo
|
show_proxmenux_logo
|
||||||
msg_title "$(translate "$SCRIPT_TITLE")"
|
msg_title "$(translate "$SCRIPT_TITLE")"
|
||||||
msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version...')"
|
msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version')"
|
||||||
complete_nvidia_uninstall
|
complete_nvidia_uninstall
|
||||||
|
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|||||||
Reference in New Issue
Block a user