mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 00:46:21 +00:00
update notification_events.py
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
|
||||
7d3df15c65411b4429e72305e9c7460d09b67c6d5bd2a50c05dd88a928fc4f94 ProxMenux-1.2.0.AppImage
|
||||
@@ -882,7 +882,15 @@ class JournalWatcher:
|
||||
smart_health = self._quick_smart_health(resolved)
|
||||
if smart_health != 'FAILED':
|
||||
return
|
||||
|
||||
|
||||
# ── Persist observation (before the cooldown gate) ──
|
||||
# The 24h cooldown below only suppresses RE-notification; the
|
||||
# per-disk observations history must reflect every genuine
|
||||
# detection. The DB UPSERT dedups same-signature events via
|
||||
# occurrence_count, so calling this on every match is safe.
|
||||
# Aligns with the parallel path in HealthMonitor._check_disks_optimized.
|
||||
self._record_disk_io_observation(resolved, msg)
|
||||
|
||||
# ── Gate 2: 24-hour dedup per device ──
|
||||
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
|
||||
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB
|
||||
@@ -986,6 +994,55 @@ class JournalWatcher:
|
||||
except Exception:
|
||||
return 'UNKNOWN'
|
||||
|
||||
def _record_disk_io_observation(self, resolved: str, msg: str):
|
||||
"""Persist a kernel-journal I/O error as a disk observation.
|
||||
|
||||
Signature classification mirrors HealthMonitor._make_io_obs_signature
|
||||
so observations from the real-time journal watcher and the periodic
|
||||
dmesg scan dedup into the same row (via the UPSERT on
|
||||
disk_registry_id + error_type + error_signature).
|
||||
"""
|
||||
try:
|
||||
from health_persistence import health_persistence
|
||||
|
||||
m = msg.lower()
|
||||
if re.search(r'exception\s+emask|emask\s+0x|revalidation failed|'
|
||||
r'hard resetting link|serror.*badcrc|comreset|'
|
||||
r'link is slow|status.*drdy', m):
|
||||
family = 'ata_connection_error'
|
||||
elif re.search(r'i/o error|blk_update_request|medium error|sense key', m):
|
||||
family = 'block_io_error'
|
||||
elif re.search(r'failed command|fpdma queued', m):
|
||||
family = 'ata_failed_command'
|
||||
else:
|
||||
family = 'generic'
|
||||
|
||||
# Best-effort serial lookup so the observation survives device
|
||||
# renames (ata8 -> sdh, USB reconnects, etc.).
|
||||
serial = None
|
||||
try:
|
||||
sm = subprocess.run(
|
||||
['smartctl', '-i', f'/dev/{resolved}'],
|
||||
capture_output=True, text=True, timeout=3)
|
||||
if sm.returncode in (0, 4):
|
||||
for line in sm.stdout.split('\n'):
|
||||
if 'Serial Number' in line or 'Serial number' in line:
|
||||
serial = line.split(':')[-1].strip()
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
health_persistence.record_disk_observation(
|
||||
device_name=resolved,
|
||||
serial=serial,
|
||||
error_type='io_error',
|
||||
error_signature=f'io_{resolved}_{family}',
|
||||
raw_message=f'/dev/{resolved}: {msg.strip()[:200]}',
|
||||
severity='critical',
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"[JournalWatcher] Error recording disk io observation: {e}")
|
||||
|
||||
def _record_smartd_observation(self, title: str, message: str):
|
||||
"""Extract device info from a smartd system-mail and record as disk observation."""
|
||||
try:
|
||||
|
||||
@@ -95,7 +95,7 @@ detect_driver_status() {
|
||||
CURRENT_DRIVER_VERSION=""
|
||||
|
||||
# First check if nvidia kernel module is actually loaded
|
||||
if lsmod | grep -q "^nvidia "; then
|
||||
if grep -q "^nvidia " /proc/modules 2>/dev/null; then
|
||||
|
||||
modprobe nvidia-uvm 2>/dev/null || true
|
||||
sleep 1
|
||||
@@ -273,7 +273,7 @@ update_lxc_nvidia() {
|
||||
free_mb=$(pct exec "$ctid" -- df -m / 2>/dev/null | awk 'NR==2{print $4}' || echo 0)
|
||||
if [[ "$free_mb" -lt 1500 ]]; then
|
||||
_restore_container_memory "$ctid"
|
||||
dialog --backtitle "ProxMenux" \
|
||||
whiptail --backtitle "ProxMenux" \
|
||||
--title "$(translate 'Insufficient Disk Space')" \
|
||||
--msgbox "\n$(translate 'Container') ${ctid} $(translate 'has only') ${free_mb}MB $(translate 'of free disk space.')\n\n$(translate 'NVIDIA libs require approximately 1.5GB of free space.')" \
|
||||
11 72
|
||||
@@ -381,7 +381,7 @@ offer_lxc_updates_if_any() {
|
||||
done
|
||||
info+="\n$(translate 'Do you want to update the NVIDIA userspace libraries inside these containers to match the host?')"
|
||||
|
||||
if ! hybrid_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
|
||||
if ! hybrid_whiptail_yesno "$(translate 'Update NVIDIA in LXC Containers')" "$info" 20 80; then
|
||||
msg_info2 "$(translate 'LXC update skipped by user.')"
|
||||
return 0
|
||||
fi
|
||||
@@ -427,13 +427,14 @@ options nouveau modeset=0
|
||||
EOF
|
||||
|
||||
# Attempt to unload nouveau if currently loaded
|
||||
if lsmod | grep -q "^nouveau "; then
|
||||
stop_spinner
|
||||
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
|
||||
|
||||
msg_info "$(translate 'Nouveau module is loaded, attempting to unload...')"
|
||||
modprobe -r nouveau 2>/dev/null || true
|
||||
sleep 1
|
||||
|
||||
# Check if unload succeeded
|
||||
if lsmod | grep -q "^nouveau "; then
|
||||
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
|
||||
NOUVEAU_STILL_LOADED=true
|
||||
msg_warn "$(translate 'Could not unload nouveau module (may be in use). The blacklist will take effect after reboot. Installation will continue but a reboot will be required.')"
|
||||
echo "WARNING: nouveau module still loaded after unload attempt" >> "$LOG_FILE"
|
||||
@@ -445,6 +446,7 @@ EOF
|
||||
NOUVEAU_STILL_LOADED=false
|
||||
msg_ok "$(translate 'nouveau driver has been blacklisted.')" | tee -a "$screen_capture"
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
ensure_modules_config() {
|
||||
@@ -488,7 +490,7 @@ stop_and_disable_nvidia_services() {
|
||||
systemctl disable "$service" >/dev/null 2>&1 || true
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
sleep 2
|
||||
|
||||
msg_ok "$(translate 'NVIDIA services stopped and disabled.')" | tee -a "$screen_capture"
|
||||
@@ -496,36 +498,41 @@ stop_and_disable_nvidia_services() {
|
||||
}
|
||||
|
||||
unload_nvidia_modules() {
|
||||
|
||||
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
||||
modprobe -r "$mod" >/dev/null 2>&1 || true
|
||||
done
|
||||
|
||||
# Give the kernel a moment to finalize sysfs teardown before re-checking.
|
||||
# Reading /proc/modules directly (instead of lsmod) avoids the
|
||||
# "could not open /sys/module/<mod>/holders" race when a module has just
|
||||
# been removed from /proc/modules but its sysfs dir hasn't been reaped yet.
|
||||
sleep 1
|
||||
|
||||
if lsmod | grep -qi '\bnvidia'; then
|
||||
if grep -q "^nvidia" /proc/modules 2>/dev/null; then
|
||||
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
||||
modprobe -r --force "$mod" >/dev/null 2>&1 || true
|
||||
done
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
if lsmod | grep -qi '\bnvidia'; then
|
||||
msg_warn "$(translate 'Some NVIDIA modules could not be unloaded. Installation may fail. Ensure no processes are using the GPU.')"
|
||||
if grep -q "^nvidia" /proc/modules 2>/dev/null; then
|
||||
|
||||
if command -v lsof >/dev/null 2>&1; then
|
||||
echo "$(translate 'Processes using NVIDIA:'):" >> "$LOG_FILE"
|
||||
lsof /dev/nvidia* 2>/dev/null >> "$LOG_FILE" || true
|
||||
fi
|
||||
else
|
||||
|
||||
msg_ok "$(translate 'NVIDIA kernel modules unloaded successfully.')" | tee -a "$screen_capture"
|
||||
fi
|
||||
}
|
||||
|
||||
complete_nvidia_uninstall() {
|
||||
msg_info "$(translate 'Completing NVIDIA uninstallation...')"
|
||||
stop_and_disable_nvidia_services
|
||||
unload_nvidia_modules
|
||||
|
||||
if command -v nvidia-uninstall >/dev/null 2>&1; then
|
||||
#msg_info "$(translate 'Running NVIDIA uninstaller...')"
|
||||
msg_info "$(translate 'Running NVIDIA uninstaller...')"
|
||||
nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true
|
||||
msg_ok "$(translate 'NVIDIA uninstaller completed.')"
|
||||
fi
|
||||
@@ -546,11 +553,11 @@ complete_nvidia_uninstall() {
|
||||
find "$NVIDIA_WORKDIR" -type d -name "nvidia-persistenced" -exec rm -rf {} + 2>/dev/null || true
|
||||
find "$NVIDIA_WORKDIR" -type d -name "nvidia-patch" -exec rm -rf {} + 2>/dev/null || true
|
||||
fi
|
||||
|
||||
|
||||
update_component_status "nvidia_driver" "removed" "" "gpu" '{}'
|
||||
|
||||
msg_ok "$(translate 'Complete NVIDIA uninstallation finished.')" | tee -a "$screen_capture"
|
||||
stop_spinner
|
||||
|
||||
}
|
||||
|
||||
cleanup_nvidia_dkms() {
|
||||
@@ -786,7 +793,7 @@ download_nvidia_installer() {
|
||||
return 0
|
||||
else
|
||||
echo "Existing file FAILED integrity check, removing..." >> "$LOG_FILE"
|
||||
msg_warn "$(translate 'Existing file failed verification, re-downloading...')" >&2
|
||||
msg_warn "$(translate 'Existing file, re-downloading...')" >&2
|
||||
rm -f "$run_file"
|
||||
fi
|
||||
else
|
||||
@@ -917,7 +924,8 @@ run_nvidia_installer() {
|
||||
update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true
|
||||
# Try one more time to unload nouveau after initramfs rebuild
|
||||
modprobe -r nouveau 2>/dev/null || true
|
||||
if lsmod | grep -q "^nouveau "; then
|
||||
sleep 1
|
||||
if grep -q "^nouveau " /proc/modules 2>/dev/null; then
|
||||
echo "WARNING: nouveau still loaded after initramfs rebuild, proceeding with --no-nouveau-check" >> "$LOG_FILE"
|
||||
msg_warn "$(translate 'nouveau still active. Proceeding with installation. A reboot will be required for the driver to work.')"
|
||||
else
|
||||
@@ -1227,7 +1235,7 @@ main() {
|
||||
|
||||
show_proxmenux_logo
|
||||
msg_title "$(translate "$SCRIPT_TITLE")"
|
||||
msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version...')"
|
||||
msg_info2 "$(translate 'Uninstalling current NVIDIA driver before installing new version')"
|
||||
complete_nvidia_uninstall
|
||||
|
||||
sleep 2
|
||||
|
||||
Reference in New Issue
Block a user