From 9fe58935c4e6795356b9c4c4a825991809e13091 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Sun, 1 Mar 2026 22:56:25 +0100 Subject: [PATCH] Update notification service --- AppImage/scripts/health_monitor.py | 26 +++++++++------- AppImage/scripts/notification_events.py | 40 +++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 119269a7..7430665b 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -1500,16 +1500,22 @@ class HealthMonitor: if sample: reason += f'\n{sample}' - health_persistence.record_error( - error_key=error_key, - category='disks', - severity=severity, - reason=reason, - details={'disk': disk, 'device': display, - 'error_count': error_count, - 'smart_status': smart_health, - 'sample': sample, 'dismissable': True} - ) + # Only record to persistence ONCE. If the error is + # already active, don't call record_error again -- + # that would keep updating last_seen and preventing + # the freshness check from detecting it as stale. + if not health_persistence.is_error_active(error_key, category='disks'): + health_persistence.record_error( + error_key=error_key, + category='disks', + severity=severity, + reason=reason, + details={'disk': disk, 'device': display, + 'error_count': error_count, + 'smart_status': smart_health, + 'sample': sample, 'dismissable': True} + ) + disk_results[display] = { 'status': severity, 'reason': reason, diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index c5a0a28c..8c4f381e 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -1498,18 +1498,54 @@ class PollingCollector: self._last_notified[error_key] = now continue + # ── Freshness check for re-notifications ── + # Don't re-notify errors whose last_seen is stale (>2h old). + # If the health monitor stopped detecting the error, last_seen + # freezes. Re-notifying with dated info is confusing. + _FRESHNESS_WINDOW = 7200 # 2 hours + last_seen_str = error.get('last_seen', '') + error_is_stale = False + if last_seen_str: + try: + from datetime import datetime as _dt + ls_epoch = _dt.fromisoformat(last_seen_str).timestamp() + if now - ls_epoch > _FRESHNESS_WINDOW: + error_is_stale = True + except (ValueError, TypeError): + pass + # Determine if we should notify is_new = error_key not in self._known_errors last_sent = self._last_notified.get(error_key, 0) is_due = (now - last_sent) >= self.DIGEST_INTERVAL - if not is_new and not is_due: - continue + # For re-notifications (not new): skip if stale OR not due + if not is_new: + if error_is_stale or not is_due: + continue # Map to our event type event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem') entity, eid = self._ENTITY_MAP.get(category, ('node', '')) + # ── SMART gate for disk errors ── + # If the health monitor recorded a disk error but SMART is NOT + # FAILED, skip the notification entirely. Disk notifications + # should ONLY be sent when SMART confirms a real hardware failure. + # This prevents WARNING-level disk errors (SMART: unavailable) + # from being emitted as notifications at all. + if category in ('disks', 'smart', 'zfs'): + details = error.get('details', {}) + if isinstance(details, str): + try: + details = json.loads(details) + except (json.JSONDecodeError, TypeError): + details = {} + smart_status = details.get('smart_status', '') if isinstance(details, dict) else '' + if smart_status != 'FAILED': + # SMART is PASSED, UNKNOWN, or unavailable -- don't notify + continue + # Updates are always informational notifications except # system_age which can be WARNING (365+ days) or CRITICAL (548+ days). emit_severity = severity