From 9fe58935c4e6795356b9c4c4a825991809e13091 Mon Sep 17 00:00:00 2001
From: MacRimi <ricoextincion@gmail.com>
Date: Sun, 1 Mar 2026 22:56:25 +0100
Subject: [PATCH] Update notification service

---
 AppImage/scripts/health_monitor.py      | 26 +++++++++-------
 AppImage/scripts/notification_events.py | 40 +++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index 119269a7..7430665b 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -1500,16 +1500,22 @@ class HealthMonitor:
                             if sample:
                                 reason += f'\n{sample}'
                             
-                            health_persistence.record_error(
-                                error_key=error_key,
-                                category='disks',
-                                severity=severity,
-                                reason=reason,
-                                details={'disk': disk, 'device': display,
-                                         'error_count': error_count,
-                                         'smart_status': smart_health,
-                                         'sample': sample, 'dismissable': True}
-                            )
+                            # Record to persistence only ONCE.  Re-calling record_error for
+                            # an already-active error would keep refreshing last_seen, so
+                            # the freshness check could never detect the error as stale.
+                            # NOTE(review): last_seen then freezes at first detection -- confirm.
+                            if not health_persistence.is_error_active(error_key, category='disks'):
+                                health_persistence.record_error(
+                                    error_key=error_key,
+                                    category='disks',
+                                    severity=severity,
+                                    reason=reason,
+                                    details={'disk': disk, 'device': display,
+                                             'error_count': error_count,
+                                             'smart_status': smart_health,
+                                             'sample': sample, 'dismissable': True}
+                                )
+                            
                             disk_results[display] = {
                                 'status': severity,
                                 'reason': reason,
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py
index c5a0a28c..8c4f381e 100644
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -1498,18 +1498,54 @@ class PollingCollector:
                     self._last_notified[error_key] = now
                 continue
             
+            # ── Freshness check for re-notifications ──
+            # Don't re-notify errors whose last_seen is stale (>2h old): once
+            # last_seen stops being refreshed, a repeat would carry dated info.
+            # A missing or unparseable last_seen is treated as fresh (not stale).
+            _FRESHNESS_WINDOW = 7200  # 2 hours
+            last_seen_str = error.get('last_seen', '')
+            error_is_stale = False
+            if last_seen_str:
+                try:
+                    from datetime import datetime as _dt
+                    ls_epoch = _dt.fromisoformat(last_seen_str).timestamp()
+                    if now - ls_epoch > _FRESHNESS_WINDOW:
+                        error_is_stale = True
+                except (ValueError, TypeError):
+                    pass
+            
             # Determine if we should notify
             is_new = error_key not in self._known_errors
             last_sent = self._last_notified.get(error_key, 0)
             is_due = (now - last_sent) >= self.DIGEST_INTERVAL
             
-            if not is_new and not is_due:
-                continue
+            # For re-notifications (not new): skip if stale OR not due
+            if not is_new:
+                if error_is_stale or not is_due:
+                    continue
             
             # Map to our event type
             event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
             entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
             
+            # ── SMART gate for disk errors ──
+            # For the 'disks', 'smart' and 'zfs' categories, skip the
+            # notification entirely unless details['smart_status'] is exactly
+            # 'FAILED'.  These notifications should ONLY be sent when SMART
+            # confirms a real hardware failure, so WARNING-level disk errors
+            # (e.g. SMART: unavailable) are never emitted as notifications.
+            if category in ('disks', 'smart', 'zfs'):
+                details = error.get('details', {})
+                if isinstance(details, str):
+                    try:
+                        details = json.loads(details)
+                    except (json.JSONDecodeError, TypeError):
+                        details = {}
+                smart_status = details.get('smart_status', '') if isinstance(details, dict) else ''
+                if smart_status != 'FAILED':
+                    # Not FAILED (PASSED, UNKNOWN, unavailable, or missing) -- don't notify
+                    continue
+            
             # Updates are always informational notifications except
             # system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
             emit_severity = severity