From 9089035f1806e5b4f6df9c7a5dbe96555fe48f22 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Wed, 4 Mar 2026 19:11:38 +0100 Subject: [PATCH] Update notification service --- AppImage/scripts/flask_server.py | 28 +++ AppImage/scripts/health_monitor.py | 71 +++++- AppImage/scripts/health_persistence.py | 263 ++++++++++++++++++++++- AppImage/scripts/notification_events.py | 158 +++++++++++--- AppImage/scripts/notification_manager.py | 20 +- 5 files changed, 504 insertions(+), 36 deletions(-) diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 3bab0fd1..f7114079 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -1418,6 +1418,34 @@ def get_storage_info(): # print(f"Error getting partition info: {e}") pass + # ── Register disks in observation system + enrich with observation counts ── + try: + active_dev_names = list(physical_disks.keys()) + obs_counts = health_persistence.get_disks_observation_counts() + + for disk_name, disk_info in physical_disks.items(): + # Register each disk we see + health_persistence.register_disk( + device_name=disk_name, + serial=disk_info.get('serial', ''), + model=disk_info.get('model', ''), + size_bytes=disk_info.get('size_bytes'), + ) + + # Attach observation count: try serial match first, then device name + serial = disk_info.get('serial', '') + count = obs_counts.get(f'serial:{serial}', 0) if serial else 0 + if count == 0: + count = obs_counts.get(disk_name, 0) + disk_info['observations_count'] = count + + # Mark disks no longer present as removed + health_persistence.mark_removed_disks(active_dev_names) + # Auto-dismiss stale observations (> 30 days old) + health_persistence.cleanup_stale_observations() + except Exception: + pass + storage_data['disks'] = list(physical_disks.values()) return storage_data diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 5bc5de4a..bd4eded6 100644 --- a/AppImage/scripts/health_monitor.py +++ 
b/AppImage/scripts/health_monitor.py @@ -135,19 +135,22 @@ class HealthMonitor: # These are logged at ERR level but are common on SATA controllers # during hot-plug, link renegotiation, or cable noise. They are NOT # indicative of disk failure unless SMART also reports problems. - r'ata\d+.*SError.*BadCRC', - r'ata\d+.*Emask 0x10.*ATA bus error', - r'failed command: (READ|WRITE) FPDMA QUEUED', + # NOTE: patterns are matched against line.lower(), so use lowercase. + r'ata\d+.*serror.*badcrc', + r'ata\d+.*emask 0x10.*ata bus error', + r'failed command: (read|write) fpdma queued', r'ata\d+.*hard resetting link', r'ata\d+.*link is slow', - r'ata\d+.*COMRESET', + r'ata\d+.*comreset', # ── ProxMenux self-referential noise ── # The monitor reporting its OWN service failures is circular -- # it cannot meaningfully alert about itself. - r'proxmenux-monitor\.service.*Failed', + # NOTE: patterns are matched against line.lower(), so use lowercase. + r'proxmenux-monitor\.service.*failed', r'proxmenux-monitor\.service.*exit-code', - r'ProxMenux-Monitor.*Failed at step EXEC', + r'proxmenux-monitor.*failed at step exec', + r'proxmenux-monitor\.appimage', # ── PVE scheduler operational noise ── # pvescheduler emits "could not update job state" every minute @@ -1147,6 +1150,42 @@ class HealthMonitor: return storages + @staticmethod + def _make_io_obs_signature(disk: str, sample: str) -> str: + """Create a stable observation signature for I/O errors on a disk. + + All ATA errors on the same disk (exception Emask, revalidation failed, + hard resetting link, SError, etc.) map to ONE signature per error family. + This ensures that "Emask 0x1 SAct 0xc1000000" and "Emask 0x1 SAct 0x804000" + and "revalidation failed" all dedup into the same observation. 
+ """ + if not sample: + return f'io_{disk}_generic' + + s = sample.lower() + + # Classify into error families (order matters: first match wins) + families = [ + # ATA controller errors: exception, emask, revalidation, reset + # All these are symptoms of the same underlying connection issue + (r'exception\s+emask|emask\s+0x|revalidation failed|hard resetting link|' + r'serror.*badcrc|comreset|link is slow|status.*drdy', + 'ata_connection_error'), + # SCSI / block-layer errors + (r'i/o error|blk_update_request|medium error|sense key', + 'block_io_error'), + # Failed commands (READ/WRITE FPDMA QUEUED) + (r'failed command|fpdma queued', + 'ata_failed_command'), + ] + + for pattern, family in families: + if re.search(pattern, s): + return f'io_{disk}_{family}' + + # Fallback: generic per-disk + return f'io_{disk}_generic' + def _resolve_ata_to_disk(self, ata_port: str) -> str: """Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda'). @@ -1444,6 +1483,26 @@ class HealthMonitor: smart_ok = smart_health == 'PASSED' + # ── Record disk observation (always, even if transient) ── + # Signature must be stable across cycles: strip volatile + # data (hex values, counts, timestamps) to dedup properly. + # e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000" + # and "ata8.00: revalidation failed (errno=-2)" + # both map to the same per-device I/O observation. + try: + obs_sig = self._make_io_obs_signature(disk, sample) + obs_severity = 'critical' if smart_health == 'FAILED' else 'warning' + health_persistence.record_disk_observation( + device_name=disk, + serial=None, + error_type='io_error', + error_signature=obs_sig, + raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}', + severity=obs_severity, + ) + except Exception: + pass + # Transient-only errors (e.g. 
SError with auto-recovery) # are always INFO regardless of SMART if all_transient: diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 241534e3..374e9687 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -150,6 +150,45 @@ class HealthPersistence: cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)') + # ── Disk Observations System ── + # Registry of all physical disks seen by the system + cursor.execute(''' + CREATE TABLE IF NOT EXISTS disk_registry ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + device_name TEXT NOT NULL, + serial TEXT, + model TEXT, + size_bytes INTEGER, + first_seen TEXT NOT NULL, + last_seen TEXT NOT NULL, + removed INTEGER DEFAULT 0, + UNIQUE(device_name, serial) + ) + ''') + + # Observation log: deduplicated error events per disk + cursor.execute(''' + CREATE TABLE IF NOT EXISTS disk_observations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + disk_registry_id INTEGER NOT NULL, + error_type TEXT NOT NULL, + error_signature TEXT NOT NULL, + first_occurrence TEXT NOT NULL, + last_occurrence TEXT NOT NULL, + occurrence_count INTEGER DEFAULT 1, + raw_message TEXT, + severity TEXT DEFAULT 'warning', + dismissed INTEGER DEFAULT 0, + FOREIGN KEY(disk_registry_id) REFERENCES disk_registry(id), + UNIQUE(disk_registry_id, error_type, error_signature) + ) + ''') + + cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_serial ON disk_registry(serial)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_device ON disk_registry(device_name)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)') + conn.commit() conn.close() @@ -519,10 +558,12 @@ class HealthPersistence: } child_prefix = 
def register_disk(self, device_name: str, serial: Optional[str] = None,
                  model: Optional[str] = None, size_bytes: Optional[int] = None):
    """Insert or refresh a physical disk row in ``disk_registry``.

    The row is keyed on ``(device_name, serial)``.  On conflict the
    existing row keeps its ``first_seen``, gets a new ``last_seen``, is
    re-activated (``removed = 0``), and only overwrites ``model`` /
    ``size_bytes`` when a non-NULL value is supplied.

    Args:
        device_name: Device name stored as given (no ``/dev/`` stripping
            happens here).
        serial: Disk serial; ``None`` is stored as ``''`` so that the
            UNIQUE constraint still detects the duplicate (NULLs never
            conflict with each other in SQLite).
        model: Optional model string.
        size_bytes: Optional capacity in bytes.

    Errors are printed and swallowed; nothing is raised to the caller.
    """
    now = datetime.now().isoformat()
    try:
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
                VALUES (?, ?, ?, ?, ?, ?, 0)
                ON CONFLICT(device_name, serial) DO UPDATE SET
                    model = COALESCE(excluded.model, model),
                    size_bytes = COALESCE(excluded.size_bytes, size_bytes),
                    last_seen = excluded.last_seen,
                    removed = 0
            ''', (device_name, serial or '', model, size_bytes, now, now))
            conn.commit()
        finally:
            # Close even when the statement fails, so the handle is not leaked.
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error registering disk {device_name}: {e}")


def _get_disk_registry_id(self, cursor, device_name: str,
                          serial: Optional[str] = None) -> Optional[int]:
    """Return ``disk_registry.id`` for a disk, or ``None`` if unknown.

    Lookup order: by serial when one is given, then by device name with
    any ``/dev/`` prefix removed.  When several rows match, the one with
    the most recent ``last_seen`` wins.

    Args:
        cursor: Open DB cursor to run the lookups on.
        device_name: Device name or path.
        serial: Optional disk serial.
    """
    if serial:
        # Single quotes: in SQL, "" is an identifier, and SQLite only
        # treats it as an empty string through a legacy fallback.
        cursor.execute(
            "SELECT id FROM disk_registry WHERE serial = ? AND serial != '' "
            "ORDER BY last_seen DESC LIMIT 1",
            (serial,))
        row = cursor.fetchone()
        if row:
            return row[0]
    # Fallback: match by device_name (strip /dev/ prefix)
    clean_dev = device_name.replace('/dev/', '')
    cursor.execute(
        'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
        (clean_dev,))
    row = cursor.fetchone()
    return row[0] if row else None


def record_disk_observation(self, device_name: str, serial: Optional[str],
                            error_type: str, error_signature: str,
                            raw_message: str = '',
                            severity: str = 'warning'):
    """Record a disk error observation, deduplicating repeats.

    A repeat of the same ``(disk, error_type, error_signature)`` bumps
    ``occurrence_count``, refreshes ``last_occurrence`` and clears
    ``dismissed``.  Severity is only ever escalated to ``'critical'``,
    never lowered, and the stored ``raw_message`` stays that of the first
    occurrence.

    Args:
        device_name: Device name or path; a ``/dev/`` prefix is removed.
        serial: Disk serial, or ``None`` when unknown.
        error_type: 'smart_error', 'io_error', 'connection_error'.
        error_signature: Normalized string used for dedup
            (e.g. 'FailedReadSmartSelfTestLog').
        raw_message: Text stored with the first occurrence.
        severity: ``'warning'`` or ``'critical'``.

    Errors are printed and swallowed; nothing is raised to the caller.
    """
    now = datetime.now().isoformat()
    try:
        clean_dev = device_name.replace('/dev/', '')
        # Register (or refresh) the disk before opening our own connection:
        # register_disk uses a separate connection and commits on its own.
        # NOTE(review): with serial=None this upserts a (device, '') row,
        # even if the same device is already registered with a real serial
        # -- confirm that is intended.
        self.register_disk(clean_dev, serial)

        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            disk_id = self._get_disk_registry_id(cursor, clean_dev, serial)
            if not disk_id:
                return

            # Upsert observation: if same (disk, type, signature), bump count + update last_occurrence
            cursor.execute('''
                INSERT INTO disk_observations
                    (disk_registry_id, error_type, error_signature, first_occurrence,
                     last_occurrence, occurrence_count, raw_message, severity, dismissed)
                VALUES (?, ?, ?, ?, ?, 1, ?, ?, 0)
                ON CONFLICT(disk_registry_id, error_type, error_signature) DO UPDATE SET
                    last_occurrence = excluded.last_occurrence,
                    occurrence_count = occurrence_count + 1,
                    severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
                    dismissed = 0
            ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))
            conn.commit()
        finally:
            # Covers the early return and any failure above.
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error recording disk observation: {e}")
def get_disks_observation_counts(self) -> Dict[str, int]:
    """Return the number of active (non-dismissed) observations per disk.

    The result maps each ``device_name`` to its count and, for disks with
    a non-empty serial, also maps ``'serial:<serial>'`` to the count so
    callers can match a disk that changed device name.

    Counts are summed when several registry rows share a device name or a
    serial, so the result does not depend on row order.

    Returns:
        The mapping described above; ``{}`` on any error (the error is
        printed, not raised).
    """
    try:
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT d.device_name, d.serial, COUNT(o.id) as cnt
                FROM disk_observations o
                JOIN disk_registry d ON o.disk_registry_id = d.id
                WHERE o.dismissed = 0
                GROUP BY d.id
            ''')
            rows = cursor.fetchall()
        finally:
            # Close even when the query fails, so the handle is not leaked.
            conn.close()

        result: Dict[str, int] = {}
        for device_name, serial, cnt in rows:
            result[device_name] = result.get(device_name, 0) + cnt
            if serial:
                serial_key = f'serial:{serial}'
                result[serial_key] = result.get(serial_key, 0) + cnt
        return result
    except Exception as e:
        print(f"[HealthPersistence] Error getting observation counts: {e}")
        return {}


def dismiss_disk_observation(self, observation_id: int):
    """Mark the observation with id ``observation_id`` as dismissed.

    An unknown id is a silent no-op.  Errors are printed and swallowed.
    """
    try:
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute(
                'UPDATE disk_observations SET dismissed = 1 WHERE id = ?',
                (observation_id,))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error dismissing observation: {e}")


def cleanup_stale_observations(self, max_age_days: int = 30):
    """Dismiss active observations last seen more than ``max_age_days`` ago.

    ``last_occurrence`` is compared as an ISO-8601 string against a cutoff
    computed from the local clock; rows are only flagged ``dismissed``,
    never deleted.  Errors are printed and swallowed.
    """
    try:
        from datetime import timedelta
        cutoff = (datetime.now() - timedelta(days=max_age_days)).isoformat()
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE disk_observations
                SET dismissed = 1
                WHERE dismissed = 0 AND last_occurrence < ?
            ''', (cutoff,))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error cleaning stale observations: {e}")
for _ in active_device_names) + cursor.execute(f''' + UPDATE disk_registry SET removed = 1 + WHERE device_name NOT IN ({placeholders}) AND removed = 0 + ''', active_device_names) + conn.commit() + conn.close() + except Exception as e: + print(f"[HealthPersistence] Error marking removed disks: {e}") + + # Global instance health_persistence = HealthPersistence() diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index b16328d2..ba3b63b8 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -690,6 +690,68 @@ class JournalWatcher: except Exception: return 'UNKNOWN' + def _record_smartd_observation(self, title: str, message: str): + """Extract device info from a smartd system-mail and record as disk observation.""" + try: + import re as _re + from health_persistence import health_persistence + + # Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda" + dev_match = _re.search(r'Device:\s*/dev/(\S+?)[\s\[\],]', message) + device = dev_match.group(1) if dev_match else '' + if not device: + return + # Strip partition suffix and SAT prefix + base_dev = _re.sub(r'\d+$', '', device) + + # Extract serial: "S/N:WD-WX72A30AA72R" + sn_match = _re.search(r'S/N:\s*(\S+)', message) + serial = sn_match.group(1) if sn_match else '' + + # Extract model: appears before S/N on the "Device info:" line + model = '' + model_match = _re.search(r'Device info:\s*\n?\s*(.+?)(?:,\s*S/N:)', message) + if model_match: + model = model_match.group(1).strip() + + # Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)" + sig_match = _re.search(r'SMART error\s*\((\w+)\)', title) + if sig_match: + error_signature = sig_match.group(1) + error_type = 'smart_error' + else: + # Fallback: extract the "warning/error logged" line + warn_match = _re.search( + r'warning/error was logged.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE) + if warn_match: + error_signature = 
_re.sub(r'[^a-zA-Z0-9_]', '_', + warn_match.group(1).strip())[:80] + else: + error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_', title)[:80] + error_type = 'smart_error' + + # Build a clean raw_message for display + raw_msg = f"Device: /dev/{base_dev}" + if model: + raw_msg += f" ({model})" + if serial: + raw_msg += f" S/N:{serial}" + warn_line_m = _re.search( + r'The following warning/error.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE) + if warn_line_m: + raw_msg += f"\n{warn_line_m.group(1).strip()}" + + health_persistence.record_disk_observation( + device_name=base_dev, + serial=serial, + error_type=error_type, + error_signature=error_signature, + raw_message=raw_msg, + severity='warning', + ) + except Exception as e: + print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}") + @staticmethod def _translate_ata_error(msg: str) -> str: """Translate common ATA/SCSI error codes to human-readable descriptions.""" @@ -1393,15 +1455,42 @@ class PollingCollector: Tracking is stored in ``notification_last_sent`` (same DB). """ - DIGEST_INTERVAL = 86400 # 24 h between re-notifications + DIGEST_INTERVAL = 86400 # 24 h default between re-notifications UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans NEW_ERROR_WINDOW = 120 # seconds – errors younger than this are "new" + # Per-category anti-oscillation cooldowns (seconds). + # When an error resolves briefly and reappears, we still respect this + # interval before notifying again. This prevents "semi-cascades" where + # the same root cause generates many slightly different notifications. 
+ # + # Key = health_persistence category name + # Value = minimum seconds between notifications for the same error_key + _CATEGORY_COOLDOWNS = { + 'disks': 86400, # 24h - I/O errors are persistent hardware issues + 'smart': 86400, # 24h - SMART errors same as I/O + 'zfs': 86400, # 24h - ZFS pool issues are persistent + 'storage': 3600, # 1h - storage availability can oscillate + 'network': 1800, # 30m - network can flap + 'pve_services': 1800, # 30m - services can restart/oscillate + 'temperature': 3600, # 1h - temp can fluctuate near thresholds + 'logs': 3600, # 1h - repeated log patterns + 'vms': 1800, # 30m - VM state oscillation + 'security': 3600, # 1h - auth failures tend to be bursty + 'cpu': 1800, # 30m - CPU spikes can be transient + 'memory': 1800, # 30m - memory pressure oscillation + 'disk': 3600, # 1h - disk space can fluctuate near threshold + 'updates': 86400, # 24h - update info doesn't change fast + } + _ENTITY_MAP = { 'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''), - 'disk': ('storage', ''), 'network': ('network', ''), + 'load': ('node', ''), + 'disk': ('storage', ''), 'disks': ('storage', ''), 'smart': ('storage', ''), + 'zfs': ('storage', ''), 'storage': ('storage', ''), + 'network': ('network', ''), 'pve_services': ('node', ''), 'security': ('user', ''), - 'updates': ('node', ''), 'storage': ('storage', ''), + 'updates': ('node', ''), 'logs': ('node', ''), 'vms': ('vm', ''), } # Map health-persistence category names to our TEMPLATES event types. 
@@ -1412,14 +1501,14 @@ class PollingCollector: 'load': 'load_high', 'temperature': 'temp_high', 'disk': 'disk_space_low', + 'disks': 'disk_io_error', # I/O errors from health monitor + 'smart': 'disk_io_error', # SMART errors from health monitor + 'zfs': 'disk_io_error', # ZFS pool/disk errors 'storage': 'storage_unavailable', 'network': 'network_down', 'pve_services': 'service_fail', 'security': 'auth_fail', 'updates': 'update_summary', - 'zfs': 'disk_io_error', - 'smart': 'disk_io_error', - 'disks': 'disk_io_error', 'logs': 'system_problem', 'vms': 'system_problem', } @@ -1547,34 +1636,46 @@ class PollingCollector: # Determine if we should notify is_new = error_key not in self._known_errors last_sent = self._last_notified.get(error_key, 0) - is_due = (now - last_sent) >= self.DIGEST_INTERVAL + cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL) + is_due = (now - last_sent) >= cat_cooldown - # For re-notifications (not new): skip if stale OR not due + # Anti-oscillation: even if "new" (resolved then reappeared), + # respect the per-category cooldown interval. This prevents + # "semi-cascades" where the same root cause generates multiple + # slightly different notifications across health check cycles. + # Each category has its own appropriate cooldown (30m for network, + # 24h for disks, 1h for temperature, etc.). + if not is_due: + continue + + # For re-notifications (not new): also skip if stale if not is_new: - if error_is_stale or not is_due: + if error_is_stale: continue # Map to our event type event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem') entity, eid = self._ENTITY_MAP.get(category, ('node', '')) - # ── SMART gate for disk errors ── - # If the health monitor recorded a disk error but SMART is NOT - # FAILED, skip the notification entirely. Disk notifications - # should ONLY be sent when SMART confirms a real hardware failure. 
- # This prevents WARNING-level disk errors (SMART: unavailable) - # from being emitted as notifications at all. + # ── Disk I/O notification policy ── + # Disk I/O errors are ALWAYS notified (even when SMART says Passed) + # because recurring I/O errors are real issues that should not be hidden. + # The 24h cooldown is enforced per-device by NotificationManager + # (event_type 'disk_io_error' gets 86400s cooldown). + # For transient/INFO-level disk events (SMART OK, low error count), + # the health monitor already resolves them, so they won't appear here. if category in ('disks', 'smart', 'zfs'): - details = error.get('details', {}) - if isinstance(details, str): + details_raw = error.get('details', {}) + if isinstance(details_raw, str): try: - details = json.loads(details) + details_raw = json.loads(details_raw) except (json.JSONDecodeError, TypeError): - details = {} - smart_status = details.get('smart_status', '') if isinstance(details, dict) else '' - if smart_status != 'FAILED': - # SMART is PASSED, UNKNOWN, or unavailable -- don't notify - continue + details_raw = {} + if isinstance(details_raw, dict): + # Extract device name for a stable entity_id (24h cooldown key) + dev = details_raw.get('device', details_raw.get('disk', '')) + if dev: + eid = f'disk_{dev}' # Stable per-device fingerprint # Updates are always informational notifications except # system_age which can be WARNING (365+ days) or CRITICAL (548+ days). @@ -2020,11 +2121,12 @@ class ProxmoxHookWatcher: msg_lower = (message or '').lower() title_lower_sm = (title or '').lower() - # ── Filter smartd noise ── - # FailedReadSmartErrorLog: smartd can't read the error log -- this is - # a firmware quirk on some WD/Seagate drives, NOT a disk failure. - # FailedReadSmartData: similar firmware issue. - # These should NOT generate notifications. + # ── Record disk observation regardless of noise filter ── + # Even "noise" events are recorded as observations so the user + # can see them in the Storage UI. 
We just don't send notifications. + self._record_smartd_observation(title or '', message or '') + + # ── Filter smartd noise (suppress notification, not observation) ── smartd_noise = [ 'failedreadsmarterrorlog', 'failedreadsmartdata', diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py index 6b8445d3..81125c15 100644 --- a/AppImage/scripts/notification_manager.py +++ b/AppImage/scripts/notification_manager.py @@ -767,11 +767,29 @@ class NotificationManager: # Same as Proxmox's notification policy. The JournalWatcher already # gates these through SMART verification + its own 24h dedup, but # this acts as defense-in-depth in case a disk event arrives from - # another source (PollingCollector, hooks, etc.). + # another source (PollingCollector, hooks, health monitor, etc.). _DISK_EVENTS = {'disk_io_error', 'storage_unavailable'} if event.event_type in _DISK_EVENTS and cooldown_str is None: cooldown = 86400 # 24 hours + # Health monitor state_change events: per-category cooldowns. + # Different health categories need different re-notification intervals. + # This is the defense-in-depth layer matching HealthEventWatcher's + # _CATEGORY_COOLDOWNS to prevent semi-cascades across all categories. + _HEALTH_CATEGORY_COOLDOWNS = { + 'disks': 86400, 'smart': 86400, 'zfs': 86400, # 24h + 'storage': 3600, 'temperature': 3600, 'logs': 3600, + 'security': 3600, 'disk': 3600, # 1h + 'network': 1800, 'pve_services': 1800, + 'vms': 1800, 'cpu': 1800, 'memory': 1800, # 30m + 'updates': 86400, # 24h + } + if event.event_type == 'state_change' and event.source == 'health': + cat = (event.data or {}).get('category', '') + cat_cd = _HEALTH_CATEGORY_COOLDOWNS.get(cat) + if cat_cd and cooldown_str is None: + cooldown = max(cooldown, cat_cd) + # Backup/replication events: each execution is unique and should # always be delivered. A 10s cooldown prevents exact duplicates # (webhook + tasks) but allows repeated backup jobs to report.