diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index a0f893d2..f98c34b9 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -739,6 +739,14 @@ class HealthPersistence:
             }
             conn.commit()
             conn.close()
+
+            # ── Clear cooldowns for newly dismissed errors too ──
+            if sup_hours != -1:
+                if category == 'disks':
+                    self._clear_disk_io_cooldown(error_key)
+                else:
+                    self._clear_notification_cooldown(error_key)
+
             return result
 
         if row:
@@ -803,6 +811,19 @@ class HealthPersistence:
 
         conn.commit()
         conn.close()
+
+        # ── Coordinate with notification cooldowns ──
+        # When an error is dismissed with non-permanent suppression,
+        # clear the corresponding cooldown in notification_last_sent
+        # so it can re-notify after the suppression period expires.
+        # This applies to ALL categories, not just disks.
+        if sup_hours != -1:
+            if category == 'disks':
+                self._clear_disk_io_cooldown(error_key)
+            else:
+                # For non-disk categories, clear the PollingCollector cooldown
+                self._clear_notification_cooldown(error_key)
+
         return result
 
     def is_error_acknowledged(self, error_key: str) -> bool:
@@ -2732,5 +2753,127 @@ class HealthPersistence:
             return set()
 
+    def _clear_notification_cooldown(self, error_key: str):
+        """
+        Clear notification cooldown from notification_last_sent for non-disk errors.
+
+        This coordinates with PollingCollector's 24h cooldown system.
+        When any error is dismissed, we remove the corresponding cooldown entry
+        so the error can be re-detected and re-notified after the suppression period expires.
+
+        The PollingCollector uses 'health_' prefix for all its fingerprints, so
+        'health_<error_key>' is covered by the substring match below.
+        Clearing is best-effort: failures are logged, never raised.
+        """
+        if not error_key:
+            # An empty key would turn the pattern into '%%' and wipe every cooldown
+            return
+
+        try:
+            # '_' and '%' are LIKE wildcards; escape them so a key such as
+            # 'cpu_temp' matches only itself and not e.g. 'cpuXtemp'.
+            escaped = (
+                error_key.replace('\\', '\\\\')
+                .replace('%', '\\%')
+                .replace('_', '\\_')
+            )
+
+            conn = self._get_conn()
+            try:
+                cursor = conn.cursor()
+                # A single statement removes 'health_<error_key>' and every
+                # other fingerprint containing the key, so rowcount is the
+                # true total rather than the count of the last DELETE only.
+                cursor.execute(
+                    "DELETE FROM notification_last_sent "
+                    "WHERE fingerprint LIKE ? ESCAPE '\\'",
+                    (f'%{escaped}%',)
+                )
+                deleted_count = cursor.rowcount
+                conn.commit()
+            finally:
+                # Release the connection even when the DELETE or commit raised
+                conn.close()
+
+            if deleted_count > 0:
+                print(f"[HealthPersistence] Cleared notification cooldowns for {error_key}")
+        except Exception as e:
+            print(f"[HealthPersistence] Error clearing notification cooldown: {e}")
+
+    def _clear_disk_io_cooldown(self, error_key: str):
+        """
+        Clear disk I/O cooldowns from notification_last_sent when an error is dismissed.
+
+        This coordinates with BOTH:
+        1. JournalWatcher's 24h cooldown system (prefixes: diskio_, fs_)
+        2. PollingCollector's 24h cooldown system (prefix: health_)
+
+        When a disk error is dismissed, we remove the corresponding cooldown entries
+        so the error can be re-detected and re-notified after the suppression period expires.
+
+        Matches fingerprints that name the device, its parent disk, or one of
+        their partitions as a whole token, like:
+        - diskio_sdh, diskio_sda, diskio_nvme0n1
+        - fs_sdh1, fs_sda2
+        - health_disk_smart_sdh, health_disk_io_error_sdh
+        - sdh (direct device name used by JournalWatcher)
+
+        Matching is token-based on purpose: dismissing 'sda' must not clear
+        'sdab', and 'md0' must not clear fingerprints that merely contain 'md'.
+        Serial-based keys (fs_serial_XXXXX) are cleared only when they contain
+        the device name. Clearing is best-effort: failures are logged, never raised.
+        """
+        try:
+            # Extract device name from error_key
+            # Common patterns: disk_fs_sdh, disk_smart_sda, disk_io_error_sdh, smart_sdh
+            import re
+            device_match = re.search(r'(?:disk_fs_|disk_smart_|disk_io_error_|disk_|smart_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
+            if not device_match:
+                # Try to extract device from error_key directly if no pattern matches
+                # e.g., error_key might just be the device name
+                device_match = re.match(r'^([a-z]{2,4}[a-z0-9]*)$', error_key)
+                if not device_match:
+                    return
+
+            device = device_match.group(1)
+            # Parent disk of a partition. NVMe/MMC whole-disk names end in a
+            # digit and use a 'p<N>' partition suffix; only sdX-style names carry
+            # the partition number directly (md0 or sr0 are whole devices).
+            if re.fullmatch(r'(?:nvme\d+n\d+|mmcblk\d+)p\d+', device):
+                base_device = re.sub(r'p\d+$', '', device)  # nvme0n1p2 -> nvme0n1
+            elif re.fullmatch(r'(?:sd|hd|vd|xvd)[a-z]+\d+', device):
+                base_device = re.sub(r'\d+$', '', device)  # sdh1 -> sdh
+            else:
+                base_device = device
+
+            # One token-bounded pattern covering both names plus an optional
+            # partition suffix ('p<N>' after a digit, '<N>' otherwise).
+            names = '|'.join(
+                re.escape(name) + (r'(?:p\d+)?' if name[-1].isdigit() else r'\d*')
+                for name in sorted({device, base_device})
+            )
+            stale_re = re.compile(rf'(?<![a-z0-9])(?:{names})(?![a-z0-9])')
+
+            conn = self._get_conn()
+            try:
+                cursor = conn.cursor()
+                cursor.execute('SELECT fingerprint FROM notification_last_sent')
+                stale = [
+                    (row[0],) for row in cursor.fetchall()
+                    if isinstance(row[0], str) and stale_re.search(row[0])
+                ]
+                if stale:
+                    cursor.executemany(
+                        'DELETE FROM notification_last_sent WHERE fingerprint = ?',
+                        stale
+                    )
+                    conn.commit()
+            finally:
+                # Release the connection even when a statement raised
+                conn.close()
+            print(f"[HealthPersistence] Cleared disk I/O cooldowns for {error_key} (device: {device})")
+        except Exception as e:
+            print(f"[HealthPersistence] Error clearing disk I/O cooldown: {e}")
+
 
 # Global instance
 health_persistence = HealthPersistence()
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py
index 9ab76a35..5e764068 100644
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -319,6 +319,49 @@ class JournalWatcher:
         except Exception as e:
             print(f"[JournalWatcher] Failed to save disk_io_notified: {e}")
 
+    def _get_disk_io_cooldown_from_db(self, device: str) -> Optional[float]:
+        """
+        Get disk I/O cooldown timestamp from DB for a device.
+
+        Used to re-check DB when user might have dismissed the error,
+        which clears the DB entry via health_persistence._clear_disk_io_cooldown().
+
+        `device` is looked up verbatim as the fingerprint, so pass the same
+        key that is used in self._disk_io_notified (e.g. 'sdh' or 'fs_sdh1').
+
+        Returns the timestamp if found and within 24h window, None if the
+        entry is missing or expired. When the DB cannot be read, falls back
+        to the in-memory timestamp so a transient DB error (e.g. a locked
+        database) never bypasses the cooldown.
+        """
+        try:
+            db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
+            if not db_path.exists():
+                # Nothing persisted, so a dismiss cannot have cleared anything
+                return self._disk_io_notified.get(device)
+            conn = sqlite3.connect(str(db_path), timeout=5)
+            try:
+                conn.execute('PRAGMA busy_timeout=3000')
+                cursor = conn.cursor()
+                cursor.execute(
+                    "SELECT last_sent_ts FROM notification_last_sent WHERE fingerprint = ?",
+                    (device,)
+                )
+                row = cursor.fetchone()
+            finally:
+                # Close the connection even when the query raised
+                conn.close()
+
+            if row and row[0] is not None:
+                ts = float(row[0])
+                # Only return if within 24h window
+                if time.time() - ts < self._DISK_IO_COOLDOWN:
+                    return ts
+            return None
+        except Exception as e:
+            print(f"[JournalWatcher] Failed to read disk I/O cooldown from DB: {e}")
+            return self._disk_io_notified.get(device)
+
     def stop(self):
         """Stop the journal watcher."""
         self._running = False
@@ -589,7 +632,13 @@ class JournalWatcher:
                 fs_dedup_key = f'fs_{device}'
                 last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
                 if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
-                    return  # Already notified for this device recently
+                    # In-memory says cooldown active. Re-check DB in case
+                    # user dismissed the error (which clears DB cooldowns).
+                    db_ts = self._get_disk_io_cooldown_from_db(fs_dedup_key)
+                    if db_ts is not None and now_fs - db_ts < self._DISK_IO_COOLDOWN:
+                        return  # DB confirms cooldown is still active
+                    # DB says cooldown was cleared - proceed
+                    self._disk_io_notified.pop(fs_dedup_key, None)
 
                 # ── Device existence gating ──
                 device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
@@ -842,10 +891,23 @@ class JournalWatcher:
             return
 
         # ── Gate 2: 24-hour dedup per device ──
+        # Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
+        # If user dismissed the error, _clear_disk_io_cooldown() removed the DB
+        # entry, so we should refresh from DB to get the real state.
         now = time.time()
+
+        # First check in-memory cache
         last_notified = self._disk_io_notified.get(resolved, 0)
+
         if now - last_notified < self._DISK_IO_COOLDOWN:
-            return  # Already notified for this disk recently
+            # In-memory says we already notified. But user might have dismissed
+            # the error, which clears the DB. Re-check DB to be sure.
+            db_ts = self._get_disk_io_cooldown_from_db(resolved)
+            if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
+                return  # DB confirms cooldown is still active
+            # DB says cooldown was cleared (user dismissed) - proceed to notify
+            # (the stale in-memory entry is overwritten by the assignment below)
+
         self._disk_io_notified[resolved] = now
         self._save_disk_io_notified(resolved, now)
 
@@ -2069,8 +2131,16 @@ class PollingCollector:
                 # ── SAME ERROR COOLDOWN (24h) ──
                 # The SAME error_key cannot be re-notified before 24 hours.
                 # This is the PRIMARY deduplication mechanism.
+                # EXCEPTION: If user dismissed the error, the cooldown is cleared in DB
+                # and we should re-check DB to see if cooldown still applies.
                 if time_since_last < self.SAME_ERROR_COOLDOWN:
-                    continue
+                    # Check if user dismissed this - clears DB cooldown
+                    db_ts = self._get_cooldown_from_db(error_key)
+                    if db_ts is not None and now - db_ts < self.SAME_ERROR_COOLDOWN:
+                        continue  # DB confirms cooldown still active
+                    # DB says cooldown was cleared (user dismissed) - remove from memory
+                    self._last_notified.pop(error_key, None)
+                    # Continue to the next checks (category cooldown etc.)
 
                 # ── CATEGORY COOLDOWN (varies) ──
                 # DIFFERENT errors within the same category respect category cooldown.
@@ -2735,6 +2805,52 @@ class PollingCollector:
             conn.close()
         except Exception:
             pass
+
+    def _get_cooldown_from_db(self, error_key: str) -> Optional[float]:
+        """
+        Get cooldown timestamp from DB for an error_key.
+
+        Used to re-check DB when user might have dismissed the error,
+        which clears the DB entry via health_persistence._clear_disk_io_cooldown()
+        (disk errors) or health_persistence._clear_notification_cooldown()
+        (every other category).
+
+        Returns the timestamp if found and within 24h window, None if the
+        entry is missing or expired. When the DB cannot be read, falls back
+        to the in-memory timestamp so a transient DB error (e.g. a locked
+        database) never bypasses the cooldown.
+        """
+        try:
+            db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
+            if not db_path.exists():
+                # Nothing persisted, so a dismiss cannot have cleared anything
+                return self._last_notified.get(error_key)
+            conn = sqlite3.connect(str(db_path), timeout=5)
+            try:
+                conn.execute('PRAGMA busy_timeout=3000')
+                cursor = conn.cursor()
+
+                # PollingCollector uses 'health_' prefix for its fingerprints
+                fp = f'health_{error_key}'
+                cursor.execute(
+                    "SELECT last_sent_ts FROM notification_last_sent WHERE fingerprint = ?",
+                    (fp,)
+                )
+                row = cursor.fetchone()
+            finally:
+                # Close the connection even when the query raised
+                conn.close()
+
+            if row and row[0] is not None:
+                ts = float(row[0])
+                # Only return if within 24h window
+                if time.time() - ts < self.SAME_ERROR_COOLDOWN:
+                    return ts
+            return None
+        except Exception as e:
+            # NOTE(review): assumes _last_notified maps error_key -> epoch seconds - confirm
+            print(f"[PollingCollector] Failed to read cooldown from DB: {e}")
+            return self._last_notified.get(error_key)
 
 
 # ─── Proxmox Webhook Receiver ───────────────────────────────────