From ee1204c5666cf2972caf6c19aa4cffffd9f866bd Mon Sep 17 00:00:00 2001 From: MacRimi Date: Thu, 16 Apr 2026 19:10:47 +0200 Subject: [PATCH] update health_monitor.py --- AppImage/components/proxmox-dashboard.tsx | 24 +- AppImage/scripts/health_monitor.py | 144 ++++--- AppImage/scripts/health_persistence.py | 497 +++++++++++----------- 3 files changed, 353 insertions(+), 312 deletions(-) diff --git a/AppImage/components/proxmox-dashboard.tsx b/AppImage/components/proxmox-dashboard.tsx index a4b23d44..8e00cfbd 100644 --- a/AppImage/components/proxmox-dashboard.tsx +++ b/AppImage/components/proxmox-dashboard.tsx @@ -74,6 +74,7 @@ export function ProxmoxDashboard() { serverName: "Loading...", nodeId: "Loading...", }) + const [isInitialLoading, setIsInitialLoading] = useState(true) const [isRefreshing, setIsRefreshing] = useState(false) const [isServerConnected, setIsServerConnected] = useState(true) const [componentKey, setComponentKey] = useState(0) @@ -192,10 +193,10 @@ export function ProxmoxDashboard() { }, []) useEffect(() => { - // Siempre fetch inicial - fetchSystemData() - fetchHealthInfoCount() // Fetch info count on initial load - fetchUpdateStatus() // Fetch ProxMenux update status on initial load + // Siempre fetch inicial — mark loading done when system data arrives + fetchSystemData().finally(() => setIsInitialLoading(false)) + fetchHealthInfoCount() + fetchUpdateStatus() // En overview: cada 30 segundos para actualización frecuente del estado de salud // En otras tabs: cada 60 segundos para reducir carga @@ -372,6 +373,21 @@ export function ProxmoxDashboard() { } } + if (isInitialLoading) { + return ( +
+
+
+
+
+
+
Loading ProxMenux Monitor...
+

Connecting to server and fetching system status

+
+
+ ) + } + return (
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 4d475179..2e3cdd7c 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -272,6 +272,8 @@ class HealthMonitor: # SMART check cache - reduces disk queries from every 5 min to every 30 min self._smart_cache = {} # {disk_name: {'result': 'PASSED', 'time': timestamp}} self._SMART_CACHE_TTL = 1620 # 27 min - offset to avoid sync with other processes + # Disk identity cache - avoids repeated smartctl -i calls for serial/model + self._disk_identity_cache: Dict[str, Dict[str, str]] = {} # {disk_name: {'serial': ..., 'model': ...}} # Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour self._journalctl_24h_cache = {'count': 0, 'time': 0} @@ -280,11 +282,14 @@ class HealthMonitor: # Journalctl 10min cache - shared across checks to avoid duplicate calls # Multiple checks (cpu_temp, vms_cts) use the same journalctl query self._journalctl_10min_cache = {'output': '', 'time': 0} - self._JOURNALCTL_10MIN_CACHE_TTL = 60 # 1 minute - fresh enough for health checks + self._JOURNALCTL_10MIN_CACHE_TTL = 120 # 2 minutes - covers full health check cycle # Journalctl 1hour cache - for disk health events (SMART warnings, I/O errors) self._journalctl_1hour_cache = {'output': '', 'time': 0} self._JOURNALCTL_1HOUR_CACHE_TTL = 300 # 5 min cache - disk events don't need real-time + # Timestamp watermark: track last successfully processed journalctl entry + # to avoid re-processing old entries on subsequent checks + self._disk_journal_last_ts: Optional[str] = None # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) # SMART detection still uses filesystem check on init (lightweight) @@ -316,7 +321,7 @@ class HealthMonitor: ['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'], capture_output=True, text=True, - timeout=20 + timeout=10 ) if result.returncode == 0: cache['output'] = result.stdout @@ -330,37 +335,49 @@ class HealthMonitor: return cache.get('output', '') # Return stale cache on error def _get_journalctl_1hour_warnings(self) -> str: - """Get journalctl warnings from last 1 hour, cached for disk health checks. - + """Get journalctl warnings since last check, cached for disk health checks. + Used by _check_disk_health_from_events for SMART warnings and I/O errors. + Uses a timestamp watermark (_disk_journal_last_ts) to only read NEW entries + since the last successful check, preventing re-processing of old errors. + On first run (no watermark), reads the last 10 minutes to catch recent events + without pulling in stale history. Cached for 5 minutes since disk events don't require real-time detection. """ current_time = time.time() cache = self._journalctl_1hour_cache - + # Return cached result if fresh - if cache['output'] and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL: + if cache['output'] is not None and cache['time'] > 0 and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL: return cache['output'] - - # Execute journalctl and cache result - # Use -b 0 to only include logs from the current boot + + # Determine --since value: use watermark if available, otherwise 10 minutes + if self._disk_journal_last_ts: + since_arg = self._disk_journal_last_ts + else: + since_arg = '10 minutes ago' + try: result = subprocess.run( - ['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning', + ['journalctl', '-b', '0', '--since', since_arg, '--no-pager', '-p', 'warning', '--output=short-precise'], capture_output=True, text=True, timeout=15 ) if result.returncode == 0: - cache['output'] = result.stdout + output = result.stdout + cache['output'] = output cache['time'] = current_time - return cache['output'] + # Advance watermark to "now" so next check only gets new entries + from datetime import datetime as _dt + self._disk_journal_last_ts = _dt.now().strftime('%Y-%m-%d %H:%M:%S') + return output except subprocess.TimeoutExpired: - print("[HealthMonitor] journalctl 1hour cache: timeout") + print("[HealthMonitor] journalctl disk cache: timeout") except Exception as e: - print(f"[HealthMonitor] journalctl 1hour cache error: {e}") - + print(f"[HealthMonitor] journalctl disk cache error: {e}") + return cache.get('output', '') # Return stale cache on error # ─── Lightweight sampling methods for the dedicated vital-signs thread ─── @@ -1260,21 +1277,10 @@ class HealthMonitor: reason = f'{disk}: {issue["reason"]}' severity = issue.get('status', 'WARNING') - # Get serial for this disk to properly track it (important for USB disks) - disk_serial = '' - disk_model = '' - try: - smart_result = subprocess.run( - ['smartctl', '-i', '-j', f'/dev/{device}'], - capture_output=True, text=True, timeout=5 - ) - if smart_result.returncode in (0, 4): - import json - smart_data = json.loads(smart_result.stdout) - disk_serial = smart_data.get('serial_number', '') - disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '') - except Exception: - pass + # Get serial for this disk (cached to avoid repeated smartctl calls) + disk_id = self._get_disk_identity(device) + disk_serial = disk_id['serial'] + disk_model = disk_id['model'] try: if (not health_persistence.is_error_active(io_error_key, category='disks') and @@ -1323,21 +1329,10 @@ class HealthMonitor: device = disk_path.replace('/dev/', '') io_severity = disk_info.get('status', 'WARNING').lower() - # Get serial for proper disk tracking (important for USB) - io_serial = '' - io_model = '' - try: - smart_result = subprocess.run( - ['smartctl', '-i', '-j', f'/dev/{device}'], - capture_output=True, text=True, timeout=5 - ) - if smart_result.returncode in (0, 4): - import json - smart_data = json.loads(smart_result.stdout) - io_serial = smart_data.get('serial_number', '') - io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '') - except Exception: - pass + # Get serial for proper disk tracking (cached) + io_id = self._get_disk_identity(device) + io_serial = io_id['serial'] + io_model = io_id['model'] # Register the disk for observation tracking (worst_health no longer used) try: @@ -1946,20 +1941,53 @@ class HealthMonitor: except Exception: return '' + def _get_disk_identity(self, disk_name: str) -> Dict[str, str]: + """Get disk serial/model with caching. Avoids repeated smartctl -i calls. + + Returns {'serial': '...', 'model': '...'} or empty values on failure. + Cache persists for the lifetime of the monitor (serial/model don't change). + """ + if disk_name in self._disk_identity_cache: + return self._disk_identity_cache[disk_name] + + result = {'serial': '', 'model': ''} + try: + dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name + proc = subprocess.run( + ['smartctl', '-i', '-j', dev_path], + capture_output=True, text=True, timeout=5 + ) + if proc.returncode in (0, 4): + import json as _json + data = _json.loads(proc.stdout) + result['serial'] = data.get('serial_number', '') + result['model'] = data.get('model_name', '') or data.get('model_family', '') + except Exception: + pass + + self._disk_identity_cache[disk_name] = result + return result + def _quick_smart_health(self, disk_name: str) -> str: """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'. - + Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes. """ if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'): return 'UNKNOWN' - # Check cache first + # Check cache first (and evict stale entries periodically) current_time = time.time() cache_key = disk_name cached = self._smart_cache.get(cache_key) if cached and current_time - cached['time'] < self._SMART_CACHE_TTL: return cached['result'] + # Evict expired entries to prevent unbounded growth + if len(self._smart_cache) > 50: + self._smart_cache = { + k: v for k, v in self._smart_cache.items() + if current_time - v['time'] < self._SMART_CACHE_TTL * 2 + } try: dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name @@ -2130,7 +2158,11 @@ class HealthMonitor: t for t in self.io_error_history[disk] if current_time - t < 300 ] - + # Remove empty entries to prevent unbounded dict growth + if not self.io_error_history[disk]: + del self.io_error_history[disk] + continue + error_count = len(self.io_error_history[disk]) error_key = f'disk_{disk}' sample = disk_samples.get(disk, '') @@ -4662,19 +4694,9 @@ class HealthMonitor: obs_sig = f'{sig_base}_{disk_name}' - # Try to get serial for proper cross-referencing - obs_serial = None - try: - sm = subprocess.run( - ['smartctl', '-i', dev_path], - capture_output=True, text=True, timeout=3) - if sm.returncode in (0, 4): - for sline in sm.stdout.split('\n'): - if 'Serial Number' in sline or 'Serial number' in sline: - obs_serial = sline.split(':')[-1].strip() - break - except Exception: - pass + # Get serial for proper cross-referencing (cached) + obs_id = self._get_disk_identity(disk_name) + obs_serial = obs_id['serial'] or None health_persistence.record_disk_observation( device_name=disk_name, diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index f98c34b9..21ab723c 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -54,7 +54,12 @@ class HealthPersistence: self._init_database() def _get_conn(self) -> sqlite3.Connection: - """Get a SQLite connection with timeout and WAL mode for safe concurrency.""" + """Get a SQLite connection with timeout and WAL mode for safe concurrency. + + IMPORTANT: Always close the connection when done, preferably using + the _db_connection() context manager. If not closed explicitly, + Python's GC will close it, but this is unreliable under load. + """ conn = sqlite3.connect(str(self.db_path), timeout=30) conn.execute('PRAGMA journal_mode=WAL') conn.execute('PRAGMA busy_timeout=10000') @@ -332,7 +337,32 @@ class HealthPersistence: print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}") else: print(f"[HealthPersistence] Database initialized with {len(tables)} tables") - + + # ─── Startup migration: clean stale looping disk I/O errors ─── + # Previous versions had a bug where journal-based disk errors were + # re-processed every cycle, causing infinite notification loops. + # On upgrade, clean up any stale disk errors that are stuck in the + # active state from the old buggy behavior. + try: + cursor = conn.cursor() + # Delete active (unresolved) disk errors from journal that are + # older than 2 hours — these are leftovers from the feedback loop. + # Real new errors will be re-detected from fresh journal entries. + cutoff = (datetime.now() - timedelta(hours=2)).isoformat() + cursor.execute(''' + DELETE FROM errors + WHERE error_key LIKE 'smart_%' + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + ''', (cutoff,)) + cleaned = cursor.rowcount + if cleaned > 0: + conn.commit() + print(f"[HealthPersistence] Startup cleanup: removed {cleaned} stale disk error(s) from previous bug") + except Exception as e: + print(f"[HealthPersistence] Startup cleanup warning: {e}") + conn.close() def record_error(self, error_key: str, category: str, severity: str, @@ -345,21 +375,17 @@ class HealthPersistence: return self._record_error_impl(error_key, category, severity, reason, details) def _record_error_impl(self, error_key, category, severity, reason, details): - # === RESOURCE EXISTENCE CHECK === + # === RESOURCE EXISTENCE CHECK (before DB access) === # Skip recording errors for resources that no longer exist - # This prevents "ghost" errors from stale journal entries - - # Check VM/CT existence if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))): import re vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key) if vmid_match: vmid = vmid_match.group(1) if not self._check_vm_ct_exists(vmid): - return {'type': 'skipped', 'needs_notification': False, + return {'type': 'skipped', 'needs_notification': False, 'reason': f'VM/CT {vmid} no longer exists'} - - # Check disk existence + if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')): import re import os @@ -370,161 +396,151 @@ class HealthPersistence: if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'): return {'type': 'skipped', 'needs_notification': False, 'reason': f'Disk /dev/{disk_name} no longer exists'} - + conn = self._get_conn() - cursor = conn.cursor() - - now = datetime.now().isoformat() - details_json = json.dumps(details) if details else None - - cursor.execute(''' - SELECT id, acknowledged, resolved_at, category, severity, first_seen, - notification_sent, suppression_hours - FROM errors WHERE error_key = ? - ''', (error_key,)) - existing = cursor.fetchone() - - event_info = {'type': 'updated', 'needs_notification': False} - - if existing: - err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing - - if ack == 1: - # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers - # regardless of any dismiss/permanent setting (hardware protection) - if error_key == 'cpu_temperature' and severity == 'CRITICAL': - cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) - cursor.execute(''' - INSERT INTO errors - (error_key, category, severity, reason, details, first_seen, last_seen) - VALUES (?, ?, ?, ?, ?, ?, ?) - ''', (error_key, category, severity, reason, details_json, now, now)) - event_info = {'type': 'new', 'needs_notification': True} - self._record_event(cursor, 'new', error_key, - {'severity': severity, 'reason': reason, - 'note': 'CRITICAL temperature override - safety alert'}) - conn.commit() - conn.close() - return event_info - - # Check suppression: use per-record stored hours (set at dismiss time) - sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS - - # Permanent dismiss (sup_hours == -1): always suppress - if sup_hours == -1: - conn.close() - return {'type': 'skipped_acknowledged', 'needs_notification': False} - - # Time-limited suppression - still_suppressed = False - if resolved_at: - try: - resolved_dt = datetime.fromisoformat(resolved_at) - elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600 - still_suppressed = elapsed_hours < sup_hours - except Exception: - pass - - if still_suppressed: - conn.close() - return {'type': 'skipped_acknowledged', 'needs_notification': False} - else: - # Suppression expired. - # For log-based errors (spike, persistent, cascade), - # do NOT re-trigger. The journal always contains old - # messages, so re-creating the error would cause an - # infinite notification cycle. Instead, just delete - # the stale record so it stops appearing in the UI. - is_log_error = ( - error_key.startswith('log_persistent_') - or error_key.startswith('log_spike_') - or error_key.startswith('log_cascade_') - or error_key.startswith('log_critical_') - or category == 'logs' - ) - if is_log_error: + try: + cursor = conn.cursor() + now = datetime.now().isoformat() + details_json = json.dumps(details) if details else None + + cursor.execute(''' + SELECT id, acknowledged, resolved_at, category, severity, first_seen, + notification_sent, suppression_hours + FROM errors WHERE error_key = ? + ''', (error_key,)) + existing = cursor.fetchone() + + event_info = {'type': 'updated', 'needs_notification': False} + + if existing: + err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing + + if ack == 1: + # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers + if error_key == 'cpu_temperature' and severity == 'CRITICAL': cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) - conn.commit() - conn.close() - return {'type': 'skipped_expired_log', 'needs_notification': False} - - # For non-log errors (hardware, services, etc.), - # re-triggering is correct -- the condition is real - # and still present. - cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) - cursor.execute(''' - INSERT INTO errors - (error_key, category, severity, reason, details, first_seen, last_seen) - VALUES (?, ?, ?, ?, ?, ?, ?) - ''', (error_key, category, severity, reason, details_json, now, now)) - event_info = {'type': 'new', 'needs_notification': True} - self._record_event(cursor, 'new', error_key, - {'severity': severity, 'reason': reason, - 'note': 'Re-triggered after suppression expired'}) - conn.commit() - conn.close() - return event_info - - # Not acknowledged - update existing active error - cursor.execute(''' - UPDATE errors - SET last_seen = ?, severity = ?, reason = ?, details = ? - WHERE error_key = ? AND acknowledged = 0 - ''', (now, severity, reason, details_json, error_key)) - - # Check if severity escalated - if old_severity == 'WARNING' and severity == 'CRITICAL': - event_info['type'] = 'escalated' - event_info['needs_notification'] = True - else: - # Insert new error - cursor.execute(''' - INSERT INTO errors - (error_key, category, severity, reason, details, first_seen, last_seen) - VALUES (?, ?, ?, ?, ?, ?, ?) - ''', (error_key, category, severity, reason, details_json, now, now)) - - event_info['type'] = 'new' - event_info['needs_notification'] = True - - # ─── Auto-suppress: if the category has a non-default setting, ─── - # auto-dismiss immediately so the user never sees it as active. - # Exception: CRITICAL CPU temperature is never auto-suppressed. - if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'): - setting_key = self.CATEGORY_SETTING_MAP.get(category, '') - if setting_key: - # P4 fix: use _get_setting_impl with existing connection to avoid deadlock - stored = self._get_setting_impl(conn, setting_key) - if stored is not None: - configured_hours = int(stored) - if configured_hours != self.DEFAULT_SUPPRESSION_HOURS: - # Non-default setting found: auto-acknowledge - # Mark as acknowledged but DO NOT set resolved_at - error remains active cursor.execute(''' - UPDATE errors - SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ? - WHERE error_key = ? AND acknowledged = 0 - ''', (now, configured_hours, error_key)) - - if cursor.rowcount > 0: - self._record_event(cursor, 'auto_suppressed', error_key, { - 'severity': severity, - 'reason': reason, - 'suppression_hours': configured_hours, - 'note': 'Auto-suppressed by user settings' - }) - event_info['type'] = 'auto_suppressed' - event_info['needs_notification'] = False + INSERT INTO errors + (error_key, category, severity, reason, details, first_seen, last_seen) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (error_key, category, severity, reason, details_json, now, now)) + event_info = {'type': 'new', 'needs_notification': True} + self._record_event(cursor, 'new', error_key, + {'severity': severity, 'reason': reason, + 'note': 'CRITICAL temperature override - safety alert'}) + conn.commit() + return event_info + + # Check suppression: use per-record stored hours (set at dismiss time) + sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS + + # Permanent dismiss (sup_hours == -1): always suppress + if sup_hours == -1: + return {'type': 'skipped_acknowledged', 'needs_notification': False} + + # Time-limited suppression + still_suppressed = False + if resolved_at: + try: + resolved_dt = datetime.fromisoformat(resolved_at) + elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600 + still_suppressed = elapsed_hours < sup_hours + except Exception: + pass + + if still_suppressed: + return {'type': 'skipped_acknowledged', 'needs_notification': False} + else: + # Suppression expired. + # Journal-sourced errors (logs AND disk I/O) should NOT + # re-trigger after suppression. The journal always contains + # old messages, so re-creating the error causes an infinite + # notification loop. Delete the stale record instead. + is_journal_error = ( + error_key.startswith('log_persistent_') + or error_key.startswith('log_spike_') + or error_key.startswith('log_cascade_') + or error_key.startswith('log_critical_') + or error_key.startswith('smart_') + or error_key.startswith('disk_') + or error_key.startswith('io_error_') + or category == 'logs' + ) + if is_journal_error: + cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) conn.commit() - conn.close() - return event_info - - # Record event - self._record_event(cursor, event_info['type'], error_key, - {'severity': severity, 'reason': reason}) - - conn.commit() - conn.close() + return {'type': 'skipped_expired_journal', 'needs_notification': False} + + # For non-log errors (hardware, services, etc.), + # re-triggering is correct -- the condition is real and still present. + cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) + cursor.execute(''' + INSERT INTO errors + (error_key, category, severity, reason, details, first_seen, last_seen) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (error_key, category, severity, reason, details_json, now, now)) + event_info = {'type': 'new', 'needs_notification': True} + self._record_event(cursor, 'new', error_key, + {'severity': severity, 'reason': reason, + 'note': 'Re-triggered after suppression expired'}) + conn.commit() + return event_info + + # Not acknowledged - update existing active error + cursor.execute(''' + UPDATE errors + SET last_seen = ?, severity = ?, reason = ?, details = ? + WHERE error_key = ? AND acknowledged = 0 + ''', (now, severity, reason, details_json, error_key)) + + # Check if severity escalated + if old_severity == 'WARNING' and severity == 'CRITICAL': + event_info['type'] = 'escalated' + event_info['needs_notification'] = True + else: + # Insert new error + cursor.execute(''' + INSERT INTO errors + (error_key, category, severity, reason, details, first_seen, last_seen) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (error_key, category, severity, reason, details_json, now, now)) + + event_info['type'] = 'new' + event_info['needs_notification'] = True + + # ─── Auto-suppress: if the category has a non-default setting, ─── + if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'): + setting_key = self.CATEGORY_SETTING_MAP.get(category, '') + if setting_key: + stored = self._get_setting_impl(conn, setting_key) + if stored is not None: + configured_hours = int(stored) + if configured_hours != self.DEFAULT_SUPPRESSION_HOURS: + cursor.execute(''' + UPDATE errors + SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ? + WHERE error_key = ? AND acknowledged = 0 + ''', (now, configured_hours, error_key)) + + if cursor.rowcount > 0: + self._record_event(cursor, 'auto_suppressed', error_key, { + 'severity': severity, + 'reason': reason, + 'suppression_hours': configured_hours, + 'note': 'Auto-suppressed by user settings' + }) + event_info['type'] = 'auto_suppressed' + event_info['needs_notification'] = False + conn.commit() + return event_info + + # Record event + self._record_event(cursor, event_info['type'], error_key, + {'severity': severity, 'reason': reason}) + + conn.commit() + finally: + conn.close() return event_info @@ -534,22 +550,20 @@ class HealthPersistence: return self._resolve_error_impl(error_key, reason) def _resolve_error_impl(self, error_key, reason): - conn = self._get_conn() - cursor = conn.cursor() - - now = datetime.now().isoformat() - - cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE error_key = ? AND resolved_at IS NULL - ''', (now, error_key)) - - if cursor.rowcount > 0: - self._record_event(cursor, 'resolved', error_key, {'reason': reason}) - - conn.commit() - conn.close() + with self._db_connection() as conn: + cursor = conn.cursor() + now = datetime.now().isoformat() + + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE error_key = ? AND resolved_at IS NULL + ''', (now, error_key)) + + if cursor.rowcount > 0: + self._record_event(cursor, 'resolved', error_key, {'reason': reason}) + + conn.commit() def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool: """ @@ -626,37 +640,34 @@ class HealthPersistence: we delete the record entirely so it can re-trigger as a fresh event if the condition returns later. """ - conn = self._get_conn() - cursor = conn.cursor() - - now = datetime.now().isoformat() - - # Check if this error was acknowledged (dismissed) - cursor.execute(''' - SELECT acknowledged FROM errors WHERE error_key = ? - ''', (error_key,)) - row = cursor.fetchone() - - if row and row[0] == 1: - # Dismissed error that naturally resolved - delete entirely - # so it can re-trigger as a new event if it happens again - cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) - if cursor.rowcount > 0: - self._record_event(cursor, 'cleared', error_key, - {'reason': 'condition_resolved_after_dismiss'}) - else: - # Normal active error - mark as resolved + with self._db_connection() as conn: + cursor = conn.cursor() + now = datetime.now().isoformat() + + # Check if this error was acknowledged (dismissed) cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE error_key = ? AND resolved_at IS NULL - ''', (now, error_key)) - - if cursor.rowcount > 0: - self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'}) - - conn.commit() - conn.close() + SELECT acknowledged FROM errors WHERE error_key = ? + ''', (error_key,)) + row = cursor.fetchone() + + if row and row[0] == 1: + # Dismissed error that naturally resolved - delete entirely + cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) + if cursor.rowcount > 0: + self._record_event(cursor, 'cleared', error_key, + {'reason': 'condition_resolved_after_dismiss'}) + else: + # Normal active error - mark as resolved + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE error_key = ? AND resolved_at IS NULL + ''', (now, error_key)) + + if cursor.rowcount > 0: + self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'}) + + conn.commit() def acknowledge_error(self, error_key: str) -> Dict[str, Any]: """ @@ -671,17 +682,19 @@ class HealthPersistence: def _acknowledge_error_impl(self, error_key): conn = self._get_conn() conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - now = datetime.now().isoformat() - - # Get current error info before acknowledging - cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,)) - row = cursor.fetchone() - - result = {'success': False, 'error_key': error_key} - - if not row: + category = '' + sup_hours = self.DEFAULT_SUPPRESSION_HOURS + try: + cursor = conn.cursor() + now = datetime.now().isoformat() + + # Get current error info before acknowledging + cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,)) + row = cursor.fetchone() + + result = {'success': False, 'error_key': error_key} + + if not row: # Error not in DB yet -- create a minimal record so the dismiss persists. # Try to infer category from the error_key prefix. category = '' @@ -738,15 +751,6 @@ class HealthPersistence: 'acknowledged_at': now } conn.commit() - conn.close() - - # ── Clear cooldowns for newly dismissed errors too ── - if sup_hours != -1: - if category == 'disks': - self._clear_disk_io_cooldown(error_key) - else: - self._clear_notification_cooldown(error_key) - return result if row: @@ -809,21 +813,17 @@ class HealthPersistence: 'suppression_hours': sup_hours } - conn.commit() - conn.close() - + conn.commit() + finally: + conn.close() + # ── Coordinate with notification cooldowns ── - # When an error is dismissed with non-permanent suppression, - # clear the corresponding cooldown in notification_last_sent - # so it can re-notify after the suppression period expires. - # This applies to ALL categories, not just disks. if sup_hours != -1: if category == 'disks': self._clear_disk_io_cooldown(error_key) else: - # For non-disk categories, clear the PollingCollector cooldown self._clear_notification_cooldown(error_key) - + return result def is_error_acknowledged(self, error_key: str) -> bool: @@ -928,12 +928,13 @@ class HealthPersistence: def _cleanup_old_errors_impl(self): conn = self._get_conn() - cursor = conn.cursor() - - now = datetime.now() - now_iso = now.isoformat() - - # Delete resolved errors older than 7 days + try: + cursor = conn.cursor() + + now = datetime.now() + now_iso = now.isoformat() + + # Delete resolved errors older than 7 days cutoff_resolved = (now - timedelta(days=7)).isoformat() cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,)) @@ -1127,12 +1128,13 @@ class HealthPersistence: except Exception: pass # If we can't read uptime, skip this cleanup - conn.commit() - conn.close() - + conn.commit() + finally: + conn.close() + # Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed) self._cleanup_stale_resources() - + # Clean up disk observations for devices that no longer exist self.cleanup_orphan_observations() @@ -2174,6 +2176,8 @@ class HealthPersistence: last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen' # Upsert observation: if same (disk, type, signature), bump count + update last timestamp + # IMPORTANT: Do NOT reset dismissed — if the user dismissed this observation, + # re-detecting the same journal entry must not un-dismiss it. cursor.execute(f''' INSERT INTO disk_observations (disk_registry_id, {type_col}, error_signature, {first_col}, @@ -2182,8 +2186,7 @@ class HealthPersistence: ON CONFLICT(disk_registry_id, {type_col}, error_signature) DO UPDATE SET {last_col} = excluded.{last_col}, occurrence_count = occurrence_count + 1, - severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END, - dismissed = 0 + severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END ''', (disk_id, error_type, error_signature, now, now, raw_message, severity)) conn.commit()