update health_monitor.py

2026-06-11 11:06:24 +00:00 · 2026-04-16 19:10:47 +02:00
parent 7f2b0c5de1
commit ee1204c566
3 changed files with 353 additions and 312 deletions
--- a/AppImage/components/proxmox-dashboard.tsx
+++ b/AppImage/components/proxmox-dashboard.tsx
@@ -74,6 +74,7 @@ export function ProxmoxDashboard() {
    serverName: "Loading...",
    nodeId: "Loading...",
  })
  const [isInitialLoading, setIsInitialLoading] = useState(true)
  const [isRefreshing, setIsRefreshing] = useState(false)
  const [isServerConnected, setIsServerConnected] = useState(true)
  const [componentKey, setComponentKey] = useState(0)
@@ -192,10 +193,10 @@ export function ProxmoxDashboard() {
  }, [])
  useEffect(() => {
-  // Siempre fetch inicial
+  // Siempre fetch inicial — mark loading done when system data arrives
-  fetchSystemData()
+  fetchSystemData().finally(() => setIsInitialLoading(false))
-  fetchHealthInfoCount() // Fetch info count on initial load
+  fetchHealthInfoCount()
-  fetchUpdateStatus() // Fetch ProxMenux update status on initial load
+  fetchUpdateStatus()
    // En overview: cada 30 segundos para actualización frecuente del estado de salud
    // En otras tabs: cada 60 segundos para reducir carga
@@ -372,6 +373,21 @@ export function ProxmoxDashboard() {
    }
  }
  if (isInitialLoading) {
    return (
      <div className="min-h-screen bg-background flex items-center justify-center">
        <div className="flex flex-col items-center gap-4">
          <div className="relative">
            <div className="h-12 w-12 rounded-full border-2 border-muted"></div>
            <div className="absolute inset-0 h-12 w-12 rounded-full border-2 border-transparent border-t-primary animate-spin"></div>
          </div>
          <div className="text-sm font-medium text-foreground">Loading ProxMenux Monitor...</div>
          <p className="text-xs text-muted-foreground">Connecting to server and fetching system status</p>
        </div>
      </div>
    )
  }
  return (
    <div className="min-h-screen bg-background">
      <OnboardingCarousel />
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -272,6 +272,8 @@ class HealthMonitor:
        # SMART check cache - reduces disk queries from every 5 min to every 30 min
        self._smart_cache = {}  # {disk_name: {'result': 'PASSED', 'time': timestamp}}
        self._SMART_CACHE_TTL = 1620  # 27 min - offset to avoid sync with other processes
        # Disk identity cache - avoids repeated smartctl -i calls for serial/model
        self._disk_identity_cache: Dict[str, Dict[str, str]] = {}  # {disk_name: {'serial': ..., 'model': ...}}
        # Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
        self._journalctl_24h_cache = {'count': 0, 'time': 0}
@@ -280,11 +282,14 @@ class HealthMonitor:
        # Journalctl 10min cache - shared across checks to avoid duplicate calls
        # Multiple checks (cpu_temp, vms_cts) use the same journalctl query
        self._journalctl_10min_cache = {'output': '', 'time': 0}
-        self._JOURNALCTL_10MIN_CACHE_TTL = 60  # 1 minute - fresh enough for health checks
+        self._JOURNALCTL_10MIN_CACHE_TTL = 120  # 2 minutes - covers full health check cycle
        # Journalctl 1hour cache - for disk health events (SMART warnings, I/O errors)
        self._journalctl_1hour_cache = {'output': '', 'time': 0}
        self._JOURNALCTL_1HOUR_CACHE_TTL = 300  # 5 min cache - disk events don't need real-time
        # Timestamp watermark: track last successfully processed journalctl entry
        # to avoid re-processing old entries on subsequent checks
        self._disk_journal_last_ts: Optional[str] = None
        # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
        # SMART detection still uses filesystem check on init (lightweight)
@@ -316,7 +321,7 @@ class HealthMonitor:
                ['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
-                timeout=20
+                timeout=10
            )
            if result.returncode == 0:
                cache['output'] = result.stdout
@@ -330,37 +335,49 @@ class HealthMonitor:
        return cache.get('output', '')  # Return stale cache on error
    def _get_journalctl_1hour_warnings(self) -> str:
-        """Get journalctl warnings from last 1 hour, cached for disk health checks.
+        """Get journalctl warnings since last check, cached for disk health checks.
-        
+
        Used by _check_disk_health_from_events for SMART warnings and I/O errors.
        Uses a timestamp watermark (_disk_journal_last_ts) to only read NEW entries
        since the last successful check, preventing re-processing of old errors.
        On first run (no watermark), reads the last 10 minutes to catch recent events
        without pulling in stale history.
        Cached for 5 minutes since disk events don't require real-time detection.
        """
        current_time = time.time()
        cache = self._journalctl_1hour_cache
-        
+
        # Return cached result if fresh
-        if cache['output'] and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
+        if cache['output'] is not None and cache['time'] > 0 and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
            return cache['output']
-        
+
-        # Execute journalctl and cache result
+        # Determine --since value: use watermark if available, otherwise 10 minutes
-        # Use -b 0 to only include logs from the current boot
+        if self._disk_journal_last_ts:
            since_arg = self._disk_journal_last_ts
        else:
            since_arg = '10 minutes ago'
        try:
            result = subprocess.run(
-                ['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
+                ['journalctl', '-b', '0', '--since', since_arg, '--no-pager', '-p', 'warning',
                 '--output=short-precise'],
                capture_output=True,
                text=True,
                timeout=15
            )
            if result.returncode == 0:
-                cache['output'] = result.stdout
+                output = result.stdout
                cache['output'] = output
                cache['time'] = current_time
-                return cache['output']
+                # Advance watermark to "now" so next check only gets new entries
                from datetime import datetime as _dt
                self._disk_journal_last_ts = _dt.now().strftime('%Y-%m-%d %H:%M:%S')
                return output
        except subprocess.TimeoutExpired:
-            print("[HealthMonitor] journalctl 1hour cache: timeout")
+            print("[HealthMonitor] journalctl disk cache: timeout")
        except Exception as e:
-            print(f"[HealthMonitor] journalctl 1hour cache error: {e}")
+            print(f"[HealthMonitor] journalctl disk cache error: {e}")
-        
+
        return cache.get('output', '')  # Return stale cache on error
    # ─── Lightweight sampling methods for the dedicated vital-signs thread ───
@@ -1260,21 +1277,10 @@ class HealthMonitor:
                reason = f'{disk}: {issue["reason"]}'
                severity = issue.get('status', 'WARNING')
-                # Get serial for this disk to properly track it (important for USB disks)
+                # Get serial for this disk (cached to avoid repeated smartctl calls)
-                disk_serial = ''
+                disk_id = self._get_disk_identity(device)
-                disk_model = ''
+                disk_serial = disk_id['serial']
-                try:
+                disk_model = disk_id['model']
                    smart_result = subprocess.run(
                        ['smartctl', '-i', '-j', f'/dev/{device}'],
                        capture_output=True, text=True, timeout=5
                    )
                    if smart_result.returncode in (0, 4):
                        import json
                        smart_data = json.loads(smart_result.stdout)
                        disk_serial = smart_data.get('serial_number', '')
                        disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
                except Exception:
                    pass
                try:
                    if (not health_persistence.is_error_active(io_error_key, category='disks') and
@@ -1323,21 +1329,10 @@ class HealthMonitor:
                device = disk_path.replace('/dev/', '')
                io_severity = disk_info.get('status', 'WARNING').lower()
-                # Get serial for proper disk tracking (important for USB)
+                # Get serial for proper disk tracking (cached)
-                io_serial = ''
+                io_id = self._get_disk_identity(device)
-                io_model = ''
+                io_serial = io_id['serial']
-                try:
+                io_model = io_id['model']
                    smart_result = subprocess.run(
                        ['smartctl', '-i', '-j', f'/dev/{device}'],
                        capture_output=True, text=True, timeout=5
                    )
                    if smart_result.returncode in (0, 4):
                        import json
                        smart_data = json.loads(smart_result.stdout)
                        io_serial = smart_data.get('serial_number', '')
                        io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
                except Exception:
                    pass
                # Register the disk for observation tracking (worst_health no longer used)
                try:
@@ -1946,20 +1941,53 @@ class HealthMonitor:
        except Exception:
            return ''
    def _get_disk_identity(self, disk_name: str) -> Dict[str, str]:
        """Get disk serial/model with caching. Avoids repeated smartctl -i calls.
        Returns {'serial': '...', 'model': '...'} or empty values on failure.
        Cache persists for the lifetime of the monitor (serial/model don't change).
        """
        if disk_name in self._disk_identity_cache:
            return self._disk_identity_cache[disk_name]
        result = {'serial': '', 'model': ''}
        try:
            dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
            proc = subprocess.run(
                ['smartctl', '-i', '-j', dev_path],
                capture_output=True, text=True, timeout=5
            )
            if proc.returncode in (0, 4):
                import json as _json
                data = _json.loads(proc.stdout)
                result['serial'] = data.get('serial_number', '')
                result['model'] = data.get('model_name', '') or data.get('model_family', '')
        except Exception:
            pass
        self._disk_identity_cache[disk_name] = result
        return result
    def _quick_smart_health(self, disk_name: str) -> str:
        """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.
-        
+
        Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
        """
        if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
            return 'UNKNOWN'
-        # Check cache first
+        # Check cache first (and evict stale entries periodically)
        current_time = time.time()
        cache_key = disk_name
        cached = self._smart_cache.get(cache_key)
        if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
            return cached['result']
        # Evict expired entries to prevent unbounded growth
        if len(self._smart_cache) > 50:
            self._smart_cache = {
                k: v for k, v in self._smart_cache.items()
                if current_time - v['time'] < self._SMART_CACHE_TTL * 2
            }
        try:
            dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
@@ -2130,7 +2158,11 @@ class HealthMonitor:
                        t for t in self.io_error_history[disk]
                        if current_time - t < 300
                    ]
-                    
+                    # Remove empty entries to prevent unbounded dict growth
                    if not self.io_error_history[disk]:
                        del self.io_error_history[disk]
                        continue
                    error_count = len(self.io_error_history[disk])
                    error_key = f'disk_{disk}'
                    sample = disk_samples.get(disk, '')
@@ -4662,19 +4694,9 @@ class HealthMonitor:
                    obs_sig = f'{sig_base}_{disk_name}'
-                    # Try to get serial for proper cross-referencing
+                    # Get serial for proper cross-referencing (cached)
-                    obs_serial = None
+                    obs_id = self._get_disk_identity(disk_name)
-                    try:
+                    obs_serial = obs_id['serial'] or None
                        sm = subprocess.run(
                            ['smartctl', '-i', dev_path],
                            capture_output=True, text=True, timeout=3)
                        if sm.returncode in (0, 4):
                            for sline in sm.stdout.split('\n'):
                                if 'Serial Number' in sline or 'Serial number' in sline:
                                    obs_serial = sline.split(':')[-1].strip()
                                    break
                    except Exception:
                        pass
                    health_persistence.record_disk_observation(
                        device_name=disk_name,
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -54,7 +54,12 @@ class HealthPersistence:
        self._init_database()
    def _get_conn(self) -> sqlite3.Connection:
-        """Get a SQLite connection with timeout and WAL mode for safe concurrency."""
+        """Get a SQLite connection with timeout and WAL mode for safe concurrency.
        IMPORTANT: Always close the connection when done, preferably using
        the _db_connection() context manager. If not closed explicitly,
        Python's GC will close it, but this is unreliable under load.
        """
        conn = sqlite3.connect(str(self.db_path), timeout=30)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA busy_timeout=10000')
@@ -332,7 +337,32 @@ class HealthPersistence:
            print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}")
        else:
            print(f"[HealthPersistence] Database initialized with {len(tables)} tables")
-        
+
        # ─── Startup migration: clean stale looping disk I/O errors ───
        # Previous versions had a bug where journal-based disk errors were
        # re-processed every cycle, causing infinite notification loops.
        # On upgrade, clean up any stale disk errors that are stuck in the
        # active state from the old buggy behavior.
        try:
            cursor = conn.cursor()
            # Delete active (unresolved) disk errors from journal that are
            # older than 2 hours — these are leftovers from the feedback loop.
            # Real new errors will be re-detected from fresh journal entries.
            cutoff = (datetime.now() - timedelta(hours=2)).isoformat()
            cursor.execute('''
                DELETE FROM errors
                WHERE error_key LIKE 'smart_%'
                  AND resolved_at IS NULL
                  AND acknowledged = 0
                  AND last_seen < ?
            ''', (cutoff,))
            cleaned = cursor.rowcount
            if cleaned > 0:
                conn.commit()
                print(f"[HealthPersistence] Startup cleanup: removed {cleaned} stale disk error(s) from previous bug")
        except Exception as e:
            print(f"[HealthPersistence] Startup cleanup warning: {e}")
        conn.close()
    def record_error(self, error_key: str, category: str, severity: str, 
@@ -345,21 +375,17 @@ class HealthPersistence:
            return self._record_error_impl(error_key, category, severity, reason, details)
    def _record_error_impl(self, error_key, category, severity, reason, details):
-        # === RESOURCE EXISTENCE CHECK ===
+        # === RESOURCE EXISTENCE CHECK (before DB access) ===
        # Skip recording errors for resources that no longer exist
        # This prevents "ghost" errors from stale journal entries
        # Check VM/CT existence
        if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))):
            import re
            vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
            if vmid_match:
                vmid = vmid_match.group(1)
                if not self._check_vm_ct_exists(vmid):
-                    return {'type': 'skipped', 'needs_notification': False, 
+                    return {'type': 'skipped', 'needs_notification': False,
                            'reason': f'VM/CT {vmid} no longer exists'}
-        
+
        # Check disk existence
        if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')):
            import re
            import os
@@ -370,161 +396,151 @@ class HealthPersistence:
                if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'):
                    return {'type': 'skipped', 'needs_notification': False,
                            'reason': f'Disk /dev/{disk_name} no longer exists'}
-        
+
        conn = self._get_conn()
-        cursor = conn.cursor()
+        try:
-        
+            cursor = conn.cursor()
-        now = datetime.now().isoformat()
+            now = datetime.now().isoformat()
-        details_json = json.dumps(details) if details else None
+            details_json = json.dumps(details) if details else None
-        
+
-        cursor.execute('''
+            cursor.execute('''
-            SELECT id, acknowledged, resolved_at, category, severity, first_seen, 
+                SELECT id, acknowledged, resolved_at, category, severity, first_seen,
-                   notification_sent, suppression_hours
+                       notification_sent, suppression_hours
-            FROM errors WHERE error_key = ?
+                FROM errors WHERE error_key = ?
-        ''', (error_key,))
+            ''', (error_key,))
-        existing = cursor.fetchone()
+            existing = cursor.fetchone()
-        
+
-        event_info = {'type': 'updated', 'needs_notification': False}
+            event_info = {'type': 'updated', 'needs_notification': False}
-        
+
-        if existing:
+            if existing:
-            err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing
+                err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing
-            
+
-            if ack == 1:
+                if ack == 1:
-                # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
+                    # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
-                # regardless of any dismiss/permanent setting (hardware protection)
+                    if error_key == 'cpu_temperature' and severity == 'CRITICAL':
                if error_key == 'cpu_temperature' and severity == 'CRITICAL':
                    cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                    cursor.execute('''
                        INSERT INTO errors 
                        (error_key, category, severity, reason, details, first_seen, last_seen)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    ''', (error_key, category, severity, reason, details_json, now, now))
                    event_info = {'type': 'new', 'needs_notification': True}
                    self._record_event(cursor, 'new', error_key, 
                                      {'severity': severity, 'reason': reason,
                                       'note': 'CRITICAL temperature override - safety alert'})
                    conn.commit()
                    conn.close()
                    return event_info
                # Check suppression: use per-record stored hours (set at dismiss time)
                sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS
                # Permanent dismiss (sup_hours == -1): always suppress
                if sup_hours == -1:
                    conn.close()
                    return {'type': 'skipped_acknowledged', 'needs_notification': False}
                # Time-limited suppression
                still_suppressed = False
                if resolved_at:
                    try:
                        resolved_dt = datetime.fromisoformat(resolved_at)
                        elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600
                        still_suppressed = elapsed_hours < sup_hours
                    except Exception:
                        pass
                if still_suppressed:
                    conn.close()
                    return {'type': 'skipped_acknowledged', 'needs_notification': False}
                else:
                    # Suppression expired.
                    # For log-based errors (spike, persistent, cascade),
                    # do NOT re-trigger.  The journal always contains old
                    # messages, so re-creating the error would cause an
                    # infinite notification cycle.  Instead, just delete
                    # the stale record so it stops appearing in the UI.
                    is_log_error = (
                        error_key.startswith('log_persistent_')
                        or error_key.startswith('log_spike_')
                        or error_key.startswith('log_cascade_')
                        or error_key.startswith('log_critical_')
                        or category == 'logs'
                    )
                    if is_log_error:
                        cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                        conn.commit()
                        conn.close()
                        return {'type': 'skipped_expired_log', 'needs_notification': False}
                    # For non-log errors (hardware, services, etc.),
                    # re-triggering is correct -- the condition is real
                    # and still present.
                    cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                    cursor.execute('''
                        INSERT INTO errors 
                        (error_key, category, severity, reason, details, first_seen, last_seen)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    ''', (error_key, category, severity, reason, details_json, now, now))
                    event_info = {'type': 'new', 'needs_notification': True}
                    self._record_event(cursor, 'new', error_key, 
                                      {'severity': severity, 'reason': reason,
                                       'note': 'Re-triggered after suppression expired'})
                    conn.commit()
                    conn.close()
                    return event_info
            # Not acknowledged - update existing active error
            cursor.execute('''
                UPDATE errors 
                SET last_seen = ?, severity = ?, reason = ?, details = ?
                WHERE error_key = ? AND acknowledged = 0
            ''', (now, severity, reason, details_json, error_key))
            # Check if severity escalated
            if old_severity == 'WARNING' and severity == 'CRITICAL':
                event_info['type'] = 'escalated'
                event_info['needs_notification'] = True
        else:
            # Insert new error
            cursor.execute('''
                INSERT INTO errors 
                (error_key, category, severity, reason, details, first_seen, last_seen)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (error_key, category, severity, reason, details_json, now, now))
            event_info['type'] = 'new'
            event_info['needs_notification'] = True
        # ─── Auto-suppress: if the category has a non-default setting, ───
        # auto-dismiss immediately so the user never sees it as active.
        # Exception: CRITICAL CPU temperature is never auto-suppressed.
        if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
            setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
            if setting_key:
                # P4 fix: use _get_setting_impl with existing connection to avoid deadlock
                stored = self._get_setting_impl(conn, setting_key)
                if stored is not None:
                    configured_hours = int(stored)
                    if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
                        # Non-default setting found: auto-acknowledge
                        # Mark as acknowledged but DO NOT set resolved_at - error remains active
                        cursor.execute('''
-                            UPDATE errors 
+                            INSERT INTO errors
-                            SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
+                            (error_key, category, severity, reason, details, first_seen, last_seen)
-                            WHERE error_key = ? AND acknowledged = 0
+                            VALUES (?, ?, ?, ?, ?, ?, ?)
-                        ''', (now, configured_hours, error_key))
+                        ''', (error_key, category, severity, reason, details_json, now, now))
-                        
+                        event_info = {'type': 'new', 'needs_notification': True}
-                        if cursor.rowcount > 0:
+                        self._record_event(cursor, 'new', error_key,
-                            self._record_event(cursor, 'auto_suppressed', error_key, {
+                                          {'severity': severity, 'reason': reason,
-                                'severity': severity,
+                                           'note': 'CRITICAL temperature override - safety alert'})
-                                'reason': reason,
+                        conn.commit()
-                                'suppression_hours': configured_hours,
+                        return event_info
-                                'note': 'Auto-suppressed by user settings'
+
-                            })
+                    # Check suppression: use per-record stored hours (set at dismiss time)
-                            event_info['type'] = 'auto_suppressed'
+                    sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS
-                            event_info['needs_notification'] = False
+
                    # Permanent dismiss (sup_hours == -1): always suppress
                    if sup_hours == -1:
                        return {'type': 'skipped_acknowledged', 'needs_notification': False}
                    # Time-limited suppression
                    still_suppressed = False
                    if resolved_at:
                        try:
                            resolved_dt = datetime.fromisoformat(resolved_at)
                            elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600
                            still_suppressed = elapsed_hours < sup_hours
                        except Exception:
                            pass
                    if still_suppressed:
                        return {'type': 'skipped_acknowledged', 'needs_notification': False}
                    else:
                        # Suppression expired.
                        # Journal-sourced errors (logs AND disk I/O) should NOT
                        # re-trigger after suppression.  The journal always contains
                        # old messages, so re-creating the error causes an infinite
                        # notification loop.  Delete the stale record instead.
                        is_journal_error = (
                            error_key.startswith('log_persistent_')
                            or error_key.startswith('log_spike_')
                            or error_key.startswith('log_cascade_')
                            or error_key.startswith('log_critical_')
                            or error_key.startswith('smart_')
                            or error_key.startswith('disk_')
                            or error_key.startswith('io_error_')
                            or category == 'logs'
                        )
                        if is_journal_error:
                            cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                            conn.commit()
-                            conn.close()
+                            return {'type': 'skipped_expired_journal', 'needs_notification': False}
-                            return event_info
+
-        
+                        # For non-log errors (hardware, services, etc.),
-        # Record event
+                        # re-triggering is correct -- the condition is real and still present.
-        self._record_event(cursor, event_info['type'], error_key, 
+                        cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
-                          {'severity': severity, 'reason': reason})
+                        cursor.execute('''
-        
+                            INSERT INTO errors
-        conn.commit()
+                            (error_key, category, severity, reason, details, first_seen, last_seen)
-        conn.close()
+                            VALUES (?, ?, ?, ?, ?, ?, ?)
                        ''', (error_key, category, severity, reason, details_json, now, now))
                        event_info = {'type': 'new', 'needs_notification': True}
                        self._record_event(cursor, 'new', error_key,
                                          {'severity': severity, 'reason': reason,
                                           'note': 'Re-triggered after suppression expired'})
                        conn.commit()
                        return event_info
                # Not acknowledged - update existing active error
                cursor.execute('''
                    UPDATE errors
                    SET last_seen = ?, severity = ?, reason = ?, details = ?
                    WHERE error_key = ? AND acknowledged = 0
                ''', (now, severity, reason, details_json, error_key))
                # Check if severity escalated
                if old_severity == 'WARNING' and severity == 'CRITICAL':
                    event_info['type'] = 'escalated'
                    event_info['needs_notification'] = True
            else:
                # Insert new error
                cursor.execute('''
                    INSERT INTO errors
                    (error_key, category, severity, reason, details, first_seen, last_seen)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (error_key, category, severity, reason, details_json, now, now))
                event_info['type'] = 'new'
                event_info['needs_notification'] = True
            # ─── Auto-suppress: if the category has a non-default setting, ───
            if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
                setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
                if setting_key:
                    stored = self._get_setting_impl(conn, setting_key)
                    if stored is not None:
                        configured_hours = int(stored)
                        if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
                            cursor.execute('''
                                UPDATE errors
                                SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
                                WHERE error_key = ? AND acknowledged = 0
                            ''', (now, configured_hours, error_key))
                            if cursor.rowcount > 0:
                                self._record_event(cursor, 'auto_suppressed', error_key, {
                                    'severity': severity,
                                    'reason': reason,
                                    'suppression_hours': configured_hours,
                                    'note': 'Auto-suppressed by user settings'
                                })
                                event_info['type'] = 'auto_suppressed'
                                event_info['needs_notification'] = False
                                conn.commit()
                                return event_info
            # Record event
            self._record_event(cursor, event_info['type'], error_key,
                              {'severity': severity, 'reason': reason})
            conn.commit()
        finally:
            conn.close()
        return event_info
@@ -534,22 +550,20 @@ class HealthPersistence:
            return self._resolve_error_impl(error_key, reason)
    def _resolve_error_impl(self, error_key, reason):
-        conn = self._get_conn()
+        with self._db_connection() as conn:
-        cursor = conn.cursor()
+            cursor = conn.cursor()
-        
+            now = datetime.now().isoformat()
-        now = datetime.now().isoformat()
+
-        
+            cursor.execute('''
-        cursor.execute('''
+                UPDATE errors
-            UPDATE errors 
+                SET resolved_at = ?
-            SET resolved_at = ?
+                WHERE error_key = ? AND resolved_at IS NULL
-            WHERE error_key = ? AND resolved_at IS NULL
+            ''', (now, error_key))
-        ''', (now, error_key))
+
-        
+            if cursor.rowcount > 0:
-        if cursor.rowcount > 0:
+                self._record_event(cursor, 'resolved', error_key, {'reason': reason})
-            self._record_event(cursor, 'resolved', error_key, {'reason': reason})
+
-        
+            conn.commit()
        conn.commit()
        conn.close()
    def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
        """
@@ -626,37 +640,34 @@ class HealthPersistence:
        we delete the record entirely so it can re-trigger as a fresh
        event if the condition returns later.
        """
-        conn = self._get_conn()
+        with self._db_connection() as conn:
-        cursor = conn.cursor()
+            cursor = conn.cursor()
-        
+            now = datetime.now().isoformat()
-        now = datetime.now().isoformat()
+
-        
+            # Check if this error was acknowledged (dismissed)
        # Check if this error was acknowledged (dismissed)
        cursor.execute('''
            SELECT acknowledged FROM errors WHERE error_key = ?
        ''', (error_key,))
        row = cursor.fetchone()
        if row and row[0] == 1:
            # Dismissed error that naturally resolved - delete entirely
            # so it can re-trigger as a new event if it happens again
            cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
            if cursor.rowcount > 0:
                self._record_event(cursor, 'cleared', error_key, 
                                  {'reason': 'condition_resolved_after_dismiss'})
        else:
            # Normal active error - mark as resolved
            cursor.execute('''
-                UPDATE errors 
+                SELECT acknowledged FROM errors WHERE error_key = ?
-                SET resolved_at = ?
+            ''', (error_key,))
-                WHERE error_key = ? AND resolved_at IS NULL
+            row = cursor.fetchone()
-            ''', (now, error_key))
+
-            
+            if row and row[0] == 1:
-            if cursor.rowcount > 0:
+                # Dismissed error that naturally resolved - delete entirely
-                self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
+                cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
-        
+                if cursor.rowcount > 0:
-        conn.commit()
+                    self._record_event(cursor, 'cleared', error_key,
-        conn.close()
+                                      {'reason': 'condition_resolved_after_dismiss'})
            else:
                # Normal active error - mark as resolved
                cursor.execute('''
                    UPDATE errors
                    SET resolved_at = ?
                    WHERE error_key = ? AND resolved_at IS NULL
                ''', (now, error_key))
                if cursor.rowcount > 0:
                    self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
            conn.commit()
    def acknowledge_error(self, error_key: str) -> Dict[str, Any]:
        """
@@ -671,17 +682,19 @@ class HealthPersistence:
    def _acknowledge_error_impl(self, error_key):
        conn = self._get_conn()
        conn.row_factory = sqlite3.Row
-        cursor = conn.cursor()
+        category = ''
-        
+        sup_hours = self.DEFAULT_SUPPRESSION_HOURS
-        now = datetime.now().isoformat()
+        try:
-        
+            cursor = conn.cursor()
-        # Get current error info before acknowledging
+            now = datetime.now().isoformat()
-        cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
+
-        row = cursor.fetchone()
+            # Get current error info before acknowledging
-        
+            cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
-        result = {'success': False, 'error_key': error_key}
+            row = cursor.fetchone()
-        
+
-        if not row:
+            result = {'success': False, 'error_key': error_key}
            if not row:
            # Error not in DB yet -- create a minimal record so the dismiss persists.
            # Try to infer category from the error_key prefix.
            category = ''
@@ -738,15 +751,6 @@ class HealthPersistence:
                'acknowledged_at': now
            }
            conn.commit()
            conn.close()
            # ── Clear cooldowns for newly dismissed errors too ──
            if sup_hours != -1:
                if category == 'disks':
                    self._clear_disk_io_cooldown(error_key)
                else:
                    self._clear_notification_cooldown(error_key)
            return result
        if row:
@@ -809,21 +813,17 @@ class HealthPersistence:
                'suppression_hours': sup_hours
            }
-        conn.commit()
+            conn.commit()
-        conn.close()
+        finally:
-        
+            conn.close()
        # ── Coordinate with notification cooldowns ──
        # When an error is dismissed with non-permanent suppression,
        # clear the corresponding cooldown in notification_last_sent
        # so it can re-notify after the suppression period expires.
        # This applies to ALL categories, not just disks.
        if sup_hours != -1:
            if category == 'disks':
                self._clear_disk_io_cooldown(error_key)
            else:
                # For non-disk categories, clear the PollingCollector cooldown
                self._clear_notification_cooldown(error_key)
-        
+
        return result
    def is_error_acknowledged(self, error_key: str) -> bool:
@@ -928,12 +928,13 @@ class HealthPersistence:
    def _cleanup_old_errors_impl(self):
        conn = self._get_conn()
-        cursor = conn.cursor()
+        try:
-        
+            cursor = conn.cursor()
-        now = datetime.now()
+
-        now_iso = now.isoformat()
+            now = datetime.now()
-        
+            now_iso = now.isoformat()
-        # Delete resolved errors older than 7 days
+
            # Delete resolved errors older than 7 days
        cutoff_resolved = (now - timedelta(days=7)).isoformat()
        cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
@@ -1127,12 +1128,13 @@ class HealthPersistence:
        except Exception:
            pass  # If we can't read uptime, skip this cleanup
-        conn.commit()
+            conn.commit()
-        conn.close()
+        finally:
-        
+            conn.close()
        # Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
        self._cleanup_stale_resources()
-        
+
        # Clean up disk observations for devices that no longer exist
        self.cleanup_orphan_observations()
@@ -2174,6 +2176,8 @@ class HealthPersistence:
            last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'
            # Upsert observation: if same (disk, type, signature), bump count + update last timestamp
            # IMPORTANT: Do NOT reset dismissed — if the user dismissed this observation,
            # re-detecting the same journal entry must not un-dismiss it.
            cursor.execute(f'''
                INSERT INTO disk_observations
                    (disk_registry_id, {type_col}, error_signature, {first_col},
@@ -2182,8 +2186,7 @@ class HealthPersistence:
                ON CONFLICT(disk_registry_id, {type_col}, error_signature) DO UPDATE SET
                    {last_col} = excluded.{last_col},
                    occurrence_count = occurrence_count + 1,
-                    severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
+                    severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END
                    dismissed = 0
            ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))
            conn.commit()