Update health_monitor.py

MacRimi
2025-11-27 13:29:15 +01:00
parent f22de50527
commit 41537c0bad


@@ -12,6 +12,7 @@ import subprocess
 import json
 import time
 import os
+import hashlib  # Added for MD5 hashing
 from typing import Dict, List, Any, Tuple, Optional
 from datetime import datetime, timedelta
 from collections import defaultdict
@@ -420,12 +421,18 @@ class HealthMonitor:
             return {'status': 'UNKNOWN', 'reason': f'CPU check failed: {str(e)}'}

     def _check_cpu_temperature(self) -> Optional[Dict[str, Any]]:
-        """Check CPU temperature with hysteresis (5 min sustained) - cached, max 1 check per minute"""
+        """
+        Check CPU temperature with temporal logic:
+        - WARNING if temp >80°C sustained for >3 minutes
+        - Auto-clears if temp ≤80°C for 30 seconds
+        - No dismiss button (non-dismissable)
+        """
         cache_key = 'cpu_temp'
         current_time = time.time()

+        # Check every 10 seconds instead of 60
         if cache_key in self.last_check_times:
-            if current_time - self.last_check_times[cache_key] < 60:
+            if current_time - self.last_check_times[cache_key] < 10:
                 return self.cached_results.get(cache_key)

         try:
@@ -455,35 +462,54 @@ class HealthMonitor:
                 'time': current_time
             })

-            # Keep last 6 minutes of data
+            # Keep last 4 minutes of data (240 seconds)
             self.state_history[state_key] = [
                 entry for entry in self.state_history[state_key]
-                if current_time - entry['time'] < 360
+                if current_time - entry['time'] < 240
             ]

-            # Check sustained high temperature (5 minutes)
-            critical_temp_samples = [
-                entry for entry in self.state_history[state_key]
-                if entry['value'] >= self.TEMP_CRITICAL and
-                current_time - entry['time'] <= 300
-            ]
-            warning_temp_samples = [
-                entry for entry in self.state_history[state_key]
-                if entry['value'] >= self.TEMP_WARNING and
-                current_time - entry['time'] <= 300
-            ]
-            # Require at least 3 samples over 5 minutes to trigger alert
-            if len(critical_temp_samples) >= 3:
-                status = 'CRITICAL'
-                reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_CRITICAL}°C sustained >5min'
-            elif len(warning_temp_samples) >= 3:
-                status = 'WARNING'
-                reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_WARNING}°C sustained >5min'
-            else:
-                status = 'OK'
-                reason = None
+            # Check if temperature >80°C for more than 3 minutes (180 seconds)
+            high_temp_samples = [
+                entry for entry in self.state_history[state_key]
+                if entry['value'] > 80 and current_time - entry['time'] <= 180
+            ]
+            # Check if temperature ≤80°C for last 30 seconds (recovery)
+            recovery_samples = [
+                entry for entry in self.state_history[state_key]
+                if entry['value'] <= 80 and current_time - entry['time'] <= 30
+            ]
+            # Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
+            if len(high_temp_samples) >= 18:
+                # Temperature has been >80°C for >3 minutes
+                status = 'WARNING'
+                reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
+                # Record non-dismissable error
+                health_persistence.record_error(
+                    error_key='cpu_temp_high',
+                    category='temperature',
+                    severity='WARNING',
+                    reason=reason,
+                    details={'temperature': max_temp, 'dismissable': False}
+                )
+            elif len(recovery_samples) >= 3:
+                # Temperature has been ≤80°C for 30 seconds - clear the error
+                status = 'OK'
+                reason = None
+                health_persistence.resolve_error('cpu_temp_high', 'Temperature recovered')
+            else:
+                # Temperature is elevated but not long enough, or recovering but not yet cleared
+                # Check if we already have an active error
+                if health_persistence.is_error_active('cpu_temp_high', category='temperature'):
+                    # Keep the warning active
+                    status = 'WARNING'
+                    reason = f'CPU temperature {max_temp}°C still elevated'
+                else:
+                    # No active warning yet
+                    status = 'OK'
+                    reason = None

             temp_result = {
                 'status': status,
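
The sample thresholds follow from the 10-second check cadence set above: 18 samples × 10 s ≈ 3 minutes continuously above 80 °C before the WARNING fires, and 3 samples × 10 s ≈ 30 s at or below 80 °C before it clears. A minimal standalone sketch of that sustained-threshold check (function and variable names here are illustrative, not part of the commit):

import time

def sustained_over(history, threshold, window_s, min_samples, now=None):
    """Return True if at least `min_samples` readings taken within the last
    `window_s` seconds exceeded `threshold`. `history` is a list of dicts
    with 'value' and 'time' keys, mirroring the state_history entries above."""
    now = now if now is not None else time.time()
    recent = [e for e in history if now - e['time'] <= window_s]
    return sum(1 for e in recent if e['value'] > threshold) >= min_samples

# Example: one 85 °C reading every 10 s for the last 3 minutes
samples = [{'value': 85, 'time': time.time() - i * 10} for i in range(18)]
print(sustained_over(samples, 80, 180, 18))  # True -> WARNING condition met
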
@@ -829,15 +855,44 @@ class HealthMonitor:
                 # Report based on recent error count
                 if error_count >= 3:
+                    error_key = f'disk_{disk}'
+                    severity = 'CRITICAL'
+                    reason = f'{error_count} I/O errors in 5 minutes'
+                    health_persistence.record_error(
+                        error_key=error_key,
+                        category='disks',
+                        severity=severity,
+                        reason=reason,
+                        details={'disk': disk, 'error_count': error_count, 'dismissable': True}
+                    )
                     disk_issues[f'/dev/{disk}'] = {
-                        'status': 'CRITICAL',
-                        'reason': f'{error_count} I/O errors in 5 minutes'
+                        'status': severity,
+                        'reason': reason,
+                        'dismissable': True
                     }
                 elif error_count >= 1:
+                    error_key = f'disk_{disk}'
+                    severity = 'WARNING'
+                    reason = f'{error_count} I/O error(s) in 5 minutes'
+                    health_persistence.record_error(
+                        error_key=error_key,
+                        category='disks',
+                        severity=severity,
+                        reason=reason,
+                        details={'disk': disk, 'error_count': error_count, 'dismissable': True}
+                    )
                     disk_issues[f'/dev/{disk}'] = {
-                        'status': 'WARNING',
-                        'reason': f'{error_count} I/O error(s) in 5 minutes'
+                        'status': severity,
+                        'reason': reason,
+                        'dismissable': True
                     }
+                else:
+                    error_key = f'disk_{disk}'
+                    health_persistence.resolve_error(error_key, 'Disk errors cleared')

             if not disk_issues:
                 return {'status': 'OK'}
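
The disk, network, temperature and log checks all funnel through the same health_persistence calls (record_error, resolve_error, is_error_active, get_active_errors). A minimal in-memory stand-in with those four methods can be handy for unit-testing this module; the signatures below are inferred from the calls in this diff and may not match the real module exactly:

from typing import Any, Dict, List, Optional

class FakeHealthPersistence:
    """In-memory stand-in mirroring only the calls used in health_monitor.py."""

    def __init__(self) -> None:
        self.errors: Dict[str, Dict[str, Any]] = {}

    def record_error(self, error_key: str, category: str, severity: str,
                     reason: str, details: Optional[Dict[str, Any]] = None) -> None:
        # Upsert: repeated reports of the same key overwrite the previous entry
        self.errors[error_key] = {'category': category, 'severity': severity,
                                  'reason': reason, 'details': details or {}}

    def resolve_error(self, error_key: str, note: str = '') -> None:
        # Resolving an unknown key is a no-op, matching how the checks call it
        self.errors.pop(error_key, None)

    def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
        err = self.errors.get(error_key)
        return bool(err and (category is None or err['category'] == category))

    def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
        return [dict(err, error_key=key) for key, err in self.errors.items()
                if category is None or err['category'] == category]
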
@@ -851,7 +906,6 @@ class HealthMonitor:
             }

         except Exception:
-            # If dmesg check fails, return OK as it's not a critical system failure
             return {'status': 'OK'}

     def _check_network_optimized(self) -> Dict[str, Any]:
@@ -865,6 +919,8 @@ class HealthMonitor:
             net_if_stats = psutil.net_if_stats()

+            active_interfaces = set()
+
             for interface, stats in net_if_stats.items():
                 if interface == 'lo':
                     continue
@@ -874,10 +930,25 @@ class HealthMonitor:
                    # Consider common PVE bridge interfaces and physical NICs as important
                    if interface.startswith('vmbr') or interface.startswith('eth') or interface.startswith('ens') or interface.startswith('enp'):
                        issues.append(f'{interface} is DOWN')
+                        error_key = interface
+                        health_persistence.record_error(
+                            error_key=error_key,
+                            category='network',
+                            severity='CRITICAL',
+                            reason='Interface DOWN',
+                            details={'interface': interface, 'dismissable': True}
+                        )
                        interface_details[interface] = {
                            'status': 'CRITICAL',
-                            'reason': 'Interface DOWN'
+                            'reason': 'Interface DOWN',
+                            'dismissable': True
                        }
+                else:
+                    active_interfaces.add(interface)
+                    if interface.startswith('vmbr') or interface.startswith('eth') or interface.startswith('ens') or interface.startswith('enp'):
+                        health_persistence.resolve_error(interface, 'Interface recovered')

             # Check connectivity (latency)
             latency_status = self._check_network_latency()
@@ -1307,9 +1378,19 @@ class HealthMonitor:
         # Check persistent log errors recorded by health_persistence
         persistent_errors = health_persistence.get_active_errors('logs')
         if persistent_errors:
+            # Find the highest severity among persistent errors to set overall status
+            max_severity = 'OK'
+            reasons = []
+            for error in persistent_errors:
+                if error['severity'] == 'CRITICAL':
+                    max_severity = 'CRITICAL'
+                elif error['severity'] == 'WARNING' and max_severity != 'CRITICAL':
+                    max_severity = 'WARNING'
+                reasons.append(error['reason'])
             return {
-                'status': 'WARNING',  # Or CRITICAL depending on severity of persistent errors
-                'reason': f'{len(persistent_errors)} persistent log issues detected'
+                'status': max_severity,
+                'reason': '; '.join(reasons[:3])  # Show up to 3 persistent reasons
             }

         return self.cached_results.get(cache_key, {'status': 'OK'})
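
The loop above amounts to taking the maximum severity under the ordering OK < WARNING < CRITICAL. A compact form of the same aggregation, shown only for illustration (names are not part of the commit):

SEVERITY_RANK = {'OK': 0, 'WARNING': 1, 'CRITICAL': 2}

def aggregate_log_errors(errors):
    """errors: list of dicts with 'severity' and 'reason' keys."""
    status = max((e['severity'] for e in errors),
                 key=lambda s: SEVERITY_RANK.get(s, 0), default='OK')
    reasons = [e['reason'] for e in errors]
    return {'status': status, 'reason': '; '.join(reasons[:3])}
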
@@ -1356,30 +1437,38 @@ class HealthMonitor:
                 pattern = self._normalize_log_pattern(line)

                 if severity == 'CRITICAL':
-                    # If this critical pattern is new or we haven't logged it recently
-                    error_key = f'log_critical_{abs(hash(pattern)) % 10000}'
+                    pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
+                    error_key = f'log_critical_{pattern_hash}'
                     if pattern not in critical_errors_found:
                         critical_errors_found[pattern] = line
-                        # Record persistent error if it's not already active and within recent persistence
+                        # Record persistent error if it's not already active
                         if not health_persistence.is_error_active(error_key, category='logs'):
                             health_persistence.record_error(
                                 error_key=error_key,
                                 category='logs',
                                 severity='CRITICAL',
                                 reason=line[:100],  # Truncate reason for brevity
-                                details={'pattern': pattern}
+                                details={'pattern': pattern, 'dismissable': True}
                             )

                 recent_patterns[pattern] += 1

             for line in previous_lines:
-                if not line.strip() or self._is_benign_error(line):
+                if not line.strip():
                     continue

+                # Skip benign errors
+                if self._is_benign_error(line):
+                    continue
+
+                # Classify severity
                 severity = self._classify_log_severity(line)
-                if severity is None:
+                if severity is None:  # Skip informational or classified benign lines
                     continue

+                # Normalize to a pattern for grouping
                 pattern = self._normalize_log_pattern(line)
                 previous_patterns[pattern] += 1
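
Switching the error key from abs(hash(pattern)) % 10000 to an MD5 prefix matters because Python's built-in hash() for strings is salted per process (hash randomization), so the same log pattern would get a different key after every restart and could be re-recorded instead of matching its existing entry; hashlib.md5 is deterministic across runs and hosts. A quick illustration of the stable key derivation (the example pattern string is made up):

import hashlib

pattern = 'disk I/O error on /dev/sdX'
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
error_key = f'log_critical_{pattern_hash}'
print(error_key)  # 'log_critical_' + first 8 hex chars of the MD5 digest,
                  # identical on every run, unlike abs(hash(pattern)) % 10000
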
@@ -1500,7 +1589,7 @@ class HealthMonitor:
             lines = result.stdout.strip().split('\n')

             for line in lines:
-                # 'Inst' indicates a package will be installed/upgraded
+                # 'Inst ' indicates a package will be installed/upgraded
                 if line.startswith('Inst '):
                     update_count += 1
                     line_lower = line.lower()
@@ -2071,5 +2160,11 @@ class HealthMonitor:
         }
+            .now().isoformat()
+        }

 # Global instance
 health_monitor = HealthMonitor()