Update AppImage

MacRimi
2025-11-09 17:28:20 +01:00
parent 27353e160f
commit a0635a1026
5 changed files with 656 additions and 51 deletions


@@ -17,10 +17,12 @@ from datetime import datetime, timedelta
from collections import defaultdict
import re
from health_persistence import health_persistence
class HealthMonitor:
"""
Monitors system health across multiple components with minimal impact.
Implements hysteresis, intelligent caching, and progressive escalation.
Implements hysteresis, intelligent caching, progressive escalation, and persistent error tracking.
Always returns all 10 health categories.
"""
@@ -28,8 +30,8 @@ class HealthMonitor:
CPU_WARNING = 85
CPU_CRITICAL = 95
CPU_RECOVERY = 75
CPU_WARNING_DURATION = 60
CPU_CRITICAL_DURATION = 120
CPU_WARNING_DURATION = 300 # 5 minutes sustained
CPU_CRITICAL_DURATION = 300 # 5 minutes sustained
CPU_RECOVERY_DURATION = 120
# Memory Thresholds
@@ -85,6 +87,11 @@ class HealthMonitor:
self.io_error_history = defaultdict(list)
self.failed_vm_history = set() # Track VMs that failed to start
try:
health_persistence.cleanup_old_errors()
except Exception as e:
print(f"[HealthMonitor] Cleanup warning: {e}")
def get_system_info(self) -> Dict[str, Any]:
"""
Get lightweight system info for header display.
@@ -188,7 +195,11 @@ class HealthMonitor:
"""
Get comprehensive health status with all checks.
Returns JSON structure with ALL 10 categories always present.
Now includes persistent error tracking.
"""
active_errors = health_persistence.get_active_errors()
persistent_issues = {err['error_key']: err for err in active_errors}
details = {
'cpu': {'status': 'OK'},
'memory': {'status': 'OK'},
@@ -231,8 +242,8 @@ class HealthMonitor:
elif disks_status.get('status') == 'WARNING':
warning_issues.append(disks_status.get('reason', 'Disk issue'))
# Priority 4: VMs/CTs - now detects qmp errors from logs
vms_status = self._check_vms_cts_optimized()
# Priority 4: VMs/CTs - now with persistence
vms_status = self._check_vms_cts_with_persistence()
if vms_status:
details['vms'] = vms_status
if vms_status.get('status') == 'CRITICAL':
@@ -265,8 +276,8 @@ class HealthMonitor:
elif memory_status.get('status') == 'WARNING':
warning_issues.append(memory_status.get('reason', 'Memory high'))
# Priority 8: Logs
logs_status = self._check_logs_lightweight()
# Priority 8: Logs - now with persistence
logs_status = self._check_logs_with_persistence()
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
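Each check in this and the surrounding hunks feeds critical_issues or warning_issues and stores its result under its category; the overall status is then the worst severity seen. A compact, illustrative reduction of that pattern (not code from the commit):

# Illustrative only: reduce per-category results to one overall status,
# mirroring the critical_issues / warning_issues pattern in these hunks.
def summarize(details: dict) -> dict:
    critical = [c['reason'] for c in details.values()
                if c.get('status') == 'CRITICAL' and c.get('reason')]
    warning = [c['reason'] for c in details.values()
               if c.get('status') == 'WARNING' and c.get('reason')]
    if critical:
        return {'status': 'CRITICAL', 'reason': '; '.join(critical[:3])}
    if warning:
        return {'status': 'WARNING', 'reason': '; '.join(warning[:3])}
    return {'status': 'OK'}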
@@ -305,7 +316,7 @@ class HealthMonitor:
}
def _check_cpu_with_hysteresis(self) -> Dict[str, Any]:
"""Check CPU with hysteresis to avoid flapping alerts"""
"""Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage"""
try:
cpu_percent = psutil.cpu_percent(interval=1)
current_time = time.time()
@@ -318,33 +329,33 @@ class HealthMonitor:
self.state_history[state_key] = [
entry for entry in self.state_history[state_key]
if current_time - entry['time'] < 300
if current_time - entry['time'] < 360
]
critical_duration = sum(
1 for entry in self.state_history[state_key]
critical_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] >= self.CPU_CRITICAL and
current_time - entry['time'] <= self.CPU_CRITICAL_DURATION
)
]
warning_duration = sum(
1 for entry in self.state_history[state_key]
warning_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] >= self.CPU_WARNING and
current_time - entry['time'] <= self.CPU_WARNING_DURATION
)
]
recovery_duration = sum(
1 for entry in self.state_history[state_key]
recovery_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] < self.CPU_RECOVERY and
current_time - entry['time'] <= self.CPU_RECOVERY_DURATION
)
]
if critical_duration >= 2:
if len(critical_samples) >= 3:
status = 'CRITICAL'
reason = f'CPU >{self.CPU_CRITICAL}% for {self.CPU_CRITICAL_DURATION}s'
elif warning_duration >= 2 and recovery_duration < 2:
reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
status = 'WARNING'
reason = f'CPU >{self.CPU_WARNING}% for {self.CPU_WARNING_DURATION}s'
reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
else:
status = 'OK'
reason = None
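One way to see what the switch from two samples in short windows to three samples in 300-second windows does is to replay synthetic readings through the same pruning and counting rules. The one-minute cadence below is an assumption; the class does not fix how often the check runs:

# Replay of the hysteresis rules above with synthetic samples.
# Assumes the check is called roughly once a minute.
import time

CPU_CRITICAL = 95
WINDOW, CRITICAL_DURATION = 360, 300

history = []
now = time.time()
readings = [96, 97, 96, 98, 97, 96]           # sustained load above CPU_CRITICAL

for i, value in enumerate(readings):
    t = now + i * 60                          # assumed 60 s between checks
    history.append({'time': t, 'value': value})
    history = [e for e in history if t - e['time'] < WINDOW]
    critical = [e for e in history
                if e['value'] >= CPU_CRITICAL and t - e['time'] <= CRITICAL_DURATION]
    status = 'CRITICAL' if len(critical) >= 3 else 'OK'
    print(f"minute {i}: cpu={value}% -> {status}")

With this assumed cadence the alert fires on the third consecutive high sample, i.e. after roughly two to three minutes of sustained load inside the five-minute window; a single spike never trips it.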
@@ -871,15 +882,15 @@ class HealthMonitor:
def _check_vms_cts_optimized(self) -> Dict[str, Any]:
"""
Optimized VM/CT check - detects qmp failures and other VM errors.
Now parses logs for VM/CT specific errors like qmp command failures.
Optimized VM/CT check - detects qmp failures and startup errors from logs.
Improved detection of container and VM errors from journalctl.
"""
try:
issues = []
vm_details = {}
result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*', '-p', 'warning'],
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
@@ -903,22 +914,56 @@ class HealthMonitor:
}
continue
ct_match = re.search(r'(?:ct|container)\s+(\d+)', line_lower)
if ct_match and ('error' in line_lower or 'fail' in line_lower):
ctid = ct_match.group(1)
ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower):
ctid = ct_error_match.group(1)
key = f'ct_{ctid}'
if key not in vm_details:
issues.append(f'CT {ctid}: Error detected')
if 'device' in line_lower and 'does not exist' in line_lower:
device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
if device_match:
reason = f'Device {device_match.group(1)} missing'
else:
reason = 'Device error'
elif 'failed to start' in line_lower:
reason = 'Failed to start'
else:
reason = 'Container error'
issues.append(f'CT {ctid}: {reason}')
vm_details[key] = {
'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL',
'reason': reason,
'id': ctid,
'type': 'CT'
}
continue
vzstart_match = re.search(r'vzstart:(\d+):', line)
if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
ctid = vzstart_match.group(1)
key = f'ct_{ctid}'
if key not in vm_details:
# Extract error message
if 'device' in line_lower and 'does not exist' in line_lower:
device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
if device_match:
reason = f'Device {device_match.group(1)} missing'
else:
reason = 'Device error'
else:
reason = 'Startup error'
issues.append(f'CT {ctid}: {reason}')
vm_details[key] = {
'status': 'WARNING',
'reason': 'Container error',
'reason': reason,
'id': ctid,
'type': 'CT'
}
continue
if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
# Extract VM/CT ID
id_match = re.search(r'\b(\d{3,4})\b', line)
if id_match:
vmid = id_match.group(1)
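The new container patterns are easiest to verify against sample journal lines. The lines below are made up (shaped like typical pve/lxc output, not captured logs) and only exercise the regexes introduced in this hunk:

# Made-up journal lines to exercise the container patterns from this hunk.
import re

samples = [
    "nov 09 17:20:01 pve vzstart:105: device /dev/dri/card0 does not exist",
    "nov 09 17:21:14 pve lxc 203 failed to start",
]

for line in samples:
    low = line.lower()
    ct = re.search(r'(?:ct|container|lxc)\s+(\d+)', low)
    vz = re.search(r'vzstart:(\d+):', line)
    dev = re.search(r'device\s+([/\w\d]+)\s+does not exist', low)
    print(ct.group(1) if ct else None,
          vz.group(1) if vz else None,
          dev.group(1) if dev else None)

The first line hits the vzstart and device patterns (CT 105, /dev/dri/card0); the second hits the ct/container/lxc pattern (CT 203).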
@@ -946,6 +991,118 @@ class HealthMonitor:
except Exception:
return {'status': 'OK'}
# Modified to use persistence
def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
"""
Check VMs/CTs with persistent error tracking.
Errors persist until VM starts or 48h elapsed.
"""
try:
issues = []
vm_details = {}
# Get persistent errors first
persistent_errors = health_persistence.get_active_errors('vms')
# Check if any persistent VMs/CTs have started
for error in persistent_errors:
error_key = error['error_key']
if error_key.startswith('vm_') or error_key.startswith('ct_'):
vm_id = error_key.split('_')[1]
if health_persistence.check_vm_running(vm_id):
continue # Error auto-resolved
# Still active
vm_details[error_key] = {
'status': error['severity'],
'reason': error['reason'],
'id': error.get('details', {}).get('id', 'unknown'),
'type': error.get('details', {}).get('type', 'VM/CT'),
'first_seen': error['first_seen']
}
issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}")
# Check for new errors in logs
result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
)
if result.returncode == 0:
for line in result.stdout.split('\n'):
line_lower = line.lower()
# VM QMP errors
vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
if vm_qmp_match:
vmid = vm_qmp_match.group(1)
error_key = f'vm_{vmid}'
if error_key not in vm_details:
# Record persistent error
health_persistence.record_error(
error_key=error_key,
category='vms',
severity='WARNING',
reason='QMP command timeout',
details={'id': vmid, 'type': 'VM'}
)
issues.append(f'VM {vmid}: Communication issue')
vm_details[error_key] = {
'status': 'WARNING',
'reason': 'QMP command timeout',
'id': vmid,
'type': 'VM'
}
continue
# Container errors
vzstart_match = re.search(r'vzstart:(\d+):', line)
if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
ctid = vzstart_match.group(1)
error_key = f'ct_{ctid}'
if error_key not in vm_details:
if 'device' in line_lower and 'does not exist' in line_lower:
device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
if device_match:
reason = f'Device {device_match.group(1)} missing'
else:
reason = 'Device error'
else:
reason = 'Startup error'
# Record persistent error
health_persistence.record_error(
error_key=error_key,
category='vms',
severity='WARNING',
reason=reason,
details={'id': ctid, 'type': 'CT'}
)
issues.append(f'CT {ctid}: {reason}')
vm_details[error_key] = {
'status': 'WARNING',
'reason': reason,
'id': ctid,
'type': 'CT'
}
if not issues:
return {'status': 'OK'}
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
return {
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': vm_details
}
except Exception:
return {'status': 'OK'}
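Combined with the persistence sketch shown earlier, the lifecycle of a VM/CT error under _check_vms_cts_with_persistence is roughly: record once per error_key, keep re-reporting from the store on every pass, and stop once the guest is running again or the retention window expires. A hedged walkthrough against that sketch (not the commit's own module, which imports a health_persistence object):

# Hedged walkthrough against the earlier flat-module persistence sketch.
import health_persistence as hp

# 1. A vzstart failure parsed from journalctl is recorded once per error_key.
hp.record_error(error_key='ct_105', category='vms', severity='WARNING',
                reason='Device /dev/dri/card0 missing',
                details={'id': '105', 'type': 'CT'})

# 2. Every later health pass re-reports it from the store, even after the
#    journal line has rotated out of the 10-minute scan window.
for err in hp.get_active_errors('vms'):
    print(err['error_key'], err['reason'])

# 3. Once the guest reports running (or the ~48h retention elapses and
#    cleanup runs), the entry stops being surfaced by the check.
if hp.check_vm_running('105'):
    pass  # the check simply skips entries whose guest is running again
hp.cleanup_old_errors(max_age_hours=48)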
def _check_pve_services(self) -> Dict[str, Any]:
"""Check critical Proxmox services"""
try:
@@ -980,13 +1137,24 @@ class HealthMonitor:
'reason': f'Service check failed: {str(e)}'
}
def _check_logs_lightweight(self) -> Dict[str, Any]:
"""Lightweight log analysis (cached, checked every 5 minutes)"""
# Modified to use persistence
def _check_logs_with_persistence(self) -> Dict[str, Any]:
"""
Check logs with persistent error tracking.
Critical log errors persist for 24h unless acknowledged.
"""
cache_key = 'logs_analysis'
current_time = time.time()
if cache_key in self.last_check_times:
if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
# Return persistent errors if any
persistent_errors = health_persistence.get_active_errors('logs')
if persistent_errors:
return {
'status': 'WARNING',
'reason': f'{len(persistent_errors)} persistent log issues'
}
return self.cached_results.get(cache_key, {'status': 'OK'})
try:
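The log check keeps the interval cache it already used (last_check_times plus cached_results, refreshed every LOG_CHECK_INTERVAL seconds) and only layers the persistence lookup on top of the cached path, so acknowledged-but-unresolved issues keep surfacing between scans. The general shape of that cache, lifted out of the class for clarity (illustrative, not the commit's code):

# Generic form of the interval cache used by the log check (illustrative).
import time

class IntervalCache:
    def __init__(self, interval_seconds):
        self.interval = interval_seconds
        self.last_check_times = {}
        self.cached_results = {}

    def get(self, key):
        """Return the cached value if it is still fresh, else None."""
        last = self.last_check_times.get(key)
        if last is not None and time.time() - last < self.interval:
            return self.cached_results.get(key)
        return None

    def put(self, key, value):
        self.last_check_times[key] = time.time()
        self.cached_results[key] = value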
@@ -1011,6 +1179,16 @@ class HealthMonitor:
if keyword.lower() in line_lower:
critical_keywords_found.append(keyword)
errors_5m += 1
# Record persistent error for critical keywords
error_key = f'log_critical_{keyword.replace(" ", "_")}'
health_persistence.record_error(
error_key=error_key,
category='logs',
severity='CRITICAL',
reason=f'Critical log: {keyword}',
details={'keyword': keyword}
)
break
else:
if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:
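Because the persistent error_key is derived from the keyword itself, repeated matches on the same keyword collapse into a single stored entry (assuming record_error upserts on error_key, as in the sketch above). The keywords below are placeholders; the actual critical-keyword list sits outside this hunk:

# Placeholder keywords; duplicates map to the same error_key and thus one entry.
for keyword in ('kernel panic', 'out of memory', 'kernel panic'):
    error_key = f'log_critical_{keyword.replace(" ", "_")}'
    print(error_key)
# log_critical_kernel_panic
# log_critical_out_of_memory
# log_critical_kernel_panic   (same key as the first)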