Update AppImage
@@ -298,11 +298,12 @@ class HealthMonitor:
         # Priority 8: Logs - now with persistence
         logs_status = self._check_logs_with_persistence()
-        details['logs'] = logs_status
-        if logs_status.get('status') == 'CRITICAL':
-            critical_issues.append(logs_status.get('reason', 'Critical log errors'))
-        elif logs_status.get('status') == 'WARNING':
-            warning_issues.append(logs_status.get('reason', 'Log warnings'))
+        if logs_status:
+            details['logs'] = logs_status
+            if logs_status.get('status') == 'CRITICAL':
+                critical_issues.append(logs_status.get('reason', 'Critical log errors'))
+            elif logs_status.get('status') == 'WARNING':
+                warning_issues.append(logs_status.get('reason', 'Log warnings'))
 
         # Priority 9: Updates
         updates_status = self._check_updates()
@@ -1221,14 +1222,19 @@ class HealthMonitor:
 
     def _check_logs_with_persistence(self) -> Dict[str, Any]:
         """
-        Check logs with intelligent classification and persistent tracking.
-        - Whitelists benign Proxmox warnings
-        - Only counts truly unique error types
-        - Persists critical errors for 24h
+        Intelligent log checking with cascade detection.
+        Only alerts when there's a real problem (error cascade), not normal background warnings.
+
+        Logic:
+        - Looks at last 3 minutes (not 10) for immediate issues
+        - Detects cascades: ≥5 errors of same type in 3 min = problem
+        - Compares to previous period to detect spikes
+        - Whitelists known benign Proxmox warnings
         """
         cache_key = 'logs_analysis'
         current_time = time.time()
 
         # Cache for 5 minutes
         if cache_key in self.last_check_times:
             if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
                 persistent_errors = health_persistence.get_active_errors('logs')
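Illustration (not part of the commit): the cascade rule described in the new docstring boils down to counting how often each normalized pattern occurs in the 3-minute window and flagging any pattern that repeats at least 5 times. A minimal sketch with invented, already-normalized patterns:

from collections import defaultdict

recent_patterns = defaultdict(int)
sample_window = ['kernel: i/o error, dev /dev/XXX'] * 7 + ['pvestatd: storage ID not online'] * 2
for pattern in sample_window:  # in the real code these come from journalctl lines
    recent_patterns[pattern] += 1

cascading = {p: c for p, c in recent_patterns.items() if c >= 5}
print(cascading)  # {'kernel: i/o error, dev /dev/XXX': 7} -> reported as an error cascade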
@@ -1240,37 +1246,44 @@ class HealthMonitor:
                 return self.cached_results.get(cache_key, {'status': 'OK'})
 
         try:
-            result = subprocess.run(
-                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
+            result_recent = subprocess.run(
+                ['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
                 capture_output=True,
                 text=True,
                 timeout=3
             )
 
-            if result.returncode == 0:
-                lines = result.stdout.strip().split('\n')
+            result_previous = subprocess.run(
+                ['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
+                capture_output=True,
+                text=True,
+                timeout=3
+            )
+
+            if result_recent.returncode == 0:
+                recent_lines = result_recent.stdout.strip().split('\n')
+                previous_lines = result_previous.stdout.strip().split('\n') if result_previous.returncode == 0 else []
 
-                critical_errors = {}  # pattern -> first line
-                warning_errors = {}  # pattern -> first line
+                recent_patterns = defaultdict(int)
+                previous_patterns = defaultdict(int)
+                critical_errors = {}
 
-                for line in lines:
+                for line in recent_lines:
                     if not line.strip():
                         continue
 
                     # Skip benign errors
                     if self._is_benign_error(line):
                         continue
 
                     # Classify severity
                     severity = self._classify_log_severity(line)
 
                     if severity is None:
-                        continue  # Benign, skip
+                        continue
 
-                    # Normalize to pattern for grouping
-                    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)  # Remove dates
-                    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
-                    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
-                    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
-                    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
-                    pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex addresses
-                    pattern = pattern[:150]  # Keep first 150 chars as pattern
+                    # Normalize to pattern
+                    pattern = self._normalize_log_pattern(line)
 
                     if severity == 'CRITICAL':
                         if pattern not in critical_errors:
@@ -1285,20 +1298,47 @@ class HealthMonitor:
                                 reason=line[:100],
                                 details={'pattern': pattern}
                             )
                     elif severity == 'WARNING':
                         if pattern not in warning_errors:
                             warning_errors[pattern] = line
 
+                    recent_patterns[pattern] += 1
+
+                for line in previous_lines:
+                    if not line.strip() or self._is_benign_error(line):
+                        continue
+
+                    severity = self._classify_log_severity(line)
+                    if severity is None:
+                        continue
+
+                    pattern = self._normalize_log_pattern(line)
+                    previous_patterns[pattern] += 1
+
+                cascading_errors = {
+                    pattern: count for pattern, count in recent_patterns.items()
+                    if count >= 5 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL']
+                }
+
+                spike_errors = {}
+                for pattern, recent_count in recent_patterns.items():
+                    prev_count = previous_patterns.get(pattern, 0)
+                    # Spike if: ≥3 errors now AND ≥3x increase
+                    if recent_count >= 3 and recent_count >= prev_count * 3:
+                        spike_errors[pattern] = recent_count
+
                 unique_critical = len(critical_errors)
                 unique_warnings = len(warning_errors)
+                cascade_count = len(cascading_errors)
+                spike_count = len(spike_errors)
 
                 if unique_critical > 0:
                     status = 'CRITICAL'
-                    reason = f'{unique_critical} critical error type(s) detected'
-                elif unique_warnings >= 10:
+                    reason = f'{unique_critical} critical error(s): cascade detected'
+                elif cascade_count > 0:
                     status = 'WARNING'
-                    reason = f'{unique_warnings} warning type(s) detected'
+                    reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥5 times in 3min'
+                elif spike_count > 0:
+                    status = 'WARNING'
+                    reason = f'Error spike detected: {spike_count} pattern(s) increased 3x'
                 else:
+                    # Normal background warnings, no alert
                     status = 'OK'
                     reason = None
 
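Illustration (not part of the commit) of the spike branch above, with invented counts:

recent_count, prev_count = 4, 1
if recent_count >= 3 and recent_count >= prev_count * 3:
    print('spike')  # 4 >= 3 and 4 >= 3 * 1, so this pattern would trip the WARNING spike branch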
@@ -1318,6 +1358,21 @@ class HealthMonitor:
         except Exception:
             return {'status': 'OK'}
 
+    def _normalize_log_pattern(self, line: str) -> str:
+        """
+        Normalize log line to a pattern for grouping similar errors.
+        Removes timestamps, PIDs, IDs, paths, and other variables.
+        """
+        pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)  # Remove dates
+        pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
+        pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
+        pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
+        pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
+        pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern)  # Normalize paths
+        pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex
+        pattern = re.sub(r'\s+', ' ', pattern).strip()  # Normalize whitespace
+        return pattern[:150]  # Keep first 150 chars
+
     def _check_updates(self) -> Optional[Dict[str, Any]]:
         """
         Check for pending system updates with intelligence.
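A quick usage sketch of the new helper (not part of the commit). The sample journal lines are invented, and the function below is a standalone copy of the substitution chain above so the snippet runs on its own:

import re

def normalize(line: str) -> str:
    # Standalone copy of _normalize_log_pattern's substitutions, for illustration only.
    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)
    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)
    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())
    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)
    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)
    pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern)
    pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)
    pattern = re.sub(r'\s+', ' ', pattern).strip()
    return pattern[:150]

print(normalize("Nov 10 12:34:56 pve kernel: I/O error, dev sda, sector 12345"))
# -> 'nov 10 pve kernel: i/o error, dev sda, sector ID'
print(normalize("Nov 10 12:35:01 pve kernel: I/O error, dev sda, sector 67890"))
# -> same pattern: time and sector number are stripped, so both lines count
#    toward the same cascade bucket (the syslog-style date prefix is left as-is)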
@@ -90,6 +90,25 @@ class HealthPersistence:
         now = datetime.now().isoformat()
         details_json = json.dumps(details) if details else None
 
+        cursor.execute('''
+            SELECT acknowledged, resolved_at
+            FROM errors
+            WHERE error_key = ? AND acknowledged = 1
+        ''', (error_key,))
+        ack_check = cursor.fetchone()
+
+        if ack_check and ack_check[1]:  # Has resolved_at timestamp
+            try:
+                resolved_dt = datetime.fromisoformat(ack_check[1])
+                hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
+
+                if hours_since_ack < 24:
+                    # Skip re-adding recently acknowledged errors (within 24h)
+                    conn.close()
+                    return {'type': 'skipped_acknowledged', 'needs_notification': False}
+            except Exception:
+                pass
+
         cursor.execute('''
             SELECT id, first_seen, notification_sent, acknowledged, resolved_at
             FROM errors WHERE error_key = ?
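Illustration (not part of the commit): the re-add guard added above is just a comparison against a 24-hour window; the acknowledgement timestamp here is invented:

from datetime import datetime, timedelta

resolved_at = (datetime.now() - timedelta(hours=5)).isoformat()  # pretend the error was acknowledged 5h ago
hours_since_ack = (datetime.now() - datetime.fromisoformat(resolved_at)).total_seconds() / 3600
if hours_since_ack < 24:
    print('skip')  # same outcome as returning {'type': 'skipped_acknowledged', ...}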
@@ -101,52 +120,26 @@ class HealthPersistence:
         if existing:
             error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
 
-            if acknowledged == 1 and resolved_at is not None:
-                # Check if acknowledged recently (within 24h)
-                try:
-                    resolved_dt = datetime.fromisoformat(resolved_at)
-                    hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
-
-                    if hours_since_ack < 24:
-                        # Skip re-adding recently acknowledged errors
-                        conn.close()
-                        return {'type': 'skipped_acknowledged', 'needs_notification': False}
-                except Exception:
-                    pass
+            if acknowledged == 1:
+                conn.close()
+                return {'type': 'skipped_acknowledged', 'needs_notification': False}
 
-            # Update existing error (only if not acknowledged or >24h passed)
+            # Update existing error (only if NOT acknowledged)
             cursor.execute('''
                 UPDATE errors
-                SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL, acknowledged = 0
-                WHERE error_key = ?
+                SET last_seen = ?, severity = ?, reason = ?, details = ?
+                WHERE error_key = ? AND acknowledged = 0
             ''', (now, severity, reason, details_json, error_key))
 
             # Check if severity escalated
             cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
-            old_severity = cursor.fetchone()[0]
-            if old_severity == 'WARNING' and severity == 'CRITICAL':
-                event_info['type'] = 'escalated'
-                event_info['needs_notification'] = True
+            old_severity_row = cursor.fetchone()
+            if old_severity_row:
+                old_severity = old_severity_row[0]
+                if old_severity == 'WARNING' and severity == 'CRITICAL':
+                    event_info['type'] = 'escalated'
+                    event_info['needs_notification'] = True
         else:
-            cursor.execute('''
-                SELECT resolved_at, acknowledged FROM errors
-                WHERE error_key = ? AND acknowledged = 1
-                ORDER BY resolved_at DESC LIMIT 1
-            ''', (error_key,))
-            recent_ack = cursor.fetchone()
-
-            if recent_ack and recent_ack[0]:
-                try:
-                    resolved_dt = datetime.fromisoformat(recent_ack[0])
-                    hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
-
-                    if hours_since_ack < 24:
-                        # Don't re-add recently acknowledged errors
-                        conn.close()
-                        return {'type': 'skipped_acknowledged', 'needs_notification': False}
-                except Exception:
-                    pass
-
             # Insert new error
             cursor.execute('''
                 INSERT INTO errors