Update AppImage

2026-02-21 01:46:35 +00:00 · 2025-11-09 21:20:39 +01:00
parent 5b5f325a4e
commit 588af3613b
2 changed files with 118 additions and 70 deletions
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -298,6 +298,7 @@ class HealthMonitor:
        # Priority 8: Logs - now with persistence
        logs_status = self._check_logs_with_persistence()
        if logs_status:
            details['logs'] = logs_status
            if logs_status.get('status') == 'CRITICAL':
                critical_issues.append(logs_status.get('reason', 'Critical log errors'))
@@ -1221,14 +1222,19 @@ class HealthMonitor:
    def _check_logs_with_persistence(self) -> Dict[str, Any]:
        """
-        Check logs with intelligent classification and persistent tracking.
+        Intelligent log checking with cascade detection.
-        - Whitelists benign Proxmox warnings
+        Only alerts when there's a real problem (error cascade), not normal background warnings.
-        - Only counts truly unique error types
+        
-        - Persists critical errors for 24h
+        Logic:
        - Looks at last 3 minutes (not 10) for immediate issues
        - Detects cascades: ≥5 errors of same type in 3 min = problem
        - Compares to previous period to detect spikes
        - Whitelists known benign Proxmox warnings
        """
        cache_key = 'logs_analysis'
        current_time = time.time()
        # Cache for 5 minutes
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
                persistent_errors = health_persistence.get_active_errors('logs')
@@ -1240,37 +1246,44 @@ class HealthMonitor:
                return self.cached_results.get(cache_key, {'status': 'OK'})
        try:
-            result = subprocess.run(
+            result_recent = subprocess.run(
-                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
+                ['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=3
            )
-            if result.returncode == 0:
+            result_previous = subprocess.run(
-                lines = result.stdout.strip().split('\n')
+                ['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=3
            )
-                critical_errors = {}  # pattern -> first line
+            if result_recent.returncode == 0:
-                warning_errors = {}   # pattern -> first line
+                recent_lines = result_recent.stdout.strip().split('\n')
                previous_lines = result_previous.stdout.strip().split('\n') if result_previous.returncode == 0 else []
-                for line in lines:
+                recent_patterns = defaultdict(int)
                previous_patterns = defaultdict(int)
                critical_errors = {}
                for line in recent_lines:
                    if not line.strip():
                        continue
                    # Skip benign errors
                    if self._is_benign_error(line):
                        continue
                    # Classify severity
                    severity = self._classify_log_severity(line)
                    if severity is None:
-                        continue  # Benign, skip
+                        continue
-                    # Normalize to pattern for grouping
+                    # Normalize to pattern
-                    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)  # Remove dates
+                    pattern = self._normalize_log_pattern(line)
                    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
                    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
                    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
                    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
                    pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex addresses
                    pattern = pattern[:150]  # Keep first 150 chars as pattern
                    if severity == 'CRITICAL':
                        if pattern not in critical_errors:
@@ -1285,20 +1298,47 @@ class HealthMonitor:
                                reason=line[:100],
                                details={'pattern': pattern}
                            )
-                    elif severity == 'WARNING':
+                    
-                        if pattern not in warning_errors:
+                    recent_patterns[pattern] += 1
-                            warning_errors[pattern] = line
+                
                for line in previous_lines:
                    if not line.strip() or self._is_benign_error(line):
                        continue
                    severity = self._classify_log_severity(line)
                    if severity is None:
                        continue
                    pattern = self._normalize_log_pattern(line)
                    previous_patterns[pattern] += 1
                cascading_errors = {
                    pattern: count for pattern, count in recent_patterns.items()
                    if count >= 5 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL']
                }
                spike_errors = {}
                for pattern, recent_count in recent_patterns.items():
                    prev_count = previous_patterns.get(pattern, 0)
                    # Spike if: ≥3 errors now AND ≥3x increase
                    if recent_count >= 3 and recent_count >= prev_count * 3:
                        spike_errors[pattern] = recent_count
                unique_critical = len(critical_errors)
-                unique_warnings = len(warning_errors)
+                cascade_count = len(cascading_errors)
                spike_count = len(spike_errors)
                if unique_critical > 0:
                    status = 'CRITICAL'
-                    reason = f'{unique_critical} critical error type(s) detected'
+                    reason = f'{unique_critical} critical error(s): cascade detected'
-                elif unique_warnings >= 10:
+                elif cascade_count > 0:
                    status = 'WARNING'
-                    reason = f'{unique_warnings} warning type(s) detected'
+                    reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥5 times in 3min'
                elif spike_count > 0:
                    status = 'WARNING'
                    reason = f'Error spike detected: {spike_count} pattern(s) increased 3x'
                else:
                    # Normal background warnings, no alert
                    status = 'OK'
                    reason = None
@@ -1318,6 +1358,21 @@ class HealthMonitor:
        except Exception:
            return {'status': 'OK'}
    def _normalize_log_pattern(self, line: str) -> str:
        """
        Normalize log line to a pattern for grouping similar errors.
        Removes timestamps, PIDs, IDs, paths, and other variables.
        """
        pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)  # Remove dates
        pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
        pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
        pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
        pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
        pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern)  # Normalize paths
        pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex
        pattern = re.sub(r'\s+', ' ', pattern).strip()  # Normalize whitespace
        return pattern[:150]  # Keep first 150 chars
    def _check_updates(self) -> Optional[Dict[str, Any]]:
        """
        Check for pending system updates with intelligence.
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -90,6 +90,25 @@ class HealthPersistence:
        now = datetime.now().isoformat()
        details_json = json.dumps(details) if details else None
        cursor.execute('''
            SELECT acknowledged, resolved_at 
            FROM errors 
            WHERE error_key = ? AND acknowledged = 1
        ''', (error_key,))
        ack_check = cursor.fetchone()
        if ack_check and ack_check[1]:  # Has resolved_at timestamp
            try:
                resolved_dt = datetime.fromisoformat(ack_check[1])
                hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
                if hours_since_ack < 24:
                    # Skip re-adding recently acknowledged errors (within 24h)
                    conn.close()
                    return {'type': 'skipped_acknowledged', 'needs_notification': False}
            except Exception:
                pass
        cursor.execute('''
            SELECT id, first_seen, notification_sent, acknowledged, resolved_at 
            FROM errors WHERE error_key = ?
@@ -101,52 +120,26 @@ class HealthPersistence:
        if existing:
            error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
-            if acknowledged == 1 and resolved_at is not None:
+            if acknowledged == 1:
                # Check if acknowledged recently (within 24h)
                try:
                    resolved_dt = datetime.fromisoformat(resolved_at)
                    hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
                    if hours_since_ack < 24:
                        # Skip re-adding recently acknowledged errors
                conn.close()
                return {'type': 'skipped_acknowledged', 'needs_notification': False}
                except Exception:
                    pass
-            # Update existing error (only if not acknowledged or >24h passed)
+            # Update existing error (only if NOT acknowledged)
            cursor.execute('''
                UPDATE errors 
-                SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL, acknowledged = 0
+                SET last_seen = ?, severity = ?, reason = ?, details = ?
-                WHERE error_key = ?
+                WHERE error_key = ? AND acknowledged = 0
            ''', (now, severity, reason, details_json, error_key))
            # Check if severity escalated
            cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
-            old_severity = cursor.fetchone()[0]
+            old_severity_row = cursor.fetchone()
            if old_severity_row:
                old_severity = old_severity_row[0]
                if old_severity == 'WARNING' and severity == 'CRITICAL':
                    event_info['type'] = 'escalated'
                    event_info['needs_notification'] = True
        else:
            cursor.execute('''
                SELECT resolved_at, acknowledged FROM errors 
                WHERE error_key = ? AND acknowledged = 1
                ORDER BY resolved_at DESC LIMIT 1
            ''', (error_key,))
            recent_ack = cursor.fetchone()
            if recent_ack and recent_ack[0]:
                try:
                    resolved_dt = datetime.fromisoformat(recent_ack[0])
                    hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
                    if hours_since_ack < 24:
                        # Don't re-add recently acknowledged errors
                        conn.close()
                        return {'type': 'skipped_acknowledged', 'needs_notification': False}
                except Exception:
                    pass
            # Insert new error
            cursor.execute('''
                INSERT INTO errors