diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index b78729b..04d915c 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -1170,6 +1170,7 @@ class HealthMonitor:
         """
         Check logs with persistent error tracking.
         Critical log errors persist for 24h unless acknowledged.
+        Groups similar errors to avoid false high counts.
         """
         cache_key = 'logs_analysis'
         current_time = time.time()
@@ -1196,20 +1197,24 @@ class HealthMonitor:
 
             if result.returncode == 0:
                 lines = result.stdout.strip().split('\n')
-                errors_5m = 0
-                warnings_5m = 0
-                critical_keywords_found = []
+                error_patterns = {}  # pattern -> count
+                critical_keywords_found = set()
 
                 for line in lines:
+                    if not line.strip():
+                        continue
+
                     line_lower = line.lower()
 
+                    # Check for critical keywords first
+                    critical_found = False
                     for keyword in self.CRITICAL_LOG_KEYWORDS:
                         if keyword.lower() in line_lower:
-                            critical_keywords_found.append(keyword)
-                            errors_5m += 1
+                            critical_keywords_found.add(keyword)
+                            critical_found = True
                             # Record persistent error for critical keywords
-                            error_key = f'log_critical_{keyword.replace(" ", "_")}'
+                            error_key = f'log_critical_{keyword.replace(" ", "_").replace("/", "_")}'
                             health_persistence.record_error(
                                 error_key=error_key,
                                 category='logs',
@@ -1218,27 +1223,42 @@ class HealthMonitor:
                                 details={'keyword': keyword}
                             )
                             break
-                    else:
-                        if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:
-                            errors_5m += 1
-                        elif 'warning' in line_lower or 'warn' in line_lower:
-                            warnings_5m += 1
+
+                    if critical_found:
+                        continue
+
+                    # Remove timestamps, PIDs, and specific IDs to group similar errors
+                    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line_lower)  # Remove dates
+                    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
+                    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern)  # Normalize PIDs
+                    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
+                    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
+                    pattern = pattern[:100]  # Keep first 100 chars as pattern
+
+                    # Classify error level
+                    if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:
+                        error_patterns[f'error:{pattern}'] = error_patterns.get(f'error:{pattern}', 0) + 1
+                    elif 'warning' in line_lower or 'warn' in line_lower:
+                        error_patterns[f'warning:{pattern}'] = error_patterns.get(f'warning:{pattern}', 0) + 1
+
+                unique_errors = sum(1 for k in error_patterns.keys() if k.startswith('error:'))
+                unique_warnings = sum(1 for k in error_patterns.keys() if k.startswith('warning:'))
 
                 if critical_keywords_found:
                     status = 'CRITICAL'
-                    reason = f'Critical errors: {", ".join(set(critical_keywords_found[:3]))}'
-                elif errors_5m >= self.LOG_ERRORS_CRITICAL:
+                    reason = f'Critical errors: {", ".join(list(critical_keywords_found)[:3])}'
+                elif unique_errors >= self.LOG_ERRORS_CRITICAL:
                     status = 'CRITICAL'
-                    reason = f'{errors_5m} errors in 5 minutes'
-                elif warnings_5m >= self.LOG_WARNINGS_CRITICAL:
+                    reason = f'{unique_errors} unique errors in 5 minutes'
+                elif unique_warnings >= self.LOG_WARNINGS_CRITICAL:
                     status = 'WARNING'
-                    reason = f'{warnings_5m} warnings in 5 minutes'
-                elif errors_5m >= self.LOG_ERRORS_WARNING:
+                    reason = f'{unique_warnings} unique warnings in 5 minutes'
+                elif unique_errors >= self.LOG_ERRORS_WARNING:
                     status = 'WARNING'
-                    reason = f'{errors_5m} errors in 5 minutes'
-                elif warnings_5m >= self.LOG_WARNINGS_WARNING:
+                    reason = f'{unique_errors} unique errors in 5 minutes'
+                elif unique_warnings >= self.LOG_WARNINGS_WARNING:
                     status = 'WARNING'
-                    reason = f'{warnings_5m} warnings in 5 minutes'
+                    reason = f'{unique_warnings} unique warnings in 5 minutes'
                 else:
                     status = 'OK'
                     reason = None
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 3dfe7d0..14e79e8 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -101,18 +101,23 @@ class HealthPersistence:
             if existing:
                 error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
 
-                # If acknowledged within last 24 hours, do not re-add unless resolved and re-occurred
-                if acknowledged and not resolved_at:
-                    first_seen_dt = datetime.fromisoformat(first_seen)
-                    if (datetime.now() - first_seen_dt).total_seconds() < 86400:  # 24 hours
-                        # Skip re-adding recently acknowledged errors
-                        conn.close()
-                        return {'type': 'skipped', 'needs_notification': False}
+                if acknowledged == 1 and resolved_at is not None:
+                    # Check if acknowledged recently (within 24h)
+                    try:
+                        resolved_dt = datetime.fromisoformat(resolved_at)
+                        hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
+
+                        if hours_since_ack < 24:
+                            # Skip re-adding recently acknowledged errors
+                            conn.close()
+                            return {'type': 'skipped_acknowledged', 'needs_notification': False}
+                    except Exception:
+                        pass
 
-                # Update existing error
+                # Update existing error (only if not acknowledged or >24h passed)
                 cursor.execute('''
                     UPDATE errors
-                    SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL
+                    SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL, acknowledged = 0
                     WHERE error_key = ?
                 ''', (now, severity, reason, details_json, error_key))
@@ -123,6 +128,25 @@ class HealthPersistence:
                     event_info['type'] = 'escalated'
                     event_info['needs_notification'] = True
             else:
+                cursor.execute('''
+                    SELECT resolved_at, acknowledged FROM errors
+                    WHERE error_key = ? AND acknowledged = 1
+                    ORDER BY resolved_at DESC LIMIT 1
+                ''', (error_key,))
+                recent_ack = cursor.fetchone()
+
+                if recent_ack and recent_ack[0]:
+                    try:
+                        resolved_dt = datetime.fromisoformat(recent_ack[0])
+                        hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
+
+                        if hours_since_ack < 24:
+                            # Don't re-add recently acknowledged errors
+                            conn.close()
+                            return {'type': 'skipped_acknowledged', 'needs_notification': False}
+                    except Exception:
+                        pass
+
                 # Insert new error
                 cursor.execute('''
                     INSERT INTO errors
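
Note on the grouping change in health_monitor.py: a minimal standalone sketch of the pattern-normalization step, using the same regex substitutions and 100-character truncation as the diff. The helper name `normalize_log_line` and the sample journal lines are illustrative assumptions, not part of the patch.

```python
import re
from collections import Counter

def normalize_log_line(line: str) -> str:
    """Strip volatile tokens (dates, times, PIDs, numeric IDs, device paths)
    so repeated occurrences of the same error collapse into one pattern."""
    pattern = line.lower()
    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', pattern)    # dates
    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)    # times
    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern)  # PIDs
    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)        # numeric IDs
    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)     # device paths
    return pattern[:100]                                   # keep first 100 chars

# Hypothetical input: three raw lines that are really one recurring error
sample = [
    "2024-05-01 10:00:01 kernel: I/O error on /dev/sda1 pid 1234",
    "2024-05-01 10:00:02 kernel: I/O error on /dev/sda1 pid 1235",
    "2024-05-01 10:00:03 kernel: I/O error on /dev/sda1 pid 1301",
]
counts = Counter(normalize_log_line(l) for l in sample)
print(len(counts))  # 1 unique error pattern instead of 3 raw errors
```

This is why the thresholds in check_logs now compare against unique pattern counts rather than raw line counts: a single flapping device no longer trips LOG_ERRORS_CRITICAL on its own.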
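Note on the persistence change: the 24-hour snooze is now keyed off resolved_at (set when the error is acknowledged/resolved) instead of first_seen. A small sketch of that window check follows; the function name `recently_acknowledged` and the `window_hours` parameter are my own framing of the logic, not identifiers from the patch.

```python
from datetime import datetime
from typing import Optional

def recently_acknowledged(resolved_at: Optional[str], window_hours: float = 24.0) -> bool:
    """Return True if an acknowledged error was resolved less than window_hours ago,
    i.e. record_error should skip re-adding it."""
    if not resolved_at:
        return False
    try:
        resolved_dt = datetime.fromisoformat(resolved_at)
    except ValueError:
        return False  # unparsable timestamp: treat as not recently acknowledged
    hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
    return hours_since_ack < window_hours

# Example: an error acknowledged just now is still inside the snooze window
print(recently_acknowledged(datetime.now().isoformat()))  # True
```

In the patch this check runs twice: once when an existing row for the error_key is found, and once in the else branch against the most recently acknowledged historical row, so a re-occurring error stays silent for 24h after acknowledgment in both paths.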