Update AppImage

This commit is contained in:
MacRimi
2025-11-09 18:23:27 +01:00
parent a59489f804
commit 8fb8134898
2 changed files with 73 additions and 29 deletions

View File

@@ -1170,6 +1170,7 @@ class HealthMonitor:
""" """
Check logs with persistent error tracking. Check logs with persistent error tracking.
Critical log errors persist for 24h unless acknowledged. Critical log errors persist for 24h unless acknowledged.
Groups similar errors to avoid false high counts.
""" """
cache_key = 'logs_analysis' cache_key = 'logs_analysis'
current_time = time.time() current_time = time.time()
@@ -1196,20 +1197,24 @@ class HealthMonitor:
if result.returncode == 0: if result.returncode == 0:
lines = result.stdout.strip().split('\n') lines = result.stdout.strip().split('\n')
errors_5m = 0 error_patterns = {} # pattern -> count
warnings_5m = 0 critical_keywords_found = set()
critical_keywords_found = []
for line in lines: for line in lines:
if not line.strip():
continue
line_lower = line.lower() line_lower = line.lower()
# Check for critical keywords first
critical_found = False
for keyword in self.CRITICAL_LOG_KEYWORDS: for keyword in self.CRITICAL_LOG_KEYWORDS:
if keyword.lower() in line_lower: if keyword.lower() in line_lower:
critical_keywords_found.append(keyword) critical_keywords_found.add(keyword)
errors_5m += 1 critical_found = True
# Record persistent error for critical keywords # Record persistent error for critical keywords
error_key = f'log_critical_{keyword.replace(" ", "_")}' error_key = f'log_critical_{keyword.replace(" ", "_").replace("/", "_")}'
health_persistence.record_error( health_persistence.record_error(
error_key=error_key, error_key=error_key,
category='logs', category='logs',
@@ -1218,27 +1223,42 @@ class HealthMonitor:
details={'keyword': keyword} details={'keyword': keyword}
) )
break break
else:
if critical_found:
continue
# Remove timestamps, PIDs, and specific IDs to group similar errors
pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line_lower) # Remove dates
pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern) # Remove times
pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern) # Normalize PIDs
pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs
pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize devices
pattern = pattern[:100] # Keep first 100 chars as pattern
# Classify error level
if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower: if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:
errors_5m += 1 error_patterns[f'error:{pattern}'] = error_patterns.get(f'error:{pattern}', 0) + 1
elif 'warning' in line_lower or 'warn' in line_lower: elif 'warning' in line_lower or 'warn' in line_lower:
warnings_5m += 1 error_patterns[f'warning:{pattern}'] = error_patterns.get(f'warning:{pattern}', 0) + 1
unique_errors = sum(1 for k in error_patterns.keys() if k.startswith('error:'))
unique_warnings = sum(1 for k in error_patterns.keys() if k.startswith('warning:'))
if critical_keywords_found: if critical_keywords_found:
status = 'CRITICAL' status = 'CRITICAL'
reason = f'Critical errors: {", ".join(set(critical_keywords_found[:3]))}' reason = f'Critical errors: {", ".join(list(critical_keywords_found)[:3])}'
elif errors_5m >= self.LOG_ERRORS_CRITICAL: elif unique_errors >= self.LOG_ERRORS_CRITICAL:
status = 'CRITICAL' status = 'CRITICAL'
reason = f'{errors_5m} errors in 5 minutes' reason = f'{unique_errors} unique errors in 5 minutes'
elif warnings_5m >= self.LOG_WARNINGS_CRITICAL: elif unique_warnings >= self.LOG_WARNINGS_CRITICAL:
status = 'WARNING' status = 'WARNING'
reason = f'{warnings_5m} warnings in 5 minutes' reason = f'{unique_warnings} unique warnings in 5 minutes'
elif errors_5m >= self.LOG_ERRORS_WARNING: elif unique_errors >= self.LOG_ERRORS_WARNING:
status = 'WARNING' status = 'WARNING'
reason = f'{errors_5m} errors in 5 minutes' reason = f'{unique_errors} unique errors in 5 minutes'
elif warnings_5m >= self.LOG_WARNINGS_WARNING: elif unique_warnings >= self.LOG_WARNINGS_WARNING:
status = 'WARNING' status = 'WARNING'
reason = f'{warnings_5m} warnings in 5 minutes' reason = f'{unique_warnings} unique warnings in 5 minutes'
else: else:
status = 'OK' status = 'OK'
reason = None reason = None

View File

@@ -101,18 +101,23 @@ class HealthPersistence:
if existing: if existing:
error_id, first_seen, notif_sent, acknowledged, resolved_at = existing error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
# If acknowledged within last 24 hours, do not re-add unless resolved and re-occurred if acknowledged == 1 and resolved_at is not None:
if acknowledged and not resolved_at: # Check if acknowledged recently (within 24h)
first_seen_dt = datetime.fromisoformat(first_seen) try:
if (datetime.now() - first_seen_dt).total_seconds() < 86400: # 24 hours resolved_dt = datetime.fromisoformat(resolved_at)
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
if hours_since_ack < 24:
# Skip re-adding recently acknowledged errors # Skip re-adding recently acknowledged errors
conn.close() conn.close()
return {'type': 'skipped', 'needs_notification': False} return {'type': 'skipped_acknowledged', 'needs_notification': False}
except Exception:
pass
# Update existing error # Update existing error (only if not acknowledged or >24h passed)
cursor.execute(''' cursor.execute('''
UPDATE errors UPDATE errors
SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL, acknowledged = 0
WHERE error_key = ? WHERE error_key = ?
''', (now, severity, reason, details_json, error_key)) ''', (now, severity, reason, details_json, error_key))
@@ -123,6 +128,25 @@ class HealthPersistence:
event_info['type'] = 'escalated' event_info['type'] = 'escalated'
event_info['needs_notification'] = True event_info['needs_notification'] = True
else: else:
cursor.execute('''
SELECT resolved_at, acknowledged FROM errors
WHERE error_key = ? AND acknowledged = 1
ORDER BY resolved_at DESC LIMIT 1
''', (error_key,))
recent_ack = cursor.fetchone()
if recent_ack and recent_ack[0]:
try:
resolved_dt = datetime.fromisoformat(recent_ack[0])
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
if hours_since_ack < 24:
# Don't re-add recently acknowledged errors
conn.close()
return {'type': 'skipped_acknowledged', 'needs_notification': False}
except Exception:
pass
# Insert new error # Insert new error
cursor.execute(''' cursor.execute('''
INSERT INTO errors INSERT INTO errors