Update AppImage

This commit is contained in:
MacRimi
2025-11-09 21:20:39 +01:00
parent 5b5f325a4e
commit 588af3613b
2 changed files with 118 additions and 70 deletions

View File

@@ -298,11 +298,12 @@ class HealthMonitor:
# Priority 8: Logs - now with persistence
logs_status = self._check_logs_with_persistence()
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
elif logs_status.get('status') == 'WARNING':
warning_issues.append(logs_status.get('reason', 'Log warnings'))
if logs_status:
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
elif logs_status.get('status') == 'WARNING':
warning_issues.append(logs_status.get('reason', 'Log warnings'))
# Priority 9: Updates
updates_status = self._check_updates()
@@ -1221,14 +1222,19 @@ class HealthMonitor:
def _check_logs_with_persistence(self) -> Dict[str, Any]:
"""
Check logs with intelligent classification and persistent tracking.
- Whitelists benign Proxmox warnings
- Only counts truly unique error types
- Persists critical errors for 24h
Intelligent log checking with cascade detection.
Only alerts when there's a real problem (error cascade), not normal background warnings.
Logic:
- Looks at last 3 minutes (not 10) for immediate issues
- Detects cascades: ≥5 errors of same type in 3 min = problem
- Compares to previous period to detect spikes
- Whitelists known benign Proxmox warnings
"""
cache_key = 'logs_analysis'
current_time = time.time()
# Cache for 5 minutes
if cache_key in self.last_check_times:
if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
persistent_errors = health_persistence.get_active_errors('logs')
@@ -1240,37 +1246,44 @@ class HealthMonitor:
return self.cached_results.get(cache_key, {'status': 'OK'})
try:
result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
result_recent = subprocess.run(
['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
result_previous = subprocess.run(
['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
)
if result_recent.returncode == 0:
recent_lines = result_recent.stdout.strip().split('\n')
previous_lines = result_previous.stdout.strip().split('\n') if result_previous.returncode == 0 else []
critical_errors = {} # pattern -> first line
warning_errors = {} # pattern -> first line
recent_patterns = defaultdict(int)
previous_patterns = defaultdict(int)
critical_errors = {}
for line in lines:
for line in recent_lines:
if not line.strip():
continue
# Skip benign errors
if self._is_benign_error(line):
continue
# Classify severity
severity = self._classify_log_severity(line)
if severity is None:
continue # Benign, skip
continue
# Normalize to pattern for grouping
pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line) # Remove dates
pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern) # Remove times
pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower()) # Normalize PIDs
pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs
pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize devices
pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern) # Normalize hex addresses
pattern = pattern[:150] # Keep first 150 chars as pattern
# Normalize to pattern
pattern = self._normalize_log_pattern(line)
if severity == 'CRITICAL':
if pattern not in critical_errors:
@@ -1285,20 +1298,47 @@ class HealthMonitor:
reason=line[:100],
details={'pattern': pattern}
)
elif severity == 'WARNING':
if pattern not in warning_errors:
warning_errors[pattern] = line
recent_patterns[pattern] += 1
for line in previous_lines:
if not line.strip() or self._is_benign_error(line):
continue
severity = self._classify_log_severity(line)
if severity is None:
continue
pattern = self._normalize_log_pattern(line)
previous_patterns[pattern] += 1
cascading_errors = {
pattern: count for pattern, count in recent_patterns.items()
if count >= 5 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL']
}
spike_errors = {}
for pattern, recent_count in recent_patterns.items():
prev_count = previous_patterns.get(pattern, 0)
# Spike if: ≥3 errors now AND ≥3x increase
if recent_count >= 3 and recent_count >= prev_count * 3:
spike_errors[pattern] = recent_count
unique_critical = len(critical_errors)
unique_warnings = len(warning_errors)
cascade_count = len(cascading_errors)
spike_count = len(spike_errors)
if unique_critical > 0:
status = 'CRITICAL'
reason = f'{unique_critical} critical error type(s) detected'
elif unique_warnings >= 10:
reason = f'{unique_critical} critical error(s): cascade detected'
elif cascade_count > 0:
status = 'WARNING'
reason = f'{unique_warnings} warning type(s) detected'
reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥5 times in 3min'
elif spike_count > 0:
status = 'WARNING'
reason = f'Error spike detected: {spike_count} pattern(s) increased 3x'
else:
# Normal background warnings, no alert
status = 'OK'
reason = None
@@ -1318,6 +1358,21 @@ class HealthMonitor:
except Exception:
return {'status': 'OK'}
def _normalize_log_pattern(self, line: str) -> str:
"""
Normalize log line to a pattern for grouping similar errors.
Removes timestamps, PIDs, IDs, paths, and other variables.
"""
pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line) # Remove dates
pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern) # Remove times
pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower()) # Normalize PIDs
pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs
pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize devices
pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern) # Normalize paths
pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern) # Normalize hex
pattern = re.sub(r'\s+', ' ', pattern).strip() # Normalize whitespace
return pattern[:150] # Keep first 150 chars
def _check_updates(self) -> Optional[Dict[str, Any]]:
"""
Check for pending system updates with intelligence.

View File

@@ -90,6 +90,25 @@ class HealthPersistence:
now = datetime.now().isoformat()
details_json = json.dumps(details) if details else None
cursor.execute('''
SELECT acknowledged, resolved_at
FROM errors
WHERE error_key = ? AND acknowledged = 1
''', (error_key,))
ack_check = cursor.fetchone()
if ack_check and ack_check[1]: # Has resolved_at timestamp
try:
resolved_dt = datetime.fromisoformat(ack_check[1])
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
if hours_since_ack < 24:
# Skip re-adding recently acknowledged errors (within 24h)
conn.close()
return {'type': 'skipped_acknowledged', 'needs_notification': False}
except Exception:
pass
cursor.execute('''
SELECT id, first_seen, notification_sent, acknowledged, resolved_at
FROM errors WHERE error_key = ?
@@ -101,52 +120,26 @@ class HealthPersistence:
if existing:
error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
if acknowledged == 1 and resolved_at is not None:
# Check if acknowledged recently (within 24h)
try:
resolved_dt = datetime.fromisoformat(resolved_at)
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
if hours_since_ack < 24:
# Skip re-adding recently acknowledged errors
conn.close()
return {'type': 'skipped_acknowledged', 'needs_notification': False}
except Exception:
pass
if acknowledged == 1:
conn.close()
return {'type': 'skipped_acknowledged', 'needs_notification': False}
# Update existing error (only if not acknowledged or >24h passed)
# Update existing error (only if NOT acknowledged)
cursor.execute('''
UPDATE errors
SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL, acknowledged = 0
WHERE error_key = ?
SET last_seen = ?, severity = ?, reason = ?, details = ?
WHERE error_key = ? AND acknowledged = 0
''', (now, severity, reason, details_json, error_key))
# Check if severity escalated
cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
old_severity = cursor.fetchone()[0]
if old_severity == 'WARNING' and severity == 'CRITICAL':
event_info['type'] = 'escalated'
event_info['needs_notification'] = True
old_severity_row = cursor.fetchone()
if old_severity_row:
old_severity = old_severity_row[0]
if old_severity == 'WARNING' and severity == 'CRITICAL':
event_info['type'] = 'escalated'
event_info['needs_notification'] = True
else:
cursor.execute('''
SELECT resolved_at, acknowledged FROM errors
WHERE error_key = ? AND acknowledged = 1
ORDER BY resolved_at DESC LIMIT 1
''', (error_key,))
recent_ack = cursor.fetchone()
if recent_ack and recent_ack[0]:
try:
resolved_dt = datetime.fromisoformat(recent_ack[0])
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
if hours_since_ack < 24:
# Don't re-add recently acknowledged errors
conn.close()
return {'type': 'skipped_acknowledged', 'needs_notification': False}
except Exception:
pass
# Insert new error
cursor.execute('''
INSERT INTO errors