mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2025-11-17 19:16:25 +00:00
Update AppImage
This commit is contained in:
@@ -298,6 +298,7 @@ class HealthMonitor:
|
|||||||
|
|
||||||
# Priority 8: Logs - now with persistence
|
# Priority 8: Logs - now with persistence
|
||||||
logs_status = self._check_logs_with_persistence()
|
logs_status = self._check_logs_with_persistence()
|
||||||
|
if logs_status:
|
||||||
details['logs'] = logs_status
|
details['logs'] = logs_status
|
||||||
if logs_status.get('status') == 'CRITICAL':
|
if logs_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
|
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
|
||||||
@@ -1221,14 +1222,19 @@ class HealthMonitor:
|
|||||||
|
|
||||||
def _check_logs_with_persistence(self) -> Dict[str, Any]:
|
def _check_logs_with_persistence(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check logs with intelligent classification and persistent tracking.
|
Intelligent log checking with cascade detection.
|
||||||
- Whitelists benign Proxmox warnings
|
Only alerts when there's a real problem (error cascade), not normal background warnings.
|
||||||
- Only counts truly unique error types
|
|
||||||
- Persists critical errors for 24h
|
Logic:
|
||||||
|
- Looks at last 3 minutes (not 10) for immediate issues
|
||||||
|
- Detects cascades: ≥5 errors of same type in 3 min = problem
|
||||||
|
- Compares to previous period to detect spikes
|
||||||
|
- Whitelists known benign Proxmox warnings
|
||||||
"""
|
"""
|
||||||
cache_key = 'logs_analysis'
|
cache_key = 'logs_analysis'
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
|
|
||||||
|
# Cache for 5 minutes
|
||||||
if cache_key in self.last_check_times:
|
if cache_key in self.last_check_times:
|
||||||
if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
|
if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
|
||||||
persistent_errors = health_persistence.get_active_errors('logs')
|
persistent_errors = health_persistence.get_active_errors('logs')
|
||||||
@@ -1240,37 +1246,44 @@ class HealthMonitor:
|
|||||||
return self.cached_results.get(cache_key, {'status': 'OK'})
|
return self.cached_results.get(cache_key, {'status': 'OK'})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result_recent = subprocess.run(
|
||||||
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
|
['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=3
|
timeout=3
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.returncode == 0:
|
result_previous = subprocess.run(
|
||||||
lines = result.stdout.strip().split('\n')
|
['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=3
|
||||||
|
)
|
||||||
|
|
||||||
critical_errors = {} # pattern -> first line
|
if result_recent.returncode == 0:
|
||||||
warning_errors = {} # pattern -> first line
|
recent_lines = result_recent.stdout.strip().split('\n')
|
||||||
|
previous_lines = result_previous.stdout.strip().split('\n') if result_previous.returncode == 0 else []
|
||||||
|
|
||||||
for line in lines:
|
recent_patterns = defaultdict(int)
|
||||||
|
previous_patterns = defaultdict(int)
|
||||||
|
critical_errors = {}
|
||||||
|
|
||||||
|
for line in recent_lines:
|
||||||
if not line.strip():
|
if not line.strip():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip benign errors
|
||||||
|
if self._is_benign_error(line):
|
||||||
|
continue
|
||||||
|
|
||||||
# Classify severity
|
# Classify severity
|
||||||
severity = self._classify_log_severity(line)
|
severity = self._classify_log_severity(line)
|
||||||
|
|
||||||
if severity is None:
|
if severity is None:
|
||||||
continue # Benign, skip
|
continue
|
||||||
|
|
||||||
# Normalize to pattern for grouping
|
# Normalize to pattern
|
||||||
pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line) # Remove dates
|
pattern = self._normalize_log_pattern(line)
|
||||||
pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern) # Remove times
|
|
||||||
pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower()) # Normalize PIDs
|
|
||||||
pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs
|
|
||||||
pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize devices
|
|
||||||
pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern) # Normalize hex addresses
|
|
||||||
pattern = pattern[:150] # Keep first 150 chars as pattern
|
|
||||||
|
|
||||||
if severity == 'CRITICAL':
|
if severity == 'CRITICAL':
|
||||||
if pattern not in critical_errors:
|
if pattern not in critical_errors:
|
||||||
@@ -1285,20 +1298,47 @@ class HealthMonitor:
|
|||||||
reason=line[:100],
|
reason=line[:100],
|
||||||
details={'pattern': pattern}
|
details={'pattern': pattern}
|
||||||
)
|
)
|
||||||
elif severity == 'WARNING':
|
|
||||||
if pattern not in warning_errors:
|
recent_patterns[pattern] += 1
|
||||||
warning_errors[pattern] = line
|
|
||||||
|
for line in previous_lines:
|
||||||
|
if not line.strip() or self._is_benign_error(line):
|
||||||
|
continue
|
||||||
|
|
||||||
|
severity = self._classify_log_severity(line)
|
||||||
|
if severity is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
pattern = self._normalize_log_pattern(line)
|
||||||
|
previous_patterns[pattern] += 1
|
||||||
|
|
||||||
|
cascading_errors = {
|
||||||
|
pattern: count for pattern, count in recent_patterns.items()
|
||||||
|
if count >= 5 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL']
|
||||||
|
}
|
||||||
|
|
||||||
|
spike_errors = {}
|
||||||
|
for pattern, recent_count in recent_patterns.items():
|
||||||
|
prev_count = previous_patterns.get(pattern, 0)
|
||||||
|
# Spike if: ≥3 errors now AND ≥3x increase
|
||||||
|
if recent_count >= 3 and recent_count >= prev_count * 3:
|
||||||
|
spike_errors[pattern] = recent_count
|
||||||
|
|
||||||
unique_critical = len(critical_errors)
|
unique_critical = len(critical_errors)
|
||||||
unique_warnings = len(warning_errors)
|
cascade_count = len(cascading_errors)
|
||||||
|
spike_count = len(spike_errors)
|
||||||
|
|
||||||
if unique_critical > 0:
|
if unique_critical > 0:
|
||||||
status = 'CRITICAL'
|
status = 'CRITICAL'
|
||||||
reason = f'{unique_critical} critical error type(s) detected'
|
reason = f'{unique_critical} critical error(s): cascade detected'
|
||||||
elif unique_warnings >= 10:
|
elif cascade_count > 0:
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'{unique_warnings} warning type(s) detected'
|
reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥5 times in 3min'
|
||||||
|
elif spike_count > 0:
|
||||||
|
status = 'WARNING'
|
||||||
|
reason = f'Error spike detected: {spike_count} pattern(s) increased 3x'
|
||||||
else:
|
else:
|
||||||
|
# Normal background warnings, no alert
|
||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
|
|
||||||
@@ -1318,6 +1358,21 @@ class HealthMonitor:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return {'status': 'OK'}
|
return {'status': 'OK'}
|
||||||
|
|
||||||
|
def _normalize_log_pattern(self, line: str) -> str:
|
||||||
|
"""
|
||||||
|
Normalize log line to a pattern for grouping similar errors.
|
||||||
|
Removes timestamps, PIDs, IDs, paths, and other variables.
|
||||||
|
"""
|
||||||
|
pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line) # Remove dates
|
||||||
|
pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern) # Remove times
|
||||||
|
pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower()) # Normalize PIDs
|
||||||
|
pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs
|
||||||
|
pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize devices
|
||||||
|
pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern) # Normalize paths
|
||||||
|
pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern) # Normalize hex
|
||||||
|
pattern = re.sub(r'\s+', ' ', pattern).strip() # Normalize whitespace
|
||||||
|
return pattern[:150] # Keep first 150 chars
|
||||||
|
|
||||||
def _check_updates(self) -> Optional[Dict[str, Any]]:
|
def _check_updates(self) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Check for pending system updates with intelligence.
|
Check for pending system updates with intelligence.
|
||||||
|
|||||||
@@ -90,6 +90,25 @@ class HealthPersistence:
|
|||||||
now = datetime.now().isoformat()
|
now = datetime.now().isoformat()
|
||||||
details_json = json.dumps(details) if details else None
|
details_json = json.dumps(details) if details else None
|
||||||
|
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT acknowledged, resolved_at
|
||||||
|
FROM errors
|
||||||
|
WHERE error_key = ? AND acknowledged = 1
|
||||||
|
''', (error_key,))
|
||||||
|
ack_check = cursor.fetchone()
|
||||||
|
|
||||||
|
if ack_check and ack_check[1]: # Has resolved_at timestamp
|
||||||
|
try:
|
||||||
|
resolved_dt = datetime.fromisoformat(ack_check[1])
|
||||||
|
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
|
||||||
|
|
||||||
|
if hours_since_ack < 24:
|
||||||
|
# Skip re-adding recently acknowledged errors (within 24h)
|
||||||
|
conn.close()
|
||||||
|
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT id, first_seen, notification_sent, acknowledged, resolved_at
|
SELECT id, first_seen, notification_sent, acknowledged, resolved_at
|
||||||
FROM errors WHERE error_key = ?
|
FROM errors WHERE error_key = ?
|
||||||
@@ -101,52 +120,26 @@ class HealthPersistence:
|
|||||||
if existing:
|
if existing:
|
||||||
error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
|
error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
|
||||||
|
|
||||||
if acknowledged == 1 and resolved_at is not None:
|
if acknowledged == 1:
|
||||||
# Check if acknowledged recently (within 24h)
|
|
||||||
try:
|
|
||||||
resolved_dt = datetime.fromisoformat(resolved_at)
|
|
||||||
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
|
|
||||||
|
|
||||||
if hours_since_ack < 24:
|
|
||||||
# Skip re-adding recently acknowledged errors
|
|
||||||
conn.close()
|
conn.close()
|
||||||
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Update existing error (only if not acknowledged or >24h passed)
|
# Update existing error (only if NOT acknowledged)
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
UPDATE errors
|
||||||
SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL, acknowledged = 0
|
SET last_seen = ?, severity = ?, reason = ?, details = ?
|
||||||
WHERE error_key = ?
|
WHERE error_key = ? AND acknowledged = 0
|
||||||
''', (now, severity, reason, details_json, error_key))
|
''', (now, severity, reason, details_json, error_key))
|
||||||
|
|
||||||
# Check if severity escalated
|
# Check if severity escalated
|
||||||
cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
|
cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
|
||||||
old_severity = cursor.fetchone()[0]
|
old_severity_row = cursor.fetchone()
|
||||||
|
if old_severity_row:
|
||||||
|
old_severity = old_severity_row[0]
|
||||||
if old_severity == 'WARNING' and severity == 'CRITICAL':
|
if old_severity == 'WARNING' and severity == 'CRITICAL':
|
||||||
event_info['type'] = 'escalated'
|
event_info['type'] = 'escalated'
|
||||||
event_info['needs_notification'] = True
|
event_info['needs_notification'] = True
|
||||||
else:
|
else:
|
||||||
cursor.execute('''
|
|
||||||
SELECT resolved_at, acknowledged FROM errors
|
|
||||||
WHERE error_key = ? AND acknowledged = 1
|
|
||||||
ORDER BY resolved_at DESC LIMIT 1
|
|
||||||
''', (error_key,))
|
|
||||||
recent_ack = cursor.fetchone()
|
|
||||||
|
|
||||||
if recent_ack and recent_ack[0]:
|
|
||||||
try:
|
|
||||||
resolved_dt = datetime.fromisoformat(recent_ack[0])
|
|
||||||
hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
|
|
||||||
|
|
||||||
if hours_since_ack < 24:
|
|
||||||
# Don't re-add recently acknowledged errors
|
|
||||||
conn.close()
|
|
||||||
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Insert new error
|
# Insert new error
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO errors
|
INSERT INTO errors
|
||||||
|
|||||||
Reference in New Issue
Block a user