Mirror of https://github.com/MacRimi/ProxMenux.git (synced 2025-11-17 19:16:25 +00:00)
Update health_monitor.py
@@ -67,12 +67,31 @@ class HealthMonitor:
     UPDATES_WARNING = 10
     UPDATES_CRITICAL = 30
 
-    # Critical keywords for immediate escalation
+    # Known benign errors from Proxmox that should not trigger alerts
+    BENIGN_ERROR_PATTERNS = [
+        r'got inotify poll request in wrong process',
+        r'auth key pair too old, rotating',
+        r'proxy detected vanished client connection',
+        r'worker \d+ finished',
+        r'connection timed out',
+        r'disconnect peer',
+    ]
+
     CRITICAL_LOG_KEYWORDS = [
-        'I/O error', 'EXT4-fs error', 'XFS', 'LVM activation failed',
-        'md/raid: device failed', 'Out of memory', 'kernel panic',
-        'filesystem read-only', 'cannot mount', 'failed to start',
-        'task hung', 'oom_kill'
+        'out of memory', 'oom_kill', 'kernel panic',
+        'filesystem read-only', 'cannot mount',
+        'raid.*failed', 'md.*device failed',
+        'ext4-fs error', 'xfs.*corruption',
+        'lvm activation failed',
+        'hardware error', 'mce:',
+        'segfault', 'general protection fault'
+    ]
+
+    WARNING_LOG_KEYWORDS = [
+        'i/o error', 'ata error', 'scsi error',
+        'task hung', 'blocked for more than',
+        'failed to start', 'service.*failed',
+        'disk.*offline', 'disk.*removed'
     ]
 
     # PVE Critical Services
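
Note: the new keyword lists are lowercase and may contain regex metacharacters (e.g. 'raid.*failed') because they are matched with re.search against lowercased log lines, unlike the old substring-style list. A minimal sketch of that matching, with sample patterns and a sample line (illustrative, not from the commit):

import re

# Subset of the class-level lists above; matching is case-normalized.
BENIGN = [r'auth key pair too old, rotating', r'worker \d+ finished']
CRITICAL = [r'out of memory', r'raid.*failed']

line = "pvedaemon[1234]: worker 5678 finished"
line_lower = line.lower()
is_benign = any(re.search(p, line_lower) for p in BENIGN)
is_critical = any(re.search(p, line_lower) for p in CRITICAL)
print(is_benign, is_critical)  # True False -> the line is whitelisted, not escalated
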
@@ -215,6 +234,7 @@ class HealthMonitor:
 
         critical_issues = []
         warning_issues = []
+        info_issues = []  # Added info_issues to track INFO separately
 
         # Priority 1: Services PVE
         services_status = self._check_pve_services()
@@ -290,22 +310,26 @@ class HealthMonitor:
         details['updates'] = updates_status
         if updates_status.get('status') == 'WARNING':
             warning_issues.append(updates_status.get('reason', 'Updates pending'))
-        elif updates_status.get('status') == 'INFO':  # Treat INFO as a warning for overall summary
-            warning_issues.append(updates_status.get('reason', 'Informational update status'))
+        elif updates_status.get('status') == 'INFO':
+            info_issues.append(updates_status.get('reason', 'Informational update'))
 
         # Priority 10: Security
         security_status = self._check_security()
         details['security'] = security_status
         if security_status.get('status') == 'WARNING':
             warning_issues.append(security_status.get('reason', 'Security issue'))
+        elif security_status.get('status') == 'INFO':
+            info_issues.append(security_status.get('reason', 'Security info'))
 
-        # Determine overall status
         if critical_issues:
             overall = 'CRITICAL'
             summary = '; '.join(critical_issues[:3])
         elif warning_issues:
             overall = 'WARNING'
             summary = '; '.join(warning_issues[:3])
+        elif info_issues:
+            overall = 'OK'  # INFO is still healthy overall
+            summary = '; '.join(info_issues[:3])
         else:
             overall = 'OK'
             summary = 'All systems operational'
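
Note: the key behavioral change here is that INFO-level findings no longer degrade overall health; they only surface in the summary when nothing worse exists. A minimal standalone sketch of that precedence (function name is illustrative):

def summarize(critical, warning, info):
    # Mirrors the ladder above: CRITICAL > WARNING > OK.
    # INFO issues never change the overall status, only the summary text.
    if critical:
        return 'CRITICAL', '; '.join(critical[:3])
    if warning:
        return 'WARNING', '; '.join(warning[:3])
    if info:
        return 'OK', '; '.join(info[:3])
    return 'OK', 'All systems operational'

print(summarize([], [], ['5 packages upgradable']))  # ('OK', '5 packages upgradable')
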
@@ -607,7 +631,22 @@ class HealthMonitor:
                 }
                 continue
 
-            # Check filesystem usage
+            # Check if read-only
+            with open('/proc/mounts', 'r') as f:
+                for line in f:
+                    parts = line.split()
+                    if len(parts) >= 4 and parts[1] == mount_point:
+                        options = parts[3].split(',')
+                        if 'ro' in options:
+                            issues.append(f'{mount_point}: Mounted read-only')
+                            storage_details[mount_point] = {
+                                'status': 'CRITICAL',
+                                'reason': 'Mounted read-only'
+                            }
+                        break  # Found it, no need to check further for this mountpoint
+
+            # Check filesystem usage only if not already flagged as critical
+            if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
                 fs_status = self._check_filesystem(mount_point)
                 if fs_status['status'] != 'OK':
                     issues.append(f"{mount_point}: {fs_status['reason']}")
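
Note: a minimal standalone sketch of the /proc/mounts read-only check added above (the helper name is illustrative):

def is_readonly(mount_point: str) -> bool:
    # /proc/mounts fields: device, mountpoint, fstype, options, dump, pass.
    # A filesystem remounted read-only (e.g. after ext4 errors) carries 'ro'
    # among its mount options, which is exactly what the hunk above detects.
    with open('/proc/mounts') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 4 and parts[1] == mount_point:
                return 'ro' in parts[3].split(',')
    return False  # not listed -> not mounted; handled by a separate check

print(is_readonly('/'))
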
@@ -630,30 +669,6 @@ class HealthMonitor:
     def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
         """Check individual filesystem for space and mount status"""
         try:
-            result = subprocess.run(
-                ['mountpoint', '-q', mount_point],
-                capture_output=True,
-                timeout=2
-            )
-
-            if result.returncode != 0:
-                return {
-                    'status': 'CRITICAL',
-                    'reason': 'Not mounted'
-                }
-
-            # Check if read-only
-            with open('/proc/mounts', 'r') as f:
-                for line in f:
-                    parts = line.split()
-                    if len(parts) >= 4 and parts[1] == mount_point:
-                        options = parts[3].split(',')
-                        if 'ro' in options:
-                            return {
-                                'status': 'CRITICAL',
-                                'reason': 'Mounted read-only'
-                            }
-
             usage = psutil.disk_usage(mount_point)
             percent = usage.percent
 
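
Note: the mountpoint and read-only probes are removed here because the caller now performs them (previous hunk), leaving _check_filesystem responsible only for space usage via psutil. For reference, psutil.disk_usage returns a named tuple with total, used, free and percent:

import psutil

usage = psutil.disk_usage('/')
print(f"{usage.percent:.1f}% used of {usage.total / 1e9:.1f} GB")
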
@@ -1165,19 +1180,57 @@ class HealthMonitor:
                 'reason': f'Service check failed: {str(e)}'
             }
 
-    # Modified to use persistence
+    def _is_benign_error(self, line: str) -> bool:
+        """Check if log line matches benign error patterns"""
+        line_lower = line.lower()
+        for pattern in self.BENIGN_ERROR_PATTERNS:
+            if re.search(pattern, line_lower):
+                return True
+        return False
+
+    def _classify_log_severity(self, line: str) -> Optional[str]:
+        """
+        Classify log line severity intelligently.
+        Returns: 'CRITICAL', 'WARNING', or None (benign)
+        """
+        line_lower = line.lower()
+
+        # Check if benign first
+        if self._is_benign_error(line):
+            return None
+
+        # Check critical keywords
+        for keyword in self.CRITICAL_LOG_KEYWORDS:
+            if re.search(keyword, line_lower):
+                return 'CRITICAL'
+
+        # Check warning keywords
+        for keyword in self.WARNING_LOG_KEYWORDS:
+            if re.search(keyword, line_lower):
+                return 'WARNING'
+
+        # Generic error/warning classification
+        if 'critical' in line_lower or 'fatal' in line_lower:
+            return 'CRITICAL'
+        elif 'error' in line_lower:
+            return 'WARNING'
+        elif 'warning' in line_lower or 'warn' in line_lower:
+            return None  # Generic warnings are benign
+
+        return None
+
     def _check_logs_with_persistence(self) -> Dict[str, Any]:
         """
-        Check logs with persistent error tracking.
-        Critical log errors persist for 24h unless acknowledged.
-        Groups similar errors to avoid false high counts.
+        Check logs with intelligent classification and persistent tracking.
+        - Whitelists benign Proxmox warnings
+        - Only counts truly unique error types
+        - Persists critical errors for 24h
         """
         cache_key = 'logs_analysis'
         current_time = time.time()
 
         if cache_key in self.last_check_times:
             if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
-                # Return persistent errors if any
                 persistent_errors = health_persistence.get_active_errors('logs')
                 if persistent_errors:
                     return {
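
Note: a minimal standalone sketch of the classification ladder introduced above, using trimmed pattern lists and sample lines (all illustrative) rather than the class attributes:

import re
from typing import Optional

BENIGN = [r'proxy detected vanished client connection']
CRITICAL = [r'ext4-fs error', r'out of memory']
WARNING = [r'ata error', r'failed to start']

def classify(line: str) -> Optional[str]:
    # Same order as _classify_log_severity: whitelist first, then critical,
    # then warning keywords, then generic 'critical'/'fatal'/'error' fallbacks.
    low = line.lower()
    if any(re.search(p, low) for p in BENIGN):
        return None
    if any(re.search(p, low) for p in CRITICAL):
        return 'CRITICAL'
    if any(re.search(p, low) for p in WARNING):
        return 'WARNING'
    if 'critical' in low or 'fatal' in low:
        return 'CRITICAL'
    if 'error' in low:
        return 'WARNING'
    return None  # generic warnings and unmatched lines are benign

print(classify("kernel: EXT4-fs error (device sda1)"))                   # CRITICAL
print(classify("pveproxy: proxy detected vanished client connection"))   # None
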
@@ -1188,7 +1241,7 @@ class HealthMonitor:
 
         try:
             result = subprocess.run(
-                ['journalctl', '--since', '5 minutes ago', '--no-pager', '-p', 'warning'],
+                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
                 capture_output=True,
                 text=True,
                 timeout=3
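
Note: the lookback window widens from 5 to 10 minutes; journalctl's -p warning returns entries at priority warning and more severe (err, crit, alert, emerg), not warnings alone. The same invocation, standalone:

import subprocess

result = subprocess.run(
    ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
    capture_output=True, text=True, timeout=3,
)
lines = [l for l in result.stdout.strip().split('\n') if l.strip()]
print(f"{len(lines)} journal lines to classify")
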
@@ -1197,68 +1250,54 @@ class HealthMonitor:
             if result.returncode == 0:
                 lines = result.stdout.strip().split('\n')
 
-                error_patterns = {}  # pattern -> count
-                critical_keywords_found = set()
+                critical_errors = {}  # pattern -> first line
+                warning_errors = {}  # pattern -> first line
 
                 for line in lines:
                     if not line.strip():
                         continue
 
-                    line_lower = line.lower()
-
-                    # Check for critical keywords first
-                    critical_found = False
-                    for keyword in self.CRITICAL_LOG_KEYWORDS:
-                        if keyword.lower() in line_lower:
-                            critical_keywords_found.add(keyword)
-                            critical_found = True
-
-                            # Record persistent error for critical keywords
-                            error_key = f'log_critical_{keyword.replace(" ", "_").replace("/", "_")}'
+                    # Classify severity
+                    severity = self._classify_log_severity(line)
+
+                    if severity is None:
+                        continue  # Benign, skip
+
+                    # Normalize to pattern for grouping
+                    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)  # Remove dates
+                    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
+                    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
+                    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
+                    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
+                    pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex addresses
+                    pattern = pattern[:150]  # Keep first 150 chars as pattern
+
+                    if severity == 'CRITICAL':
+                        if pattern not in critical_errors:
+                            critical_errors[pattern] = line
+
+                            # Record persistent error
+                            error_key = f'log_critical_{abs(hash(pattern)) % 10000}'
                             health_persistence.record_error(
                                 error_key=error_key,
                                 category='logs',
                                 severity='CRITICAL',
-                                reason=f'Critical log: {keyword}',
-                                details={'keyword': keyword}
+                                reason=line[:100],
+                                details={'pattern': pattern}
                             )
-                            break
+                    elif severity == 'WARNING':
+                        if pattern not in warning_errors:
+                            warning_errors[pattern] = line
 
-                    if critical_found:
-                        continue
-
-                    # Remove timestamps, PIDs, and specific IDs to group similar errors
-                    pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line_lower)  # Remove dates
-                    pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
-                    pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern)  # Normalize PIDs
-                    pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
-                    pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
-                    pattern = pattern[:100]  # Keep first 100 chars as pattern
-
-                    # Classify error level
-                    if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:
-                        error_patterns[f'error:{pattern}'] = error_patterns.get(f'error:{pattern}', 0) + 1
-                    elif 'warning' in line_lower or 'warn' in line_lower:
-                        error_patterns[f'warning:{pattern}'] = error_patterns.get(f'warning:{pattern}', 0) + 1
-
-                unique_errors = sum(1 for k in error_patterns.keys() if k.startswith('error:'))
-                unique_warnings = sum(1 for k in error_patterns.keys() if k.startswith('warning:'))
-
-                if critical_keywords_found:
+                unique_critical = len(critical_errors)
+                unique_warnings = len(warning_errors)
+
+                if unique_critical > 0:
                     status = 'CRITICAL'
-                    reason = f'Critical errors: {", ".join(list(critical_keywords_found)[:3])}'
-                elif unique_errors >= self.LOG_ERRORS_CRITICAL:
-                    status = 'CRITICAL'
-                    reason = f'{unique_errors} unique errors in 5 minutes'
-                elif unique_warnings >= self.LOG_WARNINGS_CRITICAL:
+                    reason = f'{unique_critical} critical error type(s) detected'
+                elif unique_warnings >= 10:
                     status = 'WARNING'
-                    reason = f'{unique_warnings} unique warnings in 5 minutes'
-                elif unique_errors >= self.LOG_ERRORS_WARNING:
-                    status = 'WARNING'
-                    reason = f'{unique_errors} unique errors in 5 minutes'
-                elif unique_warnings >= self.LOG_WARNINGS_WARNING:
-                    status = 'WARNING'
-                    reason = f'{unique_warnings} unique warnings in 5 minutes'
+                    reason = f'{unique_warnings} warning type(s) detected'
                 else:
                     status = 'OK'
                     reason = None
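
Note: deduplication now hinges on the normalization chain above, which masks volatile tokens so repeats of the same fault count as one unique error type. A standalone sketch with two illustrative log lines:

import re

def normalize(line: str) -> str:
    # Same substitutions as the hunk above: strip dates/times, mask PIDs,
    # numeric IDs, device names and hex addresses, then truncate.
    p = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)
    p = re.sub(r'\d{2}:\d{2}:\d{2}', '', p)
    p = re.sub(r'pid[:\s]+\d+', 'pid:XXX', p.lower())
    p = re.sub(r'\b\d{3,6}\b', 'ID', p)
    p = re.sub(r'/dev/\S+', '/dev/XXX', p)
    p = re.sub(r'0x[0-9a-f]+', '0xXXX', p)
    return p[:150]

a = "2025-11-17 10:01:02 kernel: I/O error on /dev/sda1, pid 4321"
b = "2025-11-17 10:05:09 kernel: I/O error on /dev/sdb2, pid 8765"
print(normalize(a) == normalize(b))  # True -> grouped as one error type
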