Update health_monitor.py

MacRimi
2025-11-27 13:29:15 +01:00
parent f22de50527
commit 41537c0bad


@@ -12,6 +12,7 @@ import subprocess
 import json
 import time
 import os
+import hashlib  # Added for MD5 hashing
 from typing import Dict, List, Any, Tuple, Optional
 from datetime import datetime, timedelta
 from collections import defaultdict
@@ -420,12 +421,18 @@ class HealthMonitor:
             return {'status': 'UNKNOWN', 'reason': f'CPU check failed: {str(e)}'}

     def _check_cpu_temperature(self) -> Optional[Dict[str, Any]]:
-        """Check CPU temperature with hysteresis (5 min sustained) - cached, max 1 check per minute"""
+        """
+        Check CPU temperature with temporal logic:
+        - WARNING if temp >80°C sustained for >3 minutes
+        - Auto-clears if temp ≤80°C for 30 seconds
+        - No dismiss button (non-dismissable)
+        """
         cache_key = 'cpu_temp'
         current_time = time.time()

+        # Check every 10 seconds instead of 60
         if cache_key in self.last_check_times:
-            if current_time - self.last_check_times[cache_key] < 60:
+            if current_time - self.last_check_times[cache_key] < 10:
                 return self.cached_results.get(cache_key)

         try:
@@ -455,35 +462,54 @@ class HealthMonitor:
                 'time': current_time
             })

-            # Keep last 6 minutes of data
+            # Keep last 4 minutes of data (240 seconds)
             self.state_history[state_key] = [
                 entry for entry in self.state_history[state_key]
-                if current_time - entry['time'] < 360
+                if current_time - entry['time'] < 240
             ]

-            # Check sustained high temperature (5 minutes)
-            critical_temp_samples = [
-                entry for entry in self.state_history[state_key]
-                if entry['value'] >= self.TEMP_CRITICAL and
-                current_time - entry['time'] <= 300
-            ]
-            warning_temp_samples = [
-                entry for entry in self.state_history[state_key]
-                if entry['value'] >= self.TEMP_WARNING and
-                current_time - entry['time'] <= 300
-            ]
-            # Require at least 3 samples over 5 minutes to trigger alert
-            if len(critical_temp_samples) >= 3:
-                status = 'CRITICAL'
-                reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_CRITICAL}°C sustained >5min'
-            elif len(warning_temp_samples) >= 3:
-                status = 'WARNING'
-                reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_WARNING}°C sustained >5min'
-            else:
-                status = 'OK'
-                reason = None
+            # Check if temperature >80°C for more than 3 minutes (180 seconds)
+            high_temp_samples = [
+                entry for entry in self.state_history[state_key]
+                if entry['value'] > 80 and current_time - entry['time'] <= 180
+            ]
+            # Check if temperature ≤80°C for last 30 seconds (recovery)
+            recovery_samples = [
+                entry for entry in self.state_history[state_key]
+                if entry['value'] <= 80 and current_time - entry['time'] <= 30
+            ]
+            # Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
+            if len(high_temp_samples) >= 18:
+                # Temperature has been >80°C for >3 minutes
+                status = 'WARNING'
+                reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
+                # Record non-dismissable error
+                health_persistence.record_error(
+                    error_key='cpu_temp_high',
+                    category='temperature',
+                    severity='WARNING',
+                    reason=reason,
+                    details={'temperature': max_temp, 'dismissable': False}
+                )
+            elif len(recovery_samples) >= 3:
+                # Temperature has been ≤80°C for 30 seconds - clear the error
+                status = 'OK'
+                reason = None
+                health_persistence.resolve_error('cpu_temp_high', 'Temperature recovered')
+            else:
+                # Temperature is elevated but not long enough, or recovering but not yet cleared
+                # Check if we already have an active error
+                if health_persistence.is_error_active('cpu_temp_high', category='temperature'):
+                    # Keep the warning active
+                    status = 'WARNING'
+                    reason = f'CPU temperature {max_temp}°C still elevated'
+                else:
+                    # No active warning yet
+                    status = 'OK'
+                    reason = None

             temp_result = {
                 'status': status,
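
The sample thresholds follow from the 10-second check cadence set above: 18 samples × 10 s ≈ 3 minutes continuously above 80 °C before the WARNING fires, and 3 samples × 10 s ≈ 30 s at or below 80 °C before it clears. A minimal standalone sketch of that sustained-threshold check (function and variable names here are illustrative, not part of the commit):

import time

def sustained_over(history, threshold, window_s, min_samples, now=None):
    """Return True if at least `min_samples` readings taken within the last
    `window_s` seconds exceeded `threshold`. `history` is a list of dicts
    with 'value' and 'time' keys, mirroring the state_history entries above."""
    now = now if now is not None else time.time()
    recent = [e for e in history if now - e['time'] <= window_s]
    return sum(1 for e in recent if e['value'] > threshold) >= min_samples

# Example: one 85 °C reading every 10 s for the last 3 minutes
samples = [{'value': 85, 'time': time.time() - i * 10} for i in range(18)]
print(sustained_over(samples, 80, 180, 18))  # True -> WARNING condition met
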
@@ -829,15 +855,44 @@ class HealthMonitor:
                 # Report based on recent error count
                 if error_count >= 3:
+                    error_key = f'disk_{disk}'
+                    severity = 'CRITICAL'
+                    reason = f'{error_count} I/O errors in 5 minutes'
+                    health_persistence.record_error(
+                        error_key=error_key,
+                        category='disks',
+                        severity=severity,
+                        reason=reason,
+                        details={'disk': disk, 'error_count': error_count, 'dismissable': True}
+                    )
                     disk_issues[f'/dev/{disk}'] = {
-                        'status': 'CRITICAL',
-                        'reason': f'{error_count} I/O errors in 5 minutes'
+                        'status': severity,
+                        'reason': reason,
+                        'dismissable': True
                     }
                 elif error_count >= 1:
+                    error_key = f'disk_{disk}'
+                    severity = 'WARNING'
+                    reason = f'{error_count} I/O error(s) in 5 minutes'
+                    health_persistence.record_error(
+                        error_key=error_key,
+                        category='disks',
+                        severity=severity,
+                        reason=reason,
+                        details={'disk': disk, 'error_count': error_count, 'dismissable': True}
+                    )
                     disk_issues[f'/dev/{disk}'] = {
-                        'status': 'WARNING',
-                        'reason': f'{error_count} I/O error(s) in 5 minutes'
+                        'status': severity,
+                        'reason': reason,
+                        'dismissable': True
                     }
+                else:
+                    error_key = f'disk_{disk}'
+                    health_persistence.resolve_error(error_key, 'Disk errors cleared')

             if not disk_issues:
                 return {'status': 'OK'}
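
The disk, network, temperature and log checks all funnel through the same health_persistence calls (record_error, resolve_error, is_error_active, get_active_errors). A minimal in-memory stand-in with those four methods can be handy for unit-testing this module; the signatures below are inferred from the calls in this diff and may not match the real module exactly:

from typing import Any, Dict, List, Optional

class FakeHealthPersistence:
    """In-memory stand-in mirroring only the calls used in health_monitor.py."""

    def __init__(self) -> None:
        self.errors: Dict[str, Dict[str, Any]] = {}

    def record_error(self, error_key: str, category: str, severity: str,
                     reason: str, details: Optional[Dict[str, Any]] = None) -> None:
        # Upsert: repeated reports of the same key overwrite the previous entry
        self.errors[error_key] = {'category': category, 'severity': severity,
                                  'reason': reason, 'details': details or {}}

    def resolve_error(self, error_key: str, note: str = '') -> None:
        # Resolving an unknown key is a no-op, matching how the checks call it
        self.errors.pop(error_key, None)

    def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
        err = self.errors.get(error_key)
        return bool(err and (category is None or err['category'] == category))

    def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
        return [dict(err, error_key=key) for key, err in self.errors.items()
                if category is None or err['category'] == category]
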
@@ -851,7 +906,6 @@ class HealthMonitor:
             }

         except Exception:
-            # If dmesg check fails, return OK as it's not a critical system failure
             return {'status': 'OK'}

     def _check_network_optimized(self) -> Dict[str, Any]:
@@ -865,6 +919,8 @@ class HealthMonitor:
             net_if_stats = psutil.net_if_stats()

+            active_interfaces = set()
+
             for interface, stats in net_if_stats.items():
                 if interface == 'lo':
                     continue
@@ -874,10 +930,25 @@ class HealthMonitor:
                    # Consider common PVE bridge interfaces and physical NICs as important
                    if interface.startswith('vmbr') or interface.startswith('eth') or interface.startswith('ens') or interface.startswith('enp'):
                        issues.append(f'{interface} is DOWN')
+                        error_key = interface
+                        health_persistence.record_error(
+                            error_key=error_key,
+                            category='network',
+                            severity='CRITICAL',
+                            reason='Interface DOWN',
+                            details={'interface': interface, 'dismissable': True}
+                        )
                        interface_details[interface] = {
                            'status': 'CRITICAL',
-                            'reason': 'Interface DOWN'
+                            'reason': 'Interface DOWN',
+                            'dismissable': True
                        }
+                else:
+                    active_interfaces.add(interface)
+                    if interface.startswith('vmbr') or interface.startswith('eth') or interface.startswith('ens') or interface.startswith('enp'):
+                        health_persistence.resolve_error(interface, 'Interface recovered')

             # Check connectivity (latency)
             latency_status = self._check_network_latency()
@@ -1307,9 +1378,19 @@ class HealthMonitor:
         # Check persistent log errors recorded by health_persistence
         persistent_errors = health_persistence.get_active_errors('logs')
         if persistent_errors:
+            # Find the highest severity among persistent errors to set overall status
+            max_severity = 'OK'
+            reasons = []
+            for error in persistent_errors:
+                if error['severity'] == 'CRITICAL':
+                    max_severity = 'CRITICAL'
+                elif error['severity'] == 'WARNING' and max_severity != 'CRITICAL':
+                    max_severity = 'WARNING'
+                reasons.append(error['reason'])
             return {
-                'status': 'WARNING',  # Or CRITICAL depending on severity of persistent errors
-                'reason': f'{len(persistent_errors)} persistent log issues detected'
+                'status': max_severity,
+                'reason': '; '.join(reasons[:3])  # Show up to 3 persistent reasons
             }

         return self.cached_results.get(cache_key, {'status': 'OK'})
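
The loop above amounts to taking the maximum severity under the ordering OK < WARNING < CRITICAL. A compact form of the same aggregation, shown only for illustration (names are not part of the commit):

SEVERITY_RANK = {'OK': 0, 'WARNING': 1, 'CRITICAL': 2}

def aggregate_log_errors(errors):
    """errors: list of dicts with 'severity' and 'reason' keys."""
    status = max((e['severity'] for e in errors),
                 key=lambda s: SEVERITY_RANK.get(s, 0), default='OK')
    reasons = [e['reason'] for e in errors]
    return {'status': status, 'reason': '; '.join(reasons[:3])}
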
@@ -1356,30 +1437,38 @@ class HealthMonitor:
                 pattern = self._normalize_log_pattern(line)

                 if severity == 'CRITICAL':
-                    # If this critical pattern is new or we haven't logged it recently
-                    error_key = f'log_critical_{abs(hash(pattern)) % 10000}'
+                    pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
+                    error_key = f'log_critical_{pattern_hash}'
                     if pattern not in critical_errors_found:
                         critical_errors_found[pattern] = line
-                        # Record persistent error if it's not already active and within recent persistence
+                        # Record persistent error if it's not already active
                         if not health_persistence.is_error_active(error_key, category='logs'):
                             health_persistence.record_error(
                                 error_key=error_key,
                                 category='logs',
                                 severity='CRITICAL',
                                 reason=line[:100],  # Truncate reason for brevity
-                                details={'pattern': pattern}
+                                details={'pattern': pattern, 'dismissable': True}
                             )

                 recent_patterns[pattern] += 1

             for line in previous_lines:
-                if not line.strip() or self._is_benign_error(line):
+                if not line.strip():
                     continue

+                # Skip benign errors
+                if self._is_benign_error(line):
+                    continue
+
+                # Classify severity
                 severity = self._classify_log_severity(line)
-                if severity is None:
+                if severity is None:  # Skip informational or classified benign lines
                     continue

+                # Normalize to a pattern for grouping
                 pattern = self._normalize_log_pattern(line)
                 previous_patterns[pattern] += 1
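
Switching the error key from abs(hash(pattern)) % 10000 to an MD5 prefix matters because Python's built-in hash() for strings is salted per process (hash randomization), so the same log pattern would get a different key after every restart and could be re-recorded instead of matching its existing entry; hashlib.md5 is deterministic across runs and hosts. A quick illustration of the stable key derivation (the example pattern string is made up):

import hashlib

pattern = 'disk I/O error on /dev/sdX'
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
error_key = f'log_critical_{pattern_hash}'
print(error_key)  # 'log_critical_' + first 8 hex chars of the MD5 digest,
                  # identical on every run, unlike abs(hash(pattern)) % 10000
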
@@ -1500,7 +1589,7 @@ class HealthMonitor:
             lines = result.stdout.strip().split('\n')

             for line in lines:
-                # 'Inst' indicates a package will be installed/upgraded
+                # 'Inst ' indicates a package will be installed/upgraded
                 if line.startswith('Inst '):
                     update_count += 1
                     line_lower = line.lower()
@@ -2071,5 +2160,11 @@ class HealthMonitor:
         }
+            .now().isoformat()
+        }

 # Global instance
 health_monitor = HealthMonitor()