Update health monitor

This commit is contained in:
MacRimi
2026-02-16 22:26:43 +01:00
parent 0f81f45c5f
commit a1d48a28e9
2 changed files with 37 additions and 24 deletions

View File

@@ -66,16 +66,22 @@ def acknowledge_error():
if result.get('success'): if result.get('success'):
# Invalidate cached health results so next fetch reflects the dismiss # Invalidate cached health results so next fetch reflects the dismiss
# Clear category-specific caches based on the error_key prefix # Use the error's category to clear the correct cache
if error_key.startswith('log_'): category = result.get('category', '')
health_monitor.last_check_times.pop('system_logs', None) cache_key_map = {
health_monitor.cached_results.pop('system_logs', None) 'logs': 'system_logs',
elif error_key.startswith('pve_service_'): 'pve_services': 'pve_services',
health_monitor.last_check_times.pop('pve_services', None) 'updates': 'updates_check',
health_monitor.cached_results.pop('pve_services', None) 'security': 'security_check',
elif error_key.startswith('updates_'): 'temperature': 'cpu_check',
health_monitor.last_check_times.pop('updates_check', None) 'network': 'network_check',
health_monitor.cached_results.pop('updates_check', None) 'disks': 'storage_check',
'vms': 'vms_check',
}
cache_key = cache_key_map.get(category)
if cache_key:
health_monitor.last_check_times.pop(cache_key, None)
health_monitor.cached_results.pop(cache_key, None)
# Determine suppression period for the response # Determine suppression period for the response
category = result.get('category', '') category = result.get('category', '')

View File

@@ -587,7 +587,7 @@ class HealthMonitor:
# Record non-dismissable error # Record non-dismissable error
health_persistence.record_error( health_persistence.record_error(
error_key='cpu_temp_high', error_key='cpu_temperature',
category='temperature', category='temperature',
severity='WARNING', severity='WARNING',
reason=reason, reason=reason,
@@ -597,11 +597,11 @@ class HealthMonitor:
# Temperature has been ≤80°C for 30 seconds - clear the error # Temperature has been ≤80°C for 30 seconds - clear the error
status = 'OK' status = 'OK'
reason = None reason = None
health_persistence.resolve_error('cpu_temp_high', 'Temperature recovered') health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
else: else:
# Temperature is elevated but not long enough, or recovering but not yet cleared # Temperature is elevated but not long enough, or recovering but not yet cleared
# Check if we already have an active error # Check if we already have an active error
if health_persistence.is_error_active('cpu_temp_high', category='temperature'): if health_persistence.is_error_active('cpu_temperature', category='temperature'):
# Keep the warning active # Keep the warning active
status = 'WARNING' status = 'WARNING'
reason = f'CPU temperature {max_temp}°C still elevated' reason = f'CPU temperature {max_temp}°C still elevated'
@@ -1988,6 +1988,7 @@ class HealthMonitor:
update_count = 0 update_count = 0
security_updates_packages = [] security_updates_packages = []
kernel_pve_updates_packages = [] kernel_pve_updates_packages = []
sec_result = None
if result.returncode == 0: if result.returncode == 0:
lines = result.stdout.strip().split('\n') lines = result.stdout.strip().split('\n')
@@ -2012,30 +2013,34 @@ class HealthMonitor:
status = 'WARNING' status = 'WARNING'
reason = f'{len(security_updates_packages)} security update(s) available' reason = f'{len(security_updates_packages)} security update(s) available'
# Record persistent error for security updates to ensure it's visible # Record persistent error for security updates to ensure it's visible
health_persistence.record_error( sec_result = health_persistence.record_error(
error_key='updates_security', error_key='security_updates',
category='updates', category='updates',
severity='WARNING', severity='WARNING',
reason=reason, reason=reason,
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5]} details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5], 'dismissable': True}
) )
# If previously dismissed, downgrade to INFO
if sec_result and sec_result.get('type') == 'skipped_acknowledged':
status = 'INFO'
reason = None
elif last_update_days and last_update_days >= 548: elif last_update_days and last_update_days >= 548:
# 18+ months without updates - CRITICAL # 18+ months without updates - CRITICAL
status = 'CRITICAL' status = 'CRITICAL'
reason = f'System not updated in {last_update_days} days (>18 months)' reason = f'System not updated in {last_update_days} days (>18 months)'
health_persistence.record_error( health_persistence.record_error(
error_key='updates_548days', error_key='system_age',
category='updates', category='updates',
severity='CRITICAL', severity='CRITICAL',
reason=reason, reason=reason,
details={'days': last_update_days, 'update_count': update_count} details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
) )
elif last_update_days and last_update_days >= 365: elif last_update_days and last_update_days >= 365:
# 1+ year without updates - WARNING # 1+ year without updates - WARNING
status = 'WARNING' status = 'WARNING'
reason = f'System not updated in {last_update_days} days (>1 year)' reason = f'System not updated in {last_update_days} days (>1 year)'
health_persistence.record_error( health_persistence.record_error(
error_key='updates_365days', error_key='system_age',
category='updates', category='updates',
severity='WARNING', severity='WARNING',
reason=reason, reason=reason,
@@ -2057,14 +2062,16 @@ class HealthMonitor:
# Build checks dict for updates sub-items # Build checks dict for updates sub-items
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK') update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK')
sec_status = 'WARNING' if security_updates_packages else 'OK' sec_dismissed = security_updates_packages and sec_result and sec_result.get('type') == 'skipped_acknowledged'
sec_status = 'INFO' if sec_dismissed else ('WARNING' if security_updates_packages else 'OK')
kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK' kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
checks = { checks = {
'security_updates': { 'security_updates': {
'status': sec_status, 'status': sec_status,
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending', 'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
'dismissable': True if sec_status != 'OK' else False 'dismissable': True if security_updates_packages and not sec_dismissed else False,
'dismissed': bool(sec_dismissed)
}, },
'system_age': { 'system_age': {
'status': update_age_status, 'status': update_age_status,
@@ -2206,7 +2213,7 @@ class HealthMonitor:
# Record in persistence (dismissable) # Record in persistence (dismissable)
health_persistence.record_error( health_persistence.record_error(
error_key='security_fail2ban_ban', error_key='fail2ban',
category='security', category='security',
severity='WARNING', severity='WARNING',
reason=msg, reason=msg,
@@ -2220,8 +2227,8 @@ class HealthMonitor:
else: else:
result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)' result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
# Auto-resolve if previously banned IPs are now gone # Auto-resolve if previously banned IPs are now gone
if health_persistence.is_error_active('security_fail2ban_ban'): if health_persistence.is_error_active('fail2ban'):
health_persistence.clear_error('security_fail2ban_ban') health_persistence.clear_error('fail2ban')
except Exception as e: except Exception as e:
result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}' result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}'