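Update health monitor: clear caches by error category, unify persisted error keys, and downgrade dismissed security updates to INFO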

This commit is contained in:
MacRimi
2026-02-16 22:26:43 +01:00
parent 0f81f45c5f
commit a1d48a28e9
2 changed files with 37 additions and 24 deletions

View File

@@ -66,16 +66,22 @@ def acknowledge_error():
if result.get('success'):
# Invalidate cached health results so next fetch reflects the dismiss
# Clear category-specific caches based on the error_key prefix
if error_key.startswith('log_'):
health_monitor.last_check_times.pop('system_logs', None)
health_monitor.cached_results.pop('system_logs', None)
elif error_key.startswith('pve_service_'):
health_monitor.last_check_times.pop('pve_services', None)
health_monitor.cached_results.pop('pve_services', None)
elif error_key.startswith('updates_'):
health_monitor.last_check_times.pop('updates_check', None)
health_monitor.cached_results.pop('updates_check', None)
# Use the error's category to clear the correct cache
category = result.get('category', '')
cache_key_map = {
'logs': 'system_logs',
'pve_services': 'pve_services',
'updates': 'updates_check',
'security': 'security_check',
'temperature': 'cpu_check',
'network': 'network_check',
'disks': 'storage_check',
'vms': 'vms_check',
}
cache_key = cache_key_map.get(category)
if cache_key:
health_monitor.last_check_times.pop(cache_key, None)
health_monitor.cached_results.pop(cache_key, None)
# Determine suppression period for the response
category = result.get('category', '')

View File

@@ -587,7 +587,7 @@ class HealthMonitor:
# Record non-dismissable error
health_persistence.record_error(
error_key='cpu_temp_high',
error_key='cpu_temperature',
category='temperature',
severity='WARNING',
reason=reason,
@@ -597,11 +597,11 @@ class HealthMonitor:
# Temperature has been ≤80°C for 30 seconds - clear the error
status = 'OK'
reason = None
health_persistence.resolve_error('cpu_temp_high', 'Temperature recovered')
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
else:
# Temperature is elevated but not long enough, or recovering but not yet cleared
# Check if we already have an active error
if health_persistence.is_error_active('cpu_temp_high', category='temperature'):
if health_persistence.is_error_active('cpu_temperature', category='temperature'):
# Keep the warning active
status = 'WARNING'
reason = f'CPU temperature {max_temp}°C still elevated'
@@ -1988,6 +1988,7 @@ class HealthMonitor:
update_count = 0
security_updates_packages = []
kernel_pve_updates_packages = []
sec_result = None
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
@@ -2012,30 +2013,34 @@ class HealthMonitor:
status = 'WARNING'
reason = f'{len(security_updates_packages)} security update(s) available'
# Record persistent error for security updates to ensure it's visible
health_persistence.record_error(
error_key='updates_security',
sec_result = health_persistence.record_error(
error_key='security_updates',
category='updates',
severity='WARNING',
reason=reason,
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5]}
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5], 'dismissable': True}
)
# If previously dismissed, downgrade to INFO
if sec_result and sec_result.get('type') == 'skipped_acknowledged':
status = 'INFO'
reason = None
elif last_update_days and last_update_days >= 548:
# 18+ months without updates - CRITICAL
status = 'CRITICAL'
reason = f'System not updated in {last_update_days} days (>18 months)'
health_persistence.record_error(
error_key='updates_548days',
error_key='system_age',
category='updates',
severity='CRITICAL',
reason=reason,
details={'days': last_update_days, 'update_count': update_count}
details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
)
elif last_update_days and last_update_days >= 365:
# 1+ year without updates - WARNING
status = 'WARNING'
reason = f'System not updated in {last_update_days} days (>1 year)'
health_persistence.record_error(
error_key='updates_365days',
error_key='system_age',
category='updates',
severity='WARNING',
reason=reason,
@@ -2057,14 +2062,16 @@ class HealthMonitor:
# Build checks dict for updates sub-items
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK')
sec_status = 'WARNING' if security_updates_packages else 'OK'
sec_dismissed = security_updates_packages and sec_result and sec_result.get('type') == 'skipped_acknowledged'
sec_status = 'INFO' if sec_dismissed else ('WARNING' if security_updates_packages else 'OK')
kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
checks = {
'security_updates': {
'status': sec_status,
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
'dismissable': True if sec_status != 'OK' else False
'dismissable': True if security_updates_packages and not sec_dismissed else False,
'dismissed': bool(sec_dismissed)
},
'system_age': {
'status': update_age_status,
@@ -2206,7 +2213,7 @@ class HealthMonitor:
# Record in persistence (dismissable)
health_persistence.record_error(
error_key='security_fail2ban_ban',
error_key='fail2ban',
category='security',
severity='WARNING',
reason=msg,
@@ -2220,8 +2227,8 @@ class HealthMonitor:
else:
result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
# Auto-resolve if previously banned IPs are now gone
if health_persistence.is_error_active('security_fail2ban_ban'):
health_persistence.clear_error('security_fail2ban_ban')
if health_persistence.is_error_active('fail2ban'):
health_persistence.clear_error('fail2ban')
except Exception as e:
result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}'