mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-02-18 16:36:27 +00:00
Update health monitor
This commit is contained in:
@@ -66,16 +66,22 @@ def acknowledge_error():
|
|||||||
|
|
||||||
if result.get('success'):
|
if result.get('success'):
|
||||||
# Invalidate cached health results so next fetch reflects the dismiss
|
# Invalidate cached health results so next fetch reflects the dismiss
|
||||||
# Clear category-specific caches based on the error_key prefix
|
# Use the error's category to clear the correct cache
|
||||||
if error_key.startswith('log_'):
|
category = result.get('category', '')
|
||||||
health_monitor.last_check_times.pop('system_logs', None)
|
cache_key_map = {
|
||||||
health_monitor.cached_results.pop('system_logs', None)
|
'logs': 'system_logs',
|
||||||
elif error_key.startswith('pve_service_'):
|
'pve_services': 'pve_services',
|
||||||
health_monitor.last_check_times.pop('pve_services', None)
|
'updates': 'updates_check',
|
||||||
health_monitor.cached_results.pop('pve_services', None)
|
'security': 'security_check',
|
||||||
elif error_key.startswith('updates_'):
|
'temperature': 'cpu_check',
|
||||||
health_monitor.last_check_times.pop('updates_check', None)
|
'network': 'network_check',
|
||||||
health_monitor.cached_results.pop('updates_check', None)
|
'disks': 'storage_check',
|
||||||
|
'vms': 'vms_check',
|
||||||
|
}
|
||||||
|
cache_key = cache_key_map.get(category)
|
||||||
|
if cache_key:
|
||||||
|
health_monitor.last_check_times.pop(cache_key, None)
|
||||||
|
health_monitor.cached_results.pop(cache_key, None)
|
||||||
|
|
||||||
# Determine suppression period for the response
|
# Determine suppression period for the response
|
||||||
category = result.get('category', '')
|
category = result.get('category', '')
|
||||||
|
|||||||
@@ -587,7 +587,7 @@ class HealthMonitor:
|
|||||||
|
|
||||||
# Record non-dismissable error
|
# Record non-dismissable error
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key='cpu_temp_high',
|
error_key='cpu_temperature',
|
||||||
category='temperature',
|
category='temperature',
|
||||||
severity='WARNING',
|
severity='WARNING',
|
||||||
reason=reason,
|
reason=reason,
|
||||||
@@ -597,11 +597,11 @@ class HealthMonitor:
|
|||||||
# Temperature has been ≤80°C for 30 seconds - clear the error
|
# Temperature has been ≤80°C for 30 seconds - clear the error
|
||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
health_persistence.resolve_error('cpu_temp_high', 'Temperature recovered')
|
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
|
||||||
else:
|
else:
|
||||||
# Temperature is elevated but not long enough, or recovering but not yet cleared
|
# Temperature is elevated but not long enough, or recovering but not yet cleared
|
||||||
# Check if we already have an active error
|
# Check if we already have an active error
|
||||||
if health_persistence.is_error_active('cpu_temp_high', category='temperature'):
|
if health_persistence.is_error_active('cpu_temperature', category='temperature'):
|
||||||
# Keep the warning active
|
# Keep the warning active
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'CPU temperature {max_temp}°C still elevated'
|
reason = f'CPU temperature {max_temp}°C still elevated'
|
||||||
@@ -1988,6 +1988,7 @@ class HealthMonitor:
|
|||||||
update_count = 0
|
update_count = 0
|
||||||
security_updates_packages = []
|
security_updates_packages = []
|
||||||
kernel_pve_updates_packages = []
|
kernel_pve_updates_packages = []
|
||||||
|
sec_result = None
|
||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
lines = result.stdout.strip().split('\n')
|
lines = result.stdout.strip().split('\n')
|
||||||
@@ -2012,30 +2013,34 @@ class HealthMonitor:
|
|||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'{len(security_updates_packages)} security update(s) available'
|
reason = f'{len(security_updates_packages)} security update(s) available'
|
||||||
# Record persistent error for security updates to ensure it's visible
|
# Record persistent error for security updates to ensure it's visible
|
||||||
health_persistence.record_error(
|
sec_result = health_persistence.record_error(
|
||||||
error_key='updates_security',
|
error_key='security_updates',
|
||||||
category='updates',
|
category='updates',
|
||||||
severity='WARNING',
|
severity='WARNING',
|
||||||
reason=reason,
|
reason=reason,
|
||||||
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5]}
|
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5], 'dismissable': True}
|
||||||
)
|
)
|
||||||
|
# If previously dismissed, downgrade to INFO
|
||||||
|
if sec_result and sec_result.get('type') == 'skipped_acknowledged':
|
||||||
|
status = 'INFO'
|
||||||
|
reason = None
|
||||||
elif last_update_days and last_update_days >= 548:
|
elif last_update_days and last_update_days >= 548:
|
||||||
# 18+ months without updates - CRITICAL
|
# 18+ months without updates - CRITICAL
|
||||||
status = 'CRITICAL'
|
status = 'CRITICAL'
|
||||||
reason = f'System not updated in {last_update_days} days (>18 months)'
|
reason = f'System not updated in {last_update_days} days (>18 months)'
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key='updates_548days',
|
error_key='system_age',
|
||||||
category='updates',
|
category='updates',
|
||||||
severity='CRITICAL',
|
severity='CRITICAL',
|
||||||
reason=reason,
|
reason=reason,
|
||||||
details={'days': last_update_days, 'update_count': update_count}
|
details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
|
||||||
)
|
)
|
||||||
elif last_update_days and last_update_days >= 365:
|
elif last_update_days and last_update_days >= 365:
|
||||||
# 1+ year without updates - WARNING
|
# 1+ year without updates - WARNING
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'System not updated in {last_update_days} days (>1 year)'
|
reason = f'System not updated in {last_update_days} days (>1 year)'
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key='updates_365days',
|
error_key='system_age',
|
||||||
category='updates',
|
category='updates',
|
||||||
severity='WARNING',
|
severity='WARNING',
|
||||||
reason=reason,
|
reason=reason,
|
||||||
@@ -2057,14 +2062,16 @@ class HealthMonitor:
|
|||||||
|
|
||||||
# Build checks dict for updates sub-items
|
# Build checks dict for updates sub-items
|
||||||
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK')
|
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK')
|
||||||
sec_status = 'WARNING' if security_updates_packages else 'OK'
|
sec_dismissed = security_updates_packages and sec_result and sec_result.get('type') == 'skipped_acknowledged'
|
||||||
|
sec_status = 'INFO' if sec_dismissed else ('WARNING' if security_updates_packages else 'OK')
|
||||||
kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
|
kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
|
||||||
|
|
||||||
checks = {
|
checks = {
|
||||||
'security_updates': {
|
'security_updates': {
|
||||||
'status': sec_status,
|
'status': sec_status,
|
||||||
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
|
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
|
||||||
'dismissable': True if sec_status != 'OK' else False
|
'dismissable': True if security_updates_packages and not sec_dismissed else False,
|
||||||
|
'dismissed': bool(sec_dismissed)
|
||||||
},
|
},
|
||||||
'system_age': {
|
'system_age': {
|
||||||
'status': update_age_status,
|
'status': update_age_status,
|
||||||
@@ -2206,7 +2213,7 @@ class HealthMonitor:
|
|||||||
|
|
||||||
# Record in persistence (dismissable)
|
# Record in persistence (dismissable)
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key='security_fail2ban_ban',
|
error_key='fail2ban',
|
||||||
category='security',
|
category='security',
|
||||||
severity='WARNING',
|
severity='WARNING',
|
||||||
reason=msg,
|
reason=msg,
|
||||||
@@ -2220,8 +2227,8 @@ class HealthMonitor:
|
|||||||
else:
|
else:
|
||||||
result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
|
result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
|
||||||
# Auto-resolve if previously banned IPs are now gone
|
# Auto-resolve if previously banned IPs are now gone
|
||||||
if health_persistence.is_error_active('security_fail2ban_ban'):
|
if health_persistence.is_error_active('fail2ban'):
|
||||||
health_persistence.clear_error('security_fail2ban_ban')
|
health_persistence.clear_error('fail2ban')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}'
|
result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}'
|
||||||
|
|||||||
Reference in New Issue
Block a user