Update health monitor

MacRimi committed 2026-02-16 18:19:29 +01:00
parent 2ee5be7402
commit 1ed8f5d124
3 changed files with 204 additions and 91 deletions


@@ -65,6 +65,18 @@ def acknowledge_error():
result = health_persistence.acknowledge_error(error_key)
if result.get('success'):
+ # Invalidate cached health results so next fetch reflects the dismiss
+ # Clear category-specific caches based on the error_key prefix
+ if error_key.startswith('log_'):
+ health_monitor.last_check_times.pop('system_logs', None)
+ health_monitor.cached_results.pop('system_logs', None)
+ elif error_key.startswith('pve_service_'):
+ health_monitor.last_check_times.pop('pve_services', None)
+ health_monitor.cached_results.pop('pve_services', None)
+ elif error_key.startswith('updates_'):
+ health_monitor.last_check_times.pop('updates_check', None)
+ health_monitor.cached_results.pop('updates_check', None)
+ # Determine suppression period for the response
category = result.get('category', '')
if category == 'updates':
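The added branch above keys cache invalidation off the error_key prefix so the next health fetch reflects a dismissal immediately. A table-driven sketch of the same rule follows; PREFIX_TO_CACHE_KEY and invalidate_for_error_key are illustrative names and not identifiers from this commit, while last_check_times and cached_results are the two HealthMonitor dicts visible in the diff.

# Illustrative sketch only: the same prefix-to-cache mapping as the branch above,
# expressed as data. The table and helper names are hypothetical.
PREFIX_TO_CACHE_KEY = {
    'log_': 'system_logs',
    'pve_service_': 'pve_services',
    'updates_': 'updates_check',
}

def invalidate_for_error_key(health_monitor, error_key: str) -> None:
    """Drop the cached category result that corresponds to an acknowledged error."""
    for prefix, cache_key in PREFIX_TO_CACHE_KEY.items():
        if error_key.startswith(prefix):
            health_monitor.last_check_times.pop(cache_key, None)
            health_monitor.cached_results.pop(cache_key, None)
            break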


@@ -487,18 +487,14 @@ class HealthMonitor:
checks = {
'cpu_usage': {
'status': status,
- 'detail': f'{round(cpu_percent, 1)}% ({psutil.cpu_count()} cores)',
- 'value': round(cpu_percent, 1),
- 'thresholds': f'Warning >{self.CPU_WARNING}%, Critical >{self.CPU_CRITICAL}%'
+ 'detail': 'Sustained high CPU usage' if status != 'OK' else 'Normal'
}
}
if temp_status and temp_status.get('status') != 'UNKNOWN':
- temp_val = temp_status.get('value', 'N/A')
+ t_status = temp_status.get('status', 'OK')
checks['cpu_temperature'] = {
- 'status': temp_status.get('status', 'OK'),
- 'detail': f'{temp_val}°C' if isinstance(temp_val, (int, float)) else str(temp_val),
- 'value': temp_val,
- 'thresholds': 'Warning >80°C sustained >3min'
+ 'status': t_status,
+ 'detail': 'Temperature elevated' if t_status != 'OK' else 'Normal'
}
else:
checks['cpu_temperature'] = {
@@ -697,15 +693,11 @@ class HealthMonitor:
'checks': {
'ram_usage': {
'status': ram_status,
- 'detail': f'{round(mem_percent, 1)}% used ({ram_avail_gb} GB free of {ram_total_gb} GB)',
- 'value': round(mem_percent, 1),
- 'thresholds': f'Warning >{self.MEMORY_WARNING}%, Critical >90%'
+ 'detail': 'High RAM usage sustained' if ram_status != 'OK' else 'Normal'
},
'swap_usage': {
'status': swap_status,
- 'detail': f'{round(swap_percent, 1)}% used ({swap_used_gb} GB of {swap_total_gb} GB)' if swap.total > 0 else 'No swap configured',
- 'value': round(swap_percent, 1),
- 'thresholds': 'Critical when swap >20% of RAM'
+ 'detail': 'Excessive swap usage' if swap_status != 'OK' else ('Normal' if swap.total > 0 else 'No swap configured')
}
}
}
@@ -810,8 +802,16 @@ class HealthMonitor:
}
if not issues:
- # Add a summary OK entry if nothing specific
- checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Root filesystem healthy'})
+ # Add descriptive OK entries for what we monitor
+ checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'})
+ checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
+ checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
+ # Check if ZFS is present
+ if os.path.exists('/sbin/zpool') or os.path.exists('/usr/sbin/zpool'):
+ checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
+ # Check if LVM is present
+ if os.path.exists('/sbin/lvm') or os.path.exists('/usr/sbin/lvm'):
+ checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
return {'status': 'OK', 'checks': checks}
# Determine overall status
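The new ZFS and LVM entries above are only emitted when the management binaries exist at fixed paths. As a hedged aside, the same presence test could also honour PATH via shutil.which; the helper below is an illustrative alternative, not what the commit ships.

# Illustrative alternative to the os.path.exists() probes above (not part of this commit).
import shutil

def tool_available(binary: str) -> bool:
    """True if a management tool such as 'zpool' or 'lvm' is installed and on PATH."""
    return shutil.which(binary) is not None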
@@ -993,22 +993,22 @@ class HealthMonitor:
if error_count >= 3:
error_key = f'disk_{disk}'
severity = 'CRITICAL'
- reason = f'{error_count} I/O errors in 5 minutes'
- health_persistence.record_error(
- error_key=error_key,
- category='disks',
- severity=severity,
- reason=reason,
- details={'disk': disk, 'error_count': error_count, 'dismissable': True}
- )
- disk_issues[f'/dev/{disk}'] = {
- 'status': severity,
- 'reason': reason,
- 'dismissable': True
- }
- elif error_count >= 1:
+ reason = f'{error_count} I/O errors in 5 minutes'
+ health_persistence.record_error(
+ error_key=error_key,
+ category='disks',
+ severity=severity,
+ reason=reason,
+ details={'disk': disk, 'error_count': error_count, 'dismissable': False}
+ )
+ disk_details[disk] = {
+ 'status': severity,
+ 'reason': reason,
+ 'dismissable': False
+ }
+ elif error_count >= 1:
error_key = f'disk_{disk}'
severity = 'WARNING'
reason = f'{error_count} I/O error(s) in 5 minutes'
@@ -1116,14 +1116,14 @@ class HealthMonitor:
category='network',
severity='CRITICAL',
reason=alert_reason or 'Interface DOWN',
- details={'interface': interface, 'dismissable': True}
- )
- interface_details[interface] = {
- 'status': 'CRITICAL',
- 'reason': alert_reason or 'Interface DOWN',
- 'dismissable': True
- }
+ details={'interface': interface, 'dismissable': False}
+ )
+ interface_details[interface] = {
+ 'status': 'CRITICAL',
+ 'reason': alert_reason or 'Interface DOWN',
+ 'dismissable': False
+ }
else:
active_interfaces.add(interface)
if interface.startswith('vmbr') or interface.startswith(('eth', 'ens', 'enp', 'eno')):
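Both the disk and interface alerts in the two hunks above are now recorded with 'dismissable': False, so they persist until the underlying fault clears. A hypothetical consumer-side helper that separates dismissable from permanent entries might look like this (name and shape are illustrative, not from the codebase):

# Illustrative only: split a details/checks mapping by the 'dismissable' flag.
def split_by_dismissable(details: dict) -> tuple[dict, dict]:
    dismissable = {k: v for k, v in details.items() if v.get('dismissable')}
    permanent = {k: v for k, v in details.items() if not v.get('dismissable')}
    return dismissable, permanent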
@@ -1488,7 +1488,10 @@ class HealthMonitor:
}
if not issues:
- checks['all_vms_cts'] = {'status': 'OK', 'detail': 'No issues detected in logs'}
+ checks['qmp_communication'] = {'status': 'OK', 'detail': 'No QMP timeouts detected'}
+ checks['container_startup'] = {'status': 'OK', 'detail': 'No container startup errors'}
+ checks['vm_startup'] = {'status': 'OK', 'detail': 'No VM startup failures'}
+ checks['oom_killer'] = {'status': 'OK', 'detail': 'No OOM events detected'}
return {'status': 'OK', 'checks': checks}
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
@@ -1830,30 +1833,75 @@ class HealthMonitor:
status = 'OK'
reason = None
# Build checks dict for log sub-items
+ # Record/clear persistent errors for each log sub-check so Dismiss works
+ log_sub_checks = {
+ 'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING',
+ 'reason': f'{cascade_count} pattern(s) repeating >=15 times'},
+ 'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING',
+ 'reason': f'{spike_count} pattern(s) with 4x increase'},
+ 'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING',
+ 'reason': f'{persistent_count} recurring pattern(s) over 15+ min'},
+ 'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL',
+ 'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False},
+ }
+ # Track which sub-checks were dismissed
+ dismissed_keys = set()
+ for err_key, info in log_sub_checks.items():
+ if info['active']:
+ is_dismissable = info.get('dismissable', True)
+ result = health_persistence.record_error(
+ error_key=err_key,
+ category='logs',
+ severity=info['severity'],
+ reason=info['reason'],
+ details={'dismissable': is_dismissable}
+ )
+ if result and result.get('type') == 'skipped_acknowledged':
+ dismissed_keys.add(err_key)
+ elif health_persistence.is_error_active(err_key):
+ health_persistence.clear_error(err_key)
+ # Build checks dict - downgrade dismissed items to INFO
+ def _log_check_status(key, active, severity):
+ if not active:
+ return 'OK'
+ if key in dismissed_keys:
+ return 'INFO'
+ return severity
log_checks = {
- 'error_cascade': {
- 'status': 'WARNING' if cascade_count > 0 else 'OK',
+ 'log_error_cascade': {
+ 'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'),
'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
- 'dismissable': True
+ 'dismissable': True,
+ 'dismissed': 'log_error_cascade' in dismissed_keys
},
- 'error_spike': {
- 'status': 'WARNING' if spike_count > 0 else 'OK',
+ 'log_error_spike': {
+ 'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'),
'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
- 'dismissable': True
+ 'dismissable': True,
+ 'dismissed': 'log_error_spike' in dismissed_keys
},
- 'persistent_errors': {
- 'status': 'WARNING' if persistent_count > 0 else 'OK',
+ 'log_persistent_errors': {
+ 'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'),
'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
- 'dismissable': True
+ 'dismissable': True,
+ 'dismissed': 'log_persistent_errors' in dismissed_keys
},
- 'critical_errors': {
- 'status': 'CRITICAL' if unique_critical_count > 0 else 'OK',
+ 'log_critical_errors': {
+ 'status': _log_check_status('log_critical_errors', unique_critical_count > 0, 'CRITICAL'),
'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors',
- 'dismissable': True
+ 'dismissable': False
}
}
+ # Recalculate overall status considering dismissed items
+ active_issues = [k for k, v in log_checks.items() if v['status'] in ('WARNING', 'CRITICAL')]
+ if not active_issues:
+ status = 'OK'
+ reason = None
log_result = {'status': status, 'checks': log_checks}
if reason:
log_result['reason'] = reason
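The block above ties each log sub-check to a persistent error key so that a user dismissal survives cache refreshes: record_error() runs while the condition is active, a 'skipped_acknowledged' result downgrades the displayed status to INFO, and clear_error() removes the record once the condition disappears. The condensed sketch below mirrors that lifecycle with an in-memory stand-in; it is not the project's health_persistence API.

# Condensed sketch of the record / acknowledge / clear lifecycle used above.
# InMemoryPersistence is a stand-in, not the real health_persistence module.
class InMemoryPersistence:
    def __init__(self):
        self.acknowledged = {}  # error_key -> bool

    def record_error(self, error_key, **_ignored):
        if self.acknowledged.get(error_key):          # user dismissed it earlier
            return {'type': 'skipped_acknowledged'}
        self.acknowledged.setdefault(error_key, False)
        return {'type': 'recorded'}

    def acknowledge_error(self, error_key):
        self.acknowledged[error_key] = True
        return {'success': True}

    def is_error_active(self, error_key):
        return error_key in self.acknowledged

    def clear_error(self, error_key):
        self.acknowledged.pop(error_key, None)

def display_status(active: bool, dismissed: bool, severity: str) -> str:
    """Mirrors _log_check_status: OK when inactive, INFO when dismissed, else the severity."""
    if not active:
        return 'OK'
    return 'INFO' if dismissed else severity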
@@ -1864,10 +1912,10 @@ class HealthMonitor:
# If journalctl command failed or returned no data
ok_result = {'status': 'OK', 'checks': {
- 'error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
- 'error_spike': {'status': 'OK', 'detail': 'No error spikes'},
- 'persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
- 'critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
+ 'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
+ 'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'},
+ 'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
+ 'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
}}
self.cached_results[cache_key] = ok_result
self.last_check_times[cache_key] = current_time
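As with the other categories, the fallback OK result is written into cached_results and last_check_times under the same cache_key, so repeated requests within the check interval reuse it. A minimal sketch of that time-to-live pattern follows; the 60-second interval and helper name are assumptions for illustration.

import time

# Minimal sketch of the caching pattern used around the lines above (interval assumed).
CHECK_INTERVAL_SECONDS = 60

def cached_check(monitor, cache_key: str, run_check):
    """Return the cached result for cache_key if fresh, otherwise recompute and cache it."""
    now = time.time()
    fresh = now - monitor.last_check_times.get(cache_key, 0) < CHECK_INTERVAL_SECONDS
    if fresh and cache_key in monitor.cached_results:
        return monitor.cached_results[cache_key]
    result = run_check()
    monitor.cached_results[cache_key] = result
    monitor.last_check_times[cache_key] = now
    return result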
@@ -2014,11 +2062,12 @@ class HealthMonitor:
'security_updates': {
'status': sec_status,
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
+ 'dismissable': True if sec_status != 'OK' else False
},
'system_age': {
'status': update_age_status,
'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
- 'thresholds': 'Warning >365 days, Critical >548 days'
+ 'dismissable': False if update_age_status == 'CRITICAL' else True if update_age_status == 'WARNING' else False
},
'pending_updates': {
'status': 'INFO' if update_count > 50 else 'OK',
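A small readability note on the new 'dismissable' expressions above: the ternaries reduce to plain boolean comparisons. The functions below are an illustrative simplification, not a change made by this commit.

# Illustrative simplification of the ternaries added above (behaviour unchanged).
def security_updates_dismissable(sec_status: str) -> bool:
    return sec_status != 'OK'

def system_age_dismissable(update_age_status: str) -> bool:
    # Only WARNING is dismissable; CRITICAL and OK both map to False.
    return update_age_status == 'WARNING'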
@@ -2208,7 +2257,7 @@ class HealthMonitor:
if updates_data and updates_data.get('days_since_update', 9999) > 365:
msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)'
issues.append(msg)
- checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days)}
+ checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days), 'dismissable': True}
else:
checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'}
else:
@@ -2223,7 +2272,8 @@ class HealthMonitor:
cert_reason = cert_status.get('reason', '')
checks['certificates'] = {
'status': cert_sev,
- 'detail': cert_reason if cert_reason else 'Certificate valid'
+ 'detail': cert_reason if cert_reason else 'Certificate valid',
+ 'dismissable': True if cert_sev not in ['OK', 'INFO'] else False
}
if cert_sev not in ['OK', 'INFO']:
issues.append(cert_reason or 'Certificate issue')
@@ -2247,7 +2297,7 @@ class HealthMonitor:
if failed_logins > 50:
msg = f'{failed_logins} failed login attempts in 24h'
issues.append(msg)
- checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins}
+ checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
elif failed_logins > 0:
checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins}
else:
@@ -2258,8 +2308,10 @@ class HealthMonitor:
# Sub-check 4: Fail2Ban ban detection
try:
f2b = self._check_fail2ban_bans()
+ f2b_status = f2b.get('status', 'OK')
checks['fail2ban'] = {
- 'status': f2b.get('status', 'OK'),
+ 'status': f2b_status,
+ 'dismissable': True if f2b_status not in ['OK'] else False,
'detail': f2b.get('detail', ''),
'installed': f2b.get('installed', False),
'banned_count': f2b.get('banned_count', 0)
@@ -2511,10 +2563,22 @@ class HealthMonitor:
# All storages are available. We should also clear any previously recorded storage errors.
active_errors = health_persistence.get_active_errors()
for error in active_errors:
# Target errors related to storage unavailability
if error.get('category') == 'storage' and error.get('error_key', '').startswith('storage_unavailable_'):
health_persistence.clear_error(error['error_key'])
- return {'status': 'OK'}
+ # Build checks from all configured storages for descriptive display
+ available_storages = storage_status.get('available', [])
+ checks = {}
+ for st in available_storages:
+ st_name = st.get('name', 'unknown')
+ st_type = st.get('type', 'unknown')
+ checks[st_name] = {
+ 'status': 'OK',
+ 'detail': f'{st_type} storage available'
+ }
+ if not checks:
+ checks['proxmox_storages'] = {'status': 'OK', 'detail': 'All storages available'}
+ return {'status': 'OK', 'checks': checks}
storage_details = {}
for storage in unavailable_storages:
@@ -2552,10 +2616,29 @@ class HealthMonitor:
'dismissable': False
}
+ # Build checks from storage_details
+ checks = {}
+ for st_name, st_info in storage_details.items():
+ checks[st_name] = {
+ 'status': 'CRITICAL',
+ 'detail': st_info.get('reason', 'Unavailable'),
+ 'dismissable': False
+ }
+ # Also add available storages
+ available_list = storage_status.get('available', [])
+ unavail_names = {s['name'] for s in unavailable_storages}
+ for st in available_list:
+ if st.get('name') not in unavail_names and st.get('name') not in checks:
+ checks[st['name']] = {
+ 'status': 'OK',
+ 'detail': f'{st.get("type", "unknown")} storage available'
+ }
return {
'status': 'CRITICAL',
'reason': f'{len(unavailable_storages)} Proxmox storage(s) unavailable',
- 'details': storage_details
+ 'details': storage_details,
+ 'checks': checks
}
except Exception as e: