Update health monitor

MacRimi
2026-02-16 18:19:29 +01:00
parent 2ee5be7402
commit 1ed8f5d124
3 changed files with 204 additions and 91 deletions

View File

@@ -300,27 +300,45 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
   const formatCheckLabel = (key: string): string => {
     const labels: Record<string, string> = {
+      // CPU
       cpu_usage: "CPU Usage",
       cpu_temperature: "Temperature",
+      // Memory
       ram_usage: "RAM Usage",
       swap_usage: "Swap Usage",
+      // Disk I/O
       root_filesystem: "Root Filesystem",
+      smart_health: "SMART Health",
+      io_errors: "I/O Errors",
+      zfs_pools: "ZFS Pools",
+      lvm_volumes: "LVM Volumes",
       lvm_check: "LVM Status",
+      // Network
       connectivity: "Connectivity",
-      all_vms_cts: "VMs & Containers",
+      // VMs & CTs
+      qmp_communication: "QMP Communication",
+      container_startup: "Container Startup",
+      vm_startup: "VM Startup",
+      oom_killer: "OOM Killer",
+      // Services
       cluster_mode: "Cluster Mode",
-      error_cascade: "Error Cascade",
-      error_spike: "Error Spike",
-      persistent_errors: "Persistent Errors",
-      critical_errors: "Critical Errors",
+      // Logs (prefixed with log_)
+      log_error_cascade: "Error Cascade",
+      log_error_spike: "Error Spike",
+      log_persistent_errors: "Persistent Errors",
+      log_critical_errors: "Critical Errors",
+      // Updates
       security_updates: "Security Updates",
       system_age: "System Age",
       pending_updates: "Pending Updates",
       kernel_pve: "Kernel / PVE",
+      // Security
       uptime: "Uptime",
       certificates: "Certificates",
       login_attempts: "Login Attempts",
       fail2ban: "Fail2Ban",
+      // Storage (Proxmox)
+      proxmox_storages: "Proxmox Storages",
     }
     if (labels[key]) return labels[key]
     // Convert snake_case or camelCase to Title Case
@@ -331,7 +349,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
   }
   const renderChecks = (
-    checks: Record<string, { status: string; detail: string; dismissable?: boolean; thresholds?: string; [key: string]: any }>,
+    checks: Record<string, { status: string; detail: string; dismissable?: boolean; [key: string]: any }>,
     categoryKey: string
   ) => {
     if (!checks || Object.keys(checks).length === 0) return null
@@ -347,18 +365,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
                 key={checkKey}
                 className="flex items-center justify-between gap-2 text-xs py-1.5 px-3 rounded-md hover:bg-muted/40 transition-colors"
               >
-                <div className="flex items-center gap-2 min-w-0 flex-1">
+                <div className="flex items-center gap-2 min-w-0 flex-1 overflow-hidden">
                   {getStatusIcon(checkData.status, "sm")}
                   <span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
-                  <span className="text-muted-foreground truncate">{checkData.detail}</span>
+                  <span className="text-muted-foreground truncate block">{checkData.detail}</span>
+                  {checkData.dismissed && (
+                    <Badge variant="outline" className="text-[9px] px-1.5 py-0 h-4 shrink-0 text-blue-400 border-blue-400/30">
+                      Dismissed
+                    </Badge>
+                  )}
                 </div>
                 <div className="flex items-center gap-1.5 shrink-0">
-                  {checkData.thresholds && (
-                    <span className="text-[10px] text-muted-foreground/60 hidden sm:inline">
-                      ({checkData.thresholds})
-                    </span>
-                  )}
-                  {(checkStatus === "WARNING" || checkStatus === "CRITICAL") && isDismissable && (
+                  {(checkStatus === "WARNING" || checkStatus === "CRITICAL") && isDismissable && !checkData.dismissed && (
                     <Button
                       size="sm"
                       variant="outline"
@@ -391,7 +409,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
   return (
     <Dialog open={open} onOpenChange={onOpenChange}>
-      <DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
+      <DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto overflow-x-hidden">
        <DialogHeader>
          <div className="flex items-center justify-between gap-3">
            <DialogTitle className="flex items-center gap-2 flex-1">
@@ -453,8 +471,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
          </div>
          {healthData.summary && healthData.summary !== "All systems operational" && (
-           <div className="text-sm p-3 rounded-lg bg-muted/20 border">
-             <span className="font-medium text-foreground">{healthData.summary}</span>
+           <div className="text-sm p-3 rounded-lg bg-muted/20 border overflow-hidden">
+             <p className="font-medium text-foreground truncate">{healthData.summary}</p>
            </div>
          )}
@@ -475,18 +493,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
            >
              {/* Clickable header row */}
              <div
-               className="flex items-center gap-3 p-3 cursor-pointer select-none"
+               className="flex items-center gap-3 p-3 cursor-pointer select-none overflow-hidden"
                onClick={() => toggleCategory(key)}
              >
-               <div className="flex-shrink-0 flex items-center gap-2">
+               <div className="shrink-0 flex items-center gap-2">
                  <Icon className="h-4 w-4 text-blue-500" />
                  {getStatusIcon(status)}
                </div>
-               <div className="flex-1 min-w-0">
+               <div className="flex-1 min-w-0 overflow-hidden">
                  <div className="flex items-center gap-2">
-                   <p className="font-medium text-sm">{label}</p>
+                   <p className="font-medium text-sm truncate">{label}</p>
                    {hasChecks && (
-                     <span className="text-[10px] text-muted-foreground">
+                     <span className="text-[10px] text-muted-foreground shrink-0">
                        ({Object.keys(checks).length} checks)
                      </span>
                    )}
@@ -509,9 +527,9 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
              {/* Expandable checks section */}
              {isExpanded && (
-               <div className="border-t border-border/50 bg-muted/5 px-2 py-1.5">
+               <div className="border-t border-border/50 bg-muted/5 px-2 py-1.5 overflow-hidden">
                  {reason && (
-                   <p className="text-xs text-muted-foreground px-3 py-1.5 mb-1">{reason}</p>
+                   <p className="text-xs text-muted-foreground px-3 py-1.5 mb-1 break-words">{reason}</p>
                  )}
                  {hasChecks ? (
                    renderChecks(checks, key)
@@ -546,7 +564,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
                </div>
                <div className="flex-1 min-w-0">
                  <div className="flex items-center justify-between gap-2 mb-1">
-                   <p className="font-medium text-sm text-muted-foreground">{item.reason}</p>
+                   <p className="font-medium text-sm text-muted-foreground truncate">{item.reason}</p>
                    <div className="flex items-center gap-1.5 shrink-0">
                      <Badge variant="outline" className="text-xs border-blue-500/50 text-blue-500/70 bg-transparent">
                        Dismissed

View File

@@ -65,6 +65,18 @@ def acknowledge_error():
    result = health_persistence.acknowledge_error(error_key)
    if result.get('success'):
+       # Invalidate cached health results so next fetch reflects the dismiss
+       # Clear category-specific caches based on the error_key prefix
+       if error_key.startswith('log_'):
+           health_monitor.last_check_times.pop('system_logs', None)
+           health_monitor.cached_results.pop('system_logs', None)
+       elif error_key.startswith('pve_service_'):
+           health_monitor.last_check_times.pop('pve_services', None)
+           health_monitor.cached_results.pop('pve_services', None)
+       elif error_key.startswith('updates_'):
+           health_monitor.last_check_times.pop('updates_check', None)
+           health_monitor.cached_results.pop('updates_check', None)
        # Determine suppression period for the response
        category = result.get('category', '')
        if category == 'updates':
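For context, a minimal sketch of the prefix-based cache invalidation this hunk adds, assuming the monitor keeps its per-category results in plain dicts; the function and mapping names below are illustrative, not the project's API:

# Illustrative sketch: map an error_key prefix to the category cache it must expire.
PREFIX_TO_CACHE = {
    'log_': 'system_logs',
    'pve_service_': 'pve_services',
    'updates_': 'updates_check',
}

def invalidate_for(error_key: str, cached_results: dict, last_check_times: dict) -> None:
    # Drop both the cached result and its timestamp so the next poll recomputes.
    for prefix, cache_key in PREFIX_TO_CACHE.items():
        if error_key.startswith(prefix):
            cached_results.pop(cache_key, None)
            last_check_times.pop(cache_key, None)
            break

# e.g. invalidate_for('log_error_spike', monitor.cached_results, monitor.last_check_times)
# forces the logs category to be rechecked, so the dismissed item is reflected immediately.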

View File

@@ -487,18 +487,14 @@ class HealthMonitor:
        checks = {
            'cpu_usage': {
                'status': status,
-               'detail': f'{round(cpu_percent, 1)}% ({psutil.cpu_count()} cores)',
-               'value': round(cpu_percent, 1),
-               'thresholds': f'Warning >{self.CPU_WARNING}%, Critical >{self.CPU_CRITICAL}%'
+               'detail': 'Sustained high CPU usage' if status != 'OK' else 'Normal'
            }
        }
        if temp_status and temp_status.get('status') != 'UNKNOWN':
-           temp_val = temp_status.get('value', 'N/A')
+           t_status = temp_status.get('status', 'OK')
            checks['cpu_temperature'] = {
-               'status': temp_status.get('status', 'OK'),
-               'detail': f'{temp_val}°C' if isinstance(temp_val, (int, float)) else str(temp_val),
-               'value': temp_val,
-               'thresholds': 'Warning >80°C sustained >3min'
+               'status': t_status,
+               'detail': 'Temperature elevated' if t_status != 'OK' else 'Normal'
            }
        else:
            checks['cpu_temperature'] = {
@@ -697,15 +693,11 @@ class HealthMonitor:
            'checks': {
                'ram_usage': {
                    'status': ram_status,
-                   'detail': f'{round(mem_percent, 1)}% used ({ram_avail_gb} GB free of {ram_total_gb} GB)',
-                   'value': round(mem_percent, 1),
-                   'thresholds': f'Warning >{self.MEMORY_WARNING}%, Critical >90%'
+                   'detail': 'High RAM usage sustained' if ram_status != 'OK' else 'Normal'
                },
                'swap_usage': {
                    'status': swap_status,
-                   'detail': f'{round(swap_percent, 1)}% used ({swap_used_gb} GB of {swap_total_gb} GB)' if swap.total > 0 else 'No swap configured',
-                   'value': round(swap_percent, 1),
-                   'thresholds': 'Critical when swap >20% of RAM'
+                   'detail': 'Excessive swap usage' if swap_status != 'OK' else ('Normal' if swap.total > 0 else 'No swap configured')
                }
            }
        }
@@ -810,8 +802,16 @@ class HealthMonitor:
            }
        if not issues:
-           # Add a summary OK entry if nothing specific
-           checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Root filesystem healthy'})
+           # Add descriptive OK entries for what we monitor
+           checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'})
+           checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
+           checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
+           # Check if ZFS is present
+           if os.path.exists('/sbin/zpool') or os.path.exists('/usr/sbin/zpool'):
+               checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
+           # Check if LVM is present
+           if os.path.exists('/sbin/lvm') or os.path.exists('/usr/sbin/lvm'):
+               checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
            return {'status': 'OK', 'checks': checks}
        # Determine overall status
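The new OK entries gate ZFS and LVM on hard-coded /sbin and /usr/sbin paths. A path-agnostic variant, as a sketch only (assumption: finding the CLI tool on PATH is an acceptable proxy for "subsystem present"):

import shutil

def optional_storage_ok_checks() -> dict:
    # Emit OK entries only for subsystems whose tools are actually installed.
    checks = {}
    if shutil.which('zpool'):   # searches PATH instead of two fixed locations
        checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
    if shutil.which('lvm'):
        checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
    return checks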
@@ -993,22 +993,22 @@ class HealthMonitor:
                    if error_count >= 3:
                        error_key = f'disk_{disk}'
                        severity = 'CRITICAL'
                        reason = f'{error_count} I/O errors in 5 minutes'
                        health_persistence.record_error(
                            error_key=error_key,
                            category='disks',
                            severity=severity,
                            reason=reason,
-                           details={'disk': disk, 'error_count': error_count, 'dismissable': True}
+                           details={'disk': disk, 'error_count': error_count, 'dismissable': False}
                        )
-                       disk_issues[f'/dev/{disk}'] = {
+                       disk_details[disk] = {
                            'status': severity,
                            'reason': reason,
-                           'dismissable': True
+                           'dismissable': False
                        }
                    elif error_count >= 1:
                        error_key = f'disk_{disk}'
                        severity = 'WARNING'
                        reason = f'{error_count} I/O error(s) in 5 minutes'
@@ -1116,14 +1116,14 @@ class HealthMonitor:
                        category='network',
                        severity='CRITICAL',
                        reason=alert_reason or 'Interface DOWN',
-                       details={'interface': interface, 'dismissable': True}
+                       details={'interface': interface, 'dismissable': False}
                    )
                    interface_details[interface] = {
                        'status': 'CRITICAL',
                        'reason': alert_reason or 'Interface DOWN',
-                       'dismissable': True
+                       'dismissable': False
                    }
                else:
                    active_interfaces.add(interface)
                    if interface.startswith('vmbr') or interface.startswith(('eth', 'ens', 'enp', 'eno')):
@@ -1488,7 +1488,10 @@ class HealthMonitor:
            }
        if not issues:
-           checks['all_vms_cts'] = {'status': 'OK', 'detail': 'No issues detected in logs'}
+           checks['qmp_communication'] = {'status': 'OK', 'detail': 'No QMP timeouts detected'}
+           checks['container_startup'] = {'status': 'OK', 'detail': 'No container startup errors'}
+           checks['vm_startup'] = {'status': 'OK', 'detail': 'No VM startup failures'}
+           checks['oom_killer'] = {'status': 'OK', 'detail': 'No OOM events detected'}
            return {'status': 'OK', 'checks': checks}
        has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
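The per-subsystem OK keys emitted here must stay in sync with the frontend's formatCheckLabel map above; a throwaway consistency check, with both sets written out by hand purely for illustration (in the repo these lists live in different files):

BACKEND_VM_CHECK_KEYS = {'qmp_communication', 'container_startup', 'vm_startup', 'oom_killer'}
FRONTEND_LABELED_KEYS = {'qmp_communication', 'container_startup', 'vm_startup', 'oom_killer'}

missing = BACKEND_VM_CHECK_KEYS - FRONTEND_LABELED_KEYS
assert not missing, f'formatCheckLabel is missing labels for: {sorted(missing)}'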
@@ -1830,30 +1833,75 @@ class HealthMonitor:
                status = 'OK'
                reason = None
-           # Build checks dict for log sub-items
+           # Record/clear persistent errors for each log sub-check so Dismiss works
+           log_sub_checks = {
+               'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING',
+                                     'reason': f'{cascade_count} pattern(s) repeating >=15 times'},
+               'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING',
+                                   'reason': f'{spike_count} pattern(s) with 4x increase'},
+               'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING',
+                                         'reason': f'{persistent_count} recurring pattern(s) over 15+ min'},
+               'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL',
+                                       'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False},
+           }
+           # Track which sub-checks were dismissed
+           dismissed_keys = set()
+           for err_key, info in log_sub_checks.items():
+               if info['active']:
+                   is_dismissable = info.get('dismissable', True)
+                   result = health_persistence.record_error(
+                       error_key=err_key,
+                       category='logs',
+                       severity=info['severity'],
+                       reason=info['reason'],
+                       details={'dismissable': is_dismissable}
+                   )
+                   if result and result.get('type') == 'skipped_acknowledged':
+                       dismissed_keys.add(err_key)
+               elif health_persistence.is_error_active(err_key):
+                   health_persistence.clear_error(err_key)
+           # Build checks dict - downgrade dismissed items to INFO
+           def _log_check_status(key, active, severity):
+               if not active:
+                   return 'OK'
+               if key in dismissed_keys:
+                   return 'INFO'
+               return severity
            log_checks = {
-               'error_cascade': {
-                   'status': 'WARNING' if cascade_count > 0 else 'OK',
+               'log_error_cascade': {
+                   'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'),
                    'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
-                   'dismissable': True
+                   'dismissable': True,
+                   'dismissed': 'log_error_cascade' in dismissed_keys
                },
-               'error_spike': {
-                   'status': 'WARNING' if spike_count > 0 else 'OK',
+               'log_error_spike': {
+                   'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'),
                    'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
-                   'dismissable': True
+                   'dismissable': True,
+                   'dismissed': 'log_error_spike' in dismissed_keys
                },
-               'persistent_errors': {
-                   'status': 'WARNING' if persistent_count > 0 else 'OK',
+               'log_persistent_errors': {
+                   'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'),
                    'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
-                   'dismissable': True
+                   'dismissable': True,
+                   'dismissed': 'log_persistent_errors' in dismissed_keys
                },
-               'critical_errors': {
-                   'status': 'CRITICAL' if unique_critical_count > 0 else 'OK',
+               'log_critical_errors': {
+                   'status': _log_check_status('log_critical_errors', unique_critical_count > 0, 'CRITICAL'),
                    'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors',
-                   'dismissable': True
+                   'dismissable': False
                }
            }
+           # Recalculate overall status considering dismissed items
+           active_issues = [k for k, v in log_checks.items() if v['status'] in ('WARNING', 'CRITICAL')]
+           if not active_issues:
+               status = 'OK'
+               reason = None
            log_result = {'status': status, 'checks': log_checks}
            if reason:
                log_result['reason'] = reason
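A compact sketch of the dismiss lifecycle the block above depends on, with a stubbed persistence layer; the stub is illustrative only, the real health_persistence is backed by its own store:

class StubPersistence:
    # Minimal stand-in for health_persistence, just enough to show the contract.
    def __init__(self):
        self.active = {}            # error_key -> record
        self.acknowledged = set()   # error_keys the user dismissed

    def record_error(self, error_key, category, severity, reason, details):
        if error_key in self.acknowledged:
            return {'type': 'skipped_acknowledged'}  # dismissed: do not resurface
        self.active[error_key] = {'category': category, 'severity': severity,
                                  'reason': reason, 'details': details}
        return {'type': 'recorded'}

    def is_error_active(self, error_key):
        return error_key in self.active

    def clear_error(self, error_key):
        self.active.pop(error_key, None)

A check that still fires but was previously dismissed comes back as 'skipped_acknowledged', which the monitor turns into status INFO plus dismissed=True instead of a WARNING.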
@@ -1864,10 +1912,10 @@ class HealthMonitor:
            # If journalctl command failed or returned no data
            ok_result = {'status': 'OK', 'checks': {
-               'error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
-               'error_spike': {'status': 'OK', 'detail': 'No error spikes'},
-               'persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
-               'critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
+               'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
+               'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'},
+               'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
+               'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
            }}
            self.cached_results[cache_key] = ok_result
            self.last_check_times[cache_key] = current_time
@@ -2014,11 +2062,12 @@ class HealthMonitor:
                'security_updates': {
                    'status': sec_status,
                    'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
+                   'dismissable': True if sec_status != 'OK' else False
                },
                'system_age': {
                    'status': update_age_status,
                    'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
-                   'thresholds': 'Warning >365 days, Critical >548 days'
+                   'dismissable': False if update_age_status == 'CRITICAL' else True if update_age_status == 'WARNING' else False
                },
                'pending_updates': {
                    'status': 'INFO' if update_count > 50 else 'OK',
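The chained conditional on system_age ("False if CRITICAL else True if WARNING else False") reduces to "dismissable only when WARNING". An equivalent helper, as a readability sketch with an illustrative name:

def system_age_dismissable(status: str) -> bool:
    # Policy from the diff: WARNING may be dismissed; CRITICAL and OK may not.
    return status == 'WARNING'

assert system_age_dismissable('WARNING')
assert not system_age_dismissable('CRITICAL')
assert not system_age_dismissable('OK')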
@@ -2208,7 +2257,7 @@ class HealthMonitor:
                if updates_data and updates_data.get('days_since_update', 9999) > 365:
                    msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)'
                    issues.append(msg)
-                   checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days)}
+                   checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days), 'dismissable': True}
                else:
                    checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'}
            else:
@@ -2223,7 +2272,8 @@ class HealthMonitor:
                cert_reason = cert_status.get('reason', '')
                checks['certificates'] = {
                    'status': cert_sev,
-                   'detail': cert_reason if cert_reason else 'Certificate valid'
+                   'detail': cert_reason if cert_reason else 'Certificate valid',
+                   'dismissable': True if cert_sev not in ['OK', 'INFO'] else False
                }
                if cert_sev not in ['OK', 'INFO']:
                    issues.append(cert_reason or 'Certificate issue')
@@ -2247,7 +2297,7 @@ class HealthMonitor:
                if failed_logins > 50:
                    msg = f'{failed_logins} failed login attempts in 24h'
                    issues.append(msg)
-                   checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins}
+                   checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
                elif failed_logins > 0:
                    checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins}
                else:
@@ -2258,8 +2308,10 @@ class HealthMonitor:
            # Sub-check 4: Fail2Ban ban detection
            try:
                f2b = self._check_fail2ban_bans()
+               f2b_status = f2b.get('status', 'OK')
                checks['fail2ban'] = {
-                   'status': f2b.get('status', 'OK'),
+                   'status': f2b_status,
+                   'dismissable': True if f2b_status not in ['OK'] else False,
                    'detail': f2b.get('detail', ''),
                    'installed': f2b.get('installed', False),
                    'banned_count': f2b.get('banned_count', 0)
@@ -2511,10 +2563,22 @@ class HealthMonitor:
                # All storages are available. We should also clear any previously recorded storage errors.
                active_errors = health_persistence.get_active_errors()
                for error in active_errors:
+                   # Target errors related to storage unavailability
                    if error.get('category') == 'storage' and error.get('error_key', '').startswith('storage_unavailable_'):
                        health_persistence.clear_error(error['error_key'])
-               return {'status': 'OK'}
+               # Build checks from all configured storages for descriptive display
+               available_storages = storage_status.get('available', [])
+               checks = {}
+               for st in available_storages:
+                   st_name = st.get('name', 'unknown')
+                   st_type = st.get('type', 'unknown')
+                   checks[st_name] = {
+                       'status': 'OK',
+                       'detail': f'{st_type} storage available'
+                   }
+               if not checks:
+                   checks['proxmox_storages'] = {'status': 'OK', 'detail': 'All storages available'}
+               return {'status': 'OK', 'checks': checks}
            storage_details = {}
            for storage in unavailable_storages:
@@ -2552,10 +2616,29 @@ class HealthMonitor:
                    'dismissable': False
                }
+           # Build checks from storage_details
+           checks = {}
+           for st_name, st_info in storage_details.items():
+               checks[st_name] = {
+                   'status': 'CRITICAL',
+                   'detail': st_info.get('reason', 'Unavailable'),
+                   'dismissable': False
+               }
+           # Also add available storages
+           available_list = storage_status.get('available', [])
+           unavail_names = {s['name'] for s in unavailable_storages}
+           for st in available_list:
+               if st.get('name') not in unavail_names and st.get('name') not in checks:
+                   checks[st['name']] = {
+                       'status': 'OK',
+                       'detail': f'{st.get("type", "unknown")} storage available'
+                   }
            return {
                'status': 'CRITICAL',
                'reason': f'{len(unavailable_storages)} Proxmox storage(s) unavailable',
-               'details': storage_details
+               'details': storage_details,
+               'checks': checks
            }
        except Exception as e:
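Taken together, the two storage hunks build one checks dict that lists unavailable storages as CRITICAL and the rest as OK. A condensed sketch of that merge, assuming the list shapes implied by the diff (each entry carries at least 'name' plus 'reason' or 'type'):

def build_storage_checks(unavailable: list, available: list) -> dict:
    # Unavailable storages win; available ones fill in the remaining names.
    checks = {
        s['name']: {'status': 'CRITICAL',
                    'detail': s.get('reason', 'Unavailable'),
                    'dismissable': False}
        for s in unavailable
    }
    for s in available:
        checks.setdefault(s.get('name', 'unknown'), {
            'status': 'OK',
            'detail': f"{s.get('type', 'unknown')} storage available",
        })
    return checks

# build_storage_checks([{'name': 'nas', 'reason': 'Connection refused'}],
#                      [{'name': 'local', 'type': 'dir'}])
# -> {'nas': {... CRITICAL ...}, 'local': {... OK ...}}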