mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-02-19 08:56:23 +00:00
Update health monitor
This commit is contained in:
@@ -300,27 +300,45 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
|
|
||||||
const formatCheckLabel = (key: string): string => {
|
const formatCheckLabel = (key: string): string => {
|
||||||
const labels: Record<string, string> = {
|
const labels: Record<string, string> = {
|
||||||
|
// CPU
|
||||||
cpu_usage: "CPU Usage",
|
cpu_usage: "CPU Usage",
|
||||||
cpu_temperature: "Temperature",
|
cpu_temperature: "Temperature",
|
||||||
|
// Memory
|
||||||
ram_usage: "RAM Usage",
|
ram_usage: "RAM Usage",
|
||||||
swap_usage: "Swap Usage",
|
swap_usage: "Swap Usage",
|
||||||
|
// Disk I/O
|
||||||
root_filesystem: "Root Filesystem",
|
root_filesystem: "Root Filesystem",
|
||||||
|
smart_health: "SMART Health",
|
||||||
|
io_errors: "I/O Errors",
|
||||||
|
zfs_pools: "ZFS Pools",
|
||||||
|
lvm_volumes: "LVM Volumes",
|
||||||
lvm_check: "LVM Status",
|
lvm_check: "LVM Status",
|
||||||
|
// Network
|
||||||
connectivity: "Connectivity",
|
connectivity: "Connectivity",
|
||||||
all_vms_cts: "VMs & Containers",
|
// VMs & CTs
|
||||||
|
qmp_communication: "QMP Communication",
|
||||||
|
container_startup: "Container Startup",
|
||||||
|
vm_startup: "VM Startup",
|
||||||
|
oom_killer: "OOM Killer",
|
||||||
|
// Services
|
||||||
cluster_mode: "Cluster Mode",
|
cluster_mode: "Cluster Mode",
|
||||||
error_cascade: "Error Cascade",
|
// Logs (prefixed with log_)
|
||||||
error_spike: "Error Spike",
|
log_error_cascade: "Error Cascade",
|
||||||
persistent_errors: "Persistent Errors",
|
log_error_spike: "Error Spike",
|
||||||
critical_errors: "Critical Errors",
|
log_persistent_errors: "Persistent Errors",
|
||||||
|
log_critical_errors: "Critical Errors",
|
||||||
|
// Updates
|
||||||
security_updates: "Security Updates",
|
security_updates: "Security Updates",
|
||||||
system_age: "System Age",
|
system_age: "System Age",
|
||||||
pending_updates: "Pending Updates",
|
pending_updates: "Pending Updates",
|
||||||
kernel_pve: "Kernel / PVE",
|
kernel_pve: "Kernel / PVE",
|
||||||
|
// Security
|
||||||
uptime: "Uptime",
|
uptime: "Uptime",
|
||||||
certificates: "Certificates",
|
certificates: "Certificates",
|
||||||
login_attempts: "Login Attempts",
|
login_attempts: "Login Attempts",
|
||||||
fail2ban: "Fail2Ban",
|
fail2ban: "Fail2Ban",
|
||||||
|
// Storage (Proxmox)
|
||||||
|
proxmox_storages: "Proxmox Storages",
|
||||||
}
|
}
|
||||||
if (labels[key]) return labels[key]
|
if (labels[key]) return labels[key]
|
||||||
// Convert snake_case or camelCase to Title Case
|
// Convert snake_case or camelCase to Title Case
|
||||||
@@ -331,7 +349,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
}
|
}
|
||||||
|
|
||||||
const renderChecks = (
|
const renderChecks = (
|
||||||
checks: Record<string, { status: string; detail: string; dismissable?: boolean; thresholds?: string; [key: string]: any }>,
|
checks: Record<string, { status: string; detail: string; dismissable?: boolean; [key: string]: any }>,
|
||||||
categoryKey: string
|
categoryKey: string
|
||||||
) => {
|
) => {
|
||||||
if (!checks || Object.keys(checks).length === 0) return null
|
if (!checks || Object.keys(checks).length === 0) return null
|
||||||
@@ -347,18 +365,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
key={checkKey}
|
key={checkKey}
|
||||||
className="flex items-center justify-between gap-2 text-xs py-1.5 px-3 rounded-md hover:bg-muted/40 transition-colors"
|
className="flex items-center justify-between gap-2 text-xs py-1.5 px-3 rounded-md hover:bg-muted/40 transition-colors"
|
||||||
>
|
>
|
||||||
<div className="flex items-center gap-2 min-w-0 flex-1">
|
<div className="flex items-center gap-2 min-w-0 flex-1 overflow-hidden">
|
||||||
{getStatusIcon(checkData.status, "sm")}
|
{getStatusIcon(checkData.status, "sm")}
|
||||||
<span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
|
<span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
|
||||||
<span className="text-muted-foreground truncate">{checkData.detail}</span>
|
<span className="text-muted-foreground truncate block">{checkData.detail}</span>
|
||||||
|
{checkData.dismissed && (
|
||||||
|
<Badge variant="outline" className="text-[9px] px-1.5 py-0 h-4 shrink-0 text-blue-400 border-blue-400/30">
|
||||||
|
Dismissed
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<div className="flex items-center gap-1.5 shrink-0">
|
<div className="flex items-center gap-1.5 shrink-0">
|
||||||
{checkData.thresholds && (
|
{(checkStatus === "WARNING" || checkStatus === "CRITICAL") && isDismissable && !checkData.dismissed && (
|
||||||
<span className="text-[10px] text-muted-foreground/60 hidden sm:inline">
|
|
||||||
({checkData.thresholds})
|
|
||||||
</span>
|
|
||||||
)}
|
|
||||||
{(checkStatus === "WARNING" || checkStatus === "CRITICAL") && isDismissable && (
|
|
||||||
<Button
|
<Button
|
||||||
size="sm"
|
size="sm"
|
||||||
variant="outline"
|
variant="outline"
|
||||||
@@ -391,7 +409,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<Dialog open={open} onOpenChange={onOpenChange}>
|
<Dialog open={open} onOpenChange={onOpenChange}>
|
||||||
<DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
|
<DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto overflow-x-hidden">
|
||||||
<DialogHeader>
|
<DialogHeader>
|
||||||
<div className="flex items-center justify-between gap-3">
|
<div className="flex items-center justify-between gap-3">
|
||||||
<DialogTitle className="flex items-center gap-2 flex-1">
|
<DialogTitle className="flex items-center gap-2 flex-1">
|
||||||
@@ -453,8 +471,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
{healthData.summary && healthData.summary !== "All systems operational" && (
|
{healthData.summary && healthData.summary !== "All systems operational" && (
|
||||||
<div className="text-sm p-3 rounded-lg bg-muted/20 border">
|
<div className="text-sm p-3 rounded-lg bg-muted/20 border overflow-hidden">
|
||||||
<span className="font-medium text-foreground">{healthData.summary}</span>
|
<p className="font-medium text-foreground truncate">{healthData.summary}</p>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
@@ -475,18 +493,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
>
|
>
|
||||||
{/* Clickable header row */}
|
{/* Clickable header row */}
|
||||||
<div
|
<div
|
||||||
className="flex items-center gap-3 p-3 cursor-pointer select-none"
|
className="flex items-center gap-3 p-3 cursor-pointer select-none overflow-hidden"
|
||||||
onClick={() => toggleCategory(key)}
|
onClick={() => toggleCategory(key)}
|
||||||
>
|
>
|
||||||
<div className="flex-shrink-0 flex items-center gap-2">
|
<div className="shrink-0 flex items-center gap-2">
|
||||||
<Icon className="h-4 w-4 text-blue-500" />
|
<Icon className="h-4 w-4 text-blue-500" />
|
||||||
{getStatusIcon(status)}
|
{getStatusIcon(status)}
|
||||||
</div>
|
</div>
|
||||||
<div className="flex-1 min-w-0">
|
<div className="flex-1 min-w-0 overflow-hidden">
|
||||||
<div className="flex items-center gap-2">
|
<div className="flex items-center gap-2">
|
||||||
<p className="font-medium text-sm">{label}</p>
|
<p className="font-medium text-sm truncate">{label}</p>
|
||||||
{hasChecks && (
|
{hasChecks && (
|
||||||
<span className="text-[10px] text-muted-foreground">
|
<span className="text-[10px] text-muted-foreground shrink-0">
|
||||||
({Object.keys(checks).length} checks)
|
({Object.keys(checks).length} checks)
|
||||||
</span>
|
</span>
|
||||||
)}
|
)}
|
||||||
@@ -509,9 +527,9 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
|
|
||||||
{/* Expandable checks section */}
|
{/* Expandable checks section */}
|
||||||
{isExpanded && (
|
{isExpanded && (
|
||||||
<div className="border-t border-border/50 bg-muted/5 px-2 py-1.5">
|
<div className="border-t border-border/50 bg-muted/5 px-2 py-1.5 overflow-hidden">
|
||||||
{reason && (
|
{reason && (
|
||||||
<p className="text-xs text-muted-foreground px-3 py-1.5 mb-1">{reason}</p>
|
<p className="text-xs text-muted-foreground px-3 py-1.5 mb-1 break-words">{reason}</p>
|
||||||
)}
|
)}
|
||||||
{hasChecks ? (
|
{hasChecks ? (
|
||||||
renderChecks(checks, key)
|
renderChecks(checks, key)
|
||||||
@@ -546,7 +564,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
</div>
|
</div>
|
||||||
<div className="flex-1 min-w-0">
|
<div className="flex-1 min-w-0">
|
||||||
<div className="flex items-center justify-between gap-2 mb-1">
|
<div className="flex items-center justify-between gap-2 mb-1">
|
||||||
<p className="font-medium text-sm text-muted-foreground">{item.reason}</p>
|
<p className="font-medium text-sm text-muted-foreground truncate">{item.reason}</p>
|
||||||
<div className="flex items-center gap-1.5 shrink-0">
|
<div className="flex items-center gap-1.5 shrink-0">
|
||||||
<Badge variant="outline" className="text-xs border-blue-500/50 text-blue-500/70 bg-transparent">
|
<Badge variant="outline" className="text-xs border-blue-500/50 text-blue-500/70 bg-transparent">
|
||||||
Dismissed
|
Dismissed
|
||||||
|
|||||||
@@ -65,6 +65,18 @@ def acknowledge_error():
|
|||||||
result = health_persistence.acknowledge_error(error_key)
|
result = health_persistence.acknowledge_error(error_key)
|
||||||
|
|
||||||
if result.get('success'):
|
if result.get('success'):
|
||||||
|
# Invalidate cached health results so next fetch reflects the dismiss
|
||||||
|
# Clear category-specific caches based on the error_key prefix
|
||||||
|
if error_key.startswith('log_'):
|
||||||
|
health_monitor.last_check_times.pop('system_logs', None)
|
||||||
|
health_monitor.cached_results.pop('system_logs', None)
|
||||||
|
elif error_key.startswith('pve_service_'):
|
||||||
|
health_monitor.last_check_times.pop('pve_services', None)
|
||||||
|
health_monitor.cached_results.pop('pve_services', None)
|
||||||
|
elif error_key.startswith('updates_'):
|
||||||
|
health_monitor.last_check_times.pop('updates_check', None)
|
||||||
|
health_monitor.cached_results.pop('updates_check', None)
|
||||||
|
|
||||||
# Determine suppression period for the response
|
# Determine suppression period for the response
|
||||||
category = result.get('category', '')
|
category = result.get('category', '')
|
||||||
if category == 'updates':
|
if category == 'updates':
|
||||||
|
|||||||
@@ -487,18 +487,14 @@ class HealthMonitor:
|
|||||||
checks = {
|
checks = {
|
||||||
'cpu_usage': {
|
'cpu_usage': {
|
||||||
'status': status,
|
'status': status,
|
||||||
'detail': f'{round(cpu_percent, 1)}% ({psutil.cpu_count()} cores)',
|
'detail': 'Sustained high CPU usage' if status != 'OK' else 'Normal'
|
||||||
'value': round(cpu_percent, 1),
|
|
||||||
'thresholds': f'Warning >{self.CPU_WARNING}%, Critical >{self.CPU_CRITICAL}%'
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if temp_status and temp_status.get('status') != 'UNKNOWN':
|
if temp_status and temp_status.get('status') != 'UNKNOWN':
|
||||||
temp_val = temp_status.get('value', 'N/A')
|
t_status = temp_status.get('status', 'OK')
|
||||||
checks['cpu_temperature'] = {
|
checks['cpu_temperature'] = {
|
||||||
'status': temp_status.get('status', 'OK'),
|
'status': t_status,
|
||||||
'detail': f'{temp_val}°C' if isinstance(temp_val, (int, float)) else str(temp_val),
|
'detail': 'Temperature elevated' if t_status != 'OK' else 'Normal'
|
||||||
'value': temp_val,
|
|
||||||
'thresholds': 'Warning >80°C sustained >3min'
|
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
checks['cpu_temperature'] = {
|
checks['cpu_temperature'] = {
|
||||||
@@ -697,15 +693,11 @@ class HealthMonitor:
|
|||||||
'checks': {
|
'checks': {
|
||||||
'ram_usage': {
|
'ram_usage': {
|
||||||
'status': ram_status,
|
'status': ram_status,
|
||||||
'detail': f'{round(mem_percent, 1)}% used ({ram_avail_gb} GB free of {ram_total_gb} GB)',
|
'detail': 'High RAM usage sustained' if ram_status != 'OK' else 'Normal'
|
||||||
'value': round(mem_percent, 1),
|
|
||||||
'thresholds': f'Warning >{self.MEMORY_WARNING}%, Critical >90%'
|
|
||||||
},
|
},
|
||||||
'swap_usage': {
|
'swap_usage': {
|
||||||
'status': swap_status,
|
'status': swap_status,
|
||||||
'detail': f'{round(swap_percent, 1)}% used ({swap_used_gb} GB of {swap_total_gb} GB)' if swap.total > 0 else 'No swap configured',
|
'detail': 'Excessive swap usage' if swap_status != 'OK' else ('Normal' if swap.total > 0 else 'No swap configured')
|
||||||
'value': round(swap_percent, 1),
|
|
||||||
'thresholds': 'Critical when swap >20% of RAM'
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -810,8 +802,16 @@ class HealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if not issues:
|
if not issues:
|
||||||
# Add a summary OK entry if nothing specific
|
# Add descriptive OK entries for what we monitor
|
||||||
checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Root filesystem healthy'})
|
checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'})
|
||||||
|
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
|
||||||
|
checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
|
||||||
|
# Check if ZFS is present
|
||||||
|
if os.path.exists('/sbin/zpool') or os.path.exists('/usr/sbin/zpool'):
|
||||||
|
checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
|
||||||
|
# Check if LVM is present
|
||||||
|
if os.path.exists('/sbin/lvm') or os.path.exists('/usr/sbin/lvm'):
|
||||||
|
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
|
||||||
return {'status': 'OK', 'checks': checks}
|
return {'status': 'OK', 'checks': checks}
|
||||||
|
|
||||||
# Determine overall status
|
# Determine overall status
|
||||||
@@ -1000,13 +1000,13 @@ class HealthMonitor:
|
|||||||
category='disks',
|
category='disks',
|
||||||
severity=severity,
|
severity=severity,
|
||||||
reason=reason,
|
reason=reason,
|
||||||
details={'disk': disk, 'error_count': error_count, 'dismissable': True}
|
details={'disk': disk, 'error_count': error_count, 'dismissable': False}
|
||||||
)
|
)
|
||||||
|
|
||||||
disk_issues[f'/dev/{disk}'] = {
|
disk_details[disk] = {
|
||||||
'status': severity,
|
'status': severity,
|
||||||
'reason': reason,
|
'reason': reason,
|
||||||
'dismissable': True
|
'dismissable': False
|
||||||
}
|
}
|
||||||
elif error_count >= 1:
|
elif error_count >= 1:
|
||||||
error_key = f'disk_{disk}'
|
error_key = f'disk_{disk}'
|
||||||
@@ -1116,13 +1116,13 @@ class HealthMonitor:
|
|||||||
category='network',
|
category='network',
|
||||||
severity='CRITICAL',
|
severity='CRITICAL',
|
||||||
reason=alert_reason or 'Interface DOWN',
|
reason=alert_reason or 'Interface DOWN',
|
||||||
details={'interface': interface, 'dismissable': True}
|
details={'interface': interface, 'dismissable': False}
|
||||||
)
|
)
|
||||||
|
|
||||||
interface_details[interface] = {
|
interface_details[interface] = {
|
||||||
'status': 'CRITICAL',
|
'status': 'CRITICAL',
|
||||||
'reason': alert_reason or 'Interface DOWN',
|
'reason': alert_reason or 'Interface DOWN',
|
||||||
'dismissable': True
|
'dismissable': False
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
active_interfaces.add(interface)
|
active_interfaces.add(interface)
|
||||||
@@ -1488,7 +1488,10 @@ class HealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if not issues:
|
if not issues:
|
||||||
checks['all_vms_cts'] = {'status': 'OK', 'detail': 'No issues detected in logs'}
|
checks['qmp_communication'] = {'status': 'OK', 'detail': 'No QMP timeouts detected'}
|
||||||
|
checks['container_startup'] = {'status': 'OK', 'detail': 'No container startup errors'}
|
||||||
|
checks['vm_startup'] = {'status': 'OK', 'detail': 'No VM startup failures'}
|
||||||
|
checks['oom_killer'] = {'status': 'OK', 'detail': 'No OOM events detected'}
|
||||||
return {'status': 'OK', 'checks': checks}
|
return {'status': 'OK', 'checks': checks}
|
||||||
|
|
||||||
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
|
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
|
||||||
@@ -1830,30 +1833,75 @@ class HealthMonitor:
|
|||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
|
|
||||||
# Build checks dict for log sub-items
|
# Record/clear persistent errors for each log sub-check so Dismiss works
|
||||||
|
log_sub_checks = {
|
||||||
|
'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING',
|
||||||
|
'reason': f'{cascade_count} pattern(s) repeating >=15 times'},
|
||||||
|
'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING',
|
||||||
|
'reason': f'{spike_count} pattern(s) with 4x increase'},
|
||||||
|
'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING',
|
||||||
|
'reason': f'{persistent_count} recurring pattern(s) over 15+ min'},
|
||||||
|
'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL',
|
||||||
|
'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Track which sub-checks were dismissed
|
||||||
|
dismissed_keys = set()
|
||||||
|
for err_key, info in log_sub_checks.items():
|
||||||
|
if info['active']:
|
||||||
|
is_dismissable = info.get('dismissable', True)
|
||||||
|
result = health_persistence.record_error(
|
||||||
|
error_key=err_key,
|
||||||
|
category='logs',
|
||||||
|
severity=info['severity'],
|
||||||
|
reason=info['reason'],
|
||||||
|
details={'dismissable': is_dismissable}
|
||||||
|
)
|
||||||
|
if result and result.get('type') == 'skipped_acknowledged':
|
||||||
|
dismissed_keys.add(err_key)
|
||||||
|
elif health_persistence.is_error_active(err_key):
|
||||||
|
health_persistence.clear_error(err_key)
|
||||||
|
|
||||||
|
# Build checks dict - downgrade dismissed items to INFO
|
||||||
|
def _log_check_status(key, active, severity):
|
||||||
|
if not active:
|
||||||
|
return 'OK'
|
||||||
|
if key in dismissed_keys:
|
||||||
|
return 'INFO'
|
||||||
|
return severity
|
||||||
|
|
||||||
log_checks = {
|
log_checks = {
|
||||||
'error_cascade': {
|
'log_error_cascade': {
|
||||||
'status': 'WARNING' if cascade_count > 0 else 'OK',
|
'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'),
|
||||||
'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
|
'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
|
||||||
'dismissable': True
|
'dismissable': True,
|
||||||
|
'dismissed': 'log_error_cascade' in dismissed_keys
|
||||||
},
|
},
|
||||||
'error_spike': {
|
'log_error_spike': {
|
||||||
'status': 'WARNING' if spike_count > 0 else 'OK',
|
'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'),
|
||||||
'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
|
'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
|
||||||
'dismissable': True
|
'dismissable': True,
|
||||||
|
'dismissed': 'log_error_spike' in dismissed_keys
|
||||||
},
|
},
|
||||||
'persistent_errors': {
|
'log_persistent_errors': {
|
||||||
'status': 'WARNING' if persistent_count > 0 else 'OK',
|
'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'),
|
||||||
'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
|
'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
|
||||||
'dismissable': True
|
'dismissable': True,
|
||||||
|
'dismissed': 'log_persistent_errors' in dismissed_keys
|
||||||
},
|
},
|
||||||
'critical_errors': {
|
'log_critical_errors': {
|
||||||
'status': 'CRITICAL' if unique_critical_count > 0 else 'OK',
|
'status': _log_check_status('log_critical_errors', unique_critical_count > 0, 'CRITICAL'),
|
||||||
'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors',
|
'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors',
|
||||||
'dismissable': True
|
'dismissable': False
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Recalculate overall status considering dismissed items
|
||||||
|
active_issues = [k for k, v in log_checks.items() if v['status'] in ('WARNING', 'CRITICAL')]
|
||||||
|
if not active_issues:
|
||||||
|
status = 'OK'
|
||||||
|
reason = None
|
||||||
|
|
||||||
log_result = {'status': status, 'checks': log_checks}
|
log_result = {'status': status, 'checks': log_checks}
|
||||||
if reason:
|
if reason:
|
||||||
log_result['reason'] = reason
|
log_result['reason'] = reason
|
||||||
@@ -1864,10 +1912,10 @@ class HealthMonitor:
|
|||||||
|
|
||||||
# If journalctl command failed or returned no data
|
# If journalctl command failed or returned no data
|
||||||
ok_result = {'status': 'OK', 'checks': {
|
ok_result = {'status': 'OK', 'checks': {
|
||||||
'error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
|
'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
|
||||||
'error_spike': {'status': 'OK', 'detail': 'No error spikes'},
|
'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'},
|
||||||
'persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
|
'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
|
||||||
'critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
|
'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
|
||||||
}}
|
}}
|
||||||
self.cached_results[cache_key] = ok_result
|
self.cached_results[cache_key] = ok_result
|
||||||
self.last_check_times[cache_key] = current_time
|
self.last_check_times[cache_key] = current_time
|
||||||
@@ -2014,11 +2062,12 @@ class HealthMonitor:
|
|||||||
'security_updates': {
|
'security_updates': {
|
||||||
'status': sec_status,
|
'status': sec_status,
|
||||||
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
|
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
|
||||||
|
'dismissable': True if sec_status != 'OK' else False
|
||||||
},
|
},
|
||||||
'system_age': {
|
'system_age': {
|
||||||
'status': update_age_status,
|
'status': update_age_status,
|
||||||
'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
|
'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
|
||||||
'thresholds': 'Warning >365 days, Critical >548 days'
|
'dismissable': False if update_age_status == 'CRITICAL' else True if update_age_status == 'WARNING' else False
|
||||||
},
|
},
|
||||||
'pending_updates': {
|
'pending_updates': {
|
||||||
'status': 'INFO' if update_count > 50 else 'OK',
|
'status': 'INFO' if update_count > 50 else 'OK',
|
||||||
@@ -2208,7 +2257,7 @@ class HealthMonitor:
|
|||||||
if updates_data and updates_data.get('days_since_update', 9999) > 365:
|
if updates_data and updates_data.get('days_since_update', 9999) > 365:
|
||||||
msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)'
|
msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)'
|
||||||
issues.append(msg)
|
issues.append(msg)
|
||||||
checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days)}
|
checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days), 'dismissable': True}
|
||||||
else:
|
else:
|
||||||
checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'}
|
checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'}
|
||||||
else:
|
else:
|
||||||
@@ -2223,7 +2272,8 @@ class HealthMonitor:
|
|||||||
cert_reason = cert_status.get('reason', '')
|
cert_reason = cert_status.get('reason', '')
|
||||||
checks['certificates'] = {
|
checks['certificates'] = {
|
||||||
'status': cert_sev,
|
'status': cert_sev,
|
||||||
'detail': cert_reason if cert_reason else 'Certificate valid'
|
'detail': cert_reason if cert_reason else 'Certificate valid',
|
||||||
|
'dismissable': True if cert_sev not in ['OK', 'INFO'] else False
|
||||||
}
|
}
|
||||||
if cert_sev not in ['OK', 'INFO']:
|
if cert_sev not in ['OK', 'INFO']:
|
||||||
issues.append(cert_reason or 'Certificate issue')
|
issues.append(cert_reason or 'Certificate issue')
|
||||||
@@ -2247,7 +2297,7 @@ class HealthMonitor:
|
|||||||
if failed_logins > 50:
|
if failed_logins > 50:
|
||||||
msg = f'{failed_logins} failed login attempts in 24h'
|
msg = f'{failed_logins} failed login attempts in 24h'
|
||||||
issues.append(msg)
|
issues.append(msg)
|
||||||
checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins}
|
checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
|
||||||
elif failed_logins > 0:
|
elif failed_logins > 0:
|
||||||
checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins}
|
checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins}
|
||||||
else:
|
else:
|
||||||
@@ -2258,8 +2308,10 @@ class HealthMonitor:
|
|||||||
# Sub-check 4: Fail2Ban ban detection
|
# Sub-check 4: Fail2Ban ban detection
|
||||||
try:
|
try:
|
||||||
f2b = self._check_fail2ban_bans()
|
f2b = self._check_fail2ban_bans()
|
||||||
|
f2b_status = f2b.get('status', 'OK')
|
||||||
checks['fail2ban'] = {
|
checks['fail2ban'] = {
|
||||||
'status': f2b.get('status', 'OK'),
|
'status': f2b_status,
|
||||||
|
'dismissable': True if f2b_status not in ['OK'] else False,
|
||||||
'detail': f2b.get('detail', ''),
|
'detail': f2b.get('detail', ''),
|
||||||
'installed': f2b.get('installed', False),
|
'installed': f2b.get('installed', False),
|
||||||
'banned_count': f2b.get('banned_count', 0)
|
'banned_count': f2b.get('banned_count', 0)
|
||||||
@@ -2511,10 +2563,22 @@ class HealthMonitor:
|
|||||||
# All storages are available. We should also clear any previously recorded storage errors.
|
# All storages are available. We should also clear any previously recorded storage errors.
|
||||||
active_errors = health_persistence.get_active_errors()
|
active_errors = health_persistence.get_active_errors()
|
||||||
for error in active_errors:
|
for error in active_errors:
|
||||||
# Target errors related to storage unavailability
|
|
||||||
if error.get('category') == 'storage' and error.get('error_key', '').startswith('storage_unavailable_'):
|
if error.get('category') == 'storage' and error.get('error_key', '').startswith('storage_unavailable_'):
|
||||||
health_persistence.clear_error(error['error_key'])
|
health_persistence.clear_error(error['error_key'])
|
||||||
return {'status': 'OK'}
|
|
||||||
|
# Build checks from all configured storages for descriptive display
|
||||||
|
available_storages = storage_status.get('available', [])
|
||||||
|
checks = {}
|
||||||
|
for st in available_storages:
|
||||||
|
st_name = st.get('name', 'unknown')
|
||||||
|
st_type = st.get('type', 'unknown')
|
||||||
|
checks[st_name] = {
|
||||||
|
'status': 'OK',
|
||||||
|
'detail': f'{st_type} storage available'
|
||||||
|
}
|
||||||
|
if not checks:
|
||||||
|
checks['proxmox_storages'] = {'status': 'OK', 'detail': 'All storages available'}
|
||||||
|
return {'status': 'OK', 'checks': checks}
|
||||||
|
|
||||||
storage_details = {}
|
storage_details = {}
|
||||||
for storage in unavailable_storages:
|
for storage in unavailable_storages:
|
||||||
@@ -2552,10 +2616,29 @@ class HealthMonitor:
|
|||||||
'dismissable': False
|
'dismissable': False
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Build checks from storage_details
|
||||||
|
checks = {}
|
||||||
|
for st_name, st_info in storage_details.items():
|
||||||
|
checks[st_name] = {
|
||||||
|
'status': 'CRITICAL',
|
||||||
|
'detail': st_info.get('reason', 'Unavailable'),
|
||||||
|
'dismissable': False
|
||||||
|
}
|
||||||
|
# Also add available storages
|
||||||
|
available_list = storage_status.get('available', [])
|
||||||
|
unavail_names = {s['name'] for s in unavailable_storages}
|
||||||
|
for st in available_list:
|
||||||
|
if st.get('name') not in unavail_names and st.get('name') not in checks:
|
||||||
|
checks[st['name']] = {
|
||||||
|
'status': 'OK',
|
||||||
|
'detail': f'{st.get("type", "unknown")} storage available'
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'status': 'CRITICAL',
|
'status': 'CRITICAL',
|
||||||
'reason': f'{len(unavailable_storages)} Proxmox storage(s) unavailable',
|
'reason': f'{len(unavailable_storages)} Proxmox storage(s) unavailable',
|
||||||
'details': storage_details
|
'details': storage_details,
|
||||||
|
'checks': checks
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user