Mirror of https://github.com/MacRimi/ProxMenux.git (synced 2026-02-18 16:36:27 +00:00)

Commit: update health monitor
@@ -409,7 +409,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
   return (
     <Dialog open={open} onOpenChange={onOpenChange}>
-      <DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto overflow-x-hidden">
+      <DialogContent className="max-w-3xl w-[95vw] max-h-[85vh] overflow-y-auto overflow-x-hidden">
         <DialogHeader>
           <div className="flex items-center justify-between gap-3">
             <DialogTitle className="flex items-center gap-2 flex-1">
@@ -471,8 +471,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
           </div>

           {healthData.summary && healthData.summary !== "All systems operational" && (
-            <div className="text-sm p-3 rounded-lg bg-muted/20 border overflow-hidden">
-              <p className="font-medium text-foreground truncate">{healthData.summary}</p>
+            <div className="text-sm p-3 rounded-lg bg-muted/20 border overflow-hidden max-w-full">
+              <p className="font-medium text-foreground truncate" title={healthData.summary}>{healthData.summary}</p>
             </div>
           )}

@@ -510,7 +510,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
         )}
       </div>
       {reason && !isExpanded && (
-        <p className="text-xs text-muted-foreground mt-0.5 truncate">{reason}</p>
+        <p className="text-xs text-muted-foreground mt-0.5 truncate" title={reason}>{reason}</p>
       )}
     </div>
     <div className="flex items-center gap-2 shrink-0">

@@ -143,6 +143,11 @@ class HealthMonitor:
         self.failed_vm_history = set()  # Track VMs that failed to start
         self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})

+        # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
+        # SMART detection still uses filesystem check on init (lightweight)
+        has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl')
+        self.capabilities = {'has_zfs': False, 'has_lvm': False, 'has_smart': has_smart}
+
         try:
             health_persistence.cleanup_old_errors()
         except Exception as e:
@@ -291,6 +296,12 @@ class HealthMonitor:
            critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable'))
        elif proxmox_storage_result.get('status') == 'WARNING':
            warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue'))

+       # Derive capabilities from Proxmox storage types (immediate, no extra checks)
+       storage_checks = proxmox_storage_result.get('checks', {})
+       storage_types = {v.get('detail', '').split(' ')[0].lower() for v in storage_checks.values() if isinstance(v, dict)}
+       self.capabilities['has_zfs'] = any(t in ('zfspool', 'zfs') for t in storage_types)
+       self.capabilities['has_lvm'] = any(t in ('lvm', 'lvmthin') for t in storage_types)
+
        # Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors)
        storage_status = self._check_storage_optimized()
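The derivation above relies on each storage check's detail string leading with the Proxmox storage type (e.g. "zfspool ..."). A minimal standalone sketch of that logic, run on an invented checks payload (the storage names and detail strings are hypothetical stand-ins for proxmox_storage_result['checks']):

# Sketch of the capability derivation; the payload is invented.
checks = {
    'local-zfs': {'status': 'OK', 'detail': 'zfspool 120G free'},
    'data':      {'status': 'OK', 'detail': 'lvmthin 80G free'},
    'backups':   {'status': 'OK', 'detail': 'dir 500G free'},
}

# First whitespace-separated token of each detail string is the storage type.
storage_types = {v.get('detail', '').split(' ')[0].lower()
                 for v in checks.values() if isinstance(v, dict)}

has_zfs = any(t in ('zfspool', 'zfs') for t in storage_types)
has_lvm = any(t in ('lvm', 'lvmthin') for t in storage_types)
print(storage_types, has_zfs, has_lvm)  # e.g. {'zfspool', 'lvmthin', 'dir'} True True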
@@ -802,15 +813,14 @@ class HealthMonitor:
        }

        if not issues:
-           # Add descriptive OK entries for what we monitor
+           # Add descriptive OK entries only for capabilities this server actually has
            checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'})
-           checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
            checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
-           # Check if ZFS is present
-           if os.path.exists('/sbin/zpool') or os.path.exists('/usr/sbin/zpool'):
+           if self.capabilities.get('has_smart'):
+               checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
+           if self.capabilities.get('has_zfs'):
                checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
-           # Check if LVM is present
-           if os.path.exists('/sbin/lvm') or os.path.exists('/usr/sbin/lvm'):
+           if self.capabilities.get('has_lvm'):
                checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
        return {'status': 'OK', 'checks': checks}

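The net effect of this hunk is that OK placeholders are gated on detected capabilities rather than probing for binaries on every pass. A quick sketch with a hypothetical capabilities dict shows which keys a ZFS-only host (no LVM, no smartctl) would report:

# Hypothetical host: ZFS storage present, no LVM, smartctl not installed.
capabilities = {'has_zfs': True, 'has_lvm': False, 'has_smart': False}

checks = {
    'root_filesystem': {'status': 'OK', 'detail': 'Mounted read-write, space OK'},
    'io_errors': {'status': 'OK', 'detail': 'No I/O errors in dmesg'},
}
if capabilities.get('has_smart'):
    checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
if capabilities.get('has_zfs'):
    checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
if capabilities.get('has_lvm'):
    checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}

print(sorted(checks))  # ['io_errors', 'root_filesystem', 'zfs_pools']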
@@ -1666,24 +1676,16 @@ class HealthMonitor:
        # Cache the result for 5 minutes to avoid excessive journalctl calls
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
-               # Check persistent log errors recorded by health_persistence
-               persistent_errors = health_persistence.get_active_errors('logs')
-               if persistent_errors:
-                   # Find the highest severity among persistent errors to set overall status
-                   max_severity = 'OK'
-                   reasons = []
-                   for error in persistent_errors:
-                       if error['severity'] == 'CRITICAL':
-                           max_severity = 'CRITICAL'
-                       elif error['severity'] == 'WARNING' and max_severity != 'CRITICAL':
-                           max_severity = 'WARNING'
-                       reasons.append(error['reason'])
-
-                   return {
-                       'status': max_severity,
-                       'reason': '; '.join(reasons[:3])  # Show up to 3 persistent reasons
-                   }
-               return self.cached_results.get(cache_key, {'status': 'OK'})
+               # Return the full cached result (which includes 'checks' dict)
+               cached = self.cached_results.get(cache_key)
+               if cached:
+                   return cached
+               return {'status': 'OK', 'checks': {
+                   'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
+                   'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'},
+                   'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
+                   'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
+               }}

        try:
            # Fetch logs from the last 3 minutes for immediate issue detection
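This hunk replaces the old early-return path (which re-queried persistent errors on every cache hit) with one that hands back the full cached result, falling back to an all-OK checks dict when nothing has been cached yet. A minimal standalone sketch of the time-based cache pattern, with invented names (LOG_CHECK_INTERVAL, run_check) standing in for the monitor's internals:

import time

LOG_CHECK_INTERVAL = 300  # seconds; mirrors the "5 minutes" noted above

last_check_times: dict[str, float] = {}
cached_results: dict[str, dict] = {}

def get_logs_status(cache_key: str = 'logs') -> dict:
    """Return the cached result while it is fresh; otherwise re-run the check."""
    now = time.time()
    if cache_key in last_check_times and now - last_check_times[cache_key] < LOG_CHECK_INTERVAL:
        cached = cached_results.get(cache_key)
        if cached:
            return cached  # full result, including its 'checks' dict
        return {'status': 'OK', 'checks': {}}  # nothing cached yet within the window
    result = run_check()  # stand-in for the expensive journalctl scan
    cached_results[cache_key] = result
    last_check_times[cache_key] = now
    return result

def run_check() -> dict:
    # Hypothetical expensive check; the real monitor parses journal output here.
    return {'status': 'OK', 'checks': {'log_critical_errors': {'status': 'OK'}}}

print(get_logs_status())  # first call runs the check; calls within 5 min hit the cache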
@@ -71,6 +71,15 @@ class HealthPersistence:
            )
        ''')

+       # System capabilities table (detected once, cached forever)
+       cursor.execute('''
+           CREATE TABLE IF NOT EXISTS system_capabilities (
+               cap_key TEXT PRIMARY KEY,
+               cap_value TEXT NOT NULL,
+               detected_at TEXT NOT NULL
+           )
+       ''')
+
        # Indexes for performance
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)')
@@ -572,6 +581,48 @@ class HealthPersistence:

        conn.commit()
        conn.close()

+   # ─── System Capabilities Cache ───────────────────────────────
+
+   def get_capability(self, cap_key: str) -> Optional[str]:
+       """
+       Get a cached system capability value.
+       Returns None if not yet detected.
+       """
+       conn = sqlite3.connect(str(self.db_path))
+       cursor = conn.cursor()
+       cursor.execute(
+           'SELECT cap_value FROM system_capabilities WHERE cap_key = ?',
+           (cap_key,)
+       )
+       row = cursor.fetchone()
+       conn.close()
+       return row[0] if row else None
+
+   def set_capability(self, cap_key: str, cap_value: str):
+       """Store a system capability value (detected once, cached forever)."""
+       conn = sqlite3.connect(str(self.db_path))
+       cursor = conn.cursor()
+       cursor.execute('''
+           INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at)
+           VALUES (?, ?, ?)
+       ''', (cap_key, cap_value, datetime.now().isoformat()))
+       conn.commit()
+       conn.close()
+
+   def get_all_capabilities(self) -> Dict[str, str]:
+       """Get all cached system capabilities as a dict."""
+       conn = sqlite3.connect(str(self.db_path))
+       cursor = conn.cursor()
+       cursor.execute('SELECT cap_key, cap_value FROM system_capabilities')
+       rows = cursor.fetchall()
+       conn.close()
+       return {row[0]: row[1] for row in rows}
+
+   # Note: System capabilities (has_zfs, has_lvm) are now derived at runtime
+   # from Proxmox storage types in health_monitor.get_detailed_status()
+   # This avoids redundant subprocess calls and ensures immediate detection
+   # when the user adds new ZFS/LVM storage via Proxmox.
+
+
 # Global instance
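A self-contained sketch of the capability cache round-trip added above, using an in-memory SQLite database in place of the class's self.db_path:

import sqlite3
from datetime import datetime

conn = sqlite3.connect(':memory:')  # stand-in for self.db_path
conn.execute('''
    CREATE TABLE IF NOT EXISTS system_capabilities (
        cap_key TEXT PRIMARY KEY,
        cap_value TEXT NOT NULL,
        detected_at TEXT NOT NULL
    )
''')

# set_capability: INSERT OR REPLACE makes repeated detection idempotent.
conn.execute(
    'INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at) VALUES (?, ?, ?)',
    ('has_zfs', 'true', datetime.now().isoformat()),
)
conn.commit()

# get_capability: returns the stored string, or None if never detected.
row = conn.execute(
    'SELECT cap_value FROM system_capabilities WHERE cap_key = ?', ('has_zfs',)
).fetchone()
print(row[0] if row else None)  # true
conn.close()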