From 0f81f45c5f396408ba40189dd019a608b9e2a016 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 16 Feb 2026 22:07:10 +0100 Subject: [PATCH] update health monitor --- AppImage/components/health-status-modal.tsx | 8 ++-- AppImage/scripts/health_monitor.py | 50 ++++++++++---------- AppImage/scripts/health_persistence.py | 51 +++++++++++++++++++++ 3 files changed, 81 insertions(+), 28 deletions(-) diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index 72d1cc00..af219ff4 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -409,7 +409,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu return ( - +
@@ -471,8 +471,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{healthData.summary && healthData.summary !== "All systems operational" && ( -
-

{healthData.summary}

+
+

{healthData.summary}

)} @@ -510,7 +510,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu )}
{reason && !isExpanded && ( -

{reason}

+

{reason}

)}
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 794d365a..af07b791 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -143,6 +143,11 @@ class HealthMonitor: self.failed_vm_history = set() # Track VMs that failed to start self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0}) + # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) + # SMART detection still uses filesystem check on init (lightweight) + has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl') + self.capabilities = {'has_zfs': False, 'has_lvm': False, 'has_smart': has_smart} + try: health_persistence.cleanup_old_errors() except Exception as e: @@ -291,6 +296,12 @@ class HealthMonitor: critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable')) elif proxmox_storage_result.get('status') == 'WARNING': warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue')) + + # Derive capabilities from Proxmox storage types (immediate, no extra checks) + storage_checks = proxmox_storage_result.get('checks', {}) + storage_types = {v.get('detail', '').split(' ')[0].lower() for v in storage_checks.values() if isinstance(v, dict)} + self.capabilities['has_zfs'] = any(t in ('zfspool', 'zfs') for t in storage_types) + self.capabilities['has_lvm'] = any(t in ('lvm', 'lvmthin') for t in storage_types) # Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors) storage_status = self._check_storage_optimized() @@ -802,15 +813,14 @@ class HealthMonitor: } if not issues: - # Add descriptive OK entries for what we monitor + # Add descriptive OK entries only for capabilities this server actually has checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'}) - checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'} checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'} - # Check if ZFS is present - if os.path.exists('/sbin/zpool') or os.path.exists('/usr/sbin/zpool'): + if self.capabilities.get('has_smart'): + checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'} + if self.capabilities.get('has_zfs'): checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'} - # Check if LVM is present - if os.path.exists('/sbin/lvm') or os.path.exists('/usr/sbin/lvm'): + if self.capabilities.get('has_lvm'): checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'} return {'status': 'OK', 'checks': checks} @@ -1666,24 +1676,16 @@ class HealthMonitor: # Cache the result for 5 minutes to avoid excessive journalctl calls if cache_key in self.last_check_times: if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL: - # Check persistent log errors recorded by health_persistence - persistent_errors = health_persistence.get_active_errors('logs') - if persistent_errors: - # Find the highest severity among persistent errors to set overall status - max_severity = 'OK' - reasons = [] - for error in persistent_errors: - if error['severity'] == 'CRITICAL': - max_severity = 'CRITICAL' - elif error['severity'] == 'WARNING' and max_severity != 'CRITICAL': - max_severity = 'WARNING' - reasons.append(error['reason']) - - return { - 'status': max_severity, - 'reason': '; '.join(reasons[:3]) # Show up to 3 persistent reasons - } - return self.cached_results.get(cache_key, {'status': 'OK'}) + # Return the full cached result (which includes 'checks' dict) + cached = self.cached_results.get(cache_key) + if cached: + return cached + return {'status': 'OK', 'checks': { + 'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'}, + 'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'}, + 'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'}, + 'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'} + }} try: # Fetch logs from the last 3 minutes for immediate issue detection diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 973bbe9c..259bb046 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -71,6 +71,15 @@ class HealthPersistence: ) ''') + # System capabilities table (detected once, cached forever) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS system_capabilities ( + cap_key TEXT PRIMARY KEY, + cap_value TEXT NOT NULL, + detected_at TEXT NOT NULL + ) + ''') + # Indexes for performance cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)') @@ -572,6 +581,48 @@ class HealthPersistence: conn.commit() conn.close() + + # ─── System Capabilities Cache ─────────────────────────────── + + def get_capability(self, cap_key: str) -> Optional[str]: + """ + Get a cached system capability value. + Returns None if not yet detected. + """ + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + cursor.execute( + 'SELECT cap_value FROM system_capabilities WHERE cap_key = ?', + (cap_key,) + ) + row = cursor.fetchone() + conn.close() + return row[0] if row else None + + def set_capability(self, cap_key: str, cap_value: str): + """Store a system capability value (detected once, cached forever).""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + cursor.execute(''' + INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at) + VALUES (?, ?, ?) + ''', (cap_key, cap_value, datetime.now().isoformat())) + conn.commit() + conn.close() + + def get_all_capabilities(self) -> Dict[str, str]: + """Get all cached system capabilities as a dict.""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + cursor.execute('SELECT cap_key, cap_value FROM system_capabilities') + rows = cursor.fetchall() + conn.close() + return {row[0]: row[1] for row in rows} + + # Note: System capabilities (has_zfs, has_lvm) are now derived at runtime + # from Proxmox storage types in health_monitor.get_detailed_status() + # This avoids redundant subprocess calls and ensures immediate detection + # when the user adds new ZFS/LVM storage via Proxmox. # Global instance