From adde2ce5b9ed15c171bef8bcdc33fc8c0d92f03a Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 6 Apr 2026 12:02:05 +0200 Subject: [PATCH] update health_persistence.py --- AppImage/components/system-overview.tsx | 28 +-- AppImage/scripts/flask_server.py | 3 +- AppImage/scripts/health_monitor.py | 57 ++++-- AppImage/scripts/health_persistence.py | 230 ++++++++++++++++-------- AppImage/scripts/notification_events.py | 31 +++- 5 files changed, 245 insertions(+), 104 deletions(-) diff --git a/AppImage/components/system-overview.tsx b/AppImage/components/system-overview.tsx index d8f6cb5e..b1907209 100644 --- a/AppImage/components/system-overview.tsx +++ b/AppImage/components/system-overview.tsx @@ -111,9 +111,9 @@ const fetchSystemData = async (retries = 3, delayMs = 500): Promise("/api/system") return data - } catch (error) { + } catch { if (attempt === retries - 1) { - console.error("[v0] Failed to fetch system data after retries:", error) + // Silent fail - API not available (expected in preview environment) return null } // Wait before retry @@ -127,8 +127,8 @@ const fetchVMData = async (): Promise => { try { const data = await fetchApi("/api/vms") return Array.isArray(data) ? 
data : data.vms || [] - } catch (error) { - console.error("[v0] Failed to fetch VM data:", error) + } catch { + // Silent fail - API not available return [] } } @@ -137,8 +137,7 @@ const fetchStorageData = async (): Promise => { try { const data = await fetchApi("/api/storage/summary") return data - } catch (error) { - console.log("[v0] Storage API not available (this is normal if not configured)") + } catch { return null } } @@ -146,13 +145,12 @@ const fetchStorageData = async (): Promise => { const fetchNetworkData = async (): Promise => { try { const data = await fetchApi("/api/network/summary") - return data - } catch (error) { - console.log("[v0] Network API not available (this is normal if not configured)") - return null + return data + } catch { + return null } } const fetchProxmoxStorageData = async (): Promise => { try { const data = await fetchApi("/api/proxmox-storage") diff --git a/AppImage/scripts/flask_server.py index 2279827e..4cd385d2 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -915,8 +915,9 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str: return "" # Capture recent journal entries matching keywords + # Use -b 0 to only include logs from the current boot cmd = ( - f"journalctl --since='10 minutes ago' --no-pager -n 500 2>/dev/null | " + f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | " f"grep -iE '{pattern}' | tail -n 30" ) diff --git a/AppImage/scripts/health_monitor.py index 231687ca..72b863bd 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -150,7 +150,7 @@ class HealthMonitor: r'zfs.*scrub (started|finished|in progress)', r'zpool.*resilver', - # ���─ 
LXC/Container normal operations ── + # ── LXC/Container normal operations ── r'lxc.*monitor', r'systemd\[1\]: (started|stopped) .*\.scope', @@ -184,13 +184,21 @@ class HealthMonitor: ] CRITICAL_LOG_KEYWORDS = [ - 'out of memory', 'oom_kill', 'kernel panic', - 'filesystem read-only', 'cannot mount', - 'raid.*failed', 'md.*device failed', - 'ext4-fs error', 'xfs.*corruption', - 'lvm activation failed', + # OOM and memory errors + 'out of memory', 'oom_kill', 'oom-kill', 'invoked oom-killer', + 'memory cgroup out of memory', 'cannot allocate memory', 'oom_reaper', + # Kernel panics and critical faults + 'kernel panic', 'general protection fault', 'trap invalid opcode', + # Filesystem critical errors + 'filesystem read-only', 'read-only file system', 'cannot mount', + 'ext4-fs error', 'ext4_abort', 'xfs.*corruption', 'btrfs.*error', + # RAID/Storage critical + 'raid.*failed', 'md.*device failed', 'lvm activation failed', + 'zpool.*faulted', 'state: faulted', + # Hardware errors 'hardware error', 'mce:', - 'general protection fault', + # Cluster critical + 'quorum lost', 'split brain', ] # Segfault is WARNING, not CRITICAL -- only PVE-critical process @@ -202,11 +210,20 @@ class HealthMonitor: } WARNING_LOG_KEYWORDS = [ - 'i/o error', 'ata error', 'scsi error', - 'task hung', 'blocked for more than', - 'failed to start', 'service.*failed', + # Storage I/O errors + 'i/o error', 'buffer i/o error', 'ata error', 'scsi error', 'disk.*offline', 'disk.*removed', - 'segfault', # WARNING by default; escalated to CRITICAL only for PVE processes + # CPU/IO blocking + 'task hung', 'blocked for more than', 'soft lockup', + # Service failures + 'failed to start', 'service.*failed', + 'entering failed state', 'code=exited, status=', 'code=killed', + # Process crashes (WARNING by default; escalated to CRITICAL for PVE processes) + 'segfault', + # Cluster/Network warnings + 'corosync.*failed', 'corosync.*timeout', + 'connection lost', 'totem.*failed', + 'entered disabled state', 
'entered blocking state', ] # PVE Critical Services @@ -769,12 +786,30 @@ class HealthMonitor: if len(critical_samples) >= 3: status = 'CRITICAL' reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s' + # Record the error + health_persistence.record_error( + error_key='cpu_usage', + category='cpu', + severity='CRITICAL', + reason=reason, + details={'cpu_percent': cpu_percent} + ) elif len(warning_samples) >= 3 and len(recovery_samples) < 2: status = 'WARNING' reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s' + # Record the warning + health_persistence.record_error( + error_key='cpu_usage', + category='cpu', + severity='WARNING', + reason=reason, + details={'cpu_percent': cpu_percent} + ) else: status = 'OK' reason = None + # CPU is normal - auto-resolve any existing CPU errors + health_persistence.resolve_error('cpu_usage', 'CPU usage returned to normal') temp_status = self._check_cpu_temperature() diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index c95db4bf..948ac701 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -967,10 +967,12 @@ class HealthPersistence: cutoff_events = (now - timedelta(days=30)).isoformat() cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,)) - # ── Auto-resolve transient errors after system stabilizes ── - # Transient errors (OOM, high CPU, service failures) resolve themselves. - # If the system has been up for >10 minutes and these errors haven't recurred, - # they are stale and should be auto-resolved. + # ══════════════════════════════════════════════════════════════════════ + # SMART AUTO-RESOLVE: Based on system state, not hardcoded patterns + # ══════════════════════════════════════════════════════════════════════ + # Logic: If an error hasn't been seen recently AND the system is healthy, + # the error is stale and should be auto-resolved. 
+ # This works for ANY error pattern, not just predefined ones. try: import psutil # Get system uptime @@ -979,9 +981,13 @@ class HealthPersistence: # Only auto-resolve if system has been stable for at least 10 minutes if uptime_seconds > 600: # 10 minutes - stale_cutoff = (now - timedelta(minutes=10)).isoformat() + current_cpu = psutil.cpu_percent(interval=0.1) + current_mem = psutil.virtual_memory().percent - # 1. Resolve transient log errors (OOM, service failures) + # ── 1. LOGS category: Auto-resolve if not seen in 15 minutes ── + # Log errors are transient - if journalctl hasn't reported them recently, + # they are from a previous state and should be resolved. + stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat() cursor.execute(''' UPDATE errors SET resolved_at = ? @@ -989,49 +995,69 @@ class HealthPersistence: AND resolved_at IS NULL AND acknowledged = 0 AND last_seen < ? - AND (error_key LIKE 'log_critical_%' - OR error_key LIKE 'log_persistent_%' - OR reason LIKE '%Out of memory%' - OR reason LIKE '%Recurring error%' - OR reason LIKE '%service%Failed%' - OR reason LIKE '%timeout%' - OR reason LIKE '%critical error%') - ''', (now_iso, stale_cutoff)) + ''', (now_iso, stale_logs_cutoff)) - # 2. Auto-resolve CPU errors if current CPU is normal (<75%) - try: - current_cpu = psutil.cpu_percent(interval=0.1) - if current_cpu < 75: - cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE category = 'temperature' - AND resolved_at IS NULL - AND acknowledged = 0 - AND last_seen < ? - AND (error_key = 'cpu_usage' - OR reason LIKE '%CPU >%sustained%' - OR reason LIKE '%Sustained high CPU%') - ''', (now_iso, stale_cutoff)) - except Exception: - pass + # ── 2. CPU category: Auto-resolve if CPU is normal (<75%) ── + if current_cpu < 75: + stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? 
+ WHERE (category = 'cpu' OR category = 'temperature') + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%') + ''', (now_iso, stale_cpu_cutoff)) - # 3. Auto-resolve memory errors if current memory is normal (<80%) - try: - current_mem = psutil.virtual_memory().percent - if current_mem < 80: - cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE category = 'memory' - AND resolved_at IS NULL - AND acknowledged = 0 - AND last_seen < ? - AND (reason LIKE '%Memory >%' - OR reason LIKE '%RAM usage%') - ''', (now_iso, stale_cutoff)) - except Exception: - pass + # ── 3. MEMORY category: Auto-resolve if memory is normal (<80%) ── + if current_mem < 80: + stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE (category = 'memory' OR category = 'logs') + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND (error_key LIKE '%oom%' + OR error_key LIKE '%memory%' + OR reason LIKE '%memory%' + OR reason LIKE '%OOM%' + OR reason LIKE '%killed%process%') + ''', (now_iso, stale_mem_cutoff)) + + # ── 4. VMS category: Auto-resolve if VM/CT is now running ── + # Check all active VM/CT errors and resolve if the VM/CT is now running + cursor.execute(''' + SELECT error_key, category FROM errors + WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%' OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%') + AND resolved_at IS NULL + AND acknowledged = 0 + ''') + vm_errors = cursor.fetchall() + for error_key, cat in vm_errors: + # Extract VM/CT ID from error_key + import re + vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key) + if vmid_match: + vmid = vmid_match.group(1) + # Check if running - this auto-resolves if so + self.check_vm_running(vmid) + + # ── 5. 
GENERIC: Any error not seen in 30 minutes while system is healthy ── + # If CPU < 80% and Memory < 85% and error hasn't been seen in 30 min, + # the system has recovered and the error is stale. + if current_cpu < 80 and current_mem < 85: + stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND category NOT IN ('disks', 'storage') + ''', (now_iso, stale_generic_cutoff)) except Exception: pass # If we can't read uptime, skip this cleanup @@ -1166,9 +1192,20 @@ class HealthPersistence: """Extract VM/CT ID from error message or key.""" if not text: return None - # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc. - match = re.search(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', text, re.IGNORECASE) - return match.group(1) if match else None + # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", "VM/CT 100", "qemu/100", "lxc/100", etc. 
+ patterns = [ + r'(?:VM|CT|VMID|CTID|vm_|ct_|vmct_)[\s_]?(\d{3,})', # VM 100, ct_100 + r'VM/CT[\s_]?(\d{3,})', # VM/CT 100 + r'(?:qemu|lxc)[/\\](\d{3,})', # qemu/100, lxc/100 + r'process.*kvm.*?(\d{3,})', # process kvm with vmid + r'Failed to start.*?(\d{3,})', # Failed to start VM/CT + r'starting.*?(\d{3,}).*failed', # starting 100 failed + ] + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) + return None def get_age_hours(timestamp_str): """Get age in hours from ISO timestamp string.""" @@ -1189,11 +1226,20 @@ class HealthPersistence: # === VM/CT ERRORS === # Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys) - if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))): - vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason) - if vmid and not check_vm_ct_cached(vmid): + # Also check if the reason mentions a VM/CT that no longer exists + vmid_from_key = extract_vmid_from_text(error_key) if error_key else None + vmid_from_reason = extract_vmid_from_text(reason) if reason else None + vmid = vmid_from_key or vmid_from_reason + + if vmid and not check_vm_ct_cached(vmid): + # VM/CT doesn't exist - resolve regardless of category + should_resolve = True + resolution_reason = f'VM/CT {vmid} deleted' + elif category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))): + # VM/CT category but ID couldn't be extracted - resolve if stale + if not vmid and last_seen_hours > 1: should_resolve = True - resolution_reason = 'VM/CT deleted' + resolution_reason = 'VM/CT error stale (>1h, ID not found)' # === DISK ERRORS === # Check if disk device or ZFS pool still exists @@ -1360,8 +1406,17 @@ class HealthPersistence: def check_vm_running(self, vm_id: str) -> bool: """ - Check if a VM/CT is running 
and resolve error if so. + Check if a VM/CT is running and resolve TRANSIENT errors if so. Also resolves error if VM/CT no longer exists. + + Only resolves errors that are likely to be fixed by a restart: + - QMP command failures + - Startup failures (generic) + + Does NOT resolve persistent configuration errors like: + - Device missing + - Permission issues + Returns True if running/resolved, False otherwise. """ import subprocess @@ -1369,6 +1424,8 @@ class HealthPersistence: try: vm_exists = False ct_exists = False + is_running = False + vm_type = None # Check qm status for VMs result_vm = subprocess.run( @@ -1380,32 +1437,59 @@ class HealthPersistence: if result_vm.returncode == 0: vm_exists = True + vm_type = 'vm' if 'running' in result_vm.stdout.lower(): - self.resolve_error(f'vm_{vm_id}', 'VM started') - self.resolve_error(f'vmct_{vm_id}', 'VM started') - return True + is_running = True # Check pct status for containers - result_ct = subprocess.run( - ['pct', 'status', vm_id], - capture_output=True, - text=True, - timeout=2 - ) + if not vm_exists: + result_ct = subprocess.run( + ['pct', 'status', vm_id], + capture_output=True, + text=True, + timeout=2 + ) + + if result_ct.returncode == 0: + ct_exists = True + vm_type = 'ct' + if 'running' in result_ct.stdout.lower(): + is_running = True - if result_ct.returncode == 0: - ct_exists = True - if 'running' in result_ct.stdout.lower(): - self.resolve_error(f'ct_{vm_id}', 'Container started') - self.resolve_error(f'vmct_{vm_id}', 'Container started') - return True - - # If neither VM nor CT exists, resolve all related errors + # If neither VM nor CT exists, resolve ALL related errors if not vm_exists and not ct_exists: self.resolve_error(f'vm_{vm_id}', 'VM/CT deleted') self.resolve_error(f'ct_{vm_id}', 'VM/CT deleted') self.resolve_error(f'vmct_{vm_id}', 'VM/CT deleted') - return True # Error resolved because resource doesn't exist + return True + + # If running, only resolve TRANSIENT errors (QMP, startup) + # Do 
NOT resolve persistent config errors (device missing, permissions) + if is_running: + conn = self._get_conn() + cursor = conn.cursor() + + # Get the error details to check if it's a persistent config error + for prefix in (f'{vm_type}_{vm_id}', f'vmct_{vm_id}'): + cursor.execute(''' + SELECT error_key, reason FROM errors + WHERE error_key = ? AND resolved_at IS NULL + ''', (prefix,)) + row = cursor.fetchone() + if row: + reason = (row[1] or '').lower() + # Check if this is a persistent config error that won't be fixed by restart + is_persistent_config = any(indicator in reason for indicator in [ + 'device', 'missing', 'does not exist', 'permission', + 'not found', 'no such', 'invalid' + ]) + + if not is_persistent_config: + # Transient error - resolve it + self.resolve_error(prefix, f'{vm_type.upper()} started successfully') + + conn.close() + return True return False diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 5393e23c..cc00e136 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -174,8 +174,9 @@ def capture_journal_context(keywords: list, lines: int = 30, return "" # Use journalctl with grep to filter relevant lines + # Use -b 0 to only include logs from the current boot (not previous boots) cmd = ( - f"journalctl --since='{since}' --no-pager -n 500 2>/dev/null | " + f"journalctl -b 0 --since='{since}' --no-pager -n 500 2>/dev/null | " f"grep -iE '{pattern}' | tail -n {lines}" ) @@ -1800,6 +1801,8 @@ class PollingCollector: # Key = health_persistence category name # Value = minimum seconds between notifications for the same error_key _CATEGORY_COOLDOWNS = { + # Category cooldown: minimum time between DIFFERENT errors of the same category + # This prevents notification storms when multiple issues arise together 'disks': 86400, # 24h - I/O errors are persistent hardware issues 'smart': 86400, # 24h - SMART errors same as I/O 'zfs': 86400, # 24h - ZFS pool 
issues are persistent @@ -1809,6 +1812,7 @@ class PollingCollector: 'temperature': 3600, # 1h - temp can fluctuate near thresholds 'logs': 3600, # 1h - repeated log patterns 'vms': 1800, # 30m - VM state oscillation + 'vmct': 1800, # 30m - VM/CT state oscillation 'security': 3600, # 1h - auth failures tend to be bursty 'cpu': 1800, # 30m - CPU spikes can be transient 'memory': 1800, # 30m - memory pressure oscillation @@ -1816,6 +1820,10 @@ class PollingCollector: 'updates': 86400, # 24h - update info doesn't change fast } + # Global cooldown: minimum time before the SAME error can be re-notified + # This is independent of category - same error_key cannot repeat before this time + SAME_ERROR_COOLDOWN = 86400 # 24 hours + _ENTITY_MAP = { 'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''), 'load': ('node', ''), @@ -2032,15 +2040,20 @@ class PollingCollector: # Determine if we should notify is_new = error_key not in self._known_errors last_sent = self._last_notified.get(error_key, 0) - cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL) - is_due = (now - last_sent) >= cat_cooldown + time_since_last = now - last_sent + + # ── SAME ERROR COOLDOWN (24h) ── + # The SAME error_key cannot be re-notified before 24 hours. + # This is the PRIMARY deduplication mechanism. + if time_since_last < self.SAME_ERROR_COOLDOWN: + continue + + # ── CATEGORY COOLDOWN (varies) ── + # DIFFERENT errors within the same category respect category cooldown. + # This prevents notification storms when multiple issues arise together. + cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL) + is_due = time_since_last >= cat_cooldown - # Anti-oscillation: even if "new" (resolved then reappeared), - # respect the per-category cooldown interval. This prevents - # "semi-cascades" where the same root cause generates multiple - # slightly different notifications across health check cycles. 
- # Each category has its own appropriate cooldown (30m for network, - # 24h for disks, 1h for temperature, etc.). if not is_due: continue