From dc52f4c692652b554e4ffbc459fd17481871d900 Mon Sep 17 00:00:00 2001
From: MacRimi
Date: Sun, 1 Mar 2026 18:44:11 +0100
Subject: [PATCH] Update notification service

---
 AppImage/components/health-status-modal.tsx |   1 +
 AppImage/scripts/health_monitor.py          | 420 +++++++++++++++-----
 AppImage/scripts/health_persistence.py      |  27 ++
 AppImage/scripts/notification_events.py     | 160 +++++---
 AppImage/scripts/notification_templates.py  |  20 +-
 5 files changed, 478 insertions(+), 150 deletions(-)

diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx
index eb9cba0f..7a1f8642 100644
--- a/AppImage/components/health-status-modal.tsx
+++ b/AppImage/components/health-status-modal.tsx
@@ -383,6 +383,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
       log_persistent_errors: "Persistent Errors",
       log_critical_errors: "Critical Errors",
       // Updates
+      pve_version: "Proxmox VE Version",
       security_updates: "Security Updates",
       system_age: "System Age",
       pending_updates: "Pending Updates",
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index 7c818ac4..119269a7 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -71,11 +71,12 @@ class HealthMonitor:
     LOG_CHECK_INTERVAL = 300
 
     # Updates Thresholds
-    UPDATES_WARNING = 365   # Only warn after 1 year without updates
-    UPDATES_CRITICAL = 730  # Critical after 2 years
+    UPDATES_WARNING = 365     # Only warn after 1 year without updates (system_age)
+    UPDATES_CRITICAL = 548    # Critical after 18 months without updates
+    SECURITY_WARN_DAYS = 360  # Security updates only become WARNING after 360 days unpatched
 
     BENIGN_ERROR_PATTERNS = [
-        # Proxmox specific benign patterns
+        # ── Proxmox API / proxy operational noise ──
         r'got inotify poll request in wrong process',
         r'auth key pair too old, rotating',
         r'proxy detected vanished client connection',
@@ -84,33 +85,62 @@ class HealthMonitor:
         r'disconnect peer',
         r'task OK',
         r'backup finished',
+        # PVE ticket / auth transient errors (web UI session expiry, API token
+        # refresh, brute-force bots). These are logged at WARNING/ERR level
+        # but are NOT system problems -- they are access-control events.
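+        # Illustrative journal lines (hypothetical host/PID/IP values) that
+        # the patterns below are meant to swallow:
+        #   pveproxy[1234]: authentication failure; rhost=::ffff:203.0.113.7 user=root@pam msg=...
+        #   pvedaemon[5678]: permission denied - invalid PVE ticket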
+        r'invalid PVE ticket',
+        r'authentication failure.*pve',
+        r'permission denied.*ticket',
+        r'no ticket',
+        r'CSRF.*failed',
+        r'pveproxy\[\d+\]: authentication failure',
+        r'pvedaemon\[\d+\]: authentication failure',
+        # PVE cluster/corosync normal chatter
+        r'corosync.*retransmit',
+        r'corosync.*delivering',
+        r'pmxcfs.*update',
+        r'pve-cluster\[\d+\]:.*status',
 
-        # Systemd informational messages
+        # ── Systemd informational messages ──
         r'(started|starting|stopped|stopping) session',
         r'session \d+ logged (in|out)',
         r'new session \d+ of user',
         r'removed session \d+',
         r'user@\d+\.service:',
         r'user runtime directory',
+        # Systemd service restarts (normal lifecycle)
+        r'systemd\[\d+\]: .+\.service: (Scheduled restart|Consumed)',
+        r'systemd\[\d+\]: .+\.service: Deactivated successfully',
 
-        # Network transient errors (common and usually self-recovering)
+        # ── Network transient errors (common and usually self-recovering) ──
         r'dhcp.*timeout',
         r'temporary failure in name resolution',
         r'network is unreachable',
         r'no route to host',
 
-        # Backup and sync normal warnings
+        # ── Backup and sync normal warnings ──
         r'rsync.*vanished',
         r'backup job .* finished',
         r'vzdump backup .* finished',
 
-        # ZFS informational
+        # ── ZFS informational ──
         r'zfs.*scrub (started|finished|in progress)',
         r'zpool.*resilver',
 
-        # LXC/Container normal operations
+        # ── LXC/Container normal operations ──
         r'lxc.*monitor',
         r'systemd\[1\]: (started|stopped) .*\.scope',
+
+        # ── ATA/SCSI transient bus errors ──
+        # These are logged at ERR level but are common on SATA controllers
+        # during hot-plug, link renegotiation, or cable noise. They are NOT
+        # indicative of disk failure unless SMART also reports problems.
+        r'ata\d+.*SError.*BadCRC',
+        r'ata\d+.*Emask 0x10.*ATA bus error',
+        r'failed command: (READ|WRITE) FPDMA QUEUED',
+        r'ata\d+.*hard resetting link',
+        r'ata\d+.*link is slow',
+        r'ata\d+.*COMRESET',
     ]
 
     CRITICAL_LOG_KEYWORDS = [
@@ -120,14 +150,23 @@ class HealthMonitor:
         'ext4-fs error', 'xfs.*corruption',
         'lvm activation failed',
         'hardware error', 'mce:',
-        'segfault', 'general protection fault'
+        'general protection fault',
     ]
 
+    # Segfault is WARNING, not CRITICAL -- only PVE-critical process
+    # segfaults are escalated to CRITICAL in _classify_log_severity.
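+    # e.g. a (hypothetical) "pvedaemon[2020]: segfault at 7f..." is escalated
+    # to CRITICAL, while "chromium[3456]: segfault at 0" stays WARNING.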
+    PVE_CRITICAL_PROCESSES = {
+        'pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster',
+        'corosync', 'qemu-system', 'lxc-start', 'ceph-osd',
+        'ceph-mon', 'pmxcfs', 'kvm',
+    }
+
     WARNING_LOG_KEYWORDS = [
         'i/o error', 'ata error', 'scsi error',
         'task hung', 'blocked for more than',
         'failed to start', 'service.*failed',
-        'disk.*offline', 'disk.*removed'
+        'disk.*offline', 'disk.*removed',
+        'segfault',  # WARNING by default; escalated to CRITICAL only for PVE processes
     ]
 
     # PVE Critical Services
@@ -1483,7 +1522,7 @@ class HealthMonitor:
             else:
                 health_persistence.resolve_error(error_key, 'Disk errors cleared')
 
-        # Also include active filesystem errors (detected by _check_system_logs
+        # Also include active filesystem errors (detected by _check_log_analysis
        # and cross-referenced to the 'disks' category)
         try:
             fs_errors = health_persistence.get_active_errors(category='disks')
@@ -1491,6 +1530,11 @@ class HealthMonitor:
                 err_key = err.get('error_key', '')
                 if not err_key.startswith('disk_fs_'):
                     continue  # Only filesystem cross-references
+
+                # Skip acknowledged/dismissed errors
+                if err.get('acknowledged') == 1:
+                    continue
+
                 details = err.get('details', {})
                 if isinstance(details, str):
                     try:
                         details = _json.loads(details)
                     except Exception:
                         details = {}
+
+                device = details.get('device', err_key.replace('disk_fs_', '/dev/'))
+                base_disk = details.get('disk', '')
+
+                # Check if the device still exists. If not, auto-resolve
+                # the error -- it was likely a disconnected USB/temp device.
+                dev_path = f'/dev/{base_disk}' if base_disk else device
+                if not os.path.exists(dev_path):
+                    health_persistence.resolve_error(
+                        err_key, 'Device no longer present in system')
+                    continue
+
+                # Cross-reference with SMART: if SMART is healthy for
+                # this disk, downgrade to INFO (transient fs error).
+                severity = err.get('severity', 'WARNING')
+                if base_disk:
+                    smart_health = self._quick_smart_health(base_disk)
+                    if smart_health == 'PASSED' and severity == 'CRITICAL':
+                        severity = 'WARNING'
+
                 if device not in disk_results:
                     disk_results[device] = {
-                        'status': err.get('severity', 'CRITICAL'),
+                        'status': severity,
                         'reason': err.get('reason', 'Filesystem error'),
-                        'device': details.get('disk', ''),
+                        'device': base_disk,
                         'error_count': 1,
                         'error_type': 'filesystem',
-                        'dismissable': False,
+                        'dismissable': True,
                         'error_key': err_key,
                     }
         except Exception:
@@ -2303,6 +2366,9 @@ class HealthMonitor:
         if 'segfault' in line_lower:
             m = re.search(r'(\S+)\[\d+\].*segfault', line)
             process = m.group(1) if m else 'unknown'
+            is_critical_proc = any(p in process.lower() for p in self.PVE_CRITICAL_PROCESSES)
+            if is_critical_proc:
+                return f'Critical process "{process}" crashed (segmentation fault) -- PVE service affected'
             return f'Process "{process}" crashed (segmentation fault)'
 
         # Hardware error
@@ -2324,31 +2390,43 @@ class HealthMonitor:
         """
         Classify log line severity intelligently.
         Returns: 'CRITICAL', 'WARNING', or None (benign/info)
+
+        Design principles:
+        - CRITICAL must be reserved for events that require IMMEDIATE action
+          (data loss risk, service outage, hardware failure confirmed by SMART).
+        - WARNING is for events worth investigating but not urgent.
+        - Everything else is None (benign/informational).
         """
         line_lower = line.lower()
 
-        # Check if benign first
+        # Check if benign first -- fast path for known noise
         if self._is_benign_error(line):
             return None
 
-        # Check critical keywords
+        # Check critical keywords (hard failures: OOM, panic, FS corruption, etc.)
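+        # e.g. "EXT4-fs error (device sda1): ..." or "mce: [Hardware Error]..."
+        # short-circuit here as CRITICAL (illustrative lines, not exhaustive).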
         for keyword in self.CRITICAL_LOG_KEYWORDS:
             if re.search(keyword, line_lower):
                 return 'CRITICAL'
 
-        # Check warning keywords
+        # Check warning keywords (includes segfault, I/O errors, etc.)
         for keyword in self.WARNING_LOG_KEYWORDS:
             if re.search(keyword, line_lower):
+                # Special case: segfault of a PVE-critical process is CRITICAL
+                if 'segfault' in line_lower:
+                    for proc in self.PVE_CRITICAL_PROCESSES:
+                        if proc in line_lower:
+                            return 'CRITICAL'
                 return 'WARNING'
 
-        # Generic error/warning classification based on common terms
-        if 'critical' in line_lower or 'fatal' in line_lower or 'panic' in line_lower:
+        # Generic classification -- very conservative to avoid false positives.
+        # Only escalate if the line explicitly uses severity-level keywords
+        # from the kernel or systemd (not just any line containing "error").
+        if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower):
             return 'CRITICAL'
-        elif 'error' in line_lower or 'fail' in line_lower:
-            return 'WARNING'
-        elif 'warning' in line_lower or 'warn' in line_lower:
-            return None  # Generic warnings are often informational and not critical
 
+        # Lines from priority "err" that don't match any keyword above are
+        # likely informational noise (e.g. "error response from daemon").
+        # Return None to avoid flooding the dashboard with non-actionable items.
         return None
 
     def _check_logs_with_persistence(self) -> Dict[str, Any]:
@@ -2424,18 +2502,61 @@ class HealthMonitor:
                     pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
                     error_key = f'log_critical_{pattern_hash}'
 
+                    # ── SMART cross-reference for disk/FS errors ──
+                    # Filesystem and disk errors are only truly CRITICAL if
+                    # the underlying disk is actually failing. We check:
+                    #   1. Device exists?  No -> WARNING (disconnected USB, etc.)
+                    #   2. SMART PASSED?   -> WARNING (transient error, not disk failure)
+                    #   3. SMART FAILED?   -> CRITICAL (confirmed hardware problem)
+                    #   4. SMART UNKNOWN?  -> WARNING (can't confirm, err on side of caution)
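+                    # Illustrative kernel line (hypothetical device) that the
+                    # regex below extracts "sdb1" from:
+                    #   EXT4-fs error (device sdb1): ext4_find_entry:1463: inode #2: comm ls: reading directory lblock 0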
+                    fs_dev_match = re.search(
+                        r'(?:ext4-fs|btrfs|xfs|zfs)\s+error.*?device\s+(\S+?)\)?[:\s]',
+                        line, re.IGNORECASE
+                    )
+                    smart_status_for_log = None
+                    if fs_dev_match:
+                        fs_dev = fs_dev_match.group(1).rstrip(')')
+                        base_dev = re.sub(r'\d+$', '', fs_dev)
+                        if not os.path.exists(f'/dev/{base_dev}'):
+                            # Device not present -- almost certainly a disconnected drive
+                            severity = 'WARNING'
+                            smart_status_for_log = 'DEVICE_ABSENT'
+                        elif self.capabilities.get('has_smart'):
+                            smart_health = self._quick_smart_health(base_dev)
+                            smart_status_for_log = smart_health
+                            if smart_health == 'PASSED':
+                                # SMART says disk is healthy -- transient FS error
+                                severity = 'WARNING'
+                            elif smart_health == 'UNKNOWN':
+                                # Can't verify -- be conservative, don't alarm
+                                severity = 'WARNING'
+                            # smart_health == 'FAILED' -> keep CRITICAL
+
                     if pattern not in critical_errors_found:
-                        critical_errors_found[pattern] = line
+                        # Only count as "critical" if severity wasn't downgraded
+                        if severity == 'CRITICAL':
+                            critical_errors_found[pattern] = line
 
                     # Build a human-readable reason from the raw log line
                     enriched_reason = self._enrich_critical_log_reason(line)
+
+                    # Append SMART context to the reason if we checked it
+                    if smart_status_for_log == 'PASSED':
+                        enriched_reason += '\nSMART: Passed (disk is healthy -- error is likely transient)'
+                    elif smart_status_for_log == 'FAILED':
+                        enriched_reason += '\nSMART: FAILED -- disk is failing, replace immediately'
+                    elif smart_status_for_log == 'DEVICE_ABSENT':
+                        enriched_reason += '\nDevice not currently detected -- may be a disconnected USB or temporary device'
+
                     # Record persistent error if it's not already active
                     if not health_persistence.is_error_active(error_key, category='logs'):
                         health_persistence.record_error(
                             error_key=error_key,
                             category='logs',
-                            severity='CRITICAL',
+                            severity=severity,
                             reason=enriched_reason,
-                            details={'pattern': pattern, 'raw_line': line[:200], 'dismissable': True}
+                            details={'pattern': pattern, 'raw_line': line[:200],
+                                     'smart_status': smart_status_for_log,
+                                     'dismissable': True}
                         )
 
                     # Cross-reference: filesystem errors also belong in the disks category
@@ -2446,11 +2567,23 @@ class HealthMonitor:
                         # Strip partition number to get base disk (sdb1 -> sdb)
                         base_device = re.sub(r'\d+$', '', fs_device) if not ('nvme' in fs_device or 'mmcblk' in fs_device) else fs_device.rsplit('p', 1)[0] if 'p' in fs_device else fs_device
                         disk_error_key = f'disk_fs_{fs_device}'
+
+                        # Use the SMART-aware severity we already determined above
+                        device_exists = os.path.exists(f'/dev/{base_device}')
+                        if not device_exists:
+                            fs_severity = 'WARNING'
+                        elif smart_status_for_log == 'PASSED':
+                            fs_severity = 'WARNING'   # SMART healthy -> transient
+                        elif smart_status_for_log == 'FAILED':
+                            fs_severity = 'CRITICAL'  # SMART failing -> real problem
+                        else:
+                            fs_severity = 'WARNING'   # Can't confirm -> conservative
+
                         if not health_persistence.is_error_active(disk_error_key, category='disks'):
                             health_persistence.record_error(
                                 error_key=disk_error_key,
                                 category='disks',
-                                severity='CRITICAL',
+                                severity=fs_severity,
                                 reason=enriched_reason,
                                 details={
                                     'disk': base_device,
@@ -2458,7 +2591,9 @@ class HealthMonitor:
                                     'error_type': 'filesystem',
                                     'error_count': 1,
                                     'sample': line[:200],
-                                    'dismissable': False
+                                    'smart_status': smart_status_for_log,
+                                    'dismissable': True,
+                                    'device_exists': device_exists,
                                 }
                             )
 
@@ -2529,11 +2664,17 @@ class HealthMonitor:
                     # Use the original sample line for the notification,
                     # not the normalized pattern (which has IDs replaced).
                     sample = data.get('sample', pattern)
+                    # Strip journal timestamp prefix so the stored reason
+                    # doesn't contain dated information that confuses
+                    # re-notifications.
+                    clean_sample = re.sub(
+                        r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample
+                    )
                     health_persistence.record_error(
                         error_key=error_key,
                         category='logs',
                         severity='WARNING',
-                        reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
+                        reason=f'Recurring error ({data["count"]}x): {clean_sample[:150]}',
                         details={'pattern': pattern, 'sample': sample, 'dismissable': True, 'occurrences': data['count']}
                     )
@@ -2707,12 +2848,31 @@ class HealthMonitor:
 
         return pattern[:150]  # Keep first 150 characters to avoid overly long patterns
 
+    # Regex to parse Inst lines: Inst <pkg> [<current-ver>] (<new-ver> <repo> [<arch>])
+    _RE_INST = re.compile(r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+')
+    _RE_INST_NEW = re.compile(r'^Inst\s+(\S+)\s+\((\S+)\s+')
+
+    _PVE_PREFIXES = (
+        'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
+        'corosync', 'libpve', 'pbs-', 'pmg-',
+    )
+    _KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
+    _IMPORTANT_PKGS = {
+        'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
+        'pve-ha-manager', 'pve-firewall', 'ceph-common',
+        'proxmox-backup-client',
+    }
+
     def _check_updates(self) -> Optional[Dict[str, Any]]:
         """
         Check for pending system updates.
-        - WARNING: Security updates available, or system not updated >1 year (365 days).
+        - INFO: Any updates available (including security updates).
+        - WARNING: Security updates pending 360+ days unpatched, or system not updated >1 year (365 days).
         - CRITICAL: System not updated >18 months (548 days).
-        - INFO: Kernel/PVE updates available, or >50 non-security updates pending.
+
+        Updates are always informational unless they represent a prolonged
+        unpatched state. Detects PVE version upgrades from pve-manager
+        Inst lines and exposes them as an INFO sub-check.
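+
+        Example ``apt-get upgrade --dry-run`` line parsed by _RE_INST
+        (package and versions are illustrative):
+
+            Inst pve-manager [8.2.4] (8.3.1 Proxmox:bookworm [amd64])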
""" cache_key = 'updates_check' current_time = time.time() @@ -2734,150 +2894,214 @@ class HealthMonitor: days_since_update = (current_time - mtime) / 86400 last_update_days = int(days_since_update) except Exception: - pass # Ignore if mtime fails + pass # Perform a dry run of apt-get upgrade to see pending packages try: result = subprocess.run( ['apt-get', 'upgrade', '--dry-run'], - capture_output=True, - text=True, - timeout=10 + capture_output=True, text=True, timeout=30 ) except subprocess.TimeoutExpired: print("[HealthMonitor] apt-get upgrade --dry-run timed out") return { 'status': 'UNKNOWN', 'reason': 'apt-get timed out - repository may be unreachable', - 'count': 0, - 'checks': {} + 'count': 0, 'checks': {} } status = 'OK' reason = None update_count = 0 - security_updates_packages = [] - kernel_pve_updates_packages = [] + security_pkgs: list = [] + kernel_pkgs: list = [] + pve_pkgs: list = [] + important_pkgs: list = [] # {name, cur, new} + pve_manager_info = None # {cur, new} or None sec_result = None + sec_severity = 'INFO' + sec_days_unpatched = 0 if result.returncode == 0: - lines = result.stdout.strip().split('\n') + for line in result.stdout.strip().split('\n'): + if not line.startswith('Inst '): + continue + update_count += 1 + + # Parse package name, current and new versions + m = self._RE_INST.match(line) + if m: + pkg_name, cur_ver, new_ver = m.group(1), m.group(2), m.group(3) + else: + m2 = self._RE_INST_NEW.match(line) + if m2: + pkg_name, cur_ver, new_ver = m2.group(1), '', m2.group(2) + else: + parts = line.split() + pkg_name = parts[1] if len(parts) > 1 else 'unknown' + cur_ver, new_ver = '', '' + + # Strip arch suffix (e.g. package:amd64) + pkg_name = pkg_name.split(':')[0] + name_lower = pkg_name.lower() + line_lower = line.lower() + + # Categorise + if 'security' in line_lower or 'debian-security' in line_lower: + security_pkgs.append(pkg_name) + + if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES): + kernel_pkgs.append(pkg_name) + elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES): + pve_pkgs.append(pkg_name) + + # Collect important packages with version info + if pkg_name in self._IMPORTANT_PKGS and cur_ver: + important_pkgs.append({ + 'name': pkg_name, 'cur': cur_ver, 'new': new_ver + }) + + # Detect pve-manager upgrade -> PVE version upgrade + if pkg_name == 'pve-manager' and cur_ver and new_ver: + pve_manager_info = {'cur': cur_ver, 'new': new_ver} - for line in lines: - # 'Inst ' indicates a package will be installed/upgraded - if line.startswith('Inst '): - update_count += 1 - line_lower = line.lower() - package_name = line.split()[1].split(':')[0] # Get package name, strip arch if present - - # Check for security updates (common pattern in repo names) - if 'security' in line_lower or 'debian-security' in line_lower: - security_updates_packages.append(package_name) - - # Check for kernel or critical PVE updates - if any(pkg in line_lower for pkg in ['linux-image', 'pve-kernel', 'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-api-core']): - kernel_pve_updates_packages.append(package_name) - - # Determine overall status based on findings - if security_updates_packages: - status = 'WARNING' - reason = f'{len(security_updates_packages)} security update(s) available' - # Record persistent error for security updates to ensure it's visible + # ── Determine overall status ────────────────────── + if security_pkgs: + sec_days_unpatched = 0 + try: + existing = health_persistence.get_error_by_key('security_updates') + if existing and 
+                        from datetime import datetime
+                        first_dt = datetime.fromisoformat(existing['first_seen'])
+                        sec_days_unpatched = (datetime.now() - first_dt).days
+                except Exception:
+                    pass
+
+                if sec_days_unpatched >= self.SECURITY_WARN_DAYS:
+                    status = 'WARNING'
+                    reason = f'{len(security_pkgs)} security update(s) pending for {sec_days_unpatched} days'
+                    sec_severity = 'WARNING'
+                else:
+                    status = 'INFO'
+                    reason = f'{len(security_pkgs)} security update(s) pending'
+                    sec_severity = 'INFO'
+
                 sec_result = health_persistence.record_error(
                     error_key='security_updates',
                     category='updates',
-                    severity='WARNING',
+                    severity=sec_severity,
                     reason=reason,
-                    details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5], 'dismissable': True}
+                    details={'count': len(security_pkgs), 'packages': security_pkgs[:5],
+                             'dismissable': sec_severity == 'WARNING',
+                             'days_unpatched': sec_days_unpatched}
                 )
-                # If previously dismissed, downgrade to INFO
                 if sec_result and sec_result.get('type') == 'skipped_acknowledged':
                     status = 'INFO'
                     reason = None
+
             elif last_update_days and last_update_days >= 548:
-                # 18+ months without updates - CRITICAL
                 status = 'CRITICAL'
                 reason = f'System not updated in {last_update_days} days (>18 months)'
                 health_persistence.record_error(
-                    error_key='system_age',
-                    category='updates',
-                    severity='CRITICAL',
-                    reason=reason,
+                    error_key='system_age', category='updates',
+                    severity='CRITICAL', reason=reason,
                     details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
                 )
             elif last_update_days and last_update_days >= 365:
-                # 1+ year without updates - WARNING
                 status = 'WARNING'
                 reason = f'System not updated in {last_update_days} days (>1 year)'
                 age_result = health_persistence.record_error(
-                    error_key='system_age',
-                    category='updates',
-                    severity='WARNING',
-                    reason=reason,
+                    error_key='system_age', category='updates',
+                    severity='WARNING', reason=reason,
                     details={'days': last_update_days, 'update_count': update_count, 'dismissable': True}
                 )
                 if age_result and age_result.get('type') == 'skipped_acknowledged':
                     status = 'INFO'
                     reason = None
-            elif kernel_pve_updates_packages:
-                # Informational: Kernel or critical PVE components need update
+            elif kernel_pkgs or pve_pkgs:
                 status = 'INFO'
-                reason = f'{len(kernel_pve_updates_packages)} kernel/PVE update(s) available'
-            elif update_count > 50:
-                # Informational: Large number of pending updates
+                reason = f'{len(kernel_pkgs)} kernel + {len(pve_pkgs)} Proxmox update(s) available'
+            elif update_count > 0:
                 status = 'INFO'
-                reason = f'{update_count} updates pending (consider maintenance window)'
+                reason = f'{update_count} package update(s) pending'
 
-        # If apt-get upgrade --dry-run failed
         elif result.returncode != 0:
             status = 'WARNING'
             reason = 'Failed to check for updates (apt-get error)'
 
-        # Build checks dict for updates sub-items
+        # ── Build checks dict ─────────────────────────────────
         age_dismissed = bool(age_result and age_result.get('type') == 'skipped_acknowledged')
         update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else (
             'INFO' if age_dismissed else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK'))
-        sec_dismissed = security_updates_packages and sec_result and sec_result.get('type') == 'skipped_acknowledged'
-        sec_status = 'INFO' if sec_dismissed else ('WARNING' if security_updates_packages else 'OK')
-        kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
+
+        sec_dismissed = security_pkgs and sec_result and sec_result.get('type') == 'skipped_acknowledged'
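+        # Precedence: a user dismissal always wins and renders as INFO;
+        # otherwise the severity computed above (INFO or WARNING) flows through.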
+        if sec_dismissed:
+            sec_status = 'INFO'
+        elif security_pkgs:
+            sec_status = sec_severity
+        else:
+            sec_status = 'OK'
+
+        sec_detail = f'{len(security_pkgs)} security update(s) pending'
+        if security_pkgs and sec_days_unpatched >= self.SECURITY_WARN_DAYS:
+            sec_detail += f' ({sec_days_unpatched} days unpatched)'
 
         checks = {
+            'kernel_pve': {
+                'status': 'INFO' if kernel_pkgs else 'OK',
+                'detail': f'{len(kernel_pkgs)} kernel/PVE update(s)' if kernel_pkgs else 'Kernel/PVE up to date',
+                'error_key': 'kernel_pve'
+            },
+            'pending_updates': {
+                'status': 'INFO' if update_count > 0 else 'OK',
+                'detail': f'{update_count} package(s) pending',
+                'error_key': 'pending_updates'
+            },
             'security_updates': {
                 'status': sec_status,
-                'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
-                'dismissable': True if security_updates_packages and not sec_dismissed else False,
+                'detail': sec_detail if security_pkgs else 'No security updates pending',
+                'dismissable': sec_status == 'WARNING' and not sec_dismissed,
                 'dismissed': bool(sec_dismissed),
                 'error_key': 'security_updates'
             },
             'system_age': {
                 'status': update_age_status,
                 'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
-                'dismissable': False if update_age_status == 'CRITICAL' else True if update_age_status == 'WARNING' else False,
+                'dismissable': update_age_status == 'WARNING' and not age_dismissed,
                 'dismissed': bool(age_dismissed),
                 'error_key': 'system_age'
            },
-            'pending_updates': {
-                'status': 'INFO' if update_count > 50 else 'OK',
-                'detail': f'{update_count} package(s) pending',
-                'error_key': 'pending_updates'
-            },
-            'kernel_pve': {
-                'status': kernel_status,
-                'detail': f'{len(kernel_pve_updates_packages)} kernel/PVE update(s)' if kernel_pve_updates_packages else 'Kernel/PVE up to date',
-                'error_key': 'kernel_pve'
-            }
         }
 
+        # PVE version sub-check (always INFO)
+        if pve_manager_info:
+            checks['pve_version'] = {
+                'status': 'INFO',
+                'detail': f"PVE {pve_manager_info['cur']} -> {pve_manager_info['new']} available",
+                'error_key': 'pve_version'
+            }
+        else:
+            checks['pve_version'] = {
+                'status': 'OK',
+                'detail': 'Proxmox VE is up to date',
+                'error_key': 'pve_version'
+            }
+
         # Construct result dictionary
         update_result = {
             'status': status,
             'count': update_count,
-            'checks': checks
+            'checks': checks,
         }
         if reason:
             update_result['reason'] = reason
         if last_update_days is not None:
             update_result['days_since_update'] = last_update_days
+        # Attach categorised counts for the frontend
+        update_result['security_count'] = len(security_pkgs)
+        update_result['pve_count'] = len(pve_pkgs)
+        update_result['kernel_count'] = len(kernel_pkgs)
+        update_result['important_packages'] = important_pkgs[:8]
 
         self.cached_results[cache_key] = update_result
         self.last_check_times[cache_key] = current_time
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 82be46f6..9e4b0085 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -548,6 +548,33 @@ class HealthPersistence:
 
         return errors
 
+    def get_error_by_key(self, error_key: str) -> Optional[Dict[str, Any]]:
+        """Get a single error record by its unique error_key.
+
+        Returns the full row as a dict (including first_seen, last_seen,
+        acknowledged, etc.) or None if not found / already resolved.
+        Only returns unresolved (active) errors.
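+
+        Minimal usage sketch (mirrors the caller in health_monitor):
+
+            err = health_persistence.get_error_by_key('security_updates')
+            if err and err.get('first_seen'):
+                age_days = (datetime.now() - datetime.fromisoformat(err['first_seen'])).days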
+ """ + conn = self._get_conn() + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + cursor.execute(''' + SELECT * FROM errors + WHERE error_key = ? AND resolved_at IS NULL + LIMIT 1 + ''', (error_key,)) + row = cursor.fetchone() + conn.close() + if row is None: + return None + error_dict = dict(row) + if error_dict.get('details'): + try: + error_dict['details'] = json.loads(error_dict['details']) + except (json.JSONDecodeError, TypeError): + pass + return error_dict + def cleanup_old_errors(self): """Clean up old resolved errors and auto-resolve stale errors""" with self._db_lock: diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 383f32f9..5978f7d3 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -337,6 +337,16 @@ class JournalWatcher: entity = 'disk' entity_id = f'fs_{device}' + # Check if the device physically exists to calibrate severity. + # A disconnected USB / temp device should NOT be CRITICAL. + import os as _os + base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else '' + device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}') + + if not device_exists and device != 'unknown': + # Device not present -- downgrade to WARNING + severity = 'WARNING' + # Identify what this device is (model, type, mountpoint) device_info = self._identify_block_device(device) @@ -357,7 +367,10 @@ class JournalWatcher: if inode: inode_hint = 'root directory' if inode == '2' else f'inode #{inode}' parts.append(f'Affected: {inode_hint}') - parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity') + if device_exists: + parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity') + else: + parts.append('Note: Device not currently connected -- this may be a stale journal entry') enriched = '\n'.join(parts) else: @@ -1325,7 +1338,7 @@ class PollingCollector: 'network': 'network_down', 'pve_services': 'service_fail', 'security': 'auth_fail', - 'updates': 'update_available', + 'updates': 'update_summary', 'zfs': 'disk_io_error', 'smart': 'disk_io_error', 'disks': 'disk_io_error', @@ -1442,12 +1455,18 @@ class PollingCollector: event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem') entity, eid = self._ENTITY_MAP.get(category, ('node', '')) + # Updates are always informational notifications except + # system_age which can be WARNING (365+ days) or CRITICAL (548+ days). 
+        emit_severity = severity
+        if category == 'updates' and error_key != 'system_age':
+            emit_severity = 'INFO'
+
         data = {
             'hostname': self._hostname,
             'category': category,
             'reason': reason,
             'error_key': error_key,
-            'severity': severity,
+            'severity': emit_severity,
             'first_seen': error.get('first_seen', ''),
             'last_seen': error.get('last_seen', ''),
             'is_persistent': not is_new,
@@ -1464,7 +1483,7 @@ class PollingCollector:
             pass
 
         self._queue.put(NotificationEvent(
-            event_type, severity, data, source='health',
+            event_type, emit_severity, data, source='health',
             entity=entity, entity_id=eid or error_key,
         ))
 
@@ -1482,11 +1501,36 @@ class PollingCollector:
 
     # ── Update check (enriched) ────────────────────────────────
 
+    # Proxmox-related package prefixes used for categorisation
+    _PVE_PREFIXES = (
+        'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
+        'corosync', 'libpve', 'pbs-', 'pmg-',
+    )
+    _KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
+    _IMPORTANT_PKGS = {
+        'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
+        'pve-ha-manager', 'pve-firewall', 'pve-storage-iscsi-direct',
+        'ceph-common', 'proxmox-backup-client',
+    }
+
+    # Regex to parse Inst lines from apt-get -s upgrade:
+    #   Inst <pkg> [<current-ver>] (<new-ver> <repo> [<arch>])
+    _RE_INST = re.compile(
+        r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+'
+    )
+    # Fallback for new installs (no current version):
+    #   Inst <pkg> (<new-ver> <repo> [<arch>])
+    _RE_INST_NEW = re.compile(
+        r'^Inst\s+(\S+)\s+\((\S+)\s+'
+    )
+
     def _check_updates(self):
         """Check for available system updates every 24 h.
 
-        Enriched output: total count, security updates, PVE version hint,
-        and top package names.
+        Emits a structured ``update_summary`` notification with categorised
+        counts (security, Proxmox-related, kernel, other) and important
+        package versions. If pve-manager has an upgrade, also emits a
+        separate ``pve_update`` notification.
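+
+        Data keys handed to the templates: total_count, security_count,
+        pve_count, kernel_count, important_list, package_list.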
""" now = time.time() if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL: @@ -1502,58 +1546,84 @@ class PollingCollector: if result.returncode != 0: return - lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')] - total = len(lines) + inst_lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')] + total = len(inst_lines) if total == 0: return - packages = [l.split()[1] for l in lines] - security = [p for p in packages if any( - kw in p.lower() for kw in ('security', 'cve', 'openssl', 'libssl') - )] + # ── Parse every Inst line ────────────────────────────── + all_pkgs: list[dict] = [] # {name, cur, new} + security_pkgs: list[dict] = [] + pve_pkgs: list[dict] = [] + kernel_pkgs: list[dict] = [] + pve_manager_info: dict | None = None - # Also detect security updates via apt changelog / Debian-Security origin - sec_result = subprocess.run( - ['apt-get', '-s', 'upgrade', '-o', 'Dir::Etc::SourceList=/dev/null', - '-o', 'Dir::Etc::SourceParts=/dev/null'], - capture_output=True, text=True, timeout=30, - ) - # Count lines from security repo (rough heuristic) - sec_count = max(len(security), 0) - try: - sec_output = subprocess.run( - ['apt-get', '-s', '--only-upgrade', 'install'] + packages[:50], - capture_output=True, text=True, timeout=30, - ) - for line in sec_output.stdout.split('\n'): - if 'security' in line.lower() and 'Inst ' in line: - sec_count += 1 - except Exception: - pass + for line in inst_lines: + m = self._RE_INST.match(line) + if m: + info = {'name': m.group(1), 'cur': m.group(2), 'new': m.group(3)} + else: + m2 = self._RE_INST_NEW.match(line) + if m2: + info = {'name': m2.group(1), 'cur': '', 'new': m2.group(2)} + else: + pkg_name = line.split()[1] if len(line.split()) > 1 else 'unknown' + info = {'name': pkg_name, 'cur': '', 'new': ''} + + all_pkgs.append(info) + name_lower = info['name'].lower() + line_lower = line.lower() + + # Categorise + if 'security' in line_lower or 'debian-security' in line_lower: + security_pkgs.append(info) + + if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES): + kernel_pkgs.append(info) + elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES): + pve_pkgs.append(info) + + # Detect pve-manager upgrade specifically + if info['name'] == 'pve-manager': + pve_manager_info = info - # Check for PVE version upgrade - pve_packages = [p for p in packages if 'pve-' in p.lower() or 'proxmox-' in p.lower()] - - # Build display details - top_pkgs = packages[:8] - details = ', '.join(top_pkgs) - if total > 8: - details += f', ... 
 
+            # ── Build important packages list ──────────────────────
+            important_lines = []
+            for pkg in all_pkgs:
+                if pkg['name'] in self._IMPORTANT_PKGS and pkg['cur']:
+                    important_lines.append(
+                        f"{pkg['name']} ({pkg['cur']} -> {pkg['new']})"
+                    )
 
+            # ── Emit structured update_summary ─────────────────────
             data = {
                 'hostname': self._hostname,
-                'count': str(total),
-                'security_count': str(sec_count),
-                'details': details,
-                'packages': ', '.join(packages[:20]),
+                'total_count': str(total),
+                'security_count': str(len(security_pkgs)),
+                'pve_count': str(len(pve_pkgs)),
+                'kernel_count': str(len(kernel_pkgs)),
+                'important_list': ', '.join(important_lines) if important_lines else 'none',
+                'package_list': ', '.join(important_lines[:6]) if important_lines else '',
             }
-            if pve_packages:
-                data['pve_packages'] = ', '.join(pve_packages)
 
             self._queue.put(NotificationEvent(
-                'update_available', 'INFO', data,
+                'update_summary', 'INFO', data,
                 source='polling', entity='node', entity_id='',
             ))
+
+            # ── Emit pve_update if pve-manager has an upgrade ──────
+            if pve_manager_info and pve_manager_info['cur'] and pve_manager_info['new']:
+                pve_data = {
+                    'hostname': self._hostname,
+                    'current_version': pve_manager_info['cur'],
+                    'new_version': pve_manager_info['new'],
+                    'version': pve_manager_info['new'],
+                    'details': f"pve-manager {pve_manager_info['cur']} -> {pve_manager_info['new']}",
+                }
+                self._queue.put(NotificationEvent(
+                    'pve_update', 'INFO', pve_data,
+                    source='polling', entity='node', entity_id='',
+                ))
         except Exception:
             pass
diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py
index ed7dc882..ef3c298e 100644
--- a/AppImage/scripts/notification_templates.py
+++ b/AppImage/scripts/notification_templates.py
@@ -584,10 +584,10 @@ TEMPLATES = {
         'default_enabled': True,
     },
     'update_available': {
-        'title': '{hostname}: Updates available ({count})',
-        'body': '{count} package updates are available.\n{details}',
+        'title': '{hostname}: Updates available',
+        'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',
         'group': 'system',
-        'default_enabled': False,
+        'default_enabled': False,  # Superseded by update_summary
     },
     'update_complete': {
         'title': '{hostname}: Update completed',
@@ -626,14 +626,20 @@ TEMPLATES = {
 
     # ── Update notifications (enriched) ──
     'update_summary': {
-        'title': '{hostname}: {total_count} updates available',
-        'body': '{security_count} security update(s), {total_count} total.\n{package_list}',
+        'title': '{hostname}: Updates available',
+        'body': (
+            'Total updates: {total_count}\n'
+            'Security updates: {security_count}\n'
+            'Proxmox-related updates: {pve_count}\n'
+            'Kernel updates: {kernel_count}\n'
+            'Important packages: {important_list}'
+        ),
         'group': 'system',
         'default_enabled': True,
     },
     'pve_update': {
-        'title': '{hostname}: PVE update available ({version})',
-        'body': 'Proxmox VE update available: {version}\n{details}',
+        'title': '{hostname}: Proxmox VE {new_version} available',
+        'body': 'Proxmox VE {current_version} -> {new_version}\n{details}',
         'group': 'system',
         'default_enabled': True,
     },
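+    # Rendered 'pve_update' example (hostname and version numbers illustrative):
+    #   title: "pve1: Proxmox VE 8.3.1 available"
+    #   body:  "Proxmox VE 8.2.4 -> 8.3.1\npve-manager 8.2.4 -> 8.3.1"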