From dc52f4c692652b554e4ffbc459fd17481871d900 Mon Sep 17 00:00:00 2001
From: MacRimi
Date: Sun, 1 Mar 2026 18:44:11 +0100
Subject: [PATCH] Update notification service

---
 AppImage/components/health-status-modal.tsx |   1 +
 AppImage/scripts/health_monitor.py          | 420 +++++++++++++++-----
 AppImage/scripts/health_persistence.py      |  27 ++
 AppImage/scripts/notification_events.py     | 160 +++++---
 AppImage/scripts/notification_templates.py  |  20 +-
 5 files changed, 478 insertions(+), 150 deletions(-)

diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx
index eb9cba0f..7a1f8642 100644
--- a/AppImage/components/health-status-modal.tsx
+++ b/AppImage/components/health-status-modal.tsx
@@ -383,6 +383,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
       log_persistent_errors: "Persistent Errors",
       log_critical_errors: "Critical Errors",
       // Updates
+      pve_version: "Proxmox VE Version",
       security_updates: "Security Updates",
       system_age: "System Age",
       pending_updates: "Pending Updates",
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index 7c818ac4..119269a7 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -71,11 +71,12 @@ class HealthMonitor:
     LOG_CHECK_INTERVAL = 300
 
     # Updates Thresholds
-    UPDATES_WARNING = 365   # Only warn after 1 year without updates
-    UPDATES_CRITICAL = 730  # Critical after 2 years
+    UPDATES_WARNING = 365     # Only warn after 1 year without updates (system_age)
+    UPDATES_CRITICAL = 548    # Critical after 18 months without updates
+    SECURITY_WARN_DAYS = 360  # Security updates only become WARNING after 360 days unpatched
 
     BENIGN_ERROR_PATTERNS = [
-        # Proxmox specific benign patterns
+        # ── Proxmox API / proxy operational noise ──
         r'got inotify poll request in wrong process',
         r'auth key pair too old, rotating',
         r'proxy detected vanished client connection',
@@ -84,33 +85,62 @@ class HealthMonitor:
         r'disconnect peer',
         r'task OK',
         r'backup finished',
+        # PVE ticket / auth transient errors (web UI session expiry, API token
+        # refresh, brute-force bots). These are logged at WARNING/ERR level
+        # but are NOT system problems -- they are access-control events.
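+        # Illustrative journal lines (hypothetical host/PID/IP values) that
+        # the patterns below are meant to swallow:
+        #   pveproxy[1234]: authentication failure; rhost=::ffff:203.0.113.7 user=root@pam msg=...
+        #   pvedaemon[5678]: permission denied - invalid PVE ticket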
+        r'invalid PVE ticket',
+        r'authentication failure.*pve',
+        r'permission denied.*ticket',
+        r'no ticket',
+        r'CSRF.*failed',
+        r'pveproxy\[\d+\]: authentication failure',
+        r'pvedaemon\[\d+\]: authentication failure',
+        # PVE cluster/corosync normal chatter
+        r'corosync.*retransmit',
+        r'corosync.*delivering',
+        r'pmxcfs.*update',
+        r'pve-cluster\[\d+\]:.*status',
 
-        # Systemd informational messages
+        # ── Systemd informational messages ──
         r'(started|starting|stopped|stopping) session',
         r'session \d+ logged (in|out)',
         r'new session \d+ of user',
         r'removed session \d+',
         r'user@\d+\.service:',
         r'user runtime directory',
+        # Systemd service restarts (normal lifecycle)
+        r'systemd\[\d+\]: .+\.service: (Scheduled restart|Consumed)',
+        r'systemd\[\d+\]: .+\.service: Deactivated successfully',
 
-        # Network transient errors (common and usually self-recovering)
+        # ── Network transient errors (common and usually self-recovering) ──
         r'dhcp.*timeout',
         r'temporary failure in name resolution',
         r'network is unreachable',
         r'no route to host',
 
-        # Backup and sync normal warnings
+        # ── Backup and sync normal warnings ──
         r'rsync.*vanished',
         r'backup job .* finished',
         r'vzdump backup .* finished',
 
-        # ZFS informational
+        # ── ZFS informational ──
         r'zfs.*scrub (started|finished|in progress)',
         r'zpool.*resilver',
 
-        # LXC/Container normal operations
+        # ── LXC/Container normal operations ──
         r'lxc.*monitor',
         r'systemd\[1\]: (started|stopped) .*\.scope',
+
+        # ── ATA/SCSI transient bus errors ──
+        # These are logged at ERR level but are common on SATA controllers
+        # during hot-plug, link renegotiation, or cable noise. They are NOT
+        # indicative of disk failure unless SMART also reports problems.
+        r'ata\d+.*SError.*BadCRC',
+        r'ata\d+.*Emask 0x10.*ATA bus error',
+        r'failed command: (READ|WRITE) FPDMA QUEUED',
+        r'ata\d+.*hard resetting link',
+        r'ata\d+.*link is slow',
+        r'ata\d+.*COMRESET',
     ]
 
     CRITICAL_LOG_KEYWORDS = [
@@ -120,14 +150,23 @@ class HealthMonitor:
         'ext4-fs error', 'xfs.*corruption',
         'lvm activation failed',
         'hardware error', 'mce:',
-        'segfault', 'general protection fault'
+        'general protection fault',
     ]
 
+    # Segfault is WARNING, not CRITICAL -- only PVE-critical process
+    # segfaults are escalated to CRITICAL in _classify_log_severity.
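+    # e.g. a (hypothetical) "pvedaemon[2020]: segfault at 7f..." is escalated
+    # to CRITICAL, while "chromium[3456]: segfault at 0" stays WARNING.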
+    PVE_CRITICAL_PROCESSES = {
+        'pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster',
+        'corosync', 'qemu-system', 'lxc-start', 'ceph-osd',
+        'ceph-mon', 'pmxcfs', 'kvm',
+    }
+
     WARNING_LOG_KEYWORDS = [
         'i/o error', 'ata error', 'scsi error',
         'task hung', 'blocked for more than',
         'failed to start', 'service.*failed',
-        'disk.*offline', 'disk.*removed'
+        'disk.*offline', 'disk.*removed',
+        'segfault',  # WARNING by default; escalated to CRITICAL only for PVE processes
     ]
 
     # PVE Critical Services
@@ -1483,7 +1522,7 @@ class HealthMonitor:
             else:
                 health_persistence.resolve_error(error_key, 'Disk errors cleared')
 
-        # Also include active filesystem errors (detected by _check_system_logs
+        # Also include active filesystem errors (detected by _check_log_analysis
        # and cross-referenced to the 'disks' category)
         try:
             fs_errors = health_persistence.get_active_errors(category='disks')
@@ -1491,6 +1530,11 @@ class HealthMonitor:
                 err_key = err.get('error_key', '')
                 if not err_key.startswith('disk_fs_'):
                     continue  # Only filesystem cross-references
+
+                # Skip acknowledged/dismissed errors
+                if err.get('acknowledged') == 1:
+                    continue
+
                 details = err.get('details', {})
                 if isinstance(details, str):
                     try:
                         details = _json.loads(details)
                     except Exception:
                         details = {}
+
+                device = details.get('device', err_key.replace('disk_fs_', '/dev/'))
+                base_disk = details.get('disk', '')
+
+                # Check if the device still exists. If not, auto-resolve
+                # the error -- it was likely a disconnected USB/temp device.
+                dev_path = f'/dev/{base_disk}' if base_disk else device
+                if not os.path.exists(dev_path):
+                    health_persistence.resolve_error(
+                        err_key, 'Device no longer present in system')
+                    continue
+
+                # Cross-reference with SMART: if SMART is healthy for
+                # this disk, downgrade to INFO (transient fs error).
+                severity = err.get('severity', 'WARNING')
+                if base_disk:
+                    smart_health = self._quick_smart_health(base_disk)
+                    if smart_health == 'PASSED' and severity == 'CRITICAL':
+                        severity = 'WARNING'
+
                 if device not in disk_results:
                     disk_results[device] = {
-                        'status': err.get('severity', 'CRITICAL'),
+                        'status': severity,
                         'reason': err.get('reason', 'Filesystem error'),
-                        'device': details.get('disk', ''),
+                        'device': base_disk,
                         'error_count': 1,
                         'error_type': 'filesystem',
-                        'dismissable': False,
+                        'dismissable': True,
                         'error_key': err_key,
                     }
         except Exception:
@@ -2303,6 +2366,9 @@ class HealthMonitor:
         if 'segfault' in line_lower:
             m = re.search(r'(\S+)\[\d+\].*segfault', line)
             process = m.group(1) if m else 'unknown'
+            is_critical_proc = any(p in process.lower() for p in self.PVE_CRITICAL_PROCESSES)
+            if is_critical_proc:
+                return f'Critical process "{process}" crashed (segmentation fault) -- PVE service affected'
             return f'Process "{process}" crashed (segmentation fault)'
 
         # Hardware error
@@ -2324,31 +2390,43 @@ class HealthMonitor:
         """
         Classify log line severity intelligently.
         Returns: 'CRITICAL', 'WARNING', or None (benign/info)
+
+        Design principles:
+        - CRITICAL must be reserved for events that require IMMEDIATE action
+          (data loss risk, service outage, hardware failure confirmed by SMART).
+        - WARNING is for events worth investigating but not urgent.
+        - Everything else is None (benign/informational).
         """
         line_lower = line.lower()
 
-        # Check if benign first
+        # Check if benign first -- fast path for known noise
         if self._is_benign_error(line):
             return None
 
-        # Check critical keywords
+        # Check critical keywords (hard failures: OOM, panic, FS corruption, etc.)
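+        # e.g. "EXT4-fs error (device sda1): ..." or "mce: [Hardware Error]..."
+        # short-circuit here as CRITICAL (illustrative lines, not exhaustive).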
         for keyword in self.CRITICAL_LOG_KEYWORDS:
             if re.search(keyword, line_lower):
                 return 'CRITICAL'
 
-        # Check warning keywords
+        # Check warning keywords (includes segfault, I/O errors, etc.)
         for keyword in self.WARNING_LOG_KEYWORDS:
             if re.search(keyword, line_lower):
+                # Special case: segfault of a PVE-critical process is CRITICAL
+                if 'segfault' in line_lower:
+                    for proc in self.PVE_CRITICAL_PROCESSES:
+                        if proc in line_lower:
+                            return 'CRITICAL'
                 return 'WARNING'
 
-        # Generic error/warning classification based on common terms
-        if 'critical' in line_lower or 'fatal' in line_lower or 'panic' in line_lower:
+        # Generic classification -- very conservative to avoid false positives.
+        # Only escalate if the line explicitly uses severity-level keywords
+        # from the kernel or systemd (not just any line containing "error").
+        if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower):
             return 'CRITICAL'
-        elif 'error' in line_lower or 'fail' in line_lower:
-            return 'WARNING'
-        elif 'warning' in line_lower or 'warn' in line_lower:
-            return None  # Generic warnings are often informational and not critical
 
+        # Lines from priority "err" that don't match any keyword above are
+        # likely informational noise (e.g. "error response from daemon").
+        # Return None to avoid flooding the dashboard with non-actionable items.
         return None
 
     def _check_logs_with_persistence(self) -> Dict[str, Any]:
@@ -2424,18 +2502,61 @@ class HealthMonitor:
                     pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
                     error_key = f'log_critical_{pattern_hash}'
 
+                    # ── SMART cross-reference for disk/FS errors ──
+                    # Filesystem and disk errors are only truly CRITICAL if
+                    # the underlying disk is actually failing. We check:
+                    #   1. Device exists?  No -> WARNING (disconnected USB, etc.)
+                    #   2. SMART PASSED?   -> WARNING (transient error, not disk failure)
+                    #   3. SMART FAILED?   -> CRITICAL (confirmed hardware problem)
+                    #   4. SMART UNKNOWN?  -> WARNING (can't confirm, err on side of caution)
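+                    # Illustrative kernel line (hypothetical device) that the
+                    # regex below extracts "sdb1" from:
+                    #   EXT4-fs error (device sdb1): ext4_find_entry:1463: inode #2: comm ls: reading directory lblock 0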
+                    fs_dev_match = re.search(
+                        r'(?:ext4-fs|btrfs|xfs|zfs)\s+error.*?device\s+(\S+?)\)?[:\s]',
+                        line, re.IGNORECASE
+                    )
+                    smart_status_for_log = None
+                    if fs_dev_match:
+                        fs_dev = fs_dev_match.group(1).rstrip(')')
+                        base_dev = re.sub(r'\d+$', '', fs_dev)
+                        if not os.path.exists(f'/dev/{base_dev}'):
+                            # Device not present -- almost certainly a disconnected drive
+                            severity = 'WARNING'
+                            smart_status_for_log = 'DEVICE_ABSENT'
+                        elif self.capabilities.get('has_smart'):
+                            smart_health = self._quick_smart_health(base_dev)
+                            smart_status_for_log = smart_health
+                            if smart_health == 'PASSED':
+                                # SMART says disk is healthy -- transient FS error
+                                severity = 'WARNING'
+                            elif smart_health == 'UNKNOWN':
+                                # Can't verify -- be conservative, don't alarm
+                                severity = 'WARNING'
+                            # smart_health == 'FAILED' -> keep CRITICAL
+
                     if pattern not in critical_errors_found:
-                        critical_errors_found[pattern] = line
+                        # Only count as "critical" if severity wasn't downgraded
+                        if severity == 'CRITICAL':
+                            critical_errors_found[pattern] = line
 
                     # Build a human-readable reason from the raw log line
                     enriched_reason = self._enrich_critical_log_reason(line)
+
+                    # Append SMART context to the reason if we checked it
+                    if smart_status_for_log == 'PASSED':
+                        enriched_reason += '\nSMART: Passed (disk is healthy -- error is likely transient)'
+                    elif smart_status_for_log == 'FAILED':
+                        enriched_reason += '\nSMART: FAILED -- disk is failing, replace immediately'
+                    elif smart_status_for_log == 'DEVICE_ABSENT':
+                        enriched_reason += '\nDevice not currently detected -- may be a disconnected USB or temporary device'
+
                     # Record persistent error if it's not already active
                     if not health_persistence.is_error_active(error_key, category='logs'):
                         health_persistence.record_error(
                             error_key=error_key,
                             category='logs',
-                            severity='CRITICAL',
+                            severity=severity,
                             reason=enriched_reason,
-                            details={'pattern': pattern, 'raw_line': line[:200], 'dismissable': True}
+                            details={'pattern': pattern, 'raw_line': line[:200],
+                                     'smart_status': smart_status_for_log,
+                                     'dismissable': True}
                         )
 
                     # Cross-reference: filesystem errors also belong in the disks category
@@ -2446,11 +2567,23 @@ class HealthMonitor:
                         # Strip partition number to get base disk (sdb1 -> sdb)
                         base_device = re.sub(r'\d+$', '', fs_device) if not ('nvme' in fs_device or 'mmcblk' in fs_device) else fs_device.rsplit('p', 1)[0] if 'p' in fs_device else fs_device
                         disk_error_key = f'disk_fs_{fs_device}'
+
+                        # Use the SMART-aware severity we already determined above
+                        device_exists = os.path.exists(f'/dev/{base_device}')
+                        if not device_exists:
+                            fs_severity = 'WARNING'
+                        elif smart_status_for_log == 'PASSED':
+                            fs_severity = 'WARNING'   # SMART healthy -> transient
+                        elif smart_status_for_log == 'FAILED':
+                            fs_severity = 'CRITICAL'  # SMART failing -> real problem
+                        else:
+                            fs_severity = 'WARNING'   # Can't confirm -> conservative
+
                         if not health_persistence.is_error_active(disk_error_key, category='disks'):
                             health_persistence.record_error(
                                 error_key=disk_error_key,
                                 category='disks',
-                                severity='CRITICAL',
+                                severity=fs_severity,
                                 reason=enriched_reason,
                                 details={
                                     'disk': base_device,
@@ -2458,7 +2591,9 @@ class HealthMonitor:
                                     'error_type': 'filesystem',
                                     'error_count': 1,
                                     'sample': line[:200],
-                                    'dismissable': False
+                                    'smart_status': smart_status_for_log,
+                                    'dismissable': True,
+                                    'device_exists': device_exists,
                                 }
                             )
 
@@ -2529,11 +2664,17 @@ class HealthMonitor:
                     # Use the original sample line for the notification,
                     # not the normalized pattern (which has IDs replaced).
                     sample = data.get('sample', pattern)
+                    # Strip journal timestamp prefix so the stored reason
+                    # doesn't contain dated information that confuses
+                    # re-notifications.
+                    clean_sample = re.sub(
+                        r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample
+                    )
                     health_persistence.record_error(
                         error_key=error_key,
                         category='logs',
                         severity='WARNING',
-                        reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
+                        reason=f'Recurring error ({data["count"]}x): {clean_sample[:150]}',
                         details={'pattern': pattern, 'sample': sample, 'dismissable': True, 'occurrences': data['count']}
                     )
@@ -2707,12 +2848,31 @@ class HealthMonitor:
 
         return pattern[:150]  # Keep first 150 characters to avoid overly long patterns
 
+    # Regex to parse Inst lines: Inst <pkg> [<current-ver>] (<new-ver> <repo> [<arch>])
+    _RE_INST = re.compile(r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+')
+    _RE_INST_NEW = re.compile(r'^Inst\s+(\S+)\s+\((\S+)\s+')
+
+    _PVE_PREFIXES = (
+        'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
+        'corosync', 'libpve', 'pbs-', 'pmg-',
+    )
+    _KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
+    _IMPORTANT_PKGS = {
+        'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
+        'pve-ha-manager', 'pve-firewall', 'ceph-common',
+        'proxmox-backup-client',
+    }
+
     def _check_updates(self) -> Optional[Dict[str, Any]]:
         """
         Check for pending system updates.
-        - WARNING: Security updates available, or system not updated >1 year (365 days).
+        - INFO: Any updates available (including security updates).
+        - WARNING: Security updates pending 360+ days unpatched, or system not updated >1 year (365 days).
         - CRITICAL: System not updated >18 months (548 days).
-        - INFO: Kernel/PVE updates available, or >50 non-security updates pending.
+
+        Updates are always informational unless they represent a prolonged
+        unpatched state. Detects PVE version upgrades from pve-manager
+        Inst lines and exposes them as an INFO sub-check.
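+
+        Example ``apt-get upgrade --dry-run`` line parsed by _RE_INST
+        (package and versions are illustrative):
+
+            Inst pve-manager [8.2.4] (8.3.1 Proxmox:bookworm [amd64])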
""" cache_key = 'updates_check' current_time = time.time() @@ -2734,150 +2894,214 @@ class HealthMonitor: days_since_update = (current_time - mtime) / 86400 last_update_days = int(days_since_update) except Exception: - pass # Ignore if mtime fails + pass # Perform a dry run of apt-get upgrade to see pending packages try: result = subprocess.run( ['apt-get', 'upgrade', '--dry-run'], - capture_output=True, - text=True, - timeout=10 + capture_output=True, text=True, timeout=30 ) except subprocess.TimeoutExpired: print("[HealthMonitor] apt-get upgrade --dry-run timed out") return { 'status': 'UNKNOWN', 'reason': 'apt-get timed out - repository may be unreachable', - 'count': 0, - 'checks': {} + 'count': 0, 'checks': {} } status = 'OK' reason = None update_count = 0 - security_updates_packages = [] - kernel_pve_updates_packages = [] + security_pkgs: list = [] + kernel_pkgs: list = [] + pve_pkgs: list = [] + important_pkgs: list = [] # {name, cur, new} + pve_manager_info = None # {cur, new} or None sec_result = None + sec_severity = 'INFO' + sec_days_unpatched = 0 if result.returncode == 0: - lines = result.stdout.strip().split('\n') + for line in result.stdout.strip().split('\n'): + if not line.startswith('Inst '): + continue + update_count += 1 + + # Parse package name, current and new versions + m = self._RE_INST.match(line) + if m: + pkg_name, cur_ver, new_ver = m.group(1), m.group(2), m.group(3) + else: + m2 = self._RE_INST_NEW.match(line) + if m2: + pkg_name, cur_ver, new_ver = m2.group(1), '', m2.group(2) + else: + parts = line.split() + pkg_name = parts[1] if len(parts) > 1 else 'unknown' + cur_ver, new_ver = '', '' + + # Strip arch suffix (e.g. package:amd64) + pkg_name = pkg_name.split(':')[0] + name_lower = pkg_name.lower() + line_lower = line.lower() + + # Categorise + if 'security' in line_lower or 'debian-security' in line_lower: + security_pkgs.append(pkg_name) + + if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES): + kernel_pkgs.append(pkg_name) + elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES): + pve_pkgs.append(pkg_name) + + # Collect important packages with version info + if pkg_name in self._IMPORTANT_PKGS and cur_ver: + important_pkgs.append({ + 'name': pkg_name, 'cur': cur_ver, 'new': new_ver + }) + + # Detect pve-manager upgrade -> PVE version upgrade + if pkg_name == 'pve-manager' and cur_ver and new_ver: + pve_manager_info = {'cur': cur_ver, 'new': new_ver} - for line in lines: - # 'Inst ' indicates a package will be installed/upgraded - if line.startswith('Inst '): - update_count += 1 - line_lower = line.lower() - package_name = line.split()[1].split(':')[0] # Get package name, strip arch if present - - # Check for security updates (common pattern in repo names) - if 'security' in line_lower or 'debian-security' in line_lower: - security_updates_packages.append(package_name) - - # Check for kernel or critical PVE updates - if any(pkg in line_lower for pkg in ['linux-image', 'pve-kernel', 'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-api-core']): - kernel_pve_updates_packages.append(package_name) - - # Determine overall status based on findings - if security_updates_packages: - status = 'WARNING' - reason = f'{len(security_updates_packages)} security update(s) available' - # Record persistent error for security updates to ensure it's visible + # ── Determine overall status ────────────────────── + if security_pkgs: + sec_days_unpatched = 0 + try: + existing = health_persistence.get_error_by_key('security_updates') + if existing and 
+                        from datetime import datetime
+                        first_dt = datetime.fromisoformat(existing['first_seen'])
+                        sec_days_unpatched = (datetime.now() - first_dt).days
+                except Exception:
+                    pass
+
+                if sec_days_unpatched >= self.SECURITY_WARN_DAYS:
+                    status = 'WARNING'
+                    reason = f'{len(security_pkgs)} security update(s) pending for {sec_days_unpatched} days'
+                    sec_severity = 'WARNING'
+                else:
+                    status = 'INFO'
+                    reason = f'{len(security_pkgs)} security update(s) pending'
+                    sec_severity = 'INFO'
+
                 sec_result = health_persistence.record_error(
                     error_key='security_updates',
                     category='updates',
-                    severity='WARNING',
+                    severity=sec_severity,
                     reason=reason,
-                    details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5], 'dismissable': True}
+                    details={'count': len(security_pkgs), 'packages': security_pkgs[:5],
+                             'dismissable': sec_severity == 'WARNING',
+                             'days_unpatched': sec_days_unpatched}
                 )
-                # If previously dismissed, downgrade to INFO
                 if sec_result and sec_result.get('type') == 'skipped_acknowledged':
                     status = 'INFO'
                     reason = None
+
             elif last_update_days and last_update_days >= 548:
-                # 18+ months without updates - CRITICAL
                 status = 'CRITICAL'
                 reason = f'System not updated in {last_update_days} days (>18 months)'
                 health_persistence.record_error(
-                    error_key='system_age',
-                    category='updates',
-                    severity='CRITICAL',
-                    reason=reason,
+                    error_key='system_age', category='updates',
+                    severity='CRITICAL', reason=reason,
                     details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
                 )
             elif last_update_days and last_update_days >= 365:
-                # 1+ year without updates - WARNING
                 status = 'WARNING'
                 reason = f'System not updated in {last_update_days} days (>1 year)'
                 age_result = health_persistence.record_error(
-                    error_key='system_age',
-                    category='updates',
-                    severity='WARNING',
-                    reason=reason,
+                    error_key='system_age', category='updates',
+                    severity='WARNING', reason=reason,
                     details={'days': last_update_days, 'update_count': update_count, 'dismissable': True}
                 )
                 if age_result and age_result.get('type') == 'skipped_acknowledged':
                     status = 'INFO'
                     reason = None
-            elif kernel_pve_updates_packages:
-                # Informational: Kernel or critical PVE components need update
+            elif kernel_pkgs or pve_pkgs:
                 status = 'INFO'
-                reason = f'{len(kernel_pve_updates_packages)} kernel/PVE update(s) available'
-            elif update_count > 50:
-                # Informational: Large number of pending updates
+                reason = f'{len(kernel_pkgs)} kernel + {len(pve_pkgs)} Proxmox update(s) available'
+            elif update_count > 0:
                 status = 'INFO'
-                reason = f'{update_count} updates pending (consider maintenance window)'
+                reason = f'{update_count} package update(s) pending'
 
-        # If apt-get upgrade --dry-run failed
         elif result.returncode != 0:
             status = 'WARNING'
             reason = 'Failed to check for updates (apt-get error)'
 
-        # Build checks dict for updates sub-items
+        # ── Build checks dict ─────────────────────────────────
         age_dismissed = bool(age_result and age_result.get('type') == 'skipped_acknowledged')
         update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else (
             'INFO' if age_dismissed else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK'))
-        sec_dismissed = security_updates_packages and sec_result and sec_result.get('type') == 'skipped_acknowledged'
-        sec_status = 'INFO' if sec_dismissed else ('WARNING' if security_updates_packages else 'OK')
-        kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
+
+        sec_dismissed = security_pkgs and sec_result and sec_result.get('type') == 'skipped_acknowledged'
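+        # Precedence: a user dismissal always wins and renders as INFO;
+        # otherwise the severity computed above (INFO or WARNING) flows through.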
+        if sec_dismissed:
+            sec_status = 'INFO'
+        elif security_pkgs:
+            sec_status = sec_severity
+        else:
+            sec_status = 'OK'
+
+        sec_detail = f'{len(security_pkgs)} security update(s) pending'
+        if security_pkgs and sec_days_unpatched >= self.SECURITY_WARN_DAYS:
+            sec_detail += f' ({sec_days_unpatched} days unpatched)'
 
         checks = {
+            'kernel_pve': {
+                'status': 'INFO' if kernel_pkgs else 'OK',
+                'detail': f'{len(kernel_pkgs)} kernel/PVE update(s)' if kernel_pkgs else 'Kernel/PVE up to date',
+                'error_key': 'kernel_pve'
+            },
+            'pending_updates': {
+                'status': 'INFO' if update_count > 0 else 'OK',
+                'detail': f'{update_count} package(s) pending',
+                'error_key': 'pending_updates'
+            },
             'security_updates': {
                 'status': sec_status,
-                'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
-                'dismissable': True if security_updates_packages and not sec_dismissed else False,
+                'detail': sec_detail if security_pkgs else 'No security updates pending',
+                'dismissable': sec_status == 'WARNING' and not sec_dismissed,
                 'dismissed': bool(sec_dismissed),
                 'error_key': 'security_updates'
             },
             'system_age': {
                 'status': update_age_status,
                 'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
-                'dismissable': False if update_age_status == 'CRITICAL' else True if update_age_status == 'WARNING' else False,
+                'dismissable': update_age_status == 'WARNING' and not age_dismissed,
                 'dismissed': bool(age_dismissed),
                 'error_key': 'system_age'
            },
-            'pending_updates': {
-                'status': 'INFO' if update_count > 50 else 'OK',
-                'detail': f'{update_count} package(s) pending',
-                'error_key': 'pending_updates'
-            },
-            'kernel_pve': {
-                'status': kernel_status,
-                'detail': f'{len(kernel_pve_updates_packages)} kernel/PVE update(s)' if kernel_pve_updates_packages else 'Kernel/PVE up to date',
-                'error_key': 'kernel_pve'
-            }
         }
 
+        # PVE version sub-check (always INFO)
+        if pve_manager_info:
+            checks['pve_version'] = {
+                'status': 'INFO',
+                'detail': f"PVE {pve_manager_info['cur']} -> {pve_manager_info['new']} available",
+                'error_key': 'pve_version'
+            }
+        else:
+            checks['pve_version'] = {
+                'status': 'OK',
+                'detail': 'Proxmox VE is up to date',
+                'error_key': 'pve_version'
+            }
+
         # Construct result dictionary
         update_result = {
             'status': status,
             'count': update_count,
-            'checks': checks
+            'checks': checks,
         }
         if reason:
             update_result['reason'] = reason
         if last_update_days is not None:
             update_result['days_since_update'] = last_update_days
+        # Attach categorised counts for the frontend
+        update_result['security_count'] = len(security_pkgs)
+        update_result['pve_count'] = len(pve_pkgs)
+        update_result['kernel_count'] = len(kernel_pkgs)
+        update_result['important_packages'] = important_pkgs[:8]
 
         self.cached_results[cache_key] = update_result
         self.last_check_times[cache_key] = current_time
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 82be46f6..9e4b0085 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -548,6 +548,33 @@ class HealthPersistence:
 
         return errors
 
+    def get_error_by_key(self, error_key: str) -> Optional[Dict[str, Any]]:
+        """Get a single error record by its unique error_key.
+
+        Returns the full row as a dict (including first_seen, last_seen,
+        acknowledged, etc.) or None if not found / already resolved.
+        Only returns unresolved (active) errors.
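+
+        Minimal usage sketch (mirrors the caller in health_monitor):
+
+            err = health_persistence.get_error_by_key('security_updates')
+            if err and err.get('first_seen'):
+                age_days = (datetime.now() - datetime.fromisoformat(err['first_seen'])).days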
+ """ + conn = self._get_conn() + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + cursor.execute(''' + SELECT * FROM errors + WHERE error_key = ? AND resolved_at IS NULL + LIMIT 1 + ''', (error_key,)) + row = cursor.fetchone() + conn.close() + if row is None: + return None + error_dict = dict(row) + if error_dict.get('details'): + try: + error_dict['details'] = json.loads(error_dict['details']) + except (json.JSONDecodeError, TypeError): + pass + return error_dict + def cleanup_old_errors(self): """Clean up old resolved errors and auto-resolve stale errors""" with self._db_lock: diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 383f32f9..5978f7d3 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -337,6 +337,16 @@ class JournalWatcher: entity = 'disk' entity_id = f'fs_{device}' + # Check if the device physically exists to calibrate severity. + # A disconnected USB / temp device should NOT be CRITICAL. + import os as _os + base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else '' + device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}') + + if not device_exists and device != 'unknown': + # Device not present -- downgrade to WARNING + severity = 'WARNING' + # Identify what this device is (model, type, mountpoint) device_info = self._identify_block_device(device) @@ -357,7 +367,10 @@ class JournalWatcher: if inode: inode_hint = 'root directory' if inode == '2' else f'inode #{inode}' parts.append(f'Affected: {inode_hint}') - parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity') + if device_exists: + parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity') + else: + parts.append('Note: Device not currently connected -- this may be a stale journal entry') enriched = '\n'.join(parts) else: @@ -1325,7 +1338,7 @@ class PollingCollector: 'network': 'network_down', 'pve_services': 'service_fail', 'security': 'auth_fail', - 'updates': 'update_available', + 'updates': 'update_summary', 'zfs': 'disk_io_error', 'smart': 'disk_io_error', 'disks': 'disk_io_error', @@ -1442,12 +1455,18 @@ class PollingCollector: event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem') entity, eid = self._ENTITY_MAP.get(category, ('node', '')) + # Updates are always informational notifications except + # system_age which can be WARNING (365+ days) or CRITICAL (548+ days). 
+        emit_severity = severity
+        if category == 'updates' and error_key != 'system_age':
+            emit_severity = 'INFO'
+
         data = {
             'hostname': self._hostname,
             'category': category,
             'reason': reason,
             'error_key': error_key,
-            'severity': severity,
+            'severity': emit_severity,
             'first_seen': error.get('first_seen', ''),
             'last_seen': error.get('last_seen', ''),
             'is_persistent': not is_new,
@@ -1464,7 +1483,7 @@ class PollingCollector:
             pass
 
         self._queue.put(NotificationEvent(
-            event_type, severity, data, source='health',
+            event_type, emit_severity, data, source='health',
             entity=entity, entity_id=eid or error_key,
         ))
 
@@ -1482,11 +1501,36 @@ class PollingCollector:
 
     # ── Update check (enriched) ────────────────────────────────
 
+    # Proxmox-related package prefixes used for categorisation
+    _PVE_PREFIXES = (
+        'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
+        'corosync', 'libpve', 'pbs-', 'pmg-',
+    )
+    _KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
+    _IMPORTANT_PKGS = {
+        'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
+        'pve-ha-manager', 'pve-firewall', 'pve-storage-iscsi-direct',
+        'ceph-common', 'proxmox-backup-client',
+    }
+
+    # Regex to parse Inst lines from apt-get -s upgrade:
+    #   Inst <pkg> [<current-ver>] (<new-ver> <repo> [<arch>])
+    _RE_INST = re.compile(
+        r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+'
+    )
+    # Fallback for new installs (no current version):
+    #   Inst <pkg> (<new-ver> <repo> [<arch>])
+    _RE_INST_NEW = re.compile(
+        r'^Inst\s+(\S+)\s+\((\S+)\s+'
+    )
+
     def _check_updates(self):
         """Check for available system updates every 24 h.
 
-        Enriched output: total count, security updates, PVE version hint,
-        and top package names.
+        Emits a structured ``update_summary`` notification with categorised
+        counts (security, Proxmox-related, kernel, other) and important
+        package versions. If pve-manager has an upgrade, also emits a
+        separate ``pve_update`` notification.
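+
+        Data keys handed to the templates: total_count, security_count,
+        pve_count, kernel_count, important_list, package_list.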
""" now = time.time() if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL: @@ -1502,58 +1546,84 @@ class PollingCollector: if result.returncode != 0: return - lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')] - total = len(lines) + inst_lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')] + total = len(inst_lines) if total == 0: return - packages = [l.split()[1] for l in lines] - security = [p for p in packages if any( - kw in p.lower() for kw in ('security', 'cve', 'openssl', 'libssl') - )] + # ── Parse every Inst line ────────────────────────────── + all_pkgs: list[dict] = [] # {name, cur, new} + security_pkgs: list[dict] = [] + pve_pkgs: list[dict] = [] + kernel_pkgs: list[dict] = [] + pve_manager_info: dict | None = None - # Also detect security updates via apt changelog / Debian-Security origin - sec_result = subprocess.run( - ['apt-get', '-s', 'upgrade', '-o', 'Dir::Etc::SourceList=/dev/null', - '-o', 'Dir::Etc::SourceParts=/dev/null'], - capture_output=True, text=True, timeout=30, - ) - # Count lines from security repo (rough heuristic) - sec_count = max(len(security), 0) - try: - sec_output = subprocess.run( - ['apt-get', '-s', '--only-upgrade', 'install'] + packages[:50], - capture_output=True, text=True, timeout=30, - ) - for line in sec_output.stdout.split('\n'): - if 'security' in line.lower() and 'Inst ' in line: - sec_count += 1 - except Exception: - pass + for line in inst_lines: + m = self._RE_INST.match(line) + if m: + info = {'name': m.group(1), 'cur': m.group(2), 'new': m.group(3)} + else: + m2 = self._RE_INST_NEW.match(line) + if m2: + info = {'name': m2.group(1), 'cur': '', 'new': m2.group(2)} + else: + pkg_name = line.split()[1] if len(line.split()) > 1 else 'unknown' + info = {'name': pkg_name, 'cur': '', 'new': ''} + + all_pkgs.append(info) + name_lower = info['name'].lower() + line_lower = line.lower() + + # Categorise + if 'security' in line_lower or 'debian-security' in line_lower: + security_pkgs.append(info) + + if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES): + kernel_pkgs.append(info) + elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES): + pve_pkgs.append(info) + + # Detect pve-manager upgrade specifically + if info['name'] == 'pve-manager': + pve_manager_info = info - # Check for PVE version upgrade - pve_packages = [p for p in packages if 'pve-' in p.lower() or 'proxmox-' in p.lower()] - - # Build display details - top_pkgs = packages[:8] - details = ', '.join(top_pkgs) - if total > 8: - details += f', ... 
 
+            # ── Build important packages list ──────────────────────
+            important_lines = []
+            for pkg in all_pkgs:
+                if pkg['name'] in self._IMPORTANT_PKGS and pkg['cur']:
+                    important_lines.append(
+                        f"{pkg['name']} ({pkg['cur']} -> {pkg['new']})"
+                    )
 
+            # ── Emit structured update_summary ─────────────────────
             data = {
                 'hostname': self._hostname,
-                'count': str(total),
-                'security_count': str(sec_count),
-                'details': details,
-                'packages': ', '.join(packages[:20]),
+                'total_count': str(total),
+                'security_count': str(len(security_pkgs)),
+                'pve_count': str(len(pve_pkgs)),
+                'kernel_count': str(len(kernel_pkgs)),
+                'important_list': ', '.join(important_lines) if important_lines else 'none',
+                'package_list': ', '.join(important_lines[:6]) if important_lines else '',
             }
-            if pve_packages:
-                data['pve_packages'] = ', '.join(pve_packages)
 
             self._queue.put(NotificationEvent(
-                'update_available', 'INFO', data,
+                'update_summary', 'INFO', data,
                 source='polling', entity='node', entity_id='',
             ))
+
+            # ── Emit pve_update if pve-manager has an upgrade ──────
+            if pve_manager_info and pve_manager_info['cur'] and pve_manager_info['new']:
+                pve_data = {
+                    'hostname': self._hostname,
+                    'current_version': pve_manager_info['cur'],
+                    'new_version': pve_manager_info['new'],
+                    'version': pve_manager_info['new'],
+                    'details': f"pve-manager {pve_manager_info['cur']} -> {pve_manager_info['new']}",
+                }
+                self._queue.put(NotificationEvent(
+                    'pve_update', 'INFO', pve_data,
+                    source='polling', entity='node', entity_id='',
+                ))
         except Exception:
             pass
diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py
index ed7dc882..ef3c298e 100644
--- a/AppImage/scripts/notification_templates.py
+++ b/AppImage/scripts/notification_templates.py
@@ -584,10 +584,10 @@ TEMPLATES = {
         'default_enabled': True,
     },
     'update_available': {
-        'title': '{hostname}: Updates available ({count})',
-        'body': '{count} package updates are available.\n{details}',
+        'title': '{hostname}: Updates available',
+        'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',
         'group': 'system',
-        'default_enabled': False,
+        'default_enabled': False,  # Superseded by update_summary
     },
     'update_complete': {
         'title': '{hostname}: Update completed',
@@ -626,14 +626,20 @@ TEMPLATES = {
 
     # ── Update notifications (enriched) ──
     'update_summary': {
-        'title': '{hostname}: {total_count} updates available',
-        'body': '{security_count} security update(s), {total_count} total.\n{package_list}',
+        'title': '{hostname}: Updates available',
+        'body': (
+            'Total updates: {total_count}\n'
+            'Security updates: {security_count}\n'
+            'Proxmox-related updates: {pve_count}\n'
+            'Kernel updates: {kernel_count}\n'
+            'Important packages: {important_list}'
+        ),
         'group': 'system',
         'default_enabled': True,
     },
     'pve_update': {
-        'title': '{hostname}: PVE update available ({version})',
-        'body': 'Proxmox VE update available: {version}\n{details}',
+        'title': '{hostname}: Proxmox VE {new_version} available',
+        'body': 'Proxmox VE {current_version} -> {new_version}\n{details}',
         'group': 'system',
         'default_enabled': True,
     },
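+    # Rendered 'pve_update' example (hostname and version numbers illustrative):
+    #   title: "pve1: Proxmox VE 8.3.1 available"
+    #   body:  "Proxmox VE 8.2.4 -> 8.3.1\npve-manager 8.2.4 -> 8.3.1"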