Update notification service

2026-06-12 03:17:07 +00:00 · 2026-02-26 18:21:01 +01:00
parent ffc202f6a3
commit 4d24d6d17b
3 changed files with 223 additions and 19 deletions
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -821,8 +821,20 @@ class HealthMonitor:
        issues = []
        storage_details = {}
        
-        # Check disk usage and mount status first for critical mounts
-        critical_mounts = ['/']
+        # Check disk usage and mount status for important mounts.
+        # We detect actual mountpoints dynamically rather than hard-coding.
+        critical_mounts = set()
+        critical_mounts.add('/')
+        try:
+            for part in psutil.disk_partitions(all=False):
+                mp = part.mountpoint
+                # Include standard system mounts, PVE local storage, and anything under /mnt/
+                if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
+                   mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
+                    critical_mounts.add(mp)
+        except Exception:
+            pass
+        critical_mounts = sorted(critical_mounts)
        
        for mount_point in critical_mounts:
            try:
@@ -857,9 +869,32 @@ class HealthMonitor:
                # Check filesystem usage only if not already flagged as critical
                if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
                    fs_status = self._check_filesystem(mount_point)
+                    error_key = f'disk_space_{mount_point}'
                    if fs_status['status'] != 'OK':
                        issues.append(f"{mount_point}: {fs_status['reason']}")
                        storage_details[mount_point] = fs_status
+                        # Record persistent error for notifications
+                        usage = psutil.disk_usage(mount_point)
+                        avail_gb = usage.free / (1024**3)
+                        if avail_gb >= 1:
+                            avail_str = f"{avail_gb:.1f} GiB"
+                        else:
+                            avail_str = f"{usage.free / (1024**2):.0f} MiB"
+                        health_persistence.record_error(
+                            error_key=error_key,
+                            category='disk',
+                            severity=fs_status['status'],
+                            reason=f'{mount_point}: {fs_status["reason"]}',
+                            details={
+                                'mount': mount_point,
+                                'used': str(round(usage.percent, 1)),
+                                'available': avail_str,
+                                'dismissable': False,
+                            }
+                        )
+                    else:
+                        # Space recovered -- clear any previous alert
+                        health_persistence.clear_error(error_key)
            except Exception:
                pass # Silently skip if mountpoint check fails
        
@@ -1871,7 +1906,8 @@ class HealthMonitor:
                        self.persistent_log_patterns[pattern] = {
                            'count': 1,
                            'first_seen': current_time,
-                            'last_seen': current_time
+                            'last_seen': current_time,
+                            'sample': line.strip()[:200],  # Original line for display
                        }
                
                for line in previous_lines:
@@ -1913,12 +1949,16 @@ class HealthMonitor:
                        pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
                        error_key = f'log_persistent_{pattern_hash}'
                        if not health_persistence.is_error_active(error_key, category='logs'):
+                            # Use the original sample line for the notification,
+                            # not the normalized pattern (which has IDs replaced).
+                            sample = data.get('sample', pattern)
                            health_persistence.record_error(
                                error_key=error_key,
                                category='logs',
                                severity='WARNING',
-                                reason=f'Persistent error pattern detected: {pattern[:80]}',
-                                details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']}
+                                reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
+                                details={'pattern': pattern, 'sample': sample,
+                                         'dismissable': True, 'occurrences': data['count']}
                            )
                
                patterns_to_remove = [
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -249,6 +249,23 @@ class JournalWatcher:
    
    def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
        """Detect kernel panics, OOM, segfaults, hardware errors."""
+        # Only process messages from kernel or systemd (not app-level logs)
+        if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''):
+            return
+        
+        # Filter out normal kernel messages that are NOT problems
+        _KERNEL_NOISE = [
+            r'vfio-pci\s+\S+:\s*reset',       # PCI passthrough resets (normal during VM start/stop)
+            r'vfio-pci\s+\S+:\s*resetting',
+            r'entered\s+(?:promiscuous|allmulticast)\s+mode',  # Network bridge ops
+            r'entered\s+(?:blocking|forwarding|disabled)\s+state',  # Bridge STP
+            r'tap\d+i\d+:',                     # TAP interface events
+            r'vmbr\d+:.*port\s+\d+',            # Bridge port events
+        ]
+        for noise in _KERNEL_NOISE:
+            if re.search(noise, msg, re.IGNORECASE):
+                return
+        
        critical_patterns = {
            r'kernel panic':       ('system_problem', 'CRITICAL', 'Kernel panic'),
            r'Out of memory':      ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
@@ -318,6 +335,19 @@ class JournalWatcher:
    
    def _check_service_failure(self, msg: str, unit: str):
        """Detect critical service failures with enriched context."""
+        # Filter out noise -- these are normal systemd transient units,
+        # not real service failures worth alerting about.
+        _NOISE_PATTERNS = [
+            r'session-\d+\.scope',          # SSH/login sessions
+            r'user@\d+\.service',           # Per-user service managers
+            r'user-runtime-dir@\d+',        # User runtime dirs
+            r'systemd-coredump@',           # Coredump handlers (transient)
+            r'run-.*\.mount',               # Transient mounts
+        ]
+        for noise in _NOISE_PATTERNS:
+            if re.search(noise, msg) or re.search(noise, unit):
+                return
+        
        service_patterns = [
            r'Failed to start (.+)',
            r'Unit (\S+) (?:entered failed state|failed)',
@@ -743,13 +773,16 @@ class PollingCollector:
        'load': 'load_high',
        'temperature': 'temp_high',
        'disk': 'disk_space_low',
-        'storage': 'disk_space_low',
+        'storage': 'storage_unavailable',
        'network': 'network_down',
        'pve_services': 'service_fail',
        'security': 'auth_fail',
        'updates': 'update_available',
        'zfs': 'disk_io_error',
        'smart': 'disk_io_error',
+        'disks': 'disk_io_error',
+        'logs': 'system_problem',
+        'vms': 'system_problem',
    }
    
    def __init__(self, event_queue: Queue, poll_interval: int = 60):
--- a/AppImage/scripts/notification_templates.py
+++ b/AppImage/scripts/notification_templates.py
@@ -25,10 +25,10 @@ from typing import Dict, Any, Optional, List
 def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
    """Parse a PVE vzdump notification message into structured data.
    
-    PVE vzdump messages contain:
-      - A table:  VMID  Name  Status  Time  Size  Filename
-      - Totals:   Total running time: Xs / Total size: X GiB
-      - Full logs per VM
+    Supports two formats:
+    1. Local storage: table with columns VMID Name Status Time Size Filename
+    2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)'
+       and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X'
    
    Returns dict with 'vms' list, 'total_time', 'total_size', or None.
    """
@@ -41,7 +41,7 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
    
    lines = message.split('\n')
    
-    # Find the table header line
+    # ── Strategy 1: classic table (local/NFS/CIFS storage) ──
    header_idx = -1
    for i, line in enumerate(lines):
        if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE):
@@ -49,15 +49,10 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
            break
    
    if header_idx >= 0:
-        # Parse column positions from header
-        header = lines[header_idx]
-        # Parse table rows after header
        for line in lines[header_idx + 1:]:
            stripped = line.strip()
            if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='):
                break
-            # Table row: VMID  Name  Status  Time  Size  Filename
-            # Use regex to parse flexible whitespace columns
            m = re.match(
                r'\s*(\d+)\s+'           # VMID
                r'(\S+)\s+'              # Name
@@ -74,10 +69,91 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
                    'status': m.group(3),
                    'time': m.group(4),
                    'size': m.group(5),
-                    'filename': m.group(6).split('/')[-1],  # just filename
+                    'filename': m.group(6).split('/')[-1],
                })
    
-    # Extract totals
+    # ── Strategy 2: log-style (PBS / Proxmox Backup Server) ──
+    # Parse from the full vzdump log lines.
+    # Look for patterns:
+    #   "Starting Backup of VM NNN (lxc/qemu)"  -> detect guest
+    #   "CT Name: xxx" or "VM Name: xxx"         -> guest name
+    #   "Finished Backup of VM NNN (HH:MM:SS)"   -> duration + status=ok
+    #   "root.pxar: had to backup X of Y"         -> size (CT)
+    #   "transferred X in N seconds"              -> size (QEMU)
+    #   "creating ... archive 'ct/100/2026-..'"   -> archive name for PBS
+    #   "TASK ERROR:" or "ERROR:"                 -> status=error
+    if not vms:
+        current_vm: Optional[Dict[str, str]] = None
+        
+        for line in lines:
+            # Strip the leading "INFO: " / "WARNING: " / "ERROR: " level prefix that PVE adds
+            clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip())
+            
+            # Start of a new VM backup
+            m_start = re.match(
+                r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean)
+            if m_start:
+                if current_vm:
+                    vms.append(current_vm)
+                current_vm = {
+                    'vmid': m_start.group(1),
+                    'name': '',
+                    'status': 'ok',
+                    'time': '',
+                    'size': '',
+                    'filename': '',
+                    'type': m_start.group(2),
+                }
+                continue
+            
+            if current_vm:
+                # Guest name
+                m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean)
+                if m_name:
+                    current_vm['name'] = m_name.group(1).strip()
+                    continue
+                
+                # PBS archive path -> extract as filename
+                m_archive = re.search(
+                    r"creating .+ archive '([^']+)'", clean)
+                if m_archive:
+                    current_vm['filename'] = m_archive.group(1)
+                    continue
+                
+                # Size for containers (pxar)
+                m_pxar = re.search(
+                    r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean)
+                if m_pxar:
+                    current_vm['size'] = m_pxar.group(1)
+                    continue
+                
+                # Size for QEMU (transferred)
+                m_transfer = re.search(
+                    r'transferred\s+([\d.]+\s+\S+)', clean)
+                if m_transfer:
+                    current_vm['size'] = m_transfer.group(1)
+                    continue
+                
+                # Finished -> duration
+                m_finish = re.match(
+                    r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean)
+                if m_finish:
+                    current_vm['time'] = m_finish.group(2)
+                    current_vm['status'] = 'ok'
+                    vms.append(current_vm)
+                    current_vm = None
+                    continue
+                
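+                # Error -- NOTE(review): a leading "ERROR:" was already stripped from clean, so normally only 'TASK ERROR' matches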
+                if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'):
+                    if current_vm:
+                        current_vm['status'] = 'error'
+        
+        # Don't forget the last VM if it wasn't finished
+        if current_vm:
+            vms.append(current_vm)
+    
+    # ── Extract totals ──
    for line in lines:
        m_time = re.search(r'Total running time:\s*(.+)', line)
        if m_time:
@@ -86,6 +162,50 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
        if m_size:
            total_size = m_size.group(1).strip()
    
+    # For PBS: calculate total size if not explicitly stated
+    if not total_size and vms:
+        # Sum individual sizes, normalizing MiB/GiB/TiB to GiB (other units are skipped)
+        sizes_gib = 0.0
+        for vm in vms:
+            s = vm.get('size', '')
+            m = re.match(r'([\d.]+)\s+(.*)', s)
+            if m:
+                val = float(m.group(1))
+                unit = m.group(2).strip().upper()
+                if 'GIB' in unit or 'GB' in unit:
+                    sizes_gib += val
+                elif 'MIB' in unit or 'MB' in unit:
+                    sizes_gib += val / 1024
+                elif 'TIB' in unit or 'TB' in unit:
+                    sizes_gib += val * 1024
+        if sizes_gib > 0:
+            if sizes_gib >= 1024:
+                total_size = f"{sizes_gib / 1024:.3f} TiB"
+            elif sizes_gib >= 1:
+                total_size = f"{sizes_gib:.3f} GiB"
+            else:
+                total_size = f"{sizes_gib * 1024:.3f} MiB"
+    
+    # For PBS: calculate total time if not stated
+    if not total_time and vms:
+        total_secs = 0
+        for vm in vms:
+            t = vm.get('time', '')
+            # Parse HH:MM:SS format
+            m = re.match(r'(\d+):(\d+):(\d+)', t)
+            if m:
+                total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3))
+        if total_secs > 0:
+            hours = total_secs // 3600
+            mins = (total_secs % 3600) // 60
+            secs = total_secs % 60
+            if hours:
+                total_time = f"{hours}h {mins}m {secs}s"
+            elif mins:
+                total_time = f"{mins}m {secs}s"
+            else:
+                total_time = f"{secs}s"
+    
    if not vms and not total_size:
        return None
    
@@ -113,7 +233,12 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
        if vm.get('time'):
            details.append(f"Duration: {vm['time']}")
        if vm.get('filename'):
-            details.append(f"File: {vm['filename']}")
+            fname = vm['filename']
+            # PBS archives look like "ct/100/2026-..." or "vm/105/2026-..."
+            if re.match(r'^(?:ct|vm)/\d+/', fname):
+                details.append(f"PBS: {fname}")
+            else:
+                details.append(f"File: {fname}")
        if details:
            parts.append(' | '.join(details))
        parts.append('')  # blank line between VMs
@@ -338,6 +463,12 @@ TEMPLATES = {
        'group': 'storage',
        'default_enabled': True,
    },
+    'storage_unavailable': {
+        'title': '{hostname}: Storage unavailable - {storage_name}',
+        'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
+        'group': 'storage',
+        'default_enabled': True,
+    },
    'load_high': {
        'title': '{hostname}: High system load ({value})',
        'body': 'System load average: {value} on {cores} cores.\n{details}',