Update notification service

2026-04-30 11:26:23 +00:00 · 2026-02-26 18:21:01 +01:00
parent ffc202f6a3
commit 4d24d6d17b
3 changed files with 223 additions and 19 deletions
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -821,8 +821,20 @@ class HealthMonitor:
        issues = []
        storage_details = {}
-        # Check disk usage and mount status first for critical mounts
+        # Check disk usage and mount status for important mounts.
-        critical_mounts = ['/']
+        # We detect actual mountpoints dynamically rather than hard-coding.
        critical_mounts = set()
        critical_mounts.add('/')
        try:
            for part in psutil.disk_partitions(all=False):
                mp = part.mountpoint
                # Include standard system mounts, PVE local storage paths
                # (prefix /var/lib/vz) and anything mounted under /mnt/
                if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
                   mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
                    critical_mounts.add(mp)
        except Exception:
            pass
        critical_mounts = sorted(critical_mounts)
        for mount_point in critical_mounts:
            try:
@@ -857,9 +869,32 @@ class HealthMonitor:
                # Check filesystem usage only if not already flagged as critical
                if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
                    fs_status = self._check_filesystem(mount_point)
                    error_key = f'disk_space_{mount_point}'
                    if fs_status['status'] != 'OK':
                        issues.append(f"{mount_point}: {fs_status['reason']}")
                        storage_details[mount_point] = fs_status
                        # Record persistent error for notifications
                        usage = psutil.disk_usage(mount_point)
                        avail_gb = usage.free / (1024**3)
                        if avail_gb >= 1:
                            avail_str = f"{avail_gb:.1f} GiB"
                        else:
                            avail_str = f"{usage.free / (1024**2):.0f} MiB"
                        health_persistence.record_error(
                            error_key=error_key,
                            category='disk',
                            severity=fs_status['status'],
                            reason=f'{mount_point}: {fs_status["reason"]}',
                            details={
                                'mount': mount_point,
                                'used': str(round(usage.percent, 1)),
                                'available': avail_str,
                                'dismissable': False,
                            }
                        )
                    else:
                        # Space recovered -- clear any previous alert
                        health_persistence.clear_error(error_key)
            except Exception:
                pass # Silently skip if mountpoint check fails
@@ -1871,7 +1906,8 @@ class HealthMonitor:
                        self.persistent_log_patterns[pattern] = {
                            'count': 1,
                            'first_seen': current_time,
-                            'last_seen': current_time
+                            'last_seen': current_time,
                            'sample': line.strip()[:200],  # Original line for display
                        }
                for line in previous_lines:
@@ -1913,12 +1949,16 @@ class HealthMonitor:
                        pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
                        error_key = f'log_persistent_{pattern_hash}'
                        if not health_persistence.is_error_active(error_key, category='logs'):
                            # Use the original sample line for the notification,
                            # not the normalized pattern (which has IDs replaced).
                            sample = data.get('sample', pattern)
                            health_persistence.record_error(
                                error_key=error_key,
                                category='logs',
                                severity='WARNING',
-                                reason=f'Persistent error pattern detected: {pattern[:80]}',
+                                reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
-                                details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']}
+                                details={'pattern': pattern, 'sample': sample,
                                         'dismissable': True, 'occurrences': data['count']}
                            )
                patterns_to_remove = [
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -249,6 +249,23 @@ class JournalWatcher:
    def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
        """Detect kernel panics, OOM, segfaults, hardware errors."""
        # Only process messages from kernel, systemd or systemd-coredump (or with
        # no syslog identifier); app-level logs are skipped
        if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''):
            return
        # Filter out normal kernel messages that are NOT problems
        _KERNEL_NOISE = [
            r'vfio-pci\s+\S+:\s*reset',       # PCI passthrough resets (normal during VM start/stop)
            r'vfio-pci\s+\S+:\s*resetting',
            r'entered\s+(?:promiscuous|allmulticast)\s+mode',  # Network bridge ops
            r'entered\s+(?:blocking|forwarding|disabled)\s+state',  # Bridge STP
            r'tap\d+i\d+:',                     # TAP interface events
            r'vmbr\d+:.*port\s+\d+',            # Bridge port events
        ]
        for noise in _KERNEL_NOISE:
            if re.search(noise, msg, re.IGNORECASE):
                return
        critical_patterns = {
            r'kernel panic':       ('system_problem', 'CRITICAL', 'Kernel panic'),
            r'Out of memory':      ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
@@ -318,6 +335,19 @@ class JournalWatcher:
    def _check_service_failure(self, msg: str, unit: str):
        """Detect critical service failures with enriched context."""
        # Filter out noise -- these are normal systemd transient units,
        # not real service failures worth alerting about.
        _NOISE_PATTERNS = [
            r'session-\d+\.scope',          # SSH/login sessions
            r'user@\d+\.service',           # Per-user service managers
            r'user-runtime-dir@\d+',        # User runtime dirs
            r'systemd-coredump@',           # Coredump handlers (transient)
            r'run-.*\.mount',               # Transient mounts
        ]
        for noise in _NOISE_PATTERNS:
            if re.search(noise, msg) or re.search(noise, unit):
                return
        service_patterns = [
            r'Failed to start (.+)',
            r'Unit (\S+) (?:entered failed state|failed)',
@@ -743,13 +773,16 @@ class PollingCollector:
        'load': 'load_high',
        'temperature': 'temp_high',
        'disk': 'disk_space_low',
-        'storage': 'disk_space_low',
+        'storage': 'storage_unavailable',
        'network': 'network_down',
        'pve_services': 'service_fail',
        'security': 'auth_fail',
        'updates': 'update_available',
        'zfs': 'disk_io_error',
        'smart': 'disk_io_error',
        'disks': 'disk_io_error',
        'logs': 'system_problem',
        'vms': 'system_problem',
    }
    def __init__(self, event_queue: Queue, poll_interval: int = 60):
--- a/AppImage/scripts/notification_templates.py
+++ b/AppImage/scripts/notification_templates.py
@@ -25,10 +25,10 @@ from typing import Dict, Any, Optional, List
 def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
    """Parse a PVE vzdump notification message into structured data.
-    PVE vzdump messages contain:
+    Supports two formats:
-      - A table:  VMID  Name  Status  Time  Size  Filename
+    1. Local storage: table with columns VMID Name Status Time Size Filename
-      - Totals:   Total running time: Xs / Total size: X GiB
+    2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)'
-      - Full logs per VM
+       and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X'
    Returns dict with 'vms' list, 'total_time', 'total_size', or None.
    """
@@ -41,7 +41,7 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
    lines = message.split('\n')
-    # Find the table header line
+    # ── Strategy 1: classic table (local/NFS/CIFS storage) ──
    header_idx = -1
    for i, line in enumerate(lines):
        if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE):
@@ -49,15 +49,10 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
            break
    if header_idx >= 0:
        # Parse column positions from header
        header = lines[header_idx]
        # Parse table rows after header
        for line in lines[header_idx + 1:]:
            stripped = line.strip()
            if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='):
                break
            # Table row: VMID  Name  Status  Time  Size  Filename
            # Use regex to parse flexible whitespace columns
            m = re.match(
                r'\s*(\d+)\s+'           # VMID
                r'(\S+)\s+'              # Name
@@ -74,10 +69,91 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
                    'status': m.group(3),
                    'time': m.group(4),
                    'size': m.group(5),
-                    'filename': m.group(6).split('/')[-1],  # just filename
+                    'filename': m.group(6).split('/')[-1],
                })
-    # Extract totals
+    # ── Strategy 2: log-style (PBS / Proxmox Backup Server) ──
    # Parse from the full vzdump log lines.
    # Look for patterns:
    #   "Starting Backup of VM NNN (lxc/qemu)"  -> detect guest
    #   "CT Name: xxx" or "VM Name: xxx"         -> guest name
    #   "Finished Backup of VM NNN (HH:MM:SS)"   -> duration + status=ok
    #   "root.pxar: had to backup X of Y"         -> size (CT)
    #   "transferred X in N seconds"              -> size (QEMU)
    #   "creating ... archive 'ct/100/2026-..'"   -> archive name for PBS
    #   "TASK ERROR:" or "ERROR:"                 -> status=error
    if not vms:
        current_vm: Optional[Dict[str, str]] = None
        for line in lines:
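            # Strip the log-level prefix PVE adds ("INFO: ", "WARNING: " or "ERROR: ").
            # NOTE(review): because "ERROR: " is stripped here, the later
            # clean.startswith('ERROR:') check will not match such lines; only
            # 'TASK ERROR' lines end up marking a VM as failed -- confirm intent.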
            clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip())
            # Start of a new VM backup
            m_start = re.match(
                r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean)
            if m_start:
                if current_vm:
                    vms.append(current_vm)
                current_vm = {
                    'vmid': m_start.group(1),
                    'name': '',
                    'status': 'ok',
                    'time': '',
                    'size': '',
                    'filename': '',
                    'type': m_start.group(2),
                }
                continue
            if current_vm:
                # Guest name
                m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean)
                if m_name:
                    current_vm['name'] = m_name.group(1).strip()
                    continue
                # PBS archive path -> extract as filename
                m_archive = re.search(
                    r"creating .+ archive '([^']+)'", clean)
                if m_archive:
                    current_vm['filename'] = m_archive.group(1)
                    continue
                # Size for containers (pxar)
                m_pxar = re.search(
                    r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean)
                if m_pxar:
                    current_vm['size'] = m_pxar.group(1)
                    continue
                # Size for QEMU (transferred)
                m_transfer = re.search(
                    r'transferred\s+([\d.]+\s+\S+)', clean)
                if m_transfer:
                    current_vm['size'] = m_transfer.group(1)
                    continue
                # Finished -> duration
                m_finish = re.match(
                    r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean)
                if m_finish:
                    current_vm['time'] = m_finish.group(2)
                    current_vm['status'] = 'ok'
                    vms.append(current_vm)
                    current_vm = None
                    continue
                # Error
                if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'):
                    if current_vm:
                        current_vm['status'] = 'error'
        # Don't forget the last VM if it wasn't finished
        if current_vm:
            vms.append(current_vm)
    # ── Extract totals ──
    for line in lines:
        m_time = re.search(r'Total running time:\s*(.+)', line)
        if m_time:
@@ -86,6 +162,50 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
        if m_size:
            total_size = m_size.group(1).strip()
    # For PBS: calculate total size if not explicitly stated
    if not total_size and vms:
        # Sum individual sizes, normalizing MiB/GiB/TiB (or MB/GB/TB) to GiB;
        # sizes expressed in any other unit are left out of the total
        sizes_gib = 0.0
        for vm in vms:
            s = vm.get('size', '')
            m = re.match(r'([\d.]+)\s+(.*)', s)
            if m:
                val = float(m.group(1))
                unit = m.group(2).strip().upper()
                if 'GIB' in unit or 'GB' in unit:
                    sizes_gib += val
                elif 'MIB' in unit or 'MB' in unit:
                    sizes_gib += val / 1024
                elif 'TIB' in unit or 'TB' in unit:
                    sizes_gib += val * 1024
        if sizes_gib > 0:
            if sizes_gib >= 1024:
                total_size = f"{sizes_gib / 1024:.3f} TiB"
            elif sizes_gib >= 1:
                total_size = f"{sizes_gib:.3f} GiB"
            else:
                total_size = f"{sizes_gib * 1024:.3f} MiB"
    # For PBS: calculate total time if not stated
    if not total_time and vms:
        total_secs = 0
        for vm in vms:
            t = vm.get('time', '')
            # Parse HH:MM:SS format
            m = re.match(r'(\d+):(\d+):(\d+)', t)
            if m:
                total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3))
        if total_secs > 0:
            hours = total_secs // 3600
            mins = (total_secs % 3600) // 60
            secs = total_secs % 60
            if hours:
                total_time = f"{hours}h {mins}m {secs}s"
            elif mins:
                total_time = f"{mins}m {secs}s"
            else:
                total_time = f"{secs}s"
    if not vms and not total_size:
        return None
@@ -113,7 +233,12 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
        if vm.get('time'):
            details.append(f"Duration: {vm['time']}")
        if vm.get('filename'):
-            details.append(f"File: {vm['filename']}")
+            fname = vm['filename']
            # PBS archives look like "ct/100/2026-..." or "vm/105/2026-..."
            if re.match(r'^(?:ct|vm)/\d+/', fname):
                details.append(f"PBS: {fname}")
            else:
                details.append(f"File: {fname}")
        if details:
            parts.append(' | '.join(details))
        parts.append('')  # blank line between VMs
@@ -338,6 +463,12 @@ TEMPLATES = {
        'group': 'storage',
        'default_enabled': True,
    },
    'storage_unavailable': {
        'title': '{hostname}: Storage unavailable - {storage_name}',
        'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
        'group': 'storage',
        'default_enabled': True,
    },
    'load_high': {
        'title': '{hostname}: High system load ({value})',
        'body': 'System load average: {value} on {cores} cores.\n{details}',