From c5354d014ca5bf63480685673203e38456e66266 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Sat, 28 Feb 2026 19:18:13 +0100 Subject: [PATCH] Update notification service --- AppImage/components/health-status-modal.tsx | 12 +- AppImage/components/storage-overview.tsx | 34 +- AppImage/scripts/flask_server.py | 4 + AppImage/scripts/health_monitor.py | 213 ++++++++++++- AppImage/scripts/notification_events.py | 326 +++++++++++++++++++- AppImage/scripts/notification_manager.py | 45 +++ AppImage/scripts/notification_templates.py | 2 +- 7 files changed, 598 insertions(+), 38 deletions(-) diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index a43cd720..fc5ff81a 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -634,23 +634,23 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
-
-
-
+
+
+

{catLabel}

{item.reason}

{isPermanent ? ( - + Permanent ) : ( - + Dismissed )} - + was {item.severity}
diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx index 648993b9..42fb209a 100644 --- a/AppImage/components/storage-overview.tsx +++ b/AppImage/components/storage-overview.tsx @@ -39,6 +39,7 @@ interface DiskInfo { severity: string sample: string reason: string + error_type?: string // 'io' | 'filesystem' } } @@ -782,19 +783,23 @@ export function StorageOverview() {
- {disk.io_errors && disk.io_errors.count > 0 && ( + {disk.io_errors && disk.io_errors.count > 0 && (
- {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min + + {disk.io_errors.error_type === 'filesystem' + ? `Filesystem corruption detected` + : `${disk.io_errors.count} I/O error${disk.io_errors.count !== 1 ? 's' : ''} in 5 min`} +
- )} - -
- {disk.size_formatted && ( + )} + +
+ {disk.size_formatted && (

Size

{disk.size_formatted}

@@ -866,9 +871,20 @@ export function StorageOverview() { }`}>
- {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min - {disk.io_errors.sample && ( -

{disk.io_errors.sample}

+ {disk.io_errors.error_type === 'filesystem' ? ( + <> + Filesystem corruption detected + {disk.io_errors.reason && ( +

{disk.io_errors.reason}

+ )} + + ) : ( + <> + {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min + {disk.io_errors.sample && ( +

{disk.io_errors.sample}

+ )} + )}
diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index dd38090e..747db50d 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -1199,6 +1199,7 @@ def get_storage_info(): 'severity': severity, 'sample': sample, 'reason': err.get('reason', ''), + 'error_type': details.get('error_type', 'io'), } # Override health status if I/O errors are more severe current_health = physical_disks[matched_disk].get('health', 'unknown').lower() @@ -1206,6 +1207,9 @@ def get_storage_info(): physical_disks[matched_disk]['health'] = 'critical' elif severity == 'WARNING' and current_health in ('healthy', 'unknown'): physical_disks[matched_disk]['health'] = 'warning' + # If err_device doesn't match any physical disk, the error still + # lives in the health monitor (Disk I/O & System Logs sections). + # We don't create virtual disks -- Physical Disks shows real hardware only. except Exception: pass diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 5ba2dfb4..ed769bc3 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -1145,6 +1145,65 @@ class HealthMonitor: return ata_port # Return original if resolution fails + def _identify_block_device(self, device: str) -> str: + """ + Identify a block device by querying lsblk. + Returns a human-readable string like: + "KINGSTON SA400S37960G (SSD, 894.3G) mounted at /mnt/data" + Returns empty string if the device is not found in lsblk. + """ + if not device or device == 'unknown': + return '' + try: + candidates = [device] + base = re.sub(r'\d+$', '', device) if not ('nvme' in device or 'mmcblk' in device) else device + if base != device: + candidates.append(base) + + for dev in candidates: + dev_path = f'/dev/{dev}' if not dev.startswith('/') else dev + result = subprocess.run( + ['lsblk', '-ndo', 'NAME,MODEL,SIZE,TRAN,MOUNTPOINT,ROTA', dev_path], + capture_output=True, text=True, timeout=3 + ) + if result.returncode == 0 and result.stdout.strip(): + fields = result.stdout.strip().split(None, 5) + name = fields[0] if len(fields) > 0 else dev + model = fields[1] if len(fields) > 1 and fields[1] else 'Unknown model' + size = fields[2] if len(fields) > 2 else '?' + tran = (fields[3] if len(fields) > 3 else '').upper() + mountpoint = fields[4] if len(fields) > 4 and fields[4] else '' + rota = fields[5].strip() if len(fields) > 5 else '1' + + if tran == 'USB': + disk_type = 'USB' + elif tran == 'NVME' or 'nvme' in name: + disk_type = 'NVMe' + elif rota == '0': + disk_type = 'SSD' + else: + disk_type = 'HDD' + + info = f'{model} ({disk_type}, {size})' + if mountpoint: + info += f' mounted at {mountpoint}' + elif dev != device: + part_result = subprocess.run( + ['lsblk', '-ndo', 'MOUNTPOINT', f'/dev/{device}'], + capture_output=True, text=True, timeout=2 + ) + part_mount = part_result.stdout.strip() if part_result.returncode == 0 else '' + if part_mount: + info += f' partition {device} mounted at {part_mount}' + else: + info += ' -- not mounted' + else: + info += ' -- not mounted' + return info + return '' + except Exception: + return '' + def _quick_smart_health(self, disk_name: str) -> str: """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.""" if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'): @@ -1320,6 +1379,35 @@ class HealthMonitor: else: health_persistence.resolve_error(error_key, 'Disk errors cleared') + # Also include active filesystem errors (detected by _check_system_logs + # and cross-referenced to the 'disks' category) + try: + fs_errors = health_persistence.get_active_errors(category='disks') + for err in fs_errors: + err_key = err.get('error_key', '') + if not err_key.startswith('disk_fs_'): + continue # Only filesystem cross-references + details = err.get('details', {}) + if isinstance(details, str): + try: + import json as _json + details = _json.loads(details) + except Exception: + details = {} + device = details.get('device', err_key.replace('disk_fs_', '/dev/')) + if device not in disk_results: + disk_results[device] = { + 'status': err.get('severity', 'CRITICAL'), + 'reason': err.get('reason', 'Filesystem error'), + 'device': details.get('disk', ''), + 'error_count': 1, + 'error_type': 'filesystem', + 'dismissable': False, + 'error_key': err_key, + } + except Exception: + pass + if not disk_results: return {'status': 'OK'} @@ -1336,7 +1424,7 @@ class HealthMonitor: return { 'status': 'CRITICAL' if has_critical else 'WARNING', - 'reason': f"{len(active_results)} disk(s) with recent errors", + 'reason': f"{len(active_results)} disk(s) with errors", 'details': disk_results } @@ -2035,6 +2123,87 @@ class HealthMonitor: return True return False + def _enrich_critical_log_reason(self, line: str) -> str: + """ + Transform a raw kernel/system log line into a human-readable reason + for notifications and the health dashboard. + """ + line_lower = line.lower() + + # EXT4/BTRFS/XFS/ZFS filesystem errors + if 'ext4-fs error' in line_lower or 'btrfs error' in line_lower or 'xfs' in line_lower and 'error' in line_lower: + fs_type = 'EXT4' if 'ext4' in line_lower else ('BTRFS' if 'btrfs' in line_lower else 'XFS') + dev_match = re.search(r'device\s+(\S+?)\)?:', line) + device = dev_match.group(1).rstrip(')') if dev_match else 'unknown' + func_match = re.search(r':\s+(\w+):\d+:', line) + func_name = func_match.group(1) if func_match else '' + inode_match = re.search(r'inode\s+#?(\d+)', line) + inode = inode_match.group(1) if inode_match else '' + + # Translate function name + func_translations = { + 'ext4_find_entry': 'directory lookup failed (possible directory corruption)', + 'ext4_lookup': 'file lookup failed (possible metadata corruption)', + 'ext4_journal_start': 'journal transaction failed (journal corruption)', + 'ext4_readdir': 'directory read failed (directory data corrupted)', + 'ext4_get_inode_loc': 'inode location failed (inode table corruption)', + '__ext4_get_inode_loc': 'inode location failed (inode table corruption)', + 'ext4_xattr_get': 'extended attributes read failed', + 'ext4_iget': 'inode read failed (possible inode corruption)', + 'ext4_mb_generate_buddy': 'block allocator error', + 'ext4_validate_block_bitmap': 'block bitmap corrupted', + 'ext4_validate_inode_bitmap': 'inode bitmap corrupted', + 'htree_dirblock_to_tree': 'directory index tree corrupted', + } + + # Identify the device + device_info = self._identify_block_device(device) + + reason = f'{fs_type} filesystem error on /dev/{device}' + if device_info: + reason += f'\nDevice: {device_info}' + else: + reason += f'\nDevice: /dev/{device} (not currently detected -- may be a disconnected USB or temporary device)' + if func_name: + desc = func_translations.get(func_name, func_name) + reason += f'\nError: {desc}' + if inode: + inode_hint = 'root directory' if inode == '2' else f'inode #{inode}' + reason += f'\nAffected: {inode_hint}' + reason += f'\nAction: Run "fsck /dev/{device}" (unmount first)' + return reason + + # Out of memory + if 'out of memory' in line_lower or 'oom_kill' in line_lower: + m = re.search(r'Killed process\s+\d+\s+\(([^)]+)\)', line) + process = m.group(1) if m else 'unknown' + return f'Out of memory - system killed process "{process}" to free RAM' + + # Kernel panic + if 'kernel panic' in line_lower: + return 'Kernel panic - system halted. Reboot required.' + + # Segfault + if 'segfault' in line_lower: + m = re.search(r'(\S+)\[\d+\].*segfault', line) + process = m.group(1) if m else 'unknown' + return f'Process "{process}" crashed (segmentation fault)' + + # Hardware error + if 'hardware error' in line_lower or 'mce:' in line_lower: + return f'Hardware error detected (MCE) - check CPU/RAM health' + + # RAID failure + if 'raid' in line_lower and 'fail' in line_lower: + md_match = re.search(r'(md\d+)', line) + md_dev = md_match.group(1) if md_match else 'unknown' + return f'RAID array {md_dev} degraded or failed - check disk status' + + # Fallback: clean up the raw line + clean = re.sub(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+\s+', '', line) + clean = re.sub(r'\[\d+\]:\s*', '', clean) + return clean[:150] + def _classify_log_severity(self, line: str) -> Optional[str]: """ Classify log line severity intelligently. @@ -2141,15 +2310,41 @@ class HealthMonitor: if pattern not in critical_errors_found: critical_errors_found[pattern] = line + # Build a human-readable reason from the raw log line + enriched_reason = self._enrich_critical_log_reason(line) # Record persistent error if it's not already active if not health_persistence.is_error_active(error_key, category='logs'): health_persistence.record_error( error_key=error_key, category='logs', severity='CRITICAL', - reason=line[:100], # Truncate reason for brevity - details={'pattern': pattern, 'dismissable': True} + reason=enriched_reason, + details={'pattern': pattern, 'raw_line': line[:200], 'dismissable': True} ) + + # Cross-reference: filesystem errors also belong in the disks category + # so they appear in the Storage/Disks dashboard section + fs_match = re.search(r'(?:ext4-fs|btrfs|xfs|zfs)\s+error.*?(?:device\s+(\S+?)\)?[:\s])', line, re.IGNORECASE) + if fs_match: + fs_device = fs_match.group(1).rstrip(')') if fs_match.group(1) else 'unknown' + # Strip partition number to get base disk (sdb1 -> sdb) + base_device = re.sub(r'\d+$', '', fs_device) if not ('nvme' in fs_device or 'mmcblk' in fs_device) else fs_device.rsplit('p', 1)[0] if 'p' in fs_device else fs_device + disk_error_key = f'disk_fs_{fs_device}' + if not health_persistence.is_error_active(disk_error_key, category='disks'): + health_persistence.record_error( + error_key=disk_error_key, + category='disks', + severity='CRITICAL', + reason=enriched_reason, + details={ + 'disk': base_device, + 'device': f'/dev/{fs_device}', + 'error_type': 'filesystem', + 'error_count': 1, + 'sample': line[:200], + 'dismissable': False + } + ) recent_patterns[pattern] += 1 @@ -2241,9 +2436,13 @@ class HealthMonitor: if unique_critical_count > 0: status = 'CRITICAL' - # Get a representative critical error reason - representative_error = next(iter(critical_errors_found.values())) - reason = f'Critical error detected: {representative_error[:100]}' + # Use enriched reason from the first critical error for the summary + representative_line = next(iter(critical_errors_found.values())) + enriched = self._enrich_critical_log_reason(representative_line) + if unique_critical_count == 1: + reason = enriched + else: + reason = f'{unique_critical_count} critical error(s):\n{enriched}' elif cascade_count > 0: status = 'WARNING' samples = _get_samples(cascading_errors, 3) @@ -2326,7 +2525,7 @@ class HealthMonitor: }, 'log_critical_errors': { 'status': _log_check_status('log_critical_errors', unique_critical_count > 0, 'CRITICAL'), - 'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors', + 'detail': reason if unique_critical_count > 0 else 'No critical errors', 'dismissable': False, 'error_key': 'log_critical_errors' } diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index e88ca0c0..09dfa167 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -302,6 +302,10 @@ class JournalWatcher: lib_match = re.search(r'\bin\s+(\S+)', msg) lib_name = lib_match.group(1) if lib_match else '' + # Dedup by process name so repeated segfaults don't spam + if proc_name: + entity_id = f'segfault_{proc_name}' + parts = [reason] if proc_name: parts.append(f"Process: {proc_name}" + (f" (PID {proc_pid})" if proc_pid else '')) @@ -313,9 +317,48 @@ class JournalWatcher: m = re.search(r'Killed process\s+(\d+)\s+\(([^)]+)\)', msg) if m: enriched = f"{reason}\nKilled: {m.group(2)} (PID {m.group(1)})" + entity_id = f'oom_{m.group(2)}' # Dedup by killed process else: enriched = f"{reason}\n{msg[:300]}" + elif re.search(r'EXT4-fs error|BTRFS error|XFS.*error|ZFS.*error', msg, re.IGNORECASE): + # Filesystem errors: extract device, function and human-readable explanation + fs_type = 'EXT4' + for fs in ['EXT4', 'BTRFS', 'XFS', 'ZFS']: + if fs.lower() in msg.lower(): + fs_type = fs + break + + dev_match = re.search(r'device\s+(\S+?)\)?:', msg) + device = dev_match.group(1).rstrip(')') if dev_match else 'unknown' + + # Dedup by device: all EXT4 errors on sdb1 share ONE notification + entity = 'disk' + entity_id = f'fs_{device}' + + # Identify what this device is (model, type, mountpoint) + device_info = self._identify_block_device(device) + + func_match = re.search(r':\s+(\w+:\d+):', msg) + func_info = func_match.group(1) if func_match else '' + + inode_match = re.search(r'inode\s+#?(\d+)', msg) + inode = inode_match.group(1) if inode_match else '' + + parts = [f'{fs_type} filesystem corruption on /dev/{device}'] + # Add device identification so the user knows what this device is + if device_info: + parts.append(f'Device: {device_info}') + else: + parts.append(f'Device: /dev/{device} (not currently detected -- may be a disconnected USB or temporary device)') + if func_info: + parts.append(f'Error: {self._translate_fs_function(func_info)}') + if inode: + inode_hint = 'root directory' if inode == '2' else f'inode #{inode}' + parts.append(f'Affected: {inode_hint}') + parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity') + enriched = '\n'.join(parts) + else: # Generic: include the raw journal message for context enriched = f"{reason}\n{msg[:300]}" @@ -325,6 +368,92 @@ class JournalWatcher: self._emit(event_type, severity, data, entity=entity, entity_id=entity_id) return + def _identify_block_device(self, device: str) -> str: + """ + Identify a block device by querying lsblk. + Returns a human-readable string like: + "KINGSTON SA400S37960G (SSD, 894.3G) mounted at /mnt/data" + "ST8000VN004-3CP101 (HDD, 7.3T) -- not mounted" + Returns empty string if the device is not found. + """ + if not device or device == 'unknown': + return '' + try: + # Try the device as-is first, then the base disk (sdb1 -> sdb) + candidates = [device] + base = re.sub(r'\d+$', '', device) if not ('nvme' in device or 'mmcblk' in device) else device + if base != device: + candidates.append(base) + + for dev in candidates: + dev_path = f'/dev/{dev}' if not dev.startswith('/') else dev + result = subprocess.run( + ['lsblk', '-ndo', 'NAME,MODEL,SIZE,TRAN,MOUNTPOINT,ROTA', dev_path], + capture_output=True, text=True, timeout=3 + ) + if result.returncode == 0 and result.stdout.strip(): + fields = result.stdout.strip().split(None, 5) + name = fields[0] if len(fields) > 0 else dev + model = fields[1] if len(fields) > 1 and fields[1] else 'Unknown model' + size = fields[2] if len(fields) > 2 else '?' + tran = (fields[3] if len(fields) > 3 else '').upper() # sata, usb, nvme + mountpoint = fields[4] if len(fields) > 4 and fields[4] else '' + rota = fields[5].strip() if len(fields) > 5 else '1' + + # Determine disk type + if tran == 'USB': + disk_type = 'USB' + elif tran == 'NVME' or 'nvme' in name: + disk_type = 'NVMe' + elif rota == '0': + disk_type = 'SSD' + else: + disk_type = 'HDD' + + info = f'{model} ({disk_type}, {size})' + if mountpoint: + info += f' mounted at {mountpoint}' + elif dev != device: + # Check partition mountpoint + part_result = subprocess.run( + ['lsblk', '-ndo', 'MOUNTPOINT', f'/dev/{device}'], + capture_output=True, text=True, timeout=2 + ) + part_mount = part_result.stdout.strip() if part_result.returncode == 0 else '' + if part_mount: + info += f' partition {device} mounted at {part_mount}' + else: + info += ' -- not mounted' + else: + info += ' -- not mounted' + + return info + + return '' + except Exception: + return '' + + @staticmethod + def _translate_fs_function(func_info: str) -> str: + """Translate EXT4/filesystem function names to plain language.""" + func_name = func_info.split(':')[0] if ':' in func_info else func_info + translations = { + 'ext4_find_entry': 'directory lookup failed (possible directory corruption)', + 'ext4_lookup': 'file lookup failed (possible metadata corruption)', + 'ext4_journal_start': 'journal transaction failed (journal corruption)', + 'ext4_readdir': 'directory read failed (directory data corrupted)', + 'ext4_get_inode_loc': 'inode location failed (inode table corruption)', + '__ext4_get_inode_loc': 'inode location failed (inode table corruption)', + 'ext4_xattr_get': 'extended attributes read failed', + 'ext4_iget': 'inode read failed (possible inode corruption)', + 'ext4_mb_generate_buddy': 'block allocator error', + 'ext4_validate_block_bitmap': 'block bitmap corrupted', + 'ext4_validate_inode_bitmap': 'inode bitmap corrupted', + 'htree_dirblock_to_tree': 'directory index tree corrupted', + } + desc = translations.get(func_name, func_name) + return desc + def _check_service_failure(self, msg: str, unit: str): """Detect critical service failures with enriched context.""" # Filter out noise -- these are normal systemd transient units, @@ -405,7 +534,16 @@ class JournalWatcher: return '' def _check_disk_io(self, msg: str, syslog_id: str, priority: int): - """Detect disk I/O errors from kernel messages.""" + """ + Detect disk I/O errors from kernel messages. + + Cross-references SMART health before notifying: + - SMART PASSED -> no notification (transient controller event) + - SMART FAILED/UNKNOWN -> notify with enriched context + + Resolves ATA controller names to physical devices and identifies + the disk model/type/mountpoint for the user. + """ if syslog_id != 'kernel' and priority > 3: return @@ -413,20 +551,144 @@ class JournalWatcher: r'blk_update_request: I/O error.*dev (\S+)', r'Buffer I/O error on device (\S+)', r'SCSI error.*sd(\w)', - r'ata\d+.*error', + r'(ata\d+)[\.\d]*:.*error', ] for pattern in io_patterns: match = re.search(pattern, msg) if match: - device = match.group(1) if match.lastindex else 'unknown' + raw_device = match.group(1) if match.lastindex else 'unknown' + + # Resolve ATA port to physical disk name + if raw_device.startswith('ata'): + resolved = self._resolve_ata_to_disk(raw_device) + else: + # Strip partition number (sdb1 -> sdb) + resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device + + # Check SMART health -- if disk is healthy, this is transient noise + smart_health = self._quick_smart_health(resolved) + if smart_health == 'PASSED': + # SMART says disk is fine, don't notify for transient ATA/SCSI events + return + + # SMART is FAILED or UNKNOWN -- this may be a real problem + device_info = self._identify_block_device(resolved) + + # Build a clear, informative reason + parts = [] + if smart_health == 'FAILED': + parts.append(f'Disk /dev/{resolved}: I/O errors detected (SMART: FAILED)') + else: + parts.append(f'Disk /dev/{resolved}: I/O errors detected (SMART: unable to verify)') + + if device_info: + parts.append(f'Device: {device_info}') + elif resolved.startswith('ata'): + parts.append(f'Device: ATA controller {raw_device} (could not resolve to physical disk)') + else: + parts.append(f'Device: /dev/{resolved} (not currently detected -- may be disconnected or temporary)') + + # Extract useful detail from the raw kernel message + detail = self._translate_ata_error(msg) + if detail: + parts.append(f'Detail: {detail}') + + parts.append('Action: Check disk health with "smartctl -a /dev/{}" and consider replacement if SMART reports failures'.format(resolved)) + + enriched = '\n'.join(parts) + self._emit('disk_io_error', 'CRITICAL', { - 'device': device, - 'reason': msg[:200], + 'device': resolved, + 'reason': enriched, 'hostname': self._hostname, - }, entity='disk', entity_id=device) + }, entity='disk', entity_id=resolved) return + def _resolve_ata_to_disk(self, ata_port: str) -> str: + """Resolve an ATA port name (ata8) to a physical disk name (sda).""" + try: + port_num = re.search(r'ata(\d+)', ata_port) + if not port_num: + return ata_port + num = port_num.group(1) + # Check /sys/class/ata_port for the mapping + import glob as _glob + for path in _glob.glob(f'/sys/class/ata_port/ata{num}/../../host*/target*/*/block/*'): + disk_name = os.path.basename(path) + if disk_name.startswith('sd') or disk_name.startswith('nvme'): + return disk_name + # Fallback: try scsi_host mapping + for path in _glob.glob(f'/sys/class/ata_port/ata{num}/../../host*/scsi_host/host*/../../target*/*/block/*'): + disk_name = os.path.basename(path) + if disk_name.startswith('sd'): + return disk_name + return ata_port + except Exception: + return ata_port + + def _quick_smart_health(self, disk_name: str) -> str: + """Quick SMART health check. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.""" + if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'): + return 'UNKNOWN' + try: + dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name + result = subprocess.run( + ['smartctl', '--health', '-j', dev_path], + capture_output=True, text=True, timeout=5 + ) + import json as _json + data = _json.loads(result.stdout) + passed = data.get('smart_status', {}).get('passed', None) + if passed is True: + return 'PASSED' + elif passed is False: + return 'FAILED' + return 'UNKNOWN' + except Exception: + return 'UNKNOWN' + + @staticmethod + def _translate_ata_error(msg: str) -> str: + """Translate common ATA/SCSI error codes to human-readable descriptions.""" + error_codes = { + 'IDNF': 'sector address not found (possible bad sector or cable issue)', + 'UNC': 'uncorrectable read error (bad sector)', + 'ABRT': 'command aborted by drive', + 'AMNF': 'address mark not found (surface damage)', + 'TK0NF': 'track 0 not found (drive hardware failure)', + 'BBK': 'bad block detected', + 'ICRC': 'interface CRC error (cable or connector issue)', + 'MC': 'media changed', + 'MCR': 'media change requested', + 'WP': 'write protected', + } + + parts = [] + for code, description in error_codes.items(): + if code in msg: + parts.append(description) + + if parts: + return '; '.join(parts) + + # Try to extract the Emask/SErr/action codes + emask = re.search(r'Emask\s+(0x[0-9a-f]+)', msg) + serr = re.search(r'SErr\s+(0x[0-9a-f]+)', msg) + action = re.search(r'action\s+(0x[0-9a-f]+)', msg) + + if emask or serr: + info = [] + if emask: + info.append(f'Error mask: {emask.group(1)}') + if serr: + info.append(f'SATA error: {serr.group(1)}') + if action and action.group(1) == '0x0': + info.append('auto-recovered') + return ', '.join(info) + + return '' + def _check_cluster_events(self, msg: str, syslog_id: str): """Detect cluster split-brain and node disconnect.""" msg_lower = msg.lower() @@ -613,6 +875,12 @@ class TaskWatcher: # Cache for active vzdump detection self._vzdump_active_cache: float = 0 # timestamp of last positive check self._vzdump_cache_ttl = 5 # cache result for 5s + # Internal tracking: when we see a vzdump task without an end status, + # we mark the timestamp. When we see it complete (status=OK/ERROR), + # we clear it. This supplements the /var/log/pve/tasks/active check + # to avoid timing gaps. + self._vzdump_running_since: float = 0 # 0 = no vzdump tracked + self._vzdump_grace_period = 120 # seconds after vzdump ends to still suppress def start(self): if self._running: @@ -634,13 +902,27 @@ class TaskWatcher: self._running = False def _is_vzdump_active(self) -> bool: - """Check if a vzdump (backup) job is currently running. + """Check if a vzdump (backup) job is currently running or recently finished. - Reads /var/log/pve/tasks/active which lists all running PVE tasks. - Also verifies the process is actually alive (PID check). - Result is cached for a few seconds to avoid excessive file reads. + Two-layer detection: + 1. Internal tracking: TaskWatcher marks vzdump start/end with a grace period + (covers the case where the VM restart arrives milliseconds after vzdump ends) + 2. /var/log/pve/tasks/active: reads the active task file and verifies PID + + This combination eliminates timing gaps that caused false VM notifications. """ now = time.time() + + # Layer 1: Internal tracking (most reliable, no file I/O) + if self._vzdump_running_since > 0: + elapsed = now - self._vzdump_running_since + if elapsed < self._vzdump_grace_period: + return True + else: + # Grace period expired -- clear the tracking + self._vzdump_running_since = 0 + + # Layer 2: /var/log/pve/tasks/active (catches vzdump started by other nodes or cron) # Negative cache: if we recently confirmed NO vzdump, skip the check if hasattr(self, '_vzdump_negative_cache') and \ now - self._vzdump_negative_cache < self._vzdump_cache_ttl: @@ -731,7 +1013,17 @@ class TaskWatcher: event_type, default_severity = event_info - + # Track vzdump (backup) tasks internally for VM suppression. + # When a vzdump starts (no status yet), mark it. When it completes + # (status = OK or ERROR), keep a grace period for the post-backup + # VM restart that follows shortly after. + if task_type == 'vzdump': + if not status: + # Backup just started -- track it + self._vzdump_running_since = time.time() + else: + # Backup just finished -- start grace period for VM restarts + self._vzdump_running_since = time.time() # will expire via grace_period # Check if task failed is_error = status and status != 'OK' and status != '' @@ -768,10 +1060,14 @@ class TaskWatcher: # Determine entity type from task type entity = 'ct' if task_type.startswith('vz') else 'vm' - # Backup and replication events are handled EXCLUSIVELY by the PVE - # webhook, which delivers much richer data (full logs, sizes, durations, - # filenames). TaskWatcher skips these entirely to avoid duplicates. - _WEBHOOK_EXCLUSIVE = {'backup_complete', 'backup_fail', 'backup_start', + # Backup completion/failure and replication events are handled + # EXCLUSIVELY by the PVE webhook, which delivers richer data (full + # logs, sizes, durations, filenames). TaskWatcher skips these to + # avoid duplicates. + # NOTE: backup_start is NOT in this set -- PVE's webhook only fires + # when a backup FINISHES, so TaskWatcher is the only source for + # the "backup started" notification. + _WEBHOOK_EXCLUSIVE = {'backup_complete', 'backup_fail', 'replication_complete', 'replication_fail'} if event_type in _WEBHOOK_EXCLUSIVE: return diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py index 3b2bed92..87dcc156 100644 --- a/AppImage/scripts/notification_manager.py +++ b/AppImage/scripts/notification_manager.py @@ -554,6 +554,15 @@ class NotificationManager: def _dispatch_event(self, event: NotificationEvent): """Shared dispatch pipeline: cooldown -> rate limit -> render -> send.""" + # Suppress VM/CT start/stop during active backups (second layer of defense). + # The primary filter is in TaskWatcher, but timing gaps can let events + # slip through. This catch-all filter checks at dispatch time. + _BACKUP_NOISE_TYPES = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart', + 'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'} + if event.event_type in _BACKUP_NOISE_TYPES and event.severity != 'CRITICAL': + if self._is_backup_running(): + return + # Check cooldown if not self._check_cooldown(event): return @@ -628,6 +637,42 @@ class NotificationManager: # ─── Cooldown / Dedup ─────────────────────────────────────── + def _is_backup_running(self) -> bool: + """Quick check if any vzdump process is currently active. + + Reads /var/log/pve/tasks/active and also checks for vzdump processes. + """ + import os + # Method 1: Check active tasks file + try: + with open('/var/log/pve/tasks/active', 'r') as f: + for line in f: + if ':vzdump:' in line: + parts = line.strip().split(':') + if len(parts) >= 3: + try: + pid = int(parts[2]) + os.kill(pid, 0) + return True + except (ValueError, ProcessLookupError, PermissionError): + pass + except (OSError, IOError): + pass + + # Method 2: Check for running vzdump processes directly + import subprocess + try: + result = subprocess.run( + ['pgrep', '-x', 'vzdump'], + capture_output=True, timeout=2 + ) + if result.returncode == 0: + return True + except Exception: + pass + + return False + def _check_cooldown(self, event: NotificationEvent) -> bool: """Check if the event passes cooldown rules.""" now = time.time() diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 55371f45..153e78f0 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -480,7 +480,7 @@ TEMPLATES = { 'default_enabled': True, }, 'disk_io_error': { - 'title': '{hostname}: Disk I/O error', + 'title': '{hostname}: Disk I/O error on /dev/{device}', 'body': '{reason}', 'group': 'storage', 'default_enabled': True,