diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index dc5ac3a7..a55cef58 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -30,7 +30,6 @@ import { ChevronRight, Settings2, HelpCircle, - Usb, } from "lucide-react" interface CategoryCheck { @@ -415,44 +414,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu ) => { if (!checks || Object.keys(checks).length === 0) return null - // Sort checks: non-disk entries first, then disk entries sorted by device name - const sortedEntries = Object.entries(checks) - .filter(([, checkData]) => checkData.installed !== false) - .sort(([keyA, dataA], [keyB, dataB]) => { - const isDiskA = dataA.is_disk_entry === true - const isDiskB = dataB.is_disk_entry === true - if (isDiskA && !isDiskB) return 1 - if (!isDiskA && isDiskB) return -1 - if (isDiskA && isDiskB) { - // Sort disks by device name - const deviceA = dataA.device || keyA - const deviceB = dataB.device || keyB - return deviceA.localeCompare(deviceB) - } - return 0 - }) - return (
- {sortedEntries.map(([checkKey, checkData]) => { + {Object.entries(checks) + .filter(([, checkData]) => checkData.installed !== false) + .map(([checkKey, checkData]) => { const isDismissable = checkData.dismissable === true const checkStatus = checkData.status?.toUpperCase() || "OK" - const isDiskEntry = checkData.is_disk_entry === true - - // For disk entries, format label specially - let displayLabel = formatCheckLabel(checkKey) - let diskIcon = null - if (isDiskEntry) { - displayLabel = checkData.device || checkKey.replace(/_/g, '/') - const diskType = checkData.disk_type || '' - if (diskType === 'USB') { - diskIcon = - } else if (diskType === 'NVMe') { - diskIcon = - } else { - diskIcon = - } - } return (
{getStatusIcon(checkData.dismissed ? "INFO" : checkData.status, "sm")} - - {diskIcon} - {displayLabel} - {isDiskEntry && checkData.disk_type && ( - - {checkData.disk_type} - - )} - + {formatCheckLabel(checkKey)} {checkData.detail} {checkData.dismissed && ( @@ -499,7 +459,6 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu )} )} -
) diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx index 0d25558d..a87149e5 100644 --- a/AppImage/components/storage-overview.tsx +++ b/AppImage/components/storage-overview.tsx @@ -1016,59 +1016,34 @@ export function StorageOverview() { className="sm:hidden border border-white/10 rounded-lg p-4 cursor-pointer bg-white/5 transition-colors" onClick={() => handleDiskClick(disk)} > -
- {/* Header row */} -
-
- -

/dev/{disk.name}

- USB -
-
+
+
+ +

/dev/{disk.name}

+ USB +
+
+ {disk.model && disk.model !== "Unknown" && ( +

{disk.model}

+ )} +
{disk.temperature > 0 && (
- - + + {disk.temperature}°C
)} {getHealthBadge(disk.health)} + {(disk.observations_count ?? 0) > 0 && ( + + + {disk.observations_count} + + )}
- - {/* Model if available */} - {disk.model && disk.model !== "Unknown" && ( -

{disk.model}

- )} - - {/* Info grid - 2 columns */} -
-
- Size -

{disk.size_formatted || disk.size || "N/A"}

-
-
- SMART Status -

{disk.smart_status || "N/A"}

-
- {disk.serial && disk.serial !== "Unknown" && ( -
- Serial -

{disk.serial}

-
- )} -
- - {/* Observations badge if any */} - {(disk.observations_count ?? 0) > 0 && ( -
- - - {disk.observations_count} observation{disk.observations_count > 1 ? 's' : ''} - -
- )}
@@ -1314,7 +1289,7 @@ export function StorageOverview() {
{/* Observations Section */} - {(diskObservations.length > 0 || loadingObservations || (selectedDisk.observations_count ?? 0) > 0) && ( + {(diskObservations.length > 0 || loadingObservations) && (

diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index a144e60c..90579fe4 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -2554,55 +2554,6 @@ def get_smart_data(disk_name): import traceback traceback.print_exc() - # ── Integrate persistent worst_health ── - # The health should never improve from a previous worst state without admin intervention. - # This prevents disks from showing "healthy" after they had issues that may have auto-resolved. - try: - current_health = smart_data['health'] - serial = smart_data.get('serial', '') - - # Get persistent worst_health - worst_info = health_persistence.get_disk_worst_health(disk_name, serial if serial != 'Unknown' else None) - - if worst_info: - worst_health = worst_info.get('worst_health', 'healthy') - admin_cleared = worst_info.get('admin_cleared', False) - - # Only apply worst_health if not cleared by admin - if not admin_cleared: - severity_order = {'unknown': -1, 'healthy': 0, 'warning': 1, 'critical': 2} - current_severity = severity_order.get(current_health, 0) - worst_severity = severity_order.get(worst_health, 0) - - # If worst_health is worse than current, use worst_health - if worst_severity > current_severity: - smart_data['health'] = worst_health - smart_data['health_source'] = 'persistent' - smart_data['worst_health_date'] = worst_info.get('worst_health_date') - smart_data['worst_health_reason'] = worst_info.get('worst_health_reason', '') - - # Update worst_health if current is worse (and not already stored) - if current_health in ('warning', 'critical'): - health_reason = '' - if smart_data.get('pending_sectors', 0) > 0: - health_reason = f"{smart_data['pending_sectors']} pending sector(s)" - if smart_data.get('reallocated_sectors', 0) > 0: - if health_reason: - health_reason += f", {smart_data['reallocated_sectors']} reallocated" - else: - health_reason = f"{smart_data['reallocated_sectors']} reallocated sector(s)" - if smart_data.get('smart_status') == 'failed': - health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '') - - health_persistence.update_disk_worst_health( - disk_name, - serial if serial != 'Unknown' else None, - current_health, - health_reason - ) - except Exception as e: - # print(f"[v0] Error integrating worst_health: {e}") - pass return smart_data diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 8fa8ca82..f56c2a1c 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -1034,19 +1034,38 @@ class HealthMonitor: io_error_key = f'disk_{device}' error_key = f'smart_{device}' reason = f'{disk}: {issue["reason"]}' + severity = issue.get('status', 'WARNING') + + # Get serial for this disk to properly track it (important for USB disks) + disk_serial = '' + disk_model = '' + try: + smart_result = subprocess.run( + ['smartctl', '-i', '-j', f'/dev/{device}'], + capture_output=True, text=True, timeout=5 + ) + if smart_result.returncode in (0, 4): + import json + smart_data = json.loads(smart_result.stdout) + disk_serial = smart_data.get('serial_number', '') + disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '') + except Exception: + pass + try: if (not health_persistence.is_error_active(io_error_key, category='disks') and not health_persistence.is_error_active(error_key, category='disks')): health_persistence.record_error( error_key=error_key, category='disks', - severity=issue.get('status', 'WARNING'), + severity=severity, reason=reason, details={ 'disk': device, 'device': disk, 'block_device': device, - 'serial': '', + 'serial': disk_serial, + 'model': disk_model, 'smart_status': 'WARNING', 'smart_lines': issue.get('smart_lines', []), 'io_lines': issue.get('io_lines', []), @@ -1055,6 +1074,12 @@ class HealthMonitor: 'dismissable': True, } ) + # Update worst_health for the disk (persists even if current error clears) + # Use serial for proper USB disk tracking + health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower()) + # Also register the disk for observation tracking + if disk_serial: + health_persistence.register_disk(device, disk_serial, disk_model, 0) except Exception: pass @@ -1073,16 +1098,205 @@ class HealthMonitor: if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK': issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}') storage_details[disk_path] = disk_info + # Update worst_health for I/O errors + device = disk_path.replace('/dev/', '') + io_severity = disk_info.get('status', 'WARNING').lower() + + # Get serial for proper disk tracking (important for USB) + io_serial = '' + io_model = '' + try: + smart_result = subprocess.run( + ['smartctl', '-i', '-j', f'/dev/{device}'], + capture_output=True, text=True, timeout=5 + ) + if smart_result.returncode in (0, 4): + import json + smart_data = json.loads(smart_result.stdout) + io_serial = smart_data.get('serial_number', '') + io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '') + except Exception: + pass + + try: + health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity) + if io_serial: + health_persistence.register_disk(device, io_serial, io_model, 0) + except Exception: + pass - # Build checks dict from storage_details, adding OK entries for items with no issues + # Build checks dict from storage_details + # We consolidate disk error entries (like /Dev/Sda) into physical disk entries + # and only show disks with problems (not healthy ones). checks = {} + disk_errors_by_device = {} # Collect disk errors for consolidation + for key, val in storage_details.items(): + # Check if this is a disk device entry (e.g., /Dev/Sda, /dev/sda, sda) + key_lower = key.lower() + is_disk_entry = ( + key_lower.startswith('/dev/') or + key_lower.startswith('dev/') or + (len(key_lower) <= 10 and (key_lower.startswith('sd') or + key_lower.startswith('nvme') or key_lower.startswith('hd'))) + ) + + if is_disk_entry: + # Extract device name and collect for consolidation + device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/') + if device_name and len(device_name) <= 15: + if device_name not in disk_errors_by_device: + disk_errors_by_device[device_name] = { + 'status': val.get('status', 'WARNING'), + 'detail': val.get('reason', ''), + 'error_key': val.get('error_key'), + 'dismissable': val.get('dismissable', True), + } + else: + # Merge: keep worst status + existing = disk_errors_by_device[device_name] + if val.get('status') == 'CRITICAL': + existing['status'] = 'CRITICAL' + # Append detail if different + new_detail = val.get('reason', '') + if new_detail and new_detail not in existing.get('detail', ''): + existing['detail'] = f"{existing['detail']}; {new_detail}".strip('; ') + continue # Don't add raw disk error entry, we'll add consolidated later + + # Non-disk entries go directly to checks checks[key] = { 'status': val.get('status', 'OK'), 'detail': val.get('reason', 'OK'), **{k: v for k, v in val.items() if k not in ('status', 'reason')} } + # Get physical disk info for matching errors to disks + # This uses the same detection as flask_server.py /api/storage/info + physical_disks = {} + try: + result = subprocess.run( + ['lsblk', '-b', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN'], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + for line in result.stdout.strip().split('\n'): + if not line.strip(): + continue + parts = line.split() + if len(parts) >= 3 and parts[2] == 'disk': + disk_name = parts[0] + # Skip virtual devices + if disk_name.startswith(('zd', 'zram', 'loop', 'ram', 'dm-')): + continue + tran = parts[3].upper() if len(parts) > 3 else '' + is_usb = tran == 'USB' + is_nvme = disk_name.startswith('nvme') + + # Get serial from smartctl + serial = '' + model = '' + try: + smart_result = subprocess.run( + ['smartctl', '-i', '-j', f'/dev/{disk_name}'], + capture_output=True, text=True, timeout=5 + ) + if smart_result.returncode in (0, 4): # 4 = SMART not available but info OK + import json + smart_data = json.loads(smart_result.stdout) + serial = smart_data.get('serial_number', '') + model = smart_data.get('model_name', '') or smart_data.get('model_family', '') + except Exception: + pass + + physical_disks[disk_name] = { + 'serial': serial, + 'model': model, + 'is_usb': is_usb, + 'is_nvme': is_nvme, + 'disk_type': 'USB' if is_usb else ('NVMe' if is_nvme else 'SATA'), + } + except Exception: + pass + + # Add consolidated disk entries (only for disks with errors) + for device_name, error_info in disk_errors_by_device.items(): + # Try to find this disk in physical_disks for enriched info + disk_info = physical_disks.get(device_name, {}) + + # If not found by name, try to match by serial (from error details) + if not disk_info: + error_serial = error_info.get('serial', '') + if error_serial: + for dk, di in physical_disks.items(): + if di.get('serial', '').lower() == error_serial.lower(): + disk_info = di + device_name = dk # Update device name to matched disk + break + + # Determine disk type + disk_type = disk_info.get('disk_type', 'SATA') + if not disk_info: + # Fallback detection + if device_name.startswith('nvme'): + disk_type = 'NVMe' + else: + # Check if USB via sysfs + try: + usb_check = subprocess.run( + ['readlink', '-f', f'/sys/block/{device_name}'], + capture_output=True, text=True, timeout=2 + ) + if 'usb' in usb_check.stdout.lower(): + disk_type = 'USB' + except Exception: + pass + + serial = disk_info.get('serial', '') + model = disk_info.get('model', '') + + # Get worst_health from persistence + try: + health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None) + worst_health = health_status.get('worst_health', 'healthy') + + # Final health = max(current, worst) + health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2} + current_level = health_order.get(error_info['status'].lower(), 1) + worst_level = health_order.get(worst_health.lower(), 0) + + if worst_level > current_level: + # worst_health is worse, use it + final_status = worst_health.upper() + else: + final_status = error_info['status'] + except Exception: + final_status = error_info['status'] + + # Build detail string with serial/model if available + detail = error_info['detail'] + if serial and serial not in detail: + detail = f"{serial} - {detail}" + + # Create consolidated disk entry + check_key = f'/dev/{device_name}' + checks[check_key] = { + 'status': final_status, + 'detail': detail, + 'disk_type': disk_type, + 'device': f'/dev/{device_name}', + 'serial': serial, + 'model': model, + 'error_key': error_info.get('error_key') or f'disk_{device_name}', + 'dismissable': error_info.get('dismissable', True), + 'is_disk_entry': True, + } + + # Register disk in persistence if not already (for worst_health tracking) + try: + health_persistence.register_disk(device_name, serial if serial else None, model, 0) + except Exception: + pass + # ALWAYS add descriptive entries for capabilities this server has. # When everything is OK, they show as OK. When there are issues, # they still appear so the user can see the full picture (e.g. @@ -1105,120 +1319,8 @@ class HealthMonitor: if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks: checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'} - # Get physical disks list for UI display - physical_disks = self._get_physical_disks_list() - - # Collect disk error entries (SMART, I/O, etc.) from checks that should be merged with disk entries - # These have keys like '/Dev/Sda', '/dev/sda', 'sda', etc. - disk_errors_by_device = {} - keys_to_remove = [] - for key, val in checks.items(): - # Skip non-disk error entries (like lvm_check, root_fs, etc.) - key_lower = key.lower() - - # Check if this looks like a disk error entry - is_disk_error = False - device_name = None - - if key_lower.startswith('/dev/') or key_lower.startswith('dev/'): - # Keys like '/Dev/Sda', '/dev/sda' - device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/') - is_disk_error = True - elif key_lower.startswith('sd') or key_lower.startswith('nvme') or key_lower.startswith('hd'): - # Keys like 'sda', 'nvme0n1' - device_name = key_lower - is_disk_error = True - - if is_disk_error and device_name and len(device_name) <= 15: - # Store the error info, merging if we already have an error for this device - if device_name not in disk_errors_by_device: - disk_errors_by_device[device_name] = { - 'status': val.get('status', 'WARNING'), - 'detail': val.get('detail', val.get('reason', '')), - 'error_key': val.get('error_key'), - 'dismissable': val.get('dismissable', True), - 'dismissed': val.get('dismissed', False), - } - else: - # Merge: keep the worst status - existing = disk_errors_by_device[device_name] - if val.get('status') == 'CRITICAL': - existing['status'] = 'CRITICAL' - # Append details - new_detail = val.get('detail', val.get('reason', '')) - if new_detail and new_detail not in existing.get('detail', ''): - existing['detail'] = f"{existing.get('detail', '')}; {new_detail}".strip('; ') - keys_to_remove.append(key) - - # Remove the old disk error entries - they'll be merged into disk entries - for key in keys_to_remove: - del checks[key] - - # Add individual disk checks for UI display (like Network interfaces) - for disk in physical_disks: - device = disk.get('device', '') - name = disk.get('name', '') - serial = disk.get('serial', '') - final_health = disk.get('final_health', 'healthy') - final_reason = disk.get('final_reason', '') - is_usb = disk.get('is_usb', False) - - # Format check key - use device path for uniqueness - check_key = device.lower().replace('/', '_') # e.g., _dev_sda - - # Check if there's a disk error (SMART, I/O, etc.) for this disk - disk_error = disk_errors_by_device.get(name.lower()) - - # Determine status - use disk error status if present, otherwise use final_health - if disk_error and disk_error.get('status') in ('WARNING', 'CRITICAL'): - status = disk_error['status'] - error_detail = disk_error.get('detail', '') - elif final_health == 'critical': - status = 'CRITICAL' - error_detail = '' - elif final_health == 'warning': - status = 'WARNING' - error_detail = '' - else: - status = 'OK' - error_detail = '' - - # Build detail string - disk_type = 'USB' if is_usb else ('NVMe' if disk.get('is_nvme') else 'SATA') - detail = f'{serial}' if serial else 'Unknown serial' - if final_reason: - detail += f' - {final_reason}' - elif error_detail: - detail += f' - {error_detail}' - - # Only add to checks if not already present - if check_key not in checks: - checks[check_key] = { - 'status': status, - 'detail': detail, - 'device': device, - 'serial': serial, - 'disk_type': disk_type, - 'is_disk_entry': True, # Flag to identify disk entries in frontend - 'worst_health': disk.get('worst_health', 'healthy'), - 'worst_health_date': disk.get('worst_health_date'), - 'admin_cleared': disk.get('admin_cleared', False), - } - - # If disk has issues, it needs an error_key for dismiss functionality - if status != 'OK': - # Use disk error_key if available, otherwise generate one - if disk_error and disk_error.get('error_key'): - checks[check_key]['error_key'] = disk_error['error_key'] - else: - checks[check_key]['error_key'] = f'disk_{name}_{serial}' if serial else f'disk_{name}' - checks[check_key]['dismissable'] = True - # Preserve dismissed state from disk error - if disk_error and disk_error.get('dismissed'): - checks[check_key]['dismissed'] = True - if not issues: - return {'status': 'OK', 'checks': checks, 'physical_disks': physical_disks} + return {'status': 'OK', 'checks': checks} # ── Mark dismissed checks ── # If an error_key in a check has been acknowledged (dismissed) in the @@ -1250,7 +1352,6 @@ class HealthMonitor: 'reason': '; '.join(issues[:3]), 'details': storage_details, 'checks': checks, - 'physical_disks': physical_disks, 'all_dismissed': True, } except Exception: @@ -1265,8 +1366,7 @@ class HealthMonitor: 'status': 'CRITICAL' if has_critical else 'WARNING', 'reason': '; '.join(issues[:3]), 'details': storage_details, - 'checks': checks, - 'physical_disks': physical_disks + 'checks': checks } def _check_filesystem(self, mount_point: str) -> Dict[str, Any]: @@ -1350,221 +1450,9 @@ class HealthMonitor: return {'status': 'OK'} # No VGs found, LVM not in use return {'status': 'OK', 'volumes': len(volumes)} - + except Exception: return {'status': 'OK'} - - def _get_physical_disks_list(self) -> List[Dict[str, Any]]: - """Get list of all physical disks with their health status. - - Combines real-time SMART data with persistent worst_health state. - Returns list suitable for display in Health Monitor UI. - """ - disks = [] - - try: - # Get all block devices - result = subprocess.run( - ['lsblk', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN,MODEL,SERIAL'], - capture_output=True, text=True, timeout=5 - ) - - if result.returncode != 0: - return [] - - for line in result.stdout.strip().split('\n'): - if not line.strip(): - continue - - parts = line.split(None, 5) - if len(parts) < 3: - continue - - name = parts[0] - size = parts[1] if len(parts) > 1 else '' - dtype = parts[2] if len(parts) > 2 else '' - transport = parts[3] if len(parts) > 3 else '' - model = parts[4] if len(parts) > 4 else '' - serial = parts[5] if len(parts) > 5 else '' - - # Only include disk type devices - if dtype != 'disk': - continue - - # Skip loop devices, ram disks, etc. - if name.startswith(('loop', 'ram', 'zram')): - continue - - is_usb = transport.lower() == 'usb' - is_nvme = name.startswith('nvme') - - # Get current SMART status - current_health = 'healthy' - smart_status = 'UNKNOWN' - pending_sectors = 0 - reallocated_sectors = 0 - - try: - dev_path = f'/dev/{name}' - smart_result = subprocess.run( - ['smartctl', '-H', '-A', dev_path], - capture_output=True, text=True, timeout=5 - ) - - output = smart_result.stdout - - # Check SMART overall status - if 'PASSED' in output: - smart_status = 'PASSED' - elif 'FAILED' in output: - smart_status = 'FAILED' - current_health = 'critical' - - # Parse SMART attributes for pending/reallocated sectors - for attr_line in output.split('\n'): - if 'Current_Pending_Sector' in attr_line or 'Pending_Sector' in attr_line: - parts_attr = attr_line.split() - if parts_attr: - try: - pending_sectors = int(parts_attr[-1]) - except ValueError: - pass - elif 'Reallocated_Sector' in attr_line: - parts_attr = attr_line.split() - if parts_attr: - try: - reallocated_sectors = int(parts_attr[-1]) - except ValueError: - pass - - # Determine current health based on sectors - if current_health != 'critical': - if pending_sectors > 10 or reallocated_sectors > 10: - current_health = 'critical' - elif pending_sectors > 0 or reallocated_sectors > 0: - current_health = 'warning' - - except Exception: - pass - - # Build health reason - health_reason = '' - if pending_sectors > 0: - health_reason = f'{pending_sectors} pending sector(s)' - if reallocated_sectors > 0: - if health_reason: - health_reason += f', {reallocated_sectors} reallocated' - else: - health_reason = f'{reallocated_sectors} reallocated sector(s)' - if smart_status == 'FAILED': - health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '') - - # Get persistent worst_health from database - worst_info = health_persistence.get_disk_worst_health(name, serial) - worst_health = worst_info.get('worst_health', 'healthy') if worst_info else 'healthy' - worst_health_date = worst_info.get('worst_health_date') if worst_info else None - worst_health_reason = worst_info.get('worst_health_reason', '') if worst_info else '' - admin_cleared = worst_info.get('admin_cleared', False) if worst_info else False - - # Update worst_health if current is worse - if current_health != 'healthy': - updated = health_persistence.update_disk_worst_health( - name, serial, current_health, health_reason - ) - if updated: - worst_health = current_health - worst_health_reason = health_reason - - # Record as disk observation (for both internal and USB disks) - # This ensures SMART issues are tracked in observations - try: - obs_type = 'smart_error' - if pending_sectors and pending_sectors > 0: - obs_type = 'pending_sectors' - elif reallocated_sectors and reallocated_sectors > 0: - obs_type = 'reallocated_sectors' - elif smart_status == 'FAILED': - obs_type = 'smart_failed' - - obs_sig = f'smart_{name}_{obs_type}_{pending_sectors}_{reallocated_sectors}' - health_persistence.record_disk_observation( - device_name=name, - serial=serial, - error_type=obs_type, - error_signature=obs_sig, - raw_message=f'/dev/{name}: {health_reason}', - severity=current_health, - ) - - # Send smart_warning notification if this is a NEW issue - # (only when updated=True means this is first time seeing this state) - if updated: - try: - from notification_manager import notification_manager - notification_manager.send_notification( - event_type='smart_warning', - data={ - 'device': f'/dev/{name}', - 'reason': health_reason, - 'serial': serial or 'Unknown', - 'model': model or 'Unknown', - 'pending_sectors': pending_sectors, - 'reallocated_sectors': reallocated_sectors, - 'smart_status': smart_status, - 'hostname': self._hostname, - } - ) - except Exception: - pass - except Exception: - pass - - # Final health is the worse of current and persistent - severity_order = {'healthy': 0, 'warning': 1, 'critical': 2} - if severity_order.get(worst_health, 0) > severity_order.get(current_health, 0): - final_health = worst_health - final_reason = worst_health_reason - else: - final_health = current_health - final_reason = health_reason - - # Get active observations count - obs = health_persistence.get_disk_observations(device_name=name, serial=serial) - active_observations = len(obs) if obs else 0 - - # Register disk in persistence (for tracking) - try: - health_persistence.register_disk(name, serial, model) - except Exception: - pass - - disks.append({ - 'device': f'/dev/{name}', - 'name': name, - 'serial': serial or '', - 'model': model or 'Unknown', - 'size': size, - 'transport': transport, - 'is_usb': is_usb, - 'is_nvme': is_nvme, - 'smart_status': smart_status, - 'current_health': current_health, - 'current_health_reason': health_reason, - 'worst_health': worst_health, - 'worst_health_date': worst_health_date, - 'worst_health_reason': worst_health_reason, - 'final_health': final_health, - 'final_reason': final_reason, - 'pending_sectors': pending_sectors, - 'reallocated_sectors': reallocated_sectors, - 'active_observations': active_observations, - 'admin_cleared': admin_cleared, - }) - - except Exception as e: - print(f"[HealthMonitor] Error getting physical disks list: {e}") - - return disks # This function is no longer used in get_detailed_status, but kept for reference if needed. # The new _check_proxmox_storage function handles this logic better. diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 04baf545..54cdbae2 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -164,14 +164,25 @@ class HealthPersistence: removed INTEGER DEFAULT 0, worst_health TEXT DEFAULT 'healthy', worst_health_date TEXT, - worst_health_reason TEXT, - admin_cleared INTEGER DEFAULT 0, - admin_cleared_date TEXT, - admin_cleared_note TEXT, + admin_cleared TEXT, UNIQUE(device_name, serial) ) ''') + # Migration: add worst_health columns if they don't exist (for existing DBs) + try: + cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT "healthy"') + except Exception: + pass + try: + cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT') + except Exception: + pass + try: + cursor.execute('ALTER TABLE disk_registry ADD COLUMN admin_cleared TEXT') + except Exception: + pass + # Observation log: deduplicated error events per disk cursor.execute(''' CREATE TABLE IF NOT EXISTS disk_observations ( @@ -195,17 +206,6 @@ class HealthPersistence: cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)') - # Migration: add worst_health columns to disk_registry if not present - cursor.execute("PRAGMA table_info(disk_registry)") - disk_columns = [col[1] for col in cursor.fetchall()] - if 'worst_health' not in disk_columns: - cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT 'healthy'") - cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT") - cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_reason TEXT") - cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared INTEGER DEFAULT 0") - cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_date TEXT") - cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_note TEXT") - conn.commit() conn.close() @@ -1231,26 +1231,11 @@ class HealthPersistence: # a different device_name (e.g. 'ata8' instead of 'sdh'), # update that entry's device_name so observations carry over. if serial: - # Try exact match first cursor.execute(''' SELECT id, device_name FROM disk_registry WHERE serial = ? AND serial != '' AND device_name != ? ''', (serial, device_name)) old_rows = cursor.fetchall() - - # If no exact match, try normalized match (for USB disks with special chars) - if not old_rows: - normalized = self._normalize_serial(serial) - if normalized and normalized != serial: - cursor.execute( - 'SELECT id, device_name, serial FROM disk_registry ' - 'WHERE serial != "" AND device_name != ?', (device_name,)) - for row in cursor.fetchall(): - db_normalized = self._normalize_serial(row[2]) - if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized: - old_rows.append((row[0], row[1])) - break - for old_id, old_dev in old_rows: # Only consolidate ATA names -> block device names if old_dev.startswith('ata') and not device_name.startswith('ata'): @@ -1288,23 +1273,6 @@ class HealthPersistence: except Exception as e: print(f"[HealthPersistence] Error registering disk {device_name}: {e}") - def _normalize_serial(self, serial: str) -> str: - """Normalize serial number for comparison. - - USB disks can have serials with escape sequences like \\x06\\x18 - or non-printable characters. This normalizes them for matching. - """ - if not serial: - return '' - import re - # Remove escape sequences like \x06, \x18 - normalized = re.sub(r'\\x[0-9a-fA-F]{2}', '', serial) - # Remove non-printable characters - normalized = ''.join(c for c in normalized if c.isprintable()) - # Remove common prefixes that vary - normalized = normalized.strip() - return normalized - def _get_disk_registry_id(self, cursor, device_name: str, serial: Optional[str] = None) -> Optional[int]: """Find disk_registry.id, matching by serial first, then device_name. @@ -1313,25 +1281,12 @@ class HealthPersistence: checks entries with ATA names that share the same serial. """ if serial: - # Try exact match first cursor.execute( 'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1', (serial,)) row = cursor.fetchone() if row: return row[0] - - # Try normalized serial match (for USB disks with special chars) - normalized = self._normalize_serial(serial) - if normalized and normalized != serial: - # Search for serials that start with or contain the normalized version - cursor.execute( - 'SELECT id, serial FROM disk_registry WHERE serial != "" ORDER BY last_seen DESC') - for row in cursor.fetchall(): - db_normalized = self._normalize_serial(row[1]) - if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized: - return row[0] - # Fallback: match by device_name (strip /dev/ prefix) clean_dev = device_name.replace('/dev/', '') cursor.execute( @@ -1340,7 +1295,6 @@ class HealthPersistence: row = cursor.fetchone() if row: return row[0] - # Last resort: search for ATA-named entries that might refer to this device # This handles cases where observations were recorded under 'ata8' # but we're querying for 'sdh' @@ -1353,6 +1307,131 @@ class HealthPersistence: pass return None + def update_disk_worst_health(self, device_name: str, serial: Optional[str], + new_health: str) -> bool: + """Update worst_health if new_health is worse than current. + + Health hierarchy: healthy < warning < critical + Only escalates, never downgrades automatically. + + Returns True if worst_health was updated. + """ + health_order = {'healthy': 0, 'warning': 1, 'critical': 2} + new_level = health_order.get(new_health.lower(), 0) + + if new_level == 0: # healthy never updates worst_health + return False + + now = datetime.now().isoformat() + try: + conn = self._get_conn() + cursor = conn.cursor() + + disk_id = self._get_disk_registry_id(cursor, device_name, serial) + if not disk_id: + # Register disk first + self.register_disk(device_name.replace('/dev/', ''), serial) + disk_id = self._get_disk_registry_id(cursor, device_name, serial) + + if not disk_id: + conn.close() + return False + + # Get current worst_health + cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,)) + row = cursor.fetchone() + current_worst = row[0] if row and row[0] else 'healthy' + current_level = health_order.get(current_worst.lower(), 0) + + # Only update if new health is worse + if new_level > current_level: + cursor.execute(''' + UPDATE disk_registry + SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL + WHERE id = ? + ''', (new_health.lower(), now, disk_id)) + conn.commit() + conn.close() + return True + + conn.close() + return False + except Exception as e: + print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}") + return False + + def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]: + """Get the health status of a disk including worst_health. + + Returns dict with: + - worst_health: 'healthy', 'warning', or 'critical' + - worst_health_date: ISO timestamp when worst_health was set + - admin_cleared: ISO timestamp if admin manually cleared the health + - observations_count: Number of recorded observations + """ + try: + conn = self._get_conn() + cursor = conn.cursor() + + disk_id = self._get_disk_registry_id(cursor, device_name, serial) + if not disk_id: + conn.close() + return {'worst_health': 'healthy', 'observations_count': 0} + + cursor.execute(''' + SELECT worst_health, worst_health_date, admin_cleared + FROM disk_registry WHERE id = ? + ''', (disk_id,)) + row = cursor.fetchone() + + # Count observations + cursor.execute( + 'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0', + (disk_id,)) + obs_count = cursor.fetchone()[0] + + conn.close() + + if row: + return { + 'worst_health': row[0] or 'healthy', + 'worst_health_date': row[1], + 'admin_cleared': row[2], + 'observations_count': obs_count + } + return {'worst_health': 'healthy', 'observations_count': obs_count} + except Exception as e: + print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}") + return {'worst_health': 'healthy', 'observations_count': 0} + + def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool: + """Admin action: clear worst_health back to healthy. + + This resets the health status but keeps all observations for audit. + Records when the admin cleared it for accountability. + """ + now = datetime.now().isoformat() + try: + conn = self._get_conn() + cursor = conn.cursor() + + disk_id = self._get_disk_registry_id(cursor, device_name, serial) + if not disk_id: + conn.close() + return False + + cursor.execute(''' + UPDATE disk_registry + SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ? + WHERE id = ? + ''', (now, disk_id)) + conn.commit() + conn.close() + return True + except Exception as e: + print(f"[HealthPersistence] Error clearing health for {device_name}: {e}") + return False + def record_disk_observation(self, device_name: str, serial: Optional[str], error_type: str, error_signature: str, raw_message: str = '', @@ -1391,6 +1470,10 @@ class HealthPersistence: conn.commit() conn.close() + + # Update worst_health based on observation severity + self.update_disk_worst_health(clean_dev, serial, severity) + except Exception as e: print(f"[HealthPersistence] Error recording disk observation: {e}") @@ -1539,186 +1622,6 @@ class HealthPersistence: except Exception as e: print(f"[HealthPersistence] Error marking removed disks: {e}") - # ──────────────────────────────────────────────────────────────── - # Disk Worst Health State Tracking - # ──────────────────────────────────────────────────────────────── - - HEALTH_SEVERITY_ORDER = {'healthy': 0, 'warning': 1, 'critical': 2} - - def update_disk_worst_health(self, device_name: str, serial: Optional[str], - health: str, reason: str = '') -> bool: - """Update worst_health if the new health is worse than current. - - Health progression is one-way: healthy -> warning -> critical - Only admin_clear_disk_health() can reset to healthy. - - Returns True if worst_health was updated. - """ - health_lower = health.lower() - if health_lower not in self.HEALTH_SEVERITY_ORDER: - return False - - try: - conn = self._get_conn() - cursor = conn.cursor() - - disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial) - if not disk_id: - # Auto-register disk if not present - self.register_disk(device_name.replace('/dev/', ''), serial) - disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial) - - if not disk_id: - conn.close() - return False - - # Get current worst_health - cursor.execute('SELECT worst_health, admin_cleared FROM disk_registry WHERE id = ?', (disk_id,)) - row = cursor.fetchone() - if not row: - conn.close() - return False - - current_worst = row[0] or 'healthy' - admin_cleared = row[1] or 0 - - # If admin cleared and new issue is the same or less severe, don't update - # But if admin cleared and issue escalates, update anyway - current_severity = self.HEALTH_SEVERITY_ORDER.get(current_worst, 0) - new_severity = self.HEALTH_SEVERITY_ORDER.get(health_lower, 0) - - # Only update if new health is worse - if new_severity > current_severity: - now = datetime.now().isoformat() - cursor.execute(''' - UPDATE disk_registry - SET worst_health = ?, worst_health_date = ?, worst_health_reason = ?, - admin_cleared = 0 - WHERE id = ? - ''', (health_lower, now, reason, disk_id)) - conn.commit() - conn.close() - return True - - conn.close() - return False - except Exception as e: - print(f"[HealthPersistence] Error updating disk worst_health: {e}") - return False - - def get_disk_worst_health(self, device_name: str, serial: Optional[str] = None) -> Optional[Dict[str, Any]]: - """Get the worst health state for a specific disk.""" - try: - conn = self._get_conn() - cursor = conn.cursor() - - disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial) - if not disk_id: - conn.close() - return None - - cursor.execute(''' - SELECT worst_health, worst_health_date, worst_health_reason, - admin_cleared, admin_cleared_date, admin_cleared_note - FROM disk_registry WHERE id = ? - ''', (disk_id,)) - row = cursor.fetchone() - conn.close() - - if row: - return { - 'worst_health': row[0] or 'healthy', - 'worst_health_date': row[1], - 'worst_health_reason': row[2], - 'admin_cleared': bool(row[3]), - 'admin_cleared_date': row[4], - 'admin_cleared_note': row[5], - } - return None - except Exception as e: - print(f"[HealthPersistence] Error getting disk worst_health: {e}") - return None - - def admin_clear_disk_health(self, device_name: str, serial: Optional[str], note: str) -> bool: - """Admin manually clears disk health history (e.g., after disk replacement). - - Requires a note explaining why (for audit trail). - """ - if not note or len(note.strip()) < 5: - return False # Require meaningful note - - try: - conn = self._get_conn() - cursor = conn.cursor() - - disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial) - if not disk_id: - conn.close() - return False - - now = datetime.now().isoformat() - cursor.execute(''' - UPDATE disk_registry - SET worst_health = 'healthy', admin_cleared = 1, - admin_cleared_date = ?, admin_cleared_note = ? - WHERE id = ? - ''', (now, note.strip(), disk_id)) - - # Also dismiss all active observations for this disk - cursor.execute(''' - UPDATE disk_observations SET dismissed = 1 WHERE disk_registry_id = ? - ''', (disk_id,)) - - conn.commit() - conn.close() - return True - except Exception as e: - print(f"[HealthPersistence] Error clearing disk health: {e}") - return False - - def get_all_disks_health_summary(self) -> List[Dict[str, Any]]: - """Get health summary for all registered disks (for Health Monitor listing). - - Returns list of disks with their current and worst health states. - """ - try: - conn = self._get_conn() - cursor = conn.cursor() - - cursor.execute(''' - SELECT d.id, d.device_name, d.serial, d.model, d.size_bytes, - d.first_seen, d.last_seen, d.removed, - d.worst_health, d.worst_health_date, d.worst_health_reason, - d.admin_cleared, d.admin_cleared_date, - (SELECT COUNT(*) FROM disk_observations o - WHERE o.disk_registry_id = d.id AND o.dismissed = 0) as active_observations - FROM disk_registry d - WHERE d.removed = 0 - ORDER BY d.device_name - ''') - rows = cursor.fetchall() - conn.close() - - return [{ - 'id': r[0], - 'device_name': r[1], - 'serial': r[2] or '', - 'model': r[3] or 'Unknown', - 'size_bytes': r[4], - 'first_seen': r[5], - 'last_seen': r[6], - 'removed': bool(r[7]), - 'worst_health': r[8] or 'healthy', - 'worst_health_date': r[9], - 'worst_health_reason': r[10] or '', - 'admin_cleared': bool(r[11]), - 'admin_cleared_date': r[12], - 'active_observations': r[13], - } for r in rows] - except Exception as e: - print(f"[HealthPersistence] Error getting disks health summary: {e}") - return [] - # Global instance health_persistence = HealthPersistence() diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index c3b74939..2caf0257 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -402,16 +402,47 @@ class JournalWatcher: entity = 'disk' entity_id = f'fs_{device}' - # ── 24h dedup for filesystem errors per device ── + # ── Get disk serial for USB-aware cooldown ── + # USB disks can change device names (sda->sdb) on reconnect. + # Using serial as cooldown key ensures same physical disk + # shares one 24h cooldown regardless of device letter. + import os as _os + base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else '' + disk_serial = '' + is_usb_disk = False + if base_dev: + try: + # Check if USB via sysfs + sysfs_link = subprocess.run( + ['readlink', '-f', f'/sys/block/{base_dev}'], + capture_output=True, text=True, timeout=2 + ) + is_usb_disk = 'usb' in sysfs_link.stdout.lower() + + # Get serial from smartctl + smart_result = subprocess.run( + ['smartctl', '-i', '-j', f'/dev/{base_dev}'], + capture_output=True, text=True, timeout=5 + ) + if smart_result.returncode in (0, 4): + import json + smart_data = json.loads(smart_result.stdout) + disk_serial = smart_data.get('serial_number', '') + except Exception: + pass + + # ── 24h dedup for filesystem errors ── + # Use serial for USB disks, device name for others now_fs = time.time() - fs_dedup_key = f'fs_{device}' + if is_usb_disk and disk_serial: + fs_dedup_key = f'fs_serial_{disk_serial}' + else: + fs_dedup_key = f'fs_{device}' last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0) if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN: return # Already notified for this device recently - # ── SMART + device existence gating ── - import os as _os - base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else '' + # ── Device existence gating ── device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}') if not device_exists and device != 'unknown': @@ -749,7 +780,6 @@ class JournalWatcher: """Extract device info from a smartd system-mail and record as disk observation.""" try: import re as _re - import subprocess from health_persistence import health_persistence # Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda" @@ -770,21 +800,6 @@ class JournalWatcher: if model_match: model = model_match.group(1).strip() - # If no serial from message, try to get it from smartctl (important for USB disks) - if not serial or len(serial) < 3: - try: - result = subprocess.run( - ['smartctl', '-i', '-j', f'/dev/{base_dev}'], - capture_output=True, text=True, timeout=5 - ) - import json as _json - data = _json.loads(result.stdout) - serial = data.get('serial_number', '') or serial - if not model: - model = data.get('model_name', '') or data.get('model_family', '') - except Exception: - pass - # Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)" sig_match = _re.search(r'SMART error\s*\((\w+)\)', title) if sig_match: @@ -821,12 +836,10 @@ class JournalWatcher: severity='warning', ) - # Also update worst_health so the disk stays marked as warning - # even if current SMART readings show 0 pending sectors - warn_line_text = warn_line_m.group(1).strip() if warn_line_m else error_signature - health_persistence.update_disk_worst_health( - base_dev, serial, 'warning', warn_line_text - ) + # Update worst_health for permanent tracking (record_disk_observation + # already does this, but we ensure it here for safety) + health_persistence.update_disk_worst_health(base_dev, serial, 'warning') + except Exception as e: print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}") @@ -1751,8 +1764,26 @@ class PollingCollector: if isinstance(details_raw, dict): # Extract device name for a stable entity_id (24h cooldown key) dev = details_raw.get('device', details_raw.get('disk', '')) - if dev: - eid = f'disk_{dev}' # Stable per-device fingerprint + serial = details_raw.get('serial', '') + + # For USB disks, use serial as entity_id for stable cooldown + # USB disks can change device names (sda->sdb) on reconnect + # Using serial ensures same physical disk shares cooldown + if serial and dev: + # Check if this is a USB disk + try: + sysfs_result = subprocess.run( + ['readlink', '-f', f'/sys/block/{dev.replace("/dev/", "")}'], + capture_output=True, text=True, timeout=2 + ) + if 'usb' in sysfs_result.stdout.lower(): + eid = f'disk_serial_{serial}' # USB: use serial + else: + eid = f'disk_{dev}' # Non-USB: use device name + except Exception: + eid = f'disk_{dev}' # Fallback to device name + elif dev: + eid = f'disk_{dev}' # No serial: use device name # Updates are always informational notifications except # system_age which can be WARNING (365+ days) or CRITICAL (548+ days). @@ -1818,15 +1849,26 @@ class PollingCollector: except Exception: pass - # Skip recovery notifications for SMART disk errors (pending/reallocated sectors). - # These indicate physical disk degradation that doesn't truly "recover" -- - # the disk may show 0 pending sectors later but the damage history persists. - # The worst_health in disk_registry tracks this, so we don't send false "resolved". + # Skip recovery notifications for PERMANENT disk events. + # These indicate physical disk degradation that doesn't truly "recover": + # - SMART pending/reallocated sectors indicate physical damage + # - Disk may show 0 pending sectors later but damage history persists + # - Sending "Resolved" gives false sense of security + # The worst_health in disk_registry tracks this permanently. if category == 'disks': - reason_lower = reason.lower() if reason else '' - if any(indicator in reason_lower for indicator in [ - 'pending', 'reallocated', 'sector', 'smart', 'unreadable' - ]): + reason_lower = (reason or '').lower() + permanent_indicators = [ + 'pending', # pending sectors + 'reallocated', # reallocated sectors + 'unreadable', # unreadable sectors + 'smart', # SMART errors + 'surface error', # disk surface errors + 'bad sector', # bad sectors + 'i/o error', # I/O errors (repeated) + 'medium error', # SCSI medium errors + ] + if any(indicator in reason_lower for indicator in permanent_indicators): + # Don't send recovery - just clean up tracking self._last_notified.pop(key, None) continue diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index ef8306b3..a67de4da 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -559,13 +559,6 @@ TEMPLATES = { 'group': 'storage', 'default_enabled': True, }, - 'smart_warning': { - 'title': '{hostname}: SMART warning on {device}', - 'body': '{device}: {reason}', - 'label': 'SMART warning (sectors)', - 'group': 'storage', - 'default_enabled': True, - }, 'storage_unavailable': { 'title': '{hostname}: Storage unavailable - {storage_name}', 'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',