- {sortedEntries.map(([checkKey, checkData]) => {
+ {Object.entries(checks)
+ .filter(([, checkData]) => checkData.installed !== false)
+ .map(([checkKey, checkData]) => {
const isDismissable = checkData.dismissable === true
const checkStatus = checkData.status?.toUpperCase() || "OK"
- const isDiskEntry = checkData.is_disk_entry === true
-
- // For disk entries, format label specially
- let displayLabel = formatCheckLabel(checkKey)
- let diskIcon = null
- if (isDiskEntry) {
- displayLabel = checkData.device || checkKey.replace(/_/g, '/')
- const diskType = checkData.disk_type || ''
- if (diskType === 'USB') {
- diskIcon =
)
diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx
index 0d25558d..a87149e5 100644
--- a/AppImage/components/storage-overview.tsx
+++ b/AppImage/components/storage-overview.tsx
@@ -1016,59 +1016,34 @@ export function StorageOverview() {
className="sm:hidden border border-white/10 rounded-lg p-4 cursor-pointer bg-white/5 transition-colors"
onClick={() => handleDiskClick(disk)}
>
-
- {/* Header row */}
-
-
-
-
/dev/{disk.name}
- USB
-
-
+
+
+
+
/dev/{disk.name}
+ USB
+
+
+ {disk.model && disk.model !== "Unknown" && (
+
{disk.model}
+ )}
+
{disk.temperature > 0 && (
-
-
+
+
{disk.temperature}°C
)}
{getHealthBadge(disk.health)}
+ {(disk.observations_count ?? 0) > 0 && (
+
+
+ {disk.observations_count}
+
+ )}
-
- {/* Model if available */}
- {disk.model && disk.model !== "Unknown" && (
-
{disk.model}
- )}
-
- {/* Info grid - 2 columns */}
-
-
-
Size
-
{disk.size_formatted || disk.size || "N/A"}
-
-
-
SMART Status
-
{disk.smart_status || "N/A"}
-
- {disk.serial && disk.serial !== "Unknown" && (
-
-
Serial
-
{disk.serial}
-
- )}
-
-
- {/* Observations badge if any */}
- {(disk.observations_count ?? 0) > 0 && (
-
-
-
- {disk.observations_count} observation{disk.observations_count > 1 ? 's' : ''}
-
-
- )}
@@ -1314,7 +1289,7 @@ export function StorageOverview() {
{/* Observations Section */}
- {(diskObservations.length > 0 || loadingObservations || (selectedDisk.observations_count ?? 0) > 0) && (
+ {(diskObservations.length > 0 || loadingObservations) && (
diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py
index a144e60c..90579fe4 100644
--- a/AppImage/scripts/flask_server.py
+++ b/AppImage/scripts/flask_server.py
@@ -2554,55 +2554,6 @@ def get_smart_data(disk_name):
import traceback
traceback.print_exc()
- # ── Integrate persistent worst_health ──
- # The health should never improve from a previous worst state without admin intervention.
- # This prevents disks from showing "healthy" after they had issues that may have auto-resolved.
- try:
- current_health = smart_data['health']
- serial = smart_data.get('serial', '')
-
- # Get persistent worst_health
- worst_info = health_persistence.get_disk_worst_health(disk_name, serial if serial != 'Unknown' else None)
-
- if worst_info:
- worst_health = worst_info.get('worst_health', 'healthy')
- admin_cleared = worst_info.get('admin_cleared', False)
-
- # Only apply worst_health if not cleared by admin
- if not admin_cleared:
- severity_order = {'unknown': -1, 'healthy': 0, 'warning': 1, 'critical': 2}
- current_severity = severity_order.get(current_health, 0)
- worst_severity = severity_order.get(worst_health, 0)
-
- # If worst_health is worse than current, use worst_health
- if worst_severity > current_severity:
- smart_data['health'] = worst_health
- smart_data['health_source'] = 'persistent'
- smart_data['worst_health_date'] = worst_info.get('worst_health_date')
- smart_data['worst_health_reason'] = worst_info.get('worst_health_reason', '')
-
- # Update worst_health if current is worse (and not already stored)
- if current_health in ('warning', 'critical'):
- health_reason = ''
- if smart_data.get('pending_sectors', 0) > 0:
- health_reason = f"{smart_data['pending_sectors']} pending sector(s)"
- if smart_data.get('reallocated_sectors', 0) > 0:
- if health_reason:
- health_reason += f", {smart_data['reallocated_sectors']} reallocated"
- else:
- health_reason = f"{smart_data['reallocated_sectors']} reallocated sector(s)"
- if smart_data.get('smart_status') == 'failed':
- health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
-
- health_persistence.update_disk_worst_health(
- disk_name,
- serial if serial != 'Unknown' else None,
- current_health,
- health_reason
- )
- except Exception as e:
- # print(f"[v0] Error integrating worst_health: {e}")
- pass
return smart_data
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index 8fa8ca82..f56c2a1c 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -1034,19 +1034,38 @@ class HealthMonitor:
io_error_key = f'disk_{device}'
error_key = f'smart_{device}'
reason = f'{disk}: {issue["reason"]}'
+ severity = issue.get('status', 'WARNING')
+
+ # Get serial for this disk to properly track it (important for USB disks)
+ disk_serial = ''
+ disk_model = ''
+ try:
+ smart_result = subprocess.run(
+ ['smartctl', '-i', '-j', f'/dev/{device}'],
+ capture_output=True, text=True, timeout=5
+ )
+ if smart_result.returncode in (0, 4):
+ import json
+ smart_data = json.loads(smart_result.stdout)
+ disk_serial = smart_data.get('serial_number', '')
+ disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
+ except Exception:
+ pass
+
try:
if (not health_persistence.is_error_active(io_error_key, category='disks') and
not health_persistence.is_error_active(error_key, category='disks')):
health_persistence.record_error(
error_key=error_key,
category='disks',
- severity=issue.get('status', 'WARNING'),
+ severity=severity,
reason=reason,
details={
'disk': device,
'device': disk,
'block_device': device,
- 'serial': '',
+ 'serial': disk_serial,
+ 'model': disk_model,
'smart_status': 'WARNING',
'smart_lines': issue.get('smart_lines', []),
'io_lines': issue.get('io_lines', []),
@@ -1055,6 +1074,12 @@ class HealthMonitor:
'dismissable': True,
}
)
+ # Update worst_health for the disk (persists even if current error clears)
+ # Use serial for proper USB disk tracking
+ health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower())
+ # Also register the disk for observation tracking
+ if disk_serial:
+ health_persistence.register_disk(device, disk_serial, disk_model, 0)
except Exception:
pass
@@ -1073,16 +1098,205 @@ class HealthMonitor:
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
storage_details[disk_path] = disk_info
+ # Update worst_health for I/O errors
+ device = disk_path.replace('/dev/', '')
+ io_severity = disk_info.get('status', 'WARNING').lower()
+
+ # Get serial for proper disk tracking (important for USB)
+ io_serial = ''
+ io_model = ''
+ try:
+ smart_result = subprocess.run(
+ ['smartctl', '-i', '-j', f'/dev/{device}'],
+ capture_output=True, text=True, timeout=5
+ )
+ if smart_result.returncode in (0, 4):
+ import json
+ smart_data = json.loads(smart_result.stdout)
+ io_serial = smart_data.get('serial_number', '')
+ io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
+ except Exception:
+ pass
+
+ try:
+ health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity)
+ if io_serial:
+ health_persistence.register_disk(device, io_serial, io_model, 0)
+ except Exception:
+ pass
- # Build checks dict from storage_details, adding OK entries for items with no issues
+ # Build checks dict from storage_details
+ # We consolidate disk error entries (like /Dev/Sda) into physical disk entries
+ # and only show disks with problems (not healthy ones).
checks = {}
+ disk_errors_by_device = {} # Collect disk errors for consolidation
+
for key, val in storage_details.items():
+ # Check if this is a disk device entry (e.g., /Dev/Sda, /dev/sda, sda)
+ key_lower = key.lower()
+ is_disk_entry = (
+ key_lower.startswith('/dev/') or
+ key_lower.startswith('dev/') or
+ (len(key_lower) <= 10 and (key_lower.startswith('sd') or
+ key_lower.startswith('nvme') or key_lower.startswith('hd')))
+ )
+
+ if is_disk_entry:
+ # Extract device name and collect for consolidation
+ device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
+ if device_name and len(device_name) <= 15:
+ if device_name not in disk_errors_by_device:
+ disk_errors_by_device[device_name] = {
+ 'status': val.get('status', 'WARNING'),
+ 'detail': val.get('reason', ''),
+ 'error_key': val.get('error_key'),
+ 'dismissable': val.get('dismissable', True),
+ }
+ else:
+ # Merge: keep worst status
+ existing = disk_errors_by_device[device_name]
+ if val.get('status') == 'CRITICAL':
+ existing['status'] = 'CRITICAL'
+ # Append detail if different
+ new_detail = val.get('reason', '')
+ if new_detail and new_detail not in existing.get('detail', ''):
+ existing['detail'] = f"{existing['detail']}; {new_detail}".strip('; ')
+ continue # Don't add raw disk error entry, we'll add consolidated later
+
+ # Non-disk entries go directly to checks
checks[key] = {
'status': val.get('status', 'OK'),
'detail': val.get('reason', 'OK'),
**{k: v for k, v in val.items() if k not in ('status', 'reason')}
}
+ # Get physical disk info for matching errors to disks
+ # This uses the same detection as flask_server.py /api/storage/info
+ physical_disks = {}
+ try:
+ result = subprocess.run(
+ ['lsblk', '-b', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN'],
+ capture_output=True, text=True, timeout=5
+ )
+ if result.returncode == 0:
+ for line in result.stdout.strip().split('\n'):
+ if not line.strip():
+ continue
+ parts = line.split()
+ if len(parts) >= 3 and parts[2] == 'disk':
+ disk_name = parts[0]
+ # Skip virtual devices
+ if disk_name.startswith(('zd', 'zram', 'loop', 'ram', 'dm-')):
+ continue
+ tran = parts[3].upper() if len(parts) > 3 else ''
+ is_usb = tran == 'USB'
+ is_nvme = disk_name.startswith('nvme')
+
+ # Get serial from smartctl
+ serial = ''
+ model = ''
+ try:
+ smart_result = subprocess.run(
+ ['smartctl', '-i', '-j', f'/dev/{disk_name}'],
+ capture_output=True, text=True, timeout=5
+ )
+ if smart_result.returncode in (0, 4): # 4 = SMART not available but info OK
+ import json
+ smart_data = json.loads(smart_result.stdout)
+ serial = smart_data.get('serial_number', '')
+ model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
+ except Exception:
+ pass
+
+ physical_disks[disk_name] = {
+ 'serial': serial,
+ 'model': model,
+ 'is_usb': is_usb,
+ 'is_nvme': is_nvme,
+ 'disk_type': 'USB' if is_usb else ('NVMe' if is_nvme else 'SATA'),
+ }
+ except Exception:
+ pass
+
+ # Add consolidated disk entries (only for disks with errors)
+ for device_name, error_info in disk_errors_by_device.items():
+ # Try to find this disk in physical_disks for enriched info
+ disk_info = physical_disks.get(device_name, {})
+
+ # If not found by name, try to match by serial (from error details)
+ if not disk_info:
+ error_serial = error_info.get('serial', '')
+ if error_serial:
+ for dk, di in physical_disks.items():
+ if di.get('serial', '').lower() == error_serial.lower():
+ disk_info = di
+ device_name = dk # Update device name to matched disk
+ break
+
+ # Determine disk type
+ disk_type = disk_info.get('disk_type', 'SATA')
+ if not disk_info:
+ # Fallback detection
+ if device_name.startswith('nvme'):
+ disk_type = 'NVMe'
+ else:
+ # Check if USB via sysfs
+ try:
+ usb_check = subprocess.run(
+ ['readlink', '-f', f'/sys/block/{device_name}'],
+ capture_output=True, text=True, timeout=2
+ )
+ if 'usb' in usb_check.stdout.lower():
+ disk_type = 'USB'
+ except Exception:
+ pass
+
+ serial = disk_info.get('serial', '')
+ model = disk_info.get('model', '')
+
+ # Get worst_health from persistence
+ try:
+ health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None)
+ worst_health = health_status.get('worst_health', 'healthy')
+
+ # Final health = max(current, worst)
+ health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2}
+ current_level = health_order.get(error_info['status'].lower(), 1)
+ worst_level = health_order.get(worst_health.lower(), 0)
+
+ if worst_level > current_level:
+ # worst_health is worse, use it
+ final_status = worst_health.upper()
+ else:
+ final_status = error_info['status']
+ except Exception:
+ final_status = error_info['status']
+
+ # Build detail string with serial/model if available
+ detail = error_info['detail']
+ if serial and serial not in detail:
+ detail = f"{serial} - {detail}"
+
+ # Create consolidated disk entry
+ check_key = f'/dev/{device_name}'
+ checks[check_key] = {
+ 'status': final_status,
+ 'detail': detail,
+ 'disk_type': disk_type,
+ 'device': f'/dev/{device_name}',
+ 'serial': serial,
+ 'model': model,
+ 'error_key': error_info.get('error_key') or f'disk_{device_name}',
+ 'dismissable': error_info.get('dismissable', True),
+ 'is_disk_entry': True,
+ }
+
+ # Register disk in persistence if not already (for worst_health tracking)
+ try:
+ health_persistence.register_disk(device_name, serial if serial else None, model, 0)
+ except Exception:
+ pass
+
# ALWAYS add descriptive entries for capabilities this server has.
# When everything is OK, they show as OK. When there are issues,
# they still appear so the user can see the full picture (e.g.
@@ -1105,120 +1319,8 @@ class HealthMonitor:
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
- # Get physical disks list for UI display
- physical_disks = self._get_physical_disks_list()
-
- # Collect disk error entries (SMART, I/O, etc.) from checks that should be merged with disk entries
- # These have keys like '/Dev/Sda', '/dev/sda', 'sda', etc.
- disk_errors_by_device = {}
- keys_to_remove = []
- for key, val in checks.items():
- # Skip non-disk error entries (like lvm_check, root_fs, etc.)
- key_lower = key.lower()
-
- # Check if this looks like a disk error entry
- is_disk_error = False
- device_name = None
-
- if key_lower.startswith('/dev/') or key_lower.startswith('dev/'):
- # Keys like '/Dev/Sda', '/dev/sda'
- device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
- is_disk_error = True
- elif key_lower.startswith('sd') or key_lower.startswith('nvme') or key_lower.startswith('hd'):
- # Keys like 'sda', 'nvme0n1'
- device_name = key_lower
- is_disk_error = True
-
- if is_disk_error and device_name and len(device_name) <= 15:
- # Store the error info, merging if we already have an error for this device
- if device_name not in disk_errors_by_device:
- disk_errors_by_device[device_name] = {
- 'status': val.get('status', 'WARNING'),
- 'detail': val.get('detail', val.get('reason', '')),
- 'error_key': val.get('error_key'),
- 'dismissable': val.get('dismissable', True),
- 'dismissed': val.get('dismissed', False),
- }
- else:
- # Merge: keep the worst status
- existing = disk_errors_by_device[device_name]
- if val.get('status') == 'CRITICAL':
- existing['status'] = 'CRITICAL'
- # Append details
- new_detail = val.get('detail', val.get('reason', ''))
- if new_detail and new_detail not in existing.get('detail', ''):
- existing['detail'] = f"{existing.get('detail', '')}; {new_detail}".strip('; ')
- keys_to_remove.append(key)
-
- # Remove the old disk error entries - they'll be merged into disk entries
- for key in keys_to_remove:
- del checks[key]
-
- # Add individual disk checks for UI display (like Network interfaces)
- for disk in physical_disks:
- device = disk.get('device', '')
- name = disk.get('name', '')
- serial = disk.get('serial', '')
- final_health = disk.get('final_health', 'healthy')
- final_reason = disk.get('final_reason', '')
- is_usb = disk.get('is_usb', False)
-
- # Format check key - use device path for uniqueness
- check_key = device.lower().replace('/', '_') # e.g., _dev_sda
-
- # Check if there's a disk error (SMART, I/O, etc.) for this disk
- disk_error = disk_errors_by_device.get(name.lower())
-
- # Determine status - use disk error status if present, otherwise use final_health
- if disk_error and disk_error.get('status') in ('WARNING', 'CRITICAL'):
- status = disk_error['status']
- error_detail = disk_error.get('detail', '')
- elif final_health == 'critical':
- status = 'CRITICAL'
- error_detail = ''
- elif final_health == 'warning':
- status = 'WARNING'
- error_detail = ''
- else:
- status = 'OK'
- error_detail = ''
-
- # Build detail string
- disk_type = 'USB' if is_usb else ('NVMe' if disk.get('is_nvme') else 'SATA')
- detail = f'{serial}' if serial else 'Unknown serial'
- if final_reason:
- detail += f' - {final_reason}'
- elif error_detail:
- detail += f' - {error_detail}'
-
- # Only add to checks if not already present
- if check_key not in checks:
- checks[check_key] = {
- 'status': status,
- 'detail': detail,
- 'device': device,
- 'serial': serial,
- 'disk_type': disk_type,
- 'is_disk_entry': True, # Flag to identify disk entries in frontend
- 'worst_health': disk.get('worst_health', 'healthy'),
- 'worst_health_date': disk.get('worst_health_date'),
- 'admin_cleared': disk.get('admin_cleared', False),
- }
-
- # If disk has issues, it needs an error_key for dismiss functionality
- if status != 'OK':
- # Use disk error_key if available, otherwise generate one
- if disk_error and disk_error.get('error_key'):
- checks[check_key]['error_key'] = disk_error['error_key']
- else:
- checks[check_key]['error_key'] = f'disk_{name}_{serial}' if serial else f'disk_{name}'
- checks[check_key]['dismissable'] = True
- # Preserve dismissed state from disk error
- if disk_error and disk_error.get('dismissed'):
- checks[check_key]['dismissed'] = True
-
if not issues:
- return {'status': 'OK', 'checks': checks, 'physical_disks': physical_disks}
+ return {'status': 'OK', 'checks': checks}
# ── Mark dismissed checks ──
# If an error_key in a check has been acknowledged (dismissed) in the
@@ -1250,7 +1352,6 @@ class HealthMonitor:
'reason': '; '.join(issues[:3]),
'details': storage_details,
'checks': checks,
- 'physical_disks': physical_disks,
'all_dismissed': True,
}
except Exception:
@@ -1265,8 +1366,7 @@ class HealthMonitor:
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': storage_details,
- 'checks': checks,
- 'physical_disks': physical_disks
+ 'checks': checks
}
def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
@@ -1350,221 +1450,9 @@ class HealthMonitor:
return {'status': 'OK'} # No VGs found, LVM not in use
return {'status': 'OK', 'volumes': len(volumes)}
-
+
except Exception:
return {'status': 'OK'}
-
- def _get_physical_disks_list(self) -> List[Dict[str, Any]]:
- """Get list of all physical disks with their health status.
-
- Combines real-time SMART data with persistent worst_health state.
- Returns list suitable for display in Health Monitor UI.
- """
- disks = []
-
- try:
- # Get all block devices
- result = subprocess.run(
- ['lsblk', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN,MODEL,SERIAL'],
- capture_output=True, text=True, timeout=5
- )
-
- if result.returncode != 0:
- return []
-
- for line in result.stdout.strip().split('\n'):
- if not line.strip():
- continue
-
- parts = line.split(None, 5)
- if len(parts) < 3:
- continue
-
- name = parts[0]
- size = parts[1] if len(parts) > 1 else ''
- dtype = parts[2] if len(parts) > 2 else ''
- transport = parts[3] if len(parts) > 3 else ''
- model = parts[4] if len(parts) > 4 else ''
- serial = parts[5] if len(parts) > 5 else ''
-
- # Only include disk type devices
- if dtype != 'disk':
- continue
-
- # Skip loop devices, ram disks, etc.
- if name.startswith(('loop', 'ram', 'zram')):
- continue
-
- is_usb = transport.lower() == 'usb'
- is_nvme = name.startswith('nvme')
-
- # Get current SMART status
- current_health = 'healthy'
- smart_status = 'UNKNOWN'
- pending_sectors = 0
- reallocated_sectors = 0
-
- try:
- dev_path = f'/dev/{name}'
- smart_result = subprocess.run(
- ['smartctl', '-H', '-A', dev_path],
- capture_output=True, text=True, timeout=5
- )
-
- output = smart_result.stdout
-
- # Check SMART overall status
- if 'PASSED' in output:
- smart_status = 'PASSED'
- elif 'FAILED' in output:
- smart_status = 'FAILED'
- current_health = 'critical'
-
- # Parse SMART attributes for pending/reallocated sectors
- for attr_line in output.split('\n'):
- if 'Current_Pending_Sector' in attr_line or 'Pending_Sector' in attr_line:
- parts_attr = attr_line.split()
- if parts_attr:
- try:
- pending_sectors = int(parts_attr[-1])
- except ValueError:
- pass
- elif 'Reallocated_Sector' in attr_line:
- parts_attr = attr_line.split()
- if parts_attr:
- try:
- reallocated_sectors = int(parts_attr[-1])
- except ValueError:
- pass
-
- # Determine current health based on sectors
- if current_health != 'critical':
- if pending_sectors > 10 or reallocated_sectors > 10:
- current_health = 'critical'
- elif pending_sectors > 0 or reallocated_sectors > 0:
- current_health = 'warning'
-
- except Exception:
- pass
-
- # Build health reason
- health_reason = ''
- if pending_sectors > 0:
- health_reason = f'{pending_sectors} pending sector(s)'
- if reallocated_sectors > 0:
- if health_reason:
- health_reason += f', {reallocated_sectors} reallocated'
- else:
- health_reason = f'{reallocated_sectors} reallocated sector(s)'
- if smart_status == 'FAILED':
- health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
-
- # Get persistent worst_health from database
- worst_info = health_persistence.get_disk_worst_health(name, serial)
- worst_health = worst_info.get('worst_health', 'healthy') if worst_info else 'healthy'
- worst_health_date = worst_info.get('worst_health_date') if worst_info else None
- worst_health_reason = worst_info.get('worst_health_reason', '') if worst_info else ''
- admin_cleared = worst_info.get('admin_cleared', False) if worst_info else False
-
- # Update worst_health if current is worse
- if current_health != 'healthy':
- updated = health_persistence.update_disk_worst_health(
- name, serial, current_health, health_reason
- )
- if updated:
- worst_health = current_health
- worst_health_reason = health_reason
-
- # Record as disk observation (for both internal and USB disks)
- # This ensures SMART issues are tracked in observations
- try:
- obs_type = 'smart_error'
- if pending_sectors and pending_sectors > 0:
- obs_type = 'pending_sectors'
- elif reallocated_sectors and reallocated_sectors > 0:
- obs_type = 'reallocated_sectors'
- elif smart_status == 'FAILED':
- obs_type = 'smart_failed'
-
- obs_sig = f'smart_{name}_{obs_type}_{pending_sectors}_{reallocated_sectors}'
- health_persistence.record_disk_observation(
- device_name=name,
- serial=serial,
- error_type=obs_type,
- error_signature=obs_sig,
- raw_message=f'/dev/{name}: {health_reason}',
- severity=current_health,
- )
-
- # Send smart_warning notification if this is a NEW issue
- # (only when updated=True means this is first time seeing this state)
- if updated:
- try:
- from notification_manager import notification_manager
- notification_manager.send_notification(
- event_type='smart_warning',
- data={
- 'device': f'/dev/{name}',
- 'reason': health_reason,
- 'serial': serial or 'Unknown',
- 'model': model or 'Unknown',
- 'pending_sectors': pending_sectors,
- 'reallocated_sectors': reallocated_sectors,
- 'smart_status': smart_status,
- 'hostname': self._hostname,
- }
- )
- except Exception:
- pass
- except Exception:
- pass
-
- # Final health is the worse of current and persistent
- severity_order = {'healthy': 0, 'warning': 1, 'critical': 2}
- if severity_order.get(worst_health, 0) > severity_order.get(current_health, 0):
- final_health = worst_health
- final_reason = worst_health_reason
- else:
- final_health = current_health
- final_reason = health_reason
-
- # Get active observations count
- obs = health_persistence.get_disk_observations(device_name=name, serial=serial)
- active_observations = len(obs) if obs else 0
-
- # Register disk in persistence (for tracking)
- try:
- health_persistence.register_disk(name, serial, model)
- except Exception:
- pass
-
- disks.append({
- 'device': f'/dev/{name}',
- 'name': name,
- 'serial': serial or '',
- 'model': model or 'Unknown',
- 'size': size,
- 'transport': transport,
- 'is_usb': is_usb,
- 'is_nvme': is_nvme,
- 'smart_status': smart_status,
- 'current_health': current_health,
- 'current_health_reason': health_reason,
- 'worst_health': worst_health,
- 'worst_health_date': worst_health_date,
- 'worst_health_reason': worst_health_reason,
- 'final_health': final_health,
- 'final_reason': final_reason,
- 'pending_sectors': pending_sectors,
- 'reallocated_sectors': reallocated_sectors,
- 'active_observations': active_observations,
- 'admin_cleared': admin_cleared,
- })
-
- except Exception as e:
- print(f"[HealthMonitor] Error getting physical disks list: {e}")
-
- return disks
# This function is no longer used in get_detailed_status, but kept for reference if needed.
# The new _check_proxmox_storage function handles this logic better.
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 04baf545..54cdbae2 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -164,14 +164,25 @@ class HealthPersistence:
removed INTEGER DEFAULT 0,
worst_health TEXT DEFAULT 'healthy',
worst_health_date TEXT,
- worst_health_reason TEXT,
- admin_cleared INTEGER DEFAULT 0,
- admin_cleared_date TEXT,
- admin_cleared_note TEXT,
+ admin_cleared TEXT,
UNIQUE(device_name, serial)
)
''')
+ # Migration: add worst_health columns if they don't exist (for existing DBs)
+ try:
+ cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT "healthy"')
+ except Exception:
+ pass
+ try:
+ cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT')
+ except Exception:
+ pass
+ try:
+ cursor.execute('ALTER TABLE disk_registry ADD COLUMN admin_cleared TEXT')
+ except Exception:
+ pass
+
# Observation log: deduplicated error events per disk
cursor.execute('''
CREATE TABLE IF NOT EXISTS disk_observations (
@@ -195,17 +206,6 @@ class HealthPersistence:
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
- # Migration: add worst_health columns to disk_registry if not present
- cursor.execute("PRAGMA table_info(disk_registry)")
- disk_columns = [col[1] for col in cursor.fetchall()]
- if 'worst_health' not in disk_columns:
- cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT 'healthy'")
- cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT")
- cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_reason TEXT")
- cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared INTEGER DEFAULT 0")
- cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_date TEXT")
- cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_note TEXT")
-
conn.commit()
conn.close()
@@ -1231,26 +1231,11 @@ class HealthPersistence:
# a different device_name (e.g. 'ata8' instead of 'sdh'),
# update that entry's device_name so observations carry over.
if serial:
- # Try exact match first
cursor.execute('''
SELECT id, device_name FROM disk_registry
WHERE serial = ? AND serial != '' AND device_name != ?
''', (serial, device_name))
old_rows = cursor.fetchall()
-
- # If no exact match, try normalized match (for USB disks with special chars)
- if not old_rows:
- normalized = self._normalize_serial(serial)
- if normalized and normalized != serial:
- cursor.execute(
- 'SELECT id, device_name, serial FROM disk_registry '
- 'WHERE serial != "" AND device_name != ?', (device_name,))
- for row in cursor.fetchall():
- db_normalized = self._normalize_serial(row[2])
- if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized:
- old_rows.append((row[0], row[1]))
- break
-
for old_id, old_dev in old_rows:
# Only consolidate ATA names -> block device names
if old_dev.startswith('ata') and not device_name.startswith('ata'):
@@ -1288,23 +1273,6 @@ class HealthPersistence:
except Exception as e:
print(f"[HealthPersistence] Error registering disk {device_name}: {e}")
- def _normalize_serial(self, serial: str) -> str:
- """Normalize serial number for comparison.
-
- USB disks can have serials with escape sequences like \\x06\\x18
- or non-printable characters. This normalizes them for matching.
- """
- if not serial:
- return ''
- import re
- # Remove escape sequences like \x06, \x18
- normalized = re.sub(r'\\x[0-9a-fA-F]{2}', '', serial)
- # Remove non-printable characters
- normalized = ''.join(c for c in normalized if c.isprintable())
- # Remove common prefixes that vary
- normalized = normalized.strip()
- return normalized
-
def _get_disk_registry_id(self, cursor, device_name: str,
serial: Optional[str] = None) -> Optional[int]:
"""Find disk_registry.id, matching by serial first, then device_name.
@@ -1313,25 +1281,12 @@ class HealthPersistence:
checks entries with ATA names that share the same serial.
"""
if serial:
- # Try exact match first
cursor.execute(
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
(serial,))
row = cursor.fetchone()
if row:
return row[0]
-
- # Try normalized serial match (for USB disks with special chars)
- normalized = self._normalize_serial(serial)
- if normalized and normalized != serial:
- # Search for serials that start with or contain the normalized version
- cursor.execute(
- 'SELECT id, serial FROM disk_registry WHERE serial != "" ORDER BY last_seen DESC')
- for row in cursor.fetchall():
- db_normalized = self._normalize_serial(row[1])
- if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized:
- return row[0]
-
# Fallback: match by device_name (strip /dev/ prefix)
clean_dev = device_name.replace('/dev/', '')
cursor.execute(
@@ -1340,7 +1295,6 @@ class HealthPersistence:
row = cursor.fetchone()
if row:
return row[0]
-
# Last resort: search for ATA-named entries that might refer to this device
# This handles cases where observations were recorded under 'ata8'
# but we're querying for 'sdh'
@@ -1353,6 +1307,131 @@ class HealthPersistence:
pass
return None
+ def update_disk_worst_health(self, device_name: str, serial: Optional[str],
+ new_health: str) -> bool:
+ """Update worst_health if new_health is worse than current.
+
+ Health hierarchy: healthy < warning < critical
+ Only escalates, never downgrades automatically.
+
+ Returns True if worst_health was updated.
+ """
+ health_order = {'healthy': 0, 'warning': 1, 'critical': 2}
+ new_level = health_order.get(new_health.lower(), 0)
+
+ if new_level == 0: # healthy never updates worst_health
+ return False
+
+ now = datetime.now().isoformat()
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+ if not disk_id:
+ # Register disk first
+ self.register_disk(device_name.replace('/dev/', ''), serial)
+ disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+
+ if not disk_id:
+ conn.close()
+ return False
+
+ # Get current worst_health
+ cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,))
+ row = cursor.fetchone()
+ current_worst = row[0] if row and row[0] else 'healthy'
+ current_level = health_order.get(current_worst.lower(), 0)
+
+ # Only update if new health is worse
+ if new_level > current_level:
+ cursor.execute('''
+ UPDATE disk_registry
+ SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL
+ WHERE id = ?
+ ''', (new_health.lower(), now, disk_id))
+ conn.commit()
+ conn.close()
+ return True
+
+ conn.close()
+ return False
+ except Exception as e:
+ print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}")
+ return False
+
+ def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]:
+ """Get the health status of a disk including worst_health.
+
+ Returns dict with:
+ - worst_health: 'healthy', 'warning', or 'critical'
+ - worst_health_date: ISO timestamp when worst_health was set
+ - admin_cleared: ISO timestamp if admin manually cleared the health
+ - observations_count: Number of recorded observations
+ """
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+ if not disk_id:
+ conn.close()
+ return {'worst_health': 'healthy', 'observations_count': 0}
+
+ cursor.execute('''
+ SELECT worst_health, worst_health_date, admin_cleared
+ FROM disk_registry WHERE id = ?
+ ''', (disk_id,))
+ row = cursor.fetchone()
+
+ # Count observations
+ cursor.execute(
+ 'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0',
+ (disk_id,))
+ obs_count = cursor.fetchone()[0]
+
+ conn.close()
+
+ if row:
+ return {
+ 'worst_health': row[0] or 'healthy',
+ 'worst_health_date': row[1],
+ 'admin_cleared': row[2],
+ 'observations_count': obs_count
+ }
+ return {'worst_health': 'healthy', 'observations_count': obs_count}
+ except Exception as e:
+ print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}")
+ return {'worst_health': 'healthy', 'observations_count': 0}
+
+ def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool:
+ """Admin action: clear worst_health back to healthy.
+
+ This resets the health status but keeps all observations for audit.
+ Records when the admin cleared it for accountability.
+ """
+ now = datetime.now().isoformat()
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+ if not disk_id:
+ conn.close()
+ return False
+
+ cursor.execute('''
+ UPDATE disk_registry
+ SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ?
+ WHERE id = ?
+ ''', (now, disk_id))
+ conn.commit()
+ conn.close()
+ return True
+ except Exception as e:
+ print(f"[HealthPersistence] Error clearing health for {device_name}: {e}")
+ return False
+
def record_disk_observation(self, device_name: str, serial: Optional[str],
error_type: str, error_signature: str,
raw_message: str = '',
@@ -1391,6 +1470,10 @@ class HealthPersistence:
conn.commit()
conn.close()
+
+ # Update worst_health based on observation severity
+ self.update_disk_worst_health(clean_dev, serial, severity)
+
except Exception as e:
print(f"[HealthPersistence] Error recording disk observation: {e}")
@@ -1539,186 +1622,6 @@ class HealthPersistence:
except Exception as e:
print(f"[HealthPersistence] Error marking removed disks: {e}")
- # ────────────────────────────────────────────────────────────────
- # Disk Worst Health State Tracking
- # ────────────────────────────────────────────────────────────────
-
- HEALTH_SEVERITY_ORDER = {'healthy': 0, 'warning': 1, 'critical': 2}
-
- def update_disk_worst_health(self, device_name: str, serial: Optional[str],
- health: str, reason: str = '') -> bool:
- """Update worst_health if the new health is worse than current.
-
- Health progression is one-way: healthy -> warning -> critical
- Only admin_clear_disk_health() can reset to healthy.
-
- Returns True if worst_health was updated.
- """
- health_lower = health.lower()
- if health_lower not in self.HEALTH_SEVERITY_ORDER:
- return False
-
- try:
- conn = self._get_conn()
- cursor = conn.cursor()
-
- disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
- if not disk_id:
- # Auto-register disk if not present
- self.register_disk(device_name.replace('/dev/', ''), serial)
- disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
-
- if not disk_id:
- conn.close()
- return False
-
- # Get current worst_health
- cursor.execute('SELECT worst_health, admin_cleared FROM disk_registry WHERE id = ?', (disk_id,))
- row = cursor.fetchone()
- if not row:
- conn.close()
- return False
-
- current_worst = row[0] or 'healthy'
- admin_cleared = row[1] or 0
-
- # If admin cleared and new issue is the same or less severe, don't update
- # But if admin cleared and issue escalates, update anyway
- current_severity = self.HEALTH_SEVERITY_ORDER.get(current_worst, 0)
- new_severity = self.HEALTH_SEVERITY_ORDER.get(health_lower, 0)
-
- # Only update if new health is worse
- if new_severity > current_severity:
- now = datetime.now().isoformat()
- cursor.execute('''
- UPDATE disk_registry
- SET worst_health = ?, worst_health_date = ?, worst_health_reason = ?,
- admin_cleared = 0
- WHERE id = ?
- ''', (health_lower, now, reason, disk_id))
- conn.commit()
- conn.close()
- return True
-
- conn.close()
- return False
- except Exception as e:
- print(f"[HealthPersistence] Error updating disk worst_health: {e}")
- return False
-
- def get_disk_worst_health(self, device_name: str, serial: Optional[str] = None) -> Optional[Dict[str, Any]]:
- """Get the worst health state for a specific disk."""
- try:
- conn = self._get_conn()
- cursor = conn.cursor()
-
- disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
- if not disk_id:
- conn.close()
- return None
-
- cursor.execute('''
- SELECT worst_health, worst_health_date, worst_health_reason,
- admin_cleared, admin_cleared_date, admin_cleared_note
- FROM disk_registry WHERE id = ?
- ''', (disk_id,))
- row = cursor.fetchone()
- conn.close()
-
- if row:
- return {
- 'worst_health': row[0] or 'healthy',
- 'worst_health_date': row[1],
- 'worst_health_reason': row[2],
- 'admin_cleared': bool(row[3]),
- 'admin_cleared_date': row[4],
- 'admin_cleared_note': row[5],
- }
- return None
- except Exception as e:
- print(f"[HealthPersistence] Error getting disk worst_health: {e}")
- return None
-
- def admin_clear_disk_health(self, device_name: str, serial: Optional[str], note: str) -> bool:
- """Admin manually clears disk health history (e.g., after disk replacement).
-
- Requires a note explaining why (for audit trail).
- """
- if not note or len(note.strip()) < 5:
- return False # Require meaningful note
-
- try:
- conn = self._get_conn()
- cursor = conn.cursor()
-
- disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
- if not disk_id:
- conn.close()
- return False
-
- now = datetime.now().isoformat()
- cursor.execute('''
- UPDATE disk_registry
- SET worst_health = 'healthy', admin_cleared = 1,
- admin_cleared_date = ?, admin_cleared_note = ?
- WHERE id = ?
- ''', (now, note.strip(), disk_id))
-
- # Also dismiss all active observations for this disk
- cursor.execute('''
- UPDATE disk_observations SET dismissed = 1 WHERE disk_registry_id = ?
- ''', (disk_id,))
-
- conn.commit()
- conn.close()
- return True
- except Exception as e:
- print(f"[HealthPersistence] Error clearing disk health: {e}")
- return False
-
- def get_all_disks_health_summary(self) -> List[Dict[str, Any]]:
- """Get health summary for all registered disks (for Health Monitor listing).
-
- Returns list of disks with their current and worst health states.
- """
- try:
- conn = self._get_conn()
- cursor = conn.cursor()
-
- cursor.execute('''
- SELECT d.id, d.device_name, d.serial, d.model, d.size_bytes,
- d.first_seen, d.last_seen, d.removed,
- d.worst_health, d.worst_health_date, d.worst_health_reason,
- d.admin_cleared, d.admin_cleared_date,
- (SELECT COUNT(*) FROM disk_observations o
- WHERE o.disk_registry_id = d.id AND o.dismissed = 0) as active_observations
- FROM disk_registry d
- WHERE d.removed = 0
- ORDER BY d.device_name
- ''')
- rows = cursor.fetchall()
- conn.close()
-
- return [{
- 'id': r[0],
- 'device_name': r[1],
- 'serial': r[2] or '',
- 'model': r[3] or 'Unknown',
- 'size_bytes': r[4],
- 'first_seen': r[5],
- 'last_seen': r[6],
- 'removed': bool(r[7]),
- 'worst_health': r[8] or 'healthy',
- 'worst_health_date': r[9],
- 'worst_health_reason': r[10] or '',
- 'admin_cleared': bool(r[11]),
- 'admin_cleared_date': r[12],
- 'active_observations': r[13],
- } for r in rows]
- except Exception as e:
- print(f"[HealthPersistence] Error getting disks health summary: {e}")
- return []
-
# Global instance
health_persistence = HealthPersistence()
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py
index c3b74939..2caf0257 100644
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -402,16 +402,47 @@ class JournalWatcher:
entity = 'disk'
entity_id = f'fs_{device}'
- # ── 24h dedup for filesystem errors per device ──
+ # ── Get disk serial for USB-aware cooldown ──
+ # USB disks can change device names (sda->sdb) on reconnect.
+ # Using the serial as the cooldown key ensures the same physical
+ # disk shares one 24h cooldown regardless of its device letter.
+ import os as _os
+ base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
+ disk_serial = ''
+ is_usb_disk = False
+ if base_dev:
+ try:
+ # Check if USB via sysfs
+ sysfs_link = subprocess.run(
+ ['readlink', '-f', f'/sys/block/{base_dev}'],
+ capture_output=True, text=True, timeout=2
+ )
+ is_usb_disk = 'usb' in sysfs_link.stdout.lower()
+
+ # Get serial from smartctl
+ smart_result = subprocess.run(
+ ['smartctl', '-i', '-j', f'/dev/{base_dev}'],
+ capture_output=True, text=True, timeout=5
+ )
+ if smart_result.returncode in (0, 4):
+ import json
+ smart_data = json.loads(smart_result.stdout)
+ disk_serial = smart_data.get('serial_number', '')
+ except Exception:
+ pass
+
+ # ── 24h dedup for filesystem errors ──
+ # Use serial for USB disks, device name for others
now_fs = time.time()
- fs_dedup_key = f'fs_{device}'
+ if is_usb_disk and disk_serial:
+ fs_dedup_key = f'fs_serial_{disk_serial}'
+ else:
+ fs_dedup_key = f'fs_{device}'
last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
return # Already notified for this device recently
- # ── SMART + device existence gating ──
- import os as _os
- base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
+ # ── Device existence gating ──
device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
if not device_exists and device != 'unknown':
@@ -749,7 +780,6 @@ class JournalWatcher:
"""Extract device info from a smartd system-mail and record as disk observation."""
try:
import re as _re
- import subprocess
from health_persistence import health_persistence
# Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
@@ -770,21 +800,6 @@ class JournalWatcher:
if model_match:
model = model_match.group(1).strip()
- # If no serial from message, try to get it from smartctl (important for USB disks)
- if not serial or len(serial) < 3:
- try:
- result = subprocess.run(
- ['smartctl', '-i', '-j', f'/dev/{base_dev}'],
- capture_output=True, text=True, timeout=5
- )
- import json as _json
- data = _json.loads(result.stdout)
- serial = data.get('serial_number', '') or serial
- if not model:
- model = data.get('model_name', '') or data.get('model_family', '')
- except Exception:
- pass
-
# Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
if sig_match:
@@ -821,12 +836,10 @@ class JournalWatcher:
severity='warning',
)
- # Also update worst_health so the disk stays marked as warning
- # even if current SMART readings show 0 pending sectors
- warn_line_text = warn_line_m.group(1).strip() if warn_line_m else error_signature
- health_persistence.update_disk_worst_health(
- base_dev, serial, 'warning', warn_line_text
- )
+ # Escalate worst_health for permanent tracking (record_disk_observation
+ # already does this; this explicit call is a defensive redundancy)
+ health_persistence.update_disk_worst_health(base_dev, serial, 'warning')
+
except Exception as e:
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
@@ -1751,8 +1764,26 @@ class PollingCollector:
if isinstance(details_raw, dict):
# Extract device name for a stable entity_id (24h cooldown key)
dev = details_raw.get('device', details_raw.get('disk', ''))
- if dev:
- eid = f'disk_{dev}' # Stable per-device fingerprint
+ serial = details_raw.get('serial', '')
+
+ # For USB disks, use the serial as entity_id for a stable cooldown:
+ # USB disks can change device names (sda->sdb) on reconnect, and
+ # keying by serial keeps one cooldown per physical disk.
+ if serial and dev:
+ # Check if this is a USB disk
+ try:
+ sysfs_result = subprocess.run(
+ ['readlink', '-f', f'/sys/block/{dev.replace("/dev/", "")}'],
+ capture_output=True, text=True, timeout=2
+ )
+ if 'usb' in sysfs_result.stdout.lower():
+ eid = f'disk_serial_{serial}' # USB: use serial
+ else:
+ eid = f'disk_{dev}' # Non-USB: use device name
+ except Exception:
+ eid = f'disk_{dev}' # Fallback to device name
+ elif dev:
+ eid = f'disk_{dev}' # No serial: use device name
# Updates are always informational notifications except
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
@@ -1818,15 +1849,26 @@ class PollingCollector:
except Exception:
pass
- # Skip recovery notifications for SMART disk errors (pending/reallocated sectors).
- # These indicate physical disk degradation that doesn't truly "recover" --
- # the disk may show 0 pending sectors later but the damage history persists.
- # The worst_health in disk_registry tracks this, so we don't send false "resolved".
+ # Skip recovery notifications for PERMANENT disk events.
+ # These indicate physical disk degradation that doesn't truly "recover":
+ # - SMART pending/reallocated sectors indicate physical damage
+ # - Disk may show 0 pending sectors later but damage history persists
+ # - Sending "Resolved" gives false sense of security
+ # The worst_health in disk_registry tracks this permanently.
if category == 'disks':
- reason_lower = reason.lower() if reason else ''
- if any(indicator in reason_lower for indicator in [
- 'pending', 'reallocated', 'sector', 'smart', 'unreadable'
- ]):
+ reason_lower = (reason or '').lower()
+ permanent_indicators = [
+ 'pending', # pending sectors
+ 'reallocated', # reallocated sectors
+ 'unreadable', # unreadable sectors
+ 'smart', # SMART errors
+ 'surface error', # disk surface errors
+ 'bad sector', # bad sectors
+ 'i/o error', # I/O errors (repeated)
+ 'medium error', # SCSI medium errors
+ ]
+ if any(indicator in reason_lower for indicator in permanent_indicators):
+ # Don't send recovery - just clean up tracking
self._last_notified.pop(key, None)
continue
diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py
index ef8306b3..a67de4da 100644
--- a/AppImage/scripts/notification_templates.py
+++ b/AppImage/scripts/notification_templates.py
@@ -559,13 +559,6 @@ TEMPLATES = {
'group': 'storage',
'default_enabled': True,
},
- 'smart_warning': {
- 'title': '{hostname}: SMART warning on {device}',
- 'body': '{device}: {reason}',
- 'label': 'SMART warning (sectors)',
- 'group': 'storage',
- 'default_enabled': True,
- },
'storage_unavailable': {
'title': '{hostname}: Storage unavailable - {storage_name}',
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',