From 898392725a25ebfc69f95c88b9b63d2d5efb13ba Mon Sep 17 00:00:00 2001 From: MacRimi Date: Thu, 5 Mar 2026 17:29:07 +0100 Subject: [PATCH] Update notification service --- AppImage/components/storage-overview.tsx | 138 ++++++++++++++++++++++- AppImage/scripts/flask_server.py | 77 +++++++++++-- AppImage/scripts/health_monitor.py | 98 ++++++++++++++-- AppImage/scripts/health_persistence.py | 55 ++++++++- 4 files changed, 344 insertions(+), 24 deletions(-) diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx index 42fb209a..4253ff21 100644 --- a/AppImage/components/storage-overview.tsx +++ b/AppImage/components/storage-overview.tsx @@ -2,7 +2,7 @@ import { useEffect, useState } from "react" import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card" -import { HardDrive, Database, AlertTriangle, CheckCircle2, XCircle, Square, Thermometer, Archive } from "lucide-react" +import { HardDrive, Database, AlertTriangle, CheckCircle2, XCircle, Square, Thermometer, Archive, Info, Clock } from "lucide-react" import { Badge } from "@/components/ui/badge" import { Progress } from "@/components/ui/progress" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" @@ -41,6 +41,22 @@ interface DiskInfo { reason: string error_type?: string // 'io' | 'filesystem' } + observations_count?: number +} + +interface DiskObservation { + id: number + error_type: string + error_signature: string + first_occurrence: string + last_occurrence: string + occurrence_count: number + raw_message: string + severity: string + dismissed: boolean + device_name: string + serial: string + model: string } interface ZFSPool { @@ -98,6 +114,8 @@ export function StorageOverview() { const [loading, setLoading] = useState(true) const [selectedDisk, setSelectedDisk] = useState(null) const [detailsOpen, setDetailsOpen] = useState(false) + const [diskObservations, setDiskObservations] = useState([]) + const [loadingObservations, setLoadingObservations] = useState(false) const fetchStorageData = async () => { try { @@ -241,11 +259,39 @@ export function StorageOverview() { return badgeStyles[diskType] } - const handleDiskClick = (disk: DiskInfo) => { + const handleDiskClick = async (disk: DiskInfo) => { setSelectedDisk(disk) setDetailsOpen(true) + setDiskObservations([]) + + if (disk.observations_count && disk.observations_count > 0) { + setLoadingObservations(true) + try { + const params = new URLSearchParams() + if (disk.name) params.set('device', disk.name) + if (disk.serial && disk.serial !== 'Unknown') params.set('serial', disk.serial) + const data = await fetchApi<{ observations: DiskObservation[] }>(`/api/storage/observations?${params.toString()}`) + setDiskObservations(data.observations || []) + } catch { + setDiskObservations([]) + } finally { + setLoadingObservations(false) + } + } } + const formatObsDate = (iso: string) => { + if (!iso) return 'N/A' + try { + const d = new Date(iso) + return d.toLocaleDateString(undefined, { month: 'short', day: 'numeric', year: 'numeric' }) + + ' ' + d.toLocaleTimeString(undefined, { hour: '2-digit', minute: '2-digit' }) + } catch { return iso } + } + + const obsTypeLabel = (t: string) => + ({ smart_error: 'SMART Error', io_error: 'I/O Error', connection_error: 'Connection Error' }[t] || t) + const getStorageTypeBadge = (type: string) => { const typeColors: Record = { pbs: "bg-purple-500/10 text-purple-500 border-purple-500/20", @@ -778,6 +824,12 @@ export function StorageOverview() { )} + {disk.observations_count && disk.observations_count > 0 && ( + + + {disk.observations_count} + + )} {getHealthBadge(disk.health)} @@ -858,6 +910,12 @@ export function StorageOverview() { )} + {disk.observations_count && disk.observations_count > 0 && ( + + + {disk.observations_count} + + )} {getHealthBadge(disk.health)} @@ -925,7 +983,7 @@ export function StorageOverview() { {/* Disk Details Dialog */} - + @@ -950,7 +1008,15 @@ export function StorageOverview() {

Health Status

-
{getHealthBadge(selectedDisk.health)}
+
+ {getHealthBadge(selectedDisk.health)} + {selectedDisk.observations_count && selectedDisk.observations_count > 0 && ( + + + {selectedDisk.observations_count} obs. + + )} +
@@ -1054,6 +1120,70 @@ export function StorageOverview() { + + {/* Observations Section */} + {(diskObservations.length > 0 || loadingObservations) && ( +
+

+ + Observations + + {diskObservations.length} + +

+ {loadingObservations ? ( +
+
+ Loading observations... +
+ ) : ( +
+ {diskObservations.map((obs) => ( +
+
+
+ + {obsTypeLabel(obs.error_type)} + + {obs.occurrence_count > 1 && ( + + {'Occurred ' + obs.occurrence_count + 'x'} + + )} +
+
+

+ {obs.raw_message} +

+
+ + + {'First: ' + formatObsDate(obs.first_occurrence)} + + {obs.occurrence_count > 1 && ( + + + {'Last: ' + formatObsDate(obs.last_occurrence)} + + )} +
+
+ ))} +
+ )} +
+ )}
)}
diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index f7114079..2c6f8ea1 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -1253,22 +1253,80 @@ def get_storage_info(): details = {} err_device = details.get('disk', '') + # Prefer the pre-resolved block device name (e.g. 'sdh' instead of 'ata8') + block_device = details.get('block_device', '') + err_serial = details.get('serial', '') error_count = details.get('error_count', 0) sample = details.get('sample', '') severity = err.get('severity', 'WARNING') # Match error to physical disk. - # err_device can be 'sda', 'nvme0n1', or 'ata8' (if resolution failed) + # Priority: block_device > serial > err_device > ATA resolution matched_disk = None - if err_device in physical_disks: + + # 1. Direct match via pre-resolved block_device + if block_device and block_device in physical_disks: + matched_disk = block_device + + # 2. Match by serial (most reliable across reboots/device renaming) + if not matched_disk and err_serial: + for dk, dinfo in physical_disks.items(): + if dinfo.get('serial', '').lower() == err_serial.lower(): + matched_disk = dk + break + + # 3. Direct match via err_device + if not matched_disk and err_device in physical_disks: matched_disk = err_device - else: - # Try partial match: 'sda' matches disk 'sda' + + # 4. Partial match + if not matched_disk: for dk in physical_disks: if dk == err_device or err_device.startswith(dk): matched_disk = dk break + # 5. ATA name resolution as last resort: 'ata8' -> 'sdh' via /sys + if not matched_disk and err_device.startswith('ata'): + # Method A: Use /sys/class/ata_port to find the block device + try: + ata_path = f'/sys/class/ata_port/{err_device}' + if os.path.exists(ata_path): + device_path = os.path.realpath(ata_path) + for root, dirs, files in os.walk(os.path.dirname(device_path)): + if 'block' in dirs: + devs = os.listdir(os.path.join(root, 'block')) + for bd in devs: + if bd in physical_disks: + matched_disk = bd + break + if matched_disk: + break + except (OSError, IOError): + pass + # Method B: Walk /sys/block/sd* and check if ataX in device path + if not matched_disk: + try: + for sd in os.listdir('/sys/block'): + if not sd.startswith('sd'): + continue + dev_link = f'/sys/block/{sd}/device' + if os.path.islink(dev_link): + real_p = os.path.realpath(dev_link) + if f'/{err_device}/' in real_p: + if sd in physical_disks: + matched_disk = sd + break + except (OSError, IOError): + pass + # Method C: Check error details for display name hint + if not matched_disk: + display = details.get('display', '') + if display.startswith('/dev/'): + dev_hint = display.replace('/dev/', '') + if dev_hint in physical_disks: + matched_disk = dev_hint + if matched_disk: physical_disks[matched_disk]['io_errors'] = { 'count': error_count, @@ -1421,17 +1479,22 @@ def get_storage_info(): # ── Register disks in observation system + enrich with observation counts ── try: active_dev_names = list(physical_disks.keys()) - obs_counts = health_persistence.get_disks_observation_counts() + # Register disks FIRST so that old ATA-named entries get + # consolidated into block device names via serial matching. for disk_name, disk_info in physical_disks.items(): - # Register each disk we see health_persistence.register_disk( device_name=disk_name, serial=disk_info.get('serial', ''), model=disk_info.get('model', ''), size_bytes=disk_info.get('size_bytes'), ) - + + # Fetch observation counts AFTER registration so consolidated + # entries are already merged (ata8 -> sdh). + obs_counts = health_persistence.get_disks_observation_counts() + + for disk_name, disk_info in physical_disks.items(): # Attach observation count: try serial match first, then device name serial = disk_info.get('serial', '') count = obs_counts.get(f'serial:{serial}', 0) if serial else 0 diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index bd4eded6..2d1a4c7f 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -1002,16 +1002,25 @@ class HealthMonitor: **{k: v for k, v in val.items() if k not in ('status', 'reason')} } + # ALWAYS add descriptive entries for capabilities this server has. + # When everything is OK, they show as OK. When there are issues, + # they still appear so the user can see the full picture (e.g. + # LVM is OK even though I/O errors exist on a disk). + if 'root_filesystem' not in checks: + checks['root_filesystem'] = checks.pop('/', None) or {'status': 'OK', 'detail': 'Mounted read-write, space OK'} + if 'io_errors' not in checks: + # Only add OK if no disk I/O errors are present in checks + has_io = any(v.get('error_count') or 'I/O' in str(v.get('detail', '')) for v in checks.values()) + if not has_io: + checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'} + if self.capabilities.get('has_smart') and 'smart_health' not in checks: + checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'} + if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks: + checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'} + if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks: + checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'} + if not issues: - # Add descriptive OK entries only for capabilities this server actually has - checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'}) - checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'} - if self.capabilities.get('has_smart'): - checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'} - if self.capabilities.get('has_zfs'): - checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'} - if self.capabilities.get('has_lvm'): - checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'} return {'status': 'OK', 'checks': checks} # Determine overall status @@ -1235,6 +1244,36 @@ class HealthMonitor: except (OSError, subprocess.TimeoutExpired): pass + # Method 3: Use /sys/block/sd* and trace back to ATA host number + # ata8 => host7 (N-1) or host8 depending on controller numbering + try: + for sd in sorted(os.listdir('/sys/block')): + if not sd.startswith('sd'): + continue + # /sys/block/sdX/device -> ../../hostN/targetN:0:0/N:0:0:0 + dev_link = f'/sys/block/{sd}/device' + if os.path.islink(dev_link): + real_path = os.path.realpath(dev_link) + # Check if 'ataX' appears in the device path + if f'/{ata_port}/' in real_path or f'/ata{port_num}/' in real_path: + return sd + # Also check host number mapping: ata8 -> host7 (N-1 convention) + for offset in (0, -1): + host_n = int(port_num) + offset + if host_n >= 0 and f'/host{host_n}/' in real_path: + # Verify: check if ataX appears in the chain + parent = real_path + while parent and parent != '/': + parent = os.path.dirname(parent) + if os.path.basename(parent) == ata_port: + return sd + # Check 1 level: /sys/devices/.../ataX/hostY/... + ata_check = os.path.join(os.path.dirname(parent), ata_port) + if os.path.exists(ata_check): + return sd + except (OSError, IOError, ValueError): + pass + return ata_port # Return original if resolution fails def _identify_block_device(self, device: str) -> str: @@ -1483,6 +1522,39 @@ class HealthMonitor: smart_ok = smart_health == 'PASSED' + # Resolve ATA name to block device early so we can use it + # in both record_error details AND record_disk_observation. + resolved_block = disk + resolved_serial = None + if disk.startswith('ata'): + resolved_block = self._resolve_ata_to_disk(disk) + # Get serial from the resolved device + try: + dev_path = f'/dev/{resolved_block}' if resolved_block != disk else None + if dev_path: + sm = subprocess.run( + ['smartctl', '-i', dev_path], + capture_output=True, text=True, timeout=3) + if sm.returncode in (0, 4): + for sline in sm.stdout.split('\n'): + if 'Serial Number' in sline or 'Serial number' in sline: + resolved_serial = sline.split(':')[-1].strip() + break + except Exception: + pass + else: + try: + sm = subprocess.run( + ['smartctl', '-i', f'/dev/{disk}'], + capture_output=True, text=True, timeout=3) + if sm.returncode in (0, 4): + for sline in sm.stdout.split('\n'): + if 'Serial Number' in sline or 'Serial number' in sline: + resolved_serial = sline.split(':')[-1].strip() + break + except Exception: + pass + # ── Record disk observation (always, even if transient) ── # Signature must be stable across cycles: strip volatile # data (hex values, counts, timestamps) to dedup properly. @@ -1493,8 +1565,8 @@ class HealthMonitor: obs_sig = self._make_io_obs_signature(disk, sample) obs_severity = 'critical' if smart_health == 'FAILED' else 'warning' health_persistence.record_disk_observation( - device_name=disk, - serial=None, + device_name=resolved_block, + serial=resolved_serial, error_type='io_error', error_signature=obs_sig, raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}', @@ -1551,6 +1623,8 @@ class HealthMonitor: severity=severity, reason=reason, details={'disk': disk, 'device': display, + 'block_device': resolved_block, + 'serial': resolved_serial or '', 'error_count': error_count, 'smart_status': smart_health, 'sample': sample, 'dismissable': False} @@ -1584,6 +1658,8 @@ class HealthMonitor: severity=severity, reason=reason, details={'disk': disk, 'device': display, + 'block_device': resolved_block, + 'serial': resolved_serial or '', 'error_count': error_count, 'smart_status': smart_health, 'sample': sample, 'dismissable': True} diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 374e9687..ec2adfde 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -1170,12 +1170,47 @@ class HealthPersistence: Uses (device_name, serial) as unique key. If the disk was previously marked removed, it's re-activated. + + Also consolidates old ATA-named entries: if an observation was recorded + under 'ata8' and we now know the real block device is 'sdh' with + serial 'WX72...', update the old entry so observations are linked. """ now = datetime.now().isoformat() try: conn = self._get_conn() cursor = conn.cursor() + # Consolidate: if serial is known and an old entry exists with + # a different device_name (e.g. 'ata8' instead of 'sdh'), + # update that entry's device_name so observations carry over. + if serial: + cursor.execute(''' + SELECT id, device_name FROM disk_registry + WHERE serial = ? AND serial != '' AND device_name != ? + ''', (serial, device_name)) + old_rows = cursor.fetchall() + for old_id, old_dev in old_rows: + # Only consolidate ATA names -> block device names + if old_dev.startswith('ata') and not device_name.startswith('ata'): + # Check if target (device_name, serial) already exists + cursor.execute( + 'SELECT id FROM disk_registry WHERE device_name = ? AND serial = ?', + (device_name, serial)) + existing = cursor.fetchone() + if existing: + # Merge: move observations from old -> existing, then delete old + cursor.execute( + 'UPDATE disk_observations SET disk_registry_id = ? WHERE disk_registry_id = ?', + (existing[0], old_id)) + cursor.execute('DELETE FROM disk_registry WHERE id = ?', (old_id,)) + else: + # Rename the old entry to the real block device name + cursor.execute( + 'UPDATE disk_registry SET device_name = ?, model = COALESCE(?, model), ' + 'size_bytes = COALESCE(?, size_bytes), last_seen = ?, removed = 0 ' + 'WHERE id = ?', + (device_name, model, size_bytes, now, old_id)) + cursor.execute(''' INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed) VALUES (?, ?, ?, ?, ?, ?, 0) @@ -1193,7 +1228,11 @@ class HealthPersistence: def _get_disk_registry_id(self, cursor, device_name: str, serial: Optional[str] = None) -> Optional[int]: - """Find disk_registry.id, matching by serial first, then device_name.""" + """Find disk_registry.id, matching by serial first, then device_name. + + Also handles ATA-to-block cross-references: if looking for 'sdh' also + checks entries with ATA names that share the same serial. + """ if serial: cursor.execute( 'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1', @@ -1207,7 +1246,19 @@ class HealthPersistence: 'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1', (clean_dev,)) row = cursor.fetchone() - return row[0] if row else None + if row: + return row[0] + # Last resort: search for ATA-named entries that might refer to this device + # This handles cases where observations were recorded under 'ata8' + # but we're querying for 'sdh' + if clean_dev.startswith('sd') or clean_dev.startswith('nvme'): + cursor.execute( + 'SELECT id FROM disk_registry WHERE device_name LIKE "ata%" ORDER BY last_seen DESC') + # For each ATA entry, we can't resolve here without OS access, + # so just return None and let the serial-based consolidation + # in register_disk handle it over time. + pass + return None def record_disk_observation(self, device_name: str, serial: Optional[str], error_type: str, error_signature: str,