From 898392725a25ebfc69f95c88b9b63d2d5efb13ba Mon Sep 17 00:00:00 2001
From: MacRimi <ricoextincion@gmail.com>
Date: Thu, 5 Mar 2026 17:29:07 +0100
Subject: [PATCH] Update notification service

---
 AppImage/components/storage-overview.tsx | 138 ++++++++++++++++++++++-
 AppImage/scripts/flask_server.py         |  77 +++++++++++--
 AppImage/scripts/health_monitor.py       |  98 ++++++++++++++--
 AppImage/scripts/health_persistence.py   |  55 ++++++++-
 4 files changed, 344 insertions(+), 24 deletions(-)
diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx
index 42fb209a..4253ff21 100644
--- a/AppImage/components/storage-overview.tsx
+++ b/AppImage/components/storage-overview.tsx
@@ -2,7 +2,7 @@
 
 import { useEffect, useState } from "react"
 import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"
-import { HardDrive, Database, AlertTriangle, CheckCircle2, XCircle, Square, Thermometer, Archive } from "lucide-react"
+import { HardDrive, Database, AlertTriangle, CheckCircle2, XCircle, Square, Thermometer, Archive, Info, Clock } from "lucide-react"
 import { Badge } from "@/components/ui/badge"
 import { Progress } from "@/components/ui/progress"
 import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
@@ -41,6 +41,22 @@ interface DiskInfo {
     reason: string
     error_type?: string  // 'io' | 'filesystem'
   }
+  observations_count?: number
+}
+
+interface DiskObservation {
+  id: number
+  error_type: string
+  error_signature: string
+  first_occurrence: string
+  last_occurrence: string
+  occurrence_count: number
+  raw_message: string
+  severity: string
+  dismissed: boolean
+  device_name: string
+  serial: string
+  model: string
 }
 
 interface ZFSPool {
@@ -98,6 +114,8 @@ export function StorageOverview() {
   const [loading, setLoading] = useState(true)
   const [selectedDisk, setSelectedDisk] = useState<DiskInfo | null>(null)
   const [detailsOpen, setDetailsOpen] = useState(false)
+  const [diskObservations, setDiskObservations] = useState<DiskObservation[]>([])
+  const [loadingObservations, setLoadingObservations] = useState(false)
 
   const fetchStorageData = async () => {
     try {
@@ -241,11 +259,39 @@ export function StorageOverview() {
     return badgeStyles[diskType]
   }
 
-  const handleDiskClick = (disk: DiskInfo) => {
+  const handleDiskClick = async (disk: DiskInfo) => {
     setSelectedDisk(disk)
     setDetailsOpen(true)
+    setDiskObservations([])
+
+    if (disk.observations_count && disk.observations_count > 0) {
+      setLoadingObservations(true)
+      try {
+        const params = new URLSearchParams()
+        if (disk.name) params.set('device', disk.name)
+        if (disk.serial && disk.serial !== 'Unknown') params.set('serial', disk.serial)
+        const data = await fetchApi<{ observations: DiskObservation[] }>(`/api/storage/observations?${params.toString()}`)
+        setDiskObservations(data.observations || [])
+      } catch {
+        setDiskObservations([])
+      } finally {
+        setLoadingObservations(false)
+      }
+    }
   }
 
+  const formatObsDate = (iso: string) => {
+    if (!iso) return 'N/A'
+    const d = new Date(iso)
+    // new Date() never throws on bad input; it yields an Invalid Date, so fall back to the raw string.
+    if (Number.isNaN(d.getTime())) return iso
+    return d.toLocaleDateString(undefined, { month: 'short', day: 'numeric', year: 'numeric' })
+      + ' ' + d.toLocaleTimeString(undefined, { hour: '2-digit', minute: '2-digit' })
+  }
+
+  const obsTypeLabel = (t: string) =>
+    (({ smart_error: 'SMART Error', io_error: 'I/O Error', connection_error: 'Connection Error' } as Record<string, string>)[t] || t)
+
   const getStorageTypeBadge = (type: string) => {
     const typeColors: Record<string, string> = {
       pbs: "bg-purple-500/10 text-purple-500 border-purple-500/20",
@@ -778,6 +824,12 @@ export function StorageOverview() {
                             </span>
                           </div>
                         )}
+                        {(disk.observations_count ?? 0) > 0 && (
+                          <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
+                            <Info className="h-3 w-3" />
+                            {disk.observations_count}
+                          </Badge>
+                        )}
                         {getHealthBadge(disk.health)}
                       </div>
                     </div>
@@ -858,6 +910,12 @@ export function StorageOverview() {
                             </span>
                           </div>
                         )}
+                        {(disk.observations_count ?? 0) > 0 && (
+                          <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
+                            <Info className="h-3 w-3" />
+                            {disk.observations_count}
+                          </Badge>
+                        )}
                         {getHealthBadge(disk.health)}
                       </div>
                     </div>
@@ -925,7 +983,7 @@ export function StorageOverview() {
 
       {/* Disk Details Dialog */}
       <Dialog open={detailsOpen} onOpenChange={setDetailsOpen}>
-        <DialogContent className="max-w-2xl">
+        <DialogContent className="max-w-2xl max-h-[90vh] overflow-y-auto">
           <DialogHeader>
             <DialogTitle className="flex items-center gap-2">
               <HardDrive className="h-5 w-5" />
@@ -950,7 +1008,15 @@ export function StorageOverview() {
                 </div>
                 <div>
                   <p className="text-sm text-muted-foreground">Health Status</p>
-                  <div className="mt-1">{getHealthBadge(selectedDisk.health)}</div>
+                  <div className="flex items-center gap-2 mt-1">
+                    {getHealthBadge(selectedDisk.health)}
+                    {(selectedDisk.observations_count ?? 0) > 0 && (
+                      <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
+                        <Info className="h-3 w-3" />
+                        {selectedDisk.observations_count} obs.
+                      </Badge>
+                    )}
+                  </div>
                 </div>
               </div>
 
@@ -1054,6 +1120,70 @@ export function StorageOverview() {
                   </div>
                 </div>
               </div>
+
+              {/* Observations Section */}
+              {(diskObservations.length > 0 || loadingObservations) && (
+                <div className="border-t pt-4">
+                  <h4 className="font-semibold mb-3 flex items-center gap-2">
+                    <Info className="h-4 w-4 text-blue-400" />
+                    Observations
+                    <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 text-[10px] px-1.5 py-0">
+                      {diskObservations.length}
+                    </Badge>
+                  </h4>
+                  {loadingObservations ? (
+                    <div className="flex items-center gap-2 text-sm text-muted-foreground py-2">
+                      <div className="h-4 w-4 rounded-full border-2 border-transparent border-t-blue-400 animate-spin" />
+                      Loading observations...
+                    </div>
+                  ) : (
+                    <div className="space-y-3 max-h-60 overflow-y-auto">
+                      {diskObservations.map((obs) => (
+                        <div
+                          key={obs.id}
+                          className={`rounded-lg border p-3 text-sm ${
+                            obs.severity === 'critical'
+                              ? 'bg-red-500/5 border-red-500/20'
+                              : 'bg-blue-500/5 border-blue-500/20'
+                          }`}
+                        >
+                          <div className="flex items-start justify-between gap-2">
+                            <div className="flex items-center gap-2 flex-wrap">
+                              <Badge className={`text-[10px] px-1.5 py-0 ${
+                                obs.severity === 'critical'
+                                  ? 'bg-red-500/10 text-red-400 border-red-500/20'
+                                  : 'bg-blue-500/10 text-blue-400 border-blue-500/20'
+                              }`}>
+                                {obsTypeLabel(obs.error_type)}
+                              </Badge>
+                              {obs.occurrence_count > 1 && (
+                                <span className="text-xs text-muted-foreground">
+                                  {'Occurred ' + obs.occurrence_count + 'x'}
+                                </span>
+                              )}
+                            </div>
+                          </div>
+                          <p className="mt-1.5 text-xs whitespace-pre-line opacity-90 font-mono leading-relaxed">
+                            {obs.raw_message}
+                          </p>
+                          <div className="flex items-center gap-3 mt-2 text-[10px] text-muted-foreground">
+                            <span className="flex items-center gap-1">
+                              <Clock className="h-3 w-3" />
+                              {'First: ' + formatObsDate(obs.first_occurrence)}
+                            </span>
+                            {obs.occurrence_count > 1 && (
+                              <span className="flex items-center gap-1">
+                                <Clock className="h-3 w-3" />
+                                {'Last: ' + formatObsDate(obs.last_occurrence)}
+                              </span>
+                            )}
+                          </div>
+                        </div>
+                      ))}
+                    </div>
+                  )}
+                </div>
+              )}
             </div>
           )}
         </DialogContent>
diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py
index f7114079..2c6f8ea1 100644
--- a/AppImage/scripts/flask_server.py
+++ b/AppImage/scripts/flask_server.py
@@ -1253,22 +1253,80 @@ def get_storage_info():
                         details = {}
                 
                 err_device = details.get('disk', '')
+                # Prefer the pre-resolved block device name (e.g. 'sdh' instead of 'ata8')
+                block_device = details.get('block_device', '')
+                err_serial = details.get('serial', '')
                 error_count = details.get('error_count', 0)
                 sample = details.get('sample', '')
                 severity = err.get('severity', 'WARNING')
                 
                 # Match error to physical disk.
-                # err_device can be 'sda', 'nvme0n1', or 'ata8' (if resolution failed)
+                # Priority: block_device > serial > err_device > ATA resolution
                 matched_disk = None
-                if err_device in physical_disks:
+                
+                # 1. Direct match via pre-resolved block_device
+                if block_device and block_device in physical_disks:
+                    matched_disk = block_device
+                
+                # 2. Match by serial (most reliable across reboots/device renaming)
+                if not matched_disk and err_serial:
+                    for dk, dinfo in physical_disks.items():
+                        if dinfo.get('serial', '').lower() == err_serial.lower():
+                            matched_disk = dk
+                            break
+                
+                # 3. Direct match via err_device
+                if not matched_disk and err_device in physical_disks:
                     matched_disk = err_device
-                else:
-                    # Try partial match: 'sda' matches disk 'sda'
+                
+                # 4. Partial match
+                if not matched_disk:
                     for dk in physical_disks:
                         if dk == err_device or err_device.startswith(dk):
                             matched_disk = dk
                             break
                 
+                # 5. ATA name resolution as last resort: 'ata8' -> 'sdh' via /sys
+                if not matched_disk and err_device.startswith('ata'):
+                    # Method A: /sys/class/ata_port/ataN resolves to .../ataN/ata_port/ataN; scan the port dir .../ataN
+                    try:
+                        ata_path = f'/sys/class/ata_port/{err_device}'
+                        if os.path.exists(ata_path):
+                            device_path = os.path.dirname(os.path.dirname(os.path.realpath(ata_path)))
+                            for root, dirs, files in os.walk(device_path):
+                                if 'block' in dirs:
+                                    devs = os.listdir(os.path.join(root, 'block'))
+                                    for bd in devs:
+                                        if bd in physical_disks:
+                                            matched_disk = bd
+                                            break
+                                if matched_disk:
+                                    break
+                    except (OSError, IOError):
+                        pass
+                    # Method B: Walk /sys/block/sd* and check if ataX in device path
+                    if not matched_disk:
+                        try:
+                            for sd in os.listdir('/sys/block'):
+                                if not sd.startswith('sd'):
+                                    continue
+                                dev_link = f'/sys/block/{sd}/device'
+                                if os.path.islink(dev_link):
+                                    real_p = os.path.realpath(dev_link)
+                                    if f'/{err_device}/' in real_p:
+                                        if sd in physical_disks:
+                                            matched_disk = sd
+                                            break
+                        except (OSError, IOError):
+                            pass
+                    # Method C: Check error details for display name hint
+                    if not matched_disk:
+                        display = details.get('display', '')
+                        if display.startswith('/dev/'):
+                            dev_hint = display.replace('/dev/', '')
+                            if dev_hint in physical_disks:
+                                matched_disk = dev_hint
+                
                 if matched_disk:
                     physical_disks[matched_disk]['io_errors'] = {
                         'count': error_count,
@@ -1421,17 +1479,22 @@ def get_storage_info():
         # ── Register disks in observation system + enrich with observation counts ──
         try:
             active_dev_names = list(physical_disks.keys())
-            obs_counts = health_persistence.get_disks_observation_counts()
             
+            # Register disks FIRST so that old ATA-named entries get
+            # consolidated into block device names via serial matching.
             for disk_name, disk_info in physical_disks.items():
-                # Register each disk we see
                 health_persistence.register_disk(
                     device_name=disk_name,
                     serial=disk_info.get('serial', ''),
                     model=disk_info.get('model', ''),
                     size_bytes=disk_info.get('size_bytes'),
                 )
-                
+            
+            # Fetch observation counts AFTER registration so consolidated
+            # entries are already merged (ata8 -> sdh).
+            obs_counts = health_persistence.get_disks_observation_counts()
+            
+            for disk_name, disk_info in physical_disks.items():
                 # Attach observation count: try serial match first, then device name
                 serial = disk_info.get('serial', '')
                 count = obs_counts.get(f'serial:{serial}', 0) if serial else 0
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index bd4eded6..2d1a4c7f 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -1002,16 +1002,25 @@ class HealthMonitor:
                 **{k: v for k, v in val.items() if k not in ('status', 'reason')}
             }
         
+        # ALWAYS add descriptive entries for capabilities this server has.
+        # When everything is OK, they show as OK.  When there are issues,
+        # they still appear so the user can see the full picture (e.g.
+        # LVM is OK even though I/O errors exist on a disk).
+        if 'root_filesystem' not in checks:
+            checks['root_filesystem'] = checks.pop('/', None) or {'status': 'OK', 'detail': 'Mounted read-write, space OK'}
+        if 'io_errors' not in checks:
+            # Only add OK if no disk I/O errors are present in checks
+            has_io = any(v.get('error_count') or 'I/O' in str(v.get('detail', '')) for v in checks.values())
+            if not has_io:
+                checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
+        if self.capabilities.get('has_smart') and 'smart_health' not in checks:
+            checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
+        if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks:
+            checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
+        if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
+            checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
+        
         if not issues:
-            # Add descriptive OK entries only for capabilities this server actually has
-            checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'})
-            checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
-            if self.capabilities.get('has_smart'):
-                checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
-            if self.capabilities.get('has_zfs'):
-                checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
-            if self.capabilities.get('has_lvm'):
-                checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
             return {'status': 'OK', 'checks': checks}
         
         # Determine overall status
@@ -1235,6 +1244,36 @@ class HealthMonitor:
         except (OSError, subprocess.TimeoutExpired):
             pass
         
+        # Method 3: Use /sys/block/sd* and trace back to ATA host number
+        # ata8 => host7 (N-1) or host8 depending on controller numbering
+        try:
+            for sd in sorted(os.listdir('/sys/block')):
+                if not sd.startswith('sd'):
+                    continue
+                # /sys/block/sdX/device -> ../../hostN/targetN:0:0/N:0:0:0
+                dev_link = f'/sys/block/{sd}/device'
+                if os.path.islink(dev_link):
+                    real_path = os.path.realpath(dev_link)
+                    # Check if 'ataX' appears in the device path
+                    if f'/{ata_port}/' in real_path or f'/ata{port_num}/' in real_path:
+                        return sd
+                    # Also check host number mapping: ata8 -> host7 (N-1 convention)
+                    for offset in (0, -1):
+                        host_n = int(port_num) + offset
+                        if host_n >= 0 and f'/host{host_n}/' in real_path:
+                            # A matching host number alone is not proof: only
+                            # accept the disk when ataX is an ancestor of its
+                            # device path. Probing for an ataX entry beside
+                            # each ancestor also matches sibling ports of the
+                            # same controller and returns the wrong disk.
+                            parent = real_path
+                            while parent and parent != '/':
+                                parent = os.path.dirname(parent)
+                                if os.path.basename(parent) == ata_port:
+                                    return sd
+        except (OSError, IOError, ValueError):
+            pass
+        
         return ata_port  # Return original if resolution fails
     
     def _identify_block_device(self, device: str) -> str:
@@ -1483,6 +1522,39 @@ class HealthMonitor:
                         
                         smart_ok = smart_health == 'PASSED'
                         
+                        # Resolve ATA name to block device early so we can use it
+                        # in both record_error details AND record_disk_observation.
+                        resolved_block = disk
+                        resolved_serial = None
+                        if disk.startswith('ata'):
+                            resolved_block = self._resolve_ata_to_disk(disk)
+                            # Get serial from the resolved device
+                            try:
+                                dev_path = f'/dev/{resolved_block}' if resolved_block != disk else None
+                                if dev_path:
+                                    sm = subprocess.run(
+                                        ['smartctl', '-i', dev_path],
+                                        capture_output=True, text=True, timeout=3)
+                                    if sm.returncode in (0, 4):
+                                        for sline in sm.stdout.split('\n'):
+                                            if 'Serial Number' in sline or 'Serial number' in sline:
+                                                resolved_serial = sline.split(':')[-1].strip()
+                                                break
+                            except Exception:
+                                pass
+                        else:
+                            try:
+                                sm = subprocess.run(
+                                    ['smartctl', '-i', f'/dev/{disk}'],
+                                    capture_output=True, text=True, timeout=3)
+                                if sm.returncode in (0, 4):
+                                    for sline in sm.stdout.split('\n'):
+                                        if 'Serial Number' in sline or 'Serial number' in sline:
+                                            resolved_serial = sline.split(':')[-1].strip()
+                                            break
+                            except Exception:
+                                pass
+                        
                         # ── Record disk observation (always, even if transient) ──
                         # Signature must be stable across cycles: strip volatile
                         # data (hex values, counts, timestamps) to dedup properly.
@@ -1493,8 +1565,8 @@ class HealthMonitor:
                             obs_sig = self._make_io_obs_signature(disk, sample)
                             obs_severity = 'critical' if smart_health == 'FAILED' else 'warning'
                             health_persistence.record_disk_observation(
-                                device_name=disk,
-                                serial=None,
+                                device_name=resolved_block,
+                                serial=resolved_serial,
                                 error_type='io_error',
                                 error_signature=obs_sig,
                                 raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}',
@@ -1551,6 +1623,8 @@ class HealthMonitor:
                                 severity=severity,
                                 reason=reason,
                                 details={'disk': disk, 'device': display,
+                                         'block_device': resolved_block,
+                                         'serial': resolved_serial or '',
                                          'error_count': error_count,
                                          'smart_status': smart_health,
                                          'sample': sample, 'dismissable': False}
@@ -1584,6 +1658,8 @@ class HealthMonitor:
                                     severity=severity,
                                     reason=reason,
                                     details={'disk': disk, 'device': display,
+                                             'block_device': resolved_block,
+                                             'serial': resolved_serial or '',
                                              'error_count': error_count,
                                              'smart_status': smart_health,
                                              'sample': sample, 'dismissable': True}
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 374e9687..ec2adfde 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -1170,12 +1170,47 @@ class HealthPersistence:
         
         Uses (device_name, serial) as unique key. If the disk was previously
         marked removed, it's re-activated.
+        
+        Also consolidates old ATA-named entries: if an observation was recorded
+        under 'ata8' and we now know the real block device is 'sdh' with
+        serial 'WX72...', update the old entry so observations are linked.
         """
         now = datetime.now().isoformat()
         try:
             conn = self._get_conn()
             cursor = conn.cursor()
             
+            # Consolidate: if serial is known and an old entry exists with
+            # a different device_name (e.g. 'ata8' instead of 'sdh'),
+            # update that entry's device_name so observations carry over.
+            if serial:
+                cursor.execute('''
+                    SELECT id, device_name FROM disk_registry
+                    WHERE serial = ? AND serial != '' AND device_name != ?
+                ''', (serial, device_name))
+                old_rows = cursor.fetchall()
+                for old_id, old_dev in old_rows:
+                    # Only consolidate ATA names -> block device names
+                    if old_dev.startswith('ata') and not device_name.startswith('ata'):
+                        # Check if target (device_name, serial) already exists
+                        cursor.execute(
+                            'SELECT id FROM disk_registry WHERE device_name = ? AND serial = ?',
+                            (device_name, serial))
+                        existing = cursor.fetchone()
+                        if existing:
+                            # Merge: move observations from old -> existing, then delete old
+                            cursor.execute(
+                                'UPDATE disk_observations SET disk_registry_id = ? WHERE disk_registry_id = ?',
+                                (existing[0], old_id))
+                            cursor.execute('DELETE FROM disk_registry WHERE id = ?', (old_id,))
+                        else:
+                            # Rename the old entry to the real block device name
+                            cursor.execute(
+                                'UPDATE disk_registry SET device_name = ?, model = COALESCE(?, model), '
+                                'size_bytes = COALESCE(?, size_bytes), last_seen = ?, removed = 0 '
+                                'WHERE id = ?',
+                                (device_name, model, size_bytes, now, old_id))
+            
             cursor.execute('''
                 INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
                 VALUES (?, ?, ?, ?, ?, ?, 0)
@@ -1193,7 +1228,11 @@ class HealthPersistence:
 
     def _get_disk_registry_id(self, cursor, device_name: str,
                                serial: Optional[str] = None) -> Optional[int]:
-        """Find disk_registry.id, matching by serial first, then device_name."""
+        """Find disk_registry.id, matching by serial first, then device_name.
+        
+        Also handles ATA-to-block cross-references: if looking for 'sdh' also
+        checks entries with ATA names that share the same serial.
+        """
         if serial:
             cursor.execute(
                 'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
@@ -1207,7 +1246,19 @@ class HealthPersistence:
             'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
             (clean_dev,))
         row = cursor.fetchone()
-        return row[0] if row else None
+        if row:
+            return row[0]
+        # No match by serial or device name. An entry recorded under an ATA
+        # port name (e.g. 'ata8') cannot be mapped to a block device such as
+        # 'sdh' here, because that requires OS (sysfs) access. Return None
+        # and let the serial-based consolidation in register_disk link such
+        # entries over time.
+        #
+        # Intentionally no query here: the earlier SELECT over device_name
+        # LIKE "ata%" never read its result set, and the double-quoted
+        # literal only works through SQLite's legacy string-literal
+        # fallback, so the statement was pure overhead.
+        return None
 
     def record_disk_observation(self, device_name: str, serial: Optional[str],
                                  error_type: str, error_signature: str,