Update notification service

This commit is contained in:
MacRimi
2026-03-05 17:29:07 +01:00
parent 9089035f18
commit 898392725a
4 changed files with 344 additions and 24 deletions

View File

@@ -2,7 +2,7 @@
import { useEffect, useState } from "react"
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"
import { HardDrive, Database, AlertTriangle, CheckCircle2, XCircle, Square, Thermometer, Archive } from "lucide-react"
import { HardDrive, Database, AlertTriangle, CheckCircle2, XCircle, Square, Thermometer, Archive, Info, Clock } from "lucide-react"
import { Badge } from "@/components/ui/badge"
import { Progress } from "@/components/ui/progress"
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
@@ -41,6 +41,22 @@ interface DiskInfo {
reason: string
error_type?: string // 'io' | 'filesystem'
}
observations_count?: number
}
/**
 * One recorded observation for a disk, as delivered in the `observations`
 * array of the `/api/storage/observations` response.
 */
interface DiskObservation {
// Identifier of the observation; used as the React list key when rendering.
id: number
// Category key. 'smart_error', 'io_error' and 'connection_error' have display
// labels; any other value is shown verbatim.
error_type: string
// NOTE(review): not read by the rendering code in view — presumably a dedup key; confirm against the backend.
error_signature: string
// Date string, parsed with `new Date(...)` for display.
first_occurrence: string
// Date string, parsed with `new Date(...)`; only displayed when occurrence_count > 1.
last_occurrence: string
// How often the observation was seen; the "Occurred Nx" hint is shown only when > 1.
occurrence_count: number
// Message text, rendered in a monospace paragraph with line breaks preserved.
raw_message: string
// 'critical' selects the red styling; every other value gets the blue styling.
severity: string
// NOTE(review): the four fields below are not read by the rendering code in view.
dismissed: boolean
device_name: string
serial: string
model: string
}
interface ZFSPool {
@@ -98,6 +114,8 @@ export function StorageOverview() {
const [loading, setLoading] = useState(true)
const [selectedDisk, setSelectedDisk] = useState<DiskInfo | null>(null)
const [detailsOpen, setDetailsOpen] = useState(false)
const [diskObservations, setDiskObservations] = useState<DiskObservation[]>([])
const [loadingObservations, setLoadingObservations] = useState(false)
const fetchStorageData = async () => {
try {
@@ -241,11 +259,39 @@ export function StorageOverview() {
return badgeStyles[diskType]
}
const handleDiskClick = (disk: DiskInfo) => {
// Opens the details dialog for the clicked disk and, when that disk reports
// at least one observation, loads its observation list from the API.
// NOTE(review): responses are not tied to the disk that requested them, so a
// slow response for a previously clicked disk could overwrite the list of the
// current one — confirm whether that matters here.
const handleDiskClick = async (disk: DiskInfo) => {
  setSelectedDisk(disk)
  setDetailsOpen(true)
  // Drop whatever the previously opened disk left behind.
  setDiskObservations([])

  const reportedCount = disk.observations_count
  if (!reportedCount || !(reportedCount > 0)) {
    return
  }

  setLoadingObservations(true)
  try {
    // Identify the disk by device name and, when it is a real value, by serial.
    const query = new URLSearchParams()
    if (disk.name) {
      query.set('device', disk.name)
    }
    if (disk.serial && disk.serial !== 'Unknown') {
      query.set('serial', disk.serial)
    }
    const response = await fetchApi<{ observations: DiskObservation[] }>(`/api/storage/observations?${query.toString()}`)
    setDiskObservations(response.observations || [])
  } catch {
    // Best effort: a failed request simply shows no observations.
    setDiskObservations([])
  } finally {
    setLoadingObservations(false)
  }
}
/**
 * Formats an observation timestamp as a localized "date time" string
 * (short month, numeric day and year, 2-digit hour and minute).
 *
 * @param iso - Date string as delivered by the API.
 * @returns 'N/A' for an empty value, the input unchanged when it cannot be
 *   parsed as a date, otherwise the localized date followed by the time.
 */
const formatObsDate = (iso: string): string => {
  if (!iso) return 'N/A'
  const d = new Date(iso)
  // `new Date()` does not throw on unparseable input; it produces an Invalid
  // Date whose formatters return the text "Invalid Date". Detect that case
  // explicitly so the raw value is shown instead.
  if (Number.isNaN(d.getTime())) return iso
  try {
    return d.toLocaleDateString(undefined, { month: 'short', day: 'numeric', year: 'numeric' })
      + ' ' + d.toLocaleTimeString(undefined, { hour: '2-digit', minute: '2-digit' })
  } catch { return iso }
}
/**
 * Maps an observation's `error_type` key to a human-readable label.
 *
 * @param t - The raw error type key.
 * @returns The display label for a known key, otherwise `t` unchanged.
 */
const obsTypeLabel = (t: string): string => {
  const labels: Record<string, string> = {
    smart_error: 'SMART Error',
    io_error: 'I/O Error',
    connection_error: 'Connection Error',
  }
  // Only consult own keys: a plain index lookup would resolve inherited names
  // such as 'toString' or 'constructor' to functions instead of falling back.
  const label = Object.prototype.hasOwnProperty.call(labels, t) ? labels[t] : undefined
  return label || t
}
const getStorageTypeBadge = (type: string) => {
const typeColors: Record<string, string> = {
pbs: "bg-purple-500/10 text-purple-500 border-purple-500/20",
@@ -778,6 +824,12 @@ export function StorageOverview() {
</span>
</div>
)}
{/* Observation-count badge. The guard must be a boolean: `count && count > 0`
    evaluates to 0 for a zero count, and React renders that 0 as text. */}
{(disk.observations_count ?? 0) > 0 && (
  <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
    <Info className="h-3 w-3" />
    {disk.observations_count}
  </Badge>
)}
{getHealthBadge(disk.health)}
</div>
</div>
@@ -858,6 +910,12 @@ export function StorageOverview() {
</span>
</div>
)}
{/* Observation-count badge. The guard must be a boolean: `count && count > 0`
    evaluates to 0 for a zero count, and React renders that 0 as text. */}
{(disk.observations_count ?? 0) > 0 && (
  <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
    <Info className="h-3 w-3" />
    {disk.observations_count}
  </Badge>
)}
{getHealthBadge(disk.health)}
</div>
</div>
@@ -925,7 +983,7 @@ export function StorageOverview() {
{/* Disk Details Dialog */}
<Dialog open={detailsOpen} onOpenChange={setDetailsOpen}>
<DialogContent className="max-w-2xl">
<DialogContent className="max-w-2xl max-h-[90vh] overflow-y-auto">
<DialogHeader>
<DialogTitle className="flex items-center gap-2">
<HardDrive className="h-5 w-5" />
@@ -950,7 +1008,15 @@ export function StorageOverview() {
</div>
<div>
<p className="text-sm text-muted-foreground">Health Status</p>
<div className="mt-1">{getHealthBadge(selectedDisk.health)}</div>
<div className="flex items-center gap-2 mt-1">
{getHealthBadge(selectedDisk.health)}
{/* Observation-count badge. The guard must be a boolean: `count && count > 0`
    evaluates to 0 for a zero count, and React renders that 0 as text. */}
{(selectedDisk.observations_count ?? 0) > 0 && (
  <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
    <Info className="h-3 w-3" />
    {selectedDisk.observations_count} obs.
  </Badge>
)}
</div>
</div>
</div>
@@ -1054,6 +1120,70 @@ export function StorageOverview() {
</div>
</div>
</div>
{/* Observations Section */}
{(diskObservations.length > 0 || loadingObservations) && (
<div className="border-t pt-4">
<h4 className="font-semibold mb-3 flex items-center gap-2">
<Info className="h-4 w-4 text-blue-400" />
Observations
<Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 text-[10px] px-1.5 py-0">
{diskObservations.length}
</Badge>
</h4>
{loadingObservations ? (
<div className="flex items-center gap-2 text-sm text-muted-foreground py-2">
<div className="h-4 w-4 rounded-full border-2 border-transparent border-t-blue-400 animate-spin" />
Loading observations...
</div>
) : (
<div className="space-y-3 max-h-60 overflow-y-auto">
{diskObservations.map((obs) => (
<div
key={obs.id}
className={`rounded-lg border p-3 text-sm ${
obs.severity === 'critical'
? 'bg-red-500/5 border-red-500/20'
: 'bg-blue-500/5 border-blue-500/20'
}`}
>
<div className="flex items-start justify-between gap-2">
<div className="flex items-center gap-2 flex-wrap">
<Badge className={`text-[10px] px-1.5 py-0 ${
obs.severity === 'critical'
? 'bg-red-500/10 text-red-400 border-red-500/20'
: 'bg-blue-500/10 text-blue-400 border-blue-500/20'
}`}>
{obsTypeLabel(obs.error_type)}
</Badge>
{obs.occurrence_count > 1 && (
<span className="text-xs text-muted-foreground">
{'Occurred ' + obs.occurrence_count + 'x'}
</span>
)}
</div>
</div>
<p className="mt-1.5 text-xs whitespace-pre-line opacity-90 font-mono leading-relaxed">
{obs.raw_message}
</p>
<div className="flex items-center gap-3 mt-2 text-[10px] text-muted-foreground">
<span className="flex items-center gap-1">
<Clock className="h-3 w-3" />
{'First: ' + formatObsDate(obs.first_occurrence)}
</span>
{obs.occurrence_count > 1 && (
<span className="flex items-center gap-1">
<Clock className="h-3 w-3" />
{'Last: ' + formatObsDate(obs.last_occurrence)}
</span>
)}
</div>
</div>
))}
</div>
)}
</div>
)}
</div>
)}
</DialogContent>

View File

@@ -1253,22 +1253,80 @@ def get_storage_info():
details = {}
err_device = details.get('disk', '')
# Prefer the pre-resolved block device name (e.g. 'sdh' instead of 'ata8')
block_device = details.get('block_device', '')
err_serial = details.get('serial', '')
error_count = details.get('error_count', 0)
sample = details.get('sample', '')
severity = err.get('severity', 'WARNING')
# Match error to physical disk.
# err_device can be 'sda', 'nvme0n1', or 'ata8' (if resolution failed)
# Priority: block_device > serial > err_device > ATA resolution
matched_disk = None
if err_device in physical_disks:
# 1. Direct match via pre-resolved block_device
if block_device and block_device in physical_disks:
matched_disk = block_device
# 2. Match by serial (most reliable across reboots/device renaming)
if not matched_disk and err_serial:
for dk, dinfo in physical_disks.items():
if dinfo.get('serial', '').lower() == err_serial.lower():
matched_disk = dk
break
# 3. Direct match via err_device
if not matched_disk and err_device in physical_disks:
matched_disk = err_device
else:
# Try partial match: 'sda' matches disk 'sda'
# 4. Partial match
if not matched_disk:
for dk in physical_disks:
if dk == err_device or err_device.startswith(dk):
matched_disk = dk
break
# 5. ATA name resolution as last resort: 'ata8' -> 'sdh' via /sys
if not matched_disk and err_device.startswith('ata'):
# Method A: Use /sys/class/ata_port to find the block device
try:
ata_path = f'/sys/class/ata_port/{err_device}'
if os.path.exists(ata_path):
device_path = os.path.realpath(ata_path)
for root, dirs, files in os.walk(os.path.dirname(device_path)):
if 'block' in dirs:
devs = os.listdir(os.path.join(root, 'block'))
for bd in devs:
if bd in physical_disks:
matched_disk = bd
break
if matched_disk:
break
except (OSError, IOError):
pass
# Method B: Walk /sys/block/sd* and check if ataX in device path
if not matched_disk:
try:
for sd in os.listdir('/sys/block'):
if not sd.startswith('sd'):
continue
dev_link = f'/sys/block/{sd}/device'
if os.path.islink(dev_link):
real_p = os.path.realpath(dev_link)
if f'/{err_device}/' in real_p:
if sd in physical_disks:
matched_disk = sd
break
except (OSError, IOError):
pass
# Method C: Check error details for display name hint
if not matched_disk:
display = details.get('display', '')
if display.startswith('/dev/'):
dev_hint = display.replace('/dev/', '')
if dev_hint in physical_disks:
matched_disk = dev_hint
if matched_disk:
physical_disks[matched_disk]['io_errors'] = {
'count': error_count,
@@ -1421,17 +1479,22 @@ def get_storage_info():
# ── Register disks in observation system + enrich with observation counts ──
try:
active_dev_names = list(physical_disks.keys())
obs_counts = health_persistence.get_disks_observation_counts()
# Register disks FIRST so that old ATA-named entries get
# consolidated into block device names via serial matching.
for disk_name, disk_info in physical_disks.items():
# Register each disk we see
health_persistence.register_disk(
device_name=disk_name,
serial=disk_info.get('serial', ''),
model=disk_info.get('model', ''),
size_bytes=disk_info.get('size_bytes'),
)
# Fetch observation counts AFTER registration so consolidated
# entries are already merged (ata8 -> sdh).
obs_counts = health_persistence.get_disks_observation_counts()
for disk_name, disk_info in physical_disks.items():
# Attach observation count: try serial match first, then device name
serial = disk_info.get('serial', '')
count = obs_counts.get(f'serial:{serial}', 0) if serial else 0

View File

@@ -1002,16 +1002,25 @@ class HealthMonitor:
**{k: v for k, v in val.items() if k not in ('status', 'reason')}
}
# ALWAYS add descriptive entries for capabilities this server has.
# When everything is OK, they show as OK. When there are issues,
# they still appear so the user can see the full picture (e.g.
# LVM is OK even though I/O errors exist on a disk).
if 'root_filesystem' not in checks:
checks['root_filesystem'] = checks.pop('/', None) or {'status': 'OK', 'detail': 'Mounted read-write, space OK'}
if 'io_errors' not in checks:
# Only add OK if no disk I/O errors are present in checks
has_io = any(v.get('error_count') or 'I/O' in str(v.get('detail', '')) for v in checks.values())
if not has_io:
checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
if self.capabilities.get('has_smart') and 'smart_health' not in checks:
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks:
checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
if not issues:
# Add descriptive OK entries only for capabilities this server actually has
checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Mounted read-write, space OK'})
checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
if self.capabilities.get('has_smart'):
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
if self.capabilities.get('has_zfs'):
checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
if self.capabilities.get('has_lvm'):
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
return {'status': 'OK', 'checks': checks}
# Determine overall status
@@ -1235,6 +1244,36 @@ class HealthMonitor:
except (OSError, subprocess.TimeoutExpired):
pass
# Method 3: Use /sys/block/sd* and trace back to ATA host number
# ata8 => host7 (N-1) or host8 depending on controller numbering
try:
for sd in sorted(os.listdir('/sys/block')):
if not sd.startswith('sd'):
continue
# /sys/block/sdX/device -> ../../hostN/targetN:0:0/N:0:0:0
dev_link = f'/sys/block/{sd}/device'
if os.path.islink(dev_link):
real_path = os.path.realpath(dev_link)
# Check if 'ataX' appears in the device path
if f'/{ata_port}/' in real_path or f'/ata{port_num}/' in real_path:
return sd
# Also check host number mapping: ata8 -> host7 (N-1 convention)
for offset in (0, -1):
host_n = int(port_num) + offset
if host_n >= 0 and f'/host{host_n}/' in real_path:
# Verify: check if ataX appears in the chain
parent = real_path
while parent and parent != '/':
parent = os.path.dirname(parent)
if os.path.basename(parent) == ata_port:
return sd
# Check 1 level: /sys/devices/.../ataX/hostY/...
ata_check = os.path.join(os.path.dirname(parent), ata_port)
if os.path.exists(ata_check):
return sd
except (OSError, IOError, ValueError):
pass
return ata_port # Return original if resolution fails
def _identify_block_device(self, device: str) -> str:
@@ -1483,6 +1522,39 @@ class HealthMonitor:
smart_ok = smart_health == 'PASSED'
# Resolve ATA name to block device early so we can use it
# in both record_error details AND record_disk_observation.
resolved_block = disk
resolved_serial = None
if disk.startswith('ata'):
resolved_block = self._resolve_ata_to_disk(disk)
# Get serial from the resolved device
try:
dev_path = f'/dev/{resolved_block}' if resolved_block != disk else None
if dev_path:
sm = subprocess.run(
['smartctl', '-i', dev_path],
capture_output=True, text=True, timeout=3)
if sm.returncode in (0, 4):
for sline in sm.stdout.split('\n'):
if 'Serial Number' in sline or 'Serial number' in sline:
resolved_serial = sline.split(':')[-1].strip()
break
except Exception:
pass
else:
try:
sm = subprocess.run(
['smartctl', '-i', f'/dev/{disk}'],
capture_output=True, text=True, timeout=3)
if sm.returncode in (0, 4):
for sline in sm.stdout.split('\n'):
if 'Serial Number' in sline or 'Serial number' in sline:
resolved_serial = sline.split(':')[-1].strip()
break
except Exception:
pass
# ── Record disk observation (always, even if transient) ──
# Signature must be stable across cycles: strip volatile
# data (hex values, counts, timestamps) to dedup properly.
@@ -1493,8 +1565,8 @@ class HealthMonitor:
obs_sig = self._make_io_obs_signature(disk, sample)
obs_severity = 'critical' if smart_health == 'FAILED' else 'warning'
health_persistence.record_disk_observation(
device_name=disk,
serial=None,
device_name=resolved_block,
serial=resolved_serial,
error_type='io_error',
error_signature=obs_sig,
raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}',
@@ -1551,6 +1623,8 @@ class HealthMonitor:
severity=severity,
reason=reason,
details={'disk': disk, 'device': display,
'block_device': resolved_block,
'serial': resolved_serial or '',
'error_count': error_count,
'smart_status': smart_health,
'sample': sample, 'dismissable': False}
@@ -1584,6 +1658,8 @@ class HealthMonitor:
severity=severity,
reason=reason,
details={'disk': disk, 'device': display,
'block_device': resolved_block,
'serial': resolved_serial or '',
'error_count': error_count,
'smart_status': smart_health,
'sample': sample, 'dismissable': True}

View File

@@ -1170,12 +1170,47 @@ class HealthPersistence:
Uses (device_name, serial) as unique key. If the disk was previously
marked removed, it's re-activated.
Also consolidates old ATA-named entries: if an observation was recorded
under 'ata8' and we now know the real block device is 'sdh' with
serial 'WX72...', update the old entry so observations are linked.
"""
now = datetime.now().isoformat()
try:
conn = self._get_conn()
cursor = conn.cursor()
# Consolidate: if serial is known and an old entry exists with
# a different device_name (e.g. 'ata8' instead of 'sdh'),
# update that entry's device_name so observations carry over.
if serial:
cursor.execute('''
SELECT id, device_name FROM disk_registry
WHERE serial = ? AND serial != '' AND device_name != ?
''', (serial, device_name))
old_rows = cursor.fetchall()
for old_id, old_dev in old_rows:
# Only consolidate ATA names -> block device names
if old_dev.startswith('ata') and not device_name.startswith('ata'):
# Check if target (device_name, serial) already exists
cursor.execute(
'SELECT id FROM disk_registry WHERE device_name = ? AND serial = ?',
(device_name, serial))
existing = cursor.fetchone()
if existing:
# Merge: move observations from old -> existing, then delete old
cursor.execute(
'UPDATE disk_observations SET disk_registry_id = ? WHERE disk_registry_id = ?',
(existing[0], old_id))
cursor.execute('DELETE FROM disk_registry WHERE id = ?', (old_id,))
else:
# Rename the old entry to the real block device name
cursor.execute(
'UPDATE disk_registry SET device_name = ?, model = COALESCE(?, model), '
'size_bytes = COALESCE(?, size_bytes), last_seen = ?, removed = 0 '
'WHERE id = ?',
(device_name, model, size_bytes, now, old_id))
cursor.execute('''
INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
VALUES (?, ?, ?, ?, ?, ?, 0)
@@ -1193,7 +1228,11 @@ class HealthPersistence:
def _get_disk_registry_id(self, cursor, device_name: str,
serial: Optional[str] = None) -> Optional[int]:
"""Find disk_registry.id, matching by serial first, then device_name."""
"""Find disk_registry.id, matching by serial first, then device_name.
Also handles ATA-to-block cross-references: if looking for 'sdh' also
checks entries with ATA names that share the same serial.
"""
if serial:
cursor.execute(
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
@@ -1207,7 +1246,19 @@ class HealthPersistence:
'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
(clean_dev,))
row = cursor.fetchone()
return row[0] if row else None
if row:
return row[0]
# Last resort: search for ATA-named entries that might refer to this device
# This handles cases where observations were recorded under 'ata8'
# but we're querying for 'sdh'
if clean_dev.startswith('sd') or clean_dev.startswith('nvme'):
cursor.execute(
'SELECT id FROM disk_registry WHERE device_name LIKE "ata%" ORDER BY last_seen DESC')
# For each ATA entry, we can't resolve here without OS access,
# so just return None and let the serial-based consolidation
# in register_disk handle it over time.
pass
return None
def record_disk_observation(self, device_name: str, serial: Optional[str],
error_type: str, error_signature: str,