mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-30 19:36:24 +00:00
Update notification service
This commit is contained in:
@@ -30,7 +30,6 @@ import {
|
|||||||
ChevronRight,
|
ChevronRight,
|
||||||
Settings2,
|
Settings2,
|
||||||
HelpCircle,
|
HelpCircle,
|
||||||
Usb,
|
|
||||||
} from "lucide-react"
|
} from "lucide-react"
|
||||||
|
|
||||||
interface CategoryCheck {
|
interface CategoryCheck {
|
||||||
@@ -415,44 +414,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
) => {
|
) => {
|
||||||
if (!checks || Object.keys(checks).length === 0) return null
|
if (!checks || Object.keys(checks).length === 0) return null
|
||||||
|
|
||||||
// Sort checks: non-disk entries first, then disk entries sorted by device name
|
|
||||||
const sortedEntries = Object.entries(checks)
|
|
||||||
.filter(([, checkData]) => checkData.installed !== false)
|
|
||||||
.sort(([keyA, dataA], [keyB, dataB]) => {
|
|
||||||
const isDiskA = dataA.is_disk_entry === true
|
|
||||||
const isDiskB = dataB.is_disk_entry === true
|
|
||||||
if (isDiskA && !isDiskB) return 1
|
|
||||||
if (!isDiskA && isDiskB) return -1
|
|
||||||
if (isDiskA && isDiskB) {
|
|
||||||
// Sort disks by device name
|
|
||||||
const deviceA = dataA.device || keyA
|
|
||||||
const deviceB = dataB.device || keyB
|
|
||||||
return deviceA.localeCompare(deviceB)
|
|
||||||
}
|
|
||||||
return 0
|
|
||||||
})
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="mt-2 space-y-0.5">
|
<div className="mt-2 space-y-0.5">
|
||||||
{sortedEntries.map(([checkKey, checkData]) => {
|
{Object.entries(checks)
|
||||||
|
.filter(([, checkData]) => checkData.installed !== false)
|
||||||
|
.map(([checkKey, checkData]) => {
|
||||||
const isDismissable = checkData.dismissable === true
|
const isDismissable = checkData.dismissable === true
|
||||||
const checkStatus = checkData.status?.toUpperCase() || "OK"
|
const checkStatus = checkData.status?.toUpperCase() || "OK"
|
||||||
const isDiskEntry = checkData.is_disk_entry === true
|
|
||||||
|
|
||||||
// For disk entries, format label specially
|
|
||||||
let displayLabel = formatCheckLabel(checkKey)
|
|
||||||
let diskIcon = null
|
|
||||||
if (isDiskEntry) {
|
|
||||||
displayLabel = checkData.device || checkKey.replace(/_/g, '/')
|
|
||||||
const diskType = checkData.disk_type || ''
|
|
||||||
if (diskType === 'USB') {
|
|
||||||
diskIcon = <Usb className="h-3 w-3 text-orange-400 mr-1" />
|
|
||||||
} else if (diskType === 'NVMe') {
|
|
||||||
diskIcon = <HardDrive className="h-3 w-3 text-blue-400 mr-1" />
|
|
||||||
} else {
|
|
||||||
diskIcon = <HardDrive className="h-3 w-3 text-muted-foreground mr-1" />
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div
|
<div
|
||||||
@@ -461,15 +429,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
>
|
>
|
||||||
<div className="flex items-start gap-1.5 sm:gap-2 min-w-0 flex-1">
|
<div className="flex items-start gap-1.5 sm:gap-2 min-w-0 flex-1">
|
||||||
<span className="mt-0.5 shrink-0">{getStatusIcon(checkData.dismissed ? "INFO" : checkData.status, "sm")}</span>
|
<span className="mt-0.5 shrink-0">{getStatusIcon(checkData.dismissed ? "INFO" : checkData.status, "sm")}</span>
|
||||||
<span className="font-medium shrink-0 flex items-center">
|
<span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
|
||||||
{diskIcon}
|
|
||||||
{displayLabel}
|
|
||||||
{isDiskEntry && checkData.disk_type && (
|
|
||||||
<Badge variant="outline" className="ml-1.5 text-[8px] px-1 py-0 h-3.5 shrink-0">
|
|
||||||
{checkData.disk_type}
|
|
||||||
</Badge>
|
|
||||||
)}
|
|
||||||
</span>
|
|
||||||
<span className="text-muted-foreground break-words whitespace-pre-wrap min-w-0">{checkData.detail}</span>
|
<span className="text-muted-foreground break-words whitespace-pre-wrap min-w-0">{checkData.detail}</span>
|
||||||
{checkData.dismissed && (
|
{checkData.dismissed && (
|
||||||
<Badge variant="outline" className="text-[9px] px-1 py-0 h-4 shrink-0 text-blue-400 border-blue-400/30">
|
<Badge variant="outline" className="text-[9px] px-1 py-0 h-4 shrink-0 text-blue-400 border-blue-400/30">
|
||||||
@@ -499,7 +459,6 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
)}
|
)}
|
||||||
</Button>
|
</Button>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1016,59 +1016,34 @@ export function StorageOverview() {
|
|||||||
className="sm:hidden border border-white/10 rounded-lg p-4 cursor-pointer bg-white/5 transition-colors"
|
className="sm:hidden border border-white/10 rounded-lg p-4 cursor-pointer bg-white/5 transition-colors"
|
||||||
onClick={() => handleDiskClick(disk)}
|
onClick={() => handleDiskClick(disk)}
|
||||||
>
|
>
|
||||||
<div className="space-y-3">
|
<div className="space-y-2 mb-3">
|
||||||
{/* Header row */}
|
<div className="flex items-center gap-2">
|
||||||
<div className="flex items-center justify-between">
|
<Usb className="h-5 w-5 text-orange-400 flex-shrink-0" />
|
||||||
<div className="flex items-center gap-2">
|
<h3 className="font-semibold">/dev/{disk.name}</h3>
|
||||||
<Usb className="h-5 w-5 text-orange-400 flex-shrink-0" />
|
<Badge className="bg-orange-500/10 text-orange-400 border-orange-500/20 text-[10px] px-1.5">USB</Badge>
|
||||||
<h3 className="font-semibold">/dev/{disk.name}</h3>
|
</div>
|
||||||
<Badge className="bg-orange-500/10 text-orange-400 border-orange-500/20 text-[10px] px-1.5">USB</Badge>
|
<div className="flex items-center justify-between gap-3 pl-7">
|
||||||
</div>
|
{disk.model && disk.model !== "Unknown" && (
|
||||||
<div className="flex items-center gap-2">
|
<p className="text-sm text-muted-foreground truncate flex-1 min-w-0">{disk.model}</p>
|
||||||
|
)}
|
||||||
|
<div className="flex items-center gap-3 flex-shrink-0">
|
||||||
{disk.temperature > 0 && (
|
{disk.temperature > 0 && (
|
||||||
<div className="flex items-center gap-1">
|
<div className="flex items-center gap-1">
|
||||||
<Thermometer className={`h-3.5 w-3.5 ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`} />
|
<Thermometer className={`h-4 w-4 ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`} />
|
||||||
<span className={`text-xs font-medium ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`}>
|
<span className={`text-sm font-medium ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`}>
|
||||||
{disk.temperature}°C
|
{disk.temperature}°C
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
{getHealthBadge(disk.health)}
|
{getHealthBadge(disk.health)}
|
||||||
|
{(disk.observations_count ?? 0) > 0 && (
|
||||||
|
<Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
|
||||||
|
<Info className="h-3 w-3" />
|
||||||
|
{disk.observations_count}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Model if available */}
|
|
||||||
{disk.model && disk.model !== "Unknown" && (
|
|
||||||
<p className="text-sm text-muted-foreground truncate pl-7">{disk.model}</p>
|
|
||||||
)}
|
|
||||||
|
|
||||||
{/* Info grid - 2 columns */}
|
|
||||||
<div className="grid grid-cols-2 gap-x-4 gap-y-2 pl-7 text-sm">
|
|
||||||
<div>
|
|
||||||
<span className="text-muted-foreground">Size</span>
|
|
||||||
<p className="font-medium">{disk.size_formatted || disk.size || "N/A"}</p>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<span className="text-muted-foreground">SMART Status</span>
|
|
||||||
<p className="font-medium capitalize">{disk.smart_status || "N/A"}</p>
|
|
||||||
</div>
|
|
||||||
{disk.serial && disk.serial !== "Unknown" && (
|
|
||||||
<div className="col-span-2">
|
|
||||||
<span className="text-muted-foreground">Serial</span>
|
|
||||||
<p className="font-medium text-xs truncate">{disk.serial}</p>
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* Observations badge if any */}
|
|
||||||
{(disk.observations_count ?? 0) > 0 && (
|
|
||||||
<div className="pl-7">
|
|
||||||
<Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
|
|
||||||
<Info className="h-3 w-3" />
|
|
||||||
{disk.observations_count} observation{disk.observations_count > 1 ? 's' : ''}
|
|
||||||
</Badge>
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -1314,7 +1289,7 @@ export function StorageOverview() {
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Observations Section */}
|
{/* Observations Section */}
|
||||||
{(diskObservations.length > 0 || loadingObservations || (selectedDisk.observations_count ?? 0) > 0) && (
|
{(diskObservations.length > 0 || loadingObservations) && (
|
||||||
<div className="border-t pt-4">
|
<div className="border-t pt-4">
|
||||||
<h4 className="font-semibold mb-2 flex items-center gap-2">
|
<h4 className="font-semibold mb-2 flex items-center gap-2">
|
||||||
<Info className="h-4 w-4 text-blue-400" />
|
<Info className="h-4 w-4 text-blue-400" />
|
||||||
|
|||||||
@@ -2554,55 +2554,6 @@ def get_smart_data(disk_name):
|
|||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
# ── Integrate persistent worst_health ──
|
|
||||||
# The health should never improve from a previous worst state without admin intervention.
|
|
||||||
# This prevents disks from showing "healthy" after they had issues that may have auto-resolved.
|
|
||||||
try:
|
|
||||||
current_health = smart_data['health']
|
|
||||||
serial = smart_data.get('serial', '')
|
|
||||||
|
|
||||||
# Get persistent worst_health
|
|
||||||
worst_info = health_persistence.get_disk_worst_health(disk_name, serial if serial != 'Unknown' else None)
|
|
||||||
|
|
||||||
if worst_info:
|
|
||||||
worst_health = worst_info.get('worst_health', 'healthy')
|
|
||||||
admin_cleared = worst_info.get('admin_cleared', False)
|
|
||||||
|
|
||||||
# Only apply worst_health if not cleared by admin
|
|
||||||
if not admin_cleared:
|
|
||||||
severity_order = {'unknown': -1, 'healthy': 0, 'warning': 1, 'critical': 2}
|
|
||||||
current_severity = severity_order.get(current_health, 0)
|
|
||||||
worst_severity = severity_order.get(worst_health, 0)
|
|
||||||
|
|
||||||
# If worst_health is worse than current, use worst_health
|
|
||||||
if worst_severity > current_severity:
|
|
||||||
smart_data['health'] = worst_health
|
|
||||||
smart_data['health_source'] = 'persistent'
|
|
||||||
smart_data['worst_health_date'] = worst_info.get('worst_health_date')
|
|
||||||
smart_data['worst_health_reason'] = worst_info.get('worst_health_reason', '')
|
|
||||||
|
|
||||||
# Update worst_health if current is worse (and not already stored)
|
|
||||||
if current_health in ('warning', 'critical'):
|
|
||||||
health_reason = ''
|
|
||||||
if smart_data.get('pending_sectors', 0) > 0:
|
|
||||||
health_reason = f"{smart_data['pending_sectors']} pending sector(s)"
|
|
||||||
if smart_data.get('reallocated_sectors', 0) > 0:
|
|
||||||
if health_reason:
|
|
||||||
health_reason += f", {smart_data['reallocated_sectors']} reallocated"
|
|
||||||
else:
|
|
||||||
health_reason = f"{smart_data['reallocated_sectors']} reallocated sector(s)"
|
|
||||||
if smart_data.get('smart_status') == 'failed':
|
|
||||||
health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
|
|
||||||
|
|
||||||
health_persistence.update_disk_worst_health(
|
|
||||||
disk_name,
|
|
||||||
serial if serial != 'Unknown' else None,
|
|
||||||
current_health,
|
|
||||||
health_reason
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
# print(f"[v0] Error integrating worst_health: {e}")
|
|
||||||
pass
|
|
||||||
|
|
||||||
return smart_data
|
return smart_data
|
||||||
|
|
||||||
|
|||||||
@@ -1034,19 +1034,38 @@ class HealthMonitor:
|
|||||||
io_error_key = f'disk_{device}'
|
io_error_key = f'disk_{device}'
|
||||||
error_key = f'smart_{device}'
|
error_key = f'smart_{device}'
|
||||||
reason = f'{disk}: {issue["reason"]}'
|
reason = f'{disk}: {issue["reason"]}'
|
||||||
|
severity = issue.get('status', 'WARNING')
|
||||||
|
|
||||||
|
# Get serial for this disk to properly track it (important for USB disks)
|
||||||
|
disk_serial = ''
|
||||||
|
disk_model = ''
|
||||||
|
try:
|
||||||
|
smart_result = subprocess.run(
|
||||||
|
['smartctl', '-i', '-j', f'/dev/{device}'],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if smart_result.returncode in (0, 4):
|
||||||
|
import json
|
||||||
|
smart_data = json.loads(smart_result.stdout)
|
||||||
|
disk_serial = smart_data.get('serial_number', '')
|
||||||
|
disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if (not health_persistence.is_error_active(io_error_key, category='disks') and
|
if (not health_persistence.is_error_active(io_error_key, category='disks') and
|
||||||
not health_persistence.is_error_active(error_key, category='disks')):
|
not health_persistence.is_error_active(error_key, category='disks')):
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key=error_key,
|
error_key=error_key,
|
||||||
category='disks',
|
category='disks',
|
||||||
severity=issue.get('status', 'WARNING'),
|
severity=severity,
|
||||||
reason=reason,
|
reason=reason,
|
||||||
details={
|
details={
|
||||||
'disk': device,
|
'disk': device,
|
||||||
'device': disk,
|
'device': disk,
|
||||||
'block_device': device,
|
'block_device': device,
|
||||||
'serial': '',
|
'serial': disk_serial,
|
||||||
|
'model': disk_model,
|
||||||
'smart_status': 'WARNING',
|
'smart_status': 'WARNING',
|
||||||
'smart_lines': issue.get('smart_lines', []),
|
'smart_lines': issue.get('smart_lines', []),
|
||||||
'io_lines': issue.get('io_lines', []),
|
'io_lines': issue.get('io_lines', []),
|
||||||
@@ -1055,6 +1074,12 @@ class HealthMonitor:
|
|||||||
'dismissable': True,
|
'dismissable': True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
# Update worst_health for the disk (persists even if current error clears)
|
||||||
|
# Use serial for proper USB disk tracking
|
||||||
|
health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower())
|
||||||
|
# Also register the disk for observation tracking
|
||||||
|
if disk_serial:
|
||||||
|
health_persistence.register_disk(device, disk_serial, disk_model, 0)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1073,16 +1098,205 @@ class HealthMonitor:
|
|||||||
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
|
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
|
||||||
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
|
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
|
||||||
storage_details[disk_path] = disk_info
|
storage_details[disk_path] = disk_info
|
||||||
|
# Update worst_health for I/O errors
|
||||||
|
device = disk_path.replace('/dev/', '')
|
||||||
|
io_severity = disk_info.get('status', 'WARNING').lower()
|
||||||
|
|
||||||
# Build checks dict from storage_details, adding OK entries for items with no issues
|
# Get serial for proper disk tracking (important for USB)
|
||||||
|
io_serial = ''
|
||||||
|
io_model = ''
|
||||||
|
try:
|
||||||
|
smart_result = subprocess.run(
|
||||||
|
['smartctl', '-i', '-j', f'/dev/{device}'],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if smart_result.returncode in (0, 4):
|
||||||
|
import json
|
||||||
|
smart_data = json.loads(smart_result.stdout)
|
||||||
|
io_serial = smart_data.get('serial_number', '')
|
||||||
|
io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity)
|
||||||
|
if io_serial:
|
||||||
|
health_persistence.register_disk(device, io_serial, io_model, 0)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Build checks dict from storage_details
|
||||||
|
# We consolidate disk error entries (like /Dev/Sda) into physical disk entries
|
||||||
|
# and only show disks with problems (not healthy ones).
|
||||||
checks = {}
|
checks = {}
|
||||||
|
disk_errors_by_device = {} # Collect disk errors for consolidation
|
||||||
|
|
||||||
for key, val in storage_details.items():
|
for key, val in storage_details.items():
|
||||||
|
# Check if this is a disk device entry (e.g., /Dev/Sda, /dev/sda, sda)
|
||||||
|
key_lower = key.lower()
|
||||||
|
is_disk_entry = (
|
||||||
|
key_lower.startswith('/dev/') or
|
||||||
|
key_lower.startswith('dev/') or
|
||||||
|
(len(key_lower) <= 10 and (key_lower.startswith('sd') or
|
||||||
|
key_lower.startswith('nvme') or key_lower.startswith('hd')))
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_disk_entry:
|
||||||
|
# Extract device name and collect for consolidation
|
||||||
|
device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
|
||||||
|
if device_name and len(device_name) <= 15:
|
||||||
|
if device_name not in disk_errors_by_device:
|
||||||
|
disk_errors_by_device[device_name] = {
|
||||||
|
'status': val.get('status', 'WARNING'),
|
||||||
|
'detail': val.get('reason', ''),
|
||||||
|
'error_key': val.get('error_key'),
|
||||||
|
'dismissable': val.get('dismissable', True),
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# Merge: keep worst status
|
||||||
|
existing = disk_errors_by_device[device_name]
|
||||||
|
if val.get('status') == 'CRITICAL':
|
||||||
|
existing['status'] = 'CRITICAL'
|
||||||
|
# Append detail if different
|
||||||
|
new_detail = val.get('reason', '')
|
||||||
|
if new_detail and new_detail not in existing.get('detail', ''):
|
||||||
|
existing['detail'] = f"{existing['detail']}; {new_detail}".strip('; ')
|
||||||
|
continue # Don't add raw disk error entry, we'll add consolidated later
|
||||||
|
|
||||||
|
# Non-disk entries go directly to checks
|
||||||
checks[key] = {
|
checks[key] = {
|
||||||
'status': val.get('status', 'OK'),
|
'status': val.get('status', 'OK'),
|
||||||
'detail': val.get('reason', 'OK'),
|
'detail': val.get('reason', 'OK'),
|
||||||
**{k: v for k, v in val.items() if k not in ('status', 'reason')}
|
**{k: v for k, v in val.items() if k not in ('status', 'reason')}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Get physical disk info for matching errors to disks
|
||||||
|
# This uses the same detection as flask_server.py /api/storage/info
|
||||||
|
physical_disks = {}
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
['lsblk', '-b', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN'],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 3 and parts[2] == 'disk':
|
||||||
|
disk_name = parts[0]
|
||||||
|
# Skip virtual devices
|
||||||
|
if disk_name.startswith(('zd', 'zram', 'loop', 'ram', 'dm-')):
|
||||||
|
continue
|
||||||
|
tran = parts[3].upper() if len(parts) > 3 else ''
|
||||||
|
is_usb = tran == 'USB'
|
||||||
|
is_nvme = disk_name.startswith('nvme')
|
||||||
|
|
||||||
|
# Get serial from smartctl
|
||||||
|
serial = ''
|
||||||
|
model = ''
|
||||||
|
try:
|
||||||
|
smart_result = subprocess.run(
|
||||||
|
['smartctl', '-i', '-j', f'/dev/{disk_name}'],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if smart_result.returncode in (0, 4): # 4 = SMART not available but info OK
|
||||||
|
import json
|
||||||
|
smart_data = json.loads(smart_result.stdout)
|
||||||
|
serial = smart_data.get('serial_number', '')
|
||||||
|
model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
physical_disks[disk_name] = {
|
||||||
|
'serial': serial,
|
||||||
|
'model': model,
|
||||||
|
'is_usb': is_usb,
|
||||||
|
'is_nvme': is_nvme,
|
||||||
|
'disk_type': 'USB' if is_usb else ('NVMe' if is_nvme else 'SATA'),
|
||||||
|
}
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Add consolidated disk entries (only for disks with errors)
|
||||||
|
for device_name, error_info in disk_errors_by_device.items():
|
||||||
|
# Try to find this disk in physical_disks for enriched info
|
||||||
|
disk_info = physical_disks.get(device_name, {})
|
||||||
|
|
||||||
|
# If not found by name, try to match by serial (from error details)
|
||||||
|
if not disk_info:
|
||||||
|
error_serial = error_info.get('serial', '')
|
||||||
|
if error_serial:
|
||||||
|
for dk, di in physical_disks.items():
|
||||||
|
if di.get('serial', '').lower() == error_serial.lower():
|
||||||
|
disk_info = di
|
||||||
|
device_name = dk # Update device name to matched disk
|
||||||
|
break
|
||||||
|
|
||||||
|
# Determine disk type
|
||||||
|
disk_type = disk_info.get('disk_type', 'SATA')
|
||||||
|
if not disk_info:
|
||||||
|
# Fallback detection
|
||||||
|
if device_name.startswith('nvme'):
|
||||||
|
disk_type = 'NVMe'
|
||||||
|
else:
|
||||||
|
# Check if USB via sysfs
|
||||||
|
try:
|
||||||
|
usb_check = subprocess.run(
|
||||||
|
['readlink', '-f', f'/sys/block/{device_name}'],
|
||||||
|
capture_output=True, text=True, timeout=2
|
||||||
|
)
|
||||||
|
if 'usb' in usb_check.stdout.lower():
|
||||||
|
disk_type = 'USB'
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
serial = disk_info.get('serial', '')
|
||||||
|
model = disk_info.get('model', '')
|
||||||
|
|
||||||
|
# Get worst_health from persistence
|
||||||
|
try:
|
||||||
|
health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None)
|
||||||
|
worst_health = health_status.get('worst_health', 'healthy')
|
||||||
|
|
||||||
|
# Final health = max(current, worst)
|
||||||
|
health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2}
|
||||||
|
current_level = health_order.get(error_info['status'].lower(), 1)
|
||||||
|
worst_level = health_order.get(worst_health.lower(), 0)
|
||||||
|
|
||||||
|
if worst_level > current_level:
|
||||||
|
# worst_health is worse, use it
|
||||||
|
final_status = worst_health.upper()
|
||||||
|
else:
|
||||||
|
final_status = error_info['status']
|
||||||
|
except Exception:
|
||||||
|
final_status = error_info['status']
|
||||||
|
|
||||||
|
# Build detail string with serial/model if available
|
||||||
|
detail = error_info['detail']
|
||||||
|
if serial and serial not in detail:
|
||||||
|
detail = f"{serial} - {detail}"
|
||||||
|
|
||||||
|
# Create consolidated disk entry
|
||||||
|
check_key = f'/dev/{device_name}'
|
||||||
|
checks[check_key] = {
|
||||||
|
'status': final_status,
|
||||||
|
'detail': detail,
|
||||||
|
'disk_type': disk_type,
|
||||||
|
'device': f'/dev/{device_name}',
|
||||||
|
'serial': serial,
|
||||||
|
'model': model,
|
||||||
|
'error_key': error_info.get('error_key') or f'disk_{device_name}',
|
||||||
|
'dismissable': error_info.get('dismissable', True),
|
||||||
|
'is_disk_entry': True,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Register disk in persistence if not already (for worst_health tracking)
|
||||||
|
try:
|
||||||
|
health_persistence.register_disk(device_name, serial if serial else None, model, 0)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# ALWAYS add descriptive entries for capabilities this server has.
|
# ALWAYS add descriptive entries for capabilities this server has.
|
||||||
# When everything is OK, they show as OK. When there are issues,
|
# When everything is OK, they show as OK. When there are issues,
|
||||||
# they still appear so the user can see the full picture (e.g.
|
# they still appear so the user can see the full picture (e.g.
|
||||||
@@ -1105,120 +1319,8 @@ class HealthMonitor:
|
|||||||
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
|
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
|
||||||
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
|
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
|
||||||
|
|
||||||
# Get physical disks list for UI display
|
|
||||||
physical_disks = self._get_physical_disks_list()
|
|
||||||
|
|
||||||
# Collect disk error entries (SMART, I/O, etc.) from checks that should be merged with disk entries
|
|
||||||
# These have keys like '/Dev/Sda', '/dev/sda', 'sda', etc.
|
|
||||||
disk_errors_by_device = {}
|
|
||||||
keys_to_remove = []
|
|
||||||
for key, val in checks.items():
|
|
||||||
# Skip non-disk error entries (like lvm_check, root_fs, etc.)
|
|
||||||
key_lower = key.lower()
|
|
||||||
|
|
||||||
# Check if this looks like a disk error entry
|
|
||||||
is_disk_error = False
|
|
||||||
device_name = None
|
|
||||||
|
|
||||||
if key_lower.startswith('/dev/') or key_lower.startswith('dev/'):
|
|
||||||
# Keys like '/Dev/Sda', '/dev/sda'
|
|
||||||
device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
|
|
||||||
is_disk_error = True
|
|
||||||
elif key_lower.startswith('sd') or key_lower.startswith('nvme') or key_lower.startswith('hd'):
|
|
||||||
# Keys like 'sda', 'nvme0n1'
|
|
||||||
device_name = key_lower
|
|
||||||
is_disk_error = True
|
|
||||||
|
|
||||||
if is_disk_error and device_name and len(device_name) <= 15:
|
|
||||||
# Store the error info, merging if we already have an error for this device
|
|
||||||
if device_name not in disk_errors_by_device:
|
|
||||||
disk_errors_by_device[device_name] = {
|
|
||||||
'status': val.get('status', 'WARNING'),
|
|
||||||
'detail': val.get('detail', val.get('reason', '')),
|
|
||||||
'error_key': val.get('error_key'),
|
|
||||||
'dismissable': val.get('dismissable', True),
|
|
||||||
'dismissed': val.get('dismissed', False),
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
# Merge: keep the worst status
|
|
||||||
existing = disk_errors_by_device[device_name]
|
|
||||||
if val.get('status') == 'CRITICAL':
|
|
||||||
existing['status'] = 'CRITICAL'
|
|
||||||
# Append details
|
|
||||||
new_detail = val.get('detail', val.get('reason', ''))
|
|
||||||
if new_detail and new_detail not in existing.get('detail', ''):
|
|
||||||
existing['detail'] = f"{existing.get('detail', '')}; {new_detail}".strip('; ')
|
|
||||||
keys_to_remove.append(key)
|
|
||||||
|
|
||||||
# Remove the old disk error entries - they'll be merged into disk entries
|
|
||||||
for key in keys_to_remove:
|
|
||||||
del checks[key]
|
|
||||||
|
|
||||||
# Add individual disk checks for UI display (like Network interfaces)
|
|
||||||
for disk in physical_disks:
|
|
||||||
device = disk.get('device', '')
|
|
||||||
name = disk.get('name', '')
|
|
||||||
serial = disk.get('serial', '')
|
|
||||||
final_health = disk.get('final_health', 'healthy')
|
|
||||||
final_reason = disk.get('final_reason', '')
|
|
||||||
is_usb = disk.get('is_usb', False)
|
|
||||||
|
|
||||||
# Format check key - use device path for uniqueness
|
|
||||||
check_key = device.lower().replace('/', '_') # e.g., _dev_sda
|
|
||||||
|
|
||||||
# Check if there's a disk error (SMART, I/O, etc.) for this disk
|
|
||||||
disk_error = disk_errors_by_device.get(name.lower())
|
|
||||||
|
|
||||||
# Determine status - use disk error status if present, otherwise use final_health
|
|
||||||
if disk_error and disk_error.get('status') in ('WARNING', 'CRITICAL'):
|
|
||||||
status = disk_error['status']
|
|
||||||
error_detail = disk_error.get('detail', '')
|
|
||||||
elif final_health == 'critical':
|
|
||||||
status = 'CRITICAL'
|
|
||||||
error_detail = ''
|
|
||||||
elif final_health == 'warning':
|
|
||||||
status = 'WARNING'
|
|
||||||
error_detail = ''
|
|
||||||
else:
|
|
||||||
status = 'OK'
|
|
||||||
error_detail = ''
|
|
||||||
|
|
||||||
# Build detail string
|
|
||||||
disk_type = 'USB' if is_usb else ('NVMe' if disk.get('is_nvme') else 'SATA')
|
|
||||||
detail = f'{serial}' if serial else 'Unknown serial'
|
|
||||||
if final_reason:
|
|
||||||
detail += f' - {final_reason}'
|
|
||||||
elif error_detail:
|
|
||||||
detail += f' - {error_detail}'
|
|
||||||
|
|
||||||
# Only add to checks if not already present
|
|
||||||
if check_key not in checks:
|
|
||||||
checks[check_key] = {
|
|
||||||
'status': status,
|
|
||||||
'detail': detail,
|
|
||||||
'device': device,
|
|
||||||
'serial': serial,
|
|
||||||
'disk_type': disk_type,
|
|
||||||
'is_disk_entry': True, # Flag to identify disk entries in frontend
|
|
||||||
'worst_health': disk.get('worst_health', 'healthy'),
|
|
||||||
'worst_health_date': disk.get('worst_health_date'),
|
|
||||||
'admin_cleared': disk.get('admin_cleared', False),
|
|
||||||
}
|
|
||||||
|
|
||||||
# If disk has issues, it needs an error_key for dismiss functionality
|
|
||||||
if status != 'OK':
|
|
||||||
# Use disk error_key if available, otherwise generate one
|
|
||||||
if disk_error and disk_error.get('error_key'):
|
|
||||||
checks[check_key]['error_key'] = disk_error['error_key']
|
|
||||||
else:
|
|
||||||
checks[check_key]['error_key'] = f'disk_{name}_{serial}' if serial else f'disk_{name}'
|
|
||||||
checks[check_key]['dismissable'] = True
|
|
||||||
# Preserve dismissed state from disk error
|
|
||||||
if disk_error and disk_error.get('dismissed'):
|
|
||||||
checks[check_key]['dismissed'] = True
|
|
||||||
|
|
||||||
if not issues:
|
if not issues:
|
||||||
return {'status': 'OK', 'checks': checks, 'physical_disks': physical_disks}
|
return {'status': 'OK', 'checks': checks}
|
||||||
|
|
||||||
# ── Mark dismissed checks ──
|
# ── Mark dismissed checks ──
|
||||||
# If an error_key in a check has been acknowledged (dismissed) in the
|
# If an error_key in a check has been acknowledged (dismissed) in the
|
||||||
@@ -1250,7 +1352,6 @@ class HealthMonitor:
|
|||||||
'reason': '; '.join(issues[:3]),
|
'reason': '; '.join(issues[:3]),
|
||||||
'details': storage_details,
|
'details': storage_details,
|
||||||
'checks': checks,
|
'checks': checks,
|
||||||
'physical_disks': physical_disks,
|
|
||||||
'all_dismissed': True,
|
'all_dismissed': True,
|
||||||
}
|
}
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -1265,8 +1366,7 @@ class HealthMonitor:
|
|||||||
'status': 'CRITICAL' if has_critical else 'WARNING',
|
'status': 'CRITICAL' if has_critical else 'WARNING',
|
||||||
'reason': '; '.join(issues[:3]),
|
'reason': '; '.join(issues[:3]),
|
||||||
'details': storage_details,
|
'details': storage_details,
|
||||||
'checks': checks,
|
'checks': checks
|
||||||
'physical_disks': physical_disks
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
|
def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
|
||||||
@@ -1354,218 +1454,6 @@ class HealthMonitor:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return {'status': 'OK'}
|
return {'status': 'OK'}
|
||||||
|
|
||||||
def _get_physical_disks_list(self) -> List[Dict[str, Any]]:
|
|
||||||
"""Get list of all physical disks with their health status.
|
|
||||||
|
|
||||||
Combines real-time SMART data with persistent worst_health state.
|
|
||||||
Returns list suitable for display in Health Monitor UI.
|
|
||||||
"""
|
|
||||||
disks = []
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Get all block devices
|
|
||||||
result = subprocess.run(
|
|
||||||
['lsblk', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN,MODEL,SERIAL'],
|
|
||||||
capture_output=True, text=True, timeout=5
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode != 0:
|
|
||||||
return []
|
|
||||||
|
|
||||||
for line in result.stdout.strip().split('\n'):
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
parts = line.split(None, 5)
|
|
||||||
if len(parts) < 3:
|
|
||||||
continue
|
|
||||||
|
|
||||||
name = parts[0]
|
|
||||||
size = parts[1] if len(parts) > 1 else ''
|
|
||||||
dtype = parts[2] if len(parts) > 2 else ''
|
|
||||||
transport = parts[3] if len(parts) > 3 else ''
|
|
||||||
model = parts[4] if len(parts) > 4 else ''
|
|
||||||
serial = parts[5] if len(parts) > 5 else ''
|
|
||||||
|
|
||||||
# Only include disk type devices
|
|
||||||
if dtype != 'disk':
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip loop devices, ram disks, etc.
|
|
||||||
if name.startswith(('loop', 'ram', 'zram')):
|
|
||||||
continue
|
|
||||||
|
|
||||||
is_usb = transport.lower() == 'usb'
|
|
||||||
is_nvme = name.startswith('nvme')
|
|
||||||
|
|
||||||
# Get current SMART status
|
|
||||||
current_health = 'healthy'
|
|
||||||
smart_status = 'UNKNOWN'
|
|
||||||
pending_sectors = 0
|
|
||||||
reallocated_sectors = 0
|
|
||||||
|
|
||||||
try:
|
|
||||||
dev_path = f'/dev/{name}'
|
|
||||||
smart_result = subprocess.run(
|
|
||||||
['smartctl', '-H', '-A', dev_path],
|
|
||||||
capture_output=True, text=True, timeout=5
|
|
||||||
)
|
|
||||||
|
|
||||||
output = smart_result.stdout
|
|
||||||
|
|
||||||
# Check SMART overall status
|
|
||||||
if 'PASSED' in output:
|
|
||||||
smart_status = 'PASSED'
|
|
||||||
elif 'FAILED' in output:
|
|
||||||
smart_status = 'FAILED'
|
|
||||||
current_health = 'critical'
|
|
||||||
|
|
||||||
# Parse SMART attributes for pending/reallocated sectors
|
|
||||||
for attr_line in output.split('\n'):
|
|
||||||
if 'Current_Pending_Sector' in attr_line or 'Pending_Sector' in attr_line:
|
|
||||||
parts_attr = attr_line.split()
|
|
||||||
if parts_attr:
|
|
||||||
try:
|
|
||||||
pending_sectors = int(parts_attr[-1])
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
elif 'Reallocated_Sector' in attr_line:
|
|
||||||
parts_attr = attr_line.split()
|
|
||||||
if parts_attr:
|
|
||||||
try:
|
|
||||||
reallocated_sectors = int(parts_attr[-1])
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Determine current health based on sectors
|
|
||||||
if current_health != 'critical':
|
|
||||||
if pending_sectors > 10 or reallocated_sectors > 10:
|
|
||||||
current_health = 'critical'
|
|
||||||
elif pending_sectors > 0 or reallocated_sectors > 0:
|
|
||||||
current_health = 'warning'
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Build health reason
|
|
||||||
health_reason = ''
|
|
||||||
if pending_sectors > 0:
|
|
||||||
health_reason = f'{pending_sectors} pending sector(s)'
|
|
||||||
if reallocated_sectors > 0:
|
|
||||||
if health_reason:
|
|
||||||
health_reason += f', {reallocated_sectors} reallocated'
|
|
||||||
else:
|
|
||||||
health_reason = f'{reallocated_sectors} reallocated sector(s)'
|
|
||||||
if smart_status == 'FAILED':
|
|
||||||
health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
|
|
||||||
|
|
||||||
# Get persistent worst_health from database
|
|
||||||
worst_info = health_persistence.get_disk_worst_health(name, serial)
|
|
||||||
worst_health = worst_info.get('worst_health', 'healthy') if worst_info else 'healthy'
|
|
||||||
worst_health_date = worst_info.get('worst_health_date') if worst_info else None
|
|
||||||
worst_health_reason = worst_info.get('worst_health_reason', '') if worst_info else ''
|
|
||||||
admin_cleared = worst_info.get('admin_cleared', False) if worst_info else False
|
|
||||||
|
|
||||||
# Update worst_health if current is worse
|
|
||||||
if current_health != 'healthy':
|
|
||||||
updated = health_persistence.update_disk_worst_health(
|
|
||||||
name, serial, current_health, health_reason
|
|
||||||
)
|
|
||||||
if updated:
|
|
||||||
worst_health = current_health
|
|
||||||
worst_health_reason = health_reason
|
|
||||||
|
|
||||||
# Record as disk observation (for both internal and USB disks)
|
|
||||||
# This ensures SMART issues are tracked in observations
|
|
||||||
try:
|
|
||||||
obs_type = 'smart_error'
|
|
||||||
if pending_sectors and pending_sectors > 0:
|
|
||||||
obs_type = 'pending_sectors'
|
|
||||||
elif reallocated_sectors and reallocated_sectors > 0:
|
|
||||||
obs_type = 'reallocated_sectors'
|
|
||||||
elif smart_status == 'FAILED':
|
|
||||||
obs_type = 'smart_failed'
|
|
||||||
|
|
||||||
obs_sig = f'smart_{name}_{obs_type}_{pending_sectors}_{reallocated_sectors}'
|
|
||||||
health_persistence.record_disk_observation(
|
|
||||||
device_name=name,
|
|
||||||
serial=serial,
|
|
||||||
error_type=obs_type,
|
|
||||||
error_signature=obs_sig,
|
|
||||||
raw_message=f'/dev/{name}: {health_reason}',
|
|
||||||
severity=current_health,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Send smart_warning notification if this is a NEW issue
|
|
||||||
# (only when updated=True means this is first time seeing this state)
|
|
||||||
if updated:
|
|
||||||
try:
|
|
||||||
from notification_manager import notification_manager
|
|
||||||
notification_manager.send_notification(
|
|
||||||
event_type='smart_warning',
|
|
||||||
data={
|
|
||||||
'device': f'/dev/{name}',
|
|
||||||
'reason': health_reason,
|
|
||||||
'serial': serial or 'Unknown',
|
|
||||||
'model': model or 'Unknown',
|
|
||||||
'pending_sectors': pending_sectors,
|
|
||||||
'reallocated_sectors': reallocated_sectors,
|
|
||||||
'smart_status': smart_status,
|
|
||||||
'hostname': self._hostname,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Final health is the worse of current and persistent
|
|
||||||
severity_order = {'healthy': 0, 'warning': 1, 'critical': 2}
|
|
||||||
if severity_order.get(worst_health, 0) > severity_order.get(current_health, 0):
|
|
||||||
final_health = worst_health
|
|
||||||
final_reason = worst_health_reason
|
|
||||||
else:
|
|
||||||
final_health = current_health
|
|
||||||
final_reason = health_reason
|
|
||||||
|
|
||||||
# Get active observations count
|
|
||||||
obs = health_persistence.get_disk_observations(device_name=name, serial=serial)
|
|
||||||
active_observations = len(obs) if obs else 0
|
|
||||||
|
|
||||||
# Register disk in persistence (for tracking)
|
|
||||||
try:
|
|
||||||
health_persistence.register_disk(name, serial, model)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
disks.append({
|
|
||||||
'device': f'/dev/{name}',
|
|
||||||
'name': name,
|
|
||||||
'serial': serial or '',
|
|
||||||
'model': model or 'Unknown',
|
|
||||||
'size': size,
|
|
||||||
'transport': transport,
|
|
||||||
'is_usb': is_usb,
|
|
||||||
'is_nvme': is_nvme,
|
|
||||||
'smart_status': smart_status,
|
|
||||||
'current_health': current_health,
|
|
||||||
'current_health_reason': health_reason,
|
|
||||||
'worst_health': worst_health,
|
|
||||||
'worst_health_date': worst_health_date,
|
|
||||||
'worst_health_reason': worst_health_reason,
|
|
||||||
'final_health': final_health,
|
|
||||||
'final_reason': final_reason,
|
|
||||||
'pending_sectors': pending_sectors,
|
|
||||||
'reallocated_sectors': reallocated_sectors,
|
|
||||||
'active_observations': active_observations,
|
|
||||||
'admin_cleared': admin_cleared,
|
|
||||||
})
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthMonitor] Error getting physical disks list: {e}")
|
|
||||||
|
|
||||||
return disks
|
|
||||||
|
|
||||||
# This function is no longer used in get_detailed_status, but kept for reference if needed.
|
# This function is no longer used in get_detailed_status, but kept for reference if needed.
|
||||||
# The new _check_proxmox_storage function handles this logic better.
|
# The new _check_proxmox_storage function handles this logic better.
|
||||||
def _check_proxmox_storages(self) -> Dict[str, Any]:
|
def _check_proxmox_storages(self) -> Dict[str, Any]:
|
||||||
|
|||||||
@@ -164,14 +164,25 @@ class HealthPersistence:
|
|||||||
removed INTEGER DEFAULT 0,
|
removed INTEGER DEFAULT 0,
|
||||||
worst_health TEXT DEFAULT 'healthy',
|
worst_health TEXT DEFAULT 'healthy',
|
||||||
worst_health_date TEXT,
|
worst_health_date TEXT,
|
||||||
worst_health_reason TEXT,
|
admin_cleared TEXT,
|
||||||
admin_cleared INTEGER DEFAULT 0,
|
|
||||||
admin_cleared_date TEXT,
|
|
||||||
admin_cleared_note TEXT,
|
|
||||||
UNIQUE(device_name, serial)
|
UNIQUE(device_name, serial)
|
||||||
)
|
)
|
||||||
''')
|
''')
|
||||||
|
|
||||||
|
# Migration: add worst_health columns if they don't exist (for existing DBs)
|
||||||
|
try:
|
||||||
|
cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT "healthy"')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
cursor.execute('ALTER TABLE disk_registry ADD COLUMN admin_cleared TEXT')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Observation log: deduplicated error events per disk
|
# Observation log: deduplicated error events per disk
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
CREATE TABLE IF NOT EXISTS disk_observations (
|
CREATE TABLE IF NOT EXISTS disk_observations (
|
||||||
@@ -195,17 +206,6 @@ class HealthPersistence:
|
|||||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
|
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
|
||||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
|
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
|
||||||
|
|
||||||
# Migration: add worst_health columns to disk_registry if not present
|
|
||||||
cursor.execute("PRAGMA table_info(disk_registry)")
|
|
||||||
disk_columns = [col[1] for col in cursor.fetchall()]
|
|
||||||
if 'worst_health' not in disk_columns:
|
|
||||||
cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT 'healthy'")
|
|
||||||
cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT")
|
|
||||||
cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_reason TEXT")
|
|
||||||
cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared INTEGER DEFAULT 0")
|
|
||||||
cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_date TEXT")
|
|
||||||
cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_note TEXT")
|
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
@@ -1231,26 +1231,11 @@ class HealthPersistence:
|
|||||||
# a different device_name (e.g. 'ata8' instead of 'sdh'),
|
# a different device_name (e.g. 'ata8' instead of 'sdh'),
|
||||||
# update that entry's device_name so observations carry over.
|
# update that entry's device_name so observations carry over.
|
||||||
if serial:
|
if serial:
|
||||||
# Try exact match first
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT id, device_name FROM disk_registry
|
SELECT id, device_name FROM disk_registry
|
||||||
WHERE serial = ? AND serial != '' AND device_name != ?
|
WHERE serial = ? AND serial != '' AND device_name != ?
|
||||||
''', (serial, device_name))
|
''', (serial, device_name))
|
||||||
old_rows = cursor.fetchall()
|
old_rows = cursor.fetchall()
|
||||||
|
|
||||||
# If no exact match, try normalized match (for USB disks with special chars)
|
|
||||||
if not old_rows:
|
|
||||||
normalized = self._normalize_serial(serial)
|
|
||||||
if normalized and normalized != serial:
|
|
||||||
cursor.execute(
|
|
||||||
'SELECT id, device_name, serial FROM disk_registry '
|
|
||||||
'WHERE serial != "" AND device_name != ?', (device_name,))
|
|
||||||
for row in cursor.fetchall():
|
|
||||||
db_normalized = self._normalize_serial(row[2])
|
|
||||||
if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized:
|
|
||||||
old_rows.append((row[0], row[1]))
|
|
||||||
break
|
|
||||||
|
|
||||||
for old_id, old_dev in old_rows:
|
for old_id, old_dev in old_rows:
|
||||||
# Only consolidate ATA names -> block device names
|
# Only consolidate ATA names -> block device names
|
||||||
if old_dev.startswith('ata') and not device_name.startswith('ata'):
|
if old_dev.startswith('ata') and not device_name.startswith('ata'):
|
||||||
@@ -1288,23 +1273,6 @@ class HealthPersistence:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[HealthPersistence] Error registering disk {device_name}: {e}")
|
print(f"[HealthPersistence] Error registering disk {device_name}: {e}")
|
||||||
|
|
||||||
def _normalize_serial(self, serial: str) -> str:
|
|
||||||
"""Normalize serial number for comparison.
|
|
||||||
|
|
||||||
USB disks can have serials with escape sequences like \\x06\\x18
|
|
||||||
or non-printable characters. This normalizes them for matching.
|
|
||||||
"""
|
|
||||||
if not serial:
|
|
||||||
return ''
|
|
||||||
import re
|
|
||||||
# Remove escape sequences like \x06, \x18
|
|
||||||
normalized = re.sub(r'\\x[0-9a-fA-F]{2}', '', serial)
|
|
||||||
# Remove non-printable characters
|
|
||||||
normalized = ''.join(c for c in normalized if c.isprintable())
|
|
||||||
# Remove common prefixes that vary
|
|
||||||
normalized = normalized.strip()
|
|
||||||
return normalized
|
|
||||||
|
|
||||||
def _get_disk_registry_id(self, cursor, device_name: str,
|
def _get_disk_registry_id(self, cursor, device_name: str,
|
||||||
serial: Optional[str] = None) -> Optional[int]:
|
serial: Optional[str] = None) -> Optional[int]:
|
||||||
"""Find disk_registry.id, matching by serial first, then device_name.
|
"""Find disk_registry.id, matching by serial first, then device_name.
|
||||||
@@ -1313,25 +1281,12 @@ class HealthPersistence:
|
|||||||
checks entries with ATA names that share the same serial.
|
checks entries with ATA names that share the same serial.
|
||||||
"""
|
"""
|
||||||
if serial:
|
if serial:
|
||||||
# Try exact match first
|
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
|
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
|
||||||
(serial,))
|
(serial,))
|
||||||
row = cursor.fetchone()
|
row = cursor.fetchone()
|
||||||
if row:
|
if row:
|
||||||
return row[0]
|
return row[0]
|
||||||
|
|
||||||
# Try normalized serial match (for USB disks with special chars)
|
|
||||||
normalized = self._normalize_serial(serial)
|
|
||||||
if normalized and normalized != serial:
|
|
||||||
# Search for serials that start with or contain the normalized version
|
|
||||||
cursor.execute(
|
|
||||||
'SELECT id, serial FROM disk_registry WHERE serial != "" ORDER BY last_seen DESC')
|
|
||||||
for row in cursor.fetchall():
|
|
||||||
db_normalized = self._normalize_serial(row[1])
|
|
||||||
if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized:
|
|
||||||
return row[0]
|
|
||||||
|
|
||||||
# Fallback: match by device_name (strip /dev/ prefix)
|
# Fallback: match by device_name (strip /dev/ prefix)
|
||||||
clean_dev = device_name.replace('/dev/', '')
|
clean_dev = device_name.replace('/dev/', '')
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
@@ -1340,7 +1295,6 @@ class HealthPersistence:
|
|||||||
row = cursor.fetchone()
|
row = cursor.fetchone()
|
||||||
if row:
|
if row:
|
||||||
return row[0]
|
return row[0]
|
||||||
|
|
||||||
# Last resort: search for ATA-named entries that might refer to this device
|
# Last resort: search for ATA-named entries that might refer to this device
|
||||||
# This handles cases where observations were recorded under 'ata8'
|
# This handles cases where observations were recorded under 'ata8'
|
||||||
# but we're querying for 'sdh'
|
# but we're querying for 'sdh'
|
||||||
@@ -1353,6 +1307,131 @@ class HealthPersistence:
|
|||||||
pass
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def update_disk_worst_health(self, device_name: str, serial: Optional[str],
|
||||||
|
new_health: str) -> bool:
|
||||||
|
"""Update worst_health if new_health is worse than current.
|
||||||
|
|
||||||
|
Health hierarchy: healthy < warning < critical
|
||||||
|
Only escalates, never downgrades automatically.
|
||||||
|
|
||||||
|
Returns True if worst_health was updated.
|
||||||
|
"""
|
||||||
|
health_order = {'healthy': 0, 'warning': 1, 'critical': 2}
|
||||||
|
new_level = health_order.get(new_health.lower(), 0)
|
||||||
|
|
||||||
|
if new_level == 0: # healthy never updates worst_health
|
||||||
|
return False
|
||||||
|
|
||||||
|
now = datetime.now().isoformat()
|
||||||
|
try:
|
||||||
|
conn = self._get_conn()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
||||||
|
if not disk_id:
|
||||||
|
# Register disk first
|
||||||
|
self.register_disk(device_name.replace('/dev/', ''), serial)
|
||||||
|
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
||||||
|
|
||||||
|
if not disk_id:
|
||||||
|
conn.close()
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Get current worst_health
|
||||||
|
cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
current_worst = row[0] if row and row[0] else 'healthy'
|
||||||
|
current_level = health_order.get(current_worst.lower(), 0)
|
||||||
|
|
||||||
|
# Only update if new health is worse
|
||||||
|
if new_level > current_level:
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE disk_registry
|
||||||
|
SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL
|
||||||
|
WHERE id = ?
|
||||||
|
''', (new_health.lower(), now, disk_id))
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
return True
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Get the health status of a disk including worst_health.
|
||||||
|
|
||||||
|
Returns dict with:
|
||||||
|
- worst_health: 'healthy', 'warning', or 'critical'
|
||||||
|
- worst_health_date: ISO timestamp when worst_health was set
|
||||||
|
- admin_cleared: ISO timestamp if admin manually cleared the health
|
||||||
|
- observations_count: Number of recorded observations
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
conn = self._get_conn()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
||||||
|
if not disk_id:
|
||||||
|
conn.close()
|
||||||
|
return {'worst_health': 'healthy', 'observations_count': 0}
|
||||||
|
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT worst_health, worst_health_date, admin_cleared
|
||||||
|
FROM disk_registry WHERE id = ?
|
||||||
|
''', (disk_id,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
|
||||||
|
# Count observations
|
||||||
|
cursor.execute(
|
||||||
|
'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0',
|
||||||
|
(disk_id,))
|
||||||
|
obs_count = cursor.fetchone()[0]
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if row:
|
||||||
|
return {
|
||||||
|
'worst_health': row[0] or 'healthy',
|
||||||
|
'worst_health_date': row[1],
|
||||||
|
'admin_cleared': row[2],
|
||||||
|
'observations_count': obs_count
|
||||||
|
}
|
||||||
|
return {'worst_health': 'healthy', 'observations_count': obs_count}
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}")
|
||||||
|
return {'worst_health': 'healthy', 'observations_count': 0}
|
||||||
|
|
||||||
|
def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool:
|
||||||
|
"""Admin action: clear worst_health back to healthy.
|
||||||
|
|
||||||
|
This resets the health status but keeps all observations for audit.
|
||||||
|
Records when the admin cleared it for accountability.
|
||||||
|
"""
|
||||||
|
now = datetime.now().isoformat()
|
||||||
|
try:
|
||||||
|
conn = self._get_conn()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
||||||
|
if not disk_id:
|
||||||
|
conn.close()
|
||||||
|
return False
|
||||||
|
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE disk_registry
|
||||||
|
SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ?
|
||||||
|
WHERE id = ?
|
||||||
|
''', (now, disk_id))
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[HealthPersistence] Error clearing health for {device_name}: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
def record_disk_observation(self, device_name: str, serial: Optional[str],
|
def record_disk_observation(self, device_name: str, serial: Optional[str],
|
||||||
error_type: str, error_signature: str,
|
error_type: str, error_signature: str,
|
||||||
raw_message: str = '',
|
raw_message: str = '',
|
||||||
@@ -1391,6 +1470,10 @@ class HealthPersistence:
|
|||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
# Update worst_health based on observation severity
|
||||||
|
self.update_disk_worst_health(clean_dev, serial, severity)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[HealthPersistence] Error recording disk observation: {e}")
|
print(f"[HealthPersistence] Error recording disk observation: {e}")
|
||||||
|
|
||||||
@@ -1539,186 +1622,6 @@ class HealthPersistence:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[HealthPersistence] Error marking removed disks: {e}")
|
print(f"[HealthPersistence] Error marking removed disks: {e}")
|
||||||
|
|
||||||
# ────────────────────────────────────────────────────────────────
|
|
||||||
# Disk Worst Health State Tracking
|
|
||||||
# ────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
HEALTH_SEVERITY_ORDER = {'healthy': 0, 'warning': 1, 'critical': 2}
|
|
||||||
|
|
||||||
def update_disk_worst_health(self, device_name: str, serial: Optional[str],
|
|
||||||
health: str, reason: str = '') -> bool:
|
|
||||||
"""Update worst_health if the new health is worse than current.
|
|
||||||
|
|
||||||
Health progression is one-way: healthy -> warning -> critical
|
|
||||||
Only admin_clear_disk_health() can reset to healthy.
|
|
||||||
|
|
||||||
Returns True if worst_health was updated.
|
|
||||||
"""
|
|
||||||
health_lower = health.lower()
|
|
||||||
if health_lower not in self.HEALTH_SEVERITY_ORDER:
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
|
|
||||||
if not disk_id:
|
|
||||||
# Auto-register disk if not present
|
|
||||||
self.register_disk(device_name.replace('/dev/', ''), serial)
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
|
|
||||||
|
|
||||||
if not disk_id:
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Get current worst_health
|
|
||||||
cursor.execute('SELECT worst_health, admin_cleared FROM disk_registry WHERE id = ?', (disk_id,))
|
|
||||||
row = cursor.fetchone()
|
|
||||||
if not row:
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
|
|
||||||
current_worst = row[0] or 'healthy'
|
|
||||||
admin_cleared = row[1] or 0
|
|
||||||
|
|
||||||
# If admin cleared and new issue is the same or less severe, don't update
|
|
||||||
# But if admin cleared and issue escalates, update anyway
|
|
||||||
current_severity = self.HEALTH_SEVERITY_ORDER.get(current_worst, 0)
|
|
||||||
new_severity = self.HEALTH_SEVERITY_ORDER.get(health_lower, 0)
|
|
||||||
|
|
||||||
# Only update if new health is worse
|
|
||||||
if new_severity > current_severity:
|
|
||||||
now = datetime.now().isoformat()
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE disk_registry
|
|
||||||
SET worst_health = ?, worst_health_date = ?, worst_health_reason = ?,
|
|
||||||
admin_cleared = 0
|
|
||||||
WHERE id = ?
|
|
||||||
''', (health_lower, now, reason, disk_id))
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return True
|
|
||||||
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error updating disk worst_health: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_disk_worst_health(self, device_name: str, serial: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
|
||||||
"""Get the worst health state for a specific disk."""
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
|
|
||||||
if not disk_id:
|
|
||||||
conn.close()
|
|
||||||
return None
|
|
||||||
|
|
||||||
cursor.execute('''
|
|
||||||
SELECT worst_health, worst_health_date, worst_health_reason,
|
|
||||||
admin_cleared, admin_cleared_date, admin_cleared_note
|
|
||||||
FROM disk_registry WHERE id = ?
|
|
||||||
''', (disk_id,))
|
|
||||||
row = cursor.fetchone()
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
if row:
|
|
||||||
return {
|
|
||||||
'worst_health': row[0] or 'healthy',
|
|
||||||
'worst_health_date': row[1],
|
|
||||||
'worst_health_reason': row[2],
|
|
||||||
'admin_cleared': bool(row[3]),
|
|
||||||
'admin_cleared_date': row[4],
|
|
||||||
'admin_cleared_note': row[5],
|
|
||||||
}
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error getting disk worst_health: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
def admin_clear_disk_health(self, device_name: str, serial: Optional[str], note: str) -> bool:
|
|
||||||
"""Admin manually clears disk health history (e.g., after disk replacement).
|
|
||||||
|
|
||||||
Requires a note explaining why (for audit trail).
|
|
||||||
"""
|
|
||||||
if not note or len(note.strip()) < 5:
|
|
||||||
return False # Require meaningful note
|
|
||||||
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
|
|
||||||
if not disk_id:
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
|
|
||||||
now = datetime.now().isoformat()
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE disk_registry
|
|
||||||
SET worst_health = 'healthy', admin_cleared = 1,
|
|
||||||
admin_cleared_date = ?, admin_cleared_note = ?
|
|
||||||
WHERE id = ?
|
|
||||||
''', (now, note.strip(), disk_id))
|
|
||||||
|
|
||||||
# Also dismiss all active observations for this disk
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE disk_observations SET dismissed = 1 WHERE disk_registry_id = ?
|
|
||||||
''', (disk_id,))
|
|
||||||
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error clearing disk health: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_all_disks_health_summary(self) -> List[Dict[str, Any]]:
|
|
||||||
"""Get health summary for all registered disks (for Health Monitor listing).
|
|
||||||
|
|
||||||
Returns list of disks with their current and worst health states.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
cursor.execute('''
|
|
||||||
SELECT d.id, d.device_name, d.serial, d.model, d.size_bytes,
|
|
||||||
d.first_seen, d.last_seen, d.removed,
|
|
||||||
d.worst_health, d.worst_health_date, d.worst_health_reason,
|
|
||||||
d.admin_cleared, d.admin_cleared_date,
|
|
||||||
(SELECT COUNT(*) FROM disk_observations o
|
|
||||||
WHERE o.disk_registry_id = d.id AND o.dismissed = 0) as active_observations
|
|
||||||
FROM disk_registry d
|
|
||||||
WHERE d.removed = 0
|
|
||||||
ORDER BY d.device_name
|
|
||||||
''')
|
|
||||||
rows = cursor.fetchall()
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
return [{
|
|
||||||
'id': r[0],
|
|
||||||
'device_name': r[1],
|
|
||||||
'serial': r[2] or '',
|
|
||||||
'model': r[3] or 'Unknown',
|
|
||||||
'size_bytes': r[4],
|
|
||||||
'first_seen': r[5],
|
|
||||||
'last_seen': r[6],
|
|
||||||
'removed': bool(r[7]),
|
|
||||||
'worst_health': r[8] or 'healthy',
|
|
||||||
'worst_health_date': r[9],
|
|
||||||
'worst_health_reason': r[10] or '',
|
|
||||||
'admin_cleared': bool(r[11]),
|
|
||||||
'admin_cleared_date': r[12],
|
|
||||||
'active_observations': r[13],
|
|
||||||
} for r in rows]
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error getting disks health summary: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
# Global instance
|
# Global instance
|
||||||
health_persistence = HealthPersistence()
|
health_persistence = HealthPersistence()
|
||||||
|
|||||||
@@ -402,16 +402,47 @@ class JournalWatcher:
|
|||||||
entity = 'disk'
|
entity = 'disk'
|
||||||
entity_id = f'fs_{device}'
|
entity_id = f'fs_{device}'
|
||||||
|
|
||||||
# ── 24h dedup for filesystem errors per device ──
|
# ── Get disk serial for USB-aware cooldown ──
|
||||||
|
# USB disks can change device names (sda->sdb) on reconnect.
|
||||||
|
# Using serial as cooldown key ensures same physical disk
|
||||||
|
# shares one 24h cooldown regardless of device letter.
|
||||||
|
import os as _os
|
||||||
|
base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
|
||||||
|
disk_serial = ''
|
||||||
|
is_usb_disk = False
|
||||||
|
if base_dev:
|
||||||
|
try:
|
||||||
|
# Check if USB via sysfs
|
||||||
|
sysfs_link = subprocess.run(
|
||||||
|
['readlink', '-f', f'/sys/block/{base_dev}'],
|
||||||
|
capture_output=True, text=True, timeout=2
|
||||||
|
)
|
||||||
|
is_usb_disk = 'usb' in sysfs_link.stdout.lower()
|
||||||
|
|
||||||
|
# Get serial from smartctl
|
||||||
|
smart_result = subprocess.run(
|
||||||
|
['smartctl', '-i', '-j', f'/dev/{base_dev}'],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if smart_result.returncode in (0, 4):
|
||||||
|
import json
|
||||||
|
smart_data = json.loads(smart_result.stdout)
|
||||||
|
disk_serial = smart_data.get('serial_number', '')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ── 24h dedup for filesystem errors ──
|
||||||
|
# Use serial for USB disks, device name for others
|
||||||
now_fs = time.time()
|
now_fs = time.time()
|
||||||
fs_dedup_key = f'fs_{device}'
|
if is_usb_disk and disk_serial:
|
||||||
|
fs_dedup_key = f'fs_serial_{disk_serial}'
|
||||||
|
else:
|
||||||
|
fs_dedup_key = f'fs_{device}'
|
||||||
last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
|
last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
|
||||||
if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
|
if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
|
||||||
return # Already notified for this device recently
|
return # Already notified for this device recently
|
||||||
|
|
||||||
# ── SMART + device existence gating ──
|
# ── Device existence gating ──
|
||||||
import os as _os
|
|
||||||
base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
|
|
||||||
device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
|
device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
|
||||||
|
|
||||||
if not device_exists and device != 'unknown':
|
if not device_exists and device != 'unknown':
|
||||||
@@ -749,7 +780,6 @@ class JournalWatcher:
|
|||||||
"""Extract device info from a smartd system-mail and record as disk observation."""
|
"""Extract device info from a smartd system-mail and record as disk observation."""
|
||||||
try:
|
try:
|
||||||
import re as _re
|
import re as _re
|
||||||
import subprocess
|
|
||||||
from health_persistence import health_persistence
|
from health_persistence import health_persistence
|
||||||
|
|
||||||
# Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
|
# Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
|
||||||
@@ -770,21 +800,6 @@ class JournalWatcher:
|
|||||||
if model_match:
|
if model_match:
|
||||||
model = model_match.group(1).strip()
|
model = model_match.group(1).strip()
|
||||||
|
|
||||||
# If no serial from message, try to get it from smartctl (important for USB disks)
|
|
||||||
if not serial or len(serial) < 3:
|
|
||||||
try:
|
|
||||||
result = subprocess.run(
|
|
||||||
['smartctl', '-i', '-j', f'/dev/{base_dev}'],
|
|
||||||
capture_output=True, text=True, timeout=5
|
|
||||||
)
|
|
||||||
import json as _json
|
|
||||||
data = _json.loads(result.stdout)
|
|
||||||
serial = data.get('serial_number', '') or serial
|
|
||||||
if not model:
|
|
||||||
model = data.get('model_name', '') or data.get('model_family', '')
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
|
# Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
|
||||||
sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
|
sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
|
||||||
if sig_match:
|
if sig_match:
|
||||||
@@ -821,12 +836,10 @@ class JournalWatcher:
|
|||||||
severity='warning',
|
severity='warning',
|
||||||
)
|
)
|
||||||
|
|
||||||
# Also update worst_health so the disk stays marked as warning
|
# Update worst_health for permanent tracking (record_disk_observation
|
||||||
# even if current SMART readings show 0 pending sectors
|
# already does this, but we ensure it here for safety)
|
||||||
warn_line_text = warn_line_m.group(1).strip() if warn_line_m else error_signature
|
health_persistence.update_disk_worst_health(base_dev, serial, 'warning')
|
||||||
health_persistence.update_disk_worst_health(
|
|
||||||
base_dev, serial, 'warning', warn_line_text
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
|
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
|
||||||
|
|
||||||
@@ -1751,8 +1764,26 @@ class PollingCollector:
|
|||||||
if isinstance(details_raw, dict):
|
if isinstance(details_raw, dict):
|
||||||
# Extract device name for a stable entity_id (24h cooldown key)
|
# Extract device name for a stable entity_id (24h cooldown key)
|
||||||
dev = details_raw.get('device', details_raw.get('disk', ''))
|
dev = details_raw.get('device', details_raw.get('disk', ''))
|
||||||
if dev:
|
serial = details_raw.get('serial', '')
|
||||||
eid = f'disk_{dev}' # Stable per-device fingerprint
|
|
||||||
|
# For USB disks, use serial as entity_id for stable cooldown
|
||||||
|
# USB disks can change device names (sda->sdb) on reconnect
|
||||||
|
# Using serial ensures same physical disk shares cooldown
|
||||||
|
if serial and dev:
|
||||||
|
# Check if this is a USB disk
|
||||||
|
try:
|
||||||
|
sysfs_result = subprocess.run(
|
||||||
|
['readlink', '-f', f'/sys/block/{dev.replace("/dev/", "")}'],
|
||||||
|
capture_output=True, text=True, timeout=2
|
||||||
|
)
|
||||||
|
if 'usb' in sysfs_result.stdout.lower():
|
||||||
|
eid = f'disk_serial_{serial}' # USB: use serial
|
||||||
|
else:
|
||||||
|
eid = f'disk_{dev}' # Non-USB: use device name
|
||||||
|
except Exception:
|
||||||
|
eid = f'disk_{dev}' # Fallback to device name
|
||||||
|
elif dev:
|
||||||
|
eid = f'disk_{dev}' # No serial: use device name
|
||||||
|
|
||||||
# Updates are always informational notifications except
|
# Updates are always informational notifications except
|
||||||
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
|
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
|
||||||
@@ -1818,15 +1849,26 @@ class PollingCollector:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Skip recovery notifications for SMART disk errors (pending/reallocated sectors).
|
# Skip recovery notifications for PERMANENT disk events.
|
||||||
# These indicate physical disk degradation that doesn't truly "recover" --
|
# These indicate physical disk degradation that doesn't truly "recover":
|
||||||
# the disk may show 0 pending sectors later but the damage history persists.
|
# - SMART pending/reallocated sectors indicate physical damage
|
||||||
# The worst_health in disk_registry tracks this, so we don't send false "resolved".
|
# - Disk may show 0 pending sectors later but damage history persists
|
||||||
|
# - Sending "Resolved" gives false sense of security
|
||||||
|
# The worst_health in disk_registry tracks this permanently.
|
||||||
if category == 'disks':
|
if category == 'disks':
|
||||||
reason_lower = reason.lower() if reason else ''
|
reason_lower = (reason or '').lower()
|
||||||
if any(indicator in reason_lower for indicator in [
|
permanent_indicators = [
|
||||||
'pending', 'reallocated', 'sector', 'smart', 'unreadable'
|
'pending', # pending sectors
|
||||||
]):
|
'reallocated', # reallocated sectors
|
||||||
|
'unreadable', # unreadable sectors
|
||||||
|
'smart', # SMART errors
|
||||||
|
'surface error', # disk surface errors
|
||||||
|
'bad sector', # bad sectors
|
||||||
|
'i/o error', # I/O errors (repeated)
|
||||||
|
'medium error', # SCSI medium errors
|
||||||
|
]
|
||||||
|
if any(indicator in reason_lower for indicator in permanent_indicators):
|
||||||
|
# Don't send recovery - just clean up tracking
|
||||||
self._last_notified.pop(key, None)
|
self._last_notified.pop(key, None)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
@@ -559,13 +559,6 @@ TEMPLATES = {
|
|||||||
'group': 'storage',
|
'group': 'storage',
|
||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
'smart_warning': {
|
|
||||||
'title': '{hostname}: SMART warning on {device}',
|
|
||||||
'body': '{device}: {reason}',
|
|
||||||
'label': 'SMART warning (sectors)',
|
|
||||||
'group': 'storage',
|
|
||||||
'default_enabled': True,
|
|
||||||
},
|
|
||||||
'storage_unavailable': {
|
'storage_unavailable': {
|
||||||
'title': '{hostname}: Storage unavailable - {storage_name}',
|
'title': '{hostname}: Storage unavailable - {storage_name}',
|
||||||
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
|
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
|
||||||
|
|||||||
Reference in New Issue
Block a user