Update notification service

This commit is contained in:
MacRimi
2026-03-08 22:47:04 +01:00
parent b8cff3e699
commit 3739560956
7 changed files with 467 additions and 756 deletions

View File

@@ -402,16 +402,47 @@ class JournalWatcher:
entity = 'disk'
entity_id = f'fs_{device}'
# ── 24h dedup for filesystem errors per device ──
# ── Get disk serial for USB-aware cooldown ──
# USB disks can change device names (sda->sdb) on reconnect.
# Using serial as cooldown key ensures same physical disk
# shares one 24h cooldown regardless of device letter.
import os as _os
base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
disk_serial = ''
is_usb_disk = False
if base_dev:
try:
# Check if USB via sysfs
sysfs_link = subprocess.run(
['readlink', '-f', f'/sys/block/{base_dev}'],
capture_output=True, text=True, timeout=2
)
is_usb_disk = 'usb' in sysfs_link.stdout.lower()
# Get serial from smartctl
smart_result = subprocess.run(
['smartctl', '-i', '-j', f'/dev/{base_dev}'],
capture_output=True, text=True, timeout=5
)
if smart_result.returncode in (0, 4):
import json
smart_data = json.loads(smart_result.stdout)
disk_serial = smart_data.get('serial_number', '')
except Exception:
pass
# ── 24h dedup for filesystem errors ──
# Use serial for USB disks, device name for others
now_fs = time.time()
fs_dedup_key = f'fs_{device}'
if is_usb_disk and disk_serial:
fs_dedup_key = f'fs_serial_{disk_serial}'
else:
fs_dedup_key = f'fs_{device}'
last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
return # Already notified for this device recently
# ── SMART + device existence gating ──
import os as _os
base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
# ── Device existence gating ──
device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
if not device_exists and device != 'unknown':
@@ -749,7 +780,6 @@ class JournalWatcher:
"""Extract device info from a smartd system-mail and record as disk observation."""
try:
import re as _re
import subprocess
from health_persistence import health_persistence
# Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
@@ -770,21 +800,6 @@ class JournalWatcher:
if model_match:
model = model_match.group(1).strip()
# If no serial from message, try to get it from smartctl (important for USB disks)
if not serial or len(serial) < 3:
try:
result = subprocess.run(
['smartctl', '-i', '-j', f'/dev/{base_dev}'],
capture_output=True, text=True, timeout=5
)
import json as _json
data = _json.loads(result.stdout)
serial = data.get('serial_number', '') or serial
if not model:
model = data.get('model_name', '') or data.get('model_family', '')
except Exception:
pass
# Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
if sig_match:
@@ -821,12 +836,10 @@ class JournalWatcher:
severity='warning',
)
# Also update worst_health so the disk stays marked as warning
# even if current SMART readings show 0 pending sectors
warn_line_text = warn_line_m.group(1).strip() if warn_line_m else error_signature
health_persistence.update_disk_worst_health(
base_dev, serial, 'warning', warn_line_text
)
# Update worst_health for permanent tracking (record_disk_observation
# already does this, but we ensure it here for safety)
health_persistence.update_disk_worst_health(base_dev, serial, 'warning')
except Exception as e:
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
@@ -1751,8 +1764,26 @@ class PollingCollector:
if isinstance(details_raw, dict):
# Extract device name for a stable entity_id (24h cooldown key)
dev = details_raw.get('device', details_raw.get('disk', ''))
if dev:
eid = f'disk_{dev}' # Stable per-device fingerprint
serial = details_raw.get('serial', '')
# For USB disks, use serial as entity_id for stable cooldown
# USB disks can change device names (sda->sdb) on reconnect
# Using serial ensures same physical disk shares cooldown
if serial and dev:
# Check if this is a USB disk
try:
sysfs_result = subprocess.run(
['readlink', '-f', f'/sys/block/{dev.replace("/dev/", "")}'],
capture_output=True, text=True, timeout=2
)
if 'usb' in sysfs_result.stdout.lower():
eid = f'disk_serial_{serial}' # USB: use serial
else:
eid = f'disk_{dev}' # Non-USB: use device name
except Exception:
eid = f'disk_{dev}' # Fallback to device name
elif dev:
eid = f'disk_{dev}' # No serial: use device name
# Updates are always informational notifications except
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
@@ -1818,15 +1849,26 @@ class PollingCollector:
except Exception:
pass
# Skip recovery notifications for SMART disk errors (pending/reallocated sectors).
# These indicate physical disk degradation that doesn't truly "recover" --
# the disk may show 0 pending sectors later but the damage history persists.
# The worst_health in disk_registry tracks this, so we don't send false "resolved".
# Skip recovery notifications for PERMANENT disk events.
# These indicate physical disk degradation that doesn't truly "recover":
# - SMART pending/reallocated sectors indicate physical damage
# - Disk may show 0 pending sectors later but damage history persists
# - Sending "Resolved" gives false sense of security
# The worst_health in disk_registry tracks this permanently.
if category == 'disks':
reason_lower = reason.lower() if reason else ''
if any(indicator in reason_lower for indicator in [
'pending', 'reallocated', 'sector', 'smart', 'unreadable'
]):
reason_lower = (reason or '').lower()
permanent_indicators = [
'pending', # pending sectors
'reallocated', # reallocated sectors
'unreadable', # unreadable sectors
'smart', # SMART errors
'surface error', # disk surface errors
'bad sector', # bad sectors
'i/o error', # I/O errors (repeated)
'medium error', # SCSI medium errors
]
if any(indicator in reason_lower for indicator in permanent_indicators):
# Don't send recovery - just clean up tracking
self._last_notified.pop(key, None)
continue