mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 08:56:21 +00:00
Update notification service
This commit is contained in:
@@ -383,6 +383,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
log_persistent_errors: "Persistent Errors",
|
log_persistent_errors: "Persistent Errors",
|
||||||
log_critical_errors: "Critical Errors",
|
log_critical_errors: "Critical Errors",
|
||||||
// Updates
|
// Updates
|
||||||
|
pve_version: "Proxmox VE Version",
|
||||||
security_updates: "Security Updates",
|
security_updates: "Security Updates",
|
||||||
system_age: "System Age",
|
system_age: "System Age",
|
||||||
pending_updates: "Pending Updates",
|
pending_updates: "Pending Updates",
|
||||||
|
|||||||
@@ -71,11 +71,12 @@ class HealthMonitor:
|
|||||||
LOG_CHECK_INTERVAL = 300
|
LOG_CHECK_INTERVAL = 300
|
||||||
|
|
||||||
# Updates Thresholds
|
# Updates Thresholds
|
||||||
UPDATES_WARNING = 365 # Only warn after 1 year without updates
|
UPDATES_WARNING = 365 # Only warn after 1 year without updates (system_age)
|
||||||
UPDATES_CRITICAL = 730 # Critical after 2 years
|
UPDATES_CRITICAL = 548 # Critical after 18 months without updates
|
||||||
|
SECURITY_WARN_DAYS = 360 # Security updates only become WARNING after 360 days unpatched
|
||||||
|
|
||||||
BENIGN_ERROR_PATTERNS = [
|
BENIGN_ERROR_PATTERNS = [
|
||||||
# Proxmox specific benign patterns
|
# ── Proxmox API / proxy operational noise ──
|
||||||
r'got inotify poll request in wrong process',
|
r'got inotify poll request in wrong process',
|
||||||
r'auth key pair too old, rotating',
|
r'auth key pair too old, rotating',
|
||||||
r'proxy detected vanished client connection',
|
r'proxy detected vanished client connection',
|
||||||
@@ -84,33 +85,62 @@ class HealthMonitor:
|
|||||||
r'disconnect peer',
|
r'disconnect peer',
|
||||||
r'task OK',
|
r'task OK',
|
||||||
r'backup finished',
|
r'backup finished',
|
||||||
|
# PVE ticket / auth transient errors (web UI session expiry, API token
|
||||||
|
# refresh, brute-force bots). These are logged at WARNING/ERR level
|
||||||
|
# but are NOT system problems -- they are access-control events.
|
||||||
|
r'invalid PVE ticket',
|
||||||
|
r'authentication failure.*pve',
|
||||||
|
r'permission denied.*ticket',
|
||||||
|
r'no ticket',
|
||||||
|
r'CSRF.*failed',
|
||||||
|
r'pveproxy\[\d+\]: authentication failure',
|
||||||
|
r'pvedaemon\[\d+\]: authentication failure',
|
||||||
|
# PVE cluster/corosync normal chatter
|
||||||
|
r'corosync.*retransmit',
|
||||||
|
r'corosync.*delivering',
|
||||||
|
r'pmxcfs.*update',
|
||||||
|
r'pve-cluster\[\d+\]:.*status',
|
||||||
|
|
||||||
# Systemd informational messages
|
# ── Systemd informational messages ──
|
||||||
r'(started|starting|stopped|stopping) session',
|
r'(started|starting|stopped|stopping) session',
|
||||||
r'session \d+ logged (in|out)',
|
r'session \d+ logged (in|out)',
|
||||||
r'new session \d+ of user',
|
r'new session \d+ of user',
|
||||||
r'removed session \d+',
|
r'removed session \d+',
|
||||||
r'user@\d+\.service:',
|
r'user@\d+\.service:',
|
||||||
r'user runtime directory',
|
r'user runtime directory',
|
||||||
|
# Systemd service restarts (normal lifecycle)
|
||||||
|
r'systemd\[\d+\]: .+\.service: (Scheduled restart|Consumed)',
|
||||||
|
r'systemd\[\d+\]: .+\.service: Deactivated successfully',
|
||||||
|
|
||||||
# Network transient errors (common and usually self-recovering)
|
# ── Network transient errors (common and usually self-recovering) ──
|
||||||
r'dhcp.*timeout',
|
r'dhcp.*timeout',
|
||||||
r'temporary failure in name resolution',
|
r'temporary failure in name resolution',
|
||||||
r'network is unreachable',
|
r'network is unreachable',
|
||||||
r'no route to host',
|
r'no route to host',
|
||||||
|
|
||||||
# Backup and sync normal warnings
|
# ── Backup and sync normal warnings ──
|
||||||
r'rsync.*vanished',
|
r'rsync.*vanished',
|
||||||
r'backup job .* finished',
|
r'backup job .* finished',
|
||||||
r'vzdump backup .* finished',
|
r'vzdump backup .* finished',
|
||||||
|
|
||||||
# ZFS informational
|
# ── ZFS informational ──
|
||||||
r'zfs.*scrub (started|finished|in progress)',
|
r'zfs.*scrub (started|finished|in progress)',
|
||||||
r'zpool.*resilver',
|
r'zpool.*resilver',
|
||||||
|
|
||||||
# LXC/Container normal operations
|
# ── LXC/Container normal operations ──
|
||||||
r'lxc.*monitor',
|
r'lxc.*monitor',
|
||||||
r'systemd\[1\]: (started|stopped) .*\.scope',
|
r'systemd\[1\]: (started|stopped) .*\.scope',
|
||||||
|
|
||||||
|
# ── ATA/SCSI transient bus errors ──
|
||||||
|
# These are logged at ERR level but are common on SATA controllers
|
||||||
|
# during hot-plug, link renegotiation, or cable noise. They are NOT
|
||||||
|
# indicative of disk failure unless SMART also reports problems.
|
||||||
|
r'ata\d+.*SError.*BadCRC',
|
||||||
|
r'ata\d+.*Emask 0x10.*ATA bus error',
|
||||||
|
r'failed command: (READ|WRITE) FPDMA QUEUED',
|
||||||
|
r'ata\d+.*hard resetting link',
|
||||||
|
r'ata\d+.*link is slow',
|
||||||
|
r'ata\d+.*COMRESET',
|
||||||
]
|
]
|
||||||
|
|
||||||
CRITICAL_LOG_KEYWORDS = [
|
CRITICAL_LOG_KEYWORDS = [
|
||||||
@@ -120,14 +150,23 @@ class HealthMonitor:
|
|||||||
'ext4-fs error', 'xfs.*corruption',
|
'ext4-fs error', 'xfs.*corruption',
|
||||||
'lvm activation failed',
|
'lvm activation failed',
|
||||||
'hardware error', 'mce:',
|
'hardware error', 'mce:',
|
||||||
'segfault', 'general protection fault'
|
'general protection fault',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Segfault is WARNING, not CRITICAL -- only PVE-critical process
|
||||||
|
# segfaults are escalated to CRITICAL in _classify_log_severity.
|
||||||
|
PVE_CRITICAL_PROCESSES = {
|
||||||
|
'pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster',
|
||||||
|
'corosync', 'qemu-system', 'lxc-start', 'ceph-osd',
|
||||||
|
'ceph-mon', 'pmxcfs', 'kvm',
|
||||||
|
}
|
||||||
|
|
||||||
WARNING_LOG_KEYWORDS = [
|
WARNING_LOG_KEYWORDS = [
|
||||||
'i/o error', 'ata error', 'scsi error',
|
'i/o error', 'ata error', 'scsi error',
|
||||||
'task hung', 'blocked for more than',
|
'task hung', 'blocked for more than',
|
||||||
'failed to start', 'service.*failed',
|
'failed to start', 'service.*failed',
|
||||||
'disk.*offline', 'disk.*removed'
|
'disk.*offline', 'disk.*removed',
|
||||||
|
'segfault', # WARNING by default; escalated to CRITICAL only for PVE processes
|
||||||
]
|
]
|
||||||
|
|
||||||
# PVE Critical Services
|
# PVE Critical Services
|
||||||
@@ -1483,7 +1522,7 @@ class HealthMonitor:
|
|||||||
else:
|
else:
|
||||||
health_persistence.resolve_error(error_key, 'Disk errors cleared')
|
health_persistence.resolve_error(error_key, 'Disk errors cleared')
|
||||||
|
|
||||||
# Also include active filesystem errors (detected by _check_system_logs
|
# Also include active filesystem errors (detected by _check_log_analysis
|
||||||
# and cross-referenced to the 'disks' category)
|
# and cross-referenced to the 'disks' category)
|
||||||
try:
|
try:
|
||||||
fs_errors = health_persistence.get_active_errors(category='disks')
|
fs_errors = health_persistence.get_active_errors(category='disks')
|
||||||
@@ -1491,6 +1530,11 @@ class HealthMonitor:
|
|||||||
err_key = err.get('error_key', '')
|
err_key = err.get('error_key', '')
|
||||||
if not err_key.startswith('disk_fs_'):
|
if not err_key.startswith('disk_fs_'):
|
||||||
continue # Only filesystem cross-references
|
continue # Only filesystem cross-references
|
||||||
|
|
||||||
|
# Skip acknowledged/dismissed errors
|
||||||
|
if err.get('acknowledged') == 1:
|
||||||
|
continue
|
||||||
|
|
||||||
details = err.get('details', {})
|
details = err.get('details', {})
|
||||||
if isinstance(details, str):
|
if isinstance(details, str):
|
||||||
try:
|
try:
|
||||||
@@ -1498,15 +1542,34 @@ class HealthMonitor:
|
|||||||
details = _json.loads(details)
|
details = _json.loads(details)
|
||||||
except Exception:
|
except Exception:
|
||||||
details = {}
|
details = {}
|
||||||
|
|
||||||
device = details.get('device', err_key.replace('disk_fs_', '/dev/'))
|
device = details.get('device', err_key.replace('disk_fs_', '/dev/'))
|
||||||
|
base_disk = details.get('disk', '')
|
||||||
|
|
||||||
|
# Check if the device still exists. If not, auto-resolve
|
||||||
|
# the error -- it was likely a disconnected USB/temp device.
|
||||||
|
dev_path = f'/dev/{base_disk}' if base_disk else device
|
||||||
|
if not os.path.exists(dev_path):
|
||||||
|
health_persistence.resolve_error(
|
||||||
|
err_key, 'Device no longer present in system')
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Cross-reference with SMART: if SMART is healthy for
|
||||||
|
# this disk, downgrade to INFO (transient fs error).
|
||||||
|
severity = err.get('severity', 'WARNING')
|
||||||
|
if base_disk:
|
||||||
|
smart_health = self._quick_smart_health(base_disk)
|
||||||
|
if smart_health == 'PASSED' and severity == 'CRITICAL':
|
||||||
|
severity = 'WARNING'
|
||||||
|
|
||||||
if device not in disk_results:
|
if device not in disk_results:
|
||||||
disk_results[device] = {
|
disk_results[device] = {
|
||||||
'status': err.get('severity', 'CRITICAL'),
|
'status': severity,
|
||||||
'reason': err.get('reason', 'Filesystem error'),
|
'reason': err.get('reason', 'Filesystem error'),
|
||||||
'device': details.get('disk', ''),
|
'device': base_disk,
|
||||||
'error_count': 1,
|
'error_count': 1,
|
||||||
'error_type': 'filesystem',
|
'error_type': 'filesystem',
|
||||||
'dismissable': False,
|
'dismissable': True,
|
||||||
'error_key': err_key,
|
'error_key': err_key,
|
||||||
}
|
}
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -2303,6 +2366,9 @@ class HealthMonitor:
|
|||||||
if 'segfault' in line_lower:
|
if 'segfault' in line_lower:
|
||||||
m = re.search(r'(\S+)\[\d+\].*segfault', line)
|
m = re.search(r'(\S+)\[\d+\].*segfault', line)
|
||||||
process = m.group(1) if m else 'unknown'
|
process = m.group(1) if m else 'unknown'
|
||||||
|
is_critical_proc = any(p in process.lower() for p in self.PVE_CRITICAL_PROCESSES)
|
||||||
|
if is_critical_proc:
|
||||||
|
return f'Critical process "{process}" crashed (segmentation fault) -- PVE service affected'
|
||||||
return f'Process "{process}" crashed (segmentation fault)'
|
return f'Process "{process}" crashed (segmentation fault)'
|
||||||
|
|
||||||
# Hardware error
|
# Hardware error
|
||||||
@@ -2324,31 +2390,43 @@ class HealthMonitor:
|
|||||||
"""
|
"""
|
||||||
Classify log line severity intelligently.
|
Classify log line severity intelligently.
|
||||||
Returns: 'CRITICAL', 'WARNING', or None (benign/info)
|
Returns: 'CRITICAL', 'WARNING', or None (benign/info)
|
||||||
|
|
||||||
|
Design principles:
|
||||||
|
- CRITICAL must be reserved for events that require IMMEDIATE action
|
||||||
|
(data loss risk, service outage, hardware failure confirmed by SMART).
|
||||||
|
- WARNING is for events worth investigating but not urgent.
|
||||||
|
- Everything else is None (benign/informational).
|
||||||
"""
|
"""
|
||||||
line_lower = line.lower()
|
line_lower = line.lower()
|
||||||
|
|
||||||
# Check if benign first
|
# Check if benign first -- fast path for known noise
|
||||||
if self._is_benign_error(line):
|
if self._is_benign_error(line):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Check critical keywords
|
# Check critical keywords (hard failures: OOM, panic, FS corruption, etc.)
|
||||||
for keyword in self.CRITICAL_LOG_KEYWORDS:
|
for keyword in self.CRITICAL_LOG_KEYWORDS:
|
||||||
if re.search(keyword, line_lower):
|
if re.search(keyword, line_lower):
|
||||||
return 'CRITICAL'
|
return 'CRITICAL'
|
||||||
|
|
||||||
# Check warning keywords
|
# Check warning keywords (includes segfault, I/O errors, etc.)
|
||||||
for keyword in self.WARNING_LOG_KEYWORDS:
|
for keyword in self.WARNING_LOG_KEYWORDS:
|
||||||
if re.search(keyword, line_lower):
|
if re.search(keyword, line_lower):
|
||||||
|
# Special case: segfault of a PVE-critical process is CRITICAL
|
||||||
|
if 'segfault' in line_lower:
|
||||||
|
for proc in self.PVE_CRITICAL_PROCESSES:
|
||||||
|
if proc in line_lower:
|
||||||
|
return 'CRITICAL'
|
||||||
return 'WARNING'
|
return 'WARNING'
|
||||||
|
|
||||||
# Generic error/warning classification based on common terms
|
# Generic classification -- very conservative to avoid false positives.
|
||||||
if 'critical' in line_lower or 'fatal' in line_lower or 'panic' in line_lower:
|
# Only escalate if the line explicitly uses severity-level keywords
|
||||||
|
# from the kernel or systemd (not just any line containing "error").
|
||||||
|
if 'kernel panic' in line_lower or 'fatal' in line_lower and 'non-fatal' not in line_lower:
|
||||||
return 'CRITICAL'
|
return 'CRITICAL'
|
||||||
elif 'error' in line_lower or 'fail' in line_lower:
|
|
||||||
return 'WARNING'
|
|
||||||
elif 'warning' in line_lower or 'warn' in line_lower:
|
|
||||||
return None # Generic warnings are often informational and not critical
|
|
||||||
|
|
||||||
|
# Lines from priority "err" that don't match any keyword above are
|
||||||
|
# likely informational noise (e.g. "error response from daemon").
|
||||||
|
# Return None to avoid flooding the dashboard with non-actionable items.
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _check_logs_with_persistence(self) -> Dict[str, Any]:
|
def _check_logs_with_persistence(self) -> Dict[str, Any]:
|
||||||
@@ -2424,18 +2502,61 @@ class HealthMonitor:
|
|||||||
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||||
error_key = f'log_critical_{pattern_hash}'
|
error_key = f'log_critical_{pattern_hash}'
|
||||||
|
|
||||||
|
# ── SMART cross-reference for disk/FS errors ──
|
||||||
|
# Filesystem and disk errors are only truly CRITICAL if
|
||||||
|
# the underlying disk is actually failing. We check:
|
||||||
|
# 1. Device exists? No -> WARNING (disconnected USB, etc.)
|
||||||
|
# 2. SMART PASSED? -> WARNING (transient error, not disk failure)
|
||||||
|
# 3. SMART FAILED? -> CRITICAL (confirmed hardware problem)
|
||||||
|
# 4. SMART UNKNOWN? -> WARNING (can't confirm, err on side of caution)
|
||||||
|
fs_dev_match = re.search(
|
||||||
|
r'(?:ext4-fs|btrfs|xfs|zfs)\s+error.*?device\s+(\S+?)\)?[:\s]',
|
||||||
|
line, re.IGNORECASE
|
||||||
|
)
|
||||||
|
smart_status_for_log = None
|
||||||
|
if fs_dev_match:
|
||||||
|
fs_dev = fs_dev_match.group(1).rstrip(')')
|
||||||
|
base_dev = re.sub(r'\d+$', '', fs_dev)
|
||||||
|
if not os.path.exists(f'/dev/{base_dev}'):
|
||||||
|
# Device not present -- almost certainly a disconnected drive
|
||||||
|
severity = 'WARNING'
|
||||||
|
smart_status_for_log = 'DEVICE_ABSENT'
|
||||||
|
elif self.capabilities.get('has_smart'):
|
||||||
|
smart_health = self._quick_smart_health(base_dev)
|
||||||
|
smart_status_for_log = smart_health
|
||||||
|
if smart_health == 'PASSED':
|
||||||
|
# SMART says disk is healthy -- transient FS error
|
||||||
|
severity = 'WARNING'
|
||||||
|
elif smart_health == 'UNKNOWN':
|
||||||
|
# Can't verify -- be conservative, don't alarm
|
||||||
|
severity = 'WARNING'
|
||||||
|
# smart_health == 'FAILED' -> keep CRITICAL
|
||||||
|
|
||||||
if pattern not in critical_errors_found:
|
if pattern not in critical_errors_found:
|
||||||
critical_errors_found[pattern] = line
|
# Only count as "critical" if severity wasn't downgraded
|
||||||
|
if severity == 'CRITICAL':
|
||||||
|
critical_errors_found[pattern] = line
|
||||||
# Build a human-readable reason from the raw log line
|
# Build a human-readable reason from the raw log line
|
||||||
enriched_reason = self._enrich_critical_log_reason(line)
|
enriched_reason = self._enrich_critical_log_reason(line)
|
||||||
|
|
||||||
|
# Append SMART context to the reason if we checked it
|
||||||
|
if smart_status_for_log == 'PASSED':
|
||||||
|
enriched_reason += '\nSMART: Passed (disk is healthy -- error is likely transient)'
|
||||||
|
elif smart_status_for_log == 'FAILED':
|
||||||
|
enriched_reason += '\nSMART: FAILED -- disk is failing, replace immediately'
|
||||||
|
elif smart_status_for_log == 'DEVICE_ABSENT':
|
||||||
|
enriched_reason += '\nDevice not currently detected -- may be a disconnected USB or temporary device'
|
||||||
|
|
||||||
# Record persistent error if it's not already active
|
# Record persistent error if it's not already active
|
||||||
if not health_persistence.is_error_active(error_key, category='logs'):
|
if not health_persistence.is_error_active(error_key, category='logs'):
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key=error_key,
|
error_key=error_key,
|
||||||
category='logs',
|
category='logs',
|
||||||
severity='CRITICAL',
|
severity=severity,
|
||||||
reason=enriched_reason,
|
reason=enriched_reason,
|
||||||
details={'pattern': pattern, 'raw_line': line[:200], 'dismissable': True}
|
details={'pattern': pattern, 'raw_line': line[:200],
|
||||||
|
'smart_status': smart_status_for_log,
|
||||||
|
'dismissable': True}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Cross-reference: filesystem errors also belong in the disks category
|
# Cross-reference: filesystem errors also belong in the disks category
|
||||||
@@ -2446,11 +2567,23 @@ class HealthMonitor:
|
|||||||
# Strip partition number to get base disk (sdb1 -> sdb)
|
# Strip partition number to get base disk (sdb1 -> sdb)
|
||||||
base_device = re.sub(r'\d+$', '', fs_device) if not ('nvme' in fs_device or 'mmcblk' in fs_device) else fs_device.rsplit('p', 1)[0] if 'p' in fs_device else fs_device
|
base_device = re.sub(r'\d+$', '', fs_device) if not ('nvme' in fs_device or 'mmcblk' in fs_device) else fs_device.rsplit('p', 1)[0] if 'p' in fs_device else fs_device
|
||||||
disk_error_key = f'disk_fs_{fs_device}'
|
disk_error_key = f'disk_fs_{fs_device}'
|
||||||
|
|
||||||
|
# Use the SMART-aware severity we already determined above
|
||||||
|
device_exists = os.path.exists(f'/dev/{base_device}')
|
||||||
|
if not device_exists:
|
||||||
|
fs_severity = 'WARNING'
|
||||||
|
elif smart_status_for_log == 'PASSED':
|
||||||
|
fs_severity = 'WARNING' # SMART healthy -> transient
|
||||||
|
elif smart_status_for_log == 'FAILED':
|
||||||
|
fs_severity = 'CRITICAL' # SMART failing -> real problem
|
||||||
|
else:
|
||||||
|
fs_severity = 'WARNING' # Can't confirm -> conservative
|
||||||
|
|
||||||
if not health_persistence.is_error_active(disk_error_key, category='disks'):
|
if not health_persistence.is_error_active(disk_error_key, category='disks'):
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key=disk_error_key,
|
error_key=disk_error_key,
|
||||||
category='disks',
|
category='disks',
|
||||||
severity='CRITICAL',
|
severity=fs_severity,
|
||||||
reason=enriched_reason,
|
reason=enriched_reason,
|
||||||
details={
|
details={
|
||||||
'disk': base_device,
|
'disk': base_device,
|
||||||
@@ -2458,7 +2591,9 @@ class HealthMonitor:
|
|||||||
'error_type': 'filesystem',
|
'error_type': 'filesystem',
|
||||||
'error_count': 1,
|
'error_count': 1,
|
||||||
'sample': line[:200],
|
'sample': line[:200],
|
||||||
'dismissable': False
|
'smart_status': smart_status_for_log,
|
||||||
|
'dismissable': True,
|
||||||
|
'device_exists': device_exists,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -2529,11 +2664,17 @@ class HealthMonitor:
|
|||||||
# Use the original sample line for the notification,
|
# Use the original sample line for the notification,
|
||||||
# not the normalized pattern (which has IDs replaced).
|
# not the normalized pattern (which has IDs replaced).
|
||||||
sample = data.get('sample', pattern)
|
sample = data.get('sample', pattern)
|
||||||
|
# Strip journal timestamp prefix so the stored reason
|
||||||
|
# doesn't contain dated information that confuses
|
||||||
|
# re-notifications.
|
||||||
|
clean_sample = re.sub(
|
||||||
|
r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample
|
||||||
|
)
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key=error_key,
|
error_key=error_key,
|
||||||
category='logs',
|
category='logs',
|
||||||
severity='WARNING',
|
severity='WARNING',
|
||||||
reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
|
reason=f'Recurring error ({data["count"]}x): {clean_sample[:150]}',
|
||||||
details={'pattern': pattern, 'sample': sample,
|
details={'pattern': pattern, 'sample': sample,
|
||||||
'dismissable': True, 'occurrences': data['count']}
|
'dismissable': True, 'occurrences': data['count']}
|
||||||
)
|
)
|
||||||
@@ -2707,12 +2848,31 @@ class HealthMonitor:
|
|||||||
|
|
||||||
return pattern[:150] # Keep first 150 characters to avoid overly long patterns
|
return pattern[:150] # Keep first 150 characters to avoid overly long patterns
|
||||||
|
|
||||||
|
# Regex to parse Inst lines: Inst <pkg> [<cur>] (<new> <repo> [<arch>])
|
||||||
|
_RE_INST = re.compile(r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+')
|
||||||
|
_RE_INST_NEW = re.compile(r'^Inst\s+(\S+)\s+\((\S+)\s+')
|
||||||
|
|
||||||
|
_PVE_PREFIXES = (
|
||||||
|
'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
|
||||||
|
'corosync', 'libpve', 'pbs-', 'pmg-',
|
||||||
|
)
|
||||||
|
_KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
|
||||||
|
_IMPORTANT_PKGS = {
|
||||||
|
'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
|
||||||
|
'pve-ha-manager', 'pve-firewall', 'ceph-common',
|
||||||
|
'proxmox-backup-client',
|
||||||
|
}
|
||||||
|
|
||||||
def _check_updates(self) -> Optional[Dict[str, Any]]:
|
def _check_updates(self) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Check for pending system updates.
|
Check for pending system updates.
|
||||||
- WARNING: Security updates available, or system not updated >1 year (365 days).
|
- INFO: Any updates available (including security updates).
|
||||||
|
- WARNING: Security updates pending 360+ days unpatched, or system not updated >1 year (365 days).
|
||||||
- CRITICAL: System not updated >18 months (548 days).
|
- CRITICAL: System not updated >18 months (548 days).
|
||||||
- INFO: Kernel/PVE updates available, or >50 non-security updates pending.
|
|
||||||
|
Updates are always informational unless they represent a prolonged
|
||||||
|
unpatched state. Detects PVE version upgrades from pve-manager
|
||||||
|
Inst lines and exposes them as an INFO sub-check.
|
||||||
"""
|
"""
|
||||||
cache_key = 'updates_check'
|
cache_key = 'updates_check'
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
@@ -2734,150 +2894,214 @@ class HealthMonitor:
|
|||||||
days_since_update = (current_time - mtime) / 86400
|
days_since_update = (current_time - mtime) / 86400
|
||||||
last_update_days = int(days_since_update)
|
last_update_days = int(days_since_update)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # Ignore if mtime fails
|
pass
|
||||||
|
|
||||||
# Perform a dry run of apt-get upgrade to see pending packages
|
# Perform a dry run of apt-get upgrade to see pending packages
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['apt-get', 'upgrade', '--dry-run'],
|
['apt-get', 'upgrade', '--dry-run'],
|
||||||
capture_output=True,
|
capture_output=True, text=True, timeout=30
|
||||||
text=True,
|
|
||||||
timeout=10
|
|
||||||
)
|
)
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
print("[HealthMonitor] apt-get upgrade --dry-run timed out")
|
print("[HealthMonitor] apt-get upgrade --dry-run timed out")
|
||||||
return {
|
return {
|
||||||
'status': 'UNKNOWN',
|
'status': 'UNKNOWN',
|
||||||
'reason': 'apt-get timed out - repository may be unreachable',
|
'reason': 'apt-get timed out - repository may be unreachable',
|
||||||
'count': 0,
|
'count': 0, 'checks': {}
|
||||||
'checks': {}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
update_count = 0
|
update_count = 0
|
||||||
security_updates_packages = []
|
security_pkgs: list = []
|
||||||
kernel_pve_updates_packages = []
|
kernel_pkgs: list = []
|
||||||
|
pve_pkgs: list = []
|
||||||
|
important_pkgs: list = [] # {name, cur, new}
|
||||||
|
pve_manager_info = None # {cur, new} or None
|
||||||
sec_result = None
|
sec_result = None
|
||||||
|
sec_severity = 'INFO'
|
||||||
|
sec_days_unpatched = 0
|
||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
lines = result.stdout.strip().split('\n')
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if not line.startswith('Inst '):
|
||||||
|
continue
|
||||||
|
update_count += 1
|
||||||
|
|
||||||
|
# Parse package name, current and new versions
|
||||||
|
m = self._RE_INST.match(line)
|
||||||
|
if m:
|
||||||
|
pkg_name, cur_ver, new_ver = m.group(1), m.group(2), m.group(3)
|
||||||
|
else:
|
||||||
|
m2 = self._RE_INST_NEW.match(line)
|
||||||
|
if m2:
|
||||||
|
pkg_name, cur_ver, new_ver = m2.group(1), '', m2.group(2)
|
||||||
|
else:
|
||||||
|
parts = line.split()
|
||||||
|
pkg_name = parts[1] if len(parts) > 1 else 'unknown'
|
||||||
|
cur_ver, new_ver = '', ''
|
||||||
|
|
||||||
|
# Strip arch suffix (e.g. package:amd64)
|
||||||
|
pkg_name = pkg_name.split(':')[0]
|
||||||
|
name_lower = pkg_name.lower()
|
||||||
|
line_lower = line.lower()
|
||||||
|
|
||||||
|
# Categorise
|
||||||
|
if 'security' in line_lower or 'debian-security' in line_lower:
|
||||||
|
security_pkgs.append(pkg_name)
|
||||||
|
|
||||||
|
if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES):
|
||||||
|
kernel_pkgs.append(pkg_name)
|
||||||
|
elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES):
|
||||||
|
pve_pkgs.append(pkg_name)
|
||||||
|
|
||||||
|
# Collect important packages with version info
|
||||||
|
if pkg_name in self._IMPORTANT_PKGS and cur_ver:
|
||||||
|
important_pkgs.append({
|
||||||
|
'name': pkg_name, 'cur': cur_ver, 'new': new_ver
|
||||||
|
})
|
||||||
|
|
||||||
|
# Detect pve-manager upgrade -> PVE version upgrade
|
||||||
|
if pkg_name == 'pve-manager' and cur_ver and new_ver:
|
||||||
|
pve_manager_info = {'cur': cur_ver, 'new': new_ver}
|
||||||
|
|
||||||
for line in lines:
|
# ── Determine overall status ──────────────────────
|
||||||
# 'Inst ' indicates a package will be installed/upgraded
|
if security_pkgs:
|
||||||
if line.startswith('Inst '):
|
sec_days_unpatched = 0
|
||||||
update_count += 1
|
try:
|
||||||
line_lower = line.lower()
|
existing = health_persistence.get_error_by_key('security_updates')
|
||||||
package_name = line.split()[1].split(':')[0] # Get package name, strip arch if present
|
if existing and existing.get('first_seen'):
|
||||||
|
from datetime import datetime
|
||||||
# Check for security updates (common pattern in repo names)
|
first_dt = datetime.fromisoformat(existing['first_seen'])
|
||||||
if 'security' in line_lower or 'debian-security' in line_lower:
|
sec_days_unpatched = (datetime.now() - first_dt).days
|
||||||
security_updates_packages.append(package_name)
|
except Exception:
|
||||||
|
pass
|
||||||
# Check for kernel or critical PVE updates
|
|
||||||
if any(pkg in line_lower for pkg in ['linux-image', 'pve-kernel', 'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-api-core']):
|
if sec_days_unpatched >= self.SECURITY_WARN_DAYS:
|
||||||
kernel_pve_updates_packages.append(package_name)
|
status = 'WARNING'
|
||||||
|
reason = f'{len(security_pkgs)} security update(s) pending for {sec_days_unpatched} days'
|
||||||
# Determine overall status based on findings
|
sec_severity = 'WARNING'
|
||||||
if security_updates_packages:
|
else:
|
||||||
status = 'WARNING'
|
status = 'INFO'
|
||||||
reason = f'{len(security_updates_packages)} security update(s) available'
|
reason = f'{len(security_pkgs)} security update(s) pending'
|
||||||
# Record persistent error for security updates to ensure it's visible
|
sec_severity = 'INFO'
|
||||||
|
|
||||||
sec_result = health_persistence.record_error(
|
sec_result = health_persistence.record_error(
|
||||||
error_key='security_updates',
|
error_key='security_updates',
|
||||||
category='updates',
|
category='updates',
|
||||||
severity='WARNING',
|
severity=sec_severity,
|
||||||
reason=reason,
|
reason=reason,
|
||||||
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5], 'dismissable': True}
|
details={'count': len(security_pkgs), 'packages': security_pkgs[:5],
|
||||||
|
'dismissable': sec_severity == 'WARNING',
|
||||||
|
'days_unpatched': sec_days_unpatched}
|
||||||
)
|
)
|
||||||
# If previously dismissed, downgrade to INFO
|
|
||||||
if sec_result and sec_result.get('type') == 'skipped_acknowledged':
|
if sec_result and sec_result.get('type') == 'skipped_acknowledged':
|
||||||
status = 'INFO'
|
status = 'INFO'
|
||||||
reason = None
|
reason = None
|
||||||
|
|
||||||
elif last_update_days and last_update_days >= 548:
|
elif last_update_days and last_update_days >= 548:
|
||||||
# 18+ months without updates - CRITICAL
|
|
||||||
status = 'CRITICAL'
|
status = 'CRITICAL'
|
||||||
reason = f'System not updated in {last_update_days} days (>18 months)'
|
reason = f'System not updated in {last_update_days} days (>18 months)'
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key='system_age',
|
error_key='system_age', category='updates',
|
||||||
category='updates',
|
severity='CRITICAL', reason=reason,
|
||||||
severity='CRITICAL',
|
|
||||||
reason=reason,
|
|
||||||
details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
|
details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
|
||||||
)
|
)
|
||||||
elif last_update_days and last_update_days >= 365:
|
elif last_update_days and last_update_days >= 365:
|
||||||
# 1+ year without updates - WARNING
|
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'System not updated in {last_update_days} days (>1 year)'
|
reason = f'System not updated in {last_update_days} days (>1 year)'
|
||||||
age_result = health_persistence.record_error(
|
age_result = health_persistence.record_error(
|
||||||
error_key='system_age',
|
error_key='system_age', category='updates',
|
||||||
category='updates',
|
severity='WARNING', reason=reason,
|
||||||
severity='WARNING',
|
|
||||||
reason=reason,
|
|
||||||
details={'days': last_update_days, 'update_count': update_count, 'dismissable': True}
|
details={'days': last_update_days, 'update_count': update_count, 'dismissable': True}
|
||||||
)
|
)
|
||||||
if age_result and age_result.get('type') == 'skipped_acknowledged':
|
if age_result and age_result.get('type') == 'skipped_acknowledged':
|
||||||
status = 'INFO'
|
status = 'INFO'
|
||||||
reason = None
|
reason = None
|
||||||
elif kernel_pve_updates_packages:
|
elif kernel_pkgs or pve_pkgs:
|
||||||
# Informational: Kernel or critical PVE components need update
|
|
||||||
status = 'INFO'
|
status = 'INFO'
|
||||||
reason = f'{len(kernel_pve_updates_packages)} kernel/PVE update(s) available'
|
reason = f'{len(kernel_pkgs)} kernel + {len(pve_pkgs)} Proxmox update(s) available'
|
||||||
elif update_count > 50:
|
elif update_count > 0:
|
||||||
# Informational: Large number of pending updates
|
|
||||||
status = 'INFO'
|
status = 'INFO'
|
||||||
reason = f'{update_count} updates pending (consider maintenance window)'
|
reason = f'{update_count} package update(s) pending'
|
||||||
|
|
||||||
# If apt-get upgrade --dry-run failed
|
|
||||||
elif result.returncode != 0:
|
elif result.returncode != 0:
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = 'Failed to check for updates (apt-get error)'
|
reason = 'Failed to check for updates (apt-get error)'
|
||||||
|
|
||||||
# Build checks dict for updates sub-items
|
# ── Build checks dict ─────────────────────────────────
|
||||||
age_dismissed = bool(age_result and age_result.get('type') == 'skipped_acknowledged')
|
age_dismissed = bool(age_result and age_result.get('type') == 'skipped_acknowledged')
|
||||||
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else (
|
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else (
|
||||||
'INFO' if age_dismissed else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK'))
|
'INFO' if age_dismissed else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK'))
|
||||||
sec_dismissed = security_updates_packages and sec_result and sec_result.get('type') == 'skipped_acknowledged'
|
|
||||||
sec_status = 'INFO' if sec_dismissed else ('WARNING' if security_updates_packages else 'OK')
|
sec_dismissed = security_pkgs and sec_result and sec_result.get('type') == 'skipped_acknowledged'
|
||||||
kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
|
if sec_dismissed:
|
||||||
|
sec_status = 'INFO'
|
||||||
|
elif security_pkgs:
|
||||||
|
sec_status = sec_severity
|
||||||
|
else:
|
||||||
|
sec_status = 'OK'
|
||||||
|
|
||||||
|
sec_detail = f'{len(security_pkgs)} security update(s) pending'
|
||||||
|
if security_pkgs and sec_days_unpatched >= self.SECURITY_WARN_DAYS:
|
||||||
|
sec_detail += f' ({sec_days_unpatched} days unpatched)'
|
||||||
|
|
||||||
checks = {
|
checks = {
|
||||||
|
'kernel_pve': {
|
||||||
|
'status': 'INFO' if kernel_pkgs else 'OK',
|
||||||
|
'detail': f'{len(kernel_pkgs)} kernel/PVE update(s)' if kernel_pkgs else 'Kernel/PVE up to date',
|
||||||
|
'error_key': 'kernel_pve'
|
||||||
|
},
|
||||||
|
'pending_updates': {
|
||||||
|
'status': 'INFO' if update_count > 0 else 'OK',
|
||||||
|
'detail': f'{update_count} package(s) pending',
|
||||||
|
'error_key': 'pending_updates'
|
||||||
|
},
|
||||||
'security_updates': {
|
'security_updates': {
|
||||||
'status': sec_status,
|
'status': sec_status,
|
||||||
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
|
'detail': sec_detail if security_pkgs else 'No security updates pending',
|
||||||
'dismissable': True if security_updates_packages and not sec_dismissed else False,
|
'dismissable': sec_status == 'WARNING' and not sec_dismissed,
|
||||||
'dismissed': bool(sec_dismissed),
|
'dismissed': bool(sec_dismissed),
|
||||||
'error_key': 'security_updates'
|
'error_key': 'security_updates'
|
||||||
},
|
},
|
||||||
'system_age': {
|
'system_age': {
|
||||||
'status': update_age_status,
|
'status': update_age_status,
|
||||||
'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
|
'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
|
||||||
'dismissable': False if update_age_status == 'CRITICAL' else True if update_age_status == 'WARNING' else False,
|
'dismissable': update_age_status == 'WARNING' and not age_dismissed,
|
||||||
'dismissed': bool(age_dismissed),
|
'dismissed': bool(age_dismissed),
|
||||||
'error_key': 'system_age'
|
'error_key': 'system_age'
|
||||||
},
|
},
|
||||||
'pending_updates': {
|
|
||||||
'status': 'INFO' if update_count > 50 else 'OK',
|
|
||||||
'detail': f'{update_count} package(s) pending',
|
|
||||||
'error_key': 'pending_updates'
|
|
||||||
},
|
|
||||||
'kernel_pve': {
|
|
||||||
'status': kernel_status,
|
|
||||||
'detail': f'{len(kernel_pve_updates_packages)} kernel/PVE update(s)' if kernel_pve_updates_packages else 'Kernel/PVE up to date',
|
|
||||||
'error_key': 'kernel_pve'
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# PVE version sub-check (always INFO)
|
||||||
|
if pve_manager_info:
|
||||||
|
checks['pve_version'] = {
|
||||||
|
'status': 'INFO',
|
||||||
|
'detail': f"PVE {pve_manager_info['cur']} -> {pve_manager_info['new']} available",
|
||||||
|
'error_key': 'pve_version'
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
checks['pve_version'] = {
|
||||||
|
'status': 'OK',
|
||||||
|
'detail': 'Proxmox VE is up to date',
|
||||||
|
'error_key': 'pve_version'
|
||||||
|
}
|
||||||
|
|
||||||
# Construct result dictionary
|
# Construct result dictionary
|
||||||
update_result = {
|
update_result = {
|
||||||
'status': status,
|
'status': status,
|
||||||
'count': update_count,
|
'count': update_count,
|
||||||
'checks': checks
|
'checks': checks,
|
||||||
}
|
}
|
||||||
if reason:
|
if reason:
|
||||||
update_result['reason'] = reason
|
update_result['reason'] = reason
|
||||||
if last_update_days is not None:
|
if last_update_days is not None:
|
||||||
update_result['days_since_update'] = last_update_days
|
update_result['days_since_update'] = last_update_days
|
||||||
|
# Attach categorised counts for the frontend
|
||||||
|
update_result['security_count'] = len(security_pkgs)
|
||||||
|
update_result['pve_count'] = len(pve_pkgs)
|
||||||
|
update_result['kernel_count'] = len(kernel_pkgs)
|
||||||
|
update_result['important_packages'] = important_pkgs[:8]
|
||||||
|
|
||||||
self.cached_results[cache_key] = update_result
|
self.cached_results[cache_key] = update_result
|
||||||
self.last_check_times[cache_key] = current_time
|
self.last_check_times[cache_key] = current_time
|
||||||
|
|||||||
@@ -548,6 +548,33 @@ class HealthPersistence:
|
|||||||
|
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
|
def get_error_by_key(self, error_key: str) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Get a single error record by its unique error_key.
|
||||||
|
|
||||||
|
Returns the full row as a dict (including first_seen, last_seen,
|
||||||
|
acknowledged, etc.) or None if not found / already resolved.
|
||||||
|
Only returns unresolved (active) errors.
|
||||||
|
"""
|
||||||
|
conn = self._get_conn()
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT * FROM errors
|
||||||
|
WHERE error_key = ? AND resolved_at IS NULL
|
||||||
|
LIMIT 1
|
||||||
|
''', (error_key,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
if row is None:
|
||||||
|
return None
|
||||||
|
error_dict = dict(row)
|
||||||
|
if error_dict.get('details'):
|
||||||
|
try:
|
||||||
|
error_dict['details'] = json.loads(error_dict['details'])
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
return error_dict
|
||||||
|
|
||||||
def cleanup_old_errors(self):
|
def cleanup_old_errors(self):
|
||||||
"""Clean up old resolved errors and auto-resolve stale errors"""
|
"""Clean up old resolved errors and auto-resolve stale errors"""
|
||||||
with self._db_lock:
|
with self._db_lock:
|
||||||
|
|||||||
@@ -337,6 +337,16 @@ class JournalWatcher:
|
|||||||
entity = 'disk'
|
entity = 'disk'
|
||||||
entity_id = f'fs_{device}'
|
entity_id = f'fs_{device}'
|
||||||
|
|
||||||
|
# Check if the device physically exists to calibrate severity.
|
||||||
|
# A disconnected USB / temp device should NOT be CRITICAL.
|
||||||
|
import os as _os
|
||||||
|
base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
|
||||||
|
device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
|
||||||
|
|
||||||
|
if not device_exists and device != 'unknown':
|
||||||
|
# Device not present -- downgrade to WARNING
|
||||||
|
severity = 'WARNING'
|
||||||
|
|
||||||
# Identify what this device is (model, type, mountpoint)
|
# Identify what this device is (model, type, mountpoint)
|
||||||
device_info = self._identify_block_device(device)
|
device_info = self._identify_block_device(device)
|
||||||
|
|
||||||
@@ -357,7 +367,10 @@ class JournalWatcher:
|
|||||||
if inode:
|
if inode:
|
||||||
inode_hint = 'root directory' if inode == '2' else f'inode #{inode}'
|
inode_hint = 'root directory' if inode == '2' else f'inode #{inode}'
|
||||||
parts.append(f'Affected: {inode_hint}')
|
parts.append(f'Affected: {inode_hint}')
|
||||||
parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity')
|
if device_exists:
|
||||||
|
parts.append(f'Action: Run "fsck /dev/{device}" (unmount first) or check backup integrity')
|
||||||
|
else:
|
||||||
|
parts.append('Note: Device not currently connected -- this may be a stale journal entry')
|
||||||
enriched = '\n'.join(parts)
|
enriched = '\n'.join(parts)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@@ -1325,7 +1338,7 @@ class PollingCollector:
|
|||||||
'network': 'network_down',
|
'network': 'network_down',
|
||||||
'pve_services': 'service_fail',
|
'pve_services': 'service_fail',
|
||||||
'security': 'auth_fail',
|
'security': 'auth_fail',
|
||||||
'updates': 'update_available',
|
'updates': 'update_summary',
|
||||||
'zfs': 'disk_io_error',
|
'zfs': 'disk_io_error',
|
||||||
'smart': 'disk_io_error',
|
'smart': 'disk_io_error',
|
||||||
'disks': 'disk_io_error',
|
'disks': 'disk_io_error',
|
||||||
@@ -1442,12 +1455,18 @@ class PollingCollector:
|
|||||||
event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
|
event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
|
||||||
entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
|
entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
|
||||||
|
|
||||||
|
# Updates are always informational notifications except
|
||||||
|
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
|
||||||
|
emit_severity = severity
|
||||||
|
if category == 'updates' and error_key != 'system_age':
|
||||||
|
emit_severity = 'INFO'
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
'hostname': self._hostname,
|
'hostname': self._hostname,
|
||||||
'category': category,
|
'category': category,
|
||||||
'reason': reason,
|
'reason': reason,
|
||||||
'error_key': error_key,
|
'error_key': error_key,
|
||||||
'severity': severity,
|
'severity': emit_severity,
|
||||||
'first_seen': error.get('first_seen', ''),
|
'first_seen': error.get('first_seen', ''),
|
||||||
'last_seen': error.get('last_seen', ''),
|
'last_seen': error.get('last_seen', ''),
|
||||||
'is_persistent': not is_new,
|
'is_persistent': not is_new,
|
||||||
@@ -1464,7 +1483,7 @@ class PollingCollector:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
self._queue.put(NotificationEvent(
|
self._queue.put(NotificationEvent(
|
||||||
event_type, severity, data, source='health',
|
event_type, emit_severity, data, source='health',
|
||||||
entity=entity, entity_id=eid or error_key,
|
entity=entity, entity_id=eid or error_key,
|
||||||
))
|
))
|
||||||
|
|
||||||
@@ -1482,11 +1501,36 @@ class PollingCollector:
|
|||||||
|
|
||||||
# ── Update check (enriched) ────────────────────────────────
|
# ── Update check (enriched) ────────────────────────────────
|
||||||
|
|
||||||
|
# Proxmox-related package prefixes used for categorisation
|
||||||
|
_PVE_PREFIXES = (
|
||||||
|
'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
|
||||||
|
'corosync', 'libpve', 'pbs-', 'pmg-',
|
||||||
|
)
|
||||||
|
_KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
|
||||||
|
_IMPORTANT_PKGS = {
|
||||||
|
'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
|
||||||
|
'pve-ha-manager', 'pve-firewall', 'pve-storage-iscsi-direct',
|
||||||
|
'ceph-common', 'proxmox-backup-client',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Regex to parse Inst lines from apt-get -s upgrade
|
||||||
|
# Inst <pkg> [<cur_ver>] (<new_ver> <repo> [<arch>])
|
||||||
|
_RE_INST = re.compile(
|
||||||
|
r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+'
|
||||||
|
)
|
||||||
|
# Fallback for new installs (no current version):
|
||||||
|
# Inst <pkg> (<new_ver> <repo> [<arch>])
|
||||||
|
_RE_INST_NEW = re.compile(
|
||||||
|
r'^Inst\s+(\S+)\s+\((\S+)\s+'
|
||||||
|
)
|
||||||
|
|
||||||
def _check_updates(self):
|
def _check_updates(self):
|
||||||
"""Check for available system updates every 24 h.
|
"""Check for available system updates every 24 h.
|
||||||
|
|
||||||
Enriched output: total count, security updates, PVE version hint,
|
Emits a structured ``update_summary`` notification with categorised
|
||||||
and top package names.
|
counts (security, Proxmox-related, kernel, other) and important
|
||||||
|
package versions. If pve-manager has an upgrade, also emits a
|
||||||
|
separate ``pve_update`` notification.
|
||||||
"""
|
"""
|
||||||
now = time.time()
|
now = time.time()
|
||||||
if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL:
|
if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL:
|
||||||
@@ -1502,58 +1546,84 @@ class PollingCollector:
|
|||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')]
|
inst_lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')]
|
||||||
total = len(lines)
|
total = len(inst_lines)
|
||||||
if total == 0:
|
if total == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
packages = [l.split()[1] for l in lines]
|
# ── Parse every Inst line ──────────────────────────────
|
||||||
security = [p for p in packages if any(
|
all_pkgs: list[dict] = [] # {name, cur, new}
|
||||||
kw in p.lower() for kw in ('security', 'cve', 'openssl', 'libssl')
|
security_pkgs: list[dict] = []
|
||||||
)]
|
pve_pkgs: list[dict] = []
|
||||||
|
kernel_pkgs: list[dict] = []
|
||||||
|
pve_manager_info: dict | None = None
|
||||||
|
|
||||||
# Also detect security updates via apt changelog / Debian-Security origin
|
for line in inst_lines:
|
||||||
sec_result = subprocess.run(
|
m = self._RE_INST.match(line)
|
||||||
['apt-get', '-s', 'upgrade', '-o', 'Dir::Etc::SourceList=/dev/null',
|
if m:
|
||||||
'-o', 'Dir::Etc::SourceParts=/dev/null'],
|
info = {'name': m.group(1), 'cur': m.group(2), 'new': m.group(3)}
|
||||||
capture_output=True, text=True, timeout=30,
|
else:
|
||||||
)
|
m2 = self._RE_INST_NEW.match(line)
|
||||||
# Count lines from security repo (rough heuristic)
|
if m2:
|
||||||
sec_count = max(len(security), 0)
|
info = {'name': m2.group(1), 'cur': '', 'new': m2.group(2)}
|
||||||
try:
|
else:
|
||||||
sec_output = subprocess.run(
|
pkg_name = line.split()[1] if len(line.split()) > 1 else 'unknown'
|
||||||
['apt-get', '-s', '--only-upgrade', 'install'] + packages[:50],
|
info = {'name': pkg_name, 'cur': '', 'new': ''}
|
||||||
capture_output=True, text=True, timeout=30,
|
|
||||||
)
|
all_pkgs.append(info)
|
||||||
for line in sec_output.stdout.split('\n'):
|
name_lower = info['name'].lower()
|
||||||
if 'security' in line.lower() and 'Inst ' in line:
|
line_lower = line.lower()
|
||||||
sec_count += 1
|
|
||||||
except Exception:
|
# Categorise
|
||||||
pass
|
if 'security' in line_lower or 'debian-security' in line_lower:
|
||||||
|
security_pkgs.append(info)
|
||||||
|
|
||||||
|
if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES):
|
||||||
|
kernel_pkgs.append(info)
|
||||||
|
elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES):
|
||||||
|
pve_pkgs.append(info)
|
||||||
|
|
||||||
|
# Detect pve-manager upgrade specifically
|
||||||
|
if info['name'] == 'pve-manager':
|
||||||
|
pve_manager_info = info
|
||||||
|
|
||||||
# Check for PVE version upgrade
|
# ── Build important packages list ──────────────────────
|
||||||
pve_packages = [p for p in packages if 'pve-' in p.lower() or 'proxmox-' in p.lower()]
|
important_lines = []
|
||||||
|
for pkg in all_pkgs:
|
||||||
# Build display details
|
if pkg['name'] in self._IMPORTANT_PKGS and pkg['cur']:
|
||||||
top_pkgs = packages[:8]
|
important_lines.append(
|
||||||
details = ', '.join(top_pkgs)
|
f"{pkg['name']} ({pkg['cur']} -> {pkg['new']})"
|
||||||
if total > 8:
|
)
|
||||||
details += f', ... +{total - 8} more'
|
|
||||||
|
|
||||||
|
# ── Emit structured update_summary ─────────────────────
|
||||||
data = {
|
data = {
|
||||||
'hostname': self._hostname,
|
'hostname': self._hostname,
|
||||||
'count': str(total),
|
'total_count': str(total),
|
||||||
'security_count': str(sec_count),
|
'security_count': str(len(security_pkgs)),
|
||||||
'details': details,
|
'pve_count': str(len(pve_pkgs)),
|
||||||
'packages': ', '.join(packages[:20]),
|
'kernel_count': str(len(kernel_pkgs)),
|
||||||
|
'important_list': ', '.join(important_lines) if important_lines else 'none',
|
||||||
|
'package_list': ', '.join(important_lines[:6]) if important_lines else '',
|
||||||
}
|
}
|
||||||
if pve_packages:
|
|
||||||
data['pve_packages'] = ', '.join(pve_packages)
|
|
||||||
|
|
||||||
self._queue.put(NotificationEvent(
|
self._queue.put(NotificationEvent(
|
||||||
'update_available', 'INFO', data,
|
'update_summary', 'INFO', data,
|
||||||
source='polling', entity='node', entity_id='',
|
source='polling', entity='node', entity_id='',
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# ── Emit pve_update if pve-manager has an upgrade ──────
|
||||||
|
if pve_manager_info and pve_manager_info['cur'] and pve_manager_info['new']:
|
||||||
|
pve_data = {
|
||||||
|
'hostname': self._hostname,
|
||||||
|
'current_version': pve_manager_info['cur'],
|
||||||
|
'new_version': pve_manager_info['new'],
|
||||||
|
'version': pve_manager_info['new'],
|
||||||
|
'details': f"pve-manager {pve_manager_info['cur']} -> {pve_manager_info['new']}",
|
||||||
|
}
|
||||||
|
self._queue.put(NotificationEvent(
|
||||||
|
'pve_update', 'INFO', pve_data,
|
||||||
|
source='polling', entity='node', entity_id='',
|
||||||
|
))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -584,10 +584,10 @@ TEMPLATES = {
|
|||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
'update_available': {
|
'update_available': {
|
||||||
'title': '{hostname}: Updates available ({count})',
|
'title': '{hostname}: Updates available',
|
||||||
'body': '{count} package updates are available.\n{details}',
|
'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',
|
||||||
'group': 'system',
|
'group': 'system',
|
||||||
'default_enabled': False,
|
'default_enabled': False, # Superseded by update_summary
|
||||||
},
|
},
|
||||||
'update_complete': {
|
'update_complete': {
|
||||||
'title': '{hostname}: Update completed',
|
'title': '{hostname}: Update completed',
|
||||||
@@ -626,14 +626,20 @@ TEMPLATES = {
|
|||||||
|
|
||||||
# ── Update notifications (enriched) ──
|
# ── Update notifications (enriched) ──
|
||||||
'update_summary': {
|
'update_summary': {
|
||||||
'title': '{hostname}: {total_count} updates available',
|
'title': '{hostname}: Updates available',
|
||||||
'body': '{security_count} security update(s), {total_count} total.\n{package_list}',
|
'body': (
|
||||||
|
'Total updates: {total_count}\n'
|
||||||
|
'Security updates: {security_count}\n'
|
||||||
|
'Proxmox-related updates: {pve_count}\n'
|
||||||
|
'Kernel updates: {kernel_count}\n'
|
||||||
|
'Important packages: {important_list}'
|
||||||
|
),
|
||||||
'group': 'system',
|
'group': 'system',
|
||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
'pve_update': {
|
'pve_update': {
|
||||||
'title': '{hostname}: PVE update available ({version})',
|
'title': '{hostname}: Proxmox VE {new_version} available',
|
||||||
'body': 'Proxmox VE update available: {version}\n{details}',
|
'body': 'Proxmox VE {current_version} -> {new_version}\n{details}',
|
||||||
'group': 'system',
|
'group': 'system',
|
||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user