Update notification service

This commit is contained in:
MacRimi
2026-02-26 18:21:01 +01:00
parent ffc202f6a3
commit 4d24d6d17b
3 changed files with 223 additions and 19 deletions

View File

@@ -821,8 +821,20 @@ class HealthMonitor:
issues = []
storage_details = {}
# Check disk usage and mount status first for critical mounts
critical_mounts = ['/']
# Check disk usage and mount status for important mounts.
# We detect actual mountpoints dynamically rather than hard-coding.
critical_mounts = set()
critical_mounts.add('/')
try:
for part in psutil.disk_partitions(all=False):
mp = part.mountpoint
# Include standard system mounts and PVE storage
if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
critical_mounts.add(mp)
except Exception:
pass
critical_mounts = sorted(critical_mounts)
for mount_point in critical_mounts:
try:
@@ -857,9 +869,32 @@ class HealthMonitor:
# Check filesystem usage only if not already flagged as critical
if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
fs_status = self._check_filesystem(mount_point)
error_key = f'disk_space_{mount_point}'
if fs_status['status'] != 'OK':
issues.append(f"{mount_point}: {fs_status['reason']}")
storage_details[mount_point] = fs_status
# Record persistent error for notifications
usage = psutil.disk_usage(mount_point)
avail_gb = usage.free / (1024**3)
if avail_gb >= 1:
avail_str = f"{avail_gb:.1f} GiB"
else:
avail_str = f"{usage.free / (1024**2):.0f} MiB"
health_persistence.record_error(
error_key=error_key,
category='disk',
severity=fs_status['status'],
reason=f'{mount_point}: {fs_status["reason"]}',
details={
'mount': mount_point,
'used': str(round(usage.percent, 1)),
'available': avail_str,
'dismissable': False,
}
)
else:
# Space recovered -- clear any previous alert
health_persistence.clear_error(error_key)
except Exception:
pass # Silently skip if mountpoint check fails
@@ -1871,7 +1906,8 @@ class HealthMonitor:
self.persistent_log_patterns[pattern] = {
'count': 1,
'first_seen': current_time,
'last_seen': current_time
'last_seen': current_time,
'sample': line.strip()[:200], # Original line for display
}
for line in previous_lines:
@@ -1913,12 +1949,16 @@ class HealthMonitor:
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
error_key = f'log_persistent_{pattern_hash}'
if not health_persistence.is_error_active(error_key, category='logs'):
# Use the original sample line for the notification,
# not the normalized pattern (which has IDs replaced).
sample = data.get('sample', pattern)
health_persistence.record_error(
error_key=error_key,
category='logs',
severity='WARNING',
reason=f'Persistent error pattern detected: {pattern[:80]}',
details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']}
reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
details={'pattern': pattern, 'sample': sample,
'dismissable': True, 'occurrences': data['count']}
)
patterns_to_remove = [

View File

@@ -249,6 +249,23 @@ class JournalWatcher:
def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
"""Detect kernel panics, OOM, segfaults, hardware errors."""
# Only process messages from kernel or systemd (not app-level logs)
if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''):
return
# Filter out normal kernel messages that are NOT problems
_KERNEL_NOISE = [
r'vfio-pci\s+\S+:\s*reset', # PCI passthrough resets (normal during VM start/stop)
r'vfio-pci\s+\S+:\s*resetting',
r'entered\s+(?:promiscuous|allmulticast)\s+mode', # Network bridge ops
r'entered\s+(?:blocking|forwarding|disabled)\s+state', # Bridge STP
r'tap\d+i\d+:', # TAP interface events
r'vmbr\d+:.*port\s+\d+', # Bridge port events
]
for noise in _KERNEL_NOISE:
if re.search(noise, msg, re.IGNORECASE):
return
critical_patterns = {
r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'),
r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
@@ -318,6 +335,19 @@ class JournalWatcher:
def _check_service_failure(self, msg: str, unit: str):
"""Detect critical service failures with enriched context."""
# Filter out noise -- these are normal systemd transient units,
# not real service failures worth alerting about.
_NOISE_PATTERNS = [
r'session-\d+\.scope', # SSH/login sessions
r'user@\d+\.service', # Per-user service managers
r'user-runtime-dir@\d+', # User runtime dirs
r'systemd-coredump@', # Coredump handlers (transient)
r'run-.*\.mount', # Transient mounts
]
for noise in _NOISE_PATTERNS:
if re.search(noise, msg) or re.search(noise, unit):
return
service_patterns = [
r'Failed to start (.+)',
r'Unit (\S+) (?:entered failed state|failed)',
@@ -743,13 +773,16 @@ class PollingCollector:
'load': 'load_high',
'temperature': 'temp_high',
'disk': 'disk_space_low',
'storage': 'disk_space_low',
'storage': 'storage_unavailable',
'network': 'network_down',
'pve_services': 'service_fail',
'security': 'auth_fail',
'updates': 'update_available',
'zfs': 'disk_io_error',
'smart': 'disk_io_error',
'disks': 'disk_io_error',
'logs': 'system_problem',
'vms': 'system_problem',
}
def __init__(self, event_queue: Queue, poll_interval: int = 60):

View File

@@ -25,10 +25,10 @@ from typing import Dict, Any, Optional, List
def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
"""Parse a PVE vzdump notification message into structured data.
PVE vzdump messages contain:
- A table: VMID Name Status Time Size Filename
- Totals: Total running time: Xs / Total size: X GiB
- Full logs per VM
Supports two formats:
1. Local storage: table with columns VMID Name Status Time Size Filename
2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)'
and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X'
Returns dict with 'vms' list, 'total_time', 'total_size', or None.
"""
@@ -41,7 +41,7 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
lines = message.split('\n')
# Find the table header line
# ── Strategy 1: classic table (local/NFS/CIFS storage) ──
header_idx = -1
for i, line in enumerate(lines):
if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE):
@@ -49,15 +49,10 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
break
if header_idx >= 0:
# Parse column positions from header
header = lines[header_idx]
# Parse table rows after header
for line in lines[header_idx + 1:]:
stripped = line.strip()
if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='):
break
# Table row: VMID Name Status Time Size Filename
# Use regex to parse flexible whitespace columns
m = re.match(
r'\s*(\d+)\s+' # VMID
r'(\S+)\s+' # Name
@@ -74,10 +69,91 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
'status': m.group(3),
'time': m.group(4),
'size': m.group(5),
'filename': m.group(6).split('/')[-1], # just filename
'filename': m.group(6).split('/')[-1],
})
# Extract totals
# ── Strategy 2: log-style (PBS / Proxmox Backup Server) ──
# Parse from the full vzdump log lines.
# Look for patterns:
# "Starting Backup of VM NNN (lxc/qemu)" -> detect guest
# "CT Name: xxx" or "VM Name: xxx" -> guest name
# "Finished Backup of VM NNN (HH:MM:SS)" -> duration + status=ok
# "root.pxar: had to backup X of Y" -> size (CT)
# "transferred X in N seconds" -> size (QEMU)
# "creating ... archive 'ct/100/2026-..'" -> archive name for PBS
# "TASK ERROR:" or "ERROR:" -> status=error
if not vms:
current_vm: Optional[Dict[str, str]] = None
for line in lines:
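# Strip a leading "INFO:", "WARNING:" or "ERROR:" level prefix (plus any
# following whitespace) from the line.
# NOTE(review): a line that begins with "ERROR:" no longer begins with it
# after this substitution -- confirm the error detection further down
# still sees what it expects.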
clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip())
# Start of a new VM backup
m_start = re.match(
r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean)
if m_start:
if current_vm:
vms.append(current_vm)
current_vm = {
'vmid': m_start.group(1),
'name': '',
'status': 'ok',
'time': '',
'size': '',
'filename': '',
'type': m_start.group(2),
}
continue
if current_vm:
# Guest name
m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean)
if m_name:
current_vm['name'] = m_name.group(1).strip()
continue
# PBS archive path -> extract as filename
m_archive = re.search(
r"creating .+ archive '([^']+)'", clean)
if m_archive:
current_vm['filename'] = m_archive.group(1)
continue
# Size for containers (pxar)
m_pxar = re.search(
r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean)
if m_pxar:
current_vm['size'] = m_pxar.group(1)
continue
# Size for QEMU (transferred)
m_transfer = re.search(
r'transferred\s+([\d.]+\s+\S+)', clean)
if m_transfer:
current_vm['size'] = m_transfer.group(1)
continue
# Finished -> duration
m_finish = re.match(
r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean)
if m_finish:
current_vm['time'] = m_finish.group(2)
current_vm['status'] = 'ok'
vms.append(current_vm)
current_vm = None
continue
# Error
if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'):
if current_vm:
current_vm['status'] = 'error'
# Don't forget the last VM if it wasn't finished
if current_vm:
vms.append(current_vm)
# ── Extract totals ──
for line in lines:
m_time = re.search(r'Total running time:\s*(.+)', line)
if m_time:
@@ -86,6 +162,50 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
if m_size:
total_size = m_size.group(1).strip()
# For PBS: calculate total size if not explicitly stated
if not total_size and vms:
# Sum individual sizes, converting MiB/MB, GiB/GB and TiB/TB values to GiB;
# sizes with any other unit are left out of the total.
sizes_gib = 0.0
for vm in vms:
s = vm.get('size', '')
m = re.match(r'([\d.]+)\s+(.*)', s)
if m:
val = float(m.group(1))
unit = m.group(2).strip().upper()
if 'GIB' in unit or 'GB' in unit:
sizes_gib += val
elif 'MIB' in unit or 'MB' in unit:
sizes_gib += val / 1024
elif 'TIB' in unit or 'TB' in unit:
sizes_gib += val * 1024
if sizes_gib > 0:
if sizes_gib >= 1024:
total_size = f"{sizes_gib / 1024:.3f} TiB"
elif sizes_gib >= 1:
total_size = f"{sizes_gib:.3f} GiB"
else:
total_size = f"{sizes_gib * 1024:.3f} MiB"
# For PBS: calculate total time if not stated
if not total_time and vms:
total_secs = 0
for vm in vms:
t = vm.get('time', '')
# Parse HH:MM:SS format
m = re.match(r'(\d+):(\d+):(\d+)', t)
if m:
total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3))
if total_secs > 0:
hours = total_secs // 3600
mins = (total_secs % 3600) // 60
secs = total_secs % 60
if hours:
total_time = f"{hours}h {mins}m {secs}s"
elif mins:
total_time = f"{mins}m {secs}s"
else:
total_time = f"{secs}s"
if not vms and not total_size:
return None
@@ -113,7 +233,12 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
if vm.get('time'):
details.append(f"Duration: {vm['time']}")
if vm.get('filename'):
details.append(f"File: {vm['filename']}")
fname = vm['filename']
# PBS archives look like "ct/100/2026-..." or "vm/105/2026-..."
if re.match(r'^(?:ct|vm)/\d+/', fname):
details.append(f"PBS: {fname}")
else:
details.append(f"File: {fname}")
if details:
parts.append(' | '.join(details))
parts.append('') # blank line between VMs
@@ -338,6 +463,12 @@ TEMPLATES = {
'group': 'storage',
'default_enabled': True,
},
'storage_unavailable': {
'title': '{hostname}: Storage unavailable - {storage_name}',
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
'group': 'storage',
'default_enabled': True,
},
'load_high': {
'title': '{hostname}: High system load ({value})',
'body': 'System load average: {value} on {cores} cores.\n{details}',