mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-30 11:26:23 +00:00
Update notification service
This commit is contained in:
@@ -821,8 +821,20 @@ class HealthMonitor:
|
|||||||
issues = []
|
issues = []
|
||||||
storage_details = {}
|
storage_details = {}
|
||||||
|
|
||||||
# Check disk usage and mount status first for critical mounts
|
# Check disk usage and mount status for important mounts.
|
||||||
critical_mounts = ['/']
|
# We detect actual mountpoints dynamically rather than hard-coding.
|
||||||
|
critical_mounts = set()
|
||||||
|
critical_mounts.add('/')
|
||||||
|
try:
|
||||||
|
for part in psutil.disk_partitions(all=False):
|
||||||
|
mp = part.mountpoint
|
||||||
|
# Include standard system mounts and PVE storage
|
||||||
|
if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
|
||||||
|
mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
|
||||||
|
critical_mounts.add(mp)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
critical_mounts = sorted(critical_mounts)
|
||||||
|
|
||||||
for mount_point in critical_mounts:
|
for mount_point in critical_mounts:
|
||||||
try:
|
try:
|
||||||
@@ -857,9 +869,32 @@ class HealthMonitor:
|
|||||||
# Check filesystem usage only if not already flagged as critical
|
# Check filesystem usage only if not already flagged as critical
|
||||||
if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
|
if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
|
||||||
fs_status = self._check_filesystem(mount_point)
|
fs_status = self._check_filesystem(mount_point)
|
||||||
|
error_key = f'disk_space_{mount_point}'
|
||||||
if fs_status['status'] != 'OK':
|
if fs_status['status'] != 'OK':
|
||||||
issues.append(f"{mount_point}: {fs_status['reason']}")
|
issues.append(f"{mount_point}: {fs_status['reason']}")
|
||||||
storage_details[mount_point] = fs_status
|
storage_details[mount_point] = fs_status
|
||||||
|
# Record persistent error for notifications
|
||||||
|
usage = psutil.disk_usage(mount_point)
|
||||||
|
avail_gb = usage.free / (1024**3)
|
||||||
|
if avail_gb >= 1:
|
||||||
|
avail_str = f"{avail_gb:.1f} GiB"
|
||||||
|
else:
|
||||||
|
avail_str = f"{usage.free / (1024**2):.0f} MiB"
|
||||||
|
health_persistence.record_error(
|
||||||
|
error_key=error_key,
|
||||||
|
category='disk',
|
||||||
|
severity=fs_status['status'],
|
||||||
|
reason=f'{mount_point}: {fs_status["reason"]}',
|
||||||
|
details={
|
||||||
|
'mount': mount_point,
|
||||||
|
'used': str(round(usage.percent, 1)),
|
||||||
|
'available': avail_str,
|
||||||
|
'dismissable': False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Space recovered -- clear any previous alert
|
||||||
|
health_persistence.clear_error(error_key)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # Silently skip if mountpoint check fails
|
pass # Silently skip if mountpoint check fails
|
||||||
|
|
||||||
@@ -1871,7 +1906,8 @@ class HealthMonitor:
|
|||||||
self.persistent_log_patterns[pattern] = {
|
self.persistent_log_patterns[pattern] = {
|
||||||
'count': 1,
|
'count': 1,
|
||||||
'first_seen': current_time,
|
'first_seen': current_time,
|
||||||
'last_seen': current_time
|
'last_seen': current_time,
|
||||||
|
'sample': line.strip()[:200], # Original line for display
|
||||||
}
|
}
|
||||||
|
|
||||||
for line in previous_lines:
|
for line in previous_lines:
|
||||||
@@ -1913,12 +1949,16 @@ class HealthMonitor:
|
|||||||
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||||
error_key = f'log_persistent_{pattern_hash}'
|
error_key = f'log_persistent_{pattern_hash}'
|
||||||
if not health_persistence.is_error_active(error_key, category='logs'):
|
if not health_persistence.is_error_active(error_key, category='logs'):
|
||||||
|
# Use the original sample line for the notification,
|
||||||
|
# not the normalized pattern (which has IDs replaced).
|
||||||
|
sample = data.get('sample', pattern)
|
||||||
health_persistence.record_error(
|
health_persistence.record_error(
|
||||||
error_key=error_key,
|
error_key=error_key,
|
||||||
category='logs',
|
category='logs',
|
||||||
severity='WARNING',
|
severity='WARNING',
|
||||||
reason=f'Persistent error pattern detected: {pattern[:80]}',
|
reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
|
||||||
details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']}
|
details={'pattern': pattern, 'sample': sample,
|
||||||
|
'dismissable': True, 'occurrences': data['count']}
|
||||||
)
|
)
|
||||||
|
|
||||||
patterns_to_remove = [
|
patterns_to_remove = [
|
||||||
|
|||||||
@@ -249,6 +249,23 @@ class JournalWatcher:
|
|||||||
|
|
||||||
def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
|
def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
|
||||||
"""Detect kernel panics, OOM, segfaults, hardware errors."""
|
"""Detect kernel panics, OOM, segfaults, hardware errors."""
|
||||||
|
# Only process messages from kernel or systemd (not app-level logs)
|
||||||
|
if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''):
|
||||||
|
return
|
||||||
|
|
||||||
|
# Filter out normal kernel messages that are NOT problems
|
||||||
|
_KERNEL_NOISE = [
|
||||||
|
r'vfio-pci\s+\S+:\s*reset', # PCI passthrough resets (normal during VM start/stop)
|
||||||
|
r'vfio-pci\s+\S+:\s*resetting',
|
||||||
|
r'entered\s+(?:promiscuous|allmulticast)\s+mode', # Network bridge ops
|
||||||
|
r'entered\s+(?:blocking|forwarding|disabled)\s+state', # Bridge STP
|
||||||
|
r'tap\d+i\d+:', # TAP interface events
|
||||||
|
r'vmbr\d+:.*port\s+\d+', # Bridge port events
|
||||||
|
]
|
||||||
|
for noise in _KERNEL_NOISE:
|
||||||
|
if re.search(noise, msg, re.IGNORECASE):
|
||||||
|
return
|
||||||
|
|
||||||
critical_patterns = {
|
critical_patterns = {
|
||||||
r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'),
|
r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'),
|
||||||
r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
|
r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
|
||||||
@@ -318,6 +335,19 @@ class JournalWatcher:
|
|||||||
|
|
||||||
def _check_service_failure(self, msg: str, unit: str):
|
def _check_service_failure(self, msg: str, unit: str):
|
||||||
"""Detect critical service failures with enriched context."""
|
"""Detect critical service failures with enriched context."""
|
||||||
|
# Filter out noise -- these are normal systemd transient units,
|
||||||
|
# not real service failures worth alerting about.
|
||||||
|
_NOISE_PATTERNS = [
|
||||||
|
r'session-\d+\.scope', # SSH/login sessions
|
||||||
|
r'user@\d+\.service', # Per-user service managers
|
||||||
|
r'user-runtime-dir@\d+', # User runtime dirs
|
||||||
|
r'systemd-coredump@', # Coredump handlers (transient)
|
||||||
|
r'run-.*\.mount', # Transient mounts
|
||||||
|
]
|
||||||
|
for noise in _NOISE_PATTERNS:
|
||||||
|
if re.search(noise, msg) or re.search(noise, unit):
|
||||||
|
return
|
||||||
|
|
||||||
service_patterns = [
|
service_patterns = [
|
||||||
r'Failed to start (.+)',
|
r'Failed to start (.+)',
|
||||||
r'Unit (\S+) (?:entered failed state|failed)',
|
r'Unit (\S+) (?:entered failed state|failed)',
|
||||||
@@ -743,13 +773,16 @@ class PollingCollector:
|
|||||||
'load': 'load_high',
|
'load': 'load_high',
|
||||||
'temperature': 'temp_high',
|
'temperature': 'temp_high',
|
||||||
'disk': 'disk_space_low',
|
'disk': 'disk_space_low',
|
||||||
'storage': 'disk_space_low',
|
'storage': 'storage_unavailable',
|
||||||
'network': 'network_down',
|
'network': 'network_down',
|
||||||
'pve_services': 'service_fail',
|
'pve_services': 'service_fail',
|
||||||
'security': 'auth_fail',
|
'security': 'auth_fail',
|
||||||
'updates': 'update_available',
|
'updates': 'update_available',
|
||||||
'zfs': 'disk_io_error',
|
'zfs': 'disk_io_error',
|
||||||
'smart': 'disk_io_error',
|
'smart': 'disk_io_error',
|
||||||
|
'disks': 'disk_io_error',
|
||||||
|
'logs': 'system_problem',
|
||||||
|
'vms': 'system_problem',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, event_queue: Queue, poll_interval: int = 60):
|
def __init__(self, event_queue: Queue, poll_interval: int = 60):
|
||||||
|
|||||||
@@ -25,10 +25,10 @@ from typing import Dict, Any, Optional, List
|
|||||||
def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
||||||
"""Parse a PVE vzdump notification message into structured data.
|
"""Parse a PVE vzdump notification message into structured data.
|
||||||
|
|
||||||
PVE vzdump messages contain:
|
Supports two formats:
|
||||||
- A table: VMID Name Status Time Size Filename
|
1. Local storage: table with columns VMID Name Status Time Size Filename
|
||||||
- Totals: Total running time: Xs / Total size: X GiB
|
2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)'
|
||||||
- Full logs per VM
|
and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X'
|
||||||
|
|
||||||
Returns dict with 'vms' list, 'total_time', 'total_size', or None.
|
Returns dict with 'vms' list, 'total_time', 'total_size', or None.
|
||||||
"""
|
"""
|
||||||
@@ -41,7 +41,7 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
|||||||
|
|
||||||
lines = message.split('\n')
|
lines = message.split('\n')
|
||||||
|
|
||||||
# Find the table header line
|
# ── Strategy 1: classic table (local/NFS/CIFS storage) ──
|
||||||
header_idx = -1
|
header_idx = -1
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE):
|
if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE):
|
||||||
@@ -49,15 +49,10 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
|||||||
break
|
break
|
||||||
|
|
||||||
if header_idx >= 0:
|
if header_idx >= 0:
|
||||||
# Parse column positions from header
|
|
||||||
header = lines[header_idx]
|
|
||||||
# Parse table rows after header
|
|
||||||
for line in lines[header_idx + 1:]:
|
for line in lines[header_idx + 1:]:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='):
|
if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='):
|
||||||
break
|
break
|
||||||
# Table row: VMID Name Status Time Size Filename
|
|
||||||
# Use regex to parse flexible whitespace columns
|
|
||||||
m = re.match(
|
m = re.match(
|
||||||
r'\s*(\d+)\s+' # VMID
|
r'\s*(\d+)\s+' # VMID
|
||||||
r'(\S+)\s+' # Name
|
r'(\S+)\s+' # Name
|
||||||
@@ -74,10 +69,91 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
|||||||
'status': m.group(3),
|
'status': m.group(3),
|
||||||
'time': m.group(4),
|
'time': m.group(4),
|
||||||
'size': m.group(5),
|
'size': m.group(5),
|
||||||
'filename': m.group(6).split('/')[-1], # just filename
|
'filename': m.group(6).split('/')[-1],
|
||||||
})
|
})
|
||||||
|
|
||||||
# Extract totals
|
# ── Strategy 2: log-style (PBS / Proxmox Backup Server) ──
|
||||||
|
# Parse from the full vzdump log lines.
|
||||||
|
# Look for patterns:
|
||||||
|
# "Starting Backup of VM NNN (lxc/qemu)" -> detect guest
|
||||||
|
# "CT Name: xxx" or "VM Name: xxx" -> guest name
|
||||||
|
# "Finished Backup of VM NNN (HH:MM:SS)" -> duration + status=ok
|
||||||
|
# "root.pxar: had to backup X of Y" -> size (CT)
|
||||||
|
# "transferred X in N seconds" -> size (QEMU)
|
||||||
|
# "creating ... archive 'ct/100/2026-..'" -> archive name for PBS
|
||||||
|
# "TASK ERROR:" or "ERROR:" -> status=error
|
||||||
|
if not vms:
|
||||||
|
current_vm: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
# Remove "INFO: " prefix that PVE adds
|
||||||
|
clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip())
|
||||||
|
|
||||||
|
# Start of a new VM backup
|
||||||
|
m_start = re.match(
|
||||||
|
r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean)
|
||||||
|
if m_start:
|
||||||
|
if current_vm:
|
||||||
|
vms.append(current_vm)
|
||||||
|
current_vm = {
|
||||||
|
'vmid': m_start.group(1),
|
||||||
|
'name': '',
|
||||||
|
'status': 'ok',
|
||||||
|
'time': '',
|
||||||
|
'size': '',
|
||||||
|
'filename': '',
|
||||||
|
'type': m_start.group(2),
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
|
||||||
|
if current_vm:
|
||||||
|
# Guest name
|
||||||
|
m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean)
|
||||||
|
if m_name:
|
||||||
|
current_vm['name'] = m_name.group(1).strip()
|
||||||
|
continue
|
||||||
|
|
||||||
|
# PBS archive path -> extract as filename
|
||||||
|
m_archive = re.search(
|
||||||
|
r"creating .+ archive '([^']+)'", clean)
|
||||||
|
if m_archive:
|
||||||
|
current_vm['filename'] = m_archive.group(1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Size for containers (pxar)
|
||||||
|
m_pxar = re.search(
|
||||||
|
r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean)
|
||||||
|
if m_pxar:
|
||||||
|
current_vm['size'] = m_pxar.group(1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Size for QEMU (transferred)
|
||||||
|
m_transfer = re.search(
|
||||||
|
r'transferred\s+([\d.]+\s+\S+)', clean)
|
||||||
|
if m_transfer:
|
||||||
|
current_vm['size'] = m_transfer.group(1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Finished -> duration
|
||||||
|
m_finish = re.match(
|
||||||
|
r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean)
|
||||||
|
if m_finish:
|
||||||
|
current_vm['time'] = m_finish.group(2)
|
||||||
|
current_vm['status'] = 'ok'
|
||||||
|
vms.append(current_vm)
|
||||||
|
current_vm = None
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Error
|
||||||
|
if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'):
|
||||||
|
if current_vm:
|
||||||
|
current_vm['status'] = 'error'
|
||||||
|
|
||||||
|
# Don't forget the last VM if it wasn't finished
|
||||||
|
if current_vm:
|
||||||
|
vms.append(current_vm)
|
||||||
|
|
||||||
|
# ── Extract totals ──
|
||||||
for line in lines:
|
for line in lines:
|
||||||
m_time = re.search(r'Total running time:\s*(.+)', line)
|
m_time = re.search(r'Total running time:\s*(.+)', line)
|
||||||
if m_time:
|
if m_time:
|
||||||
@@ -86,6 +162,50 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
|||||||
if m_size:
|
if m_size:
|
||||||
total_size = m_size.group(1).strip()
|
total_size = m_size.group(1).strip()
|
||||||
|
|
||||||
|
# For PBS: calculate total size if not explicitly stated
|
||||||
|
if not total_size and vms:
|
||||||
|
# Sum individual sizes if they share units
|
||||||
|
sizes_gib = 0.0
|
||||||
|
for vm in vms:
|
||||||
|
s = vm.get('size', '')
|
||||||
|
m = re.match(r'([\d.]+)\s+(.*)', s)
|
||||||
|
if m:
|
||||||
|
val = float(m.group(1))
|
||||||
|
unit = m.group(2).strip().upper()
|
||||||
|
if 'GIB' in unit or 'GB' in unit:
|
||||||
|
sizes_gib += val
|
||||||
|
elif 'MIB' in unit or 'MB' in unit:
|
||||||
|
sizes_gib += val / 1024
|
||||||
|
elif 'TIB' in unit or 'TB' in unit:
|
||||||
|
sizes_gib += val * 1024
|
||||||
|
if sizes_gib > 0:
|
||||||
|
if sizes_gib >= 1024:
|
||||||
|
total_size = f"{sizes_gib / 1024:.3f} TiB"
|
||||||
|
elif sizes_gib >= 1:
|
||||||
|
total_size = f"{sizes_gib:.3f} GiB"
|
||||||
|
else:
|
||||||
|
total_size = f"{sizes_gib * 1024:.3f} MiB"
|
||||||
|
|
||||||
|
# For PBS: calculate total time if not stated
|
||||||
|
if not total_time and vms:
|
||||||
|
total_secs = 0
|
||||||
|
for vm in vms:
|
||||||
|
t = vm.get('time', '')
|
||||||
|
# Parse HH:MM:SS format
|
||||||
|
m = re.match(r'(\d+):(\d+):(\d+)', t)
|
||||||
|
if m:
|
||||||
|
total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3))
|
||||||
|
if total_secs > 0:
|
||||||
|
hours = total_secs // 3600
|
||||||
|
mins = (total_secs % 3600) // 60
|
||||||
|
secs = total_secs % 60
|
||||||
|
if hours:
|
||||||
|
total_time = f"{hours}h {mins}m {secs}s"
|
||||||
|
elif mins:
|
||||||
|
total_time = f"{mins}m {secs}s"
|
||||||
|
else:
|
||||||
|
total_time = f"{secs}s"
|
||||||
|
|
||||||
if not vms and not total_size:
|
if not vms and not total_size:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -113,7 +233,12 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
|
|||||||
if vm.get('time'):
|
if vm.get('time'):
|
||||||
details.append(f"Duration: {vm['time']}")
|
details.append(f"Duration: {vm['time']}")
|
||||||
if vm.get('filename'):
|
if vm.get('filename'):
|
||||||
details.append(f"File: {vm['filename']}")
|
fname = vm['filename']
|
||||||
|
# PBS archives look like "ct/100/2026-..." or "vm/105/2026-..."
|
||||||
|
if re.match(r'^(?:ct|vm)/\d+/', fname):
|
||||||
|
details.append(f"PBS: {fname}")
|
||||||
|
else:
|
||||||
|
details.append(f"File: {fname}")
|
||||||
if details:
|
if details:
|
||||||
parts.append(' | '.join(details))
|
parts.append(' | '.join(details))
|
||||||
parts.append('') # blank line between VMs
|
parts.append('') # blank line between VMs
|
||||||
@@ -338,6 +463,12 @@ TEMPLATES = {
|
|||||||
'group': 'storage',
|
'group': 'storage',
|
||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
|
'storage_unavailable': {
|
||||||
|
'title': '{hostname}: Storage unavailable - {storage_name}',
|
||||||
|
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
|
||||||
|
'group': 'storage',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
'load_high': {
|
'load_high': {
|
||||||
'title': '{hostname}: High system load ({value})',
|
'title': '{hostname}: High system load ({value})',
|
||||||
'body': 'System load average: {value} on {cores} cores.\n{details}',
|
'body': 'System load average: {value} on {cores} cores.\n{details}',
|
||||||
|
|||||||
Reference in New Issue
Block a user