mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 08:56:21 +00:00
Update notification service
This commit is contained in:
@@ -324,13 +324,6 @@ class HealthMonitor:
|
||||
Returns JSON structure with ALL 10 categories always present.
|
||||
Now includes persistent error tracking.
|
||||
"""
|
||||
# Run cleanup on every status check so stale errors are auto-resolved
|
||||
# using the user-configured Suppression Duration (single source of truth).
|
||||
try:
|
||||
health_persistence.cleanup_old_errors()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
active_errors = health_persistence.get_active_errors()
|
||||
# No need to create persistent_issues dict here, it's implicitly handled by the checks
|
||||
|
||||
@@ -828,20 +821,8 @@ class HealthMonitor:
|
||||
issues = []
|
||||
storage_details = {}
|
||||
|
||||
# Check disk usage and mount status for important mounts.
|
||||
# We detect actual mountpoints dynamically rather than hard-coding.
|
||||
critical_mounts = set()
|
||||
critical_mounts.add('/')
|
||||
try:
|
||||
for part in psutil.disk_partitions(all=False):
|
||||
mp = part.mountpoint
|
||||
# Include standard system mounts and PVE storage
|
||||
if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
|
||||
mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
|
||||
critical_mounts.add(mp)
|
||||
except Exception:
|
||||
pass
|
||||
critical_mounts = sorted(critical_mounts)
|
||||
# Check disk usage and mount status first for critical mounts
|
||||
critical_mounts = ['/']
|
||||
|
||||
for mount_point in critical_mounts:
|
||||
try:
|
||||
@@ -876,32 +857,9 @@ class HealthMonitor:
|
||||
# Check filesystem usage only if not already flagged as critical
|
||||
if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
|
||||
fs_status = self._check_filesystem(mount_point)
|
||||
error_key = f'disk_space_{mount_point}'
|
||||
if fs_status['status'] != 'OK':
|
||||
issues.append(f"{mount_point}: {fs_status['reason']}")
|
||||
storage_details[mount_point] = fs_status
|
||||
# Record persistent error for notifications
|
||||
usage = psutil.disk_usage(mount_point)
|
||||
avail_gb = usage.free / (1024**3)
|
||||
if avail_gb >= 1:
|
||||
avail_str = f"{avail_gb:.1f} GiB"
|
||||
else:
|
||||
avail_str = f"{usage.free / (1024**2):.0f} MiB"
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='disk',
|
||||
severity=fs_status['status'],
|
||||
reason=f'{mount_point}: {fs_status["reason"]}',
|
||||
details={
|
||||
'mount': mount_point,
|
||||
'used': str(round(usage.percent, 1)),
|
||||
'available': avail_str,
|
||||
'dismissable': False,
|
||||
}
|
||||
)
|
||||
else:
|
||||
# Space recovered -- clear any previous alert
|
||||
health_persistence.clear_error(error_key)
|
||||
except Exception:
|
||||
pass # Silently skip if mountpoint check fails
|
||||
|
||||
@@ -1094,67 +1052,16 @@ class HealthMonitor:
|
||||
|
||||
return storages
|
||||
|
||||
def _resolve_ata_to_disk(self, ata_port: str) -> str:
    """Resolve an ATA port name (e.g. 'ata8') to a block device (e.g. 'sda').

    Two strategies are tried in order:

    1. Walk the sysfs tree behind ``/sys/class/ata_port/<ata_port>`` looking
       for a ``block`` directory and return the first device listed in it.
    2. Parse ``dmesg --notime`` for ``ataN: SATA link`` and, from that line
       onwards, for ``sd H:x:x:x: [sdX]`` where ``H`` is assumed to be
       ``N - 1``.

    Args:
        ata_port: Name as it appears in kernel messages, e.g. ``'ata8'``.

    Returns:
        The block device name (without ``/dev/``) when it can be resolved.
        Otherwise ``ata_port`` is returned unchanged; this includes empty
        input and names that are not of the form ``ata<digits>``.
    """
    if not ata_port or not ata_port.startswith('ata'):
        return ata_port

    # Only plain "ata<digits>" names can be mapped.  Rejecting anything else
    # here keeps int() below from raising ValueError on e.g. 'ata8.00'.
    port_match = re.fullmatch(r'ata(\d+)', ata_port)
    if port_match is None:
        return ata_port
    port_num = int(port_match.group(1))

    # Method 1: walk sysfs from the ATA port device down to its block device.
    try:
        ata_path = f'/sys/class/ata_port/{ata_port}'
        if os.path.exists(ata_path):
            device_path = os.path.realpath(ata_path)
            search_root = os.path.dirname(device_path)
            # NOTE(review): the class symlink is expected to resolve to
            # .../ataX/ata_port/ataX while the disk sits under
            # .../ataX/hostY/targetY:0:0/Y:0:0:0/block/sdZ, so the walk
            # starts one level above the 'ata_port' directory.  Confirm
            # against the target kernel's sysfs layout.
            if os.path.basename(search_root) == 'ata_port':
                search_root = os.path.dirname(search_root)
            for root, dirs, _files in os.walk(search_root):
                if 'block' in dirs:
                    # Sorted so the result does not depend on listdir order.
                    devs = sorted(os.listdir(os.path.join(root, 'block')))
                    if devs:
                        return devs[0]  # e.g. 'sda'
    except OSError:
        pass

    # Method 2: parse dmesg for the ATA link message and the matching sd line.
    try:
        result = subprocess.run(
            ['dmesg', '--notime'],
            capture_output=True, text=True, errors='replace', timeout=2
        )
    except (OSError, subprocess.TimeoutExpired):
        return ata_port

    if result.returncode != 0:
        return ata_port

    # The ATA port number typically maps to SCSI host (N - 1).
    host_num = port_num - 1
    link_re = re.compile(rf'\b{re.escape(ata_port)}:\s+SATA link')
    disk_re = re.compile(rf'sd\s+{host_num}:\d+:\d+:\d+:\s+\[(\w+)\]')

    link_seen = False
    for line in result.stdout.split('\n'):
        if link_re.search(line):
            link_seen = True
        # Only accept an sd line once the link message has been seen.
        if link_seen:
            disk_match = disk_re.search(line)
            if disk_match:
                return disk_match.group(1)

    return ata_port  # Return original if resolution fails
|
||||
|
||||
def _check_disks_optimized(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Disk I/O error check -- the SINGLE source of truth for disk errors.
|
||||
|
||||
Reads dmesg for I/O/ATA/SCSI errors, counts per device, records in
|
||||
health_persistence, and returns status for the health dashboard.
|
||||
Resolves ATA controller names (ata8) to physical disks (sda).
|
||||
Optimized disk check - always returns status.
|
||||
Checks dmesg for I/O errors and SMART status.
|
||||
NOTE: This function is now largely covered by _check_storage_optimized,
|
||||
but kept for potential specific disk-level reporting if needed.
|
||||
Currently, its primary function is to detect recent I/O errors.
|
||||
"""
|
||||
current_time = time.time()
|
||||
disk_results = {} # Single dict for both WARNING and CRITICAL
|
||||
disk_issues = {}
|
||||
|
||||
try:
|
||||
# Check dmesg for I/O errors in the last 5 minutes
|
||||
@@ -1165,52 +1072,17 @@ class HealthMonitor:
|
||||
timeout=2
|
||||
)
|
||||
|
||||
# Collect a sample line per device for richer error messages
|
||||
disk_samples = {}
|
||||
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.split('\n'):
|
||||
line_lower = line.lower()
|
||||
# Detect various disk error formats
|
||||
is_disk_error = any(kw in line_lower for kw in [
|
||||
'i/o error', 'scsi error', 'medium error',
|
||||
'failed command:', 'exception emask',
|
||||
])
|
||||
ata_match = re.search(r'(ata\d+)[\.\d]*:.*(?:error|failed|exception)', line_lower)
|
||||
if ata_match:
|
||||
is_disk_error = True
|
||||
|
||||
if is_disk_error:
|
||||
# Extract device from multiple formats
|
||||
raw_device = None
|
||||
for dev_re in [
|
||||
r'dev\s+(sd[a-z]+)', # dev sdb
|
||||
r'\[(sd[a-z]+)\]', # [sda]
|
||||
r'/dev/(sd[a-z]+)', # /dev/sda
|
||||
r'(nvme\d+n\d+)', # nvme0n1
|
||||
r'device\s+(sd[a-z]+\d*)', # device sda1
|
||||
r'(ata\d+)', # ata8 (ATA controller)
|
||||
]:
|
||||
dm = re.search(dev_re, line)
|
||||
if dm:
|
||||
raw_device = dm.group(1)
|
||||
break
|
||||
|
||||
if raw_device:
|
||||
# Resolve ATA port to physical disk name
|
||||
if raw_device.startswith('ata'):
|
||||
resolved = self._resolve_ata_to_disk(raw_device)
|
||||
disk_name = resolved
|
||||
else:
|
||||
disk_name = raw_device.rstrip('0123456789') if raw_device.startswith('sd') else raw_device
|
||||
|
||||
if any(keyword in line_lower for keyword in ['i/o error', 'ata error', 'scsi error', 'medium error']):
|
||||
# Try to extract disk name
|
||||
disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line)
|
||||
if disk_match:
|
||||
disk_name = disk_match.group(1)
|
||||
self.io_error_history[disk_name].append(current_time)
|
||||
if disk_name not in disk_samples:
|
||||
# Clean the sample: strip dmesg timestamp prefix
|
||||
clean = re.sub(r'^\[.*?\]\s*', '', line.strip())
|
||||
disk_samples[disk_name] = clean[:200]
|
||||
|
||||
# Clean old history and evaluate per-disk status
|
||||
# Clean old history (keep errors from the last 5 minutes)
|
||||
for disk in list(self.io_error_history.keys()):
|
||||
self.io_error_history[disk] = [
|
||||
t for t in self.io_error_history[disk]
|
||||
@@ -1218,67 +1090,57 @@ class HealthMonitor:
|
||||
]
|
||||
|
||||
error_count = len(self.io_error_history[disk])
|
||||
error_key = f'disk_{disk}'
|
||||
sample = disk_samples.get(disk, '')
|
||||
display = f'/dev/{disk}' if not disk.startswith('/') else disk
|
||||
|
||||
# Report based on recent error count
|
||||
if error_count >= 3:
|
||||
error_key = f'disk_{disk}'
|
||||
severity = 'CRITICAL'
|
||||
reason = f'{display}: {error_count} I/O errors in 5 min'
|
||||
if sample:
|
||||
reason += f'\n{sample}'
|
||||
reason = f'{error_count} I/O errors in 5 minutes'
|
||||
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='disks',
|
||||
severity=severity,
|
||||
reason=reason,
|
||||
details={'disk': disk, 'device': display,
|
||||
'error_count': error_count,
|
||||
'sample': sample, 'dismissable': False}
|
||||
details={'disk': disk, 'error_count': error_count, 'dismissable': False}
|
||||
)
|
||||
disk_results[display] = {
|
||||
|
||||
disk_details[disk] = {
|
||||
'status': severity,
|
||||
'reason': reason,
|
||||
'device': disk,
|
||||
'error_count': error_count,
|
||||
'dismissable': False,
|
||||
'dismissable': False
|
||||
}
|
||||
elif error_count >= 1:
|
||||
error_key = f'disk_{disk}'
|
||||
severity = 'WARNING'
|
||||
reason = f'{display}: {error_count} I/O error(s) in 5 min'
|
||||
if sample:
|
||||
reason += f'\n{sample}'
|
||||
reason = f'{error_count} I/O error(s) in 5 minutes'
|
||||
|
||||
rec_result = health_persistence.record_error(
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='disks',
|
||||
severity=severity,
|
||||
reason=reason,
|
||||
details={'disk': disk, 'device': display,
|
||||
'error_count': error_count,
|
||||
'sample': sample, 'dismissable': True}
|
||||
details={'disk': disk, 'error_count': error_count, 'dismissable': True}
|
||||
)
|
||||
if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
|
||||
disk_results[display] = {
|
||||
'status': severity,
|
||||
'reason': reason,
|
||||
'device': disk,
|
||||
'error_count': error_count,
|
||||
'dismissable': True,
|
||||
}
|
||||
|
||||
disk_issues[f'/dev/{disk}'] = {
|
||||
'status': severity,
|
||||
'reason': reason,
|
||||
'dismissable': True
|
||||
}
|
||||
else:
|
||||
error_key = f'disk_{disk}'
|
||||
health_persistence.resolve_error(error_key, 'Disk errors cleared')
|
||||
|
||||
if not disk_results:
|
||||
if not disk_issues:
|
||||
return {'status': 'OK'}
|
||||
|
||||
has_critical = any(d.get('status') == 'CRITICAL' for d in disk_results.values())
|
||||
has_critical = any(d.get('status') == 'CRITICAL' for d in disk_issues.values())
|
||||
|
||||
return {
|
||||
'status': 'CRITICAL' if has_critical else 'WARNING',
|
||||
'reason': f"{len(disk_results)} disk(s) with recent errors",
|
||||
'details': disk_results
|
||||
'reason': f"{len(disk_issues)} disk(s) with recent errors",
|
||||
'details': disk_issues
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -1489,51 +1351,12 @@ class HealthMonitor:
|
||||
except Exception:
|
||||
return {'status': 'UNKNOWN', 'reason': 'Ping command failed'}
|
||||
|
||||
def _is_vzdump_active(self) -> bool:
    """Check if a vzdump (backup) job is currently running.

    Scans ``/var/log/pve/tasks/active`` for a line containing ``:vzdump:``.

    Returns:
        True if such a line is found; False if none is found or the file
        cannot be opened or read.
    """
    try:
        # errors='replace' keeps a stray non-UTF-8 byte in a task line from
        # raising UnicodeDecodeError, which the OSError handler would miss.
        with open('/var/log/pve/tasks/active', 'r', errors='replace') as f:
            return any(':vzdump:' in line for line in f)
    except OSError:
        return False
|
||||
|
||||
def _resolve_vm_name(self, vmid: str) -> str:
    """Resolve a VMID to the guest name from the PVE config files.

    Looks for ``<vmid>.conf`` under ``/etc/pve/qemu-server`` and then
    ``/etc/pve/lxc`` and returns the value of the first ``hostname:`` or
    ``name:`` line found.

    Args:
        vmid: Numeric guest id, e.g. ``'101'``.

    Returns:
        The guest name, or ``''`` when ``vmid`` is empty or non-numeric, no
        config file is readable, or no name line is present.
    """
    if not vmid:
        return ''

    vmid_str = str(vmid)
    # The id is interpolated into a file path; accepting digits only keeps
    # values such as '../x' from escaping the config directories.
    if not vmid_str.isdigit():
        return ''

    for base in ('/etc/pve/qemu-server', '/etc/pve/lxc'):
        conf = os.path.join(base, f'{vmid_str}.conf')
        try:
            with open(conf, errors='replace') as f:
                for line in f:
                    # NOTE(review): assumes a '[...]' header starts a snapshot
                    # or pending section whose name: entry should not be used
                    # for the current guest - confirm against the PVE format.
                    if line.startswith('['):
                        break
                    if line.startswith(('hostname:', 'name:')):
                        return line.split(':', 1)[1].strip()
        except OSError:
            continue
    return ''
|
||||
|
||||
def _check_vms_cts_optimized(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Optimized VM/CT check - detects qmp failures and startup errors from logs.
|
||||
Improved detection of container and VM errors from journalctl.
|
||||
"""
|
||||
try:
|
||||
# First: auto-resolve any persisted VM/CT errors where the guest
|
||||
# is now running. This clears stale "Failed to start" / QMP
|
||||
# errors that are no longer relevant.
|
||||
try:
|
||||
active_vm_errors = health_persistence.get_active_errors('vms')
|
||||
for err in active_vm_errors:
|
||||
details = err.get('details') or {}
|
||||
vmid = details.get('id', '')
|
||||
if vmid:
|
||||
health_persistence.check_vm_running(vmid)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
issues = []
|
||||
vm_details = {}
|
||||
|
||||
@@ -1544,28 +1367,20 @@ class HealthMonitor:
|
||||
timeout=3
|
||||
)
|
||||
|
||||
# Check if vzdump is running -- QMP timeouts during backup are normal
|
||||
_vzdump_running = self._is_vzdump_active()
|
||||
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.split('\n'):
|
||||
line_lower = line.lower()
|
||||
|
||||
vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
|
||||
if vm_qmp_match:
|
||||
if _vzdump_running:
|
||||
continue # Normal during backup
|
||||
vmid = vm_qmp_match.group(1)
|
||||
vm_name = self._resolve_vm_name(vmid)
|
||||
display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}"
|
||||
key = f'vm_{vmid}'
|
||||
if key not in vm_details:
|
||||
issues.append(f'{display}: QMP communication issue')
|
||||
issues.append(f'VM {vmid}: Communication issue')
|
||||
vm_details[key] = {
|
||||
'status': 'WARNING',
|
||||
'reason': f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}',
|
||||
'reason': 'QMP command timeout',
|
||||
'id': vmid,
|
||||
'vmname': vm_name,
|
||||
'type': 'VM'
|
||||
}
|
||||
continue
|
||||
@@ -1586,15 +1401,11 @@ class HealthMonitor:
|
||||
else:
|
||||
reason = 'Container error'
|
||||
|
||||
ct_name = self._resolve_vm_name(ctid)
|
||||
display = f"CT {ctid} ({ct_name})" if ct_name else f"CT {ctid}"
|
||||
full_reason = f'{display}: {reason}\n{line.strip()[:200]}'
|
||||
issues.append(f'{display}: {reason}')
|
||||
issues.append(f'CT {ctid}: {reason}')
|
||||
vm_details[key] = {
|
||||
'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL',
|
||||
'reason': full_reason,
|
||||
'reason': reason,
|
||||
'id': ctid,
|
||||
'vmname': ct_name,
|
||||
'type': 'CT'
|
||||
}
|
||||
continue
|
||||
@@ -1629,15 +1440,11 @@ class HealthMonitor:
|
||||
vmid = id_match.group(1)
|
||||
key = f'vmct_{vmid}'
|
||||
if key not in vm_details:
|
||||
vm_name = self._resolve_vm_name(vmid)
|
||||
display = f"VM/CT {vmid} ({vm_name})" if vm_name else f"VM/CT {vmid}"
|
||||
full_reason = f'{display}: Failed to start\n{line.strip()[:200]}'
|
||||
issues.append(f'{display}: Failed to start')
|
||||
issues.append(f'VM/CT {vmid}: Failed to start')
|
||||
vm_details[key] = {
|
||||
'status': 'CRITICAL',
|
||||
'reason': full_reason,
|
||||
'reason': 'Failed to start',
|
||||
'id': vmid,
|
||||
'vmname': vm_name,
|
||||
'type': 'VM/CT'
|
||||
}
|
||||
|
||||
@@ -1697,38 +1504,31 @@ class HealthMonitor:
|
||||
timeout=3
|
||||
)
|
||||
|
||||
_vzdump_running = self._is_vzdump_active()
|
||||
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.split('\n'):
|
||||
line_lower = line.lower()
|
||||
|
||||
# VM QMP errors (skip during active backup -- normal behavior)
|
||||
# VM QMP errors
|
||||
vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
|
||||
if vm_qmp_match:
|
||||
if _vzdump_running:
|
||||
continue # Normal during backup
|
||||
vmid = vm_qmp_match.group(1)
|
||||
vm_name = self._resolve_vm_name(vmid)
|
||||
display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}"
|
||||
error_key = f'vm_{vmid}'
|
||||
if error_key not in vm_details:
|
||||
rec_result = health_persistence.record_error(
|
||||
# Record persistent error
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='vms',
|
||||
severity='WARNING',
|
||||
reason=f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}',
|
||||
details={'id': vmid, 'vmname': vm_name, 'type': 'VM'}
|
||||
reason='QMP command timeout',
|
||||
details={'id': vmid, 'type': 'VM'}
|
||||
)
|
||||
if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
|
||||
issues.append(f'{display}: QMP communication issue')
|
||||
vm_details[error_key] = {
|
||||
'status': 'WARNING',
|
||||
'reason': f'{display}: QMP command failed or timed out',
|
||||
'id': vmid,
|
||||
'vmname': vm_name,
|
||||
'type': 'VM'
|
||||
}
|
||||
issues.append(f'VM {vmid}: Communication issue')
|
||||
vm_details[error_key] = {
|
||||
'status': 'WARNING',
|
||||
'reason': 'QMP command timeout',
|
||||
'id': vmid,
|
||||
'type': 'VM'
|
||||
}
|
||||
continue
|
||||
|
||||
# Container errors (including startup issues via vzstart)
|
||||
@@ -1748,21 +1548,20 @@ class HealthMonitor:
|
||||
reason = 'Startup error'
|
||||
|
||||
# Record persistent error
|
||||
rec_result = health_persistence.record_error(
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='vms',
|
||||
severity='WARNING',
|
||||
reason=reason,
|
||||
details={'id': ctid, 'type': 'CT'}
|
||||
)
|
||||
if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
|
||||
issues.append(f'CT {ctid}: {reason}')
|
||||
vm_details[error_key] = {
|
||||
'status': 'WARNING',
|
||||
'reason': reason,
|
||||
'id': ctid,
|
||||
'type': 'CT'
|
||||
}
|
||||
issues.append(f'CT {ctid}: {reason}')
|
||||
vm_details[error_key] = {
|
||||
'status': 'WARNING',
|
||||
'reason': reason,
|
||||
'id': ctid,
|
||||
'type': 'CT'
|
||||
}
|
||||
|
||||
# Generic failed to start for VMs and CTs
|
||||
if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
|
||||
@@ -1787,28 +1586,22 @@ class HealthMonitor:
|
||||
vm_type = 'VM/CT'
|
||||
|
||||
if error_key not in vm_details:
|
||||
vm_name = self._resolve_vm_name(vmid_ctid)
|
||||
display = f"{vm_type} {vmid_ctid}"
|
||||
if vm_name:
|
||||
display = f"{vm_type} {vmid_ctid} ({vm_name})"
|
||||
reason = f'{display}: Failed to start\n{line.strip()[:200]}'
|
||||
reason = 'Failed to start'
|
||||
# Record persistent error
|
||||
rec_result = health_persistence.record_error(
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='vms',
|
||||
severity='CRITICAL',
|
||||
reason=reason,
|
||||
details={'id': vmid_ctid, 'vmname': vm_name, 'type': vm_type}
|
||||
details={'id': vmid_ctid, 'type': vm_type}
|
||||
)
|
||||
if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
|
||||
issues.append(f'{display}: Failed to start')
|
||||
vm_details[error_key] = {
|
||||
'status': 'CRITICAL',
|
||||
'reason': reason,
|
||||
'id': vmid_ctid,
|
||||
'vmname': vm_name,
|
||||
'type': vm_type
|
||||
}
|
||||
issues.append(f'{vm_type} {vmid_ctid}: {reason}')
|
||||
vm_details[error_key] = {
|
||||
'status': 'CRITICAL',
|
||||
'reason': reason,
|
||||
'id': vmid_ctid,
|
||||
'type': vm_type
|
||||
}
|
||||
|
||||
# Build checks dict from vm_details
|
||||
checks = {}
|
||||
@@ -1899,23 +1692,16 @@ class HealthMonitor:
|
||||
if failed_services:
|
||||
reason = f'Services inactive: {", ".join(failed_services)}'
|
||||
|
||||
# Record each failed service in persistence, respecting dismiss
|
||||
active_failed = []
|
||||
# Record each failed service in persistence
|
||||
for svc in failed_services:
|
||||
error_key = f'pve_service_{svc}'
|
||||
rec_result = health_persistence.record_error(
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='pve_services',
|
||||
severity='CRITICAL',
|
||||
reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}',
|
||||
details={'service': svc, 'state': service_details.get(svc, 'inactive')}
|
||||
)
|
||||
if rec_result and rec_result.get('type') == 'skipped_acknowledged':
|
||||
# Mark as dismissed in checks for frontend
|
||||
if svc in checks:
|
||||
checks[svc]['dismissed'] = True
|
||||
else:
|
||||
active_failed.append(svc)
|
||||
|
||||
# Auto-clear services that recovered
|
||||
for svc in services_to_check:
|
||||
@@ -1924,21 +1710,10 @@ class HealthMonitor:
|
||||
if health_persistence.is_error_active(error_key):
|
||||
health_persistence.clear_error(error_key)
|
||||
|
||||
# If all failed services are dismissed, return OK
|
||||
if not active_failed:
|
||||
return {
|
||||
'status': 'OK',
|
||||
'reason': None,
|
||||
'failed': [],
|
||||
'is_cluster': is_cluster,
|
||||
'services_checked': len(services_to_check),
|
||||
'checks': checks
|
||||
}
|
||||
|
||||
return {
|
||||
'status': 'CRITICAL',
|
||||
'reason': f'Services inactive: {", ".join(active_failed)}',
|
||||
'failed': active_failed,
|
||||
'reason': reason,
|
||||
'failed': failed_services,
|
||||
'is_cluster': is_cluster,
|
||||
'services_checked': len(services_to_check),
|
||||
'checks': checks
|
||||
@@ -2096,8 +1871,7 @@ class HealthMonitor:
|
||||
self.persistent_log_patterns[pattern] = {
|
||||
'count': 1,
|
||||
'first_seen': current_time,
|
||||
'last_seen': current_time,
|
||||
'sample': line.strip()[:200], # Original line for display
|
||||
'last_seen': current_time
|
||||
}
|
||||
|
||||
for line in previous_lines:
|
||||
@@ -2129,18 +1903,6 @@ class HealthMonitor:
|
||||
if recent_count >= 5 and recent_count >= prev_count * 4:
|
||||
spike_errors[pattern] = recent_count
|
||||
|
||||
# Helper: get human-readable samples from normalized patterns
|
||||
def _get_samples(error_dict, max_items=3):
|
||||
"""Return list of readable sample lines for error patterns."""
|
||||
samples = []
|
||||
for pattern in list(error_dict.keys())[:max_items]:
|
||||
pdata = self.persistent_log_patterns.get(pattern, {})
|
||||
sample = pdata.get('sample', pattern)
|
||||
# Trim timestamp prefix if present (e.g. "Feb 27 16:03:35 host ")
|
||||
clean = re.sub(r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample)
|
||||
samples.append(clean[:120])
|
||||
return samples
|
||||
|
||||
persistent_errors = {}
|
||||
for pattern, data in self.persistent_log_patterns.items():
|
||||
time_span = current_time - data['first_seen']
|
||||
@@ -2151,16 +1913,12 @@ class HealthMonitor:
|
||||
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||
error_key = f'log_persistent_{pattern_hash}'
|
||||
if not health_persistence.is_error_active(error_key, category='logs'):
|
||||
# Use the original sample line for the notification,
|
||||
# not the normalized pattern (which has IDs replaced).
|
||||
sample = data.get('sample', pattern)
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='logs',
|
||||
severity='WARNING',
|
||||
reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
|
||||
details={'pattern': pattern, 'sample': sample,
|
||||
'dismissable': True, 'occurrences': data['count']}
|
||||
reason=f'Persistent error pattern detected: {pattern[:80]}',
|
||||
details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']}
|
||||
)
|
||||
|
||||
patterns_to_remove = [
|
||||
@@ -2182,33 +1940,26 @@ class HealthMonitor:
|
||||
reason = f'Critical error detected: {representative_error[:100]}'
|
||||
elif cascade_count > 0:
|
||||
status = 'WARNING'
|
||||
samples = _get_samples(cascading_errors, 3)
|
||||
reason = f'Error cascade ({cascade_count} patterns repeating):\n' + '\n'.join(f' - {s}' for s in samples)
|
||||
reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥15 times in 3min'
|
||||
elif spike_count > 0:
|
||||
status = 'WARNING'
|
||||
samples = _get_samples(spike_errors, 3)
|
||||
reason = f'Error spike ({spike_count} patterns with 4x increase):\n' + '\n'.join(f' - {s}' for s in samples)
|
||||
reason = f'Error spike detected: {spike_count} pattern(s) increased 4x'
|
||||
elif persistent_count > 0:
|
||||
status = 'WARNING'
|
||||
samples = _get_samples(persistent_errors, 3)
|
||||
reason = f'Persistent errors ({persistent_count} patterns over 15+ min):\n' + '\n'.join(f' - {s}' for s in samples)
|
||||
reason = f'Persistent errors: {persistent_count} pattern(s) recurring over 15+ minutes'
|
||||
else:
|
||||
# No significant issues found
|
||||
status = 'OK'
|
||||
reason = None
|
||||
|
||||
# Record/clear persistent errors for each log sub-check so Dismiss works
|
||||
cascade_samples = _get_samples(cascading_errors, 2) if cascade_count else []
|
||||
spike_samples = _get_samples(spike_errors, 2) if spike_count else []
|
||||
persist_samples = _get_samples(persistent_errors, 2) if persistent_count else []
|
||||
|
||||
log_sub_checks = {
|
||||
'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING',
|
||||
'reason': f'{cascade_count} pattern(s) repeating >=15 times:\n' + '\n'.join(f' - {s}' for s in cascade_samples) if cascade_count else ''},
|
||||
'reason': f'{cascade_count} pattern(s) repeating >=15 times'},
|
||||
'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING',
|
||||
'reason': f'{spike_count} pattern(s) with 4x increase:\n' + '\n'.join(f' - {s}' for s in spike_samples) if spike_count else ''},
|
||||
'reason': f'{spike_count} pattern(s) with 4x increase'},
|
||||
'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING',
|
||||
'reason': f'{persistent_count} recurring pattern(s) over 15+ min:\n' + '\n'.join(f' - {s}' for s in persist_samples) if persistent_count else ''},
|
||||
'reason': f'{persistent_count} recurring pattern(s) over 15+ min'},
|
||||
'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL',
|
||||
'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False},
|
||||
}
|
||||
@@ -2584,7 +2335,20 @@ class HealthMonitor:
|
||||
msg = f'{total_banned} IP(s) currently banned by Fail2Ban (jails: {jails_str})'
|
||||
result['status'] = 'WARNING'
|
||||
result['detail'] = msg
|
||||
# Persistence handled by _check_security caller via security_fail2ban key
|
||||
|
||||
# Record in persistence (dismissable)
|
||||
health_persistence.record_error(
|
||||
error_key='fail2ban',
|
||||
category='security',
|
||||
severity='WARNING',
|
||||
reason=msg,
|
||||
details={
|
||||
'banned_count': total_banned,
|
||||
'jails': jails_with_bans,
|
||||
'banned_ips': all_banned_ips[:5],
|
||||
'dismissable': True
|
||||
}
|
||||
)
|
||||
else:
|
||||
result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
|
||||
# Auto-resolve if previously banned IPs are now gone
|
||||
@@ -2692,60 +2456,14 @@ class HealthMonitor:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Persist errors and respect dismiss for each sub-check
|
||||
dismissed_keys = set()
|
||||
security_sub_checks = {
|
||||
'security_login_attempts': checks.get('login_attempts', {}),
|
||||
'security_certificates': checks.get('certificates', {}),
|
||||
'security_uptime': checks.get('uptime', {}),
|
||||
'security_fail2ban': checks.get('fail2ban', {}),
|
||||
}
|
||||
|
||||
for err_key, check_info in security_sub_checks.items():
|
||||
check_status = check_info.get('status', 'OK')
|
||||
if check_status not in ('OK', 'INFO'):
|
||||
is_dismissable = check_info.get('dismissable', True)
|
||||
rec_result = health_persistence.record_error(
|
||||
error_key=err_key,
|
||||
category='security',
|
||||
severity=check_status,
|
||||
reason=check_info.get('detail', ''),
|
||||
details={'dismissable': is_dismissable}
|
||||
)
|
||||
if rec_result and rec_result.get('type') == 'skipped_acknowledged':
|
||||
dismissed_keys.add(err_key)
|
||||
elif health_persistence.is_error_active(err_key):
|
||||
health_persistence.clear_error(err_key)
|
||||
|
||||
# Rebuild issues excluding dismissed sub-checks
|
||||
key_to_check = {
|
||||
'security_login_attempts': 'login_attempts',
|
||||
'security_certificates': 'certificates',
|
||||
'security_uptime': 'uptime',
|
||||
'security_fail2ban': 'fail2ban',
|
||||
}
|
||||
active_issues = []
|
||||
for err_key, check_name in key_to_check.items():
|
||||
if err_key in dismissed_keys:
|
||||
# Mark as dismissed in checks for the frontend
|
||||
if check_name in checks:
|
||||
checks[check_name]['dismissed'] = True
|
||||
continue
|
||||
check_info = checks.get(check_name, {})
|
||||
if check_info.get('status', 'OK') not in ('OK', 'INFO'):
|
||||
active_issues.append(check_info.get('detail', ''))
|
||||
|
||||
# Determine overall security status from non-dismissed issues only
|
||||
if active_issues:
|
||||
has_critical = any(
|
||||
c.get('status') == 'CRITICAL'
|
||||
for k, c in checks.items()
|
||||
if f'security_{k}' not in dismissed_keys
|
||||
)
|
||||
# Determine overall security status
|
||||
if issues:
|
||||
# Check if any sub-check is CRITICAL
|
||||
has_critical = any(c.get('status') == 'CRITICAL' for c in checks.values())
|
||||
overall_status = 'CRITICAL' if has_critical else 'WARNING'
|
||||
return {
|
||||
'status': overall_status,
|
||||
'reason': '; '.join(active_issues[:2]),
|
||||
'reason': '; '.join(issues[:2]),
|
||||
'checks': checks
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user