mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-05-01 03:46:22 +00:00
update notification service
This commit is contained in:
@@ -69,7 +69,7 @@ def acknowledge_error():
|
|||||||
# Use the error's category to clear the correct cache
|
# Use the error's category to clear the correct cache
|
||||||
category = result.get('category', '')
|
category = result.get('category', '')
|
||||||
cache_key_map = {
|
cache_key_map = {
|
||||||
'logs': 'system_logs',
|
'logs': 'logs_analysis',
|
||||||
'pve_services': 'pve_services',
|
'pve_services': 'pve_services',
|
||||||
'updates': 'updates_check',
|
'updates': 'updates_check',
|
||||||
'security': 'security_check',
|
'security': 'security_check',
|
||||||
|
|||||||
@@ -1225,6 +1225,54 @@ class HealthMonitor:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return 'UNKNOWN'
|
return 'UNKNOWN'
|
||||||
|
|
||||||
|
def _check_all_disks_smart(self, fallback: str = 'UNKNOWN') -> str:
|
||||||
|
"""Check SMART health of ALL physical disks.
|
||||||
|
|
||||||
|
Used when an ATA port can't be resolved to a specific /dev/sdX.
|
||||||
|
If ALL disks report PASSED, returns 'PASSED' (errors are transient).
|
||||||
|
If ANY disk reports FAILED, returns 'FAILED'.
|
||||||
|
Otherwise returns the fallback value.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# List all block devices (exclude partitions, loop, zram, dm)
|
||||||
|
result = subprocess.run(
|
||||||
|
['lsblk', '-dnpo', 'NAME,TYPE'],
|
||||||
|
capture_output=True, text=True, timeout=3
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
return fallback
|
||||||
|
|
||||||
|
disks = []
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 2 and parts[1] == 'disk':
|
||||||
|
disks.append(parts[0]) # e.g. /dev/sda
|
||||||
|
|
||||||
|
if not disks:
|
||||||
|
return fallback
|
||||||
|
|
||||||
|
all_passed = True
|
||||||
|
any_failed = False
|
||||||
|
checked = 0
|
||||||
|
|
||||||
|
for dev in disks:
|
||||||
|
health = self._quick_smart_health(dev)
|
||||||
|
if health == 'PASSED':
|
||||||
|
checked += 1
|
||||||
|
elif health == 'FAILED':
|
||||||
|
any_failed = True
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
all_passed = False # Can't confirm this disk
|
||||||
|
|
||||||
|
if any_failed:
|
||||||
|
return 'FAILED'
|
||||||
|
if all_passed and checked > 0:
|
||||||
|
return 'PASSED'
|
||||||
|
return fallback
|
||||||
|
except Exception:
|
||||||
|
return fallback
|
||||||
|
|
||||||
def _check_disks_optimized(self) -> Dict[str, Any]:
|
def _check_disks_optimized(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Disk I/O error check -- the SINGLE source of truth for disk errors.
|
Disk I/O error check -- the SINGLE source of truth for disk errors.
|
||||||
@@ -1240,12 +1288,18 @@ class HealthMonitor:
|
|||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
disk_results = {} # Single dict for both WARNING and CRITICAL
|
disk_results = {} # Single dict for both WARNING and CRITICAL
|
||||||
|
|
||||||
# Common transient ATA patterns that auto-recover and are not real disk failures
|
# Common transient ATA patterns that auto-recover and are not real disk failures.
|
||||||
# 'action 0x0' means the kernel recovered automatically with no action needed
|
# These are bus/controller level events, NOT media errors:
|
||||||
|
# action 0x0 = no action needed (fully recovered)
|
||||||
|
# action 0x6 = hard reset + port reinit (common cable/connector recovery)
|
||||||
|
# SError with BadCRC/Dispar = signal integrity issue (cable, not disk)
|
||||||
|
# Emask 0x10 = ATA bus error (controller/interconnect, not media)
|
||||||
TRANSIENT_PATTERNS = [
|
TRANSIENT_PATTERNS = [
|
||||||
re.compile(r'exception\s+emask.*action\s+0x0', re.IGNORECASE),
|
re.compile(r'exception\s+emask.*action\s+0x[06]', re.IGNORECASE),
|
||||||
re.compile(r'serror.*=.*0x[0-9a-f]+\s*\(', re.IGNORECASE),
|
re.compile(r'serror.*=.*0x[0-9a-f]+\s*\(', re.IGNORECASE),
|
||||||
re.compile(r'SError:.*\{.*\}', re.IGNORECASE),
|
re.compile(r'SError:.*\{.*(?:BadCRC|Dispar|CommWake).*\}', re.IGNORECASE),
|
||||||
|
re.compile(r'emask\s+0x10\s+\(ATA bus error\)', re.IGNORECASE),
|
||||||
|
re.compile(r'failed command:\s*READ FPDMA QUEUED', re.IGNORECASE),
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -1328,9 +1382,32 @@ class HealthMonitor:
|
|||||||
if error_count >= 1:
|
if error_count >= 1:
|
||||||
# Cross-reference with SMART to determine real severity
|
# Cross-reference with SMART to determine real severity
|
||||||
smart_health = self._quick_smart_health(disk)
|
smart_health = self._quick_smart_health(disk)
|
||||||
|
|
||||||
|
# If SMART is UNKNOWN (unresolved ATA port), check ALL
|
||||||
|
# physical disks. If every disk passes SMART, the ATA
|
||||||
|
# errors are transient bus/controller noise.
|
||||||
|
if smart_health == 'UNKNOWN':
|
||||||
|
smart_health = self._check_all_disks_smart(smart_health)
|
||||||
|
|
||||||
smart_ok = smart_health == 'PASSED'
|
smart_ok = smart_health == 'PASSED'
|
||||||
|
|
||||||
if smart_ok:
|
# Transient-only errors (e.g. SError with auto-recovery)
|
||||||
|
# are always INFO regardless of SMART
|
||||||
|
if all_transient:
|
||||||
|
reason = f'{display}: {error_count} transient ATA event(s) in 5 min (auto-recovered)'
|
||||||
|
if sample:
|
||||||
|
reason += f'\n{sample}'
|
||||||
|
health_persistence.resolve_error(error_key, 'Transient ATA events, auto-recovered')
|
||||||
|
disk_results[display] = {
|
||||||
|
'status': 'INFO',
|
||||||
|
'reason': reason,
|
||||||
|
'device': disk,
|
||||||
|
'error_count': error_count,
|
||||||
|
'smart_status': smart_health,
|
||||||
|
'dismissable': False,
|
||||||
|
'error_key': error_key,
|
||||||
|
}
|
||||||
|
elif smart_ok:
|
||||||
# SMART is healthy -> dmesg errors are informational only
|
# SMART is healthy -> dmesg errors are informational only
|
||||||
# The disk is fine; these are transient controller/bus events
|
# The disk is fine; these are transient controller/bus events
|
||||||
reason = f'{display}: {error_count} I/O event(s) in 5 min (SMART: OK)'
|
reason = f'{display}: {error_count} I/O event(s) in 5 min (SMART: OK)'
|
||||||
@@ -1349,11 +1426,10 @@ class HealthMonitor:
|
|||||||
'dismissable': False,
|
'dismissable': False,
|
||||||
'error_key': error_key,
|
'error_key': error_key,
|
||||||
}
|
}
|
||||||
else:
|
elif smart_health == 'FAILED':
|
||||||
# SMART is FAILED or UNKNOWN with I/O errors -> real problem
|
# SMART confirms a real disk failure
|
||||||
severity = 'CRITICAL' if error_count >= 3 else 'WARNING'
|
severity = 'CRITICAL'
|
||||||
smart_note = f'SMART: {smart_health}' if smart_health != 'UNKNOWN' else 'SMART: check manually'
|
reason = f'{display}: {error_count} I/O error(s) in 5 min (SMART: FAILED)'
|
||||||
reason = f'{display}: {error_count} I/O error(s) in 5 min ({smart_note})'
|
|
||||||
if sample:
|
if sample:
|
||||||
reason += f'\n{sample}'
|
reason += f'\n{sample}'
|
||||||
|
|
||||||
@@ -1365,7 +1441,7 @@ class HealthMonitor:
|
|||||||
details={'disk': disk, 'device': display,
|
details={'disk': disk, 'device': display,
|
||||||
'error_count': error_count,
|
'error_count': error_count,
|
||||||
'smart_status': smart_health,
|
'smart_status': smart_health,
|
||||||
'sample': sample, 'dismissable': severity != 'CRITICAL'}
|
'sample': sample, 'dismissable': False}
|
||||||
)
|
)
|
||||||
disk_results[display] = {
|
disk_results[display] = {
|
||||||
'status': severity,
|
'status': severity,
|
||||||
@@ -1373,7 +1449,35 @@ class HealthMonitor:
|
|||||||
'device': disk,
|
'device': disk,
|
||||||
'error_count': error_count,
|
'error_count': error_count,
|
||||||
'smart_status': smart_health,
|
'smart_status': smart_health,
|
||||||
'dismissable': severity != 'CRITICAL',
|
'dismissable': False,
|
||||||
|
'error_key': error_key,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# SMART is genuinely UNKNOWN (no disk resolved, no
|
||||||
|
# smartctl at all) -- treat as WARNING, not CRITICAL.
|
||||||
|
# These are likely transient and will auto-resolve.
|
||||||
|
severity = 'WARNING'
|
||||||
|
reason = f'{display}: {error_count} I/O event(s) in 5 min (SMART: unavailable)'
|
||||||
|
if sample:
|
||||||
|
reason += f'\n{sample}'
|
||||||
|
|
||||||
|
health_persistence.record_error(
|
||||||
|
error_key=error_key,
|
||||||
|
category='disks',
|
||||||
|
severity=severity,
|
||||||
|
reason=reason,
|
||||||
|
details={'disk': disk, 'device': display,
|
||||||
|
'error_count': error_count,
|
||||||
|
'smart_status': smart_health,
|
||||||
|
'sample': sample, 'dismissable': True}
|
||||||
|
)
|
||||||
|
disk_results[display] = {
|
||||||
|
'status': severity,
|
||||||
|
'reason': reason,
|
||||||
|
'device': disk,
|
||||||
|
'error_count': error_count,
|
||||||
|
'smart_status': smart_health,
|
||||||
|
'dismissable': True,
|
||||||
'error_key': error_key,
|
'error_key': error_key,
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
@@ -1816,11 +1920,13 @@ class HealthMonitor:
|
|||||||
# Get persistent errors first
|
# Get persistent errors first
|
||||||
persistent_errors = health_persistence.get_active_errors('vms')
|
persistent_errors = health_persistence.get_active_errors('vms')
|
||||||
|
|
||||||
# Check if any persistent VMs/CTs have started
|
# Check if any persistent VMs/CTs have started or were dismissed
|
||||||
for error in persistent_errors:
|
for error in persistent_errors:
|
||||||
error_key = error['error_key']
|
error_key = error['error_key']
|
||||||
if error_key.startswith('vm_') or error_key.startswith('ct_'):
|
is_acknowledged = error.get('acknowledged') == 1
|
||||||
vm_id = error_key.split('_')[1]
|
|
||||||
|
if error_key.startswith(('vm_', 'ct_', 'vmct_')):
|
||||||
|
vm_id = error_key.split('_', 1)[1]
|
||||||
# Check if VM is running using persistence helper
|
# Check if VM is running using persistence helper
|
||||||
if health_persistence.check_vm_running(vm_id):
|
if health_persistence.check_vm_running(vm_id):
|
||||||
continue # Error auto-resolved if VM is now running
|
continue # Error auto-resolved if VM is now running
|
||||||
@@ -1831,9 +1937,12 @@ class HealthMonitor:
|
|||||||
'reason': error['reason'],
|
'reason': error['reason'],
|
||||||
'id': error.get('details', {}).get('id', 'unknown'),
|
'id': error.get('details', {}).get('id', 'unknown'),
|
||||||
'type': error.get('details', {}).get('type', 'VM/CT'),
|
'type': error.get('details', {}).get('type', 'VM/CT'),
|
||||||
'first_seen': error['first_seen']
|
'first_seen': error['first_seen'],
|
||||||
|
'dismissed': is_acknowledged,
|
||||||
}
|
}
|
||||||
issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}")
|
# Only add to issues if not dismissed
|
||||||
|
if not is_acknowledged:
|
||||||
|
issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}")
|
||||||
|
|
||||||
# Check for new errors in logs
|
# Check for new errors in logs
|
||||||
# Using 'warning' priority to catch potential startup issues
|
# Using 'warning' priority to catch potential startup issues
|
||||||
@@ -1958,24 +2067,31 @@ class HealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Build checks dict from vm_details
|
# Build checks dict from vm_details
|
||||||
|
# 'key' is the persistence error_key (e.g. 'qmp_110', 'ct_101', 'vm_110')
|
||||||
checks = {}
|
checks = {}
|
||||||
for key, val in vm_details.items():
|
for key, val in vm_details.items():
|
||||||
vm_label = f"{val.get('type', 'VM')} {val.get('id', key)}"
|
vm_label = f"{val.get('type', 'VM')} {val.get('id', key)}"
|
||||||
|
is_dismissed = val.get('dismissed', False)
|
||||||
checks[vm_label] = {
|
checks[vm_label] = {
|
||||||
'status': val.get('status', 'WARNING'),
|
'status': 'INFO' if is_dismissed else val.get('status', 'WARNING'),
|
||||||
'detail': val.get('reason', 'Error'),
|
'detail': val.get('reason', 'Error'),
|
||||||
'dismissable': True,
|
'dismissable': True,
|
||||||
'error_key': vm_label
|
'dismissed': is_dismissed,
|
||||||
|
'error_key': key # Must match the persistence DB key
|
||||||
}
|
}
|
||||||
|
|
||||||
if not issues:
|
if not issues:
|
||||||
checks['qmp_communication'] = {'status': 'OK', 'detail': 'No QMP timeouts detected'}
|
# No active (non-dismissed) issues
|
||||||
checks['container_startup'] = {'status': 'OK', 'detail': 'No container startup errors'}
|
if not checks:
|
||||||
checks['vm_startup'] = {'status': 'OK', 'detail': 'No VM startup failures'}
|
checks['qmp_communication'] = {'status': 'OK', 'detail': 'No QMP timeouts detected'}
|
||||||
checks['oom_killer'] = {'status': 'OK', 'detail': 'No OOM events detected'}
|
checks['container_startup'] = {'status': 'OK', 'detail': 'No container startup errors'}
|
||||||
|
checks['vm_startup'] = {'status': 'OK', 'detail': 'No VM startup failures'}
|
||||||
|
checks['oom_killer'] = {'status': 'OK', 'detail': 'No OOM events detected'}
|
||||||
return {'status': 'OK', 'checks': checks}
|
return {'status': 'OK', 'checks': checks}
|
||||||
|
|
||||||
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
|
# Only consider non-dismissed items for overall severity
|
||||||
|
active_details = {k: v for k, v in vm_details.items() if not v.get('dismissed')}
|
||||||
|
has_critical = any(d.get('status') == 'CRITICAL' for d in active_details.values())
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'status': 'CRITICAL' if has_critical else 'WARNING',
|
'status': 'CRITICAL' if has_critical else 'WARNING',
|
||||||
@@ -2532,10 +2648,21 @@ class HealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Recalculate overall status considering dismissed items
|
# Recalculate overall status considering dismissed items
|
||||||
active_issues = [k for k, v in log_checks.items() if v['status'] in ('WARNING', 'CRITICAL')]
|
active_issues = {k: v for k, v in log_checks.items() if v['status'] in ('WARNING', 'CRITICAL')}
|
||||||
if not active_issues:
|
if not active_issues:
|
||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
|
else:
|
||||||
|
# Recalculate status and reason from only non-dismissed sub-checks
|
||||||
|
has_critical = any(v['status'] == 'CRITICAL' for v in active_issues.values())
|
||||||
|
status = 'CRITICAL' if has_critical else 'WARNING'
|
||||||
|
# Rebuild reason from active (non-dismissed) checks only
|
||||||
|
active_reasons = []
|
||||||
|
for k, v in active_issues.items():
|
||||||
|
detail = v.get('detail', '')
|
||||||
|
if detail:
|
||||||
|
active_reasons.append(detail)
|
||||||
|
reason = '; '.join(active_reasons[:3]) if active_reasons else None
|
||||||
|
|
||||||
log_result = {'status': status, 'checks': log_checks}
|
log_result = {'status': status, 'checks': log_checks}
|
||||||
if reason:
|
if reason:
|
||||||
|
|||||||
@@ -419,7 +419,7 @@ class HealthPersistence:
|
|||||||
for cat, prefix in [('updates', 'security_updates'), ('updates', 'system_age'),
|
for cat, prefix in [('updates', 'security_updates'), ('updates', 'system_age'),
|
||||||
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
|
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
|
||||||
('security', 'security_'),
|
('security', 'security_'),
|
||||||
('pve_services', 'pve_service_'), ('vms', 'vm_'), ('vms', 'ct_'),
|
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
|
||||||
('disks', 'disk_'), ('logs', 'log_'), ('network', 'net_'),
|
('disks', 'disk_'), ('logs', 'log_'), ('network', 'net_'),
|
||||||
('temperature', 'temp_')]:
|
('temperature', 'temp_')]:
|
||||||
if error_key == prefix or error_key.startswith(prefix):
|
if error_key == prefix or error_key.startswith(prefix):
|
||||||
@@ -488,6 +488,22 @@ class HealthPersistence:
|
|||||||
'suppression_hours': sup_hours
|
'suppression_hours': sup_hours
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Cascade acknowledge: when dismissing a group check
|
||||||
|
# (e.g. log_persistent_errors), also dismiss all individual
|
||||||
|
# sub-errors that share the same prefix in the DB.
|
||||||
|
# Currently only persistent errors have per-pattern sub-records
|
||||||
|
# (e.g. log_persistent_a1b2c3d4).
|
||||||
|
CASCADE_PREFIXES = {
|
||||||
|
'log_persistent_errors': 'log_persistent_',
|
||||||
|
}
|
||||||
|
child_prefix = CASCADE_PREFIXES.get(error_key)
|
||||||
|
if child_prefix:
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET acknowledged = 1, resolved_at = ?, suppression_hours = ?
|
||||||
|
WHERE error_key LIKE ? AND acknowledged = 0
|
||||||
|
''', (now, sup_hours, child_prefix + '%'))
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
'success': True,
|
'success': True,
|
||||||
'error_key': error_key,
|
'error_key': error_key,
|
||||||
|
|||||||
@@ -189,6 +189,7 @@ class JournalWatcher:
|
|||||||
self._check_system_shutdown(msg, syslog_id)
|
self._check_system_shutdown(msg, syslog_id)
|
||||||
self._check_permission_change(msg, syslog_id)
|
self._check_permission_change(msg, syslog_id)
|
||||||
self._check_firewall(msg, syslog_id)
|
self._check_firewall(msg, syslog_id)
|
||||||
|
self._check_backup_start(msg, syslog_id)
|
||||||
|
|
||||||
def _process_plain(self, line: str):
|
def _process_plain(self, line: str):
|
||||||
"""Fallback: process a plain text log line."""
|
"""Fallback: process a plain text log line."""
|
||||||
@@ -690,6 +691,103 @@ class JournalWatcher:
|
|||||||
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
def _check_backup_start(self, msg: str, syslog_id: str):
|
||||||
|
"""Detect backup job start from pvedaemon journal messages.
|
||||||
|
|
||||||
|
Matches: INFO: starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ...
|
||||||
|
|
||||||
|
PVE always emits this message from pvedaemon for BOTH scheduled and
|
||||||
|
manual backups. It contains the full guest list and all parameters.
|
||||||
|
The UPID "starting task" message is ignored because it arrives first
|
||||||
|
but lacks storage/mode/compression details.
|
||||||
|
"""
|
||||||
|
if syslog_id != 'pvedaemon':
|
||||||
|
return
|
||||||
|
|
||||||
|
match = re.match(r'INFO: starting new backup job: vzdump\s+(.*)', msg)
|
||||||
|
if not match:
|
||||||
|
return
|
||||||
|
|
||||||
|
raw_args = match.group(1)
|
||||||
|
|
||||||
|
# Parse the vzdump arguments from the log message
|
||||||
|
guests = []
|
||||||
|
storage = ''
|
||||||
|
mode = ''
|
||||||
|
compress = ''
|
||||||
|
|
||||||
|
args = raw_args.split()
|
||||||
|
i = 0
|
||||||
|
while i < len(args):
|
||||||
|
arg = args[i]
|
||||||
|
if arg.isdigit():
|
||||||
|
guests.append(arg)
|
||||||
|
elif arg == '--storage' and i + 1 < len(args):
|
||||||
|
storage = args[i + 1]
|
||||||
|
i += 1
|
||||||
|
elif arg == '--mode' and i + 1 < len(args):
|
||||||
|
mode = args[i + 1]
|
||||||
|
i += 1
|
||||||
|
elif arg == '--compress' and i + 1 < len(args):
|
||||||
|
compress = args[i + 1]
|
||||||
|
i += 1
|
||||||
|
elif arg == '--all' and i + 1 < len(args):
|
||||||
|
if args[i + 1] == '1':
|
||||||
|
guests = ['all']
|
||||||
|
i += 1
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# Build the notification body
|
||||||
|
reason_parts = []
|
||||||
|
|
||||||
|
if guests:
|
||||||
|
if guests == ['all']:
|
||||||
|
reason_parts.append('Guests: All VMs/CTs')
|
||||||
|
else:
|
||||||
|
guest_lines = []
|
||||||
|
for gid in guests:
|
||||||
|
gname = self._resolve_vm_name(gid)
|
||||||
|
if gname:
|
||||||
|
guest_lines.append(f' {gname} ({gid})')
|
||||||
|
else:
|
||||||
|
guest_lines.append(f' ID {gid}')
|
||||||
|
reason_parts.append('Guests:\n' + '\n'.join(guest_lines))
|
||||||
|
|
||||||
|
details = []
|
||||||
|
if storage:
|
||||||
|
details.append(f'Storage: {storage}')
|
||||||
|
if mode:
|
||||||
|
details.append(f'Mode: {mode}')
|
||||||
|
if compress:
|
||||||
|
details.append(f'Compression: {compress}')
|
||||||
|
if details:
|
||||||
|
reason_parts.append(' | '.join(details))
|
||||||
|
|
||||||
|
reason = '\n'.join(reason_parts) if reason_parts else 'Backup job started'
|
||||||
|
|
||||||
|
self._emit('backup_start', 'INFO', {
|
||||||
|
'vmid': ', '.join(guests),
|
||||||
|
'vmname': '',
|
||||||
|
'hostname': self._hostname,
|
||||||
|
'user': '',
|
||||||
|
'reason': reason,
|
||||||
|
}, entity='backup', entity_id='vzdump')
|
||||||
|
|
||||||
|
def _resolve_vm_name(self, vmid: str) -> str:
|
||||||
|
"""Try to resolve a VMID to its name from PVE config files."""
|
||||||
|
if not vmid or not vmid.isdigit():
|
||||||
|
return ''
|
||||||
|
for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']:
|
||||||
|
conf = f'{base}/{vmid}.conf'
|
||||||
|
try:
|
||||||
|
with open(conf, 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith('name:') or line.startswith('hostname:'):
|
||||||
|
return line.split(':', 1)[1].strip()
|
||||||
|
except (OSError, IOError):
|
||||||
|
pass
|
||||||
|
return ''
|
||||||
|
|
||||||
def _check_cluster_events(self, msg: str, syslog_id: str):
|
def _check_cluster_events(self, msg: str, syslog_id: str):
|
||||||
"""Detect cluster split-brain and node disconnect."""
|
"""Detect cluster split-brain and node disconnect."""
|
||||||
msg_lower = msg.lower()
|
msg_lower = msg.lower()
|
||||||
@@ -955,7 +1053,8 @@ class TaskWatcher:
|
|||||||
parts = line.strip().split(':')
|
parts = line.strip().split(':')
|
||||||
if len(parts) >= 3:
|
if len(parts) >= 3:
|
||||||
try:
|
try:
|
||||||
pid = int(parts[2])
|
# PID in UPID is HEXADECIMAL
|
||||||
|
pid = int(parts[2], 16)
|
||||||
os.kill(pid, 0) # Signal 0 = just check existence
|
os.kill(pid, 0) # Signal 0 = just check existence
|
||||||
self._vzdump_active_cache = now
|
self._vzdump_active_cache = now
|
||||||
return True
|
return True
|
||||||
@@ -999,24 +1098,22 @@ class TaskWatcher:
|
|||||||
time.sleep(2) # Check every 2 seconds
|
time.sleep(2) # Check every 2 seconds
|
||||||
|
|
||||||
def _check_active_tasks(self):
|
def _check_active_tasks(self):
|
||||||
"""Scan /var/log/pve/tasks/active for newly started vzdump tasks.
|
"""Scan /var/log/pve/tasks/active to track vzdump for VM suppression.
|
||||||
|
|
||||||
The 'active' file lists UPIDs of currently running PVE tasks.
|
This does NOT emit backup_start notifications (the JournalWatcher
|
||||||
Format: UPID:node:pid:pstart:starttime:type:id:user:
|
handles that via the 'starting new backup job' log message, which
|
||||||
|
contains the full guest list and parameters).
|
||||||
|
|
||||||
For multi-VM backups (`vzdump 100 101`), PVE creates:
|
This only keeps _vzdump_running_since updated so that
|
||||||
- A main UPID with EMPTY vmid: UPID:amd:PID:...:vzdump::root@pam:
|
_is_vzdump_active() can suppress VM start/stop notifications
|
||||||
- Per-VM sub-UPIDs: UPID:amd:PID:...:vzdump:101:root@pam:
|
during backup operations.
|
||||||
|
|
||||||
We only emit backup_start for the MAIN job (empty vmid) and read
|
|
||||||
/proc/PID/cmdline to discover which VMs are being backed up.
|
|
||||||
Per-VM sub-UPIDs are tracked but don't trigger a notification.
|
|
||||||
"""
|
"""
|
||||||
if not os.path.exists(self.TASK_ACTIVE):
|
if not os.path.exists(self.TASK_ACTIVE):
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
current_upids = set()
|
current_upids = set()
|
||||||
|
found_vzdump = False
|
||||||
with open(self.TASK_ACTIVE, 'r') as f:
|
with open(self.TASK_ACTIVE, 'r') as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
@@ -1025,133 +1122,22 @@ class TaskWatcher:
|
|||||||
upid = line.split()[0] if line.split() else line
|
upid = line.split()[0] if line.split() else line
|
||||||
current_upids.add(upid)
|
current_upids.add(upid)
|
||||||
|
|
||||||
if ':vzdump:' not in upid:
|
if ':vzdump:' in upid:
|
||||||
continue
|
found_vzdump = True
|
||||||
|
|
||||||
if upid in self._seen_active_upids:
|
# Keep _vzdump_running_since fresh as long as vzdump is in active
|
||||||
continue
|
if found_vzdump:
|
||||||
|
self._vzdump_running_since = time.time()
|
||||||
self._seen_active_upids.add(upid)
|
|
||||||
|
|
||||||
# Parse UPID: UPID:node:pid:pstart:starttime:type:id:user:
|
|
||||||
upid_parts = upid.split(':')
|
|
||||||
if len(upid_parts) < 8:
|
|
||||||
continue
|
|
||||||
|
|
||||||
vmid = upid_parts[6] # Empty for main job, vmid for sub-task
|
|
||||||
user = upid_parts[7]
|
|
||||||
pid_hex = upid_parts[2]
|
|
||||||
|
|
||||||
# Track vzdump internally for VM suppression
|
|
||||||
self._vzdump_running_since = time.time()
|
|
||||||
|
|
||||||
# Only emit notification for the MAIN job (empty vmid).
|
|
||||||
# Per-VM sub-tasks are tracked for suppression but don't notify
|
|
||||||
# (otherwise you'd get N+1 notifications for N VMs).
|
|
||||||
if vmid:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Read /proc/PID/cmdline to discover which VMs and settings
|
|
||||||
backup_info = self._parse_vzdump_cmdline(pid_hex)
|
|
||||||
|
|
||||||
# Build the notification body
|
|
||||||
reason_parts = []
|
|
||||||
|
|
||||||
if backup_info.get('guests'):
|
|
||||||
guest_lines = []
|
|
||||||
for gid in backup_info['guests']:
|
|
||||||
gname = self._get_vm_name(gid)
|
|
||||||
if gname:
|
|
||||||
guest_lines.append(f' {gname} ({gid})')
|
|
||||||
else:
|
|
||||||
guest_lines.append(f' ID {gid}')
|
|
||||||
reason_parts.append('Guests:\n' + '\n'.join(guest_lines))
|
|
||||||
|
|
||||||
details = []
|
|
||||||
if backup_info.get('storage'):
|
|
||||||
details.append(f'Storage: {backup_info["storage"]}')
|
|
||||||
if backup_info.get('mode'):
|
|
||||||
details.append(f'Mode: {backup_info["mode"]}')
|
|
||||||
if backup_info.get('compress'):
|
|
||||||
details.append(f'Compression: {backup_info["compress"]}')
|
|
||||||
if details:
|
|
||||||
reason_parts.append(' | '.join(details))
|
|
||||||
|
|
||||||
if not reason_parts:
|
|
||||||
reason_parts.append('Backup job started')
|
|
||||||
|
|
||||||
reason = '\n'.join(reason_parts)
|
|
||||||
|
|
||||||
data = {
|
|
||||||
'vmid': ', '.join(backup_info.get('guests', [])),
|
|
||||||
'vmname': '',
|
|
||||||
'hostname': self._hostname,
|
|
||||||
'user': user,
|
|
||||||
'reason': reason,
|
|
||||||
'target_node': '',
|
|
||||||
'size': '',
|
|
||||||
'snapshot_name': '',
|
|
||||||
}
|
|
||||||
|
|
||||||
self._queue.put(NotificationEvent(
|
|
||||||
'backup_start', 'INFO', data,
|
|
||||||
source='tasks',
|
|
||||||
entity='backup',
|
|
||||||
entity_id='vzdump',
|
|
||||||
))
|
|
||||||
|
|
||||||
# Cleanup stale UPIDs
|
# Cleanup stale UPIDs
|
||||||
stale = self._seen_active_upids - current_upids
|
stale = self._seen_active_upids - current_upids
|
||||||
self._seen_active_upids -= stale
|
self._seen_active_upids -= stale
|
||||||
|
# Track new ones
|
||||||
|
self._seen_active_upids |= current_upids
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[TaskWatcher] Error reading active tasks: {e}")
|
print(f"[TaskWatcher] Error reading active tasks: {e}")
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _parse_vzdump_cmdline(pid_hex: str) -> dict:
|
|
||||||
"""Read /proc/PID/cmdline to extract vzdump parameters.
|
|
||||||
|
|
||||||
Returns dict with keys: guests (list), storage, mode, compress, all.
|
|
||||||
"""
|
|
||||||
info: dict = {'guests': [], 'storage': '', 'mode': '', 'compress': ''}
|
|
||||||
try:
|
|
||||||
pid = int(pid_hex, 16)
|
|
||||||
cmdline_path = f'/proc/{pid}/cmdline'
|
|
||||||
if not os.path.exists(cmdline_path):
|
|
||||||
return info
|
|
||||||
|
|
||||||
with open(cmdline_path, 'rb') as f:
|
|
||||||
raw = f.read()
|
|
||||||
|
|
||||||
# cmdline is null-byte separated
|
|
||||||
args = raw.decode('utf-8', errors='replace').split('\0')
|
|
||||||
args = [a for a in args if a] # remove empty
|
|
||||||
|
|
||||||
# Parse: vzdump VMID1 VMID2 --storage local --mode stop ...
|
|
||||||
i = 0
|
|
||||||
while i < len(args):
|
|
||||||
arg = args[i]
|
|
||||||
if arg.isdigit():
|
|
||||||
info['guests'].append(arg)
|
|
||||||
elif arg == '--storage' and i + 1 < len(args):
|
|
||||||
info['storage'] = args[i + 1]
|
|
||||||
i += 1
|
|
||||||
elif arg == '--mode' and i + 1 < len(args):
|
|
||||||
info['mode'] = args[i + 1]
|
|
||||||
i += 1
|
|
||||||
elif arg == '--compress' and i + 1 < len(args):
|
|
||||||
info['compress'] = args[i + 1]
|
|
||||||
i += 1
|
|
||||||
elif arg == '--all' and i + 1 < len(args):
|
|
||||||
if args[i + 1] == '1':
|
|
||||||
info['guests'] = ['all']
|
|
||||||
i += 1
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
return info
|
|
||||||
except Exception:
|
|
||||||
return info
|
|
||||||
|
|
||||||
def _process_task_line(self, line: str):
|
def _process_task_line(self, line: str):
|
||||||
"""Process a single task index line.
|
"""Process a single task index line.
|
||||||
|
|
||||||
@@ -1431,8 +1417,21 @@ class PollingCollector:
|
|||||||
if severity in ('INFO', 'OK'):
|
if severity in ('INFO', 'OK'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip dismissed/acknowledged errors -- the user already handled these
|
||||||
|
if error.get('acknowledged') == 1:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# On first poll, seed _last_notified for all existing errors so we
|
||||||
|
# don't re-notify old persistent errors that were already sent before
|
||||||
|
# a service restart. Only genuinely NEW errors (appearing after the
|
||||||
|
# first poll) will trigger immediate notifications.
|
||||||
|
if not self._first_poll_done:
|
||||||
|
if error_key not in self._last_notified:
|
||||||
|
self._last_notified[error_key] = now
|
||||||
|
continue
|
||||||
|
|
||||||
# Determine if we should notify
|
# Determine if we should notify
|
||||||
is_new = error_key not in self._known_errors and self._first_poll_done
|
is_new = error_key not in self._known_errors
|
||||||
last_sent = self._last_notified.get(error_key, 0)
|
last_sent = self._last_notified.get(error_key, 0)
|
||||||
is_due = (now - last_sent) >= self.DIGEST_INTERVAL
|
is_due = (now - last_sent) >= self.DIGEST_INTERVAL
|
||||||
|
|
||||||
|
|||||||
@@ -651,7 +651,8 @@ class NotificationManager:
|
|||||||
parts = line.strip().split(':')
|
parts = line.strip().split(':')
|
||||||
if len(parts) >= 3:
|
if len(parts) >= 3:
|
||||||
try:
|
try:
|
||||||
pid = int(parts[2])
|
# PID in UPID is HEXADECIMAL
|
||||||
|
pid = int(parts[2], 16)
|
||||||
os.kill(pid, 0)
|
os.kill(pid, 0)
|
||||||
return True
|
return True
|
||||||
except (ValueError, ProcessLookupError, PermissionError):
|
except (ValueError, ProcessLookupError, PermissionError):
|
||||||
|
|||||||
Reference in New Issue
Block a user