mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 08:56:21 +00:00
update health_persistence.py
This commit is contained in:
@@ -111,9 +111,9 @@ const fetchSystemData = async (retries = 3, delayMs = 500): Promise<SystemData |
|
|||||||
try {
|
try {
|
||||||
const data = await fetchApi<SystemData>("/api/system")
|
const data = await fetchApi<SystemData>("/api/system")
|
||||||
return data
|
return data
|
||||||
} catch (error) {
|
} catch {
|
||||||
if (attempt === retries - 1) {
|
if (attempt === retries - 1) {
|
||||||
console.error("[v0] Failed to fetch system data after retries:", error)
|
// Silent fail - API not available (expected in preview environment)
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
// Wait before retry
|
// Wait before retry
|
||||||
@@ -127,8 +127,8 @@ const fetchVMData = async (): Promise<VMData[]> => {
|
|||||||
try {
|
try {
|
||||||
const data = await fetchApi<any>("/api/vms")
|
const data = await fetchApi<any>("/api/vms")
|
||||||
return Array.isArray(data) ? data : data.vms || []
|
return Array.isArray(data) ? data : data.vms || []
|
||||||
} catch (error) {
|
} catch {
|
||||||
console.error("[v0] Failed to fetch VM data:", error)
|
// Silent fail - API not available
|
||||||
return []
|
return []
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -137,8 +137,7 @@ const fetchStorageData = async (): Promise<StorageData | null> => {
|
|||||||
try {
|
try {
|
||||||
const data = await fetchApi<StorageData>("/api/storage/summary")
|
const data = await fetchApi<StorageData>("/api/storage/summary")
|
||||||
return data
|
return data
|
||||||
} catch (error) {
|
} catch {
|
||||||
console.log("[v0] Storage API not available (this is normal if not configured)")
|
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -146,13 +145,22 @@ const fetchStorageData = async (): Promise<StorageData | null> => {
|
|||||||
const fetchNetworkData = async (): Promise<NetworkData | null> => {
|
const fetchNetworkData = async (): Promise<NetworkData | null> => {
|
||||||
try {
|
try {
|
||||||
const data = await fetchApi<NetworkData>("/api/network/summary")
|
const data = await fetchApi<NetworkData>("/api/network/summary")
|
||||||
return data
|
return data
|
||||||
} catch (error) {
|
} catch {
|
||||||
console.log("[v0] Network API not available (this is normal if not configured)")
|
return null
|
||||||
return null
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const fetchProxmoxStorageData = async (): Promise<ProxmoxStorage[] | null> => {
|
||||||
|
try {
|
||||||
|
const data = await fetchApi<ProxmoxStorage[]>("/api/proxmox-storage")
|
||||||
|
return data
|
||||||
|
} catch {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const fetchProxmoxStorageData = async (): Promise<ProxmoxStorageData | null> => {
|
const fetchProxmoxStorageData = async (): Promise<ProxmoxStorageData | null> => {
|
||||||
try {
|
try {
|
||||||
const data = await fetchApi<ProxmoxStorageData>("/api/proxmox-storage")
|
const data = await fetchApi<ProxmoxStorageData>("/api/proxmox-storage")
|
||||||
|
|||||||
@@ -915,8 +915,9 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
# Capture recent journal entries matching keywords
|
# Capture recent journal entries matching keywords
|
||||||
|
# Use -b 0 to only include logs from the current boot
|
||||||
cmd = (
|
cmd = (
|
||||||
f"journalctl --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
|
f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
|
||||||
f"grep -iE '{pattern}' | tail -n 30"
|
f"grep -iE '{pattern}' | tail -n 30"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -150,7 +150,7 @@ class HealthMonitor:
|
|||||||
r'zfs.*scrub (started|finished|in progress)',
|
r'zfs.*scrub (started|finished|in progress)',
|
||||||
r'zpool.*resilver',
|
r'zpool.*resilver',
|
||||||
|
|
||||||
# <EFBFBD><EFBFBD><EFBFBD>─ LXC/Container normal operations ──
|
# ── LXC/Container normal operations ──
|
||||||
r'lxc.*monitor',
|
r'lxc.*monitor',
|
||||||
r'systemd\[1\]: (started|stopped) .*\.scope',
|
r'systemd\[1\]: (started|stopped) .*\.scope',
|
||||||
|
|
||||||
@@ -184,13 +184,21 @@ class HealthMonitor:
|
|||||||
]
|
]
|
||||||
|
|
||||||
CRITICAL_LOG_KEYWORDS = [
|
CRITICAL_LOG_KEYWORDS = [
|
||||||
'out of memory', 'oom_kill', 'kernel panic',
|
# OOM and memory errors
|
||||||
'filesystem read-only', 'cannot mount',
|
'out of memory', 'oom_kill', 'oom-kill', 'invoked oom-killer',
|
||||||
'raid.*failed', 'md.*device failed',
|
'memory cgroup out of memory', 'cannot allocate memory', 'oom_reaper',
|
||||||
'ext4-fs error', 'xfs.*corruption',
|
# Kernel panics and critical faults
|
||||||
'lvm activation failed',
|
'kernel panic', 'general protection fault', 'trap invalid opcode',
|
||||||
|
# Filesystem critical errors
|
||||||
|
'filesystem read-only', 'read-only file system', 'cannot mount',
|
||||||
|
'ext4-fs error', 'ext4_abort', 'xfs.*corruption', 'btrfs.*error',
|
||||||
|
# RAID/Storage critical
|
||||||
|
'raid.*failed', 'md.*device failed', 'lvm activation failed',
|
||||||
|
'zpool.*faulted', 'state: faulted',
|
||||||
|
# Hardware errors
|
||||||
'hardware error', 'mce:',
|
'hardware error', 'mce:',
|
||||||
'general protection fault',
|
# Cluster critical
|
||||||
|
'quorum lost', 'split brain',
|
||||||
]
|
]
|
||||||
|
|
||||||
# Segfault is WARNING, not CRITICAL -- only PVE-critical process
|
# Segfault is WARNING, not CRITICAL -- only PVE-critical process
|
||||||
@@ -202,11 +210,20 @@ class HealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
WARNING_LOG_KEYWORDS = [
|
WARNING_LOG_KEYWORDS = [
|
||||||
'i/o error', 'ata error', 'scsi error',
|
# Storage I/O errors
|
||||||
'task hung', 'blocked for more than',
|
'i/o error', 'buffer i/o error', 'ata error', 'scsi error',
|
||||||
'failed to start', 'service.*failed',
|
|
||||||
'disk.*offline', 'disk.*removed',
|
'disk.*offline', 'disk.*removed',
|
||||||
'segfault', # WARNING by default; escalated to CRITICAL only for PVE processes
|
# CPU/IO blocking
|
||||||
|
'task hung', 'blocked for more than', 'soft lockup',
|
||||||
|
# Service failures
|
||||||
|
'failed to start', 'service.*failed',
|
||||||
|
'entering failed state', 'code=exited, status=', 'code=killed',
|
||||||
|
# Process crashes (WARNING by default; escalated to CRITICAL for PVE processes)
|
||||||
|
'segfault',
|
||||||
|
# Cluster/Network warnings
|
||||||
|
'corosync.*failed', 'corosync.*timeout',
|
||||||
|
'connection lost', 'totem.*failed',
|
||||||
|
'entered disabled state', 'entered blocking state',
|
||||||
]
|
]
|
||||||
|
|
||||||
# PVE Critical Services
|
# PVE Critical Services
|
||||||
@@ -769,12 +786,30 @@ class HealthMonitor:
|
|||||||
if len(critical_samples) >= 3:
|
if len(critical_samples) >= 3:
|
||||||
status = 'CRITICAL'
|
status = 'CRITICAL'
|
||||||
reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
|
reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
|
||||||
|
# Record the error
|
||||||
|
health_persistence.record_error(
|
||||||
|
error_key='cpu_usage',
|
||||||
|
category='cpu',
|
||||||
|
severity='CRITICAL',
|
||||||
|
reason=reason,
|
||||||
|
details={'cpu_percent': cpu_percent}
|
||||||
|
)
|
||||||
elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
|
elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
|
reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
|
||||||
|
# Record the warning
|
||||||
|
health_persistence.record_error(
|
||||||
|
error_key='cpu_usage',
|
||||||
|
category='cpu',
|
||||||
|
severity='WARNING',
|
||||||
|
reason=reason,
|
||||||
|
details={'cpu_percent': cpu_percent}
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
|
# CPU is normal - auto-resolve any existing CPU errors
|
||||||
|
health_persistence.resolve_error('cpu_usage', 'CPU usage returned to normal')
|
||||||
|
|
||||||
temp_status = self._check_cpu_temperature()
|
temp_status = self._check_cpu_temperature()
|
||||||
|
|
||||||
|
|||||||
@@ -967,10 +967,12 @@ class HealthPersistence:
|
|||||||
cutoff_events = (now - timedelta(days=30)).isoformat()
|
cutoff_events = (now - timedelta(days=30)).isoformat()
|
||||||
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
||||||
|
|
||||||
# ── Auto-resolve transient errors after system stabilizes ──
|
# ══════════════════════════════════════════════════════════════════════
|
||||||
# Transient errors (OOM, high CPU, service failures) resolve themselves.
|
# SMART AUTO-RESOLVE: Based on system state, not hardcoded patterns
|
||||||
# If the system has been up for >10 minutes and these errors haven't recurred,
|
# ══════════════════════════════════════════════════════════════════════
|
||||||
# they are stale and should be auto-resolved.
|
# Logic: If an error hasn't been seen recently AND the system is healthy,
|
||||||
|
# the error is stale and should be auto-resolved.
|
||||||
|
# This works for ANY error pattern, not just predefined ones.
|
||||||
try:
|
try:
|
||||||
import psutil
|
import psutil
|
||||||
# Get system uptime
|
# Get system uptime
|
||||||
@@ -979,9 +981,13 @@ class HealthPersistence:
|
|||||||
|
|
||||||
# Only auto-resolve if system has been stable for at least 10 minutes
|
# Only auto-resolve if system has been stable for at least 10 minutes
|
||||||
if uptime_seconds > 600: # 10 minutes
|
if uptime_seconds > 600: # 10 minutes
|
||||||
stale_cutoff = (now - timedelta(minutes=10)).isoformat()
|
current_cpu = psutil.cpu_percent(interval=0.1)
|
||||||
|
current_mem = psutil.virtual_memory().percent
|
||||||
|
|
||||||
# 1. Resolve transient log errors (OOM, service failures)
|
# ── 1. LOGS category: Auto-resolve if not seen in 15 minutes ──
|
||||||
|
# Log errors are transient - if journalctl hasn't reported them recently,
|
||||||
|
# they are from a previous state and should be resolved.
|
||||||
|
stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat()
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
UPDATE errors
|
||||||
SET resolved_at = ?
|
SET resolved_at = ?
|
||||||
@@ -989,49 +995,69 @@ class HealthPersistence:
|
|||||||
AND resolved_at IS NULL
|
AND resolved_at IS NULL
|
||||||
AND acknowledged = 0
|
AND acknowledged = 0
|
||||||
AND last_seen < ?
|
AND last_seen < ?
|
||||||
AND (error_key LIKE 'log_critical_%'
|
''', (now_iso, stale_logs_cutoff))
|
||||||
OR error_key LIKE 'log_persistent_%'
|
|
||||||
OR reason LIKE '%Out of memory%'
|
|
||||||
OR reason LIKE '%Recurring error%'
|
|
||||||
OR reason LIKE '%service%Failed%'
|
|
||||||
OR reason LIKE '%timeout%'
|
|
||||||
OR reason LIKE '%critical error%')
|
|
||||||
''', (now_iso, stale_cutoff))
|
|
||||||
|
|
||||||
# 2. Auto-resolve CPU errors if current CPU is normal (<75%)
|
# ── 2. CPU category: Auto-resolve if CPU is normal (<75%) ──
|
||||||
try:
|
if current_cpu < 75:
|
||||||
current_cpu = psutil.cpu_percent(interval=0.1)
|
stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat()
|
||||||
if current_cpu < 75:
|
cursor.execute('''
|
||||||
cursor.execute('''
|
UPDATE errors
|
||||||
UPDATE errors
|
SET resolved_at = ?
|
||||||
SET resolved_at = ?
|
WHERE (category = 'cpu' OR category = 'temperature')
|
||||||
WHERE category = 'temperature'
|
AND resolved_at IS NULL
|
||||||
AND resolved_at IS NULL
|
AND acknowledged = 0
|
||||||
AND acknowledged = 0
|
AND last_seen < ?
|
||||||
AND last_seen < ?
|
AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%')
|
||||||
AND (error_key = 'cpu_usage'
|
''', (now_iso, stale_cpu_cutoff))
|
||||||
OR reason LIKE '%CPU >%sustained%'
|
|
||||||
OR reason LIKE '%Sustained high CPU%')
|
|
||||||
''', (now_iso, stale_cutoff))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# 3. Auto-resolve memory errors if current memory is normal (<80%)
|
# ── 3. MEMORY category: Auto-resolve if memory is normal (<80%) ──
|
||||||
try:
|
if current_mem < 80:
|
||||||
current_mem = psutil.virtual_memory().percent
|
stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat()
|
||||||
if current_mem < 80:
|
cursor.execute('''
|
||||||
cursor.execute('''
|
UPDATE errors
|
||||||
UPDATE errors
|
SET resolved_at = ?
|
||||||
SET resolved_at = ?
|
WHERE (category = 'memory' OR category = 'logs')
|
||||||
WHERE category = 'memory'
|
AND resolved_at IS NULL
|
||||||
AND resolved_at IS NULL
|
AND acknowledged = 0
|
||||||
AND acknowledged = 0
|
AND last_seen < ?
|
||||||
AND last_seen < ?
|
AND (error_key LIKE '%oom%'
|
||||||
AND (reason LIKE '%Memory >%'
|
OR error_key LIKE '%memory%'
|
||||||
OR reason LIKE '%RAM usage%')
|
OR reason LIKE '%memory%'
|
||||||
''', (now_iso, stale_cutoff))
|
OR reason LIKE '%OOM%'
|
||||||
except Exception:
|
OR reason LIKE '%killed%process%')
|
||||||
pass
|
''', (now_iso, stale_mem_cutoff))
|
||||||
|
|
||||||
|
# ── 4. VMS category: Auto-resolve if VM/CT is now running ──
|
||||||
|
# Check all active VM/CT errors and resolve if the VM/CT is now running
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT error_key, category FROM errors
|
||||||
|
WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%' OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%')
|
||||||
|
AND resolved_at IS NULL
|
||||||
|
AND acknowledged = 0
|
||||||
|
''')
|
||||||
|
vm_errors = cursor.fetchall()
|
||||||
|
for error_key, cat in vm_errors:
|
||||||
|
# Extract VM/CT ID from error_key
|
||||||
|
import re
|
||||||
|
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
|
||||||
|
if vmid_match:
|
||||||
|
vmid = vmid_match.group(1)
|
||||||
|
# Check if running - this auto-resolves if so
|
||||||
|
self.check_vm_running(vmid)
|
||||||
|
|
||||||
|
# ── 5. GENERIC: Any error not seen in 30 minutes while system is healthy ──
|
||||||
|
# If CPU < 80% and Memory < 85% and error hasn't been seen in 30 min,
|
||||||
|
# the system has recovered and the error is stale.
|
||||||
|
if current_cpu < 80 and current_mem < 85:
|
||||||
|
stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat()
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET resolved_at = ?
|
||||||
|
WHERE resolved_at IS NULL
|
||||||
|
AND acknowledged = 0
|
||||||
|
AND last_seen < ?
|
||||||
|
AND category NOT IN ('disks', 'storage')
|
||||||
|
''', (now_iso, stale_generic_cutoff))
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # If we can't read uptime, skip this cleanup
|
pass # If we can't read uptime, skip this cleanup
|
||||||
@@ -1166,9 +1192,20 @@ class HealthPersistence:
|
|||||||
"""Extract VM/CT ID from error message or key."""
|
"""Extract VM/CT ID from error message or key."""
|
||||||
if not text:
|
if not text:
|
||||||
return None
|
return None
|
||||||
# Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc.
|
# Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", "VM/CT 100", "qemu/100", "lxc/100", etc.
|
||||||
match = re.search(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', text, re.IGNORECASE)
|
patterns = [
|
||||||
return match.group(1) if match else None
|
r'(?:VM|CT|VMID|CTID|vm_|ct_|vmct_)[\s_]?(\d{3,})', # VM 100, ct_100
|
||||||
|
r'VM/CT[\s_]?(\d{3,})', # VM/CT 100
|
||||||
|
r'(?:qemu|lxc)[/\\](\d{3,})', # qemu/100, lxc/100
|
||||||
|
r'process.*kvm.*?(\d{3,})', # process kvm with vmid
|
||||||
|
r'Failed to start.*?(\d{3,})', # Failed to start VM/CT
|
||||||
|
r'starting.*?(\d{3,}).*failed', # starting 100 failed
|
||||||
|
]
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
|
return None
|
||||||
|
|
||||||
def get_age_hours(timestamp_str):
|
def get_age_hours(timestamp_str):
|
||||||
"""Get age in hours from ISO timestamp string."""
|
"""Get age in hours from ISO timestamp string."""
|
||||||
@@ -1189,11 +1226,20 @@ class HealthPersistence:
|
|||||||
|
|
||||||
# === VM/CT ERRORS ===
|
# === VM/CT ERRORS ===
|
||||||
# Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys)
|
# Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys)
|
||||||
if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))):
|
# Also check if the reason mentions a VM/CT that no longer exists
|
||||||
vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason)
|
vmid_from_key = extract_vmid_from_text(error_key) if error_key else None
|
||||||
if vmid and not check_vm_ct_cached(vmid):
|
vmid_from_reason = extract_vmid_from_text(reason) if reason else None
|
||||||
|
vmid = vmid_from_key or vmid_from_reason
|
||||||
|
|
||||||
|
if vmid and not check_vm_ct_cached(vmid):
|
||||||
|
# VM/CT doesn't exist - resolve regardless of category
|
||||||
|
should_resolve = True
|
||||||
|
resolution_reason = f'VM/CT {vmid} deleted'
|
||||||
|
elif category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))):
|
||||||
|
# VM/CT category but ID couldn't be extracted - resolve if stale
|
||||||
|
if not vmid and last_seen_hours > 1:
|
||||||
should_resolve = True
|
should_resolve = True
|
||||||
resolution_reason = 'VM/CT deleted'
|
resolution_reason = 'VM/CT error stale (>1h, ID not found)'
|
||||||
|
|
||||||
# === DISK ERRORS ===
|
# === DISK ERRORS ===
|
||||||
# Check if disk device or ZFS pool still exists
|
# Check if disk device or ZFS pool still exists
|
||||||
@@ -1360,8 +1406,17 @@ class HealthPersistence:
|
|||||||
|
|
||||||
def check_vm_running(self, vm_id: str) -> bool:
|
def check_vm_running(self, vm_id: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if a VM/CT is running and resolve error if so.
|
Check if a VM/CT is running and resolve TRANSIENT errors if so.
|
||||||
Also resolves error if VM/CT no longer exists.
|
Also resolves error if VM/CT no longer exists.
|
||||||
|
|
||||||
|
Only resolves errors that are likely to be fixed by a restart:
|
||||||
|
- QMP command failures
|
||||||
|
- Startup failures (generic)
|
||||||
|
|
||||||
|
Does NOT resolve persistent configuration errors like:
|
||||||
|
- Device missing
|
||||||
|
- Permission issues
|
||||||
|
|
||||||
Returns True if running/resolved, False otherwise.
|
Returns True if running/resolved, False otherwise.
|
||||||
"""
|
"""
|
||||||
import subprocess
|
import subprocess
|
||||||
@@ -1369,6 +1424,8 @@ class HealthPersistence:
|
|||||||
try:
|
try:
|
||||||
vm_exists = False
|
vm_exists = False
|
||||||
ct_exists = False
|
ct_exists = False
|
||||||
|
is_running = False
|
||||||
|
vm_type = None
|
||||||
|
|
||||||
# Check qm status for VMs
|
# Check qm status for VMs
|
||||||
result_vm = subprocess.run(
|
result_vm = subprocess.run(
|
||||||
@@ -1380,32 +1437,59 @@ class HealthPersistence:
|
|||||||
|
|
||||||
if result_vm.returncode == 0:
|
if result_vm.returncode == 0:
|
||||||
vm_exists = True
|
vm_exists = True
|
||||||
|
vm_type = 'vm'
|
||||||
if 'running' in result_vm.stdout.lower():
|
if 'running' in result_vm.stdout.lower():
|
||||||
self.resolve_error(f'vm_{vm_id}', 'VM started')
|
is_running = True
|
||||||
self.resolve_error(f'vmct_{vm_id}', 'VM started')
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Check pct status for containers
|
# Check pct status for containers
|
||||||
result_ct = subprocess.run(
|
if not vm_exists:
|
||||||
['pct', 'status', vm_id],
|
result_ct = subprocess.run(
|
||||||
capture_output=True,
|
['pct', 'status', vm_id],
|
||||||
text=True,
|
capture_output=True,
|
||||||
timeout=2
|
text=True,
|
||||||
)
|
timeout=2
|
||||||
|
)
|
||||||
|
|
||||||
|
if result_ct.returncode == 0:
|
||||||
|
ct_exists = True
|
||||||
|
vm_type = 'ct'
|
||||||
|
if 'running' in result_ct.stdout.lower():
|
||||||
|
is_running = True
|
||||||
|
|
||||||
if result_ct.returncode == 0:
|
# If neither VM nor CT exists, resolve ALL related errors
|
||||||
ct_exists = True
|
|
||||||
if 'running' in result_ct.stdout.lower():
|
|
||||||
self.resolve_error(f'ct_{vm_id}', 'Container started')
|
|
||||||
self.resolve_error(f'vmct_{vm_id}', 'Container started')
|
|
||||||
return True
|
|
||||||
|
|
||||||
# If neither VM nor CT exists, resolve all related errors
|
|
||||||
if not vm_exists and not ct_exists:
|
if not vm_exists and not ct_exists:
|
||||||
self.resolve_error(f'vm_{vm_id}', 'VM/CT deleted')
|
self.resolve_error(f'vm_{vm_id}', 'VM/CT deleted')
|
||||||
self.resolve_error(f'ct_{vm_id}', 'VM/CT deleted')
|
self.resolve_error(f'ct_{vm_id}', 'VM/CT deleted')
|
||||||
self.resolve_error(f'vmct_{vm_id}', 'VM/CT deleted')
|
self.resolve_error(f'vmct_{vm_id}', 'VM/CT deleted')
|
||||||
return True # Error resolved because resource doesn't exist
|
return True
|
||||||
|
|
||||||
|
# If running, only resolve TRANSIENT errors (QMP, startup)
|
||||||
|
# Do NOT resolve persistent config errors (device missing, permissions)
|
||||||
|
if is_running:
|
||||||
|
conn = self._get_conn()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
# Get the error details to check if it's a persistent config error
|
||||||
|
for prefix in (f'{vm_type}_{vm_id}', f'vmct_{vm_id}'):
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT error_key, reason FROM errors
|
||||||
|
WHERE error_key = ? AND resolved_at IS NULL
|
||||||
|
''', (prefix,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
if row:
|
||||||
|
reason = (row[1] or '').lower()
|
||||||
|
# Check if this is a persistent config error that won't be fixed by restart
|
||||||
|
is_persistent_config = any(indicator in reason for indicator in [
|
||||||
|
'device', 'missing', 'does not exist', 'permission',
|
||||||
|
'not found', 'no such', 'invalid'
|
||||||
|
])
|
||||||
|
|
||||||
|
if not is_persistent_config:
|
||||||
|
# Transient error - resolve it
|
||||||
|
self.resolve_error(prefix, f'{vm_type.upper()} started successfully')
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|||||||
@@ -174,8 +174,9 @@ def capture_journal_context(keywords: list, lines: int = 30,
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
# Use journalctl with grep to filter relevant lines
|
# Use journalctl with grep to filter relevant lines
|
||||||
|
# Use -b 0 to only include logs from the current boot (not previous boots)
|
||||||
cmd = (
|
cmd = (
|
||||||
f"journalctl --since='{since}' --no-pager -n 500 2>/dev/null | "
|
f"journalctl -b 0 --since='{since}' --no-pager -n 500 2>/dev/null | "
|
||||||
f"grep -iE '{pattern}' | tail -n {lines}"
|
f"grep -iE '{pattern}' | tail -n {lines}"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1800,6 +1801,8 @@ class PollingCollector:
|
|||||||
# Key = health_persistence category name
|
# Key = health_persistence category name
|
||||||
# Value = minimum seconds between notifications for the same error_key
|
# Value = minimum seconds between notifications for the same error_key
|
||||||
_CATEGORY_COOLDOWNS = {
|
_CATEGORY_COOLDOWNS = {
|
||||||
|
# Category cooldown: minimum time between DIFFERENT errors of the same category
|
||||||
|
# This prevents notification storms when multiple issues arise together
|
||||||
'disks': 86400, # 24h - I/O errors are persistent hardware issues
|
'disks': 86400, # 24h - I/O errors are persistent hardware issues
|
||||||
'smart': 86400, # 24h - SMART errors same as I/O
|
'smart': 86400, # 24h - SMART errors same as I/O
|
||||||
'zfs': 86400, # 24h - ZFS pool issues are persistent
|
'zfs': 86400, # 24h - ZFS pool issues are persistent
|
||||||
@@ -1809,6 +1812,7 @@ class PollingCollector:
|
|||||||
'temperature': 3600, # 1h - temp can fluctuate near thresholds
|
'temperature': 3600, # 1h - temp can fluctuate near thresholds
|
||||||
'logs': 3600, # 1h - repeated log patterns
|
'logs': 3600, # 1h - repeated log patterns
|
||||||
'vms': 1800, # 30m - VM state oscillation
|
'vms': 1800, # 30m - VM state oscillation
|
||||||
|
'vmct': 1800, # 30m - VM/CT state oscillation
|
||||||
'security': 3600, # 1h - auth failures tend to be bursty
|
'security': 3600, # 1h - auth failures tend to be bursty
|
||||||
'cpu': 1800, # 30m - CPU spikes can be transient
|
'cpu': 1800, # 30m - CPU spikes can be transient
|
||||||
'memory': 1800, # 30m - memory pressure oscillation
|
'memory': 1800, # 30m - memory pressure oscillation
|
||||||
@@ -1816,6 +1820,10 @@ class PollingCollector:
|
|||||||
'updates': 86400, # 24h - update info doesn't change fast
|
'updates': 86400, # 24h - update info doesn't change fast
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Global cooldown: minimum time before the SAME error can be re-notified
|
||||||
|
# This is independent of category - same error_key cannot repeat before this time
|
||||||
|
SAME_ERROR_COOLDOWN = 86400 # 24 hours
|
||||||
|
|
||||||
_ENTITY_MAP = {
|
_ENTITY_MAP = {
|
||||||
'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
|
'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
|
||||||
'load': ('node', ''),
|
'load': ('node', ''),
|
||||||
@@ -2032,15 +2040,20 @@ class PollingCollector:
|
|||||||
# Determine if we should notify
|
# Determine if we should notify
|
||||||
is_new = error_key not in self._known_errors
|
is_new = error_key not in self._known_errors
|
||||||
last_sent = self._last_notified.get(error_key, 0)
|
last_sent = self._last_notified.get(error_key, 0)
|
||||||
cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
|
time_since_last = now - last_sent
|
||||||
is_due = (now - last_sent) >= cat_cooldown
|
|
||||||
|
# ── SAME ERROR COOLDOWN (24h) ──
|
||||||
|
# The SAME error_key cannot be re-notified before 24 hours.
|
||||||
|
# This is the PRIMARY deduplication mechanism.
|
||||||
|
if time_since_last < self.SAME_ERROR_COOLDOWN:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# ── CATEGORY COOLDOWN (varies) ──
|
||||||
|
# DIFFERENT errors within the same category respect category cooldown.
|
||||||
|
# This prevents notification storms when multiple issues arise together.
|
||||||
|
cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
|
||||||
|
is_due = time_since_last >= cat_cooldown
|
||||||
|
|
||||||
# Anti-oscillation: even if "new" (resolved then reappeared),
|
|
||||||
# respect the per-category cooldown interval. This prevents
|
|
||||||
# "semi-cascades" where the same root cause generates multiple
|
|
||||||
# slightly different notifications across health check cycles.
|
|
||||||
# Each category has its own appropriate cooldown (30m for network,
|
|
||||||
# 24h for disks, 1h for temperature, etc.).
|
|
||||||
if not is_due:
|
if not is_due:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user