mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-05-01 11:56:21 +00:00
Update health_persistence.py
This commit is contained in:
@@ -695,124 +695,113 @@ class HealthPersistence:
|
|||||||
result = {'success': False, 'error_key': error_key}
|
result = {'success': False, 'error_key': error_key}
|
||||||
|
|
||||||
if not row:
|
if not row:
|
||||||
# Error not in DB yet -- create a minimal record so the dismiss persists.
|
# Error not in DB yet -- create a minimal record so the dismiss persists.
|
||||||
# Try to infer category from the error_key prefix.
|
# Try to infer category from the error_key prefix.
|
||||||
category = ''
|
category = ''
|
||||||
# Order matters: more specific prefixes MUST come before shorter ones
|
# Order matters: more specific prefixes MUST come before shorter ones
|
||||||
# e.g. 'security_updates' (updates) before 'security_' (security)
|
# e.g. 'security_updates' (updates) before 'security_' (security)
|
||||||
for cat, prefix in [('updates', 'security_updates'), ('updates', 'system_age'),
|
for cat, prefix in [('updates', 'security_updates'), ('updates', 'system_age'),
|
||||||
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
|
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
|
||||||
('security', 'security_'),
|
('security', 'security_'),
|
||||||
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
|
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
|
||||||
('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
|
('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
|
||||||
('logs', 'log_'), ('network', 'net_'),
|
('logs', 'log_'), ('network', 'net_'),
|
||||||
('temperature', 'temp_')]:
|
('temperature', 'temp_')]:
|
||||||
if error_key == prefix or error_key.startswith(prefix):
|
if error_key == prefix or error_key.startswith(prefix):
|
||||||
category = cat
|
category = cat
|
||||||
break
|
break
|
||||||
|
|
||||||
# Fallback: if no category matched, try to infer from common patterns
|
# Fallback: if no category matched, try to infer from common patterns
|
||||||
if not category:
|
if not category:
|
||||||
if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key:
|
if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key:
|
||||||
category = 'disks'
|
category = 'disks'
|
||||||
else:
|
else:
|
||||||
category = 'general' # Use 'general' as ultimate fallback instead of empty string
|
category = 'general'
|
||||||
|
|
||||||
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
||||||
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
if setting_key:
|
if setting_key:
|
||||||
# P4 fix: use _get_setting_impl with existing connection
|
stored = self._get_setting_impl(conn, setting_key)
|
||||||
stored = self._get_setting_impl(conn, setting_key)
|
if stored is not None:
|
||||||
if stored is not None:
|
try:
|
||||||
try:
|
sup_hours = int(stored)
|
||||||
sup_hours = int(stored)
|
except (ValueError, TypeError):
|
||||||
except (ValueError, TypeError):
|
pass
|
||||||
pass
|
|
||||||
|
# Insert as acknowledged but NOT resolved - error remains active
|
||||||
# Insert as acknowledged but NOT resolved - error remains active
|
cursor.execute('''
|
||||||
cursor.execute('''
|
INSERT INTO errors (error_key, category, severity, reason, first_seen, last_seen,
|
||||||
INSERT INTO errors (error_key, category, severity, reason, first_seen, last_seen,
|
occurrence_count, acknowledged, acknowledged_at, suppression_hours)
|
||||||
occurrence_count, acknowledged, acknowledged_at, suppression_hours)
|
VALUES (?, ?, 'WARNING', 'Dismissed by user', ?, ?, 1, 1, ?, ?)
|
||||||
VALUES (?, ?, 'WARNING', 'Dismissed by user', ?, ?, 1, 1, ?, ?)
|
''', (error_key, category, now, now, now, sup_hours))
|
||||||
''', (error_key, category, now, now, now, sup_hours))
|
|
||||||
|
self._record_event(cursor, 'acknowledged', error_key, {
|
||||||
self._record_event(cursor, 'acknowledged', error_key, {
|
'original_severity': 'WARNING',
|
||||||
'original_severity': 'WARNING',
|
'category': category,
|
||||||
'category': category,
|
'suppression_hours': sup_hours
|
||||||
'suppression_hours': sup_hours
|
})
|
||||||
})
|
|
||||||
|
result = {
|
||||||
result = {
|
'success': True,
|
||||||
'success': True,
|
'error_key': error_key,
|
||||||
'error_key': error_key,
|
'original_severity': 'WARNING',
|
||||||
'original_severity': 'WARNING',
|
'category': category,
|
||||||
'category': category,
|
'suppression_hours': sup_hours,
|
||||||
'suppression_hours': sup_hours,
|
'acknowledged_at': now
|
||||||
'acknowledged_at': now
|
}
|
||||||
}
|
conn.commit()
|
||||||
conn.commit()
|
return result
|
||||||
return result
|
|
||||||
|
if row:
|
||||||
if row:
|
error_dict = dict(row)
|
||||||
error_dict = dict(row)
|
original_severity = error_dict.get('severity', 'WARNING')
|
||||||
original_severity = error_dict.get('severity', 'WARNING')
|
category = error_dict.get('category', '')
|
||||||
category = error_dict.get('category', '')
|
|
||||||
|
# Look up the user's configured suppression for this category
|
||||||
# Look up the user's configured suppression for this category
|
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
||||||
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
if setting_key:
|
||||||
if setting_key:
|
stored = self._get_setting_impl(conn, setting_key)
|
||||||
# P4 fix: use _get_setting_impl with existing connection
|
if stored is not None:
|
||||||
stored = self._get_setting_impl(conn, setting_key)
|
try:
|
||||||
if stored is not None:
|
sup_hours = int(stored)
|
||||||
try:
|
except (ValueError, TypeError):
|
||||||
sup_hours = int(stored)
|
pass
|
||||||
except (ValueError, TypeError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Mark as acknowledged but DO NOT set resolved_at
|
|
||||||
# The error remains active until it actually disappears from the system
|
|
||||||
# resolved_at should only be set when the error is truly resolved
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE errors
|
|
||||||
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
|
|
||||||
WHERE error_key = ?
|
|
||||||
''', (now, sup_hours, error_key))
|
|
||||||
|
|
||||||
self._record_event(cursor, 'acknowledged', error_key, {
|
|
||||||
'original_severity': original_severity,
|
|
||||||
'category': category,
|
|
||||||
'suppression_hours': sup_hours
|
|
||||||
})
|
|
||||||
|
|
||||||
# Cascade acknowledge: when dismissing a group check
|
|
||||||
# (e.g. log_persistent_errors), also dismiss all individual
|
|
||||||
# sub-errors that share the same prefix in the DB.
|
|
||||||
# Currently only persistent errors have per-pattern sub-records
|
|
||||||
# (e.g. log_persistent_a1b2c3d4).
|
|
||||||
CASCADE_PREFIXES = {
|
|
||||||
'log_persistent_errors': 'log_persistent_',
|
|
||||||
}
|
|
||||||
child_prefix = CASCADE_PREFIXES.get(error_key)
|
|
||||||
if child_prefix:
|
|
||||||
# Only cascade to active (unresolved) child errors.
|
|
||||||
# Already-resolved/expired entries must NOT be re-surfaced.
|
|
||||||
# Mark as acknowledged but DO NOT set resolved_at
|
# Mark as acknowledged but DO NOT set resolved_at
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
UPDATE errors
|
||||||
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
|
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
|
||||||
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
|
WHERE error_key = ?
|
||||||
''', (now, sup_hours, child_prefix + '%'))
|
''', (now, sup_hours, error_key))
|
||||||
|
|
||||||
result = {
|
self._record_event(cursor, 'acknowledged', error_key, {
|
||||||
'success': True,
|
'original_severity': original_severity,
|
||||||
'error_key': error_key,
|
'category': category,
|
||||||
'original_severity': original_severity,
|
'suppression_hours': sup_hours
|
||||||
'category': category,
|
})
|
||||||
'acknowledged_at': now,
|
|
||||||
'suppression_hours': sup_hours
|
# Cascade acknowledge: when dismissing a group check
|
||||||
}
|
CASCADE_PREFIXES = {
|
||||||
|
'log_persistent_errors': 'log_persistent_',
|
||||||
|
}
|
||||||
|
child_prefix = CASCADE_PREFIXES.get(error_key)
|
||||||
|
if child_prefix:
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
|
||||||
|
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
|
||||||
|
''', (now, sup_hours, child_prefix + '%'))
|
||||||
|
|
||||||
|
result = {
|
||||||
|
'success': True,
|
||||||
|
'error_key': error_key,
|
||||||
|
'original_severity': original_severity,
|
||||||
|
'category': category,
|
||||||
|
'acknowledged_at': now,
|
||||||
|
'suppression_hours': sup_hours
|
||||||
|
}
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
@@ -935,199 +924,161 @@ class HealthPersistence:
|
|||||||
now_iso = now.isoformat()
|
now_iso = now.isoformat()
|
||||||
|
|
||||||
# Delete resolved errors older than 7 days
|
# Delete resolved errors older than 7 days
|
||||||
cutoff_resolved = (now - timedelta(days=7)).isoformat()
|
cutoff_resolved = (now - timedelta(days=7)).isoformat()
|
||||||
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
|
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
|
||||||
|
|
||||||
# ── Auto-resolve stale errors using Suppression Duration settings ──
|
# ── Auto-resolve stale errors using Suppression Duration settings ──
|
||||||
# Read per-category suppression hours from user_settings.
|
user_settings = {}
|
||||||
# If the user hasn't configured a value, use DEFAULT_SUPPRESSION_HOURS.
|
|
||||||
# This is the SINGLE source of truth for auto-resolution timing.
|
|
||||||
user_settings = {}
|
|
||||||
try:
|
|
||||||
cursor.execute(
|
|
||||||
'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
|
|
||||||
('suppress_%',)
|
|
||||||
)
|
|
||||||
for row in cursor.fetchall():
|
|
||||||
user_settings[row[0]] = row[1]
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
for category, setting_key in self.CATEGORY_SETTING_MAP.items():
|
|
||||||
stored = user_settings.get(setting_key)
|
|
||||||
try:
|
try:
|
||||||
hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS
|
cursor.execute(
|
||||||
except (ValueError, TypeError):
|
'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
|
||||||
hours = self.DEFAULT_SUPPRESSION_HOURS
|
('suppress_%',)
|
||||||
|
)
|
||||||
# -1 means permanently suppressed -- skip auto-resolve
|
for row in cursor.fetchall():
|
||||||
if hours < 0:
|
user_settings[row[0]] = row[1]
|
||||||
continue
|
except Exception:
|
||||||
|
pass
|
||||||
cutoff = (now - timedelta(hours=hours)).isoformat()
|
|
||||||
cursor.execute('''
|
for category, setting_key in self.CATEGORY_SETTING_MAP.items():
|
||||||
UPDATE errors
|
stored = user_settings.get(setting_key)
|
||||||
SET resolved_at = ?
|
try:
|
||||||
WHERE category = ?
|
hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS
|
||||||
AND resolved_at IS NULL
|
except (ValueError, TypeError):
|
||||||
AND last_seen < ?
|
hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
AND acknowledged = 0
|
|
||||||
''', (now_iso, category, cutoff))
|
if hours < 0:
|
||||||
|
continue
|
||||||
# Catch-all: auto-resolve any error from an unmapped category
|
|
||||||
# whose last_seen exceeds DEFAULT_SUPPRESSION_HOURS.
|
cutoff = (now - timedelta(hours=hours)).isoformat()
|
||||||
fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat()
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE errors
|
|
||||||
SET resolved_at = ?
|
|
||||||
WHERE resolved_at IS NULL
|
|
||||||
AND acknowledged = 0
|
|
||||||
AND last_seen < ?
|
|
||||||
''', (now_iso, fallback_cutoff))
|
|
||||||
|
|
||||||
# Delete old events (>30 days)
|
|
||||||
cutoff_events = (now - timedelta(days=30)).isoformat()
|
|
||||||
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
|
||||||
|
|
||||||
# ══════════════════════════════════════════════════════════════════════
|
|
||||||
# SMART AUTO-RESOLVE: Based on system state, not hardcoded patterns
|
|
||||||
# ══════════════════════════════════════════════════════════════════════
|
|
||||||
# Logic: If an error hasn't been seen recently AND the system is healthy,
|
|
||||||
# the error is stale and should be auto-resolved.
|
|
||||||
# This works for ANY error pattern, not just predefined ones.
|
|
||||||
try:
|
|
||||||
import psutil
|
|
||||||
# Get system uptime
|
|
||||||
with open('/proc/uptime', 'r') as f:
|
|
||||||
uptime_seconds = float(f.read().split()[0])
|
|
||||||
|
|
||||||
# Only auto-resolve if system has been stable for at least 10 minutes
|
|
||||||
if uptime_seconds > 600: # 10 minutes
|
|
||||||
current_cpu = psutil.cpu_percent(interval=0.1)
|
|
||||||
current_mem = psutil.virtual_memory().percent
|
|
||||||
|
|
||||||
# ── 1. LOGS category: Auto-resolve if not seen in 15 minutes ──
|
|
||||||
# Log errors are transient - if journalctl hasn't reported them recently,
|
|
||||||
# they are from a previous state and should be resolved.
|
|
||||||
stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat()
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
UPDATE errors
|
||||||
SET resolved_at = ?
|
SET resolved_at = ?
|
||||||
WHERE category = 'logs'
|
WHERE category = ?
|
||||||
AND resolved_at IS NULL
|
AND resolved_at IS NULL
|
||||||
AND acknowledged = 0
|
|
||||||
AND last_seen < ?
|
AND last_seen < ?
|
||||||
''', (now_iso, stale_logs_cutoff))
|
|
||||||
|
|
||||||
# ── 2. CPU category: Auto-resolve if CPU is normal (<75%) ──
|
|
||||||
if current_cpu < 75:
|
|
||||||
stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat()
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE errors
|
|
||||||
SET resolved_at = ?
|
|
||||||
WHERE (category = 'cpu' OR category = 'temperature')
|
|
||||||
AND resolved_at IS NULL
|
|
||||||
AND acknowledged = 0
|
|
||||||
AND last_seen < ?
|
|
||||||
AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%')
|
|
||||||
''', (now_iso, stale_cpu_cutoff))
|
|
||||||
|
|
||||||
# ── 3. MEMORY category: Auto-resolve if memory is normal (<80%) ──
|
|
||||||
if current_mem < 80:
|
|
||||||
stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat()
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE errors
|
|
||||||
SET resolved_at = ?
|
|
||||||
WHERE (category = 'memory' OR category = 'logs')
|
|
||||||
AND resolved_at IS NULL
|
|
||||||
AND acknowledged = 0
|
|
||||||
AND last_seen < ?
|
|
||||||
AND (error_key LIKE '%oom%'
|
|
||||||
OR error_key LIKE '%memory%'
|
|
||||||
OR reason LIKE '%memory%'
|
|
||||||
OR reason LIKE '%OOM%'
|
|
||||||
OR reason LIKE '%killed%process%')
|
|
||||||
''', (now_iso, stale_mem_cutoff))
|
|
||||||
|
|
||||||
# ── 4. VMS category: Auto-resolve if VM/CT is now running or deleted ──
|
|
||||||
# Check all active VM/CT errors and resolve if the VM/CT is now running
|
|
||||||
# NOTE: We do this inline to avoid deadlock (check_vm_running uses _db_lock)
|
|
||||||
cursor.execute('''
|
|
||||||
SELECT error_key, category, reason FROM errors
|
|
||||||
WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%' OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%')
|
|
||||||
AND resolved_at IS NULL
|
|
||||||
AND acknowledged = 0
|
AND acknowledged = 0
|
||||||
''')
|
''', (now_iso, category, cutoff))
|
||||||
vm_errors = cursor.fetchall()
|
|
||||||
for error_key, cat, reason in vm_errors:
|
# Catch-all: auto-resolve any error from an unmapped category
|
||||||
# Extract VM/CT ID from error_key
|
fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat()
|
||||||
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
|
cursor.execute('''
|
||||||
if vmid_match:
|
UPDATE errors
|
||||||
vmid = vmid_match.group(1)
|
SET resolved_at = ?
|
||||||
try:
|
WHERE resolved_at IS NULL
|
||||||
# Check if VM/CT exists and is running
|
AND acknowledged = 0
|
||||||
vm_running = False
|
AND last_seen < ?
|
||||||
ct_running = False
|
''', (now_iso, fallback_cutoff))
|
||||||
vm_exists = False
|
|
||||||
ct_exists = False
|
# Delete old events (>30 days)
|
||||||
|
cutoff_events = (now - timedelta(days=30)).isoformat()
|
||||||
# Check VM
|
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
||||||
result_vm = subprocess.run(
|
|
||||||
['qm', 'status', vmid],
|
# ── SMART AUTO-RESOLVE: Based on system state ──
|
||||||
capture_output=True, text=True, timeout=2
|
try:
|
||||||
)
|
import psutil
|
||||||
if result_vm.returncode == 0:
|
with open('/proc/uptime', 'r') as f:
|
||||||
vm_exists = True
|
uptime_seconds = float(f.read().split()[0])
|
||||||
vm_running = 'running' in result_vm.stdout.lower()
|
|
||||||
|
if uptime_seconds > 600:
|
||||||
# Check CT
|
current_cpu = psutil.cpu_percent(interval=0.1)
|
||||||
if not vm_exists:
|
current_mem = psutil.virtual_memory().percent
|
||||||
result_ct = subprocess.run(
|
|
||||||
['pct', 'status', vmid],
|
# 1. LOGS: Auto-resolve if not seen in 15 minutes
|
||||||
capture_output=True, text=True, timeout=2
|
stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat()
|
||||||
)
|
cursor.execute('''
|
||||||
if result_ct.returncode == 0:
|
UPDATE errors SET resolved_at = ?
|
||||||
ct_exists = True
|
WHERE category = 'logs' AND resolved_at IS NULL
|
||||||
ct_running = 'running' in result_ct.stdout.lower()
|
AND acknowledged = 0 AND last_seen < ?
|
||||||
|
''', (now_iso, stale_logs_cutoff))
|
||||||
# Resolve if deleted
|
|
||||||
if not vm_exists and not ct_exists:
|
# 2. CPU: Auto-resolve if CPU is normal (<75%)
|
||||||
cursor.execute('''
|
if current_cpu < 75:
|
||||||
UPDATE errors SET resolved_at = ?
|
stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat()
|
||||||
WHERE error_key = ? AND resolved_at IS NULL
|
cursor.execute('''
|
||||||
''', (now_iso, error_key))
|
UPDATE errors SET resolved_at = ?
|
||||||
# Resolve transient errors if running (not persistent config errors)
|
WHERE (category = 'cpu' OR category = 'temperature')
|
||||||
elif vm_running or ct_running:
|
AND resolved_at IS NULL AND acknowledged = 0
|
||||||
reason_lower = (reason or '').lower()
|
AND last_seen < ?
|
||||||
is_persistent = any(x in reason_lower for x in [
|
AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%')
|
||||||
'device', 'missing', 'does not exist', 'permission',
|
''', (now_iso, stale_cpu_cutoff))
|
||||||
'not found', 'no such', 'invalid'
|
|
||||||
])
|
# 3. MEMORY: Auto-resolve if memory is normal (<80%)
|
||||||
if not is_persistent:
|
if current_mem < 80:
|
||||||
|
stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat()
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors SET resolved_at = ?
|
||||||
|
WHERE (category = 'memory' OR category = 'logs')
|
||||||
|
AND resolved_at IS NULL AND acknowledged = 0
|
||||||
|
AND last_seen < ?
|
||||||
|
AND (error_key LIKE '%oom%' OR error_key LIKE '%memory%'
|
||||||
|
OR reason LIKE '%memory%' OR reason LIKE '%OOM%'
|
||||||
|
OR reason LIKE '%killed%process%')
|
||||||
|
''', (now_iso, stale_mem_cutoff))
|
||||||
|
|
||||||
|
# 4. VMS: Auto-resolve if VM/CT is now running or deleted
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT error_key, category, reason FROM errors
|
||||||
|
WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%'
|
||||||
|
OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%')
|
||||||
|
AND resolved_at IS NULL AND acknowledged = 0
|
||||||
|
''')
|
||||||
|
vm_errors = cursor.fetchall()
|
||||||
|
for vm_ek, cat, vm_reason in vm_errors:
|
||||||
|
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', vm_ek)
|
||||||
|
if vmid_match:
|
||||||
|
vmid = vmid_match.group(1)
|
||||||
|
try:
|
||||||
|
vm_running = False
|
||||||
|
ct_running = False
|
||||||
|
vm_exists = False
|
||||||
|
ct_exists = False
|
||||||
|
|
||||||
|
result_vm = subprocess.run(
|
||||||
|
['qm', 'status', vmid],
|
||||||
|
capture_output=True, text=True, timeout=2)
|
||||||
|
if result_vm.returncode == 0:
|
||||||
|
vm_exists = True
|
||||||
|
vm_running = 'running' in result_vm.stdout.lower()
|
||||||
|
|
||||||
|
if not vm_exists:
|
||||||
|
result_ct = subprocess.run(
|
||||||
|
['pct', 'status', vmid],
|
||||||
|
capture_output=True, text=True, timeout=2)
|
||||||
|
if result_ct.returncode == 0:
|
||||||
|
ct_exists = True
|
||||||
|
ct_running = 'running' in result_ct.stdout.lower()
|
||||||
|
|
||||||
|
if not vm_exists and not ct_exists:
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors SET resolved_at = ?
|
UPDATE errors SET resolved_at = ?
|
||||||
WHERE error_key = ? AND resolved_at IS NULL
|
WHERE error_key = ? AND resolved_at IS NULL
|
||||||
''', (now_iso, error_key))
|
''', (now_iso, vm_ek))
|
||||||
except Exception:
|
elif vm_running or ct_running:
|
||||||
pass # Skip this VM/CT if check fails
|
reason_lower = (vm_reason or '').lower()
|
||||||
|
is_persistent = any(x in reason_lower for x in [
|
||||||
# ── 5. GENERIC: Any error not seen in 30 minutes while system is healthy ──
|
'device', 'missing', 'does not exist', 'permission',
|
||||||
# If CPU < 80% and Memory < 85% and error hasn't been seen in 30 min,
|
'not found', 'no such', 'invalid'])
|
||||||
# the system has recovered and the error is stale.
|
if not is_persistent:
|
||||||
if current_cpu < 80 and current_mem < 85:
|
cursor.execute('''
|
||||||
stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat()
|
UPDATE errors SET resolved_at = ?
|
||||||
cursor.execute('''
|
WHERE error_key = ? AND resolved_at IS NULL
|
||||||
UPDATE errors
|
''', (now_iso, vm_ek))
|
||||||
SET resolved_at = ?
|
except Exception:
|
||||||
WHERE resolved_at IS NULL
|
pass
|
||||||
AND acknowledged = 0
|
|
||||||
AND last_seen < ?
|
# 5. GENERIC: Any error not seen in 30 min while system is healthy
|
||||||
AND category NOT IN ('disks', 'storage')
|
if current_cpu < 80 and current_mem < 85:
|
||||||
''', (now_iso, stale_generic_cutoff))
|
stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat()
|
||||||
|
cursor.execute('''
|
||||||
except Exception:
|
UPDATE errors SET resolved_at = ?
|
||||||
pass # If we can't read uptime, skip this cleanup
|
WHERE resolved_at IS NULL AND acknowledged = 0
|
||||||
|
AND last_seen < ?
|
||||||
|
AND category NOT IN ('disks', 'storage')
|
||||||
|
''', (now_iso, stale_generic_cutoff))
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
pass # If we can't read uptime, skip this cleanup
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user