Update health_persistence.py

This commit is contained in:
MacRimi
2026-04-16 19:18:42 +02:00
parent 1ef4bc4fed
commit 6660122e69

View File

@@ -695,124 +695,113 @@ class HealthPersistence:
result = {'success': False, 'error_key': error_key} result = {'success': False, 'error_key': error_key}
if not row: if not row:
# Error not in DB yet -- create a minimal record so the dismiss persists. # Error not in DB yet -- create a minimal record so the dismiss persists.
# Try to infer category from the error_key prefix. # Try to infer category from the error_key prefix.
category = '' category = ''
# Order matters: more specific prefixes MUST come before shorter ones # Order matters: more specific prefixes MUST come before shorter ones
# e.g. 'security_updates' (updates) before 'security_' (security) # e.g. 'security_updates' (updates) before 'security_' (security)
for cat, prefix in [('updates', 'security_updates'), ('updates', 'system_age'), for cat, prefix in [('updates', 'security_updates'), ('updates', 'system_age'),
('updates', 'pending_updates'), ('updates', 'kernel_pve'), ('updates', 'pending_updates'), ('updates', 'kernel_pve'),
('security', 'security_'), ('security', 'security_'),
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'), ('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'), ('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
('logs', 'log_'), ('network', 'net_'), ('logs', 'log_'), ('network', 'net_'),
('temperature', 'temp_')]: ('temperature', 'temp_')]:
if error_key == prefix or error_key.startswith(prefix): if error_key == prefix or error_key.startswith(prefix):
category = cat category = cat
break break
# Fallback: if no category matched, try to infer from common patterns # Fallback: if no category matched, try to infer from common patterns
if not category: if not category:
if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key: if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key:
category = 'disks' category = 'disks'
else: else:
category = 'general' # Use 'general' as ultimate fallback instead of empty string category = 'general'
setting_key = self.CATEGORY_SETTING_MAP.get(category, '') setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
sup_hours = self.DEFAULT_SUPPRESSION_HOURS sup_hours = self.DEFAULT_SUPPRESSION_HOURS
if setting_key: if setting_key:
# P4 fix: use _get_setting_impl with existing connection stored = self._get_setting_impl(conn, setting_key)
stored = self._get_setting_impl(conn, setting_key) if stored is not None:
if stored is not None: try:
try: sup_hours = int(stored)
sup_hours = int(stored) except (ValueError, TypeError):
except (ValueError, TypeError): pass
pass
# Insert as acknowledged but NOT resolved - error remains active
# Insert as acknowledged but NOT resolved - error remains active cursor.execute('''
cursor.execute(''' INSERT INTO errors (error_key, category, severity, reason, first_seen, last_seen,
INSERT INTO errors (error_key, category, severity, reason, first_seen, last_seen, occurrence_count, acknowledged, acknowledged_at, suppression_hours)
occurrence_count, acknowledged, acknowledged_at, suppression_hours) VALUES (?, ?, 'WARNING', 'Dismissed by user', ?, ?, 1, 1, ?, ?)
VALUES (?, ?, 'WARNING', 'Dismissed by user', ?, ?, 1, 1, ?, ?) ''', (error_key, category, now, now, now, sup_hours))
''', (error_key, category, now, now, now, sup_hours))
self._record_event(cursor, 'acknowledged', error_key, {
self._record_event(cursor, 'acknowledged', error_key, { 'original_severity': 'WARNING',
'original_severity': 'WARNING', 'category': category,
'category': category, 'suppression_hours': sup_hours
'suppression_hours': sup_hours })
})
result = {
result = { 'success': True,
'success': True, 'error_key': error_key,
'error_key': error_key, 'original_severity': 'WARNING',
'original_severity': 'WARNING', 'category': category,
'category': category, 'suppression_hours': sup_hours,
'suppression_hours': sup_hours, 'acknowledged_at': now
'acknowledged_at': now }
} conn.commit()
conn.commit() return result
return result
if row:
if row: error_dict = dict(row)
error_dict = dict(row) original_severity = error_dict.get('severity', 'WARNING')
original_severity = error_dict.get('severity', 'WARNING') category = error_dict.get('category', '')
category = error_dict.get('category', '')
# Look up the user's configured suppression for this category
# Look up the user's configured suppression for this category setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
setting_key = self.CATEGORY_SETTING_MAP.get(category, '') sup_hours = self.DEFAULT_SUPPRESSION_HOURS
sup_hours = self.DEFAULT_SUPPRESSION_HOURS if setting_key:
if setting_key: stored = self._get_setting_impl(conn, setting_key)
# P4 fix: use _get_setting_impl with existing connection if stored is not None:
stored = self._get_setting_impl(conn, setting_key) try:
if stored is not None: sup_hours = int(stored)
try: except (ValueError, TypeError):
sup_hours = int(stored) pass
except (ValueError, TypeError):
pass
# Mark as acknowledged but DO NOT set resolved_at
# The error remains active until it actually disappears from the system
# resolved_at should only be set when the error is truly resolved
cursor.execute('''
UPDATE errors
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
WHERE error_key = ?
''', (now, sup_hours, error_key))
self._record_event(cursor, 'acknowledged', error_key, {
'original_severity': original_severity,
'category': category,
'suppression_hours': sup_hours
})
# Cascade acknowledge: when dismissing a group check
# (e.g. log_persistent_errors), also dismiss all individual
# sub-errors that share the same prefix in the DB.
# Currently only persistent errors have per-pattern sub-records
# (e.g. log_persistent_a1b2c3d4).
CASCADE_PREFIXES = {
'log_persistent_errors': 'log_persistent_',
}
child_prefix = CASCADE_PREFIXES.get(error_key)
if child_prefix:
# Only cascade to active (unresolved) child errors.
# Already-resolved/expired entries must NOT be re-surfaced.
# Mark as acknowledged but DO NOT set resolved_at # Mark as acknowledged but DO NOT set resolved_at
cursor.execute(''' cursor.execute('''
UPDATE errors UPDATE errors
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ? SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL WHERE error_key = ?
''', (now, sup_hours, child_prefix + '%')) ''', (now, sup_hours, error_key))
result = { self._record_event(cursor, 'acknowledged', error_key, {
'success': True, 'original_severity': original_severity,
'error_key': error_key, 'category': category,
'original_severity': original_severity, 'suppression_hours': sup_hours
'category': category, })
'acknowledged_at': now,
'suppression_hours': sup_hours # Cascade acknowledge: when dismissing a group check
} CASCADE_PREFIXES = {
'log_persistent_errors': 'log_persistent_',
}
child_prefix = CASCADE_PREFIXES.get(error_key)
if child_prefix:
cursor.execute('''
UPDATE errors
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
''', (now, sup_hours, child_prefix + '%'))
result = {
'success': True,
'error_key': error_key,
'original_severity': original_severity,
'category': category,
'acknowledged_at': now,
'suppression_hours': sup_hours
}
conn.commit() conn.commit()
finally: finally:
conn.close() conn.close()
@@ -935,199 +924,161 @@ class HealthPersistence:
now_iso = now.isoformat() now_iso = now.isoformat()
# Delete resolved errors older than 7 days # Delete resolved errors older than 7 days
cutoff_resolved = (now - timedelta(days=7)).isoformat() cutoff_resolved = (now - timedelta(days=7)).isoformat()
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,)) cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
# ── Auto-resolve stale errors using Suppression Duration settings ── # ── Auto-resolve stale errors using Suppression Duration settings ──
# Read per-category suppression hours from user_settings. user_settings = {}
# If the user hasn't configured a value, use DEFAULT_SUPPRESSION_HOURS.
# This is the SINGLE source of truth for auto-resolution timing.
user_settings = {}
try:
cursor.execute(
'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
('suppress_%',)
)
for row in cursor.fetchall():
user_settings[row[0]] = row[1]
except Exception:
pass
for category, setting_key in self.CATEGORY_SETTING_MAP.items():
stored = user_settings.get(setting_key)
try: try:
hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS cursor.execute(
except (ValueError, TypeError): 'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
hours = self.DEFAULT_SUPPRESSION_HOURS ('suppress_%',)
)
# -1 means permanently suppressed -- skip auto-resolve for row in cursor.fetchall():
if hours < 0: user_settings[row[0]] = row[1]
continue except Exception:
pass
cutoff = (now - timedelta(hours=hours)).isoformat()
cursor.execute(''' for category, setting_key in self.CATEGORY_SETTING_MAP.items():
UPDATE errors stored = user_settings.get(setting_key)
SET resolved_at = ? try:
WHERE category = ? hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS
AND resolved_at IS NULL except (ValueError, TypeError):
AND last_seen < ? hours = self.DEFAULT_SUPPRESSION_HOURS
AND acknowledged = 0
''', (now_iso, category, cutoff)) if hours < 0:
continue
# Catch-all: auto-resolve any error from an unmapped category
# whose last_seen exceeds DEFAULT_SUPPRESSION_HOURS. cutoff = (now - timedelta(hours=hours)).isoformat()
fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
''', (now_iso, fallback_cutoff))
# Delete old events (>30 days)
cutoff_events = (now - timedelta(days=30)).isoformat()
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
# ══════════════════════════════════════════════════════════════════════
# SMART AUTO-RESOLVE: Based on system state, not hardcoded patterns
# ══════════════════════════════════════════════════════════════════════
# Logic: If an error hasn't been seen recently AND the system is healthy,
# the error is stale and should be auto-resolved.
# This works for ANY error pattern, not just predefined ones.
try:
import psutil
# Get system uptime
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.read().split()[0])
# Only auto-resolve if system has been stable for at least 10 minutes
if uptime_seconds > 600: # 10 minutes
current_cpu = psutil.cpu_percent(interval=0.1)
current_mem = psutil.virtual_memory().percent
# ── 1. LOGS category: Auto-resolve if not seen in 15 minutes ──
# Log errors are transient - if journalctl hasn't reported them recently,
# they are from a previous state and should be resolved.
stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat()
cursor.execute(''' cursor.execute('''
UPDATE errors UPDATE errors
SET resolved_at = ? SET resolved_at = ?
WHERE category = 'logs' WHERE category = ?
AND resolved_at IS NULL AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ? AND last_seen < ?
''', (now_iso, stale_logs_cutoff))
# ── 2. CPU category: Auto-resolve if CPU is normal (<75%) ──
if current_cpu < 75:
stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE (category = 'cpu' OR category = 'temperature')
AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%')
''', (now_iso, stale_cpu_cutoff))
# ── 3. MEMORY category: Auto-resolve if memory is normal (<80%) ──
if current_mem < 80:
stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE (category = 'memory' OR category = 'logs')
AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
AND (error_key LIKE '%oom%'
OR error_key LIKE '%memory%'
OR reason LIKE '%memory%'
OR reason LIKE '%OOM%'
OR reason LIKE '%killed%process%')
''', (now_iso, stale_mem_cutoff))
# ── 4. VMS category: Auto-resolve if VM/CT is now running or deleted ──
# Check all active VM/CT errors and resolve if the VM/CT is now running
# NOTE: We do this inline to avoid deadlock (check_vm_running uses _db_lock)
cursor.execute('''
SELECT error_key, category, reason FROM errors
WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%' OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%')
AND resolved_at IS NULL
AND acknowledged = 0 AND acknowledged = 0
''') ''', (now_iso, category, cutoff))
vm_errors = cursor.fetchall()
for error_key, cat, reason in vm_errors: # Catch-all: auto-resolve any error from an unmapped category
# Extract VM/CT ID from error_key fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat()
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key) cursor.execute('''
if vmid_match: UPDATE errors
vmid = vmid_match.group(1) SET resolved_at = ?
try: WHERE resolved_at IS NULL
# Check if VM/CT exists and is running AND acknowledged = 0
vm_running = False AND last_seen < ?
ct_running = False ''', (now_iso, fallback_cutoff))
vm_exists = False
ct_exists = False # Delete old events (>30 days)
cutoff_events = (now - timedelta(days=30)).isoformat()
# Check VM cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
result_vm = subprocess.run(
['qm', 'status', vmid], # ── SMART AUTO-RESOLVE: Based on system state ──
capture_output=True, text=True, timeout=2 try:
) import psutil
if result_vm.returncode == 0: with open('/proc/uptime', 'r') as f:
vm_exists = True uptime_seconds = float(f.read().split()[0])
vm_running = 'running' in result_vm.stdout.lower()
if uptime_seconds > 600:
# Check CT current_cpu = psutil.cpu_percent(interval=0.1)
if not vm_exists: current_mem = psutil.virtual_memory().percent
result_ct = subprocess.run(
['pct', 'status', vmid], # 1. LOGS: Auto-resolve if not seen in 15 minutes
capture_output=True, text=True, timeout=2 stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat()
) cursor.execute('''
if result_ct.returncode == 0: UPDATE errors SET resolved_at = ?
ct_exists = True WHERE category = 'logs' AND resolved_at IS NULL
ct_running = 'running' in result_ct.stdout.lower() AND acknowledged = 0 AND last_seen < ?
''', (now_iso, stale_logs_cutoff))
# Resolve if deleted
if not vm_exists and not ct_exists: # 2. CPU: Auto-resolve if CPU is normal (<75%)
cursor.execute(''' if current_cpu < 75:
UPDATE errors SET resolved_at = ? stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat()
WHERE error_key = ? AND resolved_at IS NULL cursor.execute('''
''', (now_iso, error_key)) UPDATE errors SET resolved_at = ?
# Resolve transient errors if running (not persistent config errors) WHERE (category = 'cpu' OR category = 'temperature')
elif vm_running or ct_running: AND resolved_at IS NULL AND acknowledged = 0
reason_lower = (reason or '').lower() AND last_seen < ?
is_persistent = any(x in reason_lower for x in [ AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%')
'device', 'missing', 'does not exist', 'permission', ''', (now_iso, stale_cpu_cutoff))
'not found', 'no such', 'invalid'
]) # 3. MEMORY: Auto-resolve if memory is normal (<80%)
if not is_persistent: if current_mem < 80:
stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat()
cursor.execute('''
UPDATE errors SET resolved_at = ?
WHERE (category = 'memory' OR category = 'logs')
AND resolved_at IS NULL AND acknowledged = 0
AND last_seen < ?
AND (error_key LIKE '%oom%' OR error_key LIKE '%memory%'
OR reason LIKE '%memory%' OR reason LIKE '%OOM%'
OR reason LIKE '%killed%process%')
''', (now_iso, stale_mem_cutoff))
# 4. VMS: Auto-resolve if VM/CT is now running or deleted
cursor.execute('''
SELECT error_key, category, reason FROM errors
WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%'
OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%')
AND resolved_at IS NULL AND acknowledged = 0
''')
vm_errors = cursor.fetchall()
for vm_ek, cat, vm_reason in vm_errors:
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', vm_ek)
if vmid_match:
vmid = vmid_match.group(1)
try:
vm_running = False
ct_running = False
vm_exists = False
ct_exists = False
result_vm = subprocess.run(
['qm', 'status', vmid],
capture_output=True, text=True, timeout=2)
if result_vm.returncode == 0:
vm_exists = True
vm_running = 'running' in result_vm.stdout.lower()
if not vm_exists:
result_ct = subprocess.run(
['pct', 'status', vmid],
capture_output=True, text=True, timeout=2)
if result_ct.returncode == 0:
ct_exists = True
ct_running = 'running' in result_ct.stdout.lower()
if not vm_exists and not ct_exists:
cursor.execute(''' cursor.execute('''
UPDATE errors SET resolved_at = ? UPDATE errors SET resolved_at = ?
WHERE error_key = ? AND resolved_at IS NULL WHERE error_key = ? AND resolved_at IS NULL
''', (now_iso, error_key)) ''', (now_iso, vm_ek))
except Exception: elif vm_running or ct_running:
pass # Skip this VM/CT if check fails reason_lower = (vm_reason or '').lower()
is_persistent = any(x in reason_lower for x in [
# ── 5. GENERIC: Any error not seen in 30 minutes while system is healthy ── 'device', 'missing', 'does not exist', 'permission',
# If CPU < 80% and Memory < 85% and error hasn't been seen in 30 min, 'not found', 'no such', 'invalid'])
# the system has recovered and the error is stale. if not is_persistent:
if current_cpu < 80 and current_mem < 85: cursor.execute('''
stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat() UPDATE errors SET resolved_at = ?
cursor.execute(''' WHERE error_key = ? AND resolved_at IS NULL
UPDATE errors ''', (now_iso, vm_ek))
SET resolved_at = ? except Exception:
WHERE resolved_at IS NULL pass
AND acknowledged = 0
AND last_seen < ? # 5. GENERIC: Any error not seen in 30 min while system is healthy
AND category NOT IN ('disks', 'storage') if current_cpu < 80 and current_mem < 85:
''', (now_iso, stale_generic_cutoff)) stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat()
cursor.execute('''
except Exception: UPDATE errors SET resolved_at = ?
pass # If we can't read uptime, skip this cleanup WHERE resolved_at IS NULL AND acknowledged = 0
AND last_seen < ?
AND category NOT IN ('disks', 'storage')
''', (now_iso, stale_generic_cutoff))
except Exception:
pass # If we can't read uptime, skip this cleanup
conn.commit() conn.commit()
finally: finally:
conn.close() conn.close()