diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index efaea4e..b401707 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -1,8 +1,11 @@ "use client" +import type React from "react" + import { useState, useEffect } from "react" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" import { Badge } from "@/components/ui/badge" +import { Button } from "@/components/ui/button" import { Loader2, CheckCircle2, @@ -19,6 +22,7 @@ import { FileText, RefreshCw, Shield, + X, } from "lucide-react" interface CategoryCheck { @@ -148,17 +152,53 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const stats = getHealthStats() + const handleCategoryClick = (categoryKey: string, status: string) => { + if (status === "OK") return // Do not navigate when the category is OK + + onOpenChange(false) // Close the modal + + // Map health categories to dashboard tabs + const categoryToTab: Record<string, string> = { + storage: "storage", + disks: "storage", + network: "network", + vms: "vms", + logs: "logs", + hardware: "hardware", + services: "hardware", + } + + const targetTab = categoryToTab[categoryKey] + if (targetTab) { + // Dispatch an event so the dashboard switches to the target tab + const event = new CustomEvent("changeTab", { detail: { tab: targetTab } }) + window.dispatchEvent(event) + } + } + + const handleAcknowledge = async (errorKey: string, e: React.MouseEvent) => { + e.stopPropagation() // Prevent navigation + + try { + await fetch(getApiUrl(`/api/health/acknowledge/${errorKey}`), { + method: "POST", + }) + // Refresh health data + await fetchHealthDetails() + } catch (err) { + console.error("[v0] Error acknowledging:", err) + } + } + return ( - -
- - System Health Status -
- {healthData && getStatusBadge(healthData.overall)} + + + System Health Status +
{healthData && getStatusBadge(healthData.overall)}
Detailed health checks for all system components
@@ -213,13 +253,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu return (
handleCategoryClick(key, status)} className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${ status === "OK" ? "bg-green-500/5 border-green-500/20 hover:bg-green-500/10" : status === "WARNING" - ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10" + ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer" : status === "CRITICAL" - ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10" + ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer" : "bg-muted/30 hover:bg-muted/50" }`} > @@ -251,10 +292,25 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu {Object.entries(details).map(([detailKey, detailValue]: [string, any]) => { if (typeof detailValue === "object" && detailValue !== null) { return ( -
- {detailKey}: - {detailValue.reason && ( - {detailValue.reason} +
+
+ {detailKey}: + {detailValue.reason && ( + {detailValue.reason} + )} +
+ {status !== "OK" && ( + )}
) diff --git a/AppImage/components/proxmox-dashboard.tsx b/AppImage/components/proxmox-dashboard.tsx index 8bc2164..bf320ea 100644 --- a/AppImage/components/proxmox-dashboard.tsx +++ b/AppImage/components/proxmox-dashboard.tsx @@ -98,10 +98,19 @@ export function ProxmoxDashboard() { const uptimeValue = data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A" - const healthStatus = data.health?.status || "healthy" + const backendStatus = data.health?.status?.toUpperCase() || "OK" + let healthStatus: "healthy" | "warning" | "critical" + + if (backendStatus === "CRITICAL") { + healthStatus = "critical" + } else if (backendStatus === "WARNING") { + healthStatus = "warning" + } else { + healthStatus = "healthy" + } setSystemStatus({ - status: healthStatus as "healthy" | "warning" | "critical", + status: healthStatus, uptime: uptimeValue, lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }), serverName: data.hostname || "Unknown", @@ -127,11 +136,13 @@ export function ProxmoxDashboard() { // Always run an initial fetch fetchSystemData() + // Overview tab: refresh every 30 seconds so the health status stays current + // Other tabs: every 60 seconds to reduce load let interval: ReturnType<typeof setInterval> | null = null if (activeTab === "overview") { - interval = setInterval(fetchSystemData, 9000) // Changed from 10000 to 9000ms + interval = setInterval(fetchSystemData, 30000) // 30 seconds } else { - interval = setInterval(fetchSystemData, 61000) // Changed from 60000 to 61000ms + interval = setInterval(fetchSystemData, 60000) // 60 seconds } return () => { @@ -139,6 +150,20 @@ } }, [fetchSystemData, activeTab]) + useEffect(() => { + const handleChangeTab = (event: CustomEvent) => { + const { tab } = event.detail + if (tab) { + setActiveTab(tab) + } + } + + window.addEventListener("changeTab", handleChangeTab as EventListener) + return () => { + window.removeEventListener("changeTab", handleChangeTab as EventListener) + } + }, []) + useEffect(() => { if ( systemStatus.serverName && diff --git a/AppImage/scripts/flask_health_routes.py b/AppImage/scripts/flask_health_routes.py index fb32f51..22534c0 100644 --- a/AppImage/scripts/flask_health_routes.py +++ b/AppImage/scripts/flask_health_routes.py @@ -1,9 +1,10 @@ """ -Flask routes for health monitoring +Flask routes for health monitoring with persistence support """ -from flask import Blueprint, jsonify +from flask import Blueprint, jsonify, request from health_monitor import health_monitor +from health_persistence import health_persistence health_bp = Blueprint('health', __name__) @@ -47,3 +48,22 @@ def get_system_info(): return jsonify(info) except Exception as e: return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/acknowledge/<error_key>', methods=['POST']) +def acknowledge_error(error_key): + """Acknowledge an error manually (user dismissed it)""" + try: + health_persistence.acknowledge_error(error_key) + return jsonify({'success': True, 'message': 'Error acknowledged'}) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/active-errors', methods=['GET']) +def get_active_errors(): + """Get all active persistent errors""" + try: + category = request.args.get('category') + errors = health_persistence.get_active_errors(category) + return jsonify({'errors': errors}) + except Exception as e: + return jsonify({'error': str(e)}), 500 diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py 
index e06ef36..426e8d9 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -17,10 +17,12 @@ from datetime import datetime, timedelta from collections import defaultdict import re +from health_persistence import health_persistence + class HealthMonitor: """ Monitors system health across multiple components with minimal impact. - Implements hysteresis, intelligent caching, and progressive escalation. + Implements hysteresis, intelligent caching, progressive escalation, and persistent error tracking. Always returns all 10 health categories. """ @@ -28,8 +30,8 @@ class HealthMonitor: CPU_WARNING = 85 CPU_CRITICAL = 95 CPU_RECOVERY = 75 - CPU_WARNING_DURATION = 60 - CPU_CRITICAL_DURATION = 120 + CPU_WARNING_DURATION = 300 # 5 minutes sustained + CPU_CRITICAL_DURATION = 300 # 5 minutes sustained CPU_RECOVERY_DURATION = 120 # Memory Thresholds @@ -85,6 +87,11 @@ class HealthMonitor: self.io_error_history = defaultdict(list) self.failed_vm_history = set() # Track VMs that failed to start + try: + health_persistence.cleanup_old_errors() + except Exception as e: + print(f"[HealthMonitor] Cleanup warning: {e}") + def get_system_info(self) -> Dict[str, Any]: """ Get lightweight system info for header display. @@ -188,7 +195,11 @@ class HealthMonitor: """ Get comprehensive health status with all checks. Returns JSON structure with ALL 10 categories always present. + Now includes persistent error tracking. """ + active_errors = health_persistence.get_active_errors() + persistent_issues = {err['error_key']: err for err in active_errors} + details = { 'cpu': {'status': 'OK'}, 'memory': {'status': 'OK'}, @@ -231,8 +242,8 @@ class HealthMonitor: elif disks_status.get('status') == 'WARNING': warning_issues.append(disks_status.get('reason', 'Disk issue')) - # Priority 4: VMs/CTs - now detects qmp errors from logs - vms_status = self._check_vms_cts_optimized() + # Priority 4: VMs/CTs - now with persistence + vms_status = self._check_vms_cts_with_persistence() if vms_status: details['vms'] = vms_status if vms_status.get('status') == 'CRITICAL': @@ -265,8 +276,8 @@ class HealthMonitor: elif memory_status.get('status') == 'WARNING': warning_issues.append(memory_status.get('reason', 'Memory high')) - # Priority 8: Logs - logs_status = self._check_logs_lightweight() + # Priority 8: Logs - now with persistence + logs_status = self._check_logs_with_persistence() details['logs'] = logs_status if logs_status.get('status') == 'CRITICAL': critical_issues.append(logs_status.get('reason', 'Critical log errors')) @@ -305,7 +316,7 @@ class HealthMonitor: } def _check_cpu_with_hysteresis(self) -> Dict[str, Any]: - """Check CPU with hysteresis to avoid flapping alerts""" + """Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage""" try: cpu_percent = psutil.cpu_percent(interval=1) current_time = time.time() @@ -318,33 +329,33 @@ class HealthMonitor: self.state_history[state_key] = [ entry for entry in self.state_history[state_key] - if current_time - entry['time'] < 300 + if current_time - entry['time'] < 360 ] - critical_duration = sum( - 1 for entry in self.state_history[state_key] + critical_samples = [ + entry for entry in self.state_history[state_key] if entry['value'] >= self.CPU_CRITICAL and current_time - entry['time'] <= self.CPU_CRITICAL_DURATION - ) + ] - warning_duration = sum( - 1 for entry in self.state_history[state_key] + warning_samples = [ + entry for entry in self.state_history[state_key] if entry['value'] >= self.CPU_WARNING and 
current_time - entry['time'] <= self.CPU_WARNING_DURATION - ) + ] - recovery_duration = sum( - 1 for entry in self.state_history[state_key] + recovery_samples = [ + entry for entry in self.state_history[state_key] if entry['value'] < self.CPU_RECOVERY and current_time - entry['time'] <= self.CPU_RECOVERY_DURATION - ) + ] - if critical_duration >= 2: + if len(critical_samples) >= 3: status = 'CRITICAL' - reason = f'CPU >{self.CPU_CRITICAL}% for {self.CPU_CRITICAL_DURATION}s' - elif warning_duration >= 2 and recovery_duration < 2: + reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s' + elif len(warning_samples) >= 3 and len(recovery_samples) < 2: status = 'WARNING' - reason = f'CPU >{self.CPU_WARNING}% for {self.CPU_WARNING_DURATION}s' + reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s' else: status = 'OK' reason = None @@ -871,15 +882,15 @@ class HealthMonitor: def _check_vms_cts_optimized(self) -> Dict[str, Any]: """ - Optimized VM/CT check - detects qmp failures and other VM errors. - Now parses logs for VM/CT specific errors like qmp command failures. + Optimized VM/CT check - detects qmp failures and startup errors from logs. + Improved detection of container and VM errors from journalctl. """ try: issues = [] vm_details = {} result = subprocess.run( - ['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*', '-p', 'warning'], + ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'], capture_output=True, text=True, timeout=3 @@ -903,22 +914,56 @@ class HealthMonitor: } continue - ct_match = re.search(r'(?:ct|container)\s+(\d+)', line_lower) - if ct_match and ('error' in line_lower or 'fail' in line_lower): - ctid = ct_match.group(1) + ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower) + if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower): + ctid = ct_error_match.group(1) key = f'ct_{ctid}' if key not in vm_details: - issues.append(f'CT {ctid}: Error detected') + if 'device' in line_lower and 'does not exist' in line_lower: + device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower) + if device_match: + reason = f'Device {device_match.group(1)} missing' + else: + reason = 'Device error' + elif 'failed to start' in line_lower: + reason = 'Failed to start' + else: + reason = 'Container error' + + issues.append(f'CT {ctid}: {reason}') + vm_details[key] = { + 'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL', + 'reason': reason, + 'id': ctid, + 'type': 'CT' + } + continue + + vzstart_match = re.search(r'vzstart:(\d+):', line) + if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower): + ctid = vzstart_match.group(1) + key = f'ct_{ctid}' + if key not in vm_details: + # Extract the error message + if 'device' in line_lower and 'does not exist' in line_lower: + device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower) + if device_match: + reason = f'Device {device_match.group(1)} missing' + else: + reason = 'Device error' + else: + reason = 'Startup error' + + issues.append(f'CT {ctid}: {reason}') vm_details[key] = { 'status': 'WARNING', - 'reason': 'Container error', + 'reason': reason, 'id': ctid, 'type': 'CT' } continue if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']): - # Extract VM/CT ID id_match = re.search(r'\b(\d{3,4})\b', line) if id_match: vmid = 
id_match.group(1) @@ -946,6 +991,118 @@ class HealthMonitor: except Exception: return {'status': 'OK'} + # Modified to use persistence + def _check_vms_cts_with_persistence(self) -> Dict[str, Any]: + """ + Check VMs/CTs with persistent error tracking. + Errors persist until VM starts or 48h elapsed. + """ + try: + issues = [] + vm_details = {} + + # Get persistent errors first + persistent_errors = health_persistence.get_active_errors('vms') + + # Check if any persistent VMs/CTs have started + for error in persistent_errors: + error_key = error['error_key'] + if error_key.startswith('vm_') or error_key.startswith('ct_'): + vm_id = error_key.split('_')[1] + if health_persistence.check_vm_running(vm_id): + continue # Error auto-resolved + + # Still active + vm_details[error_key] = { + 'status': error['severity'], + 'reason': error['reason'], + 'id': error.get('details', {}).get('id', 'unknown'), + 'type': error.get('details', {}).get('type', 'VM/CT'), + 'first_seen': error['first_seen'] + } + issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}") + + # Check for new errors in logs + result = subprocess.run( + ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'], + capture_output=True, + text=True, + timeout=3 + ) + + if result.returncode == 0: + for line in result.stdout.split('\n'): + line_lower = line.lower() + + # VM QMP errors + vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) + if vm_qmp_match: + vmid = vm_qmp_match.group(1) + error_key = f'vm_{vmid}' + if error_key not in vm_details: + # Record persistent error + health_persistence.record_error( + error_key=error_key, + category='vms', + severity='WARNING', + reason='QMP command timeout', + details={'id': vmid, 'type': 'VM'} + ) + issues.append(f'VM {vmid}: Communication issue') + vm_details[error_key] = { + 'status': 'WARNING', + 'reason': 'QMP command timeout', + 'id': vmid, + 'type': 'VM' + } + continue + + # Container errors + vzstart_match = re.search(r'vzstart:(\d+):', line) + if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower): + ctid = vzstart_match.group(1) + error_key = f'ct_{ctid}' + + if error_key not in vm_details: + if 'device' in line_lower and 'does not exist' in line_lower: + device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower) + if device_match: + reason = f'Device {device_match.group(1)} missing' + else: + reason = 'Device error' + else: + reason = 'Startup error' + + # Record persistent error + health_persistence.record_error( + error_key=error_key, + category='vms', + severity='WARNING', + reason=reason, + details={'id': ctid, 'type': 'CT'} + ) + issues.append(f'CT {ctid}: {reason}') + vm_details[error_key] = { + 'status': 'WARNING', + 'reason': reason, + 'id': ctid, + 'type': 'CT' + } + + if not issues: + return {'status': 'OK'} + + has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values()) + + return { + 'status': 'CRITICAL' if has_critical else 'WARNING', + 'reason': '; '.join(issues[:3]), + 'details': vm_details + } + + except Exception: + return {'status': 'OK'} + def _check_pve_services(self) -> Dict[str, Any]: """Check critical Proxmox services""" try: @@ -980,13 +1137,24 @@ class HealthMonitor: 'reason': f'Service check failed: {str(e)}' } - def _check_logs_lightweight(self) -> Dict[str, Any]: - """Lightweight log analysis (cached, checked every 5 minutes)""" + # Modified to use 
persistence + def _check_logs_with_persistence(self) -> Dict[str, Any]: + """ + Check logs with persistent error tracking. + Critical log errors persist for 24h unless acknowledged. + """ cache_key = 'logs_analysis' current_time = time.time() if cache_key in self.last_check_times: if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL: + # Return persistent errors if any + persistent_errors = health_persistence.get_active_errors('logs') + if persistent_errors: + return { + 'status': 'WARNING', + 'reason': f'{len(persistent_errors)} persistent log issues' + } return self.cached_results.get(cache_key, {'status': 'OK'}) try: @@ -1011,6 +1179,16 @@ class HealthMonitor: if keyword.lower() in line_lower: critical_keywords_found.append(keyword) errors_5m += 1 + + # Record persistent error for critical keywords + error_key = f'log_critical_{keyword.replace(" ", "_")}' + health_persistence.record_error( + error_key=error_key, + category='logs', + severity='CRITICAL', + reason=f'Critical log: {keyword}', + details={'keyword': keyword} + ) break else: if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower: diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py new file mode 100644 index 0000000..8814b86 --- /dev/null +++ b/AppImage/scripts/health_persistence.py @@ -0,0 +1,326 @@ +""" +Health Monitor Persistence Module +Manages persistent error tracking across AppImage updates using SQLite. +Stores errors in /root/.config/proxmenux-monitor/health_monitor.db + +Features: +- Persistent error storage (survives AppImage updates) +- Smart error resolution (auto-clear when VM starts, or after 48h) +- Event system for future Telegram notifications +- Manual acknowledgment support + +Author: MacRimi +Version: 1.0 +""" + +import sqlite3 +import json +import os +from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from pathlib import Path + +class HealthPersistence: + """Manages persistent health error tracking""" + + # Error retention periods (seconds) + VM_ERROR_RETENTION = 48 * 3600 # 48 hours + LOG_ERROR_RETENTION = 24 * 3600 # 24 hours + DISK_ERROR_RETENTION = 48 * 3600 # 48 hours + + def __init__(self): + """Initialize persistence with database in config directory""" + self.data_dir = Path('/root/.config/proxmenux-monitor') + self.data_dir.mkdir(parents=True, exist_ok=True) + + self.db_path = self.data_dir / 'health_monitor.db' + self._init_database() + + def _init_database(self): + """Initialize SQLite database with required tables""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + # Errors table + cursor.execute(''' + CREATE TABLE IF NOT EXISTS errors ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + error_key TEXT UNIQUE NOT NULL, + category TEXT NOT NULL, + severity TEXT NOT NULL, + reason TEXT NOT NULL, + details TEXT, + first_seen TEXT NOT NULL, + last_seen TEXT NOT NULL, + resolved_at TEXT, + acknowledged INTEGER DEFAULT 0, + notification_sent INTEGER DEFAULT 0 + ) + ''') + + # Events table (for future Telegram notifications) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + event_type TEXT NOT NULL, + error_key TEXT NOT NULL, + timestamp TEXT NOT NULL, + data TEXT + ) + ''') + + # Indexes for performance + cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)') + cursor.execute('CREATE INDEX IF NOT EXISTS 
idx_resolved ON errors(resolved_at)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)') + + conn.commit() + conn.close() + + def record_error(self, error_key: str, category: str, severity: str, + reason: str, details: Optional[Dict] = None) -> Dict[str, Any]: + """ + Record or update an error. + Returns event info (new_error, updated, etc.) + """ + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + now = datetime.now().isoformat() + details_json = json.dumps(details) if details else None + + # Check if the error already exists (also read its current severity for escalation detection) + cursor.execute('SELECT id, first_seen, notification_sent, severity FROM errors WHERE error_key = ?', + (error_key,)) + existing = cursor.fetchone() + + event_info = {'type': 'updated', 'needs_notification': False} + + if existing: + old_severity = existing[3] + + # Update existing error + cursor.execute(''' + UPDATE errors + SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL + WHERE error_key = ? + ''', (now, severity, reason, details_json, error_key)) + + # Check if severity escalated (compare against the severity read before the update) + if old_severity == 'WARNING' and severity == 'CRITICAL': + event_info['type'] = 'escalated' + event_info['needs_notification'] = True + else: + # Insert new error + cursor.execute(''' + INSERT INTO errors + (error_key, category, severity, reason, details, first_seen, last_seen) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', (error_key, category, severity, reason, details_json, now, now)) + + event_info['type'] = 'new' + event_info['needs_notification'] = True + + # Record event + self._record_event(cursor, event_info['type'], error_key, + {'severity': severity, 'reason': reason}) + + conn.commit() + conn.close() + + return event_info + + def resolve_error(self, error_key: str, reason: str = 'auto-resolved'): + """Mark an error as resolved""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + now = datetime.now().isoformat() + + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE error_key = ? AND resolved_at IS NULL + ''', (now, error_key)) + + if cursor.rowcount > 0: + self._record_event(cursor, 'resolved', error_key, {'reason': reason}) + + conn.commit() + conn.close() + + def acknowledge_error(self, error_key: str): + """Manually acknowledge an error (won't notify again)""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + cursor.execute(''' + UPDATE errors + SET acknowledged = 1 + WHERE error_key = ? + ''', (error_key,)) + + self._record_event(cursor, 'acknowledged', error_key, {}) + + conn.commit() + conn.close() + + def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]: + """Get all active (unresolved) errors, optionally filtered by category""" + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + if category: + cursor.execute(''' + SELECT * FROM errors + WHERE resolved_at IS NULL AND category = ? 
+ ORDER BY severity DESC, last_seen DESC + ''', (category,)) + else: + cursor.execute(''' + SELECT * FROM errors + WHERE resolved_at IS NULL + ORDER BY severity DESC, last_seen DESC + ''') + + rows = cursor.fetchall() + conn.close() + + errors = [] + for row in rows: + error_dict = dict(row) + if error_dict.get('details'): + error_dict['details'] = json.loads(error_dict['details']) + errors.append(error_dict) + + return errors + + def cleanup_old_errors(self): + """Clean up old resolved errors and auto-resolve stale errors""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + now = datetime.now() + + # Delete resolved errors older than 7 days + cutoff_resolved = (now - timedelta(days=7)).isoformat() + cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,)) + + # Auto-resolve VM/CT errors older than 48h + cutoff_vm = (now - timedelta(seconds=self.VM_ERROR_RETENTION)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE category = 'vms' + AND resolved_at IS NULL + AND first_seen < ? + AND acknowledged = 0 + ''', (now.isoformat(), cutoff_vm)) + + # Auto-resolve log errors older than 24h + cutoff_logs = (now - timedelta(seconds=self.LOG_ERROR_RETENTION)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE category = 'logs' + AND resolved_at IS NULL + AND first_seen < ? + AND acknowledged = 0 + ''', (now.isoformat(), cutoff_logs)) + + # Delete old events (>30 days) + cutoff_events = (now - timedelta(days=30)).isoformat() + cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,)) + + conn.commit() + conn.close() + + def check_vm_running(self, vm_id: str) -> bool: + """ + Check if a VM/CT is running and resolve error if so. + Returns True if running and error was resolved. + """ + import subprocess + + try: + # Check qm status for VMs + result = subprocess.run( + ['qm', 'status', vm_id], + capture_output=True, + text=True, + timeout=2 + ) + + if result.returncode == 0 and 'running' in result.stdout.lower(): + self.resolve_error(f'vm_{vm_id}', 'VM started') + return True + + # Check pct status for containers + result = subprocess.run( + ['pct', 'status', vm_id], + capture_output=True, + text=True, + timeout=2 + ) + + if result.returncode == 0 and 'running' in result.stdout.lower(): + self.resolve_error(f'ct_{vm_id}', 'Container started') + return True + + return False + + except Exception: + return False + + def _record_event(self, cursor, event_type: str, error_key: str, data: Dict): + """Internal: Record an event""" + cursor.execute(''' + INSERT INTO events (event_type, error_key, timestamp, data) + VALUES (?, ?, ?, ?) 
+ ''', (event_type, error_key, datetime.now().isoformat(), json.dumps(data))) + + def get_unnotified_errors(self) -> List[Dict[str, Any]]: + """Get errors that need Telegram notification""" + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + cursor.execute(''' + SELECT * FROM errors + WHERE notification_sent = 0 + AND resolved_at IS NULL + AND acknowledged = 0 + ORDER BY severity DESC, first_seen ASC + ''') + + rows = cursor.fetchall() + conn.close() + + errors = [] + for row in rows: + error_dict = dict(row) + if error_dict.get('details'): + error_dict['details'] = json.loads(error_dict['details']) + errors.append(error_dict) + + return errors + + def mark_notified(self, error_key: str): + """Mark error as notified""" + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + cursor.execute(''' + UPDATE errors + SET notification_sent = 1 + WHERE error_key = ? + ''', (error_key,)) + + conn.commit() + conn.close() + + +# Global instance +health_persistence = HealthPersistence()
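
Note on the new HTTP surface: the two routes added in flask_health_routes.py are all a client needs to list persistent errors and to dismiss one. A minimal sketch of calling them from a script follows; the base URL and the example error key are assumptions for illustration, not values defined in this diff.

```python
# Sketch of calling the new health-persistence endpoints.
# Assumptions: the Flask app is reachable at BASE_URL; "ct_105" is an example key.
from typing import Dict, List, Optional

import requests

BASE_URL = "http://localhost:8008"  # assumed host/port

def list_active_errors(category: Optional[str] = None) -> List[Dict]:
    """GET /api/health/active-errors, optionally filtered by category."""
    params = {"category": category} if category else None
    resp = requests.get(f"{BASE_URL}/api/health/active-errors", params=params, timeout=5)
    resp.raise_for_status()
    return resp.json().get("errors", [])

def acknowledge(error_key: str) -> bool:
    """POST /api/health/acknowledge/<error_key>, as the modal's dismiss button does."""
    resp = requests.post(f"{BASE_URL}/api/health/acknowledge/{error_key}", timeout=5)
    resp.raise_for_status()
    return resp.json().get("success", False)

if __name__ == "__main__":
    for err in list_active_errors("vms"):
        print(err["error_key"], err["severity"], err["reason"])
    # acknowledge("ct_105")  # example key, assumed
```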
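Note on the error lifecycle: health_persistence.py keys each error by component (vm_&lt;id&gt;, ct_&lt;id&gt;, log_critical_&lt;keyword&gt;), refreshes it on re-detection, and keeps it active until it is resolved, acknowledged, or aged out by cleanup_old_errors(). The sketch below uses only methods introduced in this diff; the error key and reason are illustrative, and importing the module opens the real database under /root/.config/proxmenux-monitor.

```python
# Illustration of the lifecycle implemented by HealthPersistence (writes to the real DB).
from health_persistence import health_persistence

# 1. A health check records (or refreshes) an error keyed by component.
event = health_persistence.record_error(
    error_key="ct_105",                            # example key, assumed
    category="vms",
    severity="WARNING",
    reason="Device /dev/dri/renderD128 missing",   # example reason, assumed
    details={"id": "105", "type": "CT"},
)
print(event)  # {'type': 'new', 'needs_notification': True} on first detection

# 2. The error stays listed until check_vm_running() sees the CT running again,
#    or the 48h retention window expires in cleanup_old_errors().
active = health_persistence.get_active_errors("vms")
print([e["error_key"] for e in active])

# 3. Acknowledging (the modal's X button) keeps the entry but stops further notifications.
health_persistence.acknowledge_error("ct_105")
```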
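Note on notifications: the module docstring reserves an event system for future Telegram alerts, and get_unnotified_errors()/mark_notified() already exist. A hypothetical consumer could look like the sketch below; send_telegram() is a placeholder, not part of this diff.

```python
# Hypothetical consumer of the notification hooks in health_persistence.py.
from health_persistence import health_persistence

def send_telegram(text: str) -> bool:
    """Placeholder transport; swap in a real Telegram bot call."""
    print(f"[notify] {text}")
    return True

def flush_pending_notifications() -> None:
    # Only unresolved, unacknowledged, not-yet-notified errors are returned.
    for err in health_persistence.get_unnotified_errors():
        text = f"[{err['severity']}] {err['category']}: {err['reason']}"
        if send_telegram(text):
            health_persistence.mark_notified(err["error_key"])

if __name__ == "__main__":
    flush_pending_notifications()
```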
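Note on inspecting the store: because persistence is plain SQLite at a fixed path, the errors table created by _init_database() can be examined read-only on the host, e.g.:

```python
# Read-only peek at the persistence database defined in this diff.
import sqlite3

DB_PATH = "/root/.config/proxmenux-monitor/health_monitor.db"

with sqlite3.connect(DB_PATH) as conn:
    conn.row_factory = sqlite3.Row
    rows = conn.execute(
        "SELECT error_key, category, severity, first_seen, acknowledged "
        "FROM errors WHERE resolved_at IS NULL ORDER BY last_seen DESC"
    ).fetchall()
    for row in rows:
        print(dict(row))
```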
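Note on the hysteresis change: raising CPU_WARNING_DURATION/CPU_CRITICAL_DURATION to 300 s and requiring at least three qualifying samples means a single spike can no longer trip an alert. A simplified standalone version of that rule, for illustration only (not the class method itself):

```python
# Simplified form of the sustained-CPU rule used by _check_cpu_with_hysteresis.
import time
from typing import Dict, List

CPU_CRITICAL = 95
CPU_CRITICAL_DURATION = 300  # seconds, matches the new constant

def is_sustained_critical(history: List[Dict[str, float]], now: float) -> bool:
    """history items look like {'time': epoch_seconds, 'value': cpu_percent}."""
    critical_samples = [
        e for e in history
        if e["value"] >= CPU_CRITICAL and now - e["time"] <= CPU_CRITICAL_DURATION
    ]
    return len(critical_samples) >= 3

now = time.time()
# One 98% spike two minutes ago is not enough on its own:
print(is_sustained_critical([{"time": now - 120, "value": 98.0}], now))  # False
```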