Mirror of https://github.com/MacRimi/ProxMenux.git (synced 2025-11-18 03:26:17 +00:00)

Commit: Update AppImage
@@ -1,8 +1,11 @@
 "use client"
 
+import type React from "react"
+
 import { useState, useEffect } from "react"
 import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
 import { Badge } from "@/components/ui/badge"
+import { Button } from "@/components/ui/button"
 import {
   Loader2,
   CheckCircle2,
@@ -19,6 +22,7 @@ import {
   FileText,
   RefreshCw,
   Shield,
+  X,
 } from "lucide-react"
 
 interface CategoryCheck {
@@ -148,17 +152,53 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
 
   const stats = getHealthStats()
 
+  const handleCategoryClick = (categoryKey: string, status: string) => {
+    if (status === "OK") return // Do not navigate when the category is OK
+
+    onOpenChange(false) // Close the modal
+
+    // Map categories to dashboard tabs
+    const categoryToTab: Record<string, string> = {
+      storage: "storage",
+      disks: "storage",
+      network: "network",
+      vms: "vms",
+      logs: "logs",
+      hardware: "hardware",
+      services: "hardware",
+    }
+
+    const targetTab = categoryToTab[categoryKey]
+    if (targetTab) {
+      // Dispatch an event so the dashboard switches tab
+      const event = new CustomEvent("changeTab", { detail: { tab: targetTab } })
+      window.dispatchEvent(event)
+    }
+  }
+
+  const handleAcknowledge = async (errorKey: string, e: React.MouseEvent) => {
+    e.stopPropagation() // Prevent navigation
+
+    try {
+      await fetch(getApiUrl(`/api/health/acknowledge/${errorKey}`), {
+        method: "POST",
+      })
+      // Refresh health data
+      await fetchHealthDetails()
+    } catch (err) {
+      console.error("[v0] Error acknowledging:", err)
+    }
+  }
+
   return (
     <Dialog open={open} onOpenChange={onOpenChange}>
       <DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
         <DialogHeader>
-          <DialogTitle className="flex items-center justify-between">
-            <div className="flex items-center gap-2">
-              <Activity className="h-6 w-6" />
-              System Health Status
-            </div>
-            {healthData && getStatusBadge(healthData.overall)}
+          <DialogTitle className="flex items-center gap-2">
+            <Activity className="h-6 w-6" />
+            System Health Status
           </DialogTitle>
+          <div className="mt-4">{healthData && getStatusBadge(healthData.overall)}</div>
           <DialogDescription>Detailed health checks for all system components</DialogDescription>
         </DialogHeader>
 
@@ -213,13 +253,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
             return (
               <div
                 key={key}
+                onClick={() => handleCategoryClick(key, status)}
                 className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${
                   status === "OK"
                     ? "bg-green-500/5 border-green-500/20 hover:bg-green-500/10"
                     : status === "WARNING"
-                      ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10"
+                      ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer"
                       : status === "CRITICAL"
-                        ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10"
+                        ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer"
                         : "bg-muted/30 hover:bg-muted/50"
                 }`}
               >
@@ -251,10 +292,25 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
                   {Object.entries(details).map(([detailKey, detailValue]: [string, any]) => {
                     if (typeof detailValue === "object" && detailValue !== null) {
                       return (
-                        <div key={detailKey} className="text-xs pl-3 border-l-2 border-muted">
-                          <span className="font-medium">{detailKey}:</span>
-                          {detailValue.reason && (
-                            <span className="ml-1 text-muted-foreground">{detailValue.reason}</span>
+                        <div
+                          key={detailKey}
+                          className="flex items-start justify-between gap-2 text-xs pl-3 border-l-2 border-muted"
+                        >
+                          <div>
+                            <span className="font-medium">{detailKey}:</span>
+                            {detailValue.reason && (
+                              <span className="ml-1 text-muted-foreground">{detailValue.reason}</span>
+                            )}
+                          </div>
+                          {status !== "OK" && (
+                            <Button
+                              size="sm"
+                              variant="ghost"
+                              className="h-5 px-1 hover:bg-red-500/10"
+                              onClick={(e) => handleAcknowledge(detailKey, e)}
+                            >
+                              <X className="h-3 w-3" />
+                            </Button>
                           )}
                         </div>
                       )

@@ -98,10 +98,19 @@ export function ProxmoxDashboard() {
       const uptimeValue =
         data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A"
 
-      const healthStatus = data.health?.status || "healthy"
+      const backendStatus = data.health?.status?.toUpperCase() || "OK"
+      let healthStatus: "healthy" | "warning" | "critical"
+
+      if (backendStatus === "CRITICAL") {
+        healthStatus = "critical"
+      } else if (backendStatus === "WARNING") {
+        healthStatus = "warning"
+      } else {
+        healthStatus = "healthy"
+      }
 
       setSystemStatus({
-        status: healthStatus as "healthy" | "warning" | "critical",
+        status: healthStatus,
         uptime: uptimeValue,
         lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }),
         serverName: data.hostname || "Unknown",
@@ -127,11 +136,13 @@ export function ProxmoxDashboard() {
     // Always do an initial fetch
     fetchSystemData()
 
+    // On the overview tab: every 30 seconds so the health status refreshes frequently
+    // On other tabs: every 60 seconds to reduce load
     let interval: ReturnType<typeof setInterval> | null = null
     if (activeTab === "overview") {
-      interval = setInterval(fetchSystemData, 9000) // changed from 10000 to 9000 ms
+      interval = setInterval(fetchSystemData, 30000) // 30 seconds
    } else {
-      interval = setInterval(fetchSystemData, 61000) // changed from 60000 to 61000 ms
+      interval = setInterval(fetchSystemData, 60000) // 60 seconds
    }
 
     return () => {
@@ -139,6 +150,20 @@ export function ProxmoxDashboard() {
     }
   }, [fetchSystemData, activeTab])
 
+  useEffect(() => {
+    const handleChangeTab = (event: CustomEvent) => {
+      const { tab } = event.detail
+      if (tab) {
+        setActiveTab(tab)
+      }
+    }
+
+    window.addEventListener("changeTab", handleChangeTab as EventListener)
+    return () => {
+      window.removeEventListener("changeTab", handleChangeTab as EventListener)
+    }
+  }, [])
+
   useEffect(() => {
     if (
       systemStatus.serverName &&

@@ -1,9 +1,10 @@
 """
-Flask routes for health monitoring
+Flask routes for health monitoring with persistence support
 """
 
-from flask import Blueprint, jsonify
+from flask import Blueprint, jsonify, request
 from health_monitor import health_monitor
+from health_persistence import health_persistence
 
 health_bp = Blueprint('health', __name__)
 
@@ -47,3 +48,22 @@ def get_system_info():
         return jsonify(info)
     except Exception as e:
         return jsonify({'error': str(e)}), 500
+
+@health_bp.route('/api/health/acknowledge/<error_key>', methods=['POST'])
+def acknowledge_error(error_key):
+    """Acknowledge an error manually (user dismissed it)"""
+    try:
+        health_persistence.acknowledge_error(error_key)
+        return jsonify({'success': True, 'message': 'Error acknowledged'})
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+@health_bp.route('/api/health/active-errors', methods=['GET'])
+def get_active_errors():
+    """Get all active persistent errors"""
+    try:
+        category = request.args.get('category')
+        errors = health_persistence.get_active_errors(category)
+        return jsonify({'errors': errors})
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
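
As a quick way to exercise the two new endpoints outside the React modal, a small script like the following works; this is only a sketch, and the base URL, port, and error key are placeholders rather than values taken from this commit.

import requests  # assumption: requests is available in the test environment

BASE_URL = "http://localhost:8008"  # placeholder: adjust to wherever the Flask app listens

# List active persistent errors, optionally filtered by category
resp = requests.get(f"{BASE_URL}/api/health/active-errors", params={"category": "vms"})
print(resp.json())  # {'errors': [...]}

# Acknowledge (dismiss) one of them by its error_key, e.g. 'vm_101'
resp = requests.post(f"{BASE_URL}/api/health/acknowledge/vm_101")
print(resp.json())  # {'success': True, 'message': 'Error acknowledged'}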

@@ -17,10 +17,12 @@ from datetime import datetime, timedelta
 from collections import defaultdict
 import re
 
+from health_persistence import health_persistence
+
 class HealthMonitor:
     """
     Monitors system health across multiple components with minimal impact.
-    Implements hysteresis, intelligent caching, and progressive escalation.
+    Implements hysteresis, intelligent caching, progressive escalation, and persistent error tracking.
     Always returns all 10 health categories.
     """
 
@@ -28,8 +30,8 @@ class HealthMonitor:
     CPU_WARNING = 85
     CPU_CRITICAL = 95
     CPU_RECOVERY = 75
-    CPU_WARNING_DURATION = 60
-    CPU_CRITICAL_DURATION = 120
+    CPU_WARNING_DURATION = 300  # 5 minutes sustained
+    CPU_CRITICAL_DURATION = 300  # 5 minutes sustained
     CPU_RECOVERY_DURATION = 120
 
     # Memory Thresholds
@@ -85,6 +87,11 @@ class HealthMonitor:
         self.io_error_history = defaultdict(list)
         self.failed_vm_history = set()  # Track VMs that failed to start
+
+        try:
+            health_persistence.cleanup_old_errors()
+        except Exception as e:
+            print(f"[HealthMonitor] Cleanup warning: {e}")
 
     def get_system_info(self) -> Dict[str, Any]:
         """
         Get lightweight system info for header display.
@@ -188,7 +195,11 @@ class HealthMonitor:
         """
         Get comprehensive health status with all checks.
         Returns JSON structure with ALL 10 categories always present.
+        Now includes persistent error tracking.
         """
+        active_errors = health_persistence.get_active_errors()
+        persistent_issues = {err['error_key']: err for err in active_errors}
+
         details = {
             'cpu': {'status': 'OK'},
             'memory': {'status': 'OK'},
@@ -231,8 +242,8 @@ class HealthMonitor:
         elif disks_status.get('status') == 'WARNING':
             warning_issues.append(disks_status.get('reason', 'Disk issue'))
 
-        # Priority 4: VMs/CTs - now detects qmp errors from logs
-        vms_status = self._check_vms_cts_optimized()
+        # Priority 4: VMs/CTs - now with persistence
+        vms_status = self._check_vms_cts_with_persistence()
         if vms_status:
             details['vms'] = vms_status
             if vms_status.get('status') == 'CRITICAL':
@@ -265,8 +276,8 @@ class HealthMonitor:
         elif memory_status.get('status') == 'WARNING':
             warning_issues.append(memory_status.get('reason', 'Memory high'))
 
-        # Priority 8: Logs
-        logs_status = self._check_logs_lightweight()
+        # Priority 8: Logs - now with persistence
+        logs_status = self._check_logs_with_persistence()
         details['logs'] = logs_status
         if logs_status.get('status') == 'CRITICAL':
             critical_issues.append(logs_status.get('reason', 'Critical log errors'))
@@ -305,7 +316,7 @@ class HealthMonitor:
         }
 
     def _check_cpu_with_hysteresis(self) -> Dict[str, Any]:
-        """Check CPU with hysteresis to avoid flapping alerts"""
+        """Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage"""
         try:
             cpu_percent = psutil.cpu_percent(interval=1)
             current_time = time.time()
@@ -318,33 +329,33 @@ class HealthMonitor:
 
             self.state_history[state_key] = [
                 entry for entry in self.state_history[state_key]
-                if current_time - entry['time'] < 300
+                if current_time - entry['time'] < 360
             ]
 
-            critical_duration = sum(
-                1 for entry in self.state_history[state_key]
+            critical_samples = [
+                entry for entry in self.state_history[state_key]
                 if entry['value'] >= self.CPU_CRITICAL and
                 current_time - entry['time'] <= self.CPU_CRITICAL_DURATION
-            )
+            ]
 
-            warning_duration = sum(
-                1 for entry in self.state_history[state_key]
+            warning_samples = [
+                entry for entry in self.state_history[state_key]
                 if entry['value'] >= self.CPU_WARNING and
                 current_time - entry['time'] <= self.CPU_WARNING_DURATION
-            )
+            ]
 
-            recovery_duration = sum(
-                1 for entry in self.state_history[state_key]
+            recovery_samples = [
+                entry for entry in self.state_history[state_key]
                 if entry['value'] < self.CPU_RECOVERY and
                 current_time - entry['time'] <= self.CPU_RECOVERY_DURATION
-            )
+            ]
 
-            if critical_duration >= 2:
+            if len(critical_samples) >= 3:
                 status = 'CRITICAL'
-                reason = f'CPU >{self.CPU_CRITICAL}% for {self.CPU_CRITICAL_DURATION}s'
-            elif warning_duration >= 2 and recovery_duration < 2:
+                reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
+            elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
                 status = 'WARNING'
-                reason = f'CPU >{self.CPU_WARNING}% for {self.CPU_WARNING_DURATION}s'
+                reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
             else:
                 status = 'OK'
                 reason = None
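
The point of the rewritten hysteresis check is that a single spike no longer raises an alert: at least three samples above the threshold must fall inside the duration window. A stripped-down, standalone sketch of that rule, with invented sample data, looks like this:

import time

CPU_CRITICAL = 95
CPU_CRITICAL_DURATION = 300  # seconds, mirrors the new constant above

def is_sustained_critical(history, now):
    """True only when >= 3 samples exceeded CPU_CRITICAL inside the window."""
    recent = [
        (t, v) for t, v in history
        if v >= CPU_CRITICAL and now - t <= CPU_CRITICAL_DURATION
    ]
    return len(recent) >= 3

now = time.time()
spike = [(now - 10, 99)]  # one spike: not sustained
sustained = [(now - 240, 97), (now - 120, 98), (now - 30, 99)]
print(is_sustained_critical(spike, now))      # False
print(is_sustained_critical(sustained, now))  # True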
@@ -871,15 +882,15 @@ class HealthMonitor:
 
     def _check_vms_cts_optimized(self) -> Dict[str, Any]:
         """
-        Optimized VM/CT check - detects qmp failures and other VM errors.
-        Now parses logs for VM/CT specific errors like qmp command failures.
+        Optimized VM/CT check - detects qmp failures and startup errors from logs.
+        Improved detection of container and VM errors from journalctl.
         """
         try:
             issues = []
             vm_details = {}
 
             result = subprocess.run(
-                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*', '-p', 'warning'],
+                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
                 capture_output=True,
                 text=True,
                 timeout=3
@@ -903,22 +914,56 @@ class HealthMonitor:
                         }
                         continue
 
-                    ct_match = re.search(r'(?:ct|container)\s+(\d+)', line_lower)
-                    if ct_match and ('error' in line_lower or 'fail' in line_lower):
-                        ctid = ct_match.group(1)
+                    ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
+                    if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower):
+                        ctid = ct_error_match.group(1)
                         key = f'ct_{ctid}'
                         if key not in vm_details:
-                            issues.append(f'CT {ctid}: Error detected')
+                            if 'device' in line_lower and 'does not exist' in line_lower:
+                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
+                                if device_match:
+                                    reason = f'Device {device_match.group(1)} missing'
+                                else:
+                                    reason = 'Device error'
+                            elif 'failed to start' in line_lower:
+                                reason = 'Failed to start'
+                            else:
+                                reason = 'Container error'
+
+                            issues.append(f'CT {ctid}: {reason}')
+                            vm_details[key] = {
+                                'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL',
+                                'reason': reason,
+                                'id': ctid,
+                                'type': 'CT'
+                            }
+                        continue
+
+                    vzstart_match = re.search(r'vzstart:(\d+):', line)
+                    if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
+                        ctid = vzstart_match.group(1)
+                        key = f'ct_{ctid}'
+                        if key not in vm_details:
+                            # Extract the error message
+                            if 'device' in line_lower and 'does not exist' in line_lower:
+                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
+                                if device_match:
+                                    reason = f'Device {device_match.group(1)} missing'
+                                else:
+                                    reason = 'Device error'
+                            else:
+                                reason = 'Startup error'
+
+                            issues.append(f'CT {ctid}: {reason}')
                             vm_details[key] = {
                                 'status': 'WARNING',
-                                'reason': 'Container error',
+                                'reason': reason,
                                 'id': ctid,
                                 'type': 'CT'
                             }
                         continue
 
                     if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
-                        # Extract VM/CT ID
                         id_match = re.search(r'\b(\d{3,4})\b', line)
                         if id_match:
                             vmid = id_match.group(1)
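
Because the detection is plain regex over journalctl output, the patterns can be dry-run against captured lines; the two sample lines below are invented for illustration and are not taken from the commit.

import re

samples = [
    "Nov 05 10:01:02 pve vzstart:105: Device /dev/dri/renderD128 does not exist",  # invented
    "Nov 05 10:02:10 pve lxc 203 failed to start",                                 # invented
]

for line in samples:
    line_lower = line.lower()
    vz = re.search(r'vzstart:(\d+):', line)
    ct = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
    dev = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
    ctid = (vz or ct).group(1) if (vz or ct) else None
    reason = f"Device {dev.group(1)} missing" if dev else "Container error"
    print(ctid, reason)  # -> "105 Device /dev/dri/renderd128 missing", then "203 Container error"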
@@ -946,6 +991,118 @@ class HealthMonitor:
         except Exception:
             return {'status': 'OK'}
 
+    # Modified to use persistence
+    def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
+        """
+        Check VMs/CTs with persistent error tracking.
+        Errors persist until VM starts or 48h elapsed.
+        """
+        try:
+            issues = []
+            vm_details = {}
+
+            # Get persistent errors first
+            persistent_errors = health_persistence.get_active_errors('vms')
+
+            # Check if any persistent VMs/CTs have started
+            for error in persistent_errors:
+                error_key = error['error_key']
+                if error_key.startswith('vm_') or error_key.startswith('ct_'):
+                    vm_id = error_key.split('_')[1]
+                    if health_persistence.check_vm_running(vm_id):
+                        continue  # Error auto-resolved
+
+                # Still active
+                vm_details[error_key] = {
+                    'status': error['severity'],
+                    'reason': error['reason'],
+                    'id': error.get('details', {}).get('id', 'unknown'),
+                    'type': error.get('details', {}).get('type', 'VM/CT'),
+                    'first_seen': error['first_seen']
+                }
+                issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}")
+
+            # Check for new errors in logs
+            result = subprocess.run(
+                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
+                capture_output=True,
+                text=True,
+                timeout=3
+            )
+
+            if result.returncode == 0:
+                for line in result.stdout.split('\n'):
+                    line_lower = line.lower()
+
+                    # VM QMP errors
+                    vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
+                    if vm_qmp_match:
+                        vmid = vm_qmp_match.group(1)
+                        error_key = f'vm_{vmid}'
+                        if error_key not in vm_details:
+                            # Record persistent error
+                            health_persistence.record_error(
+                                error_key=error_key,
+                                category='vms',
+                                severity='WARNING',
+                                reason='QMP command timeout',
+                                details={'id': vmid, 'type': 'VM'}
+                            )
+                            issues.append(f'VM {vmid}: Communication issue')
+                            vm_details[error_key] = {
+                                'status': 'WARNING',
+                                'reason': 'QMP command timeout',
+                                'id': vmid,
+                                'type': 'VM'
+                            }
+                        continue
+
+                    # Container errors
+                    vzstart_match = re.search(r'vzstart:(\d+):', line)
+                    if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
+                        ctid = vzstart_match.group(1)
+                        error_key = f'ct_{ctid}'
+
+                        if error_key not in vm_details:
+                            if 'device' in line_lower and 'does not exist' in line_lower:
+                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
+                                if device_match:
+                                    reason = f'Device {device_match.group(1)} missing'
+                                else:
+                                    reason = 'Device error'
+                            else:
+                                reason = 'Startup error'
+
+                            # Record persistent error
+                            health_persistence.record_error(
+                                error_key=error_key,
+                                category='vms',
+                                severity='WARNING',
+                                reason=reason,
+                                details={'id': ctid, 'type': 'CT'}
+                            )
+                            issues.append(f'CT {ctid}: {reason}')
+                            vm_details[error_key] = {
+                                'status': 'WARNING',
+                                'reason': reason,
+                                'id': ctid,
+                                'type': 'CT'
+                            }
+
+            if not issues:
+                return {'status': 'OK'}
+
+            has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
+
+            return {
+                'status': 'CRITICAL' if has_critical else 'WARNING',
+                'reason': '; '.join(issues[:3]),
+                'details': vm_details
+            }
+
+        except Exception:
+            return {'status': 'OK'}
+
     def _check_pve_services(self) -> Dict[str, Any]:
         """Check critical Proxmox services"""
         try:
@@ -980,13 +1137,24 @@ class HealthMonitor:
                 'reason': f'Service check failed: {str(e)}'
             }
 
-    def _check_logs_lightweight(self) -> Dict[str, Any]:
-        """Lightweight log analysis (cached, checked every 5 minutes)"""
+    # Modified to use persistence
+    def _check_logs_with_persistence(self) -> Dict[str, Any]:
+        """
+        Check logs with persistent error tracking.
+        Critical log errors persist for 24h unless acknowledged.
+        """
         cache_key = 'logs_analysis'
         current_time = time.time()
 
         if cache_key in self.last_check_times:
             if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
+                # Return persistent errors if any
+                persistent_errors = health_persistence.get_active_errors('logs')
+                if persistent_errors:
+                    return {
+                        'status': 'WARNING',
+                        'reason': f'{len(persistent_errors)} persistent log issues'
+                    }
                 return self.cached_results.get(cache_key, {'status': 'OK'})
 
         try:
@@ -1011,6 +1179,16 @@ class HealthMonitor:
                     if keyword.lower() in line_lower:
                         critical_keywords_found.append(keyword)
                         errors_5m += 1
+
+                        # Record persistent error for critical keywords
+                        error_key = f'log_critical_{keyword.replace(" ", "_")}'
+                        health_persistence.record_error(
+                            error_key=error_key,
+                            category='logs',
+                            severity='CRITICAL',
+                            reason=f'Critical log: {keyword}',
+                            details={'keyword': keyword}
+                        )
                         break
                 else:
                     if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:

AppImage/scripts/health_persistence.py (new file, 326 lines)
@@ -0,0 +1,326 @@
+"""
+Health Monitor Persistence Module
+Manages persistent error tracking across AppImage updates using SQLite.
+Stores errors in /root/.config/proxmenux-monitor/health_monitor.db
+
+Features:
+- Persistent error storage (survives AppImage updates)
+- Smart error resolution (auto-clear when VM starts, or after 48h)
+- Event system for future Telegram notifications
+- Manual acknowledgment support
+
+Author: MacRimi
+Version: 1.0
+"""
+
+import sqlite3
+import json
+import os
+from datetime import datetime, timedelta
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+
+class HealthPersistence:
+    """Manages persistent health error tracking"""
+
+    # Error retention periods (seconds)
+    VM_ERROR_RETENTION = 48 * 3600  # 48 hours
+    LOG_ERROR_RETENTION = 24 * 3600  # 24 hours
+    DISK_ERROR_RETENTION = 48 * 3600  # 48 hours
+
+    def __init__(self):
+        """Initialize persistence with database in config directory"""
+        self.data_dir = Path('/root/.config/proxmenux-monitor')
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+
+        self.db_path = self.data_dir / 'health_monitor.db'
+        self._init_database()
+
+    def _init_database(self):
+        """Initialize SQLite database with required tables"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Errors table
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS errors (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                error_key TEXT UNIQUE NOT NULL,
+                category TEXT NOT NULL,
+                severity TEXT NOT NULL,
+                reason TEXT NOT NULL,
+                details TEXT,
+                first_seen TEXT NOT NULL,
+                last_seen TEXT NOT NULL,
+                resolved_at TEXT,
+                acknowledged INTEGER DEFAULT 0,
+                notification_sent INTEGER DEFAULT 0
+            )
+        ''')
+
+        # Events table (for future Telegram notifications)
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS events (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                event_type TEXT NOT NULL,
+                error_key TEXT NOT NULL,
+                timestamp TEXT NOT NULL,
+                data TEXT
+            )
+        ''')
+
+        # Indexes for performance
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_resolved ON errors(resolved_at)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)')
+
+        conn.commit()
+        conn.close()
+
+    def record_error(self, error_key: str, category: str, severity: str,
+                     reason: str, details: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Record or update an error.
+        Returns event info (new_error, updated, etc.)
+        """
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now().isoformat()
+        details_json = json.dumps(details) if details else None
+
+        # Check if error exists
+        cursor.execute('SELECT id, first_seen, notification_sent FROM errors WHERE error_key = ?',
+                       (error_key,))
+        existing = cursor.fetchone()
+
+        event_info = {'type': 'updated', 'needs_notification': False}
+
+        if existing:
+            # Update existing error
+            cursor.execute('''
+                UPDATE errors
+                SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL
+                WHERE error_key = ?
+            ''', (now, severity, reason, details_json, error_key))
+
+            # Check if severity escalated
+            cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
+            old_severity = cursor.fetchone()[0]
+            if old_severity == 'WARNING' and severity == 'CRITICAL':
+                event_info['type'] = 'escalated'
+                event_info['needs_notification'] = True
+        else:
+            # Insert new error
+            cursor.execute('''
+                INSERT INTO errors
+                (error_key, category, severity, reason, details, first_seen, last_seen)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+            ''', (error_key, category, severity, reason, details_json, now, now))
+
+            event_info['type'] = 'new'
+            event_info['needs_notification'] = True
+
+        # Record event
+        self._record_event(cursor, event_info['type'], error_key,
+                           {'severity': severity, 'reason': reason})
+
+        conn.commit()
+        conn.close()
+
+        return event_info
+
+    def resolve_error(self, error_key: str, reason: str = 'auto-resolved'):
+        """Mark an error as resolved"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now().isoformat()
+
+        cursor.execute('''
+            UPDATE errors
+            SET resolved_at = ?
+            WHERE error_key = ? AND resolved_at IS NULL
+        ''', (now, error_key))
+
+        if cursor.rowcount > 0:
+            self._record_event(cursor, 'resolved', error_key, {'reason': reason})
+
+        conn.commit()
+        conn.close()
+
+    def acknowledge_error(self, error_key: str):
+        """Manually acknowledge an error (won't notify again)"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        cursor.execute('''
+            UPDATE errors
+            SET acknowledged = 1
+            WHERE error_key = ?
+        ''', (error_key,))
+
+        self._record_event(cursor, 'acknowledged', error_key, {})
+
+        conn.commit()
+        conn.close()
+
+    def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Get all active (unresolved) errors, optionally filtered by category"""
+        conn = sqlite3.connect(str(self.db_path))
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        if category:
+            cursor.execute('''
+                SELECT * FROM errors
+                WHERE resolved_at IS NULL AND category = ?
+                ORDER BY severity DESC, last_seen DESC
+            ''', (category,))
+        else:
+            cursor.execute('''
+                SELECT * FROM errors
+                WHERE resolved_at IS NULL
+                ORDER BY severity DESC, last_seen DESC
+            ''')
+
+        rows = cursor.fetchall()
+        conn.close()
+
+        errors = []
+        for row in rows:
+            error_dict = dict(row)
+            if error_dict.get('details'):
+                error_dict['details'] = json.loads(error_dict['details'])
+            errors.append(error_dict)
+
+        return errors
+
+    def cleanup_old_errors(self):
+        """Clean up old resolved errors and auto-resolve stale errors"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now()
+
+        # Delete resolved errors older than 7 days
+        cutoff_resolved = (now - timedelta(days=7)).isoformat()
+        cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
+
+        # Auto-resolve VM/CT errors older than 48h
+        cutoff_vm = (now - timedelta(seconds=self.VM_ERROR_RETENTION)).isoformat()
+        cursor.execute('''
+            UPDATE errors
+            SET resolved_at = ?
+            WHERE category = 'vms'
+            AND resolved_at IS NULL
+            AND first_seen < ?
+            AND acknowledged = 0
+        ''', (now.isoformat(), cutoff_vm))
+
+        # Auto-resolve log errors older than 24h
+        cutoff_logs = (now - timedelta(seconds=self.LOG_ERROR_RETENTION)).isoformat()
+        cursor.execute('''
+            UPDATE errors
+            SET resolved_at = ?
+            WHERE category = 'logs'
+            AND resolved_at IS NULL
+            AND first_seen < ?
+            AND acknowledged = 0
+        ''', (now.isoformat(), cutoff_logs))
+
+        # Delete old events (>30 days)
+        cutoff_events = (now - timedelta(days=30)).isoformat()
+        cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
+
+        conn.commit()
+        conn.close()
+
+    def check_vm_running(self, vm_id: str) -> bool:
+        """
+        Check if a VM/CT is running and resolve error if so.
+        Returns True if running and error was resolved.
+        """
+        import subprocess
+
+        try:
+            # Check qm status for VMs
+            result = subprocess.run(
+                ['qm', 'status', vm_id],
+                capture_output=True,
+                text=True,
+                timeout=2
+            )
+
+            if result.returncode == 0 and 'running' in result.stdout.lower():
+                self.resolve_error(f'vm_{vm_id}', 'VM started')
+                return True
+
+            # Check pct status for containers
+            result = subprocess.run(
+                ['pct', 'status', vm_id],
+                capture_output=True,
+                text=True,
+                timeout=2
+            )
+
+            if result.returncode == 0 and 'running' in result.stdout.lower():
+                self.resolve_error(f'ct_{vm_id}', 'Container started')
+                return True
+
+            return False
+
+        except Exception:
+            return False
+
+    def _record_event(self, cursor, event_type: str, error_key: str, data: Dict):
+        """Internal: Record an event"""
+        cursor.execute('''
+            INSERT INTO events (event_type, error_key, timestamp, data)
+            VALUES (?, ?, ?, ?)
+        ''', (event_type, error_key, datetime.now().isoformat(), json.dumps(data)))
+
+    def get_unnotified_errors(self) -> List[Dict[str, Any]]:
+        """Get errors that need Telegram notification"""
+        conn = sqlite3.connect(str(self.db_path))
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        cursor.execute('''
+            SELECT * FROM errors
+            WHERE notification_sent = 0
+            AND resolved_at IS NULL
+            AND acknowledged = 0
+            ORDER BY severity DESC, first_seen ASC
+        ''')
+
+        rows = cursor.fetchall()
+        conn.close()
+
+        errors = []
+        for row in rows:
+            error_dict = dict(row)
+            if error_dict.get('details'):
+                error_dict['details'] = json.loads(error_dict['details'])
+            errors.append(error_dict)
+
+        return errors
+
+    def mark_notified(self, error_key: str):
+        """Mark error as notified"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        cursor.execute('''
+            UPDATE errors
+            SET notification_sent = 1
+            WHERE error_key = ?
+        ''', (error_key,))
+
+        conn.commit()
+        conn.close()
+
+
+# Global instance
+health_persistence = HealthPersistence()
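
A minimal usage sketch of the new module, assuming it runs on the Proxmox host itself: importing it creates the SQLite database under /root/.config/proxmenux-monitor, so this is not something to run on a development machine. The container ID and reason below are illustrative only.

from health_persistence import health_persistence

# Record (or refresh) a warning for container 105
event = health_persistence.record_error(
    error_key='ct_105',
    category='vms',
    severity='WARNING',
    reason='Device /dev/dri/renderD128 missing',  # illustrative
    details={'id': '105', 'type': 'CT'},
)
print(event)  # {'type': 'new', 'needs_notification': True} on first insert

# Read it back, then dismiss and resolve it manually
print(health_persistence.get_active_errors('vms'))
health_persistence.acknowledge_error('ct_105')
health_persistence.resolve_error('ct_105', 'cleared by operator')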