Update AppImage

This commit is contained in:
MacRimi
2025-11-09 17:28:20 +01:00
parent 27353e160f
commit a0635a1026
5 changed files with 656 additions and 51 deletions

View File

@@ -1,8 +1,11 @@
"use client"
import type React from "react"
import { useState, useEffect } from "react"
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
import { Badge } from "@/components/ui/badge"
import { Button } from "@/components/ui/button"
import {
Loader2,
CheckCircle2,
@@ -19,6 +22,7 @@ import {
FileText,
RefreshCw,
Shield,
X,
} from "lucide-react"
interface CategoryCheck {
@@ -148,17 +152,53 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const stats = getHealthStats()
const handleCategoryClick = (categoryKey: string, status: string) => {
if (status === "OK") return // Don't navigate if the category is OK
onOpenChange(false) // Close the modal
// Map categories to dashboard tabs
const categoryToTab: Record<string, string> = {
storage: "storage",
disks: "storage",
network: "network",
vms: "vms",
logs: "logs",
hardware: "hardware",
services: "hardware",
}
const targetTab = categoryToTab[categoryKey]
if (targetTab) {
// Dispatch an event to switch the active tab
const event = new CustomEvent("changeTab", { detail: { tab: targetTab } })
window.dispatchEvent(event)
}
}
const handleAcknowledge = async (errorKey: string, e: React.MouseEvent) => {
e.stopPropagation() // Prevent navigation
try {
await fetch(getApiUrl(`/api/health/acknowledge/${errorKey}`), {
method: "POST",
})
// Refresh health data
await fetchHealthDetails()
} catch (err) {
console.error("[v0] Error acknowledging:", err)
}
}
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
<DialogHeader>
- <DialogTitle className="flex items-center justify-between">
+ <DialogTitle className="flex items-center gap-2">
<div className="flex items-center gap-2">
<Activity className="h-6 w-6" />
System Health Status
</div>
{healthData && getStatusBadge(healthData.overall)}
</DialogTitle>
<div className="mt-4">{healthData && getStatusBadge(healthData.overall)}</div>
<DialogDescription>Detailed health checks for all system components</DialogDescription>
</DialogHeader>
@@ -213,13 +253,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
return (
<div
key={key}
onClick={() => handleCategoryClick(key, status)}
className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${
status === "OK"
? "bg-green-500/5 border-green-500/20 hover:bg-green-500/10"
: status === "WARNING"
- ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10"
+ ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer"
: status === "CRITICAL"
- ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10"
+ ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer"
: "bg-muted/30 hover:bg-muted/50"
}`}
>
@@ -251,12 +292,27 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{Object.entries(details).map(([detailKey, detailValue]: [string, any]) => {
if (typeof detailValue === "object" && detailValue !== null) {
return (
- <div key={detailKey} className="text-xs pl-3 border-l-2 border-muted">
+ <div
key={detailKey}
className="flex items-start justify-between gap-2 text-xs pl-3 border-l-2 border-muted"
>
<div>
<span className="font-medium">{detailKey}:</span>
{detailValue.reason && (
<span className="ml-1 text-muted-foreground">{detailValue.reason}</span>
)}
</div>
{status !== "OK" && (
<Button
size="sm"
variant="ghost"
className="h-5 px-1 hover:bg-red-500/10"
onClick={(e) => handleAcknowledge(detailKey, e)}
>
<X className="h-3 w-3" />
</Button>
)}
</div>
)
}
return null

View File

@@ -98,10 +98,19 @@ export function ProxmoxDashboard() {
const uptimeValue =
data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A"
- const healthStatus = data.health?.status || "healthy"
+ const backendStatus = data.health?.status?.toUpperCase() || "OK"
let healthStatus: "healthy" | "warning" | "critical"
if (backendStatus === "CRITICAL") {
healthStatus = "critical"
} else if (backendStatus === "WARNING") {
healthStatus = "warning"
} else {
healthStatus = "healthy"
}
setSystemStatus({
- status: healthStatus as "healthy" | "warning" | "critical",
+ status: healthStatus,
uptime: uptimeValue,
lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }),
serverName: data.hostname || "Unknown",
@@ -127,11 +136,13 @@ export function ProxmoxDashboard() {
// Always do an initial fetch
fetchSystemData()
// On the overview tab: every 30 seconds for frequent health status updates
// On other tabs: every 60 seconds to reduce load
let interval: ReturnType<typeof setInterval> | null = null
if (activeTab === "overview") {
- interval = setInterval(fetchSystemData, 9000) // changed from 10000 to 9000ms
+ interval = setInterval(fetchSystemData, 30000) // 30 seconds
} else {
- interval = setInterval(fetchSystemData, 61000) // changed from 60000 to 61000ms
+ interval = setInterval(fetchSystemData, 60000) // 60 seconds
}
return () => {
@@ -139,6 +150,20 @@ export function ProxmoxDashboard() {
}
}, [fetchSystemData, activeTab])
useEffect(() => {
const handleChangeTab = (event: CustomEvent) => {
const { tab } = event.detail
if (tab) {
setActiveTab(tab)
}
}
window.addEventListener("changeTab", handleChangeTab as EventListener)
return () => {
window.removeEventListener("changeTab", handleChangeTab as EventListener)
}
}, [])
useEffect(() => {
if (
systemStatus.serverName &&

View File

@@ -1,9 +1,10 @@
"""
- Flask routes for health monitoring
+ Flask routes for health monitoring with persistence support
"""
- from flask import Blueprint, jsonify
+ from flask import Blueprint, jsonify, request
from health_monitor import health_monitor
from health_persistence import health_persistence
health_bp = Blueprint('health', __name__)
@@ -47,3 +48,22 @@ def get_system_info():
return jsonify(info)
except Exception as e:
return jsonify({'error': str(e)}), 500
@health_bp.route('/api/health/acknowledge/<error_key>', methods=['POST'])
def acknowledge_error(error_key):
"""Acknowledge an error manually (user dismissed it)"""
try:
health_persistence.acknowledge_error(error_key)
return jsonify({'success': True, 'message': 'Error acknowledged'})
except Exception as e:
return jsonify({'error': str(e)}), 500
@health_bp.route('/api/health/active-errors', methods=['GET'])
def get_active_errors():
"""Get all active persistent errors"""
try:
category = request.args.get('category')
errors = health_persistence.get_active_errors(category)
return jsonify({'errors': errors})
except Exception as e:
return jsonify({'error': str(e)}), 500
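For reference, the two endpoints added above can be exercised with plain HTTP calls. Below is a minimal sketch using Python's requests library; the base URL and the example error key are placeholders, not values defined in this commit.

import requests

BASE_URL = "http://localhost:8008"  # placeholder address for the monitor API

# List active persistent errors, optionally filtered by category
resp = requests.get(f"{BASE_URL}/api/health/active-errors", params={"category": "vms"})
print(resp.json())  # {'errors': [...]}

# Acknowledge (dismiss) a specific error by its key, e.g. a container error
resp = requests.post(f"{BASE_URL}/api/health/acknowledge/ct_105")
print(resp.json())  # {'success': True, 'message': 'Error acknowledged'}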

View File

@@ -17,10 +17,12 @@ from datetime import datetime, timedelta
from collections import defaultdict
import re
from health_persistence import health_persistence
class HealthMonitor:
"""
Monitors system health across multiple components with minimal impact.
- Implements hysteresis, intelligent caching, and progressive escalation.
+ Implements hysteresis, intelligent caching, progressive escalation, and persistent error tracking.
Always returns all 10 health categories.
"""
@@ -28,8 +30,8 @@ class HealthMonitor:
CPU_WARNING = 85
CPU_CRITICAL = 95
CPU_RECOVERY = 75
- CPU_WARNING_DURATION = 60
+ CPU_WARNING_DURATION = 300  # 5 minutes sustained
- CPU_CRITICAL_DURATION = 120
+ CPU_CRITICAL_DURATION = 300  # 5 minutes sustained
CPU_RECOVERY_DURATION = 120
# Memory Thresholds
@@ -85,6 +87,11 @@ class HealthMonitor:
self.io_error_history = defaultdict(list)
self.failed_vm_history = set() # Track VMs that failed to start
try:
health_persistence.cleanup_old_errors()
except Exception as e:
print(f"[HealthMonitor] Cleanup warning: {e}")
def get_system_info(self) -> Dict[str, Any]:
"""
Get lightweight system info for header display.
@@ -188,7 +195,11 @@ class HealthMonitor:
"""
Get comprehensive health status with all checks.
Returns JSON structure with ALL 10 categories always present.
Now includes persistent error tracking.
"""
active_errors = health_persistence.get_active_errors()
persistent_issues = {err['error_key']: err for err in active_errors}
details = {
'cpu': {'status': 'OK'},
'memory': {'status': 'OK'},
@@ -231,8 +242,8 @@ class HealthMonitor:
elif disks_status.get('status') == 'WARNING':
warning_issues.append(disks_status.get('reason', 'Disk issue'))
- # Priority 4: VMs/CTs - now detects qmp errors from logs
+ # Priority 4: VMs/CTs - now with persistence
- vms_status = self._check_vms_cts_optimized()
+ vms_status = self._check_vms_cts_with_persistence()
if vms_status:
details['vms'] = vms_status
if vms_status.get('status') == 'CRITICAL':
@@ -265,8 +276,8 @@ class HealthMonitor:
elif memory_status.get('status') == 'WARNING':
warning_issues.append(memory_status.get('reason', 'Memory high'))
- # Priority 8: Logs
+ # Priority 8: Logs - now with persistence
- logs_status = self._check_logs_lightweight()
+ logs_status = self._check_logs_with_persistence()
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
@@ -305,7 +316,7 @@ class HealthMonitor:
}
def _check_cpu_with_hysteresis(self) -> Dict[str, Any]:
- """Check CPU with hysteresis to avoid flapping alerts"""
+ """Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage"""
try:
cpu_percent = psutil.cpu_percent(interval=1)
current_time = time.time()
@@ -318,33 +329,33 @@ class HealthMonitor:
self.state_history[state_key] = [
entry for entry in self.state_history[state_key]
- if current_time - entry['time'] < 300
+ if current_time - entry['time'] < 360
]
- critical_duration = sum(
- 1 for entry in self.state_history[state_key]
+ critical_samples = [
+ entry for entry in self.state_history[state_key]
if entry['value'] >= self.CPU_CRITICAL and
current_time - entry['time'] <= self.CPU_CRITICAL_DURATION
- )
+ ]
- warning_duration = sum(
- 1 for entry in self.state_history[state_key]
+ warning_samples = [
+ entry for entry in self.state_history[state_key]
if entry['value'] >= self.CPU_WARNING and
current_time - entry['time'] <= self.CPU_WARNING_DURATION
- )
+ ]
- recovery_duration = sum(
- 1 for entry in self.state_history[state_key]
+ recovery_samples = [
+ entry for entry in self.state_history[state_key]
if entry['value'] < self.CPU_RECOVERY and
current_time - entry['time'] <= self.CPU_RECOVERY_DURATION
- )
+ ]
- if critical_duration >= 2:
+ if len(critical_samples) >= 3:
status = 'CRITICAL'
- reason = f'CPU >{self.CPU_CRITICAL}% for {self.CPU_CRITICAL_DURATION}s'
+ reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
- elif warning_duration >= 2 and recovery_duration < 2:
+ elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
status = 'WARNING'
- reason = f'CPU >{self.CPU_WARNING}% for {self.CPU_WARNING_DURATION}s'
+ reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
else:
status = 'OK'
reason = None
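Taken together with the 300-second durations above, the rewritten check now requires at least three samples over the threshold inside the duration window before raising an alert, and it prunes samples older than 360 seconds. A self-contained sketch of that sampling rule follows; the function name and toy data are illustrative, while the real method works on self.state_history:

import time

CPU_WARNING, CPU_CRITICAL, CPU_RECOVERY = 85, 95, 75
CPU_WARNING_DURATION = CPU_CRITICAL_DURATION = 300  # seconds
CPU_RECOVERY_DURATION = 120

def classify_cpu(history, now):
    """history: list of {'time': epoch seconds, 'value': cpu percent} samples."""
    history = [e for e in history if now - e['time'] < 360]  # prune old samples
    critical = [e for e in history if e['value'] >= CPU_CRITICAL and now - e['time'] <= CPU_CRITICAL_DURATION]
    warning = [e for e in history if e['value'] >= CPU_WARNING and now - e['time'] <= CPU_WARNING_DURATION]
    recovery = [e for e in history if e['value'] < CPU_RECOVERY and now - e['time'] <= CPU_RECOVERY_DURATION]
    if len(critical) >= 3:
        return 'CRITICAL'
    if len(warning) >= 3 and len(recovery) < 2:
        return 'WARNING'
    return 'OK'

# Three samples above 95% within the last five minutes -> CRITICAL
now = time.time()
samples = [{'time': now - 240, 'value': 97}, {'time': now - 120, 'value': 96}, {'time': now - 30, 'value': 98}]
print(classify_cpu(samples, now))  # CRITICAL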
@@ -871,15 +882,15 @@ class HealthMonitor:
def _check_vms_cts_optimized(self) -> Dict[str, Any]:
"""
- Optimized VM/CT check - detects qmp failures and other VM errors.
+ Optimized VM/CT check - detects qmp failures and startup errors from logs.
- Now parses logs for VM/CT specific errors like qmp command failures.
+ Improved detection of container and VM errors from journalctl.
"""
try:
issues = []
vm_details = {}
result = subprocess.run(
- ['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*', '-p', 'warning'],
+ ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
@@ -903,22 +914,56 @@ class HealthMonitor:
}
continue
- ct_match = re.search(r'(?:ct|container)\s+(\d+)', line_lower)
+ ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
- if ct_match and ('error' in line_lower or 'fail' in line_lower):
+ if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower):
- ctid = ct_match.group(1)
+ ctid = ct_error_match.group(1)
key = f'ct_{ctid}'
if key not in vm_details:
- issues.append(f'CT {ctid}: Error detected')
+ if 'device' in line_lower and 'does not exist' in line_lower:
device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
if device_match:
reason = f'Device {device_match.group(1)} missing'
else:
reason = 'Device error'
elif 'failed to start' in line_lower:
reason = 'Failed to start'
else:
reason = 'Container error'
issues.append(f'CT {ctid}: {reason}')
vm_details[key] = {
'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL',
'reason': reason,
'id': ctid,
'type': 'CT'
}
continue
vzstart_match = re.search(r'vzstart:(\d+):', line)
if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
ctid = vzstart_match.group(1)
key = f'ct_{ctid}'
if key not in vm_details:
# Extract the error message
if 'device' in line_lower and 'does not exist' in line_lower:
device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
if device_match:
reason = f'Device {device_match.group(1)} missing'
else:
reason = 'Device error'
else:
reason = 'Startup error'
issues.append(f'CT {ctid}: {reason}')
vm_details[key] = {
'status': 'WARNING',
- 'reason': 'Container error',
+ 'reason': reason,
'id': ctid,
'type': 'CT'
}
continue
if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
# Extract VM/CT ID
id_match = re.search(r'\b(\d{3,4})\b', line)
if id_match:
vmid = id_match.group(1)
@@ -946,6 +991,118 @@ class HealthMonitor:
except Exception:
return {'status': 'OK'}
# Modified to use persistence
def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
"""
Check VMs/CTs with persistent error tracking.
Errors persist until VM starts or 48h elapsed.
"""
try:
issues = []
vm_details = {}
# Get persistent errors first
persistent_errors = health_persistence.get_active_errors('vms')
# Check if any persistent VMs/CTs have started
for error in persistent_errors:
error_key = error['error_key']
if error_key.startswith('vm_') or error_key.startswith('ct_'):
vm_id = error_key.split('_')[1]
if health_persistence.check_vm_running(vm_id):
continue # Error auto-resolved
# Still active
vm_details[error_key] = {
'status': error['severity'],
'reason': error['reason'],
'id': error.get('details', {}).get('id', 'unknown'),
'type': error.get('details', {}).get('type', 'VM/CT'),
'first_seen': error['first_seen']
}
issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}")
# Check for new errors in logs
result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
)
if result.returncode == 0:
for line in result.stdout.split('\n'):
line_lower = line.lower()
# VM QMP errors
vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
if vm_qmp_match:
vmid = vm_qmp_match.group(1)
error_key = f'vm_{vmid}'
if error_key not in vm_details:
# Record persistent error
health_persistence.record_error(
error_key=error_key,
category='vms',
severity='WARNING',
reason='QMP command timeout',
details={'id': vmid, 'type': 'VM'}
)
issues.append(f'VM {vmid}: Communication issue')
vm_details[error_key] = {
'status': 'WARNING',
'reason': 'QMP command timeout',
'id': vmid,
'type': 'VM'
}
continue
# Container errors
vzstart_match = re.search(r'vzstart:(\d+):', line)
if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
ctid = vzstart_match.group(1)
error_key = f'ct_{ctid}'
if error_key not in vm_details:
if 'device' in line_lower and 'does not exist' in line_lower:
device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
if device_match:
reason = f'Device {device_match.group(1)} missing'
else:
reason = 'Device error'
else:
reason = 'Startup error'
# Record persistent error
health_persistence.record_error(
error_key=error_key,
category='vms',
severity='WARNING',
reason=reason,
details={'id': ctid, 'type': 'CT'}
)
issues.append(f'CT {ctid}: {reason}')
vm_details[error_key] = {
'status': 'WARNING',
'reason': reason,
'id': ctid,
'type': 'CT'
}
if not issues:
return {'status': 'OK'}
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
return {
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': vm_details
}
except Exception:
return {'status': 'OK'}
def _check_pve_services(self) -> Dict[str, Any]:
"""Check critical Proxmox services"""
try:
@@ -980,13 +1137,24 @@ class HealthMonitor:
'reason': f'Service check failed: {str(e)}'
}
- def _check_logs_lightweight(self) -> Dict[str, Any]:
- """Lightweight log analysis (cached, checked every 5 minutes)"""
+ # Modified to use persistence
+ def _check_logs_with_persistence(self) -> Dict[str, Any]:
"""
Check logs with persistent error tracking.
Critical log errors persist for 24h unless acknowledged.
"""
cache_key = 'logs_analysis'
current_time = time.time()
if cache_key in self.last_check_times:
if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
# Return persistent errors if any
persistent_errors = health_persistence.get_active_errors('logs')
if persistent_errors:
return {
'status': 'WARNING',
'reason': f'{len(persistent_errors)} persistent log issues'
}
return self.cached_results.get(cache_key, {'status': 'OK'})
try:
@@ -1011,6 +1179,16 @@ class HealthMonitor:
if keyword.lower() in line_lower:
critical_keywords_found.append(keyword)
errors_5m += 1
# Record persistent error for critical keywords
error_key = f'log_critical_{keyword.replace(" ", "_")}'
health_persistence.record_error(
error_key=error_key,
category='logs',
severity='CRITICAL',
reason=f'Critical log: {keyword}',
details={'keyword': keyword}
)
break
else:
if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:

View File

@@ -0,0 +1,326 @@
"""
Health Monitor Persistence Module
Manages persistent error tracking across AppImage updates using SQLite.
Stores errors in /root/.config/proxmenux-monitor/health_monitor.db
Features:
- Persistent error storage (survives AppImage updates)
- Smart error resolution (auto-clear when VM starts, or after 48h)
- Event system for future Telegram notifications
- Manual acknowledgment support
Author: MacRimi
Version: 1.0
"""
import sqlite3
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from pathlib import Path
class HealthPersistence:
"""Manages persistent health error tracking"""
# Error retention periods (seconds)
VM_ERROR_RETENTION = 48 * 3600 # 48 hours
LOG_ERROR_RETENTION = 24 * 3600 # 24 hours
DISK_ERROR_RETENTION = 48 * 3600 # 48 hours
def __init__(self):
"""Initialize persistence with database in config directory"""
self.data_dir = Path('/root/.config/proxmenux-monitor')
self.data_dir.mkdir(parents=True, exist_ok=True)
self.db_path = self.data_dir / 'health_monitor.db'
self._init_database()
def _init_database(self):
"""Initialize SQLite database with required tables"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Errors table
cursor.execute('''
CREATE TABLE IF NOT EXISTS errors (
id INTEGER PRIMARY KEY AUTOINCREMENT,
error_key TEXT UNIQUE NOT NULL,
category TEXT NOT NULL,
severity TEXT NOT NULL,
reason TEXT NOT NULL,
details TEXT,
first_seen TEXT NOT NULL,
last_seen TEXT NOT NULL,
resolved_at TEXT,
acknowledged INTEGER DEFAULT 0,
notification_sent INTEGER DEFAULT 0
)
''')
# Events table (for future Telegram notifications)
cursor.execute('''
CREATE TABLE IF NOT EXISTS events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
event_type TEXT NOT NULL,
error_key TEXT NOT NULL,
timestamp TEXT NOT NULL,
data TEXT
)
''')
# Indexes for performance
cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_resolved ON errors(resolved_at)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)')
conn.commit()
conn.close()
def record_error(self, error_key: str, category: str, severity: str,
reason: str, details: Optional[Dict] = None) -> Dict[str, Any]:
"""
Record or update an error.
Returns event info (new_error, updated, etc.)
"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
now = datetime.now().isoformat()
details_json = json.dumps(details) if details else None
# Check if error exists
cursor.execute('SELECT id, first_seen, notification_sent FROM errors WHERE error_key = ?',
(error_key,))
existing = cursor.fetchone()
event_info = {'type': 'updated', 'needs_notification': False}
if existing:
# Check if severity escalated: read the stored severity before overwriting it
cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
old_severity = cursor.fetchone()[0]
# Update existing error
cursor.execute('''
UPDATE errors
SET last_seen = ?, severity = ?, reason = ?, details = ?, resolved_at = NULL
WHERE error_key = ?
''', (now, severity, reason, details_json, error_key))
if old_severity == 'WARNING' and severity == 'CRITICAL':
event_info['type'] = 'escalated'
event_info['needs_notification'] = True
else:
# Insert new error
cursor.execute('''
INSERT INTO errors
(error_key, category, severity, reason, details, first_seen, last_seen)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', (error_key, category, severity, reason, details_json, now, now))
event_info['type'] = 'new'
event_info['needs_notification'] = True
# Record event
self._record_event(cursor, event_info['type'], error_key,
{'severity': severity, 'reason': reason})
conn.commit()
conn.close()
return event_info
def resolve_error(self, error_key: str, reason: str = 'auto-resolved'):
"""Mark an error as resolved"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
now = datetime.now().isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE error_key = ? AND resolved_at IS NULL
''', (now, error_key))
if cursor.rowcount > 0:
self._record_event(cursor, 'resolved', error_key, {'reason': reason})
conn.commit()
conn.close()
def acknowledge_error(self, error_key: str):
"""Manually acknowledge an error (won't notify again)"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute('''
UPDATE errors
SET acknowledged = 1
WHERE error_key = ?
''', (error_key,))
self._record_event(cursor, 'acknowledged', error_key, {})
conn.commit()
conn.close()
def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get all active (unresolved) errors, optionally filtered by category"""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
if category:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL AND category = ?
ORDER BY severity DESC, last_seen DESC
''', (category,))
else:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL
ORDER BY severity DESC, last_seen DESC
''')
rows = cursor.fetchall()
conn.close()
errors = []
for row in rows:
error_dict = dict(row)
if error_dict.get('details'):
error_dict['details'] = json.loads(error_dict['details'])
errors.append(error_dict)
return errors
def cleanup_old_errors(self):
"""Clean up old resolved errors and auto-resolve stale errors"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
now = datetime.now()
# Delete resolved errors older than 7 days
cutoff_resolved = (now - timedelta(days=7)).isoformat()
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
# Auto-resolve VM/CT errors older than 48h
cutoff_vm = (now - timedelta(seconds=self.VM_ERROR_RETENTION)).isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE category = 'vms'
AND resolved_at IS NULL
AND first_seen < ?
AND acknowledged = 0
''', (now.isoformat(), cutoff_vm))
# Auto-resolve log errors older than 24h
cutoff_logs = (now - timedelta(seconds=self.LOG_ERROR_RETENTION)).isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE category = 'logs'
AND resolved_at IS NULL
AND first_seen < ?
AND acknowledged = 0
''', (now.isoformat(), cutoff_logs))
# Delete old events (>30 days)
cutoff_events = (now - timedelta(days=30)).isoformat()
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
conn.commit()
conn.close()
def check_vm_running(self, vm_id: str) -> bool:
"""
Check if a VM/CT is running and resolve error if so.
Returns True if running and error was resolved.
"""
import subprocess
try:
# Check qm status for VMs
result = subprocess.run(
['qm', 'status', vm_id],
capture_output=True,
text=True,
timeout=2
)
if result.returncode == 0 and 'running' in result.stdout.lower():
self.resolve_error(f'vm_{vm_id}', 'VM started')
return True
# Check pct status for containers
result = subprocess.run(
['pct', 'status', vm_id],
capture_output=True,
text=True,
timeout=2
)
if result.returncode == 0 and 'running' in result.stdout.lower():
self.resolve_error(f'ct_{vm_id}', 'Container started')
return True
return False
except Exception:
return False
def _record_event(self, cursor, event_type: str, error_key: str, data: Dict):
"""Internal: Record an event"""
cursor.execute('''
INSERT INTO events (event_type, error_key, timestamp, data)
VALUES (?, ?, ?, ?)
''', (event_type, error_key, datetime.now().isoformat(), json.dumps(data)))
def get_unnotified_errors(self) -> List[Dict[str, Any]]:
"""Get errors that need Telegram notification"""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute('''
SELECT * FROM errors
WHERE notification_sent = 0
AND resolved_at IS NULL
AND acknowledged = 0
ORDER BY severity DESC, first_seen ASC
''')
rows = cursor.fetchall()
conn.close()
errors = []
for row in rows:
error_dict = dict(row)
if error_dict.get('details'):
error_dict['details'] = json.loads(error_dict['details'])
errors.append(error_dict)
return errors
def mark_notified(self, error_key: str):
"""Mark error as notified"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute('''
UPDATE errors
SET notification_sent = 1
WHERE error_key = ?
''', (error_key,))
conn.commit()
conn.close()
# Global instance
health_persistence = HealthPersistence()
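To illustrate how the monitor and the Flask routes are expected to drive this module, a minimal usage sketch follows. The error key, container id, and device path are made-up examples, and instantiating the module requires write access to /root/.config/proxmenux-monitor.

from health_persistence import health_persistence

# Record (or refresh) a persistent warning for a hypothetical container 105
event = health_persistence.record_error(
    error_key='ct_105',
    category='vms',
    severity='WARNING',
    reason='Device /dev/dri/renderD128 missing',
    details={'id': '105', 'type': 'CT'},
)
print(event)  # {'type': 'new', 'needs_notification': True} on first sighting

# The dashboard reads active errors, optionally filtered by category
for err in health_persistence.get_active_errors('vms'):
    print(err['error_key'], err['severity'], err['reason'])

# A user can dismiss the error from the UI...
health_persistence.acknowledge_error('ct_105')

# ...or it resolves automatically once the container is running again
if health_persistence.check_vm_running('105'):
    print('ct_105 resolved')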