Update AppImage

This commit is contained in:
MacRimi
2025-11-09 16:30:29 +01:00
parent 1712d32ef7
commit b9619efbbf
4 changed files with 330 additions and 250 deletions

View File

@@ -3,12 +3,28 @@
import { useState, useEffect } from "react"
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
import { Badge } from "@/components/ui/badge"
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"
import { Loader2, CheckCircle2, AlertTriangle, XCircle, Activity } from "lucide-react"
import {
Loader2,
CheckCircle2,
AlertTriangle,
XCircle,
Activity,
Cpu,
MemoryStick,
HardDrive,
Disc,
Network,
Box,
Settings,
FileText,
RefreshCw,
Shield,
} from "lucide-react"
interface HealthDetail {
interface CategoryCheck {
status: string
reason?: string
details?: any
[key: string]: any
}
@@ -16,7 +32,16 @@ interface HealthDetails {
overall: string
summary: string
details: {
[category: string]: HealthDetail | { [key: string]: HealthDetail }
cpu: CategoryCheck
memory: CategoryCheck
storage: CategoryCheck
disks: CategoryCheck
network: CategoryCheck
vms: CategoryCheck
services: CategoryCheck
logs: CategoryCheck
updates: CategoryCheck
security: CategoryCheck
}
timestamp: string
}
@@ -27,6 +52,19 @@ interface HealthStatusModalProps {
getApiUrl: (path: string) => string
}
/**
 * Health categories shown in the modal, in display order.
 *
 * - `key`   indexes `healthData.details`
 * - `label` is the human-readable row title
 * - `Icon`  is the lucide-react icon rendered next to the status icon
 *
 * Declared `as const` so every `key` keeps its literal type ("cpu", "memory", …)
 * instead of widening to `string`.
 */
const CATEGORIES = [
  { key: "cpu", label: "CPU Usage & Temperature", Icon: Cpu },
  { key: "memory", label: "Memory & Swap", Icon: MemoryStick },
  { key: "storage", label: "Storage Mounts & Space", Icon: HardDrive },
  { key: "disks", label: "Disk I/O & Errors", Icon: Disc },
  { key: "network", label: "Network Interfaces", Icon: Network },
  { key: "vms", label: "VMs & Containers", Icon: Box },
  { key: "services", label: "PVE Services", Icon: Settings },
  { key: "logs", label: "System Logs", Icon: FileText },
  { key: "updates", label: "System Updates", Icon: RefreshCw },
  { key: "security", label: "Security & Certificates", Icon: Shield },
] as const
export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatusModalProps) {
const [loading, setLoading] = useState(true)
const [healthData, setHealthData] = useState<HealthDetails | null>(null)
@@ -58,74 +96,6 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
}
}
// Tallies every check found in healthData.details.
// A category entry that carries its own `status` counts as one check; otherwise
// each of its child entries is examined. Entries without a truthy `status` are ignored.
const getHealthStats = () => {
  const tally = { total: 0, healthy: 0, warnings: 0, critical: 0 }

  const allDetails = healthData?.details
  if (!allDetails) {
    return tally
  }

  // Counts a single check; statuses other than OK/WARNING/CRITICAL only raise `total`.
  const record = (entry: any) => {
    if (!entry || typeof entry !== "object" || !entry.status) {
      return
    }
    tally.total += 1
    switch (entry.status.toUpperCase()) {
      case "OK":
        tally.healthy += 1
        break
      case "WARNING":
        tally.warnings += 1
        break
      case "CRITICAL":
        tally.critical += 1
        break
    }
  }

  for (const group of Object.values(allDetails)) {
    if (!group || typeof group !== "object") {
      continue
    }
    if ("status" in group) {
      record(group)
      continue
    }
    for (const child of Object.values(group)) {
      record(child)
    }
  }

  return tally
}
// Builds a map from capitalised category name to the list of checks to display.
// A category that has its own `status` yields a single row named after the category;
// otherwise every child object that has a `status` yields one row named after its key.
const getGroupedChecks = () => {
  const allDetails = healthData?.details
  if (!allDetails) return {}

  type CheckRow = { name: string; status: string; reason?: string; details?: any }
  const buckets: { [key: string]: CheckRow[] } = {}

  // The full source object is kept under `details` so extra fields can be rendered.
  const toRow = (name: string, source: any): CheckRow => ({
    name,
    status: source.status,
    reason: source.reason,
    details: source,
  })

  for (const [rawKey, payload] of Object.entries(allDetails)) {
    if (!payload || typeof payload !== "object") continue

    const title = rawKey.charAt(0).toUpperCase() + rawKey.slice(1)

    if ("status" in payload) {
      buckets[title] = [toRow(title, payload)]
      continue
    }

    buckets[title] = Object.entries(payload)
      .filter(([, child]: [string, any]) => child && typeof child === "object" && "status" in child)
      .map(([childKey, child]: [string, any]) => toRow(childKey, child))
  }

  return buckets
}
const getStatusIcon = (status: string) => {
const statusUpper = status?.toUpperCase()
switch (statusUpper) {
@@ -144,28 +114,52 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const statusUpper = status?.toUpperCase()
switch (statusUpper) {
case "OK":
return <Badge className="bg-green-500">Healthy</Badge>
return <Badge className="bg-green-500 text-white">Healthy</Badge>
case "WARNING":
return <Badge className="bg-yellow-500">Warning</Badge>
return <Badge className="bg-yellow-500 text-white">Warning</Badge>
case "CRITICAL":
return <Badge className="bg-red-500">Critical</Badge>
return <Badge className="bg-red-500 text-white">Critical</Badge>
default:
return <Badge>Unknown</Badge>
}
}
// Counts how many of the fixed CATEGORIES are OK / WARNING / CRITICAL.
// `total` is always the number of categories, even when some are missing from
// the payload or report another status.
const getHealthStats = () => {
  const counts = { total: 0, healthy: 0, warnings: 0, critical: 0 }
  if (!healthData?.details) {
    return counts
  }

  counts.total = CATEGORIES.length

  for (const { key } of CATEGORIES) {
    const entry = healthData.details[key as keyof typeof healthData.details]
    // Status comparison is case-insensitive; anything unrecognised is not counted.
    switch (entry?.status?.toUpperCase()) {
      case "OK":
        counts.healthy++
        break
      case "WARNING":
        counts.warnings++
        break
      case "CRITICAL":
        counts.critical++
        break
    }
  }

  return counts
}
const stats = getHealthStats()
const groupedChecks = getGroupedChecks()
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent className="max-w-4xl max-h-[80vh] overflow-y-auto">
<DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
<DialogHeader>
<DialogTitle className="flex items-center gap-2">
<Activity className="h-6 w-6" />
System Health Status
<DialogTitle className="flex items-center justify-between">
<div className="flex items-center gap-2">
<Activity className="h-6 w-6" />
System Health Status
</div>
{healthData && getStatusBadge(healthData.overall)}
</DialogTitle>
<DialogDescription>Detailed health checks for all system components</DialogDescription>
<DialogDescription>Comprehensive health checks for all system components</DialogDescription>
</DialogHeader>
{loading && (
@@ -182,82 +176,101 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
)}
{healthData && !loading && (
<div className="space-y-6">
{/* Overall Status Summary */}
<Card>
<CardHeader>
<CardTitle className="flex items-center justify-between">
<span>Overall Status</span>
{getStatusBadge(healthData.overall)}
</CardTitle>
</CardHeader>
<CardContent>
{healthData.summary && <p className="text-sm text-muted-foreground mb-4">{healthData.summary}</p>}
<div className="grid grid-cols-4 gap-4 text-center">
<div>
<div className="text-2xl font-bold">{stats.total}</div>
<div className="text-sm text-muted-foreground">Total Checks</div>
</div>
<div>
<div className="text-2xl font-bold text-green-500">{stats.healthy}</div>
<div className="text-sm text-muted-foreground">Healthy</div>
</div>
<div>
<div className="text-2xl font-bold text-yellow-500">{stats.warnings}</div>
<div className="text-sm text-muted-foreground">Warnings</div>
</div>
<div>
<div className="text-2xl font-bold text-red-500">{stats.critical}</div>
<div className="text-sm text-muted-foreground">Critical</div>
</div>
</div>
</CardContent>
</Card>
<div className="space-y-4">
{/* Overall Stats Summary */}
<div className="grid grid-cols-4 gap-3 p-4 rounded-lg bg-muted/30 border">
<div className="text-center">
<div className="text-2xl font-bold">{stats.total}</div>
<div className="text-xs text-muted-foreground">Total Checks</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-green-500">{stats.healthy}</div>
<div className="text-xs text-muted-foreground">Healthy</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-yellow-500">{stats.warnings}</div>
<div className="text-xs text-muted-foreground">Warnings</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-red-500">{stats.critical}</div>
<div className="text-xs text-muted-foreground">Critical</div>
</div>
</div>
{/* Grouped Health Checks */}
{Object.entries(groupedChecks).map(([category, checks]) => (
<Card key={category}>
<CardHeader>
<CardTitle className="text-lg">{category}</CardTitle>
</CardHeader>
<CardContent>
<div className="space-y-3">
{checks.map((check, index) => (
<div
key={`${category}-${index}`}
className="flex items-start gap-3 rounded-lg border p-3 hover:bg-muted/50 transition-colors"
>
<div className="mt-0.5">{getStatusIcon(check.status)}</div>
<div className="flex-1 min-w-0">
<div className="flex items-center justify-between gap-2">
<p className="font-medium">{check.name}</p>
<Badge variant="outline" className="shrink-0">
{check.status}
</Badge>
</div>
{check.reason && <p className="text-sm text-muted-foreground mt-1">{check.reason}</p>}
{check.details && (
<div className="text-xs text-muted-foreground mt-2 space-y-0.5">
{Object.entries(check.details).map(([key, value]) => {
if (key === "status" || key === "reason" || typeof value === "object") return null
return (
<div key={key} className="font-mono">
{key}: {String(value)}
</div>
)
})}
</div>
)}
</div>
{healthData.summary && (
<div className="text-sm text-muted-foreground p-3 rounded-lg bg-muted/20 border">
{healthData.summary}
</div>
)}
<div className="space-y-2">
{CATEGORIES.map(({ key, label, Icon }) => {
const categoryData = healthData.details[key as keyof typeof healthData.details]
const status = categoryData?.status || "UNKNOWN"
const reason = categoryData?.reason
const details = categoryData?.details
return (
<div
key={key}
className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${
status === "OK"
? "bg-green-500/5 border-green-500/20 hover:bg-green-500/10"
: status === "WARNING"
? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10"
: status === "CRITICAL"
? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10"
: "bg-muted/30 hover:bg-muted/50"
}`}
>
<div className="mt-0.5 flex-shrink-0 flex items-center gap-2">
<Icon className="h-4 w-4 text-muted-foreground" />
{getStatusIcon(status)}
</div>
<div className="flex-1 min-w-0">
<div className="flex items-center justify-between gap-2 mb-1">
<p className="font-medium text-sm">{label}</p>
<Badge
variant="outline"
className={`shrink-0 text-xs ${
status === "OK"
? "border-green-500 text-green-500"
: status === "WARNING"
? "border-yellow-500 text-yellow-500"
: status === "CRITICAL"
? "border-red-500 text-red-500"
: ""
}`}
>
{status}
</Badge>
</div>
))}
{reason && <p className="text-xs text-muted-foreground mt-1">{reason}</p>}
{details && typeof details === "object" && (
<div className="mt-2 space-y-1">
{Object.entries(details).map(([detailKey, detailValue]: [string, any]) => {
if (typeof detailValue === "object" && detailValue !== null) {
return (
<div key={detailKey} className="text-xs pl-3 border-l-2 border-muted">
<span className="font-medium">{detailKey}:</span>
{detailValue.reason && (
<span className="ml-1 text-muted-foreground">{detailValue.reason}</span>
)}
</div>
)
}
return null
})}
</div>
)}
</div>
</div>
</CardContent>
</Card>
))}
)
})}
</div>
{healthData.timestamp && (
<div className="text-xs text-muted-foreground text-center">
<div className="text-xs text-muted-foreground text-center pt-2">
Last updated: {new Date(healthData.timestamp).toLocaleString()}
</div>
)}

View File

@@ -55,7 +55,9 @@ interface FlaskSystemInfo {
hostname: string
node_id: string
uptime: string
health_status: "healthy" | "warning" | "critical"
health: {
status: "healthy" | "warning" | "critical"
}
}
export function ProxmoxDashboard() {
@@ -96,8 +98,10 @@ export function ProxmoxDashboard() {
const uptimeValue =
data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A"
const healthStatus = data.health?.status?.toLowerCase() || "healthy"
setSystemStatus({
status: data.health_status || "healthy",
status: healthStatus as "healthy" | "warning" | "critical",
uptime: uptimeValue,
lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }),
serverName: data.hostname || "Unknown",

View File

@@ -29,11 +29,21 @@ def get_health_details():
def get_system_info():
    """
    Get lightweight system info for header display.

    Serialises the dict returned by ``health_monitor.get_system_info()`` as JSON.
    When that dict contains a ``health`` mapping, its ``status`` is translated
    from the monitor vocabulary (OK / WARNING / CRITICAL / UNKNOWN) to the
    lowercase values 'healthy' / 'warning' / 'critical'.

    On any exception, responds with ``{'error': <message>}`` and HTTP 500.
    """
    try:
        info = health_monitor.get_system_info()

        # Only translate when 'health' really is a mapping; anything else is
        # passed through untouched instead of raising on .get().
        if 'health' in info and isinstance(info['health'], dict):
            # Convert 'OK' to 'healthy', 'WARNING' to 'warning', 'CRITICAL' to 'critical'
            status_map = {
                'OK': 'healthy',
                'WARNING': 'warning',
                'CRITICAL': 'critical',
                'UNKNOWN': 'warning'
            }
            # A missing, null or empty status is treated like 'OK'. str() guards
            # against non-string values, which would otherwise fail on .upper()
            # and surface as a 500.
            raw_status = info['health'].get('status') or 'OK'
            info['health']['status'] = status_map.get(str(raw_status).upper(), 'healthy')

        return jsonify(info)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

View File

@@ -4,7 +4,7 @@ Provides comprehensive, lightweight health checks for Proxmox systems.
Optimized for minimal system impact with intelligent thresholds and hysteresis.
Author: MacRimi
Version: 1.1 (Optimized for minimal overhead)
Version: 1.2 (Always returns all 10 categories)
"""
import psutil
@@ -15,12 +15,13 @@ import os
from typing import Dict, List, Any, Tuple, Optional
from datetime import datetime, timedelta
from collections import defaultdict
import re
class HealthMonitor:
"""
Monitors system health across multiple components with minimal impact.
Implements hysteresis, intelligent caching, and progressive escalation.
Only reports problems, not verbose OK statuses.
Always returns all 10 health categories.
"""
# CPU Thresholds
@@ -186,92 +187,104 @@ class HealthMonitor:
def get_detailed_status(self) -> Dict[str, Any]:
"""
Get comprehensive health status with all checks.
Returns JSON structure matching the specification.
OPTIMIZED: Only shows problems, not verbose OK messages.
Returns JSON structure with ALL 10 categories always present.
"""
details = {}
details = {
'cpu': {'status': 'OK'},
'memory': {'status': 'OK'},
'storage': {'status': 'OK'},
'disks': {'status': 'OK'},
'network': {'status': 'OK'},
'vms': {'status': 'OK'},
'services': {'status': 'OK'},
'logs': {'status': 'OK'},
'updates': {'status': 'OK'},
'security': {'status': 'OK'}
}
critical_issues = []
warning_issues = []
# Priority 1: Services PVE
services_status = self._check_pve_services()
if services_status['status'] != 'OK':
details['services'] = services_status
if services_status['status'] == 'CRITICAL':
critical_issues.append(services_status.get('reason', 'Service failure'))
elif services_status['status'] == 'WARNING':
warning_issues.append(services_status.get('reason', 'Service issue'))
details['services'] = services_status
if services_status['status'] == 'CRITICAL':
critical_issues.append(services_status.get('reason', 'Service failure'))
elif services_status['status'] == 'WARNING':
warning_issues.append(services_status.get('reason', 'Service issue'))
# Priority 2: Storage
storage_status = self._check_storage_optimized()
if storage_status and storage_status.get('status') != 'OK':
if storage_status:
details['storage'] = storage_status
if storage_status.get('status') == 'CRITICAL':
critical_issues.append(storage_status.get('reason', 'Storage failure'))
elif storage_status.get('status') == 'WARNING':
warning_issues.append(storage_status.get('reason', 'Storage issue'))
# Priority 3: Disks
disks_status = self._check_disks_optimized()
if disks_status and disks_status.get('status') != 'OK':
if disks_status:
details['disks'] = disks_status
if disks_status.get('status') == 'CRITICAL':
critical_issues.append(disks_status.get('reason', 'Disk failure'))
elif disks_status.get('status') == 'WARNING':
warning_issues.append(disks_status.get('reason', 'Disk issue'))
# Priority 4: VMs/CTs - now detects qmp errors from logs
vms_status = self._check_vms_cts_optimized()
if vms_status and vms_status.get('status') != 'OK':
if vms_status:
details['vms'] = vms_status
if vms_status.get('status') == 'CRITICAL':
critical_issues.append(vms_status.get('reason', 'VM/CT failure'))
elif vms_status.get('status') == 'WARNING':
warning_issues.append(vms_status.get('reason', 'VM/CT issue'))
# Priority 5: Network
network_status = self._check_network_optimized()
if network_status and network_status.get('status') != 'OK':
if network_status:
details['network'] = network_status
if network_status.get('status') == 'CRITICAL':
critical_issues.append(network_status.get('reason', 'Network failure'))
elif network_status.get('status') == 'WARNING':
warning_issues.append(network_status.get('reason', 'Network issue'))
# Priority 5: CPU/RAM (solo si hay problemas)
# Priority 6: CPU
cpu_status = self._check_cpu_with_hysteresis()
if cpu_status.get('status') != 'OK':
details['cpu'] = cpu_status
if cpu_status.get('status') == 'WARNING':
warning_issues.append(cpu_status.get('reason', 'CPU high'))
elif cpu_status.get('status') == 'CRITICAL':
critical_issues.append(cpu_status.get('reason', 'CPU critical'))
details['cpu'] = cpu_status
if cpu_status.get('status') == 'WARNING':
warning_issues.append(cpu_status.get('reason', 'CPU high'))
elif cpu_status.get('status') == 'CRITICAL':
critical_issues.append(cpu_status.get('reason', 'CPU critical'))
# Priority 7: Memory
memory_status = self._check_memory_comprehensive()
if memory_status.get('status') != 'OK':
details['memory'] = memory_status
if memory_status.get('status') == 'CRITICAL':
critical_issues.append(memory_status.get('reason', 'Memory critical'))
elif memory_status.get('status') == 'WARNING':
warning_issues.append(memory_status.get('reason', 'Memory high'))
details['memory'] = memory_status
if memory_status.get('status') == 'CRITICAL':
critical_issues.append(memory_status.get('reason', 'Memory critical'))
elif memory_status.get('status') == 'WARNING':
warning_issues.append(memory_status.get('reason', 'Memory high'))
# Priority 6: Logs (solo errores críticos)
# Priority 8: Logs
logs_status = self._check_logs_lightweight()
if logs_status.get('status') != 'OK':
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
elif logs_status.get('status') == 'WARNING':
warning_issues.append(logs_status.get('reason', 'Log warnings'))
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(logs_status.get('reason', 'Critical log errors'))
elif logs_status.get('status') == 'WARNING':
warning_issues.append(logs_status.get('reason', 'Log warnings'))
# Priority 9: Updates
updates_status = self._check_updates()
if updates_status and updates_status.get('status') != 'OK':
if updates_status:
details['updates'] = updates_status
if updates_status.get('status') == 'WARNING':
warning_issues.append(updates_status.get('reason', 'Updates pending'))
# Priority 7: Security (solo problemas)
# Priority 10: Security
security_status = self._check_security()
if security_status.get('status') != 'OK':
details['security'] = security_status
if security_status.get('status') == 'WARNING':
warning_issues.append(security_status.get('reason', 'Security issue'))
details['security'] = security_status
if security_status.get('status') == 'WARNING':
warning_issues.append(security_status.get('reason', 'Security issue'))
# Determine overall status
if critical_issues:
@@ -498,9 +511,9 @@ class HealthMonitor:
except Exception as e:
return {'status': 'UNKNOWN', 'reason': f'Memory check failed: {str(e)}'}
def _check_storage_optimized(self) -> Optional[Dict[str, Any]]:
def _check_storage_optimized(self) -> Dict[str, Any]:
"""
Optimized storage check - only reports problems.
Optimized storage check - always returns status.
Checks critical mounts, LVM, and Proxmox storages.
"""
issues = []
@@ -510,12 +523,34 @@ class HealthMonitor:
critical_mounts = ['/', '/var/lib/vz']
for mount_point in critical_mounts:
if not os.path.exists(mount_point):
issues.append(f'{mount_point} not mounted')
storage_details[mount_point] = {
'status': 'CRITICAL',
'reason': 'Not mounted'
}
is_mounted = False
try:
result = subprocess.run(
['mountpoint', '-q', mount_point],
capture_output=True,
timeout=2
)
is_mounted = (result.returncode == 0)
except:
pass
if not is_mounted:
# Only report as error if it's supposed to exist
if mount_point == '/':
issues.append(f'{mount_point}: Not mounted')
storage_details[mount_point] = {
'status': 'CRITICAL',
'reason': 'Not mounted'
}
# For /var/lib/vz, it might not be a separate mount, check if dir exists
elif mount_point == '/var/lib/vz':
if os.path.exists(mount_point):
# It exists as directory, check usage
fs_status = self._check_filesystem(mount_point)
if fs_status['status'] != 'OK':
issues.append(f"{mount_point}: {fs_status['reason']}")
storage_details[mount_point] = fs_status
# If doesn't exist, skip silently (might use different storage)
continue
fs_status = self._check_filesystem(mount_point)
@@ -536,7 +571,6 @@ class HealthMonitor:
issues.append(f"{storage_name}: {storage_data.get('reason', 'Storage issue')}")
storage_details[storage_name] = storage_data
# If no issues, return None (optimized)
if not issues:
return {'status': 'OK'}
@@ -605,8 +639,8 @@ class HealthMonitor:
'reason': f'Check failed: {str(e)}'
}
def _check_lvm(self) -> Optional[Dict[str, Any]]:
"""Check LVM volumes, especially local-lvm"""
def _check_lvm(self) -> Dict[str, Any]:
"""Check LVM volumes - improved detection"""
try:
result = subprocess.run(
['lvs', '--noheadings', '--options', 'lv_name,vg_name,lv_attr'],
@@ -616,10 +650,9 @@ class HealthMonitor:
)
if result.returncode != 0:
return None
return {'status': 'OK'}
volumes = []
local_lvm_found = False
for line in result.stdout.strip().split('\n'):
if line.strip():
@@ -628,20 +661,11 @@ class HealthMonitor:
lv_name = parts[0].strip()
vg_name = parts[1].strip()
volumes.append(f'{vg_name}/{lv_name}')
if 'local-lvm' in lv_name or 'local-lvm' in vg_name:
local_lvm_found = True
if volumes and not local_lvm_found:
return {
'status': 'CRITICAL',
'reason': 'local-lvm volume not found'
}
return {'status': 'OK'}
return {'status': 'OK', 'volumes': len(volumes)}
except Exception:
return None
return {'status': 'OK'}
def _check_proxmox_storages(self) -> Dict[str, Any]:
"""Check Proxmox-specific storages (only report problems)"""
@@ -680,9 +704,9 @@ class HealthMonitor:
return storages
def _check_disks_optimized(self) -> Optional[Dict[str, Any]]:
def _check_disks_optimized(self) -> Dict[str, Any]:
"""
Optimized disk check - only reports I/O errors and SMART issues.
Optimized disk check - always returns status.
"""
current_time = time.time()
disk_issues = {}
@@ -725,7 +749,6 @@ class HealthMonitor:
'reason': f'{error_count} I/O error(s) in 5 minutes'
}
# If no issues, return OK
if not disk_issues:
return {'status': 'OK'}
@@ -738,12 +761,11 @@ class HealthMonitor:
}
except Exception:
return None
return {'status': 'OK'}
def _check_network_optimized(self) -> Optional[Dict[str, Any]]:
def _check_network_optimized(self) -> Dict[str, Any]:
"""
Optimized network check - only reports problems.
Checks interfaces down, no connectivity.
Optimized network check - always returns status.
"""
try:
issues = []
@@ -770,7 +792,6 @@ class HealthMonitor:
issues.append(latency_status.get('reason', 'Network latency issue'))
interface_details['connectivity'] = latency_status
# If no issues, return OK
if not issues:
return {'status': 'OK'}
@@ -783,7 +804,7 @@ class HealthMonitor:
}
except Exception:
return None
return {'status': 'OK'}
def _check_network_latency(self) -> Optional[Dict[str, Any]]:
"""Check network latency to 1.1.1.1 (cached)"""
@@ -843,18 +864,18 @@ class HealthMonitor:
except Exception:
return None
def _check_vms_cts_optimized(self) -> Optional[Dict[str, Any]]:
def _check_vms_cts_optimized(self) -> Dict[str, Any]:
"""
Optimized VM/CT check - only reports failed starts.
Checks logs for VMs/CTs that failed to start.
Optimized VM/CT check - detects qmp failures and other VM errors.
Now parses logs for VM/CT specific errors like qmp command failures.
"""
try:
issues = []
vm_details = {}
# Check logs for failed VM/CT starts
# Check logs for VM/CT errors
result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*'],
['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*', '-p', 'warning'],
capture_output=True,
text=True,
timeout=3
@@ -864,34 +885,66 @@ class HealthMonitor:
for line in result.stdout.split('\n'):
line_lower = line.lower()
# Detect VM/CT start failures
# Pattern 1: "VM 106 qmp command failed"
vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command', line_lower)
if vm_qmp_match:
vmid = vm_qmp_match.group(1)
key = f'vm_{vmid}'
if key not in vm_details:
issues.append(f'VM {vmid}: QMP command error')
vm_details[key] = {
'status': 'WARNING',
'reason': 'QMP command failed',
'id': vmid,
'type': 'VM'
}
continue
# Pattern 2: "CT 103 error" or "Container 103"
ct_match = re.search(r'(?:ct|container)\s+(\d+)', line_lower)
if ct_match and ('error' in line_lower or 'fail' in line_lower):
ctid = ct_match.group(1)
key = f'ct_{ctid}'
if key not in vm_details:
issues.append(f'CT {ctid}: Error detected')
vm_details[key] = {
'status': 'WARNING',
'reason': 'Container error',
'id': ctid,
'type': 'CT'
}
continue
# Pattern 3: Generic VM/CT start failures
if 'failed to start' in line_lower or 'error starting' in line_lower or \
'start error' in line_lower or 'cannot start' in line_lower:
# Extract VM/CT ID
for word in line.split():
if word.isdigit() and len(word) <= 4:
vmid = word
if vmid not in self.failed_vm_history:
self.failed_vm_history.add(vmid)
issues.append(f'VM/CT {vmid} failed to start')
vm_details[f'vmct_{vmid}'] = {
'status': 'CRITICAL',
'reason': 'Failed to start'
}
break
id_match = re.search(r'\b(\d{3,4})\b', line)
if id_match:
vmid = id_match.group(1)
key = f'vmct_{vmid}'
if key not in vm_details:
issues.append(f'VM/CT {vmid}: Failed to start')
vm_details[key] = {
'status': 'CRITICAL',
'reason': 'Failed to start',
'id': vmid,
'type': 'VM/CT'
}
# If no issues, return OK
if not issues:
return {'status': 'OK'}
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
return {
'status': 'CRITICAL',
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': vm_details
}
except Exception:
return None
return {'status': 'OK'}
def _check_pve_services(self) -> Dict[str, Any]:
"""Check critical Proxmox services"""