Update AppImage

This commit is contained in:
MacRimi
2025-11-09 16:30:29 +01:00
parent 1712d32ef7
commit b9619efbbf
4 changed files with 330 additions and 250 deletions

View File

@@ -3,12 +3,28 @@
import { useState, useEffect } from "react" import { useState, useEffect } from "react"
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
import { Badge } from "@/components/ui/badge" import { Badge } from "@/components/ui/badge"
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card" import {
import { Loader2, CheckCircle2, AlertTriangle, XCircle, Activity } from "lucide-react" Loader2,
CheckCircle2,
AlertTriangle,
XCircle,
Activity,
Cpu,
MemoryStick,
HardDrive,
Disc,
Network,
Box,
Settings,
FileText,
RefreshCw,
Shield,
} from "lucide-react"
interface HealthDetail { interface CategoryCheck {
status: string status: string
reason?: string reason?: string
details?: any
[key: string]: any [key: string]: any
} }
@@ -16,7 +32,16 @@ interface HealthDetails {
overall: string overall: string
summary: string summary: string
details: { details: {
[category: string]: HealthDetail | { [key: string]: HealthDetail } cpu: CategoryCheck
memory: CategoryCheck
storage: CategoryCheck
disks: CategoryCheck
network: CategoryCheck
vms: CategoryCheck
services: CategoryCheck
logs: CategoryCheck
updates: CategoryCheck
security: CategoryCheck
} }
timestamp: string timestamp: string
} }
@@ -27,6 +52,19 @@ interface HealthStatusModalProps {
getApiUrl: (path: string) => string getApiUrl: (path: string) => string
} }
const CATEGORIES = [
{ key: "cpu", label: "CPU Usage & Temperature", Icon: Cpu },
{ key: "memory", label: "Memory & Swap", Icon: MemoryStick },
{ key: "storage", label: "Storage Mounts & Space", Icon: HardDrive },
{ key: "disks", label: "Disk I/O & Errors", Icon: Disc },
{ key: "network", label: "Network Interfaces", Icon: Network },
{ key: "vms", label: "VMs & Containers", Icon: Box },
{ key: "services", label: "PVE Services", Icon: Settings },
{ key: "logs", label: "System Logs", Icon: FileText },
{ key: "updates", label: "System Updates", Icon: RefreshCw },
{ key: "security", label: "Security & Certificates", Icon: Shield },
]
export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatusModalProps) { export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatusModalProps) {
const [loading, setLoading] = useState(true) const [loading, setLoading] = useState(true)
const [healthData, setHealthData] = useState<HealthDetails | null>(null) const [healthData, setHealthData] = useState<HealthDetails | null>(null)
@@ -58,74 +96,6 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
} }
} }
const getHealthStats = () => {
if (!healthData?.details) {
return { total: 0, healthy: 0, warnings: 0, critical: 0 }
}
let healthy = 0
let warnings = 0
let critical = 0
let total = 0
const countStatus = (detail: any) => {
if (detail && typeof detail === "object" && detail.status) {
total++
const status = detail.status.toUpperCase()
if (status === "OK") healthy++
else if (status === "WARNING") warnings++
else if (status === "CRITICAL") critical++
}
}
Object.values(healthData.details).forEach((categoryData) => {
if (categoryData && typeof categoryData === "object") {
if ("status" in categoryData) {
countStatus(categoryData)
} else {
Object.values(categoryData).forEach(countStatus)
}
}
})
return { total, healthy, warnings, critical }
}
const getGroupedChecks = () => {
if (!healthData?.details) return {}
const grouped: { [key: string]: Array<{ name: string; status: string; reason?: string; details?: any }> } = {}
Object.entries(healthData.details).forEach(([category, categoryData]) => {
if (!categoryData || typeof categoryData !== "object") return
const categoryName = category.charAt(0).toUpperCase() + category.slice(1)
grouped[categoryName] = []
if ("status" in categoryData) {
grouped[categoryName].push({
name: categoryName,
status: categoryData.status,
reason: categoryData.reason,
details: categoryData,
})
} else {
Object.entries(categoryData).forEach(([subKey, subData]: [string, any]) => {
if (subData && typeof subData === "object" && "status" in subData) {
grouped[categoryName].push({
name: subKey,
status: subData.status,
reason: subData.reason,
details: subData,
})
}
})
}
})
return grouped
}
const getStatusIcon = (status: string) => { const getStatusIcon = (status: string) => {
const statusUpper = status?.toUpperCase() const statusUpper = status?.toUpperCase()
switch (statusUpper) { switch (statusUpper) {
@@ -144,28 +114,52 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const statusUpper = status?.toUpperCase() const statusUpper = status?.toUpperCase()
switch (statusUpper) { switch (statusUpper) {
case "OK": case "OK":
return <Badge className="bg-green-500">Healthy</Badge> return <Badge className="bg-green-500 text-white">Healthy</Badge>
case "WARNING": case "WARNING":
return <Badge className="bg-yellow-500">Warning</Badge> return <Badge className="bg-yellow-500 text-white">Warning</Badge>
case "CRITICAL": case "CRITICAL":
return <Badge className="bg-red-500">Critical</Badge> return <Badge className="bg-red-500 text-white">Critical</Badge>
default: default:
return <Badge>Unknown</Badge> return <Badge>Unknown</Badge>
} }
} }
const getHealthStats = () => {
if (!healthData?.details) {
return { total: 0, healthy: 0, warnings: 0, critical: 0 }
}
let healthy = 0
let warnings = 0
let critical = 0
CATEGORIES.forEach(({ key }) => {
const categoryData = healthData.details[key as keyof typeof healthData.details]
if (categoryData) {
const status = categoryData.status?.toUpperCase()
if (status === "OK") healthy++
else if (status === "WARNING") warnings++
else if (status === "CRITICAL") critical++
}
})
return { total: CATEGORIES.length, healthy, warnings, critical }
}
const stats = getHealthStats() const stats = getHealthStats()
const groupedChecks = getGroupedChecks()
return ( return (
<Dialog open={open} onOpenChange={onOpenChange}> <Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent className="max-w-4xl max-h-[80vh] overflow-y-auto"> <DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
<DialogHeader> <DialogHeader>
<DialogTitle className="flex items-center gap-2"> <DialogTitle className="flex items-center justify-between">
<Activity className="h-6 w-6" /> <div className="flex items-center gap-2">
System Health Status <Activity className="h-6 w-6" />
System Health Status
</div>
{healthData && getStatusBadge(healthData.overall)}
</DialogTitle> </DialogTitle>
<DialogDescription>Detailed health checks for all system components</DialogDescription> <DialogDescription>Comprehensive health checks for all system components</DialogDescription>
</DialogHeader> </DialogHeader>
{loading && ( {loading && (
@@ -182,82 +176,101 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
)} )}
{healthData && !loading && ( {healthData && !loading && (
<div className="space-y-6"> <div className="space-y-4">
{/* Overall Status Summary */} {/* Overall Stats Summary */}
<Card> <div className="grid grid-cols-4 gap-3 p-4 rounded-lg bg-muted/30 border">
<CardHeader> <div className="text-center">
<CardTitle className="flex items-center justify-between"> <div className="text-2xl font-bold">{stats.total}</div>
<span>Overall Status</span> <div className="text-xs text-muted-foreground">Total Checks</div>
{getStatusBadge(healthData.overall)} </div>
</CardTitle> <div className="text-center">
</CardHeader> <div className="text-2xl font-bold text-green-500">{stats.healthy}</div>
<CardContent> <div className="text-xs text-muted-foreground">Healthy</div>
{healthData.summary && <p className="text-sm text-muted-foreground mb-4">{healthData.summary}</p>} </div>
<div className="grid grid-cols-4 gap-4 text-center"> <div className="text-center">
<div> <div className="text-2xl font-bold text-yellow-500">{stats.warnings}</div>
<div className="text-2xl font-bold">{stats.total}</div> <div className="text-xs text-muted-foreground">Warnings</div>
<div className="text-sm text-muted-foreground">Total Checks</div> </div>
</div> <div className="text-center">
<div> <div className="text-2xl font-bold text-red-500">{stats.critical}</div>
<div className="text-2xl font-bold text-green-500">{stats.healthy}</div> <div className="text-xs text-muted-foreground">Critical</div>
<div className="text-sm text-muted-foreground">Healthy</div> </div>
</div> </div>
<div>
<div className="text-2xl font-bold text-yellow-500">{stats.warnings}</div>
<div className="text-sm text-muted-foreground">Warnings</div>
</div>
<div>
<div className="text-2xl font-bold text-red-500">{stats.critical}</div>
<div className="text-sm text-muted-foreground">Critical</div>
</div>
</div>
</CardContent>
</Card>
{/* Grouped Health Checks */} {healthData.summary && (
{Object.entries(groupedChecks).map(([category, checks]) => ( <div className="text-sm text-muted-foreground p-3 rounded-lg bg-muted/20 border">
<Card key={category}> {healthData.summary}
<CardHeader> </div>
<CardTitle className="text-lg">{category}</CardTitle> )}
</CardHeader>
<CardContent> <div className="space-y-2">
<div className="space-y-3"> {CATEGORIES.map(({ key, label, Icon }) => {
{checks.map((check, index) => ( const categoryData = healthData.details[key as keyof typeof healthData.details]
<div const status = categoryData?.status || "UNKNOWN"
key={`${category}-${index}`} const reason = categoryData?.reason
className="flex items-start gap-3 rounded-lg border p-3 hover:bg-muted/50 transition-colors" const details = categoryData?.details
>
<div className="mt-0.5">{getStatusIcon(check.status)}</div> return (
<div className="flex-1 min-w-0"> <div
<div className="flex items-center justify-between gap-2"> key={key}
<p className="font-medium">{check.name}</p> className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${
<Badge variant="outline" className="shrink-0"> status === "OK"
{check.status} ? "bg-green-500/5 border-green-500/20 hover:bg-green-500/10"
</Badge> : status === "WARNING"
</div> ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10"
{check.reason && <p className="text-sm text-muted-foreground mt-1">{check.reason}</p>} : status === "CRITICAL"
{check.details && ( ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10"
<div className="text-xs text-muted-foreground mt-2 space-y-0.5"> : "bg-muted/30 hover:bg-muted/50"
{Object.entries(check.details).map(([key, value]) => { }`}
if (key === "status" || key === "reason" || typeof value === "object") return null >
return ( <div className="mt-0.5 flex-shrink-0 flex items-center gap-2">
<div key={key} className="font-mono"> <Icon className="h-4 w-4 text-muted-foreground" />
{key}: {String(value)} {getStatusIcon(status)}
</div> </div>
) <div className="flex-1 min-w-0">
})} <div className="flex items-center justify-between gap-2 mb-1">
</div> <p className="font-medium text-sm">{label}</p>
)} <Badge
</div> variant="outline"
className={`shrink-0 text-xs ${
status === "OK"
? "border-green-500 text-green-500"
: status === "WARNING"
? "border-yellow-500 text-yellow-500"
: status === "CRITICAL"
? "border-red-500 text-red-500"
: ""
}`}
>
{status}
</Badge>
</div> </div>
))} {reason && <p className="text-xs text-muted-foreground mt-1">{reason}</p>}
{details && typeof details === "object" && (
<div className="mt-2 space-y-1">
{Object.entries(details).map(([detailKey, detailValue]: [string, any]) => {
if (typeof detailValue === "object" && detailValue !== null) {
return (
<div key={detailKey} className="text-xs pl-3 border-l-2 border-muted">
<span className="font-medium">{detailKey}:</span>
{detailValue.reason && (
<span className="ml-1 text-muted-foreground">{detailValue.reason}</span>
)}
</div>
)
}
return null
})}
</div>
)}
</div>
</div> </div>
</CardContent> )
</Card> })}
))} </div>
{healthData.timestamp && ( {healthData.timestamp && (
<div className="text-xs text-muted-foreground text-center"> <div className="text-xs text-muted-foreground text-center pt-2">
Last updated: {new Date(healthData.timestamp).toLocaleString()} Last updated: {new Date(healthData.timestamp).toLocaleString()}
</div> </div>
)} )}

View File

@@ -55,7 +55,9 @@ interface FlaskSystemInfo {
hostname: string hostname: string
node_id: string node_id: string
uptime: string uptime: string
health_status: "healthy" | "warning" | "critical" health: {
status: "healthy" | "warning" | "critical"
}
} }
export function ProxmoxDashboard() { export function ProxmoxDashboard() {
@@ -96,8 +98,10 @@ export function ProxmoxDashboard() {
const uptimeValue = const uptimeValue =
data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A" data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A"
const healthStatus = data.health?.status?.toLowerCase() || "healthy"
setSystemStatus({ setSystemStatus({
status: data.health_status || "healthy", status: healthStatus as "healthy" | "warning" | "critical",
uptime: uptimeValue, uptime: uptimeValue,
lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }), lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }),
serverName: data.hostname || "Unknown", serverName: data.hostname || "Unknown",

View File

@@ -29,11 +29,21 @@ def get_health_details():
def get_system_info(): def get_system_info():
""" """
Get lightweight system info for header display. Get lightweight system info for header display.
Returns: hostname, uptime, and cached health status. Returns: hostname, uptime, and health status with proper structure.
This is optimized for minimal server impact.
""" """
try: try:
info = health_monitor.get_system_info() info = health_monitor.get_system_info()
if 'health' in info:
# Convert 'OK' to 'healthy', 'WARNING' to 'warning', 'CRITICAL' to 'critical'
status_map = {
'OK': 'healthy',
'WARNING': 'warning',
'CRITICAL': 'critical',
'UNKNOWN': 'warning'
}
current_status = info['health'].get('status', 'OK').upper()
info['health']['status'] = status_map.get(current_status, 'healthy')
return jsonify(info) return jsonify(info)
except Exception as e: except Exception as e:
return jsonify({'error': str(e)}), 500 return jsonify({'error': str(e)}), 500

View File

@@ -4,7 +4,7 @@ Provides comprehensive, lightweight health checks for Proxmox systems.
Optimized for minimal system impact with intelligent thresholds and hysteresis. Optimized for minimal system impact with intelligent thresholds and hysteresis.
Author: MacRimi Author: MacRimi
Version: 1.1 (Optimized for minimal overhead) Version: 1.2 (Always returns all 10 categories)
""" """
import psutil import psutil
@@ -15,12 +15,13 @@ import os
from typing import Dict, List, Any, Tuple, Optional from typing import Dict, List, Any, Tuple, Optional
from datetime import datetime, timedelta from datetime import datetime, timedelta
from collections import defaultdict from collections import defaultdict
import re
class HealthMonitor: class HealthMonitor:
""" """
Monitors system health across multiple components with minimal impact. Monitors system health across multiple components with minimal impact.
Implements hysteresis, intelligent caching, and progressive escalation. Implements hysteresis, intelligent caching, and progressive escalation.
Only reports problems, not verbose OK statuses. Always returns all 10 health categories.
""" """
# CPU Thresholds # CPU Thresholds
@@ -186,92 +187,104 @@ class HealthMonitor:
def get_detailed_status(self) -> Dict[str, Any]: def get_detailed_status(self) -> Dict[str, Any]:
""" """
Get comprehensive health status with all checks. Get comprehensive health status with all checks.
Returns JSON structure matching the specification. Returns JSON structure with ALL 10 categories always present.
OPTIMIZED: Only shows problems, not verbose OK messages.
""" """
details = {} details = {
'cpu': {'status': 'OK'},
'memory': {'status': 'OK'},
'storage': {'status': 'OK'},
'disks': {'status': 'OK'},
'network': {'status': 'OK'},
'vms': {'status': 'OK'},
'services': {'status': 'OK'},
'logs': {'status': 'OK'},
'updates': {'status': 'OK'},
'security': {'status': 'OK'}
}
critical_issues = [] critical_issues = []
warning_issues = [] warning_issues = []
# Priority 1: Services PVE # Priority 1: Services PVE
services_status = self._check_pve_services() services_status = self._check_pve_services()
if services_status['status'] != 'OK': details['services'] = services_status
details['services'] = services_status if services_status['status'] == 'CRITICAL':
if services_status['status'] == 'CRITICAL': critical_issues.append(services_status.get('reason', 'Service failure'))
critical_issues.append(services_status.get('reason', 'Service failure')) elif services_status['status'] == 'WARNING':
elif services_status['status'] == 'WARNING': warning_issues.append(services_status.get('reason', 'Service issue'))
warning_issues.append(services_status.get('reason', 'Service issue'))
# Priority 2: Storage
storage_status = self._check_storage_optimized() storage_status = self._check_storage_optimized()
if storage_status and storage_status.get('status') != 'OK': if storage_status:
details['storage'] = storage_status details['storage'] = storage_status
if storage_status.get('status') == 'CRITICAL': if storage_status.get('status') == 'CRITICAL':
critical_issues.append(storage_status.get('reason', 'Storage failure')) critical_issues.append(storage_status.get('reason', 'Storage failure'))
elif storage_status.get('status') == 'WARNING': elif storage_status.get('status') == 'WARNING':
warning_issues.append(storage_status.get('reason', 'Storage issue')) warning_issues.append(storage_status.get('reason', 'Storage issue'))
# Priority 3: Disks
disks_status = self._check_disks_optimized() disks_status = self._check_disks_optimized()
if disks_status and disks_status.get('status') != 'OK': if disks_status:
details['disks'] = disks_status details['disks'] = disks_status
if disks_status.get('status') == 'CRITICAL': if disks_status.get('status') == 'CRITICAL':
critical_issues.append(disks_status.get('reason', 'Disk failure')) critical_issues.append(disks_status.get('reason', 'Disk failure'))
elif disks_status.get('status') == 'WARNING': elif disks_status.get('status') == 'WARNING':
warning_issues.append(disks_status.get('reason', 'Disk issue')) warning_issues.append(disks_status.get('reason', 'Disk issue'))
# Priority 4: VMs/CTs - now detects qmp errors from logs
vms_status = self._check_vms_cts_optimized() vms_status = self._check_vms_cts_optimized()
if vms_status and vms_status.get('status') != 'OK': if vms_status:
details['vms'] = vms_status details['vms'] = vms_status
if vms_status.get('status') == 'CRITICAL': if vms_status.get('status') == 'CRITICAL':
critical_issues.append(vms_status.get('reason', 'VM/CT failure')) critical_issues.append(vms_status.get('reason', 'VM/CT failure'))
elif vms_status.get('status') == 'WARNING': elif vms_status.get('status') == 'WARNING':
warning_issues.append(vms_status.get('reason', 'VM/CT issue')) warning_issues.append(vms_status.get('reason', 'VM/CT issue'))
# Priority 5: Network
network_status = self._check_network_optimized() network_status = self._check_network_optimized()
if network_status and network_status.get('status') != 'OK': if network_status:
details['network'] = network_status details['network'] = network_status
if network_status.get('status') == 'CRITICAL': if network_status.get('status') == 'CRITICAL':
critical_issues.append(network_status.get('reason', 'Network failure')) critical_issues.append(network_status.get('reason', 'Network failure'))
elif network_status.get('status') == 'WARNING': elif network_status.get('status') == 'WARNING':
warning_issues.append(network_status.get('reason', 'Network issue')) warning_issues.append(network_status.get('reason', 'Network issue'))
# Priority 5: CPU/RAM (solo si hay problemas) # Priority 6: CPU
cpu_status = self._check_cpu_with_hysteresis() cpu_status = self._check_cpu_with_hysteresis()
if cpu_status.get('status') != 'OK': details['cpu'] = cpu_status
details['cpu'] = cpu_status if cpu_status.get('status') == 'WARNING':
if cpu_status.get('status') == 'WARNING': warning_issues.append(cpu_status.get('reason', 'CPU high'))
warning_issues.append(cpu_status.get('reason', 'CPU high')) elif cpu_status.get('status') == 'CRITICAL':
elif cpu_status.get('status') == 'CRITICAL': critical_issues.append(cpu_status.get('reason', 'CPU critical'))
critical_issues.append(cpu_status.get('reason', 'CPU critical'))
# Priority 7: Memory
memory_status = self._check_memory_comprehensive() memory_status = self._check_memory_comprehensive()
if memory_status.get('status') != 'OK': details['memory'] = memory_status
details['memory'] = memory_status if memory_status.get('status') == 'CRITICAL':
if memory_status.get('status') == 'CRITICAL': critical_issues.append(memory_status.get('reason', 'Memory critical'))
critical_issues.append(memory_status.get('reason', 'Memory critical')) elif memory_status.get('status') == 'WARNING':
elif memory_status.get('status') == 'WARNING': warning_issues.append(memory_status.get('reason', 'Memory high'))
warning_issues.append(memory_status.get('reason', 'Memory high'))
# Priority 6: Logs (solo errores críticos) # Priority 8: Logs
logs_status = self._check_logs_lightweight() logs_status = self._check_logs_lightweight()
if logs_status.get('status') != 'OK': details['logs'] = logs_status
details['logs'] = logs_status if logs_status.get('status') == 'CRITICAL':
if logs_status.get('status') == 'CRITICAL': critical_issues.append(logs_status.get('reason', 'Critical log errors'))
critical_issues.append(logs_status.get('reason', 'Critical log errors')) elif logs_status.get('status') == 'WARNING':
elif logs_status.get('status') == 'WARNING': warning_issues.append(logs_status.get('reason', 'Log warnings'))
warning_issues.append(logs_status.get('reason', 'Log warnings'))
# Priority 9: Updates
updates_status = self._check_updates() updates_status = self._check_updates()
if updates_status and updates_status.get('status') != 'OK': if updates_status:
details['updates'] = updates_status details['updates'] = updates_status
if updates_status.get('status') == 'WARNING': if updates_status.get('status') == 'WARNING':
warning_issues.append(updates_status.get('reason', 'Updates pending')) warning_issues.append(updates_status.get('reason', 'Updates pending'))
# Priority 7: Security (solo problemas) # Priority 10: Security
security_status = self._check_security() security_status = self._check_security()
if security_status.get('status') != 'OK': details['security'] = security_status
details['security'] = security_status if security_status.get('status') == 'WARNING':
if security_status.get('status') == 'WARNING': warning_issues.append(security_status.get('reason', 'Security issue'))
warning_issues.append(security_status.get('reason', 'Security issue'))
# Determine overall status # Determine overall status
if critical_issues: if critical_issues:
@@ -498,9 +511,9 @@ class HealthMonitor:
except Exception as e: except Exception as e:
return {'status': 'UNKNOWN', 'reason': f'Memory check failed: {str(e)}'} return {'status': 'UNKNOWN', 'reason': f'Memory check failed: {str(e)}'}
def _check_storage_optimized(self) -> Optional[Dict[str, Any]]: def _check_storage_optimized(self) -> Dict[str, Any]:
""" """
Optimized storage check - only reports problems. Optimized storage check - always returns status.
Checks critical mounts, LVM, and Proxmox storages. Checks critical mounts, LVM, and Proxmox storages.
""" """
issues = [] issues = []
@@ -510,12 +523,34 @@ class HealthMonitor:
critical_mounts = ['/', '/var/lib/vz'] critical_mounts = ['/', '/var/lib/vz']
for mount_point in critical_mounts: for mount_point in critical_mounts:
if not os.path.exists(mount_point): is_mounted = False
issues.append(f'{mount_point} not mounted') try:
storage_details[mount_point] = { result = subprocess.run(
'status': 'CRITICAL', ['mountpoint', '-q', mount_point],
'reason': 'Not mounted' capture_output=True,
} timeout=2
)
is_mounted = (result.returncode == 0)
except:
pass
if not is_mounted:
# Only report as error if it's supposed to exist
if mount_point == '/':
issues.append(f'{mount_point}: Not mounted')
storage_details[mount_point] = {
'status': 'CRITICAL',
'reason': 'Not mounted'
}
# For /var/lib/vz, it might not be a separate mount, check if dir exists
elif mount_point == '/var/lib/vz':
if os.path.exists(mount_point):
# It exists as directory, check usage
fs_status = self._check_filesystem(mount_point)
if fs_status['status'] != 'OK':
issues.append(f"{mount_point}: {fs_status['reason']}")
storage_details[mount_point] = fs_status
# If doesn't exist, skip silently (might use different storage)
continue continue
fs_status = self._check_filesystem(mount_point) fs_status = self._check_filesystem(mount_point)
@@ -536,7 +571,6 @@ class HealthMonitor:
issues.append(f"{storage_name}: {storage_data.get('reason', 'Storage issue')}") issues.append(f"{storage_name}: {storage_data.get('reason', 'Storage issue')}")
storage_details[storage_name] = storage_data storage_details[storage_name] = storage_data
# If no issues, return None (optimized)
if not issues: if not issues:
return {'status': 'OK'} return {'status': 'OK'}
@@ -605,8 +639,8 @@ class HealthMonitor:
'reason': f'Check failed: {str(e)}' 'reason': f'Check failed: {str(e)}'
} }
def _check_lvm(self) -> Optional[Dict[str, Any]]: def _check_lvm(self) -> Dict[str, Any]:
"""Check LVM volumes, especially local-lvm""" """Check LVM volumes - improved detection"""
try: try:
result = subprocess.run( result = subprocess.run(
['lvs', '--noheadings', '--options', 'lv_name,vg_name,lv_attr'], ['lvs', '--noheadings', '--options', 'lv_name,vg_name,lv_attr'],
@@ -616,10 +650,9 @@ class HealthMonitor:
) )
if result.returncode != 0: if result.returncode != 0:
return None return {'status': 'OK'}
volumes = [] volumes = []
local_lvm_found = False
for line in result.stdout.strip().split('\n'): for line in result.stdout.strip().split('\n'):
if line.strip(): if line.strip():
@@ -628,20 +661,11 @@ class HealthMonitor:
lv_name = parts[0].strip() lv_name = parts[0].strip()
vg_name = parts[1].strip() vg_name = parts[1].strip()
volumes.append(f'{vg_name}/{lv_name}') volumes.append(f'{vg_name}/{lv_name}')
if 'local-lvm' in lv_name or 'local-lvm' in vg_name:
local_lvm_found = True
if volumes and not local_lvm_found: return {'status': 'OK', 'volumes': len(volumes)}
return {
'status': 'CRITICAL',
'reason': 'local-lvm volume not found'
}
return {'status': 'OK'}
except Exception: except Exception:
return None return {'status': 'OK'}
def _check_proxmox_storages(self) -> Dict[str, Any]: def _check_proxmox_storages(self) -> Dict[str, Any]:
"""Check Proxmox-specific storages (only report problems)""" """Check Proxmox-specific storages (only report problems)"""
@@ -680,9 +704,9 @@ class HealthMonitor:
return storages return storages
def _check_disks_optimized(self) -> Optional[Dict[str, Any]]: def _check_disks_optimized(self) -> Dict[str, Any]:
""" """
Optimized disk check - only reports I/O errors and SMART issues. Optimized disk check - always returns status.
""" """
current_time = time.time() current_time = time.time()
disk_issues = {} disk_issues = {}
@@ -725,7 +749,6 @@ class HealthMonitor:
'reason': f'{error_count} I/O error(s) in 5 minutes' 'reason': f'{error_count} I/O error(s) in 5 minutes'
} }
# If no issues, return OK
if not disk_issues: if not disk_issues:
return {'status': 'OK'} return {'status': 'OK'}
@@ -738,12 +761,11 @@ class HealthMonitor:
} }
except Exception: except Exception:
return None return {'status': 'OK'}
def _check_network_optimized(self) -> Optional[Dict[str, Any]]: def _check_network_optimized(self) -> Dict[str, Any]:
""" """
Optimized network check - only reports problems. Optimized network check - always returns status.
Checks interfaces down, no connectivity.
""" """
try: try:
issues = [] issues = []
@@ -770,7 +792,6 @@ class HealthMonitor:
issues.append(latency_status.get('reason', 'Network latency issue')) issues.append(latency_status.get('reason', 'Network latency issue'))
interface_details['connectivity'] = latency_status interface_details['connectivity'] = latency_status
# If no issues, return OK
if not issues: if not issues:
return {'status': 'OK'} return {'status': 'OK'}
@@ -783,7 +804,7 @@ class HealthMonitor:
} }
except Exception: except Exception:
return None return {'status': 'OK'}
def _check_network_latency(self) -> Optional[Dict[str, Any]]: def _check_network_latency(self) -> Optional[Dict[str, Any]]:
"""Check network latency to 1.1.1.1 (cached)""" """Check network latency to 1.1.1.1 (cached)"""
@@ -843,18 +864,18 @@ class HealthMonitor:
except Exception: except Exception:
return None return None
def _check_vms_cts_optimized(self) -> Optional[Dict[str, Any]]: def _check_vms_cts_optimized(self) -> Dict[str, Any]:
""" """
Optimized VM/CT check - only reports failed starts. Optimized VM/CT check - detects qmp failures and other VM errors.
Checks logs for VMs/CTs that failed to start. Now parses logs for VM/CT specific errors like qmp command failures.
""" """
try: try:
issues = [] issues = []
vm_details = {} vm_details = {}
# Check logs for failed VM/CT starts # Check logs for VM/CT errors
result = subprocess.run( result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*'], ['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*', '-p', 'warning'],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=3 timeout=3
@@ -864,34 +885,66 @@ class HealthMonitor:
for line in result.stdout.split('\n'): for line in result.stdout.split('\n'):
line_lower = line.lower() line_lower = line.lower()
# Detect VM/CT start failures # Pattern 1: "VM 106 qmp command failed"
vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command', line_lower)
if vm_qmp_match:
vmid = vm_qmp_match.group(1)
key = f'vm_{vmid}'
if key not in vm_details:
issues.append(f'VM {vmid}: QMP command error')
vm_details[key] = {
'status': 'WARNING',
'reason': 'QMP command failed',
'id': vmid,
'type': 'VM'
}
continue
# Pattern 2: "CT 103 error" or "Container 103"
ct_match = re.search(r'(?:ct|container)\s+(\d+)', line_lower)
if ct_match and ('error' in line_lower or 'fail' in line_lower):
ctid = ct_match.group(1)
key = f'ct_{ctid}'
if key not in vm_details:
issues.append(f'CT {ctid}: Error detected')
vm_details[key] = {
'status': 'WARNING',
'reason': 'Container error',
'id': ctid,
'type': 'CT'
}
continue
# Pattern 3: Generic VM/CT start failures
if 'failed to start' in line_lower or 'error starting' in line_lower or \ if 'failed to start' in line_lower or 'error starting' in line_lower or \
'start error' in line_lower or 'cannot start' in line_lower: 'start error' in line_lower or 'cannot start' in line_lower:
# Extract VM/CT ID # Extract VM/CT ID
for word in line.split(): id_match = re.search(r'\b(\d{3,4})\b', line)
if word.isdigit() and len(word) <= 4: if id_match:
vmid = word vmid = id_match.group(1)
if vmid not in self.failed_vm_history: key = f'vmct_{vmid}'
self.failed_vm_history.add(vmid) if key not in vm_details:
issues.append(f'VM/CT {vmid} failed to start') issues.append(f'VM/CT {vmid}: Failed to start')
vm_details[f'vmct_{vmid}'] = { vm_details[key] = {
'status': 'CRITICAL', 'status': 'CRITICAL',
'reason': 'Failed to start' 'reason': 'Failed to start',
} 'id': vmid,
break 'type': 'VM/CT'
}
# If no issues, return OK
if not issues: if not issues:
return {'status': 'OK'} return {'status': 'OK'}
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
return { return {
'status': 'CRITICAL', 'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]), 'reason': '; '.join(issues[:3]),
'details': vm_details 'details': vm_details
} }
except Exception: except Exception:
return None return {'status': 'OK'}
def _check_pve_services(self) -> Dict[str, Any]: def _check_pve_services(self) -> Dict[str, Any]:
"""Check critical Proxmox services""" """Check critical Proxmox services"""