diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index 3496648..7a09397 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -1,14 +1,34 @@ "use client" +import type React from "react" + import { useState, useEffect } from "react" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" import { Badge } from "@/components/ui/badge" -import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card" -import { Loader2, CheckCircle2, AlertTriangle, XCircle, Activity } from "lucide-react" +import { Button } from "@/components/ui/button" +import { + Loader2, + CheckCircle2, + AlertTriangle, + XCircle, + Activity, + Cpu, + MemoryStick, + HardDrive, + Disc, + Network, + Box, + Settings, + FileText, + RefreshCw, + Shield, + X, +} from "lucide-react" -interface HealthDetail { +interface CategoryCheck { status: string reason?: string + details?: any [key: string]: any } @@ -16,7 +36,16 @@ interface HealthDetails { overall: string summary: string details: { - [category: string]: HealthDetail | { [key: string]: HealthDetail } + cpu: CategoryCheck + memory: CategoryCheck + storage: CategoryCheck + disks: CategoryCheck + network: CategoryCheck + vms: CategoryCheck + services: CategoryCheck + logs: CategoryCheck + updates: CategoryCheck + security: CategoryCheck } timestamp: string } @@ -27,6 +56,19 @@ interface HealthStatusModalProps { getApiUrl: (path: string) => string } +const CATEGORIES = [ + { key: "cpu", label: "CPU Usage & Temperature", Icon: Cpu }, + { key: "memory", label: "Memory & Swap", Icon: MemoryStick }, + { key: "storage", label: "Storage Mounts & Space", Icon: HardDrive }, + { key: "disks", label: "Disk I/O & Errors", Icon: Disc }, + { key: "network", label: "Network Interfaces", Icon: Network }, + { key: "vms", label: "VMs & Containers", Icon: Box }, + { key: "services", label: "PVE Services", Icon: Settings }, + { key: "logs", label: "System Logs", Icon: FileText }, + { key: "updates", label: "System Updates", Icon: RefreshCw }, + { key: "security", label: "Security & Certificates", Icon: Shield }, +] + export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatusModalProps) { const [loading, setLoading] = useState(true) const [healthData, setHealthData] = useState(null) @@ -58,74 +100,6 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu } } - const getHealthStats = () => { - if (!healthData?.details) { - return { total: 0, healthy: 0, warnings: 0, critical: 0 } - } - - let healthy = 0 - let warnings = 0 - let critical = 0 - let total = 0 - - const countStatus = (detail: any) => { - if (detail && typeof detail === "object" && detail.status) { - total++ - const status = detail.status.toUpperCase() - if (status === "OK") healthy++ - else if (status === "WARNING") warnings++ - else if (status === "CRITICAL") critical++ - } - } - - Object.values(healthData.details).forEach((categoryData) => { - if (categoryData && typeof categoryData === "object") { - if ("status" in categoryData) { - countStatus(categoryData) - } else { - Object.values(categoryData).forEach(countStatus) - } - } - }) - - return { total, healthy, warnings, critical } - } - - const getGroupedChecks = () => { - if (!healthData?.details) return {} - - const grouped: { [key: string]: Array<{ name: string; status: string; reason?: string; details?: any }> } = {} - - 
Object.entries(healthData.details).forEach(([category, categoryData]) => { - if (!categoryData || typeof categoryData !== "object") return - - const categoryName = category.charAt(0).toUpperCase() + category.slice(1) - grouped[categoryName] = [] - - if ("status" in categoryData) { - grouped[categoryName].push({ - name: categoryName, - status: categoryData.status, - reason: categoryData.reason, - details: categoryData, - }) - } else { - Object.entries(categoryData).forEach(([subKey, subData]: [string, any]) => { - if (subData && typeof subData === "object" && "status" in subData) { - grouped[categoryName].push({ - name: subKey, - status: subData.status, - reason: subData.reason, - details: subData, - }) - } - }) - } - }) - - return grouped - } - const getStatusIcon = (status: string) => { const statusUpper = status?.toUpperCase() switch (statusUpper) { @@ -144,27 +118,106 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const statusUpper = status?.toUpperCase() switch (statusUpper) { case "OK": - return Healthy + return OK case "WARNING": - return Warning + return Warning case "CRITICAL": - return Critical + return Critical default: return Unknown } } + const getHealthStats = () => { + if (!healthData?.details) { + return { total: 0, healthy: 0, warnings: 0, critical: 0 } + } + + let healthy = 0 + let warnings = 0 + let critical = 0 + + CATEGORIES.forEach(({ key }) => { + const categoryData = healthData.details[key as keyof typeof healthData.details] + if (categoryData) { + const status = categoryData.status?.toUpperCase() + if (status === "OK") healthy++ + else if (status === "WARNING") warnings++ + else if (status === "CRITICAL") critical++ + } + }) + + return { total: CATEGORIES.length, healthy, warnings, critical } + } + const stats = getHealthStats() - const groupedChecks = getGroupedChecks() + + const handleCategoryClick = (categoryKey: string, status: string) => { + if (status === "OK") return // No navegar si está OK + + onOpenChange(false) // Cerrar el modal + + // Mapear categorías a tabs + const categoryToTab: Record = { + storage: "storage", + disks: "storage", + network: "network", + vms: "vms", + logs: "logs", + hardware: "hardware", + services: "hardware", + } + + const targetTab = categoryToTab[categoryKey] + if (targetTab) { + // Disparar evento para cambiar tab + const event = new CustomEvent("changeTab", { detail: { tab: targetTab } }) + window.dispatchEvent(event) + } + } + + const handleAcknowledge = async (errorKey: string, e: React.MouseEvent) => { + e.stopPropagation() // Prevent navigation + + console.log("[v0] Dismissing error:", errorKey) + + try { + const response = await fetch(getApiUrl("/api/health/acknowledge"), { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ error_key: errorKey }), + }) + + if (!response.ok) { + const errorData = await response.json() + console.error("[v0] Acknowledge failed:", errorData) + throw new Error(errorData.error || "Failed to acknowledge error") + } + + const result = await response.json() + console.log("[v0] Acknowledge success:", result) + + // Refresh health data + await fetchHealthDetails() + } catch (err) { + console.error("[v0] Error acknowledging:", err) + alert("Failed to dismiss error. Please try again.") + } + } return ( - + - - - System Health Status - +
+ + + System Health Status + {healthData &&
{getStatusBadge(healthData.overall)}
} +
+
Detailed health checks for all system components
@@ -182,82 +235,118 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu )} {healthData && !loading && ( -
- {/* Overall Status Summary */} - - - - Overall Status - {getStatusBadge(healthData.overall)} - - - - {healthData.summary &&

{healthData.summary}

} -
-
-
{stats.total}
-
Total Checks
-
-
-
{stats.healthy}
-
Healthy
-
-
-
{stats.warnings}
-
Warnings
-
-
-
{stats.critical}
-
Critical
-
-
-
-
+
+ {/* Overall Stats Summary */} +
+
+
{stats.total}
+
Total Checks
+
+
+
{stats.healthy}
+
Healthy
+
+
+
{stats.warnings}
+
Warnings
+
+
+
{stats.critical}
+
Critical
+
+
- {/* Grouped Health Checks */} - {Object.entries(groupedChecks).map(([category, checks]) => ( - - - {category} - - -
- {checks.map((check, index) => ( -
-
{getStatusIcon(check.status)}
-
-
-

{check.name}

- - {check.status} - -
- {check.reason &&

{check.reason}

} - {check.details && ( -
- {Object.entries(check.details).map(([key, value]) => { - if (key === "status" || key === "reason" || typeof value === "object") return null - return ( -
- {key}: {String(value)} -
- ) - })} -
- )} -
+ {healthData.summary && healthData.summary !== "All systems operational" && ( +
+ {healthData.summary} +
+ )} + +
+ {CATEGORIES.map(({ key, label, Icon }) => { + const categoryData = healthData.details[key as keyof typeof healthData.details] + const status = categoryData?.status || "UNKNOWN" + const reason = categoryData?.reason + const details = categoryData?.details + + return ( +
handleCategoryClick(key, status)} + className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${ + status === "OK" + ? "bg-green-500/5 border-green-500/20 hover:bg-green-500/10" + : status === "WARNING" + ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer" + : status === "CRITICAL" + ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer" + : "bg-muted/30 hover:bg-muted/50" + }`} + > +
+ + {getStatusIcon(status)} +
+
+
+

{label}

+ + {status} +
- ))} + {reason &&

{reason}

} + {details && typeof details === "object" && ( +
+ {Object.entries(details).map(([detailKey, detailValue]: [string, any]) => { + if (typeof detailValue === "object" && detailValue !== null) { + return ( +
+
+ {detailKey}: + {detailValue.reason && ( + {detailValue.reason} + )} +
+ {status !== "OK" && ( + + )} +
+ ) + } + return null + })} +
+ )} +
- - - ))} + ) + })} +
{healthData.timestamp && ( -
+
Last updated: {new Date(healthData.timestamp).toLocaleString()}
)} diff --git a/AppImage/components/node-metrics-charts.tsx b/AppImage/components/node-metrics-charts.tsx index a8d86e4..138551b 100644 --- a/AppImage/components/node-metrics-charts.tsx +++ b/AppImage/components/node-metrics-charts.tsx @@ -5,6 +5,7 @@ import { Card, CardContent, CardHeader, CardTitle } from "./ui/card" import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "./ui/select" import { AreaChart, Area, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer, Legend } from "recharts" import { Loader2, TrendingUp, MemoryStick } from "lucide-react" +import { useIsMobile } from "../hooks/use-mobile" const TIMEFRAME_OPTIONS = [ { value: "hour", label: "1 Hour" }, @@ -69,6 +70,7 @@ export function NodeMetricsCharts() { const [data, setData] = useState([]) const [loading, setLoading] = useState(true) const [error, setError] = useState(null) + const isMobile = useIsMobile() const [visibleLines, setVisibleLines] = useState({ cpu: { cpu: true, load: true }, @@ -321,15 +323,15 @@ export function NodeMetricsCharts() {
{/* CPU Usage + Load Average Chart */} - + CPU Usage & Load Average - + - + } /> @@ -389,15 +395,15 @@ export function NodeMetricsCharts() { {/* Memory Usage Chart */} - + Memory Usage - + - + } /> diff --git a/AppImage/components/proxmox-dashboard.tsx b/AppImage/components/proxmox-dashboard.tsx index 199e1c6..bf320ea 100644 --- a/AppImage/components/proxmox-dashboard.tsx +++ b/AppImage/components/proxmox-dashboard.tsx @@ -55,7 +55,9 @@ interface FlaskSystemInfo { hostname: string node_id: string uptime: string - health_status: "healthy" | "warning" | "critical" + health: { + status: "healthy" | "warning" | "critical" + } } export function ProxmoxDashboard() { @@ -96,8 +98,19 @@ export function ProxmoxDashboard() { const uptimeValue = data.uptime && typeof data.uptime === "string" && data.uptime.trim() !== "" ? data.uptime : "N/A" + const backendStatus = data.health?.status?.toUpperCase() || "OK" + let healthStatus: "healthy" | "warning" | "critical" + + if (backendStatus === "CRITICAL") { + healthStatus = "critical" + } else if (backendStatus === "WARNING") { + healthStatus = "warning" + } else { + healthStatus = "healthy" + } + setSystemStatus({ - status: data.health_status || "healthy", + status: healthStatus, uptime: uptimeValue, lastUpdate: new Date().toLocaleTimeString("en-US", { hour12: false }), serverName: data.hostname || "Unknown", @@ -123,11 +136,13 @@ export function ProxmoxDashboard() { // Siempre fetch inicial fetchSystemData() + // En overview: cada 30 segundos para actualización frecuente del estado de salud + // En otras tabs: cada 60 segundos para reducir carga let interval: ReturnType | null = null if (activeTab === "overview") { - interval = setInterval(fetchSystemData, 9000) // Cambiado de 10000 a 9000ms + interval = setInterval(fetchSystemData, 30000) // 30 segundos } else { - interval = setInterval(fetchSystemData, 61000) // Cambiado de 60000 a 61000ms + interval = setInterval(fetchSystemData, 60000) // 60 segundos } return () => { @@ -135,6 +150,20 @@ export function ProxmoxDashboard() { } }, [fetchSystemData, activeTab]) + useEffect(() => { + const handleChangeTab = (event: CustomEvent) => { + const { tab } = event.detail + if (tab) { + setActiveTab(tab) + } + } + + window.addEventListener("changeTab", handleChangeTab as EventListener) + return () => { + window.removeEventListener("changeTab", handleChangeTab as EventListener) + } + }, []) + useEffect(() => { if ( systemStatus.serverName && diff --git a/AppImage/components/settings.tsx b/AppImage/components/settings.tsx index d098851..3cc86ce 100644 --- a/AppImage/components/settings.tsx +++ b/AppImage/components/settings.tsx @@ -5,10 +5,16 @@ import { Button } from "./ui/button" import { Input } from "./ui/input" import { Label } from "./ui/label" import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "./ui/card" -import { Shield, Lock, User, AlertCircle, CheckCircle, Info, LogOut } from "lucide-react" +import { Shield, Lock, User, AlertCircle, CheckCircle, Info, LogOut, Wrench, Package } from "lucide-react" import { getApiUrl } from "../lib/api-config" import { TwoFactorSetup } from "./two-factor-setup" +interface ProxMenuxTool { + key: string + name: string + enabled: boolean +} + export function Settings() { const [authEnabled, setAuthEnabled] = useState(false) const [totpEnabled, setTotpEnabled] = useState(false) @@ -32,8 +38,12 @@ export function Settings() { const [show2FADisable, setShow2FADisable] = useState(false) const [disable2FAPassword, setDisable2FAPassword] = useState("") + const 
[proxmenuxTools, setProxmenuxTools] = useState([]) + const [loadingTools, setLoadingTools] = useState(true) + useEffect(() => { checkAuthStatus() + loadProxmenuxTools() }, []) const checkAuthStatus = async () => { @@ -47,6 +57,21 @@ export function Settings() { } } + const loadProxmenuxTools = async () => { + try { + const response = await fetch(getApiUrl("/api/proxmenux/installed-tools")) + const data = await response.json() + + if (data.success) { + setProxmenuxTools(data.installed_tools || []) + } + } catch (err) { + console.error("Failed to load ProxMenux tools:", err) + } finally { + setLoadingTools(false) + } + } + const handleEnableAuth = async () => { setError("") setSuccess("") @@ -541,21 +566,45 @@ export function Settings() { - {/* About Section */} + {/* ProxMenux Optimizations */} - About - ProxMenux Monitor information +
+ + ProxMenux Optimizations +
+ System optimizations and utilities installed via ProxMenux
- -
- Version - 1.0.1 -
-
- Build - Debian Package -
+ + {loadingTools ? ( +
+
+
+ ) : proxmenuxTools.length === 0 ? ( +
+ +

No ProxMenux optimizations installed yet

+

Run ProxMenux to configure system optimizations

+
+ ) : ( +
+
+ Installed Tools + {proxmenuxTools.length} active +
+
+ {proxmenuxTools.map((tool) => ( +
+
+ {tool.name} +
+ ))} +
+
+ )} diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx index 70ba7eb..c4f92e3 100644 --- a/AppImage/components/storage-overview.tsx +++ b/AppImage/components/storage-overview.tsx @@ -65,6 +65,7 @@ interface ProxmoxStorage { used: number available: number percent: number + node: string // Added node property for detailed debug logging } interface ProxmoxStorageData { @@ -101,27 +102,6 @@ export function StorageOverview() { const data = await storageResponse.json() const proxmoxData = await proxmoxResponse.json() - console.log("[v0] Storage data received:", data) - console.log("[v0] Proxmox storage data received:", proxmoxData) - - if (proxmoxData && proxmoxData.storage) { - const activeStorages = proxmoxData.storage.filter( - (s: any) => s && s.total > 0 && s.used >= 0 && s.status?.toLowerCase() === "active", - ) - console.log("[v0] Active storage volumes:", activeStorages.length) - console.log( - "[v0] Total used across all volumes (GB):", - activeStorages.reduce((sum: number, s: any) => sum + s.used, 0), - ) - - // Check for potential cluster node duplication - const storageNames = activeStorages.map((s: any) => s.name) - const uniqueNames = new Set(storageNames) - if (storageNames.length !== uniqueNames.size) { - console.warn("[v0] WARNING: Duplicate storage names detected - possible cluster node issue") - } - } - setStorageData(data) setProxmoxStorage(proxmoxData) } catch (error) { @@ -417,24 +397,33 @@ export function StorageOverview() { const diskHealthBreakdown = getDiskHealthBreakdown() const diskTypesBreakdown = getDiskTypesBreakdown() - // Only sum storage that belongs to the current node or filter appropriately const totalProxmoxUsed = - proxmoxStorage && proxmoxStorage.storage - ? proxmoxStorage.storage - .filter( - (storage) => - storage && - storage.total > 0 && - storage.used >= 0 && // Added check for valid used value - storage.status && - storage.status.toLowerCase() === "active", - ) - .reduce((sum, storage) => sum + storage.used, 0) - : 0 + proxmoxStorage?.storage + .filter( + (storage) => + storage && + storage.name && + storage.status === "active" && + storage.total > 0 && + storage.used >= 0 && + storage.available >= 0, + ) + .reduce((sum, storage) => sum + storage.used, 0) || 0 - // Convert storageData.total from TB to GB before calculating percentage - const usagePercent = - storageData && storageData.total > 0 ? ((totalProxmoxUsed / (storageData.total * 1024)) * 100).toFixed(2) : "0.00" + const totalProxmoxCapacity = + proxmoxStorage?.storage + .filter( + (storage) => + storage && + storage.name && + storage.status === "active" && + storage.total > 0 && + storage.used >= 0 && + storage.available >= 0, + ) + .reduce((sum, storage) => sum + storage.total, 0) || 0 + + const usagePercent = totalProxmoxCapacity > 0 ? 
((totalProxmoxUsed / totalProxmoxCapacity) * 100).toFixed(2) : "0.00" if (loading) { return ( diff --git a/AppImage/components/system-overview.tsx b/AppImage/components/system-overview.tsx index 9c41550..43ad0d1 100644 --- a/AppImage/components/system-overview.tsx +++ b/AppImage/components/system-overview.tsx @@ -225,100 +225,87 @@ export function SystemOverview() { const [storageData, setStorageData] = useState(null) const [proxmoxStorageData, setProxmoxStorageData] = useState(null) const [networkData, setNetworkData] = useState(null) - const [loading, setLoading] = useState(true) + const [loadingStates, setLoadingStates] = useState({ + system: true, + vms: true, + storage: true, + network: true, + }) const [error, setError] = useState(null) const [networkTimeframe, setNetworkTimeframe] = useState("day") const [networkTotals, setNetworkTotals] = useState<{ received: number; sent: number }>({ received: 0, sent: 0 }) useEffect(() => { - const fetchData = async () => { - try { - setLoading(true) - setError(null) + const fetchAllData = async () => { + const [systemResult, vmResult, storageResults, networkResult] = await Promise.all([ + fetchSystemData().finally(() => setLoadingStates((prev) => ({ ...prev, system: false }))), + fetchVMData().finally(() => setLoadingStates((prev) => ({ ...prev, vms: false }))), + Promise.all([fetchStorageData(), fetchProxmoxStorageData()]).finally(() => + setLoadingStates((prev) => ({ ...prev, storage: false })), + ), + fetchNetworkData().finally(() => setLoadingStates((prev) => ({ ...prev, network: false }))), + ]) - const systemResult = await fetchSystemData() - - if (!systemResult) { - setError("Flask server not available. Please ensure the server is running.") - setLoading(false) - return - } - - setSystemData(systemResult) - } catch (err) { - console.error("[v0] Error fetching system data:", err) - setError("Failed to connect to Flask server. Please check your connection.") - } finally { - setLoading(false) + if (!systemResult) { + setError("Flask server not available. 
Please ensure the server is running.") + return } + + setSystemData(systemResult) + setVmData(vmResult) + setStorageData(storageResults[0]) + setProxmoxStorageData(storageResults[1]) + setNetworkData(networkResult) + + setTimeout(async () => { + const refreshedSystemData = await fetchSystemData() + if (refreshedSystemData) { + setSystemData(refreshedSystemData) + } + }, 2000) } - fetchData() + fetchAllData() - const systemInterval = setInterval(() => { - fetchSystemData().then((data) => { - if (data) setSystemData(data) - }) - }, 9000) // Cambiado de 10000 a 9000ms + const systemInterval = setInterval(async () => { + const data = await fetchSystemData() + if (data) setSystemData(data) + }, 9000) + + const vmInterval = setInterval(async () => { + const data = await fetchVMData() + setVmData(data) + }, 59000) + + const storageInterval = setInterval(async () => { + const [storage, proxmoxStorage] = await Promise.all([fetchStorageData(), fetchProxmoxStorageData()]) + if (storage) setStorageData(storage) + if (proxmoxStorage) setProxmoxStorageData(proxmoxStorage) + }, 59000) + + const networkInterval = setInterval(async () => { + const data = await fetchNetworkData() + if (data) setNetworkData(data) + }, 59000) return () => { clearInterval(systemInterval) - } - }, []) - - useEffect(() => { - const fetchVMs = async () => { - const vmResult = await fetchVMData() - setVmData(vmResult) - } - - fetchVMs() - const vmInterval = setInterval(fetchVMs, 59000) // Cambiado de 60000 a 59000ms - - return () => { clearInterval(vmInterval) - } - }, []) - - useEffect(() => { - const fetchStorage = async () => { - const storageResult = await fetchStorageData() - setStorageData(storageResult) - - const proxmoxStorageResult = await fetchProxmoxStorageData() - setProxmoxStorageData(proxmoxStorageResult) - } - - fetchStorage() - const storageInterval = setInterval(fetchStorage, 59000) // Cambiado de 60000 a 59000ms - - return () => { clearInterval(storageInterval) - } - }, []) - - useEffect(() => { - const fetchNetwork = async () => { - const networkResult = await fetchNetworkData() - setNetworkData(networkResult) - } - - fetchNetwork() - const networkInterval = setInterval(fetchNetwork, 59000) // Cambiado de 60000 a 59000ms - - return () => { clearInterval(networkInterval) } }, []) - if (loading) { + const isInitialLoading = loadingStates.system && !systemData + + if (isInitialLoading) { return (
Connecting to ProxMenux Monitor...
Fetching real-time system data
-
+
{[...Array(4)].map((_, i) => ( @@ -386,12 +373,10 @@ export function SystemOverview() { const formatStorage = (sizeInGB: number): string => { if (sizeInGB < 1) { - // Less than 1 GB, show in MB return `${(sizeInGB * 1024).toFixed(1)} MB` } else if (sizeInGB > 999) { return `${(sizeInGB / 1024).toFixed(2)} TB` } else { - // Between 1 and 999 GB, show in GB return `${sizeInGB.toFixed(2)} GB` } } @@ -402,13 +387,10 @@ export function SystemOverview() { const vmLxcStorages = proxmoxStorageData?.storage.filter( (s) => - // Include only local storage types that can host VMs/LXCs (s.type === "lvm" || s.type === "lvmthin" || s.type === "zfspool" || s.type === "btrfs" || s.type === "dir") && - // Exclude network storage s.type !== "nfs" && s.type !== "cifs" && s.type !== "iscsi" && - // Exclude the "local" storage (used for ISOs/templates) s.name !== "local", ) @@ -474,7 +456,6 @@ export function SystemOverview() { return (
- {/* Key Metrics Cards */}
@@ -524,34 +505,44 @@ export function SystemOverview() { - - Active VM & LXC - + + + + Active VM & LXC + -
{vmStats.running}
-
- - {vmStats.running} Running - - {vmStats.stopped > 0 && ( - - {vmStats.stopped} Stopped - - )} -
-

- Total: {vmStats.vms} VMs, {vmStats.lxc} LXC -

+ {loadingStates.vms ? ( +
+
+
+
+
+ ) : ( + <> +
{vmStats.running}
+
+ + {vmStats.running} Running + + {vmStats.stopped > 0 && ( + + {vmStats.stopped} Stopped + + )} +
+

+ Total: {vmStats.vms} VMs, {vmStats.lxc} LXC +

+ + )}
- {/* Node Metrics Charts */}
- {/* Storage Summary */} @@ -560,8 +551,45 @@ export function SystemOverview() { - {storageData ? ( + {loadingStates.storage ? ( +
+
+
+
+
+ ) : storageData ? (
+ {(() => { + const totalCapacity = (vmLxcStorageTotal || 0) + (localStorage?.total || 0) + const totalUsed = (vmLxcStorageUsed || 0) + (localStorage?.used || 0) + const totalAvailable = (vmLxcStorageAvailable || 0) + (localStorage?.available || 0) + const totalPercent = totalCapacity > 0 ? (totalUsed / totalCapacity) * 100 : 0 + + return totalCapacity > 0 ? ( +
+
+ Total Node Capacity: + {formatStorage(totalCapacity)} +
+ +
+
+ + Used: {formatStorage(totalUsed)} + + + Free: {formatStorage(totalAvailable)} + +
+ {totalPercent.toFixed(1)}% +
+
+ ) : null + })()} +
Total Capacity: @@ -637,7 +665,6 @@ export function SystemOverview() { - {/* Network Summary */} @@ -660,7 +687,13 @@ export function SystemOverview() { - {networkData ? ( + {loadingStates.network ? ( +
+
+
+
+
+ ) : networkData ? (
Active Interfaces: @@ -731,7 +764,6 @@ export function SystemOverview() {
- {/* System Information */}
@@ -764,7 +796,6 @@ export function SystemOverview() { - {/* System Health & Alerts */} diff --git a/AppImage/components/virtual-machines.tsx b/AppImage/components/virtual-machines.tsx index dd0ffcb..a9a4d15 100644 --- a/AppImage/components/virtual-machines.tsx +++ b/AppImage/components/virtual-machines.tsx @@ -139,7 +139,7 @@ const fetcher = async (url: string) => { headers: { "Content-Type": "application/json", }, - signal: AbortSignal.timeout(5000), + signal: AbortSignal.timeout(30000), }) if (!response.ok) { @@ -267,6 +267,8 @@ export function VirtualMachines() { refreshInterval: 23000, revalidateOnFocus: false, revalidateOnReconnect: true, + dedupingInterval: 10000, + errorRetryCount: 2, }) const [selectedVM, setSelectedVM] = useState(null) @@ -287,27 +289,43 @@ export function VirtualMachines() { if (!vmData) return const lxcs = vmData.filter((vm) => vm.type === "lxc") + + if (lxcs.length === 0) return + const configs: Record = {} - await Promise.all( - lxcs.map(async (lxc) => { - try { - const response = await fetch(`/api/vms/${lxc.vmid}`) - if (response.ok) { - const details = await response.json() - if (details.lxc_ip_info?.primary_ip) { - configs[lxc.vmid] = details.lxc_ip_info.primary_ip - } else if (details.config) { - configs[lxc.vmid] = extractIPFromConfig(details.config, details.lxc_ip_info) - } - } - } catch (error) { - console.error(`Error fetching config for LXC ${lxc.vmid}:`, error) - } - }), - ) + const batchSize = 5 + for (let i = 0; i < lxcs.length; i += batchSize) { + const batch = lxcs.slice(i, i + batchSize) - setVmConfigs(configs) + await Promise.all( + batch.map(async (lxc) => { + try { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), 10000) + + const response = await fetch(`/api/vms/${lxc.vmid}`, { + signal: controller.signal, + }) + + clearTimeout(timeoutId) + + if (response.ok) { + const details = await response.json() + if (details.lxc_ip_info?.primary_ip) { + configs[lxc.vmid] = details.lxc_ip_info.primary_ip + } else if (details.config) { + configs[lxc.vmid] = extractIPFromConfig(details.config, details.lxc_ip_info) + } + } + } catch (error) { + console.log(`[v0] Could not fetch IP for LXC ${lxc.vmid}`) + } + }), + ) + + setVmConfigs((prev) => ({ ...prev, ...configs })) + } } fetchLXCIPs() diff --git a/AppImage/hooks/use-mobile.tsx b/AppImage/hooks/use-mobile.tsx new file mode 100644 index 0000000..5362201 --- /dev/null +++ b/AppImage/hooks/use-mobile.tsx @@ -0,0 +1,23 @@ +"use client" + +import { useEffect, useState } from "react" + +export function useIsMobile() { + const [isMobile, setIsMobile] = useState(false) + + useEffect(() => { + const checkMobile = () => { + setIsMobile(window.innerWidth < 768) + } + + // Check on mount + checkMobile() + + // Listen for resize + window.addEventListener("resize", checkMobile) + + return () => window.removeEventListener("resize", checkMobile) + }, []) + + return isMobile +} diff --git a/AppImage/lib/polling-config.tsx b/AppImage/lib/polling-config.tsx deleted file mode 100644 index b0becb3..0000000 --- a/AppImage/lib/polling-config.tsx +++ /dev/null @@ -1,85 +0,0 @@ -"use client" - -import { createContext, useContext, useState, useEffect, type ReactNode } from "react" - -export interface PollingIntervals { - storage: number - network: number - vms: number - hardware: number -} - -// Default intervals in milliseconds -const DEFAULT_INTERVALS: PollingIntervals = { - storage: 60000, // 60 seconds - network: 60000, // 60 seconds - vms: 30000, // 30 seconds - hardware: 
60000, // 60 seconds -} - -const STORAGE_KEY = "proxmenux_polling_intervals" - -interface PollingConfigContextType { - intervals: PollingIntervals - updateInterval: (key: keyof PollingIntervals, value: number) => void -} - -const PollingConfigContext = createContext(undefined) - -export function PollingConfigProvider({ children }: { children: ReactNode }) { - const [intervals, setIntervals] = useState(DEFAULT_INTERVALS) - - // Load from localStorage on mount - useEffect(() => { - if (typeof window === "undefined") return - - const stored = localStorage.getItem(STORAGE_KEY) - if (stored) { - try { - const parsed = JSON.parse(stored) - setIntervals({ ...DEFAULT_INTERVALS, ...parsed }) - } catch (e) { - console.error("[v0] Failed to parse stored polling intervals:", e) - } - } - }, []) - - const updateInterval = (key: keyof PollingIntervals, value: number) => { - setIntervals((prev) => { - const newIntervals = { ...prev, [key]: value } - if (typeof window !== "undefined") { - localStorage.setItem(STORAGE_KEY, JSON.stringify(newIntervals)) - } - return newIntervals - }) - } - - return {children} -} - -export function usePollingConfig() { - const context = useContext(PollingConfigContext) - if (!context) { - // During SSR or when provider is not available, return defaults - if (typeof window === "undefined") { - return { - intervals: DEFAULT_INTERVALS, - updateInterval: () => {}, - } - } - throw new Error("usePollingConfig must be used within PollingConfigProvider") - } - return context -} - -// Interval options for the UI (in milliseconds) -export const INTERVAL_OPTIONS = [ - { label: "10 seconds", value: 10000 }, - { label: "30 seconds", value: 30000 }, - { label: "1 minute", value: 60000 }, - { label: "2 minutes", value: 120000 }, - { label: "5 minutes", value: 300000 }, - { label: "10 minutes", value: 600000 }, - { label: "30 minutes", value: 1800000 }, - { label: "1 hour", value: 3600000 }, -] diff --git a/AppImage/package.json b/AppImage/package.json index 0481154..d2bcac5 100644 --- a/AppImage/package.json +++ b/AppImage/package.json @@ -1,6 +1,6 @@ { "name": "proxmenux-monitor", - "version": "1.0.0", + "version": "1.0.1", "description": "Proxmox System Monitoring Dashboard", "private": true, "scripts": { diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh index 54c4898..76d2045 100644 --- a/AppImage/scripts/build_appimage.sh +++ b/AppImage/scripts/build_appimage.sh @@ -81,7 +81,9 @@ cp "$SCRIPT_DIR/flask_server.py" "$APP_DIR/usr/bin/" cp "$SCRIPT_DIR/flask_auth_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_auth_routes.py not found" cp "$SCRIPT_DIR/auth_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ auth_manager.py not found" cp "$SCRIPT_DIR/health_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ health_monitor.py not found" +cp "$SCRIPT_DIR/health_persistence.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ health_persistence.py not found" cp "$SCRIPT_DIR/flask_health_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_health_routes.py not found" +cp "$SCRIPT_DIR/flask_proxmenux_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_proxmenux_routes.py not found" echo "📋 Adding translation support..." 
cat > "$APP_DIR/usr/bin/translate_cli.py" << 'PYEOF' diff --git a/AppImage/scripts/flask_health_routes.py b/AppImage/scripts/flask_health_routes.py index 86612df..31e4150 100644 --- a/AppImage/scripts/flask_health_routes.py +++ b/AppImage/scripts/flask_health_routes.py @@ -1,9 +1,10 @@ """ -Flask routes for health monitoring +Flask routes for health monitoring with persistence support """ -from flask import Blueprint, jsonify +from flask import Blueprint, jsonify, request from health_monitor import health_monitor +from health_persistence import health_persistence health_bp = Blueprint('health', __name__) @@ -29,11 +30,45 @@ def get_health_details(): def get_system_info(): """ Get lightweight system info for header display. - Returns: hostname, uptime, and cached health status. - This is optimized for minimal server impact. + Returns: hostname, uptime, and health status with proper structure. """ try: info = health_monitor.get_system_info() + + if 'health' in info: + status_map = { + 'OK': 'healthy', + 'WARNING': 'warning', + 'CRITICAL': 'critical', + 'UNKNOWN': 'warning' + } + current_status = info['health'].get('status', 'OK').upper() + info['health']['status'] = status_map.get(current_status, 'healthy') + return jsonify(info) except Exception as e: return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/acknowledge', methods=['POST']) +def acknowledge_error(): + """Acknowledge an error manually (user dismissed it)""" + try: + data = request.get_json() + if not data or 'error_key' not in data: + return jsonify({'error': 'error_key is required'}), 400 + + error_key = data['error_key'] + health_persistence.acknowledge_error(error_key) + return jsonify({'success': True, 'message': 'Error acknowledged'}) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/active-errors', methods=['GET']) +def get_active_errors(): + """Get all active persistent errors""" + try: + category = request.args.get('category') + errors = health_persistence.get_active_errors(category) + return jsonify({'errors': errors}) + except Exception as e: + return jsonify({'error': str(e)}), 500 diff --git a/AppImage/scripts/flask_proxmenux_routes.py b/AppImage/scripts/flask_proxmenux_routes.py new file mode 100644 index 0000000..6e48e3f --- /dev/null +++ b/AppImage/scripts/flask_proxmenux_routes.py @@ -0,0 +1,75 @@ +from flask import Blueprint, jsonify +import json +import os + +proxmenux_bp = Blueprint('proxmenux', __name__) + +# Tool descriptions mapping +TOOL_DESCRIPTIONS = { + 'lvm_repair': 'LVM PV Headers Repair', + 'repo_cleanup': 'Repository Cleanup', + 'subscription_banner': 'Subscription Banner Removal', + 'time_sync': 'Time Synchronization', + 'apt_languages': 'APT Language Skip', + 'journald': 'Journald Optimization', + 'logrotate': 'Logrotate Optimization', + 'system_limits': 'System Limits Increase', + 'entropy': 'Entropy Generation (haveged)', + 'memory_settings': 'Memory Settings Optimization', + 'kernel_panic': 'Kernel Panic Configuration', + 'apt_ipv4': 'APT IPv4 Force', + 'kexec': 'kexec for quick reboots', + 'network_optimization': 'Network Optimizations', + 'bashrc_custom': 'Bashrc Customization', + 'figurine': 'Figurine', + 'fastfetch': 'Fastfetch', + 'log2ram': 'Log2ram (SSD Protection)', + 'amd_fixes': 'AMD CPU (Ryzen/EPYC) fixes', + 'persistent_network': 'Setting persistent network interfaces' +} + +@proxmenux_bp.route('/api/proxmenux/installed-tools', methods=['GET']) +def get_installed_tools(): + """Get list of installed ProxMenux 
tools/optimizations""" + installed_tools_path = '/usr/local/share/proxmenux/installed_tools.json' + + try: + if not os.path.exists(installed_tools_path): + return jsonify({ + 'success': True, + 'installed_tools': [], + 'message': 'No ProxMenux optimizations installed yet' + }) + + with open(installed_tools_path, 'r') as f: + data = json.load(f) + + # Convert to list format with descriptions + tools = [] + for tool_key, enabled in data.items(): + if enabled: # Only include enabled tools + tools.append({ + 'key': tool_key, + 'name': TOOL_DESCRIPTIONS.get(tool_key, tool_key.replace('_', ' ').title()), + 'enabled': enabled + }) + + # Sort alphabetically by name + tools.sort(key=lambda x: x['name']) + + return jsonify({ + 'success': True, + 'installed_tools': tools, + 'total_count': len(tools) + }) + + except json.JSONDecodeError: + return jsonify({ + 'success': False, + 'error': 'Invalid JSON format in installed_tools.json' + }), 500 + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index a726c5d..7742e21 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -34,12 +34,14 @@ from flask_health_routes import health_bp sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from flask_auth_routes import auth_bp +from flask_proxmenux_routes import proxmenux_bp app = Flask(__name__) CORS(app) # Enable CORS for Next.js frontend app.register_blueprint(auth_bp) app.register_blueprint(health_bp) +app.register_blueprint(proxmenux_bp) @@ -1836,8 +1838,15 @@ def get_interface_type(interface_name): if '.' in interface_name: return 'vlan' - # Check if it's a physical interface - if interface_name.startswith(('enp', 'eth', 'eno', 'ens', 'wlan', 'wlp')): + # Check if interface has a real device symlink in /sys/class/net + # This catches all physical interfaces including USB, regardless of naming + sys_path = f'/sys/class/net/{interface_name}/device' + if os.path.exists(sys_path): + # It's a physical interface (PCI, USB, etc.) 
+ return 'physical' + + # This handles cases where /sys might not be available + if interface_name.startswith(('enp', 'eth', 'eno', 'ens', 'enx', 'wlan', 'wlp', 'wlo', 'usb')): return 'physical' # Default to skip for unknown types @@ -2851,7 +2860,7 @@ def get_detailed_gpu_info(gpu): clients = best_json['clients'] processes = [] - for client_id, client_data in clients: + for client_id, client_data in clients.items(): process_info = { 'name': client_data.get('name', 'Unknown'), 'pid': client_data.get('pid', 'Unknown'), @@ -3302,6 +3311,9 @@ def get_detailed_gpu_info(gpu): data_retrieved = False + # CHANGE: Initialize sensors variable to None to avoid UnboundLocalError + sensors = None + # Parse temperature (Edge Temperature from sensors) if 'sensors' in device: sensors = device['sensors'] @@ -3313,15 +3325,16 @@ def get_detailed_gpu_info(gpu): pass data_retrieved = True + # CHANGE: Added check to ensure sensors is not None before accessing # Parse power draw (GFX Power or average_socket_power) - if 'GFX Power' in sensors: + if sensors and 'GFX Power' in sensors: gfx_power = sensors['GFX Power'] if 'value' in gfx_power: detailed_info['power_draw'] = f"{gfx_power['value']:.2f} W" # print(f"[v0] Power Draw: {detailed_info['power_draw']}", flush=True) pass data_retrieved = True - elif 'average_socket_power' in sensors: + elif sensors and 'average_socket_power' in sensors: socket_power = sensors['average_socket_power'] if 'value' in socket_power: detailed_info['power_draw'] = f"{socket_power['value']:.2f} W" @@ -4910,7 +4923,7 @@ def api_logs(): 'pid': log_entry.get('_PID', ''), 'hostname': log_entry.get('_HOSTNAME', '') }) - except (json.JSONDecodeError, ValueError) as e: + except (json.JSONDecodeError, ValueError): continue return jsonify({'logs': logs, 'total': len(logs)}) else: diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 8a175a3..1889cd4 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -4,7 +4,7 @@ Provides comprehensive, lightweight health checks for Proxmox systems. Optimized for minimal system impact with intelligent thresholds and hysteresis. Author: MacRimi -Version: 1.1 (Optimized for minimal overhead) +Version: 1.2 (Always returns all 10 categories) """ import psutil @@ -15,20 +15,23 @@ import os from typing import Dict, List, Any, Tuple, Optional from datetime import datetime, timedelta from collections import defaultdict +import re + +from health_persistence import health_persistence class HealthMonitor: """ Monitors system health across multiple components with minimal impact. - Implements hysteresis, intelligent caching, and progressive escalation. - Only reports problems, not verbose OK statuses. + Implements hysteresis, intelligent caching, progressive escalation, and persistent error tracking. + Always returns all 10 health categories. 
""" # CPU Thresholds CPU_WARNING = 85 CPU_CRITICAL = 95 CPU_RECOVERY = 75 - CPU_WARNING_DURATION = 60 - CPU_CRITICAL_DURATION = 120 + CPU_WARNING_DURATION = 300 # 5 minutes sustained + CPU_CRITICAL_DURATION = 300 # 5 minutes sustained CPU_RECOVERY_DURATION = 120 # Memory Thresholds @@ -64,12 +67,31 @@ class HealthMonitor: UPDATES_WARNING = 10 UPDATES_CRITICAL = 30 - # Critical keywords for immediate escalation + # Known benign errors from Proxmox that should not trigger alerts + BENIGN_ERROR_PATTERNS = [ + r'got inotify poll request in wrong process', + r'auth key pair too old, rotating', + r'proxy detected vanished client connection', + r'worker \d+ finished', + r'connection timed out', + r'disconnect peer', + ] + CRITICAL_LOG_KEYWORDS = [ - 'I/O error', 'EXT4-fs error', 'XFS', 'LVM activation failed', - 'md/raid: device failed', 'Out of memory', 'kernel panic', - 'filesystem read-only', 'cannot mount', 'failed to start', - 'task hung', 'oom_kill' + 'out of memory', 'oom_kill', 'kernel panic', + 'filesystem read-only', 'cannot mount', + 'raid.*failed', 'md.*device failed', + 'ext4-fs error', 'xfs.*corruption', + 'lvm activation failed', + 'hardware error', 'mce:', + 'segfault', 'general protection fault' + ] + + WARNING_LOG_KEYWORDS = [ + 'i/o error', 'ata error', 'scsi error', + 'task hung', 'blocked for more than', + 'failed to start', 'service.*failed', + 'disk.*offline', 'disk.*removed' ] # PVE Critical Services @@ -84,6 +106,11 @@ class HealthMonitor: self.io_error_history = defaultdict(list) self.failed_vm_history = set() # Track VMs that failed to start + try: + health_persistence.cleanup_old_errors() + except Exception as e: + print(f"[HealthMonitor] Cleanup warning: {e}") + def get_system_info(self) -> Dict[str, Any]: """ Get lightweight system info for header display. @@ -103,7 +130,7 @@ class HealthMonitor: return { 'hostname': hostname, 'uptime_seconds': int(uptime_seconds), - 'uptime_formatted': self._format_uptime(uptime_seconds), + 'uptime': self._format_uptime(uptime_seconds), 'health': health_status, 'timestamp': datetime.now().isoformat() } @@ -111,7 +138,7 @@ class HealthMonitor: return { 'hostname': 'unknown', 'uptime_seconds': 0, - 'uptime_formatted': 'Unknown', + 'uptime': 'Unknown', 'health': {'status': 'UNKNOWN', 'summary': f'Error: {str(e)}'}, 'timestamp': datetime.now().isoformat() } @@ -186,100 +213,124 @@ class HealthMonitor: def get_detailed_status(self) -> Dict[str, Any]: """ Get comprehensive health status with all checks. - Returns JSON structure matching the specification. - OPTIMIZED: Only shows problems, not verbose OK messages. + Returns JSON structure with ALL 10 categories always present. + Now includes persistent error tracking. 
""" - details = {} + active_errors = health_persistence.get_active_errors() + persistent_issues = {err['error_key']: err for err in active_errors} + + details = { + 'cpu': {'status': 'OK'}, + 'memory': {'status': 'OK'}, + 'storage': {'status': 'OK'}, + 'disks': {'status': 'OK'}, + 'network': {'status': 'OK'}, + 'vms': {'status': 'OK'}, + 'services': {'status': 'OK'}, + 'logs': {'status': 'OK'}, + 'updates': {'status': 'OK'}, + 'security': {'status': 'OK'} + } + critical_issues = [] warning_issues = [] + info_issues = [] # Added info_issues to track INFO separately # Priority 1: Services PVE services_status = self._check_pve_services() - if services_status['status'] != 'OK': - details['services'] = services_status - if services_status['status'] == 'CRITICAL': - critical_issues.append(services_status.get('reason', 'Service failure')) - elif services_status['status'] == 'WARNING': - warning_issues.append(services_status.get('reason', 'Service issue')) + details['services'] = services_status + if services_status['status'] == 'CRITICAL': + critical_issues.append(services_status.get('reason', 'Service failure')) + elif services_status['status'] == 'WARNING': + warning_issues.append(services_status.get('reason', 'Service issue')) + # Priority 2: Storage storage_status = self._check_storage_optimized() - if storage_status and storage_status.get('status') != 'OK': + if storage_status: details['storage'] = storage_status if storage_status.get('status') == 'CRITICAL': critical_issues.append(storage_status.get('reason', 'Storage failure')) elif storage_status.get('status') == 'WARNING': warning_issues.append(storage_status.get('reason', 'Storage issue')) + # Priority 3: Disks disks_status = self._check_disks_optimized() - if disks_status and disks_status.get('status') != 'OK': + if disks_status: details['disks'] = disks_status if disks_status.get('status') == 'CRITICAL': critical_issues.append(disks_status.get('reason', 'Disk failure')) elif disks_status.get('status') == 'WARNING': warning_issues.append(disks_status.get('reason', 'Disk issue')) - vms_status = self._check_vms_cts_optimized() - if vms_status and vms_status.get('status') != 'OK': + # Priority 4: VMs/CTs - now with persistence + vms_status = self._check_vms_cts_with_persistence() + if vms_status: details['vms'] = vms_status if vms_status.get('status') == 'CRITICAL': critical_issues.append(vms_status.get('reason', 'VM/CT failure')) elif vms_status.get('status') == 'WARNING': warning_issues.append(vms_status.get('reason', 'VM/CT issue')) + # Priority 5: Network network_status = self._check_network_optimized() - if network_status and network_status.get('status') != 'OK': + if network_status: details['network'] = network_status if network_status.get('status') == 'CRITICAL': critical_issues.append(network_status.get('reason', 'Network failure')) elif network_status.get('status') == 'WARNING': warning_issues.append(network_status.get('reason', 'Network issue')) - # Priority 5: CPU/RAM (solo si hay problemas) + # Priority 6: CPU cpu_status = self._check_cpu_with_hysteresis() - if cpu_status.get('status') != 'OK': - details['cpu'] = cpu_status - if cpu_status.get('status') == 'WARNING': - warning_issues.append(cpu_status.get('reason', 'CPU high')) - elif cpu_status.get('status') == 'CRITICAL': - critical_issues.append(cpu_status.get('reason', 'CPU critical')) + details['cpu'] = cpu_status + if cpu_status.get('status') == 'WARNING': + warning_issues.append(cpu_status.get('reason', 'CPU high')) + elif cpu_status.get('status') == 'CRITICAL': + 
critical_issues.append(cpu_status.get('reason', 'CPU critical')) + # Priority 7: Memory memory_status = self._check_memory_comprehensive() - if memory_status.get('status') != 'OK': - details['memory'] = memory_status - if memory_status.get('status') == 'CRITICAL': - critical_issues.append(memory_status.get('reason', 'Memory critical')) - elif memory_status.get('status') == 'WARNING': - warning_issues.append(memory_status.get('reason', 'Memory high')) + details['memory'] = memory_status + if memory_status.get('status') == 'CRITICAL': + critical_issues.append(memory_status.get('reason', 'Memory critical')) + elif memory_status.get('status') == 'WARNING': + warning_issues.append(memory_status.get('reason', 'Memory high')) - # Priority 6: Logs (solo errores críticos) - logs_status = self._check_logs_lightweight() - if logs_status.get('status') != 'OK': + # Priority 8: Logs - now with persistence + logs_status = self._check_logs_with_persistence() + if logs_status: details['logs'] = logs_status if logs_status.get('status') == 'CRITICAL': critical_issues.append(logs_status.get('reason', 'Critical log errors')) elif logs_status.get('status') == 'WARNING': warning_issues.append(logs_status.get('reason', 'Log warnings')) + # Priority 9: Updates updates_status = self._check_updates() - if updates_status and updates_status.get('status') != 'OK': + if updates_status: details['updates'] = updates_status if updates_status.get('status') == 'WARNING': warning_issues.append(updates_status.get('reason', 'Updates pending')) + elif updates_status.get('status') == 'INFO': + info_issues.append(updates_status.get('reason', 'Informational update')) - # Priority 7: Security (solo problemas) + # Priority 10: Security security_status = self._check_security() - if security_status.get('status') != 'OK': - details['security'] = security_status - if security_status.get('status') == 'WARNING': - warning_issues.append(security_status.get('reason', 'Security issue')) + details['security'] = security_status + if security_status.get('status') == 'WARNING': + warning_issues.append(security_status.get('reason', 'Security issue')) + elif security_status.get('status') == 'INFO': + info_issues.append(security_status.get('reason', 'Security info')) - # Determine overall status if critical_issues: overall = 'CRITICAL' summary = '; '.join(critical_issues[:3]) elif warning_issues: overall = 'WARNING' summary = '; '.join(warning_issues[:3]) + elif info_issues: + overall = 'OK' # INFO is still healthy overall + summary = '; '.join(info_issues[:3]) else: overall = 'OK' summary = 'All systems operational' @@ -292,7 +343,7 @@ class HealthMonitor: } def _check_cpu_with_hysteresis(self) -> Dict[str, Any]: - """Check CPU with hysteresis to avoid flapping alerts""" + """Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage""" try: cpu_percent = psutil.cpu_percent(interval=1) current_time = time.time() @@ -305,33 +356,33 @@ class HealthMonitor: self.state_history[state_key] = [ entry for entry in self.state_history[state_key] - if current_time - entry['time'] < 300 + if current_time - entry['time'] < 360 ] - critical_duration = sum( - 1 for entry in self.state_history[state_key] + critical_samples = [ + entry for entry in self.state_history[state_key] if entry['value'] >= self.CPU_CRITICAL and current_time - entry['time'] <= self.CPU_CRITICAL_DURATION - ) + ] - warning_duration = sum( - 1 for entry in self.state_history[state_key] + warning_samples = [ + entry for entry in self.state_history[state_key] if 
entry['value'] >= self.CPU_WARNING and current_time - entry['time'] <= self.CPU_WARNING_DURATION - ) + ] - recovery_duration = sum( - 1 for entry in self.state_history[state_key] + recovery_samples = [ + entry for entry in self.state_history[state_key] if entry['value'] < self.CPU_RECOVERY and current_time - entry['time'] <= self.CPU_RECOVERY_DURATION - ) + ] - if critical_duration >= 2: + if len(critical_samples) >= 3: status = 'CRITICAL' - reason = f'CPU >{self.CPU_CRITICAL}% for {self.CPU_CRITICAL_DURATION}s' - elif warning_duration >= 2 and recovery_duration < 2: + reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s' + elif len(warning_samples) >= 3 and len(recovery_samples) < 2: status = 'WARNING' - reason = f'CPU >{self.CPU_WARNING}% for {self.CPU_WARNING_DURATION}s' + reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s' else: status = 'OK' reason = None @@ -362,7 +413,7 @@ class HealthMonitor: return {'status': 'UNKNOWN', 'reason': f'CPU check failed: {str(e)}'} def _check_cpu_temperature(self) -> Optional[Dict[str, Any]]: - """Check CPU temperature (cached, max 1 check per minute)""" + """Check CPU temperature with hysteresis (5 min sustained) - cached, max 1 check per minute""" cache_key = 'cpu_temp' current_time = time.time() @@ -391,12 +442,38 @@ class HealthMonitor: if temps: max_temp = max(temps) - if max_temp >= self.TEMP_CRITICAL: + state_key = 'cpu_temp_history' + self.state_history[state_key].append({ + 'value': max_temp, + 'time': current_time + }) + + # Keep last 6 minutes of data + self.state_history[state_key] = [ + entry for entry in self.state_history[state_key] + if current_time - entry['time'] < 360 + ] + + # Check sustained high temperature (5 minutes) + critical_temp_samples = [ + entry for entry in self.state_history[state_key] + if entry['value'] >= self.TEMP_CRITICAL and + current_time - entry['time'] <= 300 + ] + + warning_temp_samples = [ + entry for entry in self.state_history[state_key] + if entry['value'] >= self.TEMP_WARNING and + current_time - entry['time'] <= 300 + ] + + # Require at least 3 samples over 5 minutes to trigger alert + if len(critical_temp_samples) >= 3: status = 'CRITICAL' - reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_CRITICAL}°C' - elif max_temp >= self.TEMP_WARNING: + reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_CRITICAL}°C sustained >5min' + elif len(warning_temp_samples) >= 3: status = 'WARNING' - reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_WARNING}°C' + reason = f'CPU temperature {max_temp}°C ≥{self.TEMP_WARNING}°C sustained >5min' else: status = 'OK' reason = None @@ -419,7 +496,10 @@ class HealthMonitor: return None def _check_memory_comprehensive(self) -> Dict[str, Any]: - """Check memory including RAM and swap with sustained thresholds""" + """ + Check memory including RAM and swap with realistic thresholds. + Only alerts on truly problematic memory situations. 
+ """ try: memory = psutil.virtual_memory() swap = psutil.swap_memory() @@ -444,7 +524,7 @@ class HealthMonitor: mem_critical = sum( 1 for entry in self.state_history[state_key] - if entry['mem_percent'] >= self.MEMORY_CRITICAL and + if entry['mem_percent'] >= 90 and current_time - entry['time'] <= self.MEMORY_DURATION ) @@ -456,28 +536,20 @@ class HealthMonitor: swap_critical = sum( 1 for entry in self.state_history[state_key] - if entry['swap_vs_ram'] > self.SWAP_CRITICAL_PERCENT and + if entry['swap_vs_ram'] > 20 and current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION ) - swap_warning = sum( - 1 for entry in self.state_history[state_key] - if entry['swap_percent'] > 0 and - current_time - entry['time'] <= self.SWAP_WARNING_DURATION - ) if mem_critical >= 2: status = 'CRITICAL' - reason = f'RAM >{self.MEMORY_CRITICAL}% for {self.MEMORY_DURATION}s' + reason = f'RAM >90% for {self.MEMORY_DURATION}s' elif swap_critical >= 2: status = 'CRITICAL' - reason = f'Swap >{self.SWAP_CRITICAL_PERCENT}% of RAM for {self.SWAP_CRITICAL_DURATION}s' + reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)' elif mem_warning >= 2: status = 'WARNING' reason = f'RAM >{self.MEMORY_WARNING}% for {self.MEMORY_DURATION}s' - elif swap_warning >= 2: - status = 'WARNING' - reason = f'Swap active for >{self.SWAP_WARNING_DURATION}s' else: status = 'OK' reason = None @@ -498,45 +570,91 @@ class HealthMonitor: except Exception as e: return {'status': 'UNKNOWN', 'reason': f'Memory check failed: {str(e)}'} - def _check_storage_optimized(self) -> Optional[Dict[str, Any]]: + def _check_storage_optimized(self) -> Dict[str, Any]: """ - Optimized storage check - only reports problems. - Checks critical mounts, LVM, and Proxmox storages. + Optimized storage check - monitors Proxmox storages from pvesm status. + Checks for inactive storages and disk health from SMART/events. 
""" issues = [] storage_details = {} - # Check critical filesystems - critical_mounts = ['/', '/var/lib/vz'] + try: + result = subprocess.run( + ['pvesm', 'status'], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode == 0: + lines = result.stdout.strip().split('\n')[1:] # Skip header + for line in lines: + parts = line.split() + if len(parts) >= 4: + storage_name = parts[0] + storage_type = parts[1] + enabled = parts[2] + active = parts[3] + + if enabled == '1' and active == '0': + issues.append(f'{storage_name}: Inactive') + storage_details[storage_name] = { + 'status': 'CRITICAL', + 'reason': 'Storage inactive', + 'type': storage_type + } + except Exception as e: + # If pvesm not available, skip silently + pass + + # Check disk health from Proxmox task log or system logs + disk_health_issues = self._check_disk_health_from_events() + if disk_health_issues: + for disk, issue in disk_health_issues.items(): + issues.append(f'{disk}: {issue["reason"]}') + storage_details[disk] = issue + + critical_mounts = ['/'] for mount_point in critical_mounts: - if not os.path.exists(mount_point): - issues.append(f'{mount_point} not mounted') - storage_details[mount_point] = { - 'status': 'CRITICAL', - 'reason': 'Not mounted' - } - continue - - fs_status = self._check_filesystem(mount_point) - if fs_status['status'] != 'OK': - issues.append(f"{mount_point}: {fs_status['reason']}") - storage_details[mount_point] = fs_status + try: + result = subprocess.run( + ['mountpoint', '-q', mount_point], + capture_output=True, + timeout=2 + ) + + if result.returncode != 0: + issues.append(f'{mount_point}: Not mounted') + storage_details[mount_point] = { + 'status': 'CRITICAL', + 'reason': 'Not mounted' + } + continue + + # Check if read-only + with open('/proc/mounts', 'r') as f: + for line in f: + parts = line.split() + if len(parts) >= 4 and parts[1] == mount_point: + options = parts[3].split(',') + if 'ro' in options: + issues.append(f'{mount_point}: Mounted read-only') + storage_details[mount_point] = { + 'status': 'CRITICAL', + 'reason': 'Mounted read-only' + } + break # Found it, no need to check further for this mountpoint + + # Check filesystem usage only if not already flagged as critical + if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK': + fs_status = self._check_filesystem(mount_point) + if fs_status['status'] != 'OK': + issues.append(f"{mount_point}: {fs_status['reason']}") + storage_details[mount_point] = fs_status + except Exception: + pass - # Check LVM - lvm_status = self._check_lvm() - if lvm_status and lvm_status.get('status') != 'OK': - issues.append(lvm_status.get('reason', 'LVM issue')) - storage_details['lvm'] = lvm_status - - # Check Proxmox storages (PBS, NFS, etc) - pve_storages = self._check_proxmox_storages() - for storage_name, storage_data in pve_storages.items(): - if storage_data.get('status') != 'OK': - issues.append(f"{storage_name}: {storage_data.get('reason', 'Storage issue')}") - storage_details[storage_name] = storage_data - - # If no issues, return None (optimized) if not issues: return {'status': 'OK'} @@ -552,30 +670,6 @@ class HealthMonitor: def _check_filesystem(self, mount_point: str) -> Dict[str, Any]: """Check individual filesystem for space and mount status""" try: - result = subprocess.run( - ['mountpoint', '-q', mount_point], - capture_output=True, - timeout=2 - ) - - if result.returncode != 0: - return { - 'status': 'CRITICAL', - 'reason': 'Not mounted' - } - - # Check if read-only - with 
-                for line in f:
-                    parts = line.split()
-                    if len(parts) >= 4 and parts[1] == mount_point:
-                        options = parts[3].split(',')
-                        if 'ro' in options:
-                            return {
-                                'status': 'CRITICAL',
-                                'reason': 'Mounted read-only'
-                            }
-
             usage = psutil.disk_usage(mount_point)
             percent = usage.percent
@@ -605,8 +699,8 @@ class HealthMonitor:
                 'reason': f'Check failed: {str(e)}'
             }
 
-    def _check_lvm(self) -> Optional[Dict[str, Any]]:
-        """Check LVM volumes, especially local-lvm"""
+    def _check_lvm(self) -> Dict[str, Any]:
+        """Check LVM volumes - improved detection"""
         try:
             result = subprocess.run(
                 ['lvs', '--noheadings', '--options', 'lv_name,vg_name,lv_attr'],
@@ -616,10 +710,9 @@ class HealthMonitor:
             )
 
             if result.returncode != 0:
-                return None
+                return {'status': 'OK'}
 
             volumes = []
-            local_lvm_found = False
 
             for line in result.stdout.strip().split('\n'):
                 if line.strip():
@@ -628,20 +721,11 @@ class HealthMonitor:
                     lv_name = parts[0].strip()
                     vg_name = parts[1].strip()
                     volumes.append(f'{vg_name}/{lv_name}')
-
-                    if 'local-lvm' in lv_name or 'local-lvm' in vg_name:
-                        local_lvm_found = True
 
-            if volumes and not local_lvm_found:
-                return {
-                    'status': 'CRITICAL',
-                    'reason': 'local-lvm volume not found'
-                }
-
-            return {'status': 'OK'}
+            return {'status': 'OK', 'volumes': len(volumes)}
 
         except Exception:
-            return None
+            return {'status': 'OK'}
 
     def _check_proxmox_storages(self) -> Dict[str, Any]:
         """Check Proxmox-specific storages (only report problems)"""
@@ -680,9 +764,9 @@ class HealthMonitor:
 
         return storages
 
-    def _check_disks_optimized(self) -> Optional[Dict[str, Any]]:
+    def _check_disks_optimized(self) -> Dict[str, Any]:
         """
-        Optimized disk check - only reports I/O errors and SMART issues.
+        Optimized disk check - always returns status.
         """
         current_time = time.time()
         disk_issues = {}
@@ -725,7 +809,6 @@ class HealthMonitor:
                         'reason': f'{error_count} I/O error(s) in 5 minutes'
                     }
 
-            # If no issues, return OK
             if not disk_issues:
                 return {'status': 'OK'}
@@ -738,12 +821,11 @@ class HealthMonitor:
             }
 
         except Exception:
-            return None
+            return {'status': 'OK'}
 
-    def _check_network_optimized(self) -> Optional[Dict[str, Any]]:
+    def _check_network_optimized(self) -> Dict[str, Any]:
         """
-        Optimized network check - only reports problems.
-        Checks interfaces down, no connectivity.
+        Optimized network check - always returns status.
         """
         try:
             issues = []
@@ -770,7 +852,6 @@ class HealthMonitor:
                 issues.append(latency_status.get('reason', 'Network latency issue'))
                 interface_details['connectivity'] = latency_status
 
-            # If no issues, return OK
             if not issues:
                 return {'status': 'OK'}
@@ -783,7 +864,7 @@ class HealthMonitor:
             }
 
         except Exception:
-            return None
+            return {'status': 'OK'}
 
     def _check_network_latency(self) -> Optional[Dict[str, Any]]:
         """Check network latency to 1.1.1.1 (cached)"""
@@ -843,18 +924,17 @@ class HealthMonitor:
         except Exception:
             return None
 
-    def _check_vms_cts_optimized(self) -> Optional[Dict[str, Any]]:
+    def _check_vms_cts_optimized(self) -> Dict[str, Any]:
         """
-        Optimized VM/CT check - only reports failed starts.
-        Checks logs for VMs/CTs that failed to start.
+        Optimized VM/CT check - detects qmp failures and startup errors from logs.
+        Improved detection of container and VM errors from journalctl.
         """
         try:
             issues = []
             vm_details = {}
 
-            # Check logs for failed VM/CT starts
             result = subprocess.run(
-                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-u', 'pve*'],
+                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
                 capture_output=True,
                 text=True,
                 timeout=3
@@ -864,34 +944,208 @@
                 for line in result.stdout.split('\n'):
                     line_lower = line.lower()
 
-                    # Detect VM/CT start failures
-                    if 'failed to start' in line_lower or 'error starting' in line_lower or \
-                       'start error' in line_lower or 'cannot start' in line_lower:
-                        # Extract VM/CT ID
-                        for word in line.split():
-                            if word.isdigit() and len(word) <= 4:
-                                vmid = word
-                                if vmid not in self.failed_vm_history:
-                                    self.failed_vm_history.add(vmid)
-                                    issues.append(f'VM/CT {vmid} failed to start')
-                                    vm_details[f'vmct_{vmid}'] = {
-                                        'status': 'CRITICAL',
-                                        'reason': 'Failed to start'
-                                    }
-                                break
+                    vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
+                    if vm_qmp_match:
+                        vmid = vm_qmp_match.group(1)
+                        key = f'vm_{vmid}'
+                        if key not in vm_details:
+                            issues.append(f'VM {vmid}: Communication issue')
+                            vm_details[key] = {
+                                'status': 'WARNING',
+                                'reason': 'QMP command timeout',
+                                'id': vmid,
+                                'type': 'VM'
+                            }
+                        continue
+
+                    ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
+                    if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower):
+                        ctid = ct_error_match.group(1)
+                        key = f'ct_{ctid}'
+                        if key not in vm_details:
+                            if 'device' in line_lower and 'does not exist' in line_lower:
+                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
+                                if device_match:
+                                    reason = f'Device {device_match.group(1)} missing'
+                                else:
+                                    reason = 'Device error'
+                            elif 'failed to start' in line_lower:
+                                reason = 'Failed to start'
+                            else:
+                                reason = 'Container error'
+
+                            issues.append(f'CT {ctid}: {reason}')
+                            vm_details[key] = {
+                                'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL',
+                                'reason': reason,
+                                'id': ctid,
+                                'type': 'CT'
+                            }
+                        continue
+
+                    vzstart_match = re.search(r'vzstart:(\d+):', line)
+                    if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
+                        ctid = vzstart_match.group(1)
+                        key = f'ct_{ctid}'
+                        if key not in vm_details:
+                            # Extract the error message
+                            if 'device' in line_lower and 'does not exist' in line_lower:
+                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
+                                if device_match:
+                                    reason = f'Device {device_match.group(1)} missing'
+                                else:
+                                    reason = 'Device error'
+                            else:
+                                reason = 'Startup error'
+
+                            issues.append(f'CT {ctid}: {reason}')
+                            vm_details[key] = {
+                                'status': 'WARNING',
+                                'reason': reason,
+                                'id': ctid,
+                                'type': 'CT'
+                            }
+                        continue
+
+                    if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
+                        id_match = re.search(r'\b(\d{3,4})\b', line)
+                        if id_match:
+                            vmid = id_match.group(1)
+                            key = f'vmct_{vmid}'
+                            if key not in vm_details:
+                                issues.append(f'VM/CT {vmid}: Failed to start')
+                                vm_details[key] = {
+                                    'status': 'CRITICAL',
+                                    'reason': 'Failed to start',
+                                    'id': vmid,
+                                    'type': 'VM/CT'
+                                }
 
-            # If no issues, return OK
             if not issues:
                 return {'status': 'OK'}
 
+            has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
+
             return {
-                'status': 'CRITICAL',
+                'status': 'CRITICAL' if has_critical else 'WARNING',
                 'reason': '; '.join(issues[:3]),
                 'details': vm_details
             }
 
         except Exception:
-            return None
+            return {'status': 'OK'}
+
+    # Modified to use persistence
+    def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
+        """
+        Check VMs/CTs with persistent error tracking.
+        Errors persist until VM starts or 48h elapsed.
+        """
+        try:
+            issues = []
+            vm_details = {}
+
+            # Get persistent errors first
+            persistent_errors = health_persistence.get_active_errors('vms')
+
+            # Check if any persistent VMs/CTs have started
+            for error in persistent_errors:
+                error_key = error['error_key']
+                if error_key.startswith('vm_') or error_key.startswith('ct_'):
+                    vm_id = error_key.split('_')[1]
+                    if health_persistence.check_vm_running(vm_id):
+                        continue  # Error auto-resolved
+
+                # Still active
+                vm_details[error_key] = {
+                    'status': error['severity'],
+                    'reason': error['reason'],
+                    'id': error.get('details', {}).get('id', 'unknown'),
+                    'type': error.get('details', {}).get('type', 'VM/CT'),
+                    'first_seen': error['first_seen']
+                }
+                issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}")
+
+            # Check for new errors in logs
+            result = subprocess.run(
+                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
+                capture_output=True,
+                text=True,
+                timeout=3
+            )
+
+            if result.returncode == 0:
+                for line in result.stdout.split('\n'):
+                    line_lower = line.lower()
+
+                    # VM QMP errors
+                    vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
+                    if vm_qmp_match:
+                        vmid = vm_qmp_match.group(1)
+                        error_key = f'vm_{vmid}'
+                        if error_key not in vm_details:
+                            # Record persistent error
+                            health_persistence.record_error(
+                                error_key=error_key,
+                                category='vms',
+                                severity='WARNING',
+                                reason='QMP command timeout',
+                                details={'id': vmid, 'type': 'VM'}
+                            )
+                            issues.append(f'VM {vmid}: Communication issue')
+                            vm_details[error_key] = {
+                                'status': 'WARNING',
+                                'reason': 'QMP command timeout',
+                                'id': vmid,
+                                'type': 'VM'
+                            }
+                        continue
+
+                    # Container errors
+                    vzstart_match = re.search(r'vzstart:(\d+):', line)
+                    if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
+                        ctid = vzstart_match.group(1)
+                        error_key = f'ct_{ctid}'
+
+                        if error_key not in vm_details:
+                            if 'device' in line_lower and 'does not exist' in line_lower:
+                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
+                                if device_match:
+                                    reason = f'Device {device_match.group(1)} missing'
+                                else:
+                                    reason = 'Device error'
+                            else:
+                                reason = 'Startup error'
+
+                            # Record persistent error
+                            health_persistence.record_error(
+                                error_key=error_key,
+                                category='vms',
+                                severity='WARNING',
+                                reason=reason,
+                                details={'id': ctid, 'type': 'CT'}
+                            )
+                            issues.append(f'CT {ctid}: {reason}')
+                            vm_details[error_key] = {
+                                'status': 'WARNING',
+                                'reason': reason,
+                                'id': ctid,
+                                'type': 'CT'
+                            }
+
+            if not issues:
+                return {'status': 'OK'}
+
+            has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
+
+            return {
+                'status': 'CRITICAL' if has_critical else 'WARNING',
+                'reason': '; '.join(issues[:3]),
+                'details': vm_details
+            }
+
+        except Exception:
+            return {'status': 'OK'}
 
     def _check_pve_services(self) -> Dict[str, Any]:
         """Check critical Proxmox services"""
@@ -927,60 +1181,164 @@ class HealthMonitor:
                 'reason': f'Service check failed: {str(e)}'
             }
 
-    def _check_logs_lightweight(self) -> Dict[str, Any]:
-        """Lightweight log analysis (cached, checked every 5 minutes)"""
+    def _is_benign_error(self, line: str) -> bool:
+        """Check if log line matches benign error patterns"""
+        line_lower = line.lower()
+        for pattern in self.BENIGN_ERROR_PATTERNS:
+            if re.search(pattern, line_lower):
+                return True
+        return False
+
+    def _classify_log_severity(self, line: str) -> Optional[str]:
+        """
+        Classify log line severity intelligently.
+        Returns: 'CRITICAL', 'WARNING', or None (benign)
+        """
+        line_lower = line.lower()
+
+        # Check if benign first
+        if self._is_benign_error(line):
+            return None
+
+        # Check critical keywords
+        for keyword in self.CRITICAL_LOG_KEYWORDS:
+            if re.search(keyword, line_lower):
+                return 'CRITICAL'
+
+        # Check warning keywords
+        for keyword in self.WARNING_LOG_KEYWORDS:
+            if re.search(keyword, line_lower):
+                return 'WARNING'
+
+        # Generic error/warning classification
+        if 'critical' in line_lower or 'fatal' in line_lower:
+            return 'CRITICAL'
+        elif 'error' in line_lower:
+            return 'WARNING'
+        elif 'warning' in line_lower or 'warn' in line_lower:
+            return None  # Generic warnings are benign
+
+        return None
+
+    def _check_logs_with_persistence(self) -> Dict[str, Any]:
+        """
+        Intelligent log checking with cascade detection.
+        Only alerts when there's a real problem (error cascade), not normal background warnings.
+
+        Logic:
+        - Looks at last 3 minutes (not 10) for immediate issues
+        - Detects cascades: ≥10 errors of the same pattern in 3 min = problem
+        - Compares to previous period to detect spikes
+        - Whitelists known benign Proxmox warnings
+        """
         cache_key = 'logs_analysis'
         current_time = time.time()
 
+        # Cache for 5 minutes
         if cache_key in self.last_check_times:
             if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
+                persistent_errors = health_persistence.get_active_errors('logs')
+                if persistent_errors:
+                    return {
+                        'status': 'WARNING',
+                        'reason': f'{len(persistent_errors)} persistent log issues'
+                    }
                 return self.cached_results.get(cache_key, {'status': 'OK'})
 
         try:
-            result = subprocess.run(
-                ['journalctl', '--since', '5 minutes ago', '--no-pager', '-p', 'warning'],
+            result_recent = subprocess.run(
+                ['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
                 capture_output=True,
                 text=True,
                 timeout=3
             )
 
-            if result.returncode == 0:
-                lines = result.stdout.strip().split('\n')
+            result_previous = subprocess.run(
+                ['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
+                capture_output=True,
+                text=True,
+                timeout=3
+            )
+
+            if result_recent.returncode == 0:
+                recent_lines = result_recent.stdout.strip().split('\n')
+                previous_lines = result_previous.stdout.strip().split('\n') if result_previous.returncode == 0 else []
 
-                errors_5m = 0
-                warnings_5m = 0
-                critical_keywords_found = []
+                recent_patterns = defaultdict(int)
+                previous_patterns = defaultdict(int)
+                critical_errors = {}
 
-                for line in lines:
-                    line_lower = line.lower()
+                for line in recent_lines:
+                    if not line.strip():
+                        continue
 
-                    for keyword in self.CRITICAL_LOG_KEYWORDS:
-                        if keyword.lower() in line_lower:
-                            critical_keywords_found.append(keyword)
-                            errors_5m += 1
-                            break
-                    else:
-                        if 'error' in line_lower or 'critical' in line_lower or 'fatal' in line_lower:
-                            errors_5m += 1
-                        elif 'warning' in line_lower or 'warn' in line_lower:
-                            warnings_5m += 1
+                    # Skip benign errors
+                    if self._is_benign_error(line):
+                        continue
+
+                    # Classify severity
+                    severity = self._classify_log_severity(line)
+
+                    if severity is None:
+                        continue
+
+                    # Normalize to pattern
+                    pattern = self._normalize_log_pattern(line)
+
+                    if severity == 'CRITICAL':
+                        if pattern not in critical_errors:
+                            critical_errors[pattern] = line
+
+                            # Record persistent error
+                            error_key = f'log_critical_{abs(hash(pattern)) % 10000}'
+                            health_persistence.record_error(
+                                error_key=error_key,
+                                category='logs',
+                                severity='CRITICAL',
+                                reason=line[:100],
+                                details={'pattern': pattern}
+                            )
+
+                    recent_patterns[pattern] += 1
 
-                if critical_keywords_found:
+                for line in previous_lines:
+                    if not line.strip() or self._is_benign_error(line):
+                        continue
+
+                    severity = self._classify_log_severity(line)
+                    if severity is None:
+                        continue
+
+                    pattern = self._normalize_log_pattern(line)
+                    previous_patterns[pattern] += 1
+
+                cascading_errors = {
+                    pattern: count for pattern, count in recent_patterns.items()
+                    if count >= 10 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL']
+                }
+
+                spike_errors = {}
+                for pattern, recent_count in recent_patterns.items():
+                    prev_count = previous_patterns.get(pattern, 0)
+                    # Spike if: ≥3 errors now AND ≥3x increase
+                    if recent_count >= 3 and recent_count >= prev_count * 3:
+                        spike_errors[pattern] = recent_count
+
+                unique_critical = len(critical_errors)
+                cascade_count = len(cascading_errors)
+                spike_count = len(spike_errors)
+
+                if unique_critical > 0:
                     status = 'CRITICAL'
-                    reason = f'Critical errors: {", ".join(set(critical_keywords_found[:3]))}'
-                elif errors_5m >= self.LOG_ERRORS_CRITICAL:
-                    status = 'CRITICAL'
-                    reason = f'{errors_5m} errors in 5 minutes'
-                elif warnings_5m >= self.LOG_WARNINGS_CRITICAL:
+                    reason = f'{unique_critical} critical error(s): cascade detected'
+                elif cascade_count > 0:
                     status = 'WARNING'
-                    reason = f'{warnings_5m} warnings in 5 minutes'
-                elif errors_5m >= self.LOG_ERRORS_WARNING:
+                    reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥10 times in 3min'
+                elif spike_count > 0:
                     status = 'WARNING'
-                    reason = f'{errors_5m} errors in 5 minutes'
-                elif warnings_5m >= self.LOG_WARNINGS_WARNING:
-                    status = 'WARNING'
-                    reason = f'{warnings_5m} warnings in 5 minutes'
+                    reason = f'Error spike detected: {spike_count} pattern(s) increased 3x'
                 else:
+                    # Normal background warnings, no alert
                     status = 'OK'
                     reason = None
@@ -1000,8 +1358,26 @@ class HealthMonitor:
         except Exception:
             return {'status': 'OK'}
 
+    def _normalize_log_pattern(self, line: str) -> str:
+        """
+        Normalize log line to a pattern for grouping similar errors.
+        Removes timestamps, PIDs, IDs, paths, and other variables.
+        """
+        pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)  # Remove dates
+        pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
+        pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
+        pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs
+        pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize devices
+        pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern)  # Normalize paths
+        pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex
+        pattern = re.sub(r'\s+', ' ', pattern).strip()  # Normalize whitespace
+        return pattern[:150]  # Keep first 150 chars
+
     def _check_updates(self) -> Optional[Dict[str, Any]]:
-        """Check for pending system updates (cached, checked every 10 minutes)"""
+        """
+        Check for pending system updates with intelligence.
+        Only warns for security updates; kernel/PVE updates and large update backlogs are reported as INFO.
+        """
         cache_key = 'updates_check'
         current_time = time.time()
@@ -1010,9 +1386,8 @@
                 return self.cached_results.get(cache_key)
 
         try:
-            # Check apt updates
             result = subprocess.run(
-                ['apt', 'list', '--upgradable'],
+                ['apt-get', 'upgrade', '--dry-run'],
                 capture_output=True,
                 text=True,
                 timeout=5
@@ -1020,15 +1395,36 @@
             if result.returncode == 0:
                 lines = result.stdout.strip().split('\n')
-                # First line is header
-                update_count = len([l for l in lines if l and not l.startswith('Listing')])
 
-                if update_count >= self.UPDATES_CRITICAL:
+                # Count total updates
+                update_count = 0
+                security_updates = []
+                kernel_updates = []
+
+                for line in lines:
+                    if line.startswith('Inst '):
+                        update_count += 1
+                        line_lower = line.lower()
+
+                        # Check for security updates
+                        if 'security' in line_lower or 'debian-security' in line_lower:
+                            package_name = line.split()[1]
+                            security_updates.append(package_name)
+
+                        # Check for kernel or critical PVE updates
+                        if any(pkg in line_lower for pkg in ['linux-image', 'pve-kernel', 'pve-manager', 'proxmox-ve']):
+                            package_name = line.split()[1]
+                            kernel_updates.append(package_name)
+
+                if security_updates:
                     status = 'WARNING'
-                    reason = f'{update_count} updates pending (≥{self.UPDATES_CRITICAL})'
-                elif update_count >= self.UPDATES_WARNING:
-                    status = 'WARNING'
-                    reason = f'{update_count} updates pending'
+                    reason = f'{len(security_updates)} security update(s) available'
+                elif kernel_updates:
+                    status = 'INFO'  # Informational, not critical
+                    reason = f'{len(kernel_updates)} kernel/PVE update(s) available'
+                elif update_count > 50:
+                    status = 'INFO'
+                    reason = f'{update_count} updates pending (consider maintenance window)'
                 else:
                     status = 'OK'
                     reason = None
@@ -1044,31 +1440,53 @@ class HealthMonitor:
                 self.last_check_times[cache_key] = current_time
                 return update_result
 
-            return None
+            return {'status': 'OK', 'count': 0}
 
-        except Exception:
-            return None
+        except Exception as e:
+            return {'status': 'OK', 'count': 0}
 
     def _check_security(self) -> Dict[str, Any]:
-        """Check security-related items (certificates, uptime)"""
+        """
+        Check security-related items:
+        - SSL certificate validity and expiration
+        - Failed login attempts
+        - Excessive uptime (>365 days = kernel vulnerabilities)
+        """
         try:
             issues = []
 
-            # Check uptime (warning if >180 days)
             try:
                 uptime_seconds = time.time() - psutil.boot_time()
                 uptime_days = uptime_seconds / 86400
 
-                if uptime_days > 180:
-                    issues.append(f'Uptime {int(uptime_days)} days (>180)')
+                if uptime_days > 365:
+                    issues.append(f'Uptime {int(uptime_days)} days (>1 year, kernel updates needed)')
             except Exception:
                 pass
 
-            # Check SSL certificates
             cert_status = self._check_certificates()
-            if cert_status and cert_status.get('status') != 'OK':
+            if cert_status and cert_status.get('status') not in ['OK', 'INFO']:
                 issues.append(cert_status.get('reason', 'Certificate issue'))
 
+            try:
+                result = subprocess.run(
+                    ['journalctl', '--since', '24 hours ago', '--no-pager'],
+                    capture_output=True,
+                    text=True,
+                    timeout=3
+                )
+
+                if result.returncode == 0:
+                    failed_logins = 0
+                    for line in result.stdout.split('\n'):
+                        if 'authentication failure' in line.lower() or 'failed password' in line.lower():
+                            failed_logins += 1
+
+                    if failed_logins > 50:
+                        issues.append(f'{failed_logins} failed login attempts in 24h')
+            except Exception:
+                pass
+
             if issues:
                 return {
                     'status': 'WARNING',
@@ -1081,7 +1499,12 @@ class HealthMonitor:
             return {'status': 'OK'}
 
     def _check_certificates(self) -> Optional[Dict[str, Any]]:
-        """Check SSL certificate expiration (cached, checked once per day)"""
+        """
+        Check SSL certificate expiration.
+        INFO: Self-signed or no cert configured (normal for internal servers)
+        WARNING: Expires <30 days
+        CRITICAL: Expired
+        """
         cache_key = 'certificates'
         current_time = time.time()
@@ -1092,46 +1515,98 @@
         try:
             cert_path = '/etc/pve/local/pve-ssl.pem'
 
-            if os.path.exists(cert_path):
-                result = subprocess.run(
-                    ['openssl', 'x509', '-enddate', '-noout', '-in', cert_path],
-                    capture_output=True,
-                    text=True,
-                    timeout=2
-                )
-
-                if result.returncode == 0:
-                    date_str = result.stdout.strip().replace('notAfter=', '')
-
-                    try:
-                        from datetime import datetime
-                        exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y %Z')
-                        days_until_expiry = (exp_date - datetime.now()).days
-
-                        if days_until_expiry < 0:
-                            status = 'CRITICAL'
-                            reason = 'Certificate expired'
-                        elif days_until_expiry < 15:
-                            status = 'WARNING'
-                            reason = f'Certificate expires in {days_until_expiry} days'
-                        else:
-                            status = 'OK'
-                            reason = None
-
-                        cert_result = {'status': status}
-                        if reason:
-                            cert_result['reason'] = reason
-
-                        self.cached_results[cache_key] = cert_result
-                        self.last_check_times[cache_key] = current_time
-                        return cert_result
-                    except Exception:
-                        pass
+            if not os.path.exists(cert_path):
+                cert_result = {
+                    'status': 'INFO',
+                    'reason': 'Self-signed or default certificate'
+                }
+                self.cached_results[cache_key] = cert_result
+                self.last_check_times[cache_key] = current_time
+                return cert_result
 
-            return None
+            result = subprocess.run(
+                ['openssl', 'x509', '-enddate', '-noout', '-in', cert_path],
+                capture_output=True,
+                text=True,
+                timeout=2
+            )
+
+            if result.returncode == 0:
+                date_str = result.stdout.strip().replace('notAfter=', '')
+
+                try:
+                    from datetime import datetime
+                    exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y %Z')
+                    days_until_expiry = (exp_date - datetime.now()).days
+
+                    if days_until_expiry < 0:
+                        status = 'CRITICAL'
+                        reason = 'Certificate expired'
+                    elif days_until_expiry < 30:
+                        status = 'WARNING'
+                        reason = f'Certificate expires in {days_until_expiry} days'
+                    else:
+                        status = 'OK'
+                        reason = None
+
+                    cert_result = {'status': status}
+                    if reason:
+                        cert_result['reason'] = reason
+
+                    self.cached_results[cache_key] = cert_result
+                    self.last_check_times[cache_key] = current_time
+                    return cert_result
+                except Exception:
+                    pass
+
+            return {'status': 'INFO', 'reason': 'Certificate check inconclusive'}
 
         except Exception:
-            return None
+            return {'status': 'OK'}
+
+    def _check_disk_health_from_events(self) -> Dict[str, Any]:
+        """
+        Check for disk health warnings from Proxmox task log and system logs.
+        Returns dict of disk issues found.
+        """
+        disk_issues = {}
+
+        try:
+            result = subprocess.run(
+                ['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning'],
+                capture_output=True,
+                text=True,
+                timeout=3
+            )
+
+            if result.returncode == 0:
+                for line in result.stdout.split('\n'):
+                    line_lower = line.lower()
+
+                    # Check for SMART warnings
+                    if 'smart' in line_lower and ('warning' in line_lower or 'error' in line_lower or 'fail' in line_lower):
+                        # Extract disk name
+                        disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line)
+                        if disk_match:
+                            disk_name = disk_match.group(1)
+                            disk_issues[f'/dev/{disk_name}'] = {
+                                'status': 'WARNING',
+                                'reason': 'SMART warning detected'
+                            }
+
+                    # Check for disk errors
+                    if any(keyword in line_lower for keyword in ['disk error', 'ata error', 'medium error']):
+                        disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line)
+                        if disk_match:
+                            disk_name = disk_match.group(1)
+                            disk_issues[f'/dev/{disk_name}'] = {
+                                'status': 'CRITICAL',
+                                'reason': 'Disk error detected'
+                            }
+        except Exception:
+            pass
+
+        return disk_issues
 
 # Global instance
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
new file mode 100644
index 0000000..51b4510
--- /dev/null
+++ b/AppImage/scripts/health_persistence.py
@@ -0,0 +1,359 @@
+"""
+Health Monitor Persistence Module
+Manages persistent error tracking across AppImage updates using SQLite.
+Stores errors in /root/.config/proxmenux-monitor/health_monitor.db
+
+Features:
+- Persistent error storage (survives AppImage updates)
+- Smart error resolution (auto-clear when VM starts, or after 48h)
+- Event system for future Telegram notifications
+- Manual acknowledgment support
+
+Author: MacRimi
+Version: 1.0
+"""
+
+import sqlite3
+import json
+import os
+from datetime import datetime, timedelta
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+
+class HealthPersistence:
+    """Manages persistent health error tracking"""
+
+    # Error retention periods (seconds)
+    VM_ERROR_RETENTION = 48 * 3600  # 48 hours
+    LOG_ERROR_RETENTION = 24 * 3600  # 24 hours
+    DISK_ERROR_RETENTION = 48 * 3600  # 48 hours
+
+    def __init__(self):
+        """Initialize persistence with database in config directory"""
+        self.data_dir = Path('/root/.config/proxmenux-monitor')
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+
+        self.db_path = self.data_dir / 'health_monitor.db'
+        self._init_database()
+
+    def _init_database(self):
+        """Initialize SQLite database with required tables"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Errors table
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS errors (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                error_key TEXT UNIQUE NOT NULL,
+                category TEXT NOT NULL,
+                severity TEXT NOT NULL,
+                reason TEXT NOT NULL,
+                details TEXT,
+                first_seen TEXT NOT NULL,
+                last_seen TEXT NOT NULL,
+                resolved_at TEXT,
+                acknowledged INTEGER DEFAULT 0,
+                notification_sent INTEGER DEFAULT 0
+            )
+        ''')
+
+        # Events table (for future Telegram notifications)
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS events (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                event_type TEXT NOT NULL,
+                error_key TEXT NOT NULL,
+                timestamp TEXT NOT NULL,
+                data TEXT
+            )
+        ''')
+
+        # Indexes for performance
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_resolved ON errors(resolved_at)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)')
+
+        conn.commit()
+        conn.close()
+
+    def record_error(self, error_key: str, category: str, severity: str,
+                     reason: str, details: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Record or update an error.
+        Returns event info (new_error, updated, etc.)
+        """
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now().isoformat()
+        details_json = json.dumps(details) if details else None
+
+        cursor.execute('''
+            SELECT acknowledged, resolved_at
+            FROM errors
+            WHERE error_key = ? AND acknowledged = 1
+        ''', (error_key,))
+        ack_check = cursor.fetchone()
+
+        if ack_check and ack_check[1]:  # Has resolved_at timestamp
+            try:
+                resolved_dt = datetime.fromisoformat(ack_check[1])
+                hours_since_ack = (datetime.now() - resolved_dt).total_seconds() / 3600
+
+                if hours_since_ack < 24:
+                    # Skip re-adding recently acknowledged errors (within 24h)
+                    conn.close()
+                    return {'type': 'skipped_acknowledged', 'needs_notification': False}
+            except Exception:
+                pass
+
+        cursor.execute('''
+            SELECT id, first_seen, notification_sent, acknowledged, resolved_at
+            FROM errors WHERE error_key = ?
+        ''', (error_key,))
+        existing = cursor.fetchone()
+
+        event_info = {'type': 'updated', 'needs_notification': False}
+
+        if existing:
+            error_id, first_seen, notif_sent, acknowledged, resolved_at = existing
+
+            if acknowledged == 1:
+                conn.close()
+                return {'type': 'skipped_acknowledged', 'needs_notification': False}
+
+            # Update existing error (only if NOT acknowledged)
+            cursor.execute('''
+                UPDATE errors
+                SET last_seen = ?, severity = ?, reason = ?, details = ?
+                WHERE error_key = ? AND acknowledged = 0
+            ''', (now, severity, reason, details_json, error_key))
+
+            # Check if severity escalated
+            cursor.execute('SELECT severity FROM errors WHERE error_key = ?', (error_key,))
+            old_severity_row = cursor.fetchone()
+            if old_severity_row:
+                old_severity = old_severity_row[0]
+                if old_severity == 'WARNING' and severity == 'CRITICAL':
+                    event_info['type'] = 'escalated'
+                    event_info['needs_notification'] = True
+        else:
+            # Insert new error
+            cursor.execute('''
+                INSERT INTO errors
+                (error_key, category, severity, reason, details, first_seen, last_seen)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+            ''', (error_key, category, severity, reason, details_json, now, now))
+
+            event_info['type'] = 'new'
+            event_info['needs_notification'] = True
+
+        # Record event
+        self._record_event(cursor, event_info['type'], error_key,
+                           {'severity': severity, 'reason': reason})
+
+        conn.commit()
+        conn.close()
+
+        return event_info
+
+    def resolve_error(self, error_key: str, reason: str = 'auto-resolved'):
+        """Mark an error as resolved"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now().isoformat()
+
+        cursor.execute('''
+            UPDATE errors
+            SET resolved_at = ?
+            WHERE error_key = ? AND resolved_at IS NULL
+        ''', (now, error_key))
+
+        if cursor.rowcount > 0:
+            self._record_event(cursor, 'resolved', error_key, {'reason': reason})
+
+        conn.commit()
+        conn.close()
+
+    def acknowledge_error(self, error_key: str):
+        """
+        Manually acknowledge an error (won't notify again or re-appear for 24h).
+        Also marks as resolved so it disappears from active errors.
+        """
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now().isoformat()
+
+        cursor.execute('''
+            UPDATE errors
+            SET acknowledged = 1, resolved_at = ?
+            WHERE error_key = ?
+        ''', (now, error_key))
+
+        self._record_event(cursor, 'acknowledged', error_key, {})
+
+        conn.commit()
+        conn.close()
+
+    def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Get all active (unresolved) errors, optionally filtered by category"""
+        conn = sqlite3.connect(str(self.db_path))
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        if category:
+            cursor.execute('''
+                SELECT * FROM errors
+                WHERE resolved_at IS NULL AND category = ?
+                ORDER BY severity DESC, last_seen DESC
+            ''', (category,))
+        else:
+            cursor.execute('''
+                SELECT * FROM errors
+                WHERE resolved_at IS NULL
+                ORDER BY severity DESC, last_seen DESC
+            ''')
+
+        rows = cursor.fetchall()
+        conn.close()
+
+        errors = []
+        for row in rows:
+            error_dict = dict(row)
+            if error_dict.get('details'):
+                error_dict['details'] = json.loads(error_dict['details'])
+            errors.append(error_dict)
+
+        return errors
+
+    def cleanup_old_errors(self):
+        """Clean up old resolved errors and auto-resolve stale errors"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        now = datetime.now()
+
+        # Delete resolved errors older than 7 days
+        cutoff_resolved = (now - timedelta(days=7)).isoformat()
+        cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
+
+        # Auto-resolve VM/CT errors older than 48h
+        cutoff_vm = (now - timedelta(seconds=self.VM_ERROR_RETENTION)).isoformat()
+        cursor.execute('''
+            UPDATE errors
+            SET resolved_at = ?
+            WHERE category = 'vms'
+            AND resolved_at IS NULL
+            AND first_seen < ?
+            AND acknowledged = 0
+        ''', (now.isoformat(), cutoff_vm))
+
+        # Auto-resolve log errors older than 24h
+        cutoff_logs = (now - timedelta(seconds=self.LOG_ERROR_RETENTION)).isoformat()
+        cursor.execute('''
+            UPDATE errors
+            SET resolved_at = ?
+            WHERE category = 'logs'
+            AND resolved_at IS NULL
+            AND first_seen < ?
+            AND acknowledged = 0
+        ''', (now.isoformat(), cutoff_logs))
+
+        # Delete old events (>30 days)
+        cutoff_events = (now - timedelta(days=30)).isoformat()
+        cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
+
+        conn.commit()
+        conn.close()
+
+    def check_vm_running(self, vm_id: str) -> bool:
+        """
+        Check if a VM/CT is running and resolve error if so.
+        Returns True if running and error was resolved.
+        """
+        import subprocess
+
+        try:
+            # Check qm status for VMs
+            result = subprocess.run(
+                ['qm', 'status', vm_id],
+                capture_output=True,
+                text=True,
+                timeout=2
+            )
+
+            if result.returncode == 0 and 'running' in result.stdout.lower():
+                self.resolve_error(f'vm_{vm_id}', 'VM started')
+                return True
+
+            # Check pct status for containers
+            result = subprocess.run(
+                ['pct', 'status', vm_id],
+                capture_output=True,
+                text=True,
+                timeout=2
+            )
+
+            if result.returncode == 0 and 'running' in result.stdout.lower():
+                self.resolve_error(f'ct_{vm_id}', 'Container started')
+                return True
+
+            return False
+
+        except Exception:
+            return False
+
+    def _record_event(self, cursor, event_type: str, error_key: str, data: Dict):
+        """Internal: Record an event"""
+        cursor.execute('''
+            INSERT INTO events (event_type, error_key, timestamp, data)
+            VALUES (?, ?, ?, ?)
+        ''', (event_type, error_key, datetime.now().isoformat(), json.dumps(data)))
+
+    def get_unnotified_errors(self) -> List[Dict[str, Any]]:
+        """Get errors that need Telegram notification"""
+        conn = sqlite3.connect(str(self.db_path))
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        cursor.execute('''
+            SELECT * FROM errors
+            WHERE notification_sent = 0
+            AND resolved_at IS NULL
+            AND acknowledged = 0
+            ORDER BY severity DESC, first_seen ASC
+        ''')
+
+        rows = cursor.fetchall()
+        conn.close()
+
+        errors = []
+        for row in rows:
+            error_dict = dict(row)
+            if error_dict.get('details'):
+                error_dict['details'] = json.loads(error_dict['details'])
+            errors.append(error_dict)
+
+        return errors
+
+    def mark_notified(self, error_key: str):
+        """Mark error as notified"""
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        cursor.execute('''
+            UPDATE errors
+            SET notification_sent = 1
+            WHERE error_key = ?
+        ''', (error_key,))
+
+        conn.commit()
+        conn.close()
+
+
+# Global instance
+health_persistence = HealthPersistence()
diff --git a/scripts/test/ProxMenux-1.0.1-beat1.AppImage b/scripts/test/ProxMenux-1.0.1-beat1.AppImage
new file mode 100755
index 0000000..fbed859
Binary files /dev/null and b/scripts/test/ProxMenux-1.0.1-beat1.AppImage differ
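A minimal usage sketch of the HealthPersistence API added by this patch (not part of the diff itself). The method names and signatures come from health_persistence.py above; the import path, the example error_key 'ct_105', and the device path in the reason are illustrative assumptions, and importing the module creates the SQLite database under /root/.config/proxmenux-monitor.

# Usage sketch only: exercises the HealthPersistence API from health_persistence.py.
# Assumes the module is importable as shown and that the process can write to
# /root/.config/proxmenux-monitor (the database is created on import).
from health_persistence import health_persistence

# Record (or refresh) a warning for container 105; 'ct_<id>' follows the key
# convention used by the VM/CT checks. The ID and reason are example values.
event = health_persistence.record_error(
    error_key='ct_105',
    category='vms',
    severity='WARNING',
    reason='Device /dev/dri/renderD128 missing',  # hypothetical reason text
    details={'id': '105', 'type': 'CT'},
)
print(event)  # {'type': 'new', 'needs_notification': True} on first occurrence

# Active (unresolved) errors for the 'vms' category, as the health report reads them.
for err in health_persistence.get_active_errors('vms'):
    print(err['error_key'], err['severity'], err['reason'])

# Manual dismissal: marks the error acknowledged and resolved, suppressing it for 24h.
health_persistence.acknowledge_error('ct_105')

# Periodic maintenance: auto-resolve stale VM/log errors and prune old rows and events.
health_persistence.cleanup_old_errors()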
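The cascade/spike heuristic used by _check_logs_with_persistence can also be read in isolation. The sketch below is a simplified standalone version, not the shipped implementation: the normalize() and detect_anomalies() helpers and the sample journal line are invented for illustration, while the pattern normalization and the thresholds (≥10 repeats in the recent window, ≥3 occurrences with a 3x increase over the previous window) mirror the diff; the benign-pattern whitelist and severity classification are omitted for brevity.

# Simplified sketch of the log cascade/spike detection (assumed helper names).
import re
from collections import defaultdict
from typing import Dict, List

def normalize(line: str) -> str:
    """Collapse dates, times, PIDs, IDs, devices, paths and hex into placeholders."""
    p = re.sub(r'\d{4}-\d{2}-\d{2}', '', line)
    p = re.sub(r'\d{2}:\d{2}:\d{2}', '', p)
    p = re.sub(r'pid[:\s]+\d+', 'pid:XXX', p.lower())
    p = re.sub(r'\b\d{3,6}\b', 'ID', p)
    p = re.sub(r'/dev/\S+', '/dev/XXX', p)
    p = re.sub(r'/\S+/\S+', '/PATH/', p)
    p = re.sub(r'0x[0-9a-f]+', '0xXXX', p)
    return re.sub(r'\s+', ' ', p).strip()[:150]

def detect_anomalies(recent: List[str], previous: List[str]) -> Dict[str, List[str]]:
    """Compare the recent window against the previous one and flag cascades/spikes."""
    recent_counts: Dict[str, int] = defaultdict(int)
    previous_counts: Dict[str, int] = defaultdict(int)
    for line in recent:
        if line.strip():
            recent_counts[normalize(line)] += 1
    for line in previous:
        if line.strip():
            previous_counts[normalize(line)] += 1

    # Cascade: the same pattern repeats >= 10 times in the recent window.
    cascades = [p for p, c in recent_counts.items() if c >= 10]
    # Spike: >= 3 occurrences now and at least a 3x increase over the previous window.
    spikes = [p for p, c in recent_counts.items()
              if c >= 3 and c >= previous_counts.get(p, 0) * 3]
    return {'cascades': cascades, 'spikes': spikes}

if __name__ == '__main__':
    # Twelve near-identical warnings collapse to one pattern and trip both checks.
    noisy = ['pvestatd[1234]: storage backup is not online'] * 12
    print(detect_anomalies(noisy, []))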