update health_monitor.py

This commit is contained in:
MacRimi
2026-04-17 16:39:08 +02:00
parent ffadb2c508
commit 039e35f3c5
5 changed files with 353 additions and 221 deletions

View File

@@ -162,43 +162,41 @@ const groupAndSortTemperatures = (temperatures: any[]) => {
}
export default function Hardware() {
// Static data - load once without refresh
// Static data - loaded once on mount. Static fields (CPU, motherboard, memory
// modules, PCI, disks, GPU list) don't change at runtime, so no auto-refresh.
// `mutateStatic` is triggered explicitly after GPU switch-mode changes.
const {
data: staticHardwareData,
error: staticError,
isLoading: staticLoading,
mutate: mutateStatic,
} = useSWR<HardwareData>("/api/hardware", swrFetcher, {
revalidateOnFocus: false,
revalidateOnReconnect: false,
refreshInterval: 0, // No auto-refresh for static data
refreshInterval: 0,
})
// Dynamic data - refresh every 5 seconds for temperatures, fans, power, ups
// Live data - only temperatures, fans, power, UPS. Polled every 5s.
// Backend /api/hardware/live uses cached ipmitool output (10s) so this is cheap.
const {
data: dynamicHardwareData,
error: dynamicError,
isLoading: dynamicLoading,
} = useSWR<HardwareData>("/api/hardware", swrFetcher, {
refreshInterval: 7000,
} = useSWR<HardwareData>("/api/hardware/live", swrFetcher, {
refreshInterval: 5000,
revalidateOnFocus: true,
revalidateOnReconnect: true,
dedupingInterval: 2000,
})
// Merge static and dynamic data, preferring static for CPU/memory/PCI/disks
// Merge: static fields from initial load, live fields from the 5s poll.
const hardwareData = staticHardwareData
? {
...dynamicHardwareData,
// Keep static data from initial load
cpu: staticHardwareData.cpu,
motherboard: staticHardwareData.motherboard,
memory_modules: staticHardwareData.memory_modules,
pci_devices: staticHardwareData.pci_devices,
storage_devices: staticHardwareData.storage_devices,
gpus: staticHardwareData.gpus,
// Use dynamic data for these
temperatures: dynamicHardwareData?.temperatures,
fans: dynamicHardwareData?.fans,
power_meter: dynamicHardwareData?.power_meter,
power_supplies: dynamicHardwareData?.power_supplies,
ups: dynamicHardwareData?.ups,
...staticHardwareData,
temperatures: dynamicHardwareData?.temperatures ?? staticHardwareData.temperatures,
fans: dynamicHardwareData?.fans ?? staticHardwareData.fans,
power_meter: dynamicHardwareData?.power_meter ?? staticHardwareData.power_meter,
power_supplies: dynamicHardwareData?.power_supplies ?? staticHardwareData.power_supplies,
ups: dynamicHardwareData?.ups ?? staticHardwareData.ups,
}
: undefined
@@ -239,21 +237,6 @@ export default function Hardware() {
const [showSwitchModeModal, setShowSwitchModeModal] = useState(false)
const [switchModeParams, setSwitchModeParams] = useState<{ gpuSlot: string; targetMode: "lxc" | "vm" } | null>(null)
const fetcher = async (url: string) => {
const data = await fetchApi(url)
return data
}
const {
data: hardwareDataSWR,
error: swrError,
isLoading: swrLoading,
mutate: mutateHardware,
} = useSWR<HardwareData>("/api/hardware", fetcher, {
refreshInterval: 30000,
revalidateOnFocus: false,
})
// Determine GPU mode based on driver (vfio-pci = VM, native driver = LXC)
const getGpuSwitchMode = (gpu: GPU): "lxc" | "vm" | "unknown" => {
const driver = gpu.pci_driver?.toLowerCase() || ""
@@ -304,7 +287,7 @@ export default function Hardware() {
const handleSwitchModeSave = (gpuSlot: string, e: React.MouseEvent) => {
e.stopPropagation()
const pendingMode = pendingSwitchModes[gpuSlot]
const gpu = hardwareDataSWR?.gpus?.find(g => g.slot === gpuSlot)
const gpu = hardwareData?.gpus?.find(g => g.slot === gpuSlot)
const currentMode = gpu ? getGpuSwitchMode(gpu) : "unknown"
if (pendingMode && pendingMode !== currentMode && gpu) {
@@ -333,7 +316,7 @@ export default function Hardware() {
setSwitchModeParams(null)
setPendingSwitchModes({})
// Refresh hardware data
mutateHardware()
mutateStatic()
}
const handleInstallNvidiaDriver = () => {
@@ -391,14 +374,14 @@ export default function Hardware() {
}
const findPCIDeviceForGPU = (gpu: GPU): PCIDevice | null => {
if (!hardwareDataSWR?.pci_devices || !gpu.slot) return null
if (!hardwareData?.pci_devices || !gpu.slot) return null
// Try to find exact match first (e.g., "00:02.0")
let pciDevice = hardwareDataSWR.pci_devices.find((d) => d.slot === gpu.slot)
let pciDevice = hardwareData.pci_devices.find((d) => d.slot === gpu.slot)
// If not found, try to match by partial slot (e.g., "00" matches "00:02.0")
if (!pciDevice && gpu.slot.length <= 2) {
pciDevice = hardwareDataSWR.pci_devices.find(
pciDevice = hardwareData.pci_devices.find(
(d) =>
d.slot.startsWith(gpu.slot + ":") &&
(d.type.toLowerCase().includes("vga") ||
@@ -417,7 +400,7 @@ export default function Hardware() {
return realtimeGPUData.has_monitoring_tool === true
}
if (swrLoading) {
if (isLoading) {
return (
<div className="flex flex-col items-center justify-center min-h-[400px] gap-4">
<div className="relative">
@@ -433,7 +416,7 @@ export default function Hardware() {
return (
<div className="space-y-6">
{/* System Information - CPU & Motherboard */}
{(hardwareDataSWR?.cpu || hardwareDataSWR?.motherboard) && (
{(hardwareData?.cpu || hardwareData?.motherboard) && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<Cpu className="h-5 w-5 text-primary" />
@@ -442,44 +425,44 @@ export default function Hardware() {
<div className="grid gap-6 md:grid-cols-2">
{/* CPU Info */}
{hardwareDataSWR?.cpu && Object.keys(hardwareDataSWR.cpu).length > 0 && (
{hardwareData?.cpu && Object.keys(hardwareData.cpu).length > 0 && (
<div>
<div className="mb-2 flex items-center gap-2">
<CpuIcon className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">CPU</h3>
</div>
<div className="space-y-2">
{hardwareDataSWR.cpu.model && (
{hardwareData.cpu.model && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Model</span>
<span className="font-medium text-right">{hardwareDataSWR.cpu.model}</span>
<span className="font-medium text-right">{hardwareData.cpu.model}</span>
</div>
)}
{hardwareDataSWR.cpu.cores_per_socket && hardwareDataSWR.cpu.sockets && (
{hardwareData.cpu.cores_per_socket && hardwareData.cpu.sockets && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Cores</span>
<span className="font-medium">
{hardwareDataSWR.cpu.sockets} × {hardwareDataSWR.cpu.cores_per_socket} ={" "}
{hardwareDataSWR.cpu.sockets * hardwareDataSWR.cpu.cores_per_socket} cores
{hardwareData.cpu.sockets} × {hardwareData.cpu.cores_per_socket} ={" "}
{hardwareData.cpu.sockets * hardwareData.cpu.cores_per_socket} cores
</span>
</div>
)}
{hardwareDataSWR.cpu.total_threads && (
{hardwareData.cpu.total_threads && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Threads</span>
<span className="font-medium">{hardwareDataSWR.cpu.total_threads}</span>
<span className="font-medium">{hardwareData.cpu.total_threads}</span>
</div>
)}
{hardwareDataSWR.cpu.l3_cache && (
{hardwareData.cpu.l3_cache && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">L3 Cache</span>
<span className="font-medium">{hardwareDataSWR.cpu.l3_cache}</span>
<span className="font-medium">{hardwareData.cpu.l3_cache}</span>
</div>
)}
{hardwareDataSWR.cpu.virtualization && (
{hardwareData.cpu.virtualization && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Virtualization</span>
<span className="font-medium">{hardwareDataSWR.cpu.virtualization}</span>
<span className="font-medium">{hardwareData.cpu.virtualization}</span>
</div>
)}
</div>
@@ -487,41 +470,41 @@ export default function Hardware() {
)}
{/* Motherboard Info */}
{hardwareDataSWR?.motherboard && Object.keys(hardwareDataSWR.motherboard).length > 0 && (
{hardwareData?.motherboard && Object.keys(hardwareData.motherboard).length > 0 && (
<div>
<div className="mb-2 flex items-center gap-2">
<Cpu className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">Motherboard</h3>
</div>
<div className="space-y-2">
{hardwareDataSWR.motherboard.manufacturer && (
{hardwareData.motherboard.manufacturer && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Manufacturer</span>
<span className="font-medium text-right">{hardwareDataSWR.motherboard.manufacturer}</span>
<span className="font-medium text-right">{hardwareData.motherboard.manufacturer}</span>
</div>
)}
{hardwareDataSWR.motherboard.model && (
{hardwareData.motherboard.model && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Model</span>
<span className="font-medium text-right">{hardwareDataSWR.motherboard.model}</span>
<span className="font-medium text-right">{hardwareData.motherboard.model}</span>
</div>
)}
{hardwareDataSWR.motherboard.bios?.vendor && (
{hardwareData.motherboard.bios?.vendor && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">BIOS</span>
<span className="font-medium text-right">{hardwareDataSWR.motherboard.bios.vendor}</span>
<span className="font-medium text-right">{hardwareData.motherboard.bios.vendor}</span>
</div>
)}
{hardwareDataSWR.motherboard.bios?.version && (
{hardwareData.motherboard.bios?.version && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Version</span>
<span className="font-medium">{hardwareDataSWR.motherboard.bios.version}</span>
<span className="font-medium">{hardwareData.motherboard.bios.version}</span>
</div>
)}
{hardwareDataSWR.motherboard.bios?.date && (
{hardwareData.motherboard.bios?.date && (
<div className="flex justify-between text-sm">
<span className="text-muted-foreground">Date</span>
<span className="font-medium">{hardwareDataSWR.motherboard.bios.date}</span>
<span className="font-medium">{hardwareData.motherboard.bios.date}</span>
</div>
)}
</div>
@@ -532,18 +515,18 @@ export default function Hardware() {
)}
{/* Memory Modules */}
{hardwareDataSWR?.memory_modules && hardwareDataSWR.memory_modules.length > 0 && (
{hardwareData?.memory_modules && hardwareData.memory_modules.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<MemoryStick className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Memory Modules</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.memory_modules.length} installed
{hardwareData.memory_modules.length} installed
</Badge>
</div>
<div className="grid gap-3 md:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.memory_modules.map((module, index) => (
{hardwareData.memory_modules.map((module, index) => (
<div key={index} className="rounded-lg border border-border/30 bg-background/60 p-4">
<div className="mb-2 font-medium text-sm">{module.slot}</div>
<div className="space-y-1">
@@ -590,29 +573,29 @@ export default function Hardware() {
)}
{/* Thermal Monitoring */}
{hardwareDataSWR?.temperatures && hardwareDataSWR.temperatures.length > 0 && (
{hardwareData?.temperatures && hardwareData.temperatures.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<Thermometer className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Thermal Monitoring</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.temperatures.length} sensors
{hardwareData.temperatures.length} sensors
</Badge>
</div>
<div className="grid gap-6 md:grid-cols-2">
{/* CPU Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.length > 0 && (
{groupAndSortTemperatures(hardwareData.temperatures).CPU.length > 0 && (
<div className="md:col-span-2">
<div className="mb-3 flex items-center gap-2">
<CpuIcon className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">CPU</h3>
<Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.length}
{groupAndSortTemperatures(hardwareData.temperatures).CPU.length}
</Badge>
</div>
<div className="grid gap-4 md:grid-cols-2">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.map((temp, index) => {
{groupAndSortTemperatures(hardwareData.temperatures).CPU.map((temp, index) => {
const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80)
@@ -643,21 +626,21 @@ export default function Hardware() {
)}
{/* GPU Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 0 && (
{groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 0 && (
<div
className={groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 1 ? "md:col-span-2" : ""}
className={groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 1 ? "md:col-span-2" : ""}
>
<div className="mb-3 flex items-center gap-2">
<Gpu className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">GPU</h3>
<Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length}
{groupAndSortTemperatures(hardwareData.temperatures).GPU.length}
</Badge>
</div>
<div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 1 ? "md:grid-cols-2" : ""}`}
className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 1 ? "md:grid-cols-2" : ""}`}
>
{groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.map((temp, index) => {
{groupAndSortTemperatures(hardwareData.temperatures).GPU.map((temp, index) => {
const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80)
@@ -688,23 +671,23 @@ export default function Hardware() {
)}
{/* NVME Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 0 && (
{groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 0 && (
<div
className={
groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 1 ? "md:col-span-2" : ""
groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 1 ? "md:col-span-2" : ""
}
>
<div className="mb-3 flex items-center gap-2">
<HardDrive className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">NVME</h3>
<Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length}
{groupAndSortTemperatures(hardwareData.temperatures).NVME.length}
</Badge>
</div>
<div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 1 ? "md:grid-cols-2" : ""}`}
className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 1 ? "md:grid-cols-2" : ""}`}
>
{groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.map((temp, index) => {
{groupAndSortTemperatures(hardwareData.temperatures).NVME.map((temp, index) => {
const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80)
@@ -735,21 +718,21 @@ export default function Hardware() {
)}
{/* PCI Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 0 && (
{groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 0 && (
<div
className={groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 1 ? "md:col-span-2" : ""}
className={groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 1 ? "md:col-span-2" : ""}
>
<div className="mb-3 flex items-center gap-2">
<CpuIcon className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">PCI</h3>
<Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length}
{groupAndSortTemperatures(hardwareData.temperatures).PCI.length}
</Badge>
</div>
<div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 1 ? "md:grid-cols-2" : ""}`}
className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 1 ? "md:grid-cols-2" : ""}`}
>
{groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.map((temp, index) => {
{groupAndSortTemperatures(hardwareData.temperatures).PCI.map((temp, index) => {
const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80)
@@ -780,23 +763,23 @@ export default function Hardware() {
)}
{/* OTHER Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 0 && (
{groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 0 && (
<div
className={
groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 1 ? "md:col-span-2" : ""
groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 1 ? "md:col-span-2" : ""
}
>
<div className="mb-3 flex items-center gap-2">
<Thermometer className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">OTHER</h3>
<Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length}
{groupAndSortTemperatures(hardwareData.temperatures).OTHER.length}
</Badge>
</div>
<div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 1 ? "md:grid-cols-2" : ""}`}
className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 1 ? "md:grid-cols-2" : ""}`}
>
{groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.map((temp, index) => {
{groupAndSortTemperatures(hardwareData.temperatures).OTHER.map((temp, index) => {
const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80)
@@ -830,18 +813,18 @@ export default function Hardware() {
)}
{/* GPU Information - Enhanced with on-demand data fetching */}
{hardwareDataSWR?.gpus && hardwareDataSWR.gpus.length > 0 && (
{hardwareData?.gpus && hardwareData.gpus.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<Gpu className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Graphics Cards</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.gpus.length} GPU{hardwareDataSWR.gpus.length > 1 ? "s" : ""}
{hardwareData.gpus.length} GPU{hardwareData.gpus.length > 1 ? "s" : ""}
</Badge>
</div>
<div className="grid gap-4 sm:grid-cols-2">
{hardwareDataSWR.gpus.map((gpu, index) => {
{hardwareData.gpus.map((gpu, index) => {
const pciDevice = findPCIDeviceForGPU(gpu)
const fullSlot = pciDevice?.slot || gpu.slot
@@ -1324,7 +1307,7 @@ return (
</Dialog>
{/* Power Consumption */}
{hardwareDataSWR?.power_meter && (
{hardwareData?.power_meter && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<Zap className="h-5 w-5 text-blue-500" />
@@ -1334,13 +1317,13 @@ return (
<div className="space-y-4">
<div className="flex items-center justify-between rounded-lg border border-border/30 bg-background/60 p-4">
<div className="space-y-1">
<p className="text-sm font-medium">{hardwareDataSWR.power_meter.name}</p>
{hardwareDataSWR.power_meter.adapter && (
<p className="text-xs text-muted-foreground">{hardwareDataSWR.power_meter.adapter}</p>
<p className="text-sm font-medium">{hardwareData.power_meter.name}</p>
{hardwareData.power_meter.adapter && (
<p className="text-xs text-muted-foreground">{hardwareData.power_meter.adapter}</p>
)}
</div>
<div className="text-right">
<p className="text-2xl font-bold text-blue-500">{hardwareDataSWR.power_meter.watts.toFixed(1)} W</p>
<p className="text-2xl font-bold text-blue-500">{hardwareData.power_meter.watts.toFixed(1)} W</p>
<p className="text-xs text-muted-foreground">Current Draw</p>
</div>
</div>
@@ -1349,18 +1332,18 @@ return (
)}
{/* Power Supplies */}
{hardwareDataSWR?.power_supplies && hardwareDataSWR.power_supplies.length > 0 && (
{hardwareData?.power_supplies && hardwareData.power_supplies.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<PowerIcon className="h-5 w-5 text-green-500" />
<h2 className="text-lg font-semibold">Power Supplies</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.power_supplies.length} PSUs
{hardwareData.power_supplies.length} PSUs
</Badge>
</div>
<div className="grid gap-3 md:grid-cols-2">
{hardwareDataSWR.power_supplies.map((psu, index) => (
{hardwareData.power_supplies.map((psu, index) => (
<div key={index} className="rounded-lg border border-border/30 bg-background/60 p-4">
<div className="flex items-center justify-between">
<span className="text-sm font-medium">{psu.name}</span>
@@ -1377,18 +1360,18 @@ return (
)}
{/* Fans */}
{hardwareDataSWR?.fans && hardwareDataSWR.fans.length > 0 && (
{hardwareData?.fans && hardwareData.fans.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<FanIcon className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">System Fans</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.fans.length} fans
{hardwareData.fans.length} fans
</Badge>
</div>
<div className="grid gap-4 md:grid-cols-2">
{hardwareDataSWR.fans.map((fan, index) => {
{hardwareData.fans.map((fan, index) => {
const isPercentage = fan.unit === "percent" || fan.unit === "%"
const percentage = isPercentage ? fan.speed : Math.min((fan.speed / 5000) * 100, 100)
@@ -1412,18 +1395,18 @@ return (
)}
{/* UPS */}
{hardwareDataSWR?.ups && Array.isArray(hardwareDataSWR.ups) && hardwareDataSWR.ups.length > 0 && (
{hardwareData?.ups && Array.isArray(hardwareData.ups) && hardwareData.ups.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<Battery className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">UPS Status</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.ups.length} UPS
{hardwareData.ups.length} UPS
</Badge>
</div>
<div className="grid gap-4 md:grid-cols-2">
{hardwareDataSWR.ups.map((ups: any, index: number) => {
{hardwareData.ups.map((ups: any, index: number) => {
const batteryCharge =
ups.battery_charge_raw || Number.parseFloat(ups.battery_charge?.replace("%", "") || "0")
const loadPercent = ups.load_percent_raw || Number.parseFloat(ups.load_percent?.replace("%", "") || "0")
@@ -1694,18 +1677,18 @@ return (
</Dialog>
{/* PCI Devices - Changed to modal */}
{hardwareDataSWR?.pci_devices && hardwareDataSWR.pci_devices.length > 0 && (
{hardwareData?.pci_devices && hardwareData.pci_devices.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<CpuIcon className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">PCI Devices</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.pci_devices.length} devices
{hardwareData.pci_devices.length} devices
</Badge>
</div>
<div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.pci_devices.map((device, index) => (
{hardwareData.pci_devices.map((device, index) => (
<div
key={index}
onClick={() => setSelectedPCIDevice(device)}
@@ -1787,19 +1770,19 @@ return (
</Dialog>
{/* Network Summary - Clickable */}
{hardwareDataSWR?.pci_devices &&
hardwareDataSWR.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length > 0 && (
{hardwareData?.pci_devices &&
hardwareData.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<Network className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Network Summary</h2>
<Badge variant="outline" className="ml-auto">
{hardwareDataSWR.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length} interfaces
{hardwareData.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length} interfaces
</Badge>
</div>
<div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.pci_devices
{hardwareData.pci_devices
.filter((d) => d.type.toLowerCase().includes("network"))
.map((device, index) => (
<div
@@ -1879,14 +1862,14 @@ return (
</Dialog>
{/* Storage Summary - Clickable */}
{hardwareDataSWR?.storage_devices && hardwareDataSWR.storage_devices.length > 0 && (
{hardwareData?.storage_devices && hardwareData.storage_devices.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2">
<HardDrive className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Storage Summary</h2>
<Badge variant="outline" className="ml-auto">
{
hardwareDataSWR.storage_devices.filter(
hardwareData.storage_devices.filter(
(device) =>
device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"),
).length
@@ -1896,7 +1879,7 @@ return (
</div>
<div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.storage_devices
{hardwareData.storage_devices
.filter(
(device) => device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"),
)
@@ -2239,12 +2222,12 @@ return (
description="Installing NVIDIA proprietary drivers for GPU monitoring..."
onClose={() => {
setNvidiaSessionId(null)
mutateHardware()
mutateStatic()
}}
onComplete={(success) => {
console.log("[v0] NVIDIA installation completed:", success ? "success" : "failed")
if (success) {
mutateHardware()
mutateStatic()
}
}}
/> */}
@@ -2252,7 +2235,7 @@ return (
open={showNvidiaInstaller}
onClose={() => {
setShowNvidiaInstaller(false)
mutateHardware()
mutateStatic()
}}
scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/nvidia_installer.sh"
scriptName="nvidia_installer"
@@ -2266,7 +2249,7 @@ return (
open={showAmdInstaller}
onClose={() => {
setShowAmdInstaller(false)
mutateHardware()
mutateStatic()
}}
scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/amd_gpu_tools.sh"
scriptName="amd_gpu_tools"
@@ -2280,7 +2263,7 @@ title="AMD GPU Tools Installation"
open={showIntelInstaller}
onClose={() => {
setShowIntelInstaller(false)
mutateHardware()
mutateStatic()
}}
scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/intel_gpu_tools.sh"
scriptName="intel_gpu_tools"

View File

@@ -222,7 +222,7 @@ export function SystemOverview() {
const systemInterval = setInterval(async () => {
const data = await fetchSystemData()
if (data) setSystemData(data)
}, 9000)
}, 5000)
const vmInterval = setInterval(async () => {
const data = await fetchVMData()

View File

@@ -295,10 +295,10 @@ export function VirtualMachines() {
isLoading,
mutate,
} = useSWR<VMData[]>("/api/vms", fetcher, {
refreshInterval: 23000,
revalidateOnFocus: false,
refreshInterval: 5000,
revalidateOnFocus: true,
revalidateOnReconnect: true,
dedupingInterval: 10000,
dedupingInterval: 2000,
errorRetryCount: 2,
})

View File

@@ -1088,43 +1088,50 @@ def _health_collector_loop():
def _vital_signs_sampler():
"""Dedicated thread for rapid CPU & temperature sampling.
"""Dedicated thread for rapid CPU, memory & temperature sampling.
Runs independently of the 5-min health collector loop.
- CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis)
- CPU usage: sampled every 30s (10 samples in 5 min for sustained detection)
- Memory: sampled every 30s (10 samples in 5 min for sustained detection)
- Temperature: sampled every 15s (12 samples in 3 min for temporal logic)
Uses time.monotonic() to avoid drift.
Staggered intervals: CPU at offset 0, Temp at offset 7s to avoid collision.
Staggered intervals to avoid collision: CPU at 0, Temp at +7s, Mem at +15s.
"""
from health_monitor import health_monitor
# Wait 15s after startup for sensors to be ready
time.sleep(15)
TEMP_INTERVAL = 15 # seconds (was 10s - reduced frequency by 33%)
CPU_INTERVAL = 30 # seconds
# Stagger: CPU starts immediately, Temp starts after 7s offset
MEM_INTERVAL = 30 # seconds (aligned with CPU for sustained-RAM detection)
# Stagger: CPU starts immediately, Temp after 7s, Mem after 15s
next_cpu = time.monotonic()
next_temp = time.monotonic() + 7
print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)")
next_mem = time.monotonic() + 15
print("[ProxMenux] Vital signs sampler started (CPU: 30s, Mem: 30s, Temp: 15s)")
while True:
try:
now = time.monotonic()
if now >= next_temp:
health_monitor._sample_cpu_temperature()
next_temp = now + TEMP_INTERVAL
if now >= next_cpu:
health_monitor._sample_cpu_usage()
next_cpu = now + CPU_INTERVAL
if now >= next_mem:
health_monitor._sample_memory_usage()
next_mem = now + MEM_INTERVAL
# Sleep until the next earliest event (with 0.5s min to avoid busy-loop)
sleep_until = min(next_temp, next_cpu) - time.monotonic()
sleep_until = min(next_temp, next_cpu, next_mem) - time.monotonic()
time.sleep(max(sleep_until, 0.5))
except Exception as e:
print(f"[ProxMenux] Vital signs sampler error: {e}")
@@ -1160,7 +1167,7 @@ _pvesh_cache = {
'storage_list': None,
'storage_list_time': 0,
}
_PVESH_CACHE_TTL = 30 # 30 seconds - balances freshness with performance
_PVESH_CACHE_TTL = 5 # 5 seconds - near real-time for active UI; pvesh local cost is ~200-400ms
# Cache for sensors output (temperature readings)
_sensors_cache = {
@@ -1169,6 +1176,15 @@ _sensors_cache = {
}
_SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly
# Cache for ipmitool sensor output (shared between fans, power supplies, power meter)
# ipmitool is slow (1-3s per call) and was called twice per /api/hardware hit.
_ipmi_cache = {
'output': None,
'time': 0,
'unavailable': False, # set True if ipmitool is missing, avoid retrying
}
_IPMI_CACHE_TTL = 10 # 10 seconds
# Cache for hardware info (lspci, dmidecode, lsblk)
_hardware_cache = {
'lspci': None,
@@ -3820,13 +3836,42 @@ def get_proxmox_vms():
# Return empty array instead of error object - frontend expects array
return []
def get_ipmi_fans():
"""Get fan information from IPMI"""
fans = []
def get_cached_ipmi_sensors():
    """Return the text output of `ipmitool sensor`, cached for _IPMI_CACHE_TTL seconds.

    The result is shared by the IPMI fan and power parsers so one request
    does not spawn ipmitool more than once.

    Returns:
        str: stdout of the last successful `ipmitool sensor` run, or '' when
        no successful output is available (binary missing, non-zero exit,
        timeout, or any other error).

    Caching rules:
      - Success: output and timestamp are stored; reused until the TTL expires.
      - Binary missing (FileNotFoundError): latched in 'unavailable', never retried.
      - Any other failure: the previous output (or '') is kept and the
        timestamp is refreshed, so a failing ipmitool is retried at most once
        per TTL instead of on every call.
    """
    now = time.time()

    # ipmitool is not installed: skip the subprocess entirely.
    if _ipmi_cache['unavailable']:
        return ''

    # Fresh cache entry (successful output or a recorded failure).
    if _ipmi_cache['output'] is not None and \
            now - _ipmi_cache['time'] < _IPMI_CACHE_TTL:
        return _ipmi_cache['output']

    try:
        result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            _ipmi_cache['output'] = result.stdout
            _ipmi_cache['time'] = now
            return result.stdout
    except FileNotFoundError:
        _ipmi_cache['unavailable'] = True
        return ''
    except Exception:
        # Best-effort: timeouts, permission errors, etc. fall through to the
        # stale-data path below rather than propagating to the caller.
        pass

    # Failure path: remember when we last tried so callers within the TTL
    # window get the stale/empty value without re-running ipmitool.
    if _ipmi_cache['output'] is None:
        _ipmi_cache['output'] = ''
    _ipmi_cache['time'] = now
    return _ipmi_cache['output']
def get_ipmi_fans():
"""Get fan information from IPMI (uses cached sensor output)."""
fans = []
try:
output = get_cached_ipmi_sensors()
if output:
for line in output.split('\n'):
if 'fan' in line.lower() and '|' in line:
parts = [p.strip() for p in line.split('|')]
if len(parts) >= 3:
@@ -3862,14 +3907,14 @@ def get_ipmi_fans():
return fans
def get_ipmi_power():
"""Get power supply information from IPMI"""
"""Get power supply information from IPMI (uses cached sensor output)."""
power_supplies = []
power_meter = None
try:
result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10)
if result.returncode == 0:
for line in result.stdout.split('\n'):
output = get_cached_ipmi_sensors()
if output:
for line in output.split('\n'):
if ('power supply' in line.lower() or 'power meter' in line.lower()) and '|' in line:
parts = [p.strip() for p in line.split('|')]
if len(parts) >= 3:
@@ -4202,7 +4247,97 @@ def identify_fan(sensor_name, adapter, chip_name=None):
return sensor_name
# Default: return original name
return sensor_name
return sensor_name
def _parse_sensor_fans(sensors_output):
    """Parse fan entries from the text output of `sensors`.

    Extracted so that get_hardware_info (full payload) and
    get_hardware_live_info (live endpoint) share one parser.

    Args:
        sensors_output: Text produced by `sensors`. A falsy value (None or '')
            yields an empty list.

    Returns:
        list[dict]: One dict per line whose value part contains an RPM reading,
        with keys:
            'name'          - name returned by identify_fan()
            'original_name' - sensor label as printed by `sensors`
            'speed'         - RPM value truncated to int
            'unit'          - always 'RPM'
            'adapter'       - text of the most recent 'Adapter:' line, or None
    """
    fans = []
    if not sensors_output:
        return fans
    current_adapter = None
    current_chip = None
    for raw_line in sensors_output.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # A line without ':' that is not an adapter line is treated as the chip name.
        # (The line is already stripped, so no leading-whitespace check is needed.)
        if ':' not in line and not line.startswith('Adapter'):
            current_chip = line
            continue
        if line.startswith('Adapter:'):
            current_adapter = line.replace('Adapter:', '').strip()
            continue
        if ':' not in line:
            continue
        sensor_name, value_part = (part.strip() for part in line.split(':', 1))
        if 'RPM' not in value_part:
            continue
        rpm_match = re.search(r'([\d.]+)\s*RPM', value_part)
        if not rpm_match:
            continue
        try:
            fan_speed = int(float(rpm_match.group(1)))
        except ValueError:
            # The pattern can match non-numbers such as '.' or '1.2.3';
            # skip that reading instead of aborting the whole parse.
            continue
        fans.append({
            'name': identify_fan(sensor_name, current_adapter, current_chip),
            'original_name': sensor_name,
            'speed': fan_speed,
            'unit': 'RPM',
            'adapter': current_adapter
        })
    return fans
def get_hardware_live_info():
    """Collect only the dynamic hardware readings served by /api/hardware/live.

    Gathers temperatures, fans, power meter, power supplies and UPS data by
    calling get_temperature_info, _parse_sensor_fans, get_ipmi_fans,
    get_ipmi_power and get_ups_info. None of the static collectors are invoked.

    Each source is queried best-effort: if one raises, its field keeps the
    default value and the remaining sources are still queried.

    Returns:
        dict with keys 'temperatures' (list), 'fans' (list), 'power_meter'
        (value or None), 'power_supplies' (list) and 'ups' (value or None).
    """
    live = {
        'temperatures': [],
        'fans': [],
        'power_meter': None,
        'power_supplies': [],
        'ups': None,
    }

    # Temperatures; the same call may also report a power meter.
    try:
        temps = get_temperature_info()
        live['temperatures'] = temps.get('temperatures', [])
        live['power_meter'] = temps.get('power_meter')
    except Exception:
        pass

    # Fans: readings from `sensors` first, then the IPMI ones appended.
    fans_from_sensors = []
    try:
        fans_from_sensors = _parse_sensor_fans(get_cached_sensors_output())
    except Exception:
        pass
    fans_from_ipmi = []
    try:
        fans_from_ipmi = get_ipmi_fans()
    except Exception:
        pass
    live['fans'] = fans_from_sensors + fans_from_ipmi

    # Power supplies from IPMI; its power meter is only a fallback.
    try:
        psu_info = get_ipmi_power()
        if psu_info:
            live['power_supplies'] = psu_info.get('power_supplies', [])
            # Fallback: if sensors didn't provide a power_meter, use IPMI's
            if live['power_meter'] is None and psu_info.get('power_meter'):
                live['power_meter'] = psu_info['power_meter']
    except Exception:
        pass

    # UPS status, only recorded when something truthy is reported.
    try:
        ups_status = get_ups_info()
        if ups_status:
            live['ups'] = ups_status
    except Exception:
        pass

    return live
def get_temperature_info():
@@ -6102,52 +6237,8 @@ def get_hardware_info():
pass
try:
sensors_output = get_cached_sensors_output()
if sensors_output:
current_adapter = None
current_chip = None # Add chip name tracking
fans = []
for line in sensors_output.split('\n'):
line = line.strip()
if not line:
continue
# Chip names don't have ":" and are not indented
if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'):
current_chip = line
continue
# Detect adapter line
if line.startswith('Adapter:'):
current_adapter = line.replace('Adapter:', '').strip()
continue
# Parse fan sensors
if ':' in line and not line.startswith(' '):
parts = line.split(':', 1)
sensor_name = parts[0].strip()
value_part = parts[1].strip()
# Look for fan sensors (RPM)
if 'RPM' in value_part:
rpm_match = re.search(r'([\d.]+)\s*RPM', value_part)
if rpm_match:
fan_speed = int(float(rpm_match.group(1)))
identified_name = identify_fan(sensor_name, current_adapter, current_chip)
fans.append({
'name': identified_name,
'original_name': sensor_name,
'speed': fan_speed,
'unit': 'RPM',
'adapter': current_adapter
})
hardware_data['sensors']['fans'] = fans
except Exception as e:
# print(f"[v0] Error getting fan sensors: {e}")
hardware_data['sensors']['fans'] = _parse_sensor_fans(get_cached_sensors_output())
except Exception:
pass
# Power Supply / UPS
@@ -6226,7 +6317,9 @@ def get_hardware_info():
def api_system():
"""Get system information including CPU, memory, and temperature"""
try:
cpu_usage = psutil.cpu_percent(interval=0.5)
# Non-blocking: returns %CPU since the last psutil call (sampler or prior API hit).
# The background vital-signs sampler keeps psutil's internal state primed.
cpu_usage = psutil.cpu_percent(interval=0)
memory = psutil.virtual_memory()
memory_used_gb = memory.used / (1024 ** 3)
@@ -9286,6 +9379,23 @@ def api_hardware():
traceback.print_exc()
return jsonify({'error': str(e)}), 500
@app.route('/api/hardware/live', methods=['GET'])
@require_auth
def api_hardware_live():
    """GET /api/hardware/live - dynamic hardware readings only.

    Returns the dict built by get_hardware_live_info() as JSON (temperatures,
    fans, power and UPS), so the Hardware page can poll it frequently without
    triggering the full static hardware collection.

    On any exception the traceback is printed and a JSON body of the form
    {'error': <message>} is returned with HTTP status 500.
    """
    try:
        live_payload = get_hardware_live_info()
        return jsonify(live_payload)
    except Exception as exc:
        import traceback
        traceback.print_exc()
        error_body = {'error': str(exc)}
        return jsonify(error_body), 500
@app.route('/api/gpu/<slot>/realtime', methods=['GET'])
@require_auth
def api_gpu_realtime(slot):
@@ -9526,8 +9636,11 @@ def api_vm_control(vmid):
control_result = subprocess.run(
['pvesh', 'create', f'/nodes/{node}/{vm_type}/{vmid}/status/{action}'],
capture_output=True, text=True, timeout=30)
if control_result.returncode == 0:
# Invalidate VM resources cache so the next /api/vms call
# returns fresh status instead of the pre-action snapshot.
_pvesh_cache['cluster_resources_vm_time'] = 0
return jsonify({
'success': True,
'vmid': vmid,

View File

@@ -67,7 +67,7 @@ class HealthMonitor:
# Memory Thresholds
MEMORY_WARNING = 85
MEMORY_CRITICAL = 95
MEMORY_DURATION = 60
MEMORY_DURATION = 300 # 5 minutes sustained (aligned with CPU)
SWAP_WARNING_DURATION = 300
SWAP_CRITICAL_PERCENT = 5
SWAP_CRITICAL_DURATION = 120
@@ -402,6 +402,30 @@ class HealthMonitor:
except Exception:
pass # Sampling must never crash the thread
def _sample_memory_usage(self):
    """Record one RAM/swap usage sample in self.state_history['memory_usage'].

    Each sample stores the RAM percentage, the swap percentage (0 when no
    swap is configured), swap usage expressed as a percentage of total RAM
    (0 when total RAM is reported as 0) and the sample timestamp. Samples
    older than 10 minutes are dropped after every append.

    All exceptions are swallowed: sampling must never crash the calling thread.
    """
    try:
        ram = psutil.virtual_memory()
        swp = psutil.swap_memory()
        now = time.time()

        sample = {
            'mem_percent': ram.percent,
            'swap_percent': swp.percent if swp.total > 0 else 0,
            'swap_vs_ram': (swp.used / ram.total * 100) if ram.total > 0 else 0,
            'time': now
        }

        history = self.state_history['memory_usage']
        history.append(sample)

        # Keep only the last 10 minutes (600 s) of samples.
        self.state_history['memory_usage'] = [
            item for item in history
            if now - item['time'] < 600
        ]
    except Exception:
        pass  # Sampling must never crash the thread
def _sample_cpu_temperature(self):
"""Lightweight temperature sample: read sensor and append to history. ~50ms cost."""
try:
@@ -1050,34 +1074,46 @@ class HealthMonitor:
if current_time - entry['time'] < 600
]
mem_critical = sum(
1 for entry in self.state_history[state_key]
mem_critical_samples = [
entry for entry in self.state_history[state_key]
if entry['mem_percent'] >= 90 and
current_time - entry['time'] <= self.MEMORY_DURATION
)
mem_warning = sum(
1 for entry in self.state_history[state_key]
]
mem_warning_samples = [
entry for entry in self.state_history[state_key]
if entry['mem_percent'] >= self.MEMORY_WARNING and
current_time - entry['time'] <= self.MEMORY_DURATION
)
]
swap_critical = sum(
1 for entry in self.state_history[state_key]
if entry['swap_vs_ram'] > 20 and
current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION
)
if mem_critical >= 2:
# Require sustained high usage across most of the 300s window.
# With ~30s sampling: 300s = ~10 samples, so 8 ≈ 240s sustained.
# Mirrors CPU's ~83% coverage threshold (25/30).
MEM_CRITICAL_MIN_SAMPLES = 8
MEM_WARNING_MIN_SAMPLES = 8
mem_critical_count = len(mem_critical_samples)
mem_warning_count = len(mem_warning_samples)
if mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES:
oldest = min(s['time'] for s in mem_critical_samples)
actual_duration = int(current_time - oldest)
status = 'CRITICAL'
reason = f'RAM >90% for {self.MEMORY_DURATION}s'
reason = f'RAM >90% sustained for {actual_duration}s'
elif swap_critical >= 2:
status = 'CRITICAL'
reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)'
elif mem_warning >= 2:
elif mem_warning_count >= MEM_WARNING_MIN_SAMPLES:
oldest = min(s['time'] for s in mem_warning_samples)
actual_duration = int(current_time - oldest)
status = 'WARNING'
reason = f'RAM >{self.MEMORY_WARNING}% for {self.MEMORY_DURATION}s'
reason = f'RAM >{self.MEMORY_WARNING}% sustained for {actual_duration}s'
else:
status = 'OK'
reason = None
@@ -1088,7 +1124,7 @@ class HealthMonitor:
swap_total_gb = round(swap.total / (1024**3), 2)
# Determine per-sub-check status
ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK')
ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning_count >= MEM_WARNING_MIN_SAMPLES else 'OK')
swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK'
result = {