update health_monitor.py

This commit is contained in:
MacRimi
2026-04-17 16:39:08 +02:00
parent ffadb2c508
commit 039e35f3c5
5 changed files with 353 additions and 221 deletions

View File

@@ -162,43 +162,41 @@ const groupAndSortTemperatures = (temperatures: any[]) => {
} }
export default function Hardware() { export default function Hardware() {
// Static data - load once without refresh // Static data - loaded once on mount. Static fields (CPU, motherboard, memory
// modules, PCI, disks, GPU list) don't change at runtime, so no auto-refresh.
// `mutateStatic` is triggered explicitly after GPU switch-mode changes.
const { const {
data: staticHardwareData, data: staticHardwareData,
error: staticError, error: staticError,
isLoading: staticLoading, isLoading: staticLoading,
mutate: mutateStatic,
} = useSWR<HardwareData>("/api/hardware", swrFetcher, { } = useSWR<HardwareData>("/api/hardware", swrFetcher, {
revalidateOnFocus: false, revalidateOnFocus: false,
revalidateOnReconnect: false, revalidateOnReconnect: false,
refreshInterval: 0, // No auto-refresh for static data refreshInterval: 0,
}) })
// Dynamic data - refresh every 5 seconds for temperatures, fans, power, ups // Live data - only temperatures, fans, power, UPS. Polled every 5s.
// Backend /api/hardware/live uses cached ipmitool output (10s) so this is cheap.
const { const {
data: dynamicHardwareData, data: dynamicHardwareData,
error: dynamicError, error: dynamicError,
isLoading: dynamicLoading, } = useSWR<HardwareData>("/api/hardware/live", swrFetcher, {
} = useSWR<HardwareData>("/api/hardware", swrFetcher, { refreshInterval: 5000,
refreshInterval: 7000, revalidateOnFocus: true,
revalidateOnReconnect: true,
dedupingInterval: 2000,
}) })
// Merge static and dynamic data, preferring static for CPU/memory/PCI/disks // Merge: static fields from initial load, live fields from the 5s poll.
const hardwareData = staticHardwareData const hardwareData = staticHardwareData
? { ? {
...dynamicHardwareData, ...staticHardwareData,
// Keep static data from initial load temperatures: dynamicHardwareData?.temperatures ?? staticHardwareData.temperatures,
cpu: staticHardwareData.cpu, fans: dynamicHardwareData?.fans ?? staticHardwareData.fans,
motherboard: staticHardwareData.motherboard, power_meter: dynamicHardwareData?.power_meter ?? staticHardwareData.power_meter,
memory_modules: staticHardwareData.memory_modules, power_supplies: dynamicHardwareData?.power_supplies ?? staticHardwareData.power_supplies,
pci_devices: staticHardwareData.pci_devices, ups: dynamicHardwareData?.ups ?? staticHardwareData.ups,
storage_devices: staticHardwareData.storage_devices,
gpus: staticHardwareData.gpus,
// Use dynamic data for these
temperatures: dynamicHardwareData?.temperatures,
fans: dynamicHardwareData?.fans,
power_meter: dynamicHardwareData?.power_meter,
power_supplies: dynamicHardwareData?.power_supplies,
ups: dynamicHardwareData?.ups,
} }
: undefined : undefined
@@ -239,21 +237,6 @@ export default function Hardware() {
const [showSwitchModeModal, setShowSwitchModeModal] = useState(false) const [showSwitchModeModal, setShowSwitchModeModal] = useState(false)
const [switchModeParams, setSwitchModeParams] = useState<{ gpuSlot: string; targetMode: "lxc" | "vm" } | null>(null) const [switchModeParams, setSwitchModeParams] = useState<{ gpuSlot: string; targetMode: "lxc" | "vm" } | null>(null)
const fetcher = async (url: string) => {
const data = await fetchApi(url)
return data
}
const {
data: hardwareDataSWR,
error: swrError,
isLoading: swrLoading,
mutate: mutateHardware,
} = useSWR<HardwareData>("/api/hardware", fetcher, {
refreshInterval: 30000,
revalidateOnFocus: false,
})
// Determine GPU mode based on driver (vfio-pci = VM, native driver = LXC) // Determine GPU mode based on driver (vfio-pci = VM, native driver = LXC)
const getGpuSwitchMode = (gpu: GPU): "lxc" | "vm" | "unknown" => { const getGpuSwitchMode = (gpu: GPU): "lxc" | "vm" | "unknown" => {
const driver = gpu.pci_driver?.toLowerCase() || "" const driver = gpu.pci_driver?.toLowerCase() || ""
@@ -304,7 +287,7 @@ export default function Hardware() {
const handleSwitchModeSave = (gpuSlot: string, e: React.MouseEvent) => { const handleSwitchModeSave = (gpuSlot: string, e: React.MouseEvent) => {
e.stopPropagation() e.stopPropagation()
const pendingMode = pendingSwitchModes[gpuSlot] const pendingMode = pendingSwitchModes[gpuSlot]
const gpu = hardwareDataSWR?.gpus?.find(g => g.slot === gpuSlot) const gpu = hardwareData?.gpus?.find(g => g.slot === gpuSlot)
const currentMode = gpu ? getGpuSwitchMode(gpu) : "unknown" const currentMode = gpu ? getGpuSwitchMode(gpu) : "unknown"
if (pendingMode && pendingMode !== currentMode && gpu) { if (pendingMode && pendingMode !== currentMode && gpu) {
@@ -333,7 +316,7 @@ export default function Hardware() {
setSwitchModeParams(null) setSwitchModeParams(null)
setPendingSwitchModes({}) setPendingSwitchModes({})
// Refresh hardware data // Refresh hardware data
mutateHardware() mutateStatic()
} }
const handleInstallNvidiaDriver = () => { const handleInstallNvidiaDriver = () => {
@@ -391,14 +374,14 @@ export default function Hardware() {
} }
const findPCIDeviceForGPU = (gpu: GPU): PCIDevice | null => { const findPCIDeviceForGPU = (gpu: GPU): PCIDevice | null => {
if (!hardwareDataSWR?.pci_devices || !gpu.slot) return null if (!hardwareData?.pci_devices || !gpu.slot) return null
// Try to find exact match first (e.g., "00:02.0") // Try to find exact match first (e.g., "00:02.0")
let pciDevice = hardwareDataSWR.pci_devices.find((d) => d.slot === gpu.slot) let pciDevice = hardwareData.pci_devices.find((d) => d.slot === gpu.slot)
// If not found, try to match by partial slot (e.g., "00" matches "00:02.0") // If not found, try to match by partial slot (e.g., "00" matches "00:02.0")
if (!pciDevice && gpu.slot.length <= 2) { if (!pciDevice && gpu.slot.length <= 2) {
pciDevice = hardwareDataSWR.pci_devices.find( pciDevice = hardwareData.pci_devices.find(
(d) => (d) =>
d.slot.startsWith(gpu.slot + ":") && d.slot.startsWith(gpu.slot + ":") &&
(d.type.toLowerCase().includes("vga") || (d.type.toLowerCase().includes("vga") ||
@@ -417,7 +400,7 @@ export default function Hardware() {
return realtimeGPUData.has_monitoring_tool === true return realtimeGPUData.has_monitoring_tool === true
} }
if (swrLoading) { if (isLoading) {
return ( return (
<div className="flex flex-col items-center justify-center min-h-[400px] gap-4"> <div className="flex flex-col items-center justify-center min-h-[400px] gap-4">
<div className="relative"> <div className="relative">
@@ -433,7 +416,7 @@ export default function Hardware() {
return ( return (
<div className="space-y-6"> <div className="space-y-6">
{/* System Information - CPU & Motherboard */} {/* System Information - CPU & Motherboard */}
{(hardwareDataSWR?.cpu || hardwareDataSWR?.motherboard) && ( {(hardwareData?.cpu || hardwareData?.motherboard) && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<Cpu className="h-5 w-5 text-primary" /> <Cpu className="h-5 w-5 text-primary" />
@@ -442,44 +425,44 @@ export default function Hardware() {
<div className="grid gap-6 md:grid-cols-2"> <div className="grid gap-6 md:grid-cols-2">
{/* CPU Info */} {/* CPU Info */}
{hardwareDataSWR?.cpu && Object.keys(hardwareDataSWR.cpu).length > 0 && ( {hardwareData?.cpu && Object.keys(hardwareData.cpu).length > 0 && (
<div> <div>
<div className="mb-2 flex items-center gap-2"> <div className="mb-2 flex items-center gap-2">
<CpuIcon className="h-4 w-4 text-muted-foreground" /> <CpuIcon className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">CPU</h3> <h3 className="text-sm font-semibold">CPU</h3>
</div> </div>
<div className="space-y-2"> <div className="space-y-2">
{hardwareDataSWR.cpu.model && ( {hardwareData.cpu.model && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Model</span> <span className="text-muted-foreground">Model</span>
<span className="font-medium text-right">{hardwareDataSWR.cpu.model}</span> <span className="font-medium text-right">{hardwareData.cpu.model}</span>
</div> </div>
)} )}
{hardwareDataSWR.cpu.cores_per_socket && hardwareDataSWR.cpu.sockets && ( {hardwareData.cpu.cores_per_socket && hardwareData.cpu.sockets && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Cores</span> <span className="text-muted-foreground">Cores</span>
<span className="font-medium"> <span className="font-medium">
{hardwareDataSWR.cpu.sockets} × {hardwareDataSWR.cpu.cores_per_socket} ={" "} {hardwareData.cpu.sockets} × {hardwareData.cpu.cores_per_socket} ={" "}
{hardwareDataSWR.cpu.sockets * hardwareDataSWR.cpu.cores_per_socket} cores {hardwareData.cpu.sockets * hardwareData.cpu.cores_per_socket} cores
</span> </span>
</div> </div>
)} )}
{hardwareDataSWR.cpu.total_threads && ( {hardwareData.cpu.total_threads && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Threads</span> <span className="text-muted-foreground">Threads</span>
<span className="font-medium">{hardwareDataSWR.cpu.total_threads}</span> <span className="font-medium">{hardwareData.cpu.total_threads}</span>
</div> </div>
)} )}
{hardwareDataSWR.cpu.l3_cache && ( {hardwareData.cpu.l3_cache && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">L3 Cache</span> <span className="text-muted-foreground">L3 Cache</span>
<span className="font-medium">{hardwareDataSWR.cpu.l3_cache}</span> <span className="font-medium">{hardwareData.cpu.l3_cache}</span>
</div> </div>
)} )}
{hardwareDataSWR.cpu.virtualization && ( {hardwareData.cpu.virtualization && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Virtualization</span> <span className="text-muted-foreground">Virtualization</span>
<span className="font-medium">{hardwareDataSWR.cpu.virtualization}</span> <span className="font-medium">{hardwareData.cpu.virtualization}</span>
</div> </div>
)} )}
</div> </div>
@@ -487,41 +470,41 @@ export default function Hardware() {
)} )}
{/* Motherboard Info */} {/* Motherboard Info */}
{hardwareDataSWR?.motherboard && Object.keys(hardwareDataSWR.motherboard).length > 0 && ( {hardwareData?.motherboard && Object.keys(hardwareData.motherboard).length > 0 && (
<div> <div>
<div className="mb-2 flex items-center gap-2"> <div className="mb-2 flex items-center gap-2">
<Cpu className="h-4 w-4 text-muted-foreground" /> <Cpu className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">Motherboard</h3> <h3 className="text-sm font-semibold">Motherboard</h3>
</div> </div>
<div className="space-y-2"> <div className="space-y-2">
{hardwareDataSWR.motherboard.manufacturer && ( {hardwareData.motherboard.manufacturer && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Manufacturer</span> <span className="text-muted-foreground">Manufacturer</span>
<span className="font-medium text-right">{hardwareDataSWR.motherboard.manufacturer}</span> <span className="font-medium text-right">{hardwareData.motherboard.manufacturer}</span>
</div> </div>
)} )}
{hardwareDataSWR.motherboard.model && ( {hardwareData.motherboard.model && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Model</span> <span className="text-muted-foreground">Model</span>
<span className="font-medium text-right">{hardwareDataSWR.motherboard.model}</span> <span className="font-medium text-right">{hardwareData.motherboard.model}</span>
</div> </div>
)} )}
{hardwareDataSWR.motherboard.bios?.vendor && ( {hardwareData.motherboard.bios?.vendor && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">BIOS</span> <span className="text-muted-foreground">BIOS</span>
<span className="font-medium text-right">{hardwareDataSWR.motherboard.bios.vendor}</span> <span className="font-medium text-right">{hardwareData.motherboard.bios.vendor}</span>
</div> </div>
)} )}
{hardwareDataSWR.motherboard.bios?.version && ( {hardwareData.motherboard.bios?.version && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Version</span> <span className="text-muted-foreground">Version</span>
<span className="font-medium">{hardwareDataSWR.motherboard.bios.version}</span> <span className="font-medium">{hardwareData.motherboard.bios.version}</span>
</div> </div>
)} )}
{hardwareDataSWR.motherboard.bios?.date && ( {hardwareData.motherboard.bios?.date && (
<div className="flex justify-between text-sm"> <div className="flex justify-between text-sm">
<span className="text-muted-foreground">Date</span> <span className="text-muted-foreground">Date</span>
<span className="font-medium">{hardwareDataSWR.motherboard.bios.date}</span> <span className="font-medium">{hardwareData.motherboard.bios.date}</span>
</div> </div>
)} )}
</div> </div>
@@ -532,18 +515,18 @@ export default function Hardware() {
)} )}
{/* Memory Modules */} {/* Memory Modules */}
{hardwareDataSWR?.memory_modules && hardwareDataSWR.memory_modules.length > 0 && ( {hardwareData?.memory_modules && hardwareData.memory_modules.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<MemoryStick className="h-5 w-5 text-primary" /> <MemoryStick className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Memory Modules</h2> <h2 className="text-lg font-semibold">Memory Modules</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.memory_modules.length} installed {hardwareData.memory_modules.length} installed
</Badge> </Badge>
</div> </div>
<div className="grid gap-3 md:grid-cols-2 lg:grid-cols-3"> <div className="grid gap-3 md:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.memory_modules.map((module, index) => ( {hardwareData.memory_modules.map((module, index) => (
<div key={index} className="rounded-lg border border-border/30 bg-background/60 p-4"> <div key={index} className="rounded-lg border border-border/30 bg-background/60 p-4">
<div className="mb-2 font-medium text-sm">{module.slot}</div> <div className="mb-2 font-medium text-sm">{module.slot}</div>
<div className="space-y-1"> <div className="space-y-1">
@@ -590,29 +573,29 @@ export default function Hardware() {
)} )}
{/* Thermal Monitoring */} {/* Thermal Monitoring */}
{hardwareDataSWR?.temperatures && hardwareDataSWR.temperatures.length > 0 && ( {hardwareData?.temperatures && hardwareData.temperatures.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<Thermometer className="h-5 w-5 text-primary" /> <Thermometer className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Thermal Monitoring</h2> <h2 className="text-lg font-semibold">Thermal Monitoring</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.temperatures.length} sensors {hardwareData.temperatures.length} sensors
</Badge> </Badge>
</div> </div>
<div className="grid gap-6 md:grid-cols-2"> <div className="grid gap-6 md:grid-cols-2">
{/* CPU Sensors */} {/* CPU Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.length > 0 && ( {groupAndSortTemperatures(hardwareData.temperatures).CPU.length > 0 && (
<div className="md:col-span-2"> <div className="md:col-span-2">
<div className="mb-3 flex items-center gap-2"> <div className="mb-3 flex items-center gap-2">
<CpuIcon className="h-4 w-4 text-muted-foreground" /> <CpuIcon className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">CPU</h3> <h3 className="text-sm font-semibold">CPU</h3>
<Badge variant="outline" className="text-xs"> <Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.length} {groupAndSortTemperatures(hardwareData.temperatures).CPU.length}
</Badge> </Badge>
</div> </div>
<div className="grid gap-4 md:grid-cols-2"> <div className="grid gap-4 md:grid-cols-2">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.map((temp, index) => { {groupAndSortTemperatures(hardwareData.temperatures).CPU.map((temp, index) => {
const percentage = const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80) const isHot = temp.current > (temp.high || 80)
@@ -643,21 +626,21 @@ export default function Hardware() {
)} )}
{/* GPU Sensors */} {/* GPU Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 0 && ( {groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 0 && (
<div <div
className={groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 1 ? "md:col-span-2" : ""} className={groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 1 ? "md:col-span-2" : ""}
> >
<div className="mb-3 flex items-center gap-2"> <div className="mb-3 flex items-center gap-2">
<Gpu className="h-4 w-4 text-muted-foreground" /> <Gpu className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">GPU</h3> <h3 className="text-sm font-semibold">GPU</h3>
<Badge variant="outline" className="text-xs"> <Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length} {groupAndSortTemperatures(hardwareData.temperatures).GPU.length}
</Badge> </Badge>
</div> </div>
<div <div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 1 ? "md:grid-cols-2" : ""}`} className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 1 ? "md:grid-cols-2" : ""}`}
> >
{groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.map((temp, index) => { {groupAndSortTemperatures(hardwareData.temperatures).GPU.map((temp, index) => {
const percentage = const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80) const isHot = temp.current > (temp.high || 80)
@@ -688,23 +671,23 @@ export default function Hardware() {
)} )}
{/* NVME Sensors */} {/* NVME Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 0 && ( {groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 0 && (
<div <div
className={ className={
groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 1 ? "md:col-span-2" : "" groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 1 ? "md:col-span-2" : ""
} }
> >
<div className="mb-3 flex items-center gap-2"> <div className="mb-3 flex items-center gap-2">
<HardDrive className="h-4 w-4 text-muted-foreground" /> <HardDrive className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">NVME</h3> <h3 className="text-sm font-semibold">NVME</h3>
<Badge variant="outline" className="text-xs"> <Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length} {groupAndSortTemperatures(hardwareData.temperatures).NVME.length}
</Badge> </Badge>
</div> </div>
<div <div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 1 ? "md:grid-cols-2" : ""}`} className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 1 ? "md:grid-cols-2" : ""}`}
> >
{groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.map((temp, index) => { {groupAndSortTemperatures(hardwareData.temperatures).NVME.map((temp, index) => {
const percentage = const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80) const isHot = temp.current > (temp.high || 80)
@@ -735,21 +718,21 @@ export default function Hardware() {
)} )}
{/* PCI Sensors */} {/* PCI Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 0 && ( {groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 0 && (
<div <div
className={groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 1 ? "md:col-span-2" : ""} className={groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 1 ? "md:col-span-2" : ""}
> >
<div className="mb-3 flex items-center gap-2"> <div className="mb-3 flex items-center gap-2">
<CpuIcon className="h-4 w-4 text-muted-foreground" /> <CpuIcon className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">PCI</h3> <h3 className="text-sm font-semibold">PCI</h3>
<Badge variant="outline" className="text-xs"> <Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length} {groupAndSortTemperatures(hardwareData.temperatures).PCI.length}
</Badge> </Badge>
</div> </div>
<div <div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 1 ? "md:grid-cols-2" : ""}`} className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 1 ? "md:grid-cols-2" : ""}`}
> >
{groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.map((temp, index) => { {groupAndSortTemperatures(hardwareData.temperatures).PCI.map((temp, index) => {
const percentage = const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80) const isHot = temp.current > (temp.high || 80)
@@ -780,23 +763,23 @@ export default function Hardware() {
)} )}
{/* OTHER Sensors */} {/* OTHER Sensors */}
{groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 0 && ( {groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 0 && (
<div <div
className={ className={
groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 1 ? "md:col-span-2" : "" groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 1 ? "md:col-span-2" : ""
} }
> >
<div className="mb-3 flex items-center gap-2"> <div className="mb-3 flex items-center gap-2">
<Thermometer className="h-4 w-4 text-muted-foreground" /> <Thermometer className="h-4 w-4 text-muted-foreground" />
<h3 className="text-sm font-semibold">OTHER</h3> <h3 className="text-sm font-semibold">OTHER</h3>
<Badge variant="outline" className="text-xs"> <Badge variant="outline" className="text-xs">
{groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length} {groupAndSortTemperatures(hardwareData.temperatures).OTHER.length}
</Badge> </Badge>
</div> </div>
<div <div
className={`grid gap-4 ${groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 1 ? "md:grid-cols-2" : ""}`} className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 1 ? "md:grid-cols-2" : ""}`}
> >
{groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.map((temp, index) => { {groupAndSortTemperatures(hardwareData.temperatures).OTHER.map((temp, index) => {
const percentage = const percentage =
temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100
const isHot = temp.current > (temp.high || 80) const isHot = temp.current > (temp.high || 80)
@@ -830,18 +813,18 @@ export default function Hardware() {
)} )}
{/* GPU Information - Enhanced with on-demand data fetching */} {/* GPU Information - Enhanced with on-demand data fetching */}
{hardwareDataSWR?.gpus && hardwareDataSWR.gpus.length > 0 && ( {hardwareData?.gpus && hardwareData.gpus.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<Gpu className="h-5 w-5 text-primary" /> <Gpu className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Graphics Cards</h2> <h2 className="text-lg font-semibold">Graphics Cards</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.gpus.length} GPU{hardwareDataSWR.gpus.length > 1 ? "s" : ""} {hardwareData.gpus.length} GPU{hardwareData.gpus.length > 1 ? "s" : ""}
</Badge> </Badge>
</div> </div>
<div className="grid gap-4 sm:grid-cols-2"> <div className="grid gap-4 sm:grid-cols-2">
{hardwareDataSWR.gpus.map((gpu, index) => { {hardwareData.gpus.map((gpu, index) => {
const pciDevice = findPCIDeviceForGPU(gpu) const pciDevice = findPCIDeviceForGPU(gpu)
const fullSlot = pciDevice?.slot || gpu.slot const fullSlot = pciDevice?.slot || gpu.slot
@@ -1324,7 +1307,7 @@ return (
</Dialog> </Dialog>
{/* Power Consumption */} {/* Power Consumption */}
{hardwareDataSWR?.power_meter && ( {hardwareData?.power_meter && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<Zap className="h-5 w-5 text-blue-500" /> <Zap className="h-5 w-5 text-blue-500" />
@@ -1334,13 +1317,13 @@ return (
<div className="space-y-4"> <div className="space-y-4">
<div className="flex items-center justify-between rounded-lg border border-border/30 bg-background/60 p-4"> <div className="flex items-center justify-between rounded-lg border border-border/30 bg-background/60 p-4">
<div className="space-y-1"> <div className="space-y-1">
<p className="text-sm font-medium">{hardwareDataSWR.power_meter.name}</p> <p className="text-sm font-medium">{hardwareData.power_meter.name}</p>
{hardwareDataSWR.power_meter.adapter && ( {hardwareData.power_meter.adapter && (
<p className="text-xs text-muted-foreground">{hardwareDataSWR.power_meter.adapter}</p> <p className="text-xs text-muted-foreground">{hardwareData.power_meter.adapter}</p>
)} )}
</div> </div>
<div className="text-right"> <div className="text-right">
<p className="text-2xl font-bold text-blue-500">{hardwareDataSWR.power_meter.watts.toFixed(1)} W</p> <p className="text-2xl font-bold text-blue-500">{hardwareData.power_meter.watts.toFixed(1)} W</p>
<p className="text-xs text-muted-foreground">Current Draw</p> <p className="text-xs text-muted-foreground">Current Draw</p>
</div> </div>
</div> </div>
@@ -1349,18 +1332,18 @@ return (
)} )}
{/* Power Supplies */} {/* Power Supplies */}
{hardwareDataSWR?.power_supplies && hardwareDataSWR.power_supplies.length > 0 && ( {hardwareData?.power_supplies && hardwareData.power_supplies.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<PowerIcon className="h-5 w-5 text-green-500" /> <PowerIcon className="h-5 w-5 text-green-500" />
<h2 className="text-lg font-semibold">Power Supplies</h2> <h2 className="text-lg font-semibold">Power Supplies</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.power_supplies.length} PSUs {hardwareData.power_supplies.length} PSUs
</Badge> </Badge>
</div> </div>
<div className="grid gap-3 md:grid-cols-2"> <div className="grid gap-3 md:grid-cols-2">
{hardwareDataSWR.power_supplies.map((psu, index) => ( {hardwareData.power_supplies.map((psu, index) => (
<div key={index} className="rounded-lg border border-border/30 bg-background/60 p-4"> <div key={index} className="rounded-lg border border-border/30 bg-background/60 p-4">
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
<span className="text-sm font-medium">{psu.name}</span> <span className="text-sm font-medium">{psu.name}</span>
@@ -1377,18 +1360,18 @@ return (
)} )}
{/* Fans */} {/* Fans */}
{hardwareDataSWR?.fans && hardwareDataSWR.fans.length > 0 && ( {hardwareData?.fans && hardwareData.fans.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<FanIcon className="h-5 w-5 text-primary" /> <FanIcon className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">System Fans</h2> <h2 className="text-lg font-semibold">System Fans</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.fans.length} fans {hardwareData.fans.length} fans
</Badge> </Badge>
</div> </div>
<div className="grid gap-4 md:grid-cols-2"> <div className="grid gap-4 md:grid-cols-2">
{hardwareDataSWR.fans.map((fan, index) => { {hardwareData.fans.map((fan, index) => {
const isPercentage = fan.unit === "percent" || fan.unit === "%" const isPercentage = fan.unit === "percent" || fan.unit === "%"
const percentage = isPercentage ? fan.speed : Math.min((fan.speed / 5000) * 100, 100) const percentage = isPercentage ? fan.speed : Math.min((fan.speed / 5000) * 100, 100)
@@ -1412,18 +1395,18 @@ return (
)} )}
{/* UPS */} {/* UPS */}
{hardwareDataSWR?.ups && Array.isArray(hardwareDataSWR.ups) && hardwareDataSWR.ups.length > 0 && ( {hardwareData?.ups && Array.isArray(hardwareData.ups) && hardwareData.ups.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<Battery className="h-5 w-5 text-primary" /> <Battery className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">UPS Status</h2> <h2 className="text-lg font-semibold">UPS Status</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.ups.length} UPS {hardwareData.ups.length} UPS
</Badge> </Badge>
</div> </div>
<div className="grid gap-4 md:grid-cols-2"> <div className="grid gap-4 md:grid-cols-2">
{hardwareDataSWR.ups.map((ups: any, index: number) => { {hardwareData.ups.map((ups: any, index: number) => {
const batteryCharge = const batteryCharge =
ups.battery_charge_raw || Number.parseFloat(ups.battery_charge?.replace("%", "") || "0") ups.battery_charge_raw || Number.parseFloat(ups.battery_charge?.replace("%", "") || "0")
const loadPercent = ups.load_percent_raw || Number.parseFloat(ups.load_percent?.replace("%", "") || "0") const loadPercent = ups.load_percent_raw || Number.parseFloat(ups.load_percent?.replace("%", "") || "0")
@@ -1694,18 +1677,18 @@ return (
</Dialog> </Dialog>
{/* PCI Devices - Changed to modal */} {/* PCI Devices - Changed to modal */}
{hardwareDataSWR?.pci_devices && hardwareDataSWR.pci_devices.length > 0 && ( {hardwareData?.pci_devices && hardwareData.pci_devices.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<CpuIcon className="h-5 w-5 text-primary" /> <CpuIcon className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">PCI Devices</h2> <h2 className="text-lg font-semibold">PCI Devices</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.pci_devices.length} devices {hardwareData.pci_devices.length} devices
</Badge> </Badge>
</div> </div>
<div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3"> <div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.pci_devices.map((device, index) => ( {hardwareData.pci_devices.map((device, index) => (
<div <div
key={index} key={index}
onClick={() => setSelectedPCIDevice(device)} onClick={() => setSelectedPCIDevice(device)}
@@ -1787,19 +1770,19 @@ return (
</Dialog> </Dialog>
{/* Network Summary - Clickable */} {/* Network Summary - Clickable */}
{hardwareDataSWR?.pci_devices && {hardwareData?.pci_devices &&
hardwareDataSWR.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length > 0 && ( hardwareData.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<Network className="h-5 w-5 text-primary" /> <Network className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Network Summary</h2> <h2 className="text-lg font-semibold">Network Summary</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{hardwareDataSWR.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length} interfaces {hardwareData.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length} interfaces
</Badge> </Badge>
</div> </div>
<div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3"> <div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.pci_devices {hardwareData.pci_devices
.filter((d) => d.type.toLowerCase().includes("network")) .filter((d) => d.type.toLowerCase().includes("network"))
.map((device, index) => ( .map((device, index) => (
<div <div
@@ -1879,14 +1862,14 @@ return (
</Dialog> </Dialog>
{/* Storage Summary - Clickable */} {/* Storage Summary - Clickable */}
{hardwareDataSWR?.storage_devices && hardwareDataSWR.storage_devices.length > 0 && ( {hardwareData?.storage_devices && hardwareData.storage_devices.length > 0 && (
<Card className="border-border/50 bg-card/50 p-6"> <Card className="border-border/50 bg-card/50 p-6">
<div className="mb-4 flex items-center gap-2"> <div className="mb-4 flex items-center gap-2">
<HardDrive className="h-5 w-5 text-primary" /> <HardDrive className="h-5 w-5 text-primary" />
<h2 className="text-lg font-semibold">Storage Summary</h2> <h2 className="text-lg font-semibold">Storage Summary</h2>
<Badge variant="outline" className="ml-auto"> <Badge variant="outline" className="ml-auto">
{ {
hardwareDataSWR.storage_devices.filter( hardwareData.storage_devices.filter(
(device) => (device) =>
device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"), device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"),
).length ).length
@@ -1896,7 +1879,7 @@ return (
</div> </div>
<div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3"> <div className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
{hardwareDataSWR.storage_devices {hardwareData.storage_devices
.filter( .filter(
(device) => device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"), (device) => device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"),
) )
@@ -2239,12 +2222,12 @@ return (
description="Installing NVIDIA proprietary drivers for GPU monitoring..." description="Installing NVIDIA proprietary drivers for GPU monitoring..."
onClose={() => { onClose={() => {
setNvidiaSessionId(null) setNvidiaSessionId(null)
mutateHardware() mutateStatic()
}} }}
onComplete={(success) => { onComplete={(success) => {
console.log("[v0] NVIDIA installation completed:", success ? "success" : "failed") console.log("[v0] NVIDIA installation completed:", success ? "success" : "failed")
if (success) { if (success) {
mutateHardware() mutateStatic()
} }
}} }}
/> */} /> */}
@@ -2252,7 +2235,7 @@ return (
open={showNvidiaInstaller} open={showNvidiaInstaller}
onClose={() => { onClose={() => {
setShowNvidiaInstaller(false) setShowNvidiaInstaller(false)
mutateHardware() mutateStatic()
}} }}
scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/nvidia_installer.sh" scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/nvidia_installer.sh"
scriptName="nvidia_installer" scriptName="nvidia_installer"
@@ -2266,7 +2249,7 @@ return (
open={showAmdInstaller} open={showAmdInstaller}
onClose={() => { onClose={() => {
setShowAmdInstaller(false) setShowAmdInstaller(false)
mutateHardware() mutateStatic()
}} }}
scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/amd_gpu_tools.sh" scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/amd_gpu_tools.sh"
scriptName="amd_gpu_tools" scriptName="amd_gpu_tools"
@@ -2280,7 +2263,7 @@ title="AMD GPU Tools Installation"
open={showIntelInstaller} open={showIntelInstaller}
onClose={() => { onClose={() => {
setShowIntelInstaller(false) setShowIntelInstaller(false)
mutateHardware() mutateStatic()
}} }}
scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/intel_gpu_tools.sh" scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/intel_gpu_tools.sh"
scriptName="intel_gpu_tools" scriptName="intel_gpu_tools"

View File

@@ -222,7 +222,7 @@ export function SystemOverview() {
const systemInterval = setInterval(async () => { const systemInterval = setInterval(async () => {
const data = await fetchSystemData() const data = await fetchSystemData()
if (data) setSystemData(data) if (data) setSystemData(data)
}, 9000) }, 5000)
const vmInterval = setInterval(async () => { const vmInterval = setInterval(async () => {
const data = await fetchVMData() const data = await fetchVMData()

View File

@@ -295,10 +295,10 @@ export function VirtualMachines() {
isLoading, isLoading,
mutate, mutate,
} = useSWR<VMData[]>("/api/vms", fetcher, { } = useSWR<VMData[]>("/api/vms", fetcher, {
refreshInterval: 23000, refreshInterval: 5000,
revalidateOnFocus: false, revalidateOnFocus: true,
revalidateOnReconnect: true, revalidateOnReconnect: true,
dedupingInterval: 10000, dedupingInterval: 2000,
errorRetryCount: 2, errorRetryCount: 2,
}) })

View File

@@ -1088,43 +1088,50 @@ def _health_collector_loop():
def _vital_signs_sampler(): def _vital_signs_sampler():
"""Dedicated thread for rapid CPU & temperature sampling. """Dedicated thread for rapid CPU, memory & temperature sampling.
Runs independently of the 5-min health collector loop. Runs independently of the 5-min health collector loop.
- CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis) - CPU usage: sampled every 30s (10 samples in 5 min for sustained detection)
- Memory: sampled every 30s (10 samples in 5 min for sustained detection)
- Temperature: sampled every 15s (12 samples in 3 min for temporal logic) - Temperature: sampled every 15s (12 samples in 3 min for temporal logic)
Uses time.monotonic() to avoid drift. Uses time.monotonic() to avoid drift.
Staggered intervals: CPU at offset 0, Temp at offset 7s to avoid collision. Staggered intervals to avoid collision: CPU at 0, Temp at +7s, Mem at +15s.
""" """
from health_monitor import health_monitor from health_monitor import health_monitor
# Wait 15s after startup for sensors to be ready # Wait 15s after startup for sensors to be ready
time.sleep(15) time.sleep(15)
TEMP_INTERVAL = 15 # seconds (was 10s - reduced frequency by 33%) TEMP_INTERVAL = 15 # seconds (was 10s - reduced frequency by 33%)
CPU_INTERVAL = 30 # seconds CPU_INTERVAL = 30 # seconds
MEM_INTERVAL = 30 # seconds (aligned with CPU for sustained-RAM detection)
# Stagger: CPU starts immediately, Temp starts after 7s offset
# Stagger: CPU starts immediately, Temp after 7s, Mem after 15s
next_cpu = time.monotonic() next_cpu = time.monotonic()
next_temp = time.monotonic() + 7 next_temp = time.monotonic() + 7
next_mem = time.monotonic() + 15
print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)")
print("[ProxMenux] Vital signs sampler started (CPU: 30s, Mem: 30s, Temp: 15s)")
while True: while True:
try: try:
now = time.monotonic() now = time.monotonic()
if now >= next_temp: if now >= next_temp:
health_monitor._sample_cpu_temperature() health_monitor._sample_cpu_temperature()
next_temp = now + TEMP_INTERVAL next_temp = now + TEMP_INTERVAL
if now >= next_cpu: if now >= next_cpu:
health_monitor._sample_cpu_usage() health_monitor._sample_cpu_usage()
next_cpu = now + CPU_INTERVAL next_cpu = now + CPU_INTERVAL
if now >= next_mem:
health_monitor._sample_memory_usage()
next_mem = now + MEM_INTERVAL
# Sleep until the next earliest event (with 0.5s min to avoid busy-loop) # Sleep until the next earliest event (with 0.5s min to avoid busy-loop)
sleep_until = min(next_temp, next_cpu) - time.monotonic() sleep_until = min(next_temp, next_cpu, next_mem) - time.monotonic()
time.sleep(max(sleep_until, 0.5)) time.sleep(max(sleep_until, 0.5))
except Exception as e: except Exception as e:
print(f"[ProxMenux] Vital signs sampler error: {e}") print(f"[ProxMenux] Vital signs sampler error: {e}")
@@ -1160,7 +1167,7 @@ _pvesh_cache = {
'storage_list': None, 'storage_list': None,
'storage_list_time': 0, 'storage_list_time': 0,
} }
_PVESH_CACHE_TTL = 30 # 30 seconds - balances freshness with performance _PVESH_CACHE_TTL = 5 # 5 seconds - near real-time for active UI; pvesh local cost is ~200-400ms
# Cache for sensors output (temperature readings) # Cache for sensors output (temperature readings)
_sensors_cache = { _sensors_cache = {
@@ -1169,6 +1176,15 @@ _sensors_cache = {
} }
_SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly _SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly
# Cache for ipmitool sensor output (shared between fans, power supplies, power meter)
# ipmitool is slow (1-3s per call) and was called twice per /api/hardware hit.
# Read and written by get_cached_ipmi_sensors().
_ipmi_cache = {
    'output': None,  # cached `ipmitool sensor` stdout; None until something has been stored
    'time': 0,  # time.time() value recorded together with 'output'
    'unavailable': False,  # set True if ipmitool is missing, avoid retrying
}
_IPMI_CACHE_TTL = 10  # 10 seconds - maximum age of 'output' before ipmitool is run again
# Cache for hardware info (lspci, dmidecode, lsblk) # Cache for hardware info (lspci, dmidecode, lsblk)
_hardware_cache = { _hardware_cache = {
'lspci': None, 'lspci': None,
@@ -3820,13 +3836,42 @@ def get_proxmox_vms():
# Return empty array instead of error object - frontend expects array # Return empty array instead of error object - frontend expects array
return [] return []
def get_cached_ipmi_sensors():
    """Return the raw `ipmitool sensor` output, cached for _IPMI_CACHE_TTL seconds.

    Shared by the IPMI fan and power parsers so that one ipmitool run serves both.

    Failed runs (non-zero exit status, timeout, any other error) are throttled as
    well: the attempt time is recorded, so ipmitool is retried at most once per TTL
    instead of on every call. While throttled, the last good output is served, or
    '' if there has never been one.

    Returns:
        str: ipmitool stdout, or '' when nothing usable is available. If the
        ipmitool binary is missing (FileNotFoundError) this is remembered in
        _ipmi_cache['unavailable'] and '' is returned from then on without
        trying again.
    """
    now = time.time()

    if _ipmi_cache['unavailable']:
        return ''

    cached = _ipmi_cache['output']
    if cached is not None and now - _ipmi_cache['time'] < _IPMI_CACHE_TTL:
        return cached

    try:
        result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            _ipmi_cache['output'] = result.stdout
            _ipmi_cache['time'] = now
            return result.stdout
    except FileNotFoundError:
        # Binary not installed: no point in ever retrying.
        _ipmi_cache['unavailable'] = True
        return ''
    except Exception:
        # Timeout or other runtime failure: fall through to the throttled fallback.
        pass

    # The run failed. Remember the attempt so callers polling every few seconds
    # do not re-spawn a slow, failing ipmitool on each request.
    fallback = cached or ''
    _ipmi_cache['output'] = fallback
    _ipmi_cache['time'] = now
    return fallback
def get_ipmi_fans():
"""Get fan information from IPMI (uses cached sensor output)."""
fans = []
try:
output = get_cached_ipmi_sensors()
if output:
for line in output.split('\n'):
if 'fan' in line.lower() and '|' in line: if 'fan' in line.lower() and '|' in line:
parts = [p.strip() for p in line.split('|')] parts = [p.strip() for p in line.split('|')]
if len(parts) >= 3: if len(parts) >= 3:
@@ -3862,14 +3907,14 @@ def get_ipmi_fans():
return fans return fans
def get_ipmi_power(): def get_ipmi_power():
"""Get power supply information from IPMI""" """Get power supply information from IPMI (uses cached sensor output)."""
power_supplies = [] power_supplies = []
power_meter = None power_meter = None
try: try:
result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10) output = get_cached_ipmi_sensors()
if result.returncode == 0: if output:
for line in result.stdout.split('\n'): for line in output.split('\n'):
if ('power supply' in line.lower() or 'power meter' in line.lower()) and '|' in line: if ('power supply' in line.lower() or 'power meter' in line.lower()) and '|' in line:
parts = [p.strip() for p in line.split('|')] parts = [p.strip() for p in line.split('|')]
if len(parts) >= 3: if len(parts) >= 3:
@@ -4202,7 +4247,97 @@ def identify_fan(sensor_name, adapter, chip_name=None):
return sensor_name return sensor_name
# Default: return original name # Default: return original name
return sensor_name return sensor_name
def _parse_sensor_fans(sensors_output):
"""Parse fan entries from `sensors` output. Extracted for reuse between
get_hardware_info (static full payload) and get_hardware_live_info (live endpoint)."""
fans = []
if not sensors_output:
return fans
current_adapter = None
current_chip = None
for line in sensors_output.split('\n'):
line = line.strip()
if not line:
continue
if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'):
current_chip = line
continue
if line.startswith('Adapter:'):
current_adapter = line.replace('Adapter:', '').strip()
continue
if ':' in line and not line.startswith(' '):
parts = line.split(':', 1)
sensor_name = parts[0].strip()
value_part = parts[1].strip()
if 'RPM' in value_part:
rpm_match = re.search(r'([\d.]+)\s*RPM', value_part)
if rpm_match:
fan_speed = int(float(rpm_match.group(1)))
identified_name = identify_fan(sensor_name, current_adapter, current_chip)
fans.append({
'name': identified_name,
'original_name': sensor_name,
'speed': fan_speed,
'unit': 'RPM',
'adapter': current_adapter
})
return fans
def get_hardware_live_info():
    """Collect only the dynamic hardware fields served by /api/hardware/live.

    Every collector is best-effort: if one raises, its field keeps the default
    below and the remaining collectors still run.

    Returns:
        dict with the keys 'temperatures' (list), 'fans' (list: sensors fans
        followed by IPMI fans), 'power_meter' (value from get_temperature_info,
        else the IPMI one, else None), 'power_supplies' (list) and 'ups'
        (result of get_ups_info when truthy, else None).
    """
    live = {
        'temperatures': [],
        'fans': [],
        'power_meter': None,
        'power_supplies': [],
        'ups': None,
    }

    def _or_empty(collector):
        # Run one fan collector; any failure counts as "no fans".
        try:
            return collector()
        except Exception:
            return []

    # Temperatures, plus the power meter reported alongside them.
    try:
        temps = get_temperature_info()
        live['temperatures'] = temps.get('temperatures', [])
        live['power_meter'] = temps.get('power_meter')
    except Exception:
        pass

    # Fans: lm-sensors readings first, then IPMI readings.
    from_sensors = _or_empty(lambda: _parse_sensor_fans(get_cached_sensors_output()))
    from_ipmi = _or_empty(lambda: get_ipmi_fans())
    live['fans'] = from_sensors + from_ipmi

    # Power supplies; IPMI's power meter is only a fallback.
    try:
        power = get_ipmi_power()
        if power:
            live['power_supplies'] = power.get('power_supplies', [])
            if live['power_meter'] is None and power.get('power_meter'):
                live['power_meter'] = power['power_meter']
    except Exception:
        pass

    # UPS: keep None unless the collector returned something truthy.
    try:
        ups = get_ups_info()
        if ups:
            live['ups'] = ups
    except Exception:
        pass

    return live
def get_temperature_info(): def get_temperature_info():
@@ -6102,52 +6237,8 @@ def get_hardware_info():
pass pass
try: try:
sensors_output = get_cached_sensors_output() hardware_data['sensors']['fans'] = _parse_sensor_fans(get_cached_sensors_output())
if sensors_output: except Exception:
current_adapter = None
current_chip = None # Add chip name tracking
fans = []
for line in sensors_output.split('\n'):
line = line.strip()
if not line:
continue
# Chip names don't have ":" and are not indented
if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'):
current_chip = line
continue
# Detect adapter line
if line.startswith('Adapter:'):
current_adapter = line.replace('Adapter:', '').strip()
continue
# Parse fan sensors
if ':' in line and not line.startswith(' '):
parts = line.split(':', 1)
sensor_name = parts[0].strip()
value_part = parts[1].strip()
# Look for fan sensors (RPM)
if 'RPM' in value_part:
rpm_match = re.search(r'([\d.]+)\s*RPM', value_part)
if rpm_match:
fan_speed = int(float(rpm_match.group(1)))
identified_name = identify_fan(sensor_name, current_adapter, current_chip)
fans.append({
'name': identified_name,
'original_name': sensor_name,
'speed': fan_speed,
'unit': 'RPM',
'adapter': current_adapter
})
hardware_data['sensors']['fans'] = fans
except Exception as e:
# print(f"[v0] Error getting fan sensors: {e}")
pass pass
# Power Supply / UPS # Power Supply / UPS
@@ -6226,7 +6317,9 @@ def get_hardware_info():
def api_system(): def api_system():
"""Get system information including CPU, memory, and temperature""" """Get system information including CPU, memory, and temperature"""
try: try:
cpu_usage = psutil.cpu_percent(interval=0.5) # Non-blocking: returns %CPU since the last psutil call (sampler or prior API hit).
# The background vital-signs sampler keeps psutil's internal state primed.
cpu_usage = psutil.cpu_percent(interval=0)
memory = psutil.virtual_memory() memory = psutil.virtual_memory()
memory_used_gb = memory.used / (1024 ** 3) memory_used_gb = memory.used / (1024 ** 3)
@@ -9286,6 +9379,23 @@ def api_hardware():
traceback.print_exc() traceback.print_exc()
return jsonify({'error': str(e)}), 500 return jsonify({'error': str(e)}), 500
@app.route('/api/hardware/live', methods=['GET'])
@require_auth
def api_hardware_live():
    """Serve only the dynamic hardware fields (temperatures, fans, power, UPS).

    Thin wrapper around get_hardware_live_info() so the Hardware page can poll
    frequently without triggering the full /api/hardware collection.

    Returns:
        The JSON payload from get_hardware_live_info(); on any exception the
        traceback is printed and {'error': <message>} is returned with HTTP 500.
    """
    try:
        live_payload = get_hardware_live_info()
        return jsonify(live_payload)
    except Exception as exc:
        import traceback
        traceback.print_exc()
        error_body = {'error': str(exc)}
        return jsonify(error_body), 500
@app.route('/api/gpu/<slot>/realtime', methods=['GET']) @app.route('/api/gpu/<slot>/realtime', methods=['GET'])
@require_auth @require_auth
def api_gpu_realtime(slot): def api_gpu_realtime(slot):
@@ -9526,8 +9636,11 @@ def api_vm_control(vmid):
control_result = subprocess.run( control_result = subprocess.run(
['pvesh', 'create', f'/nodes/{node}/{vm_type}/{vmid}/status/{action}'], ['pvesh', 'create', f'/nodes/{node}/{vm_type}/{vmid}/status/{action}'],
capture_output=True, text=True, timeout=30) capture_output=True, text=True, timeout=30)
if control_result.returncode == 0: if control_result.returncode == 0:
# Invalidate VM resources cache so the next /api/vms call
# returns fresh status instead of the pre-action snapshot.
_pvesh_cache['cluster_resources_vm_time'] = 0
return jsonify({ return jsonify({
'success': True, 'success': True,
'vmid': vmid, 'vmid': vmid,

View File

@@ -67,7 +67,7 @@ class HealthMonitor:
# Memory Thresholds # Memory Thresholds
MEMORY_WARNING = 85 MEMORY_WARNING = 85
MEMORY_CRITICAL = 95 MEMORY_CRITICAL = 95
MEMORY_DURATION = 60 MEMORY_DURATION = 300 # 5 minutes sustained (aligned with CPU)
SWAP_WARNING_DURATION = 300 SWAP_WARNING_DURATION = 300
SWAP_CRITICAL_PERCENT = 5 SWAP_CRITICAL_PERCENT = 5
SWAP_CRITICAL_DURATION = 120 SWAP_CRITICAL_DURATION = 120
@@ -402,6 +402,30 @@ class HealthMonitor:
except Exception: except Exception:
pass # Sampling must never crash the thread pass # Sampling must never crash the thread
def _sample_memory_usage(self):
"""Lightweight memory sample: read RAM/swap % and append to history. ~1ms cost."""
try:
memory = psutil.virtual_memory()
swap = psutil.swap_memory()
current_time = time.time()
mem_percent = memory.percent
swap_percent = swap.percent if swap.total > 0 else 0
swap_vs_ram = (swap.used / memory.total * 100) if memory.total > 0 else 0
state_key = 'memory_usage'
self.state_history[state_key].append({
'mem_percent': mem_percent,
'swap_percent': swap_percent,
'swap_vs_ram': swap_vs_ram,
'time': current_time
})
# Prune entries older than 10 minutes
self.state_history[state_key] = [
e for e in self.state_history[state_key]
if current_time - e['time'] < 600
]
except Exception:
pass # Sampling must never crash the thread
def _sample_cpu_temperature(self): def _sample_cpu_temperature(self):
"""Lightweight temperature sample: read sensor and append to history. ~50ms cost.""" """Lightweight temperature sample: read sensor and append to history. ~50ms cost."""
try: try:
@@ -1050,34 +1074,46 @@ class HealthMonitor:
if current_time - entry['time'] < 600 if current_time - entry['time'] < 600
] ]
mem_critical = sum( mem_critical_samples = [
1 for entry in self.state_history[state_key] entry for entry in self.state_history[state_key]
if entry['mem_percent'] >= 90 and if entry['mem_percent'] >= 90 and
current_time - entry['time'] <= self.MEMORY_DURATION current_time - entry['time'] <= self.MEMORY_DURATION
) ]
mem_warning = sum( mem_warning_samples = [
1 for entry in self.state_history[state_key] entry for entry in self.state_history[state_key]
if entry['mem_percent'] >= self.MEMORY_WARNING and if entry['mem_percent'] >= self.MEMORY_WARNING and
current_time - entry['time'] <= self.MEMORY_DURATION current_time - entry['time'] <= self.MEMORY_DURATION
) ]
swap_critical = sum( swap_critical = sum(
1 for entry in self.state_history[state_key] 1 for entry in self.state_history[state_key]
if entry['swap_vs_ram'] > 20 and if entry['swap_vs_ram'] > 20 and
current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION
) )
# Require sustained high usage across most of the 300s window.
if mem_critical >= 2: # With ~30s sampling: 300s = ~10 samples, so 8 ≈ 240s sustained.
# Mirrors CPU's ~83% coverage threshold (25/30).
MEM_CRITICAL_MIN_SAMPLES = 8
MEM_WARNING_MIN_SAMPLES = 8
mem_critical_count = len(mem_critical_samples)
mem_warning_count = len(mem_warning_samples)
if mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES:
oldest = min(s['time'] for s in mem_critical_samples)
actual_duration = int(current_time - oldest)
status = 'CRITICAL' status = 'CRITICAL'
reason = f'RAM >90% for {self.MEMORY_DURATION}s' reason = f'RAM >90% sustained for {actual_duration}s'
elif swap_critical >= 2: elif swap_critical >= 2:
status = 'CRITICAL' status = 'CRITICAL'
reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)' reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)'
elif mem_warning >= 2: elif mem_warning_count >= MEM_WARNING_MIN_SAMPLES:
oldest = min(s['time'] for s in mem_warning_samples)
actual_duration = int(current_time - oldest)
status = 'WARNING' status = 'WARNING'
reason = f'RAM >{self.MEMORY_WARNING}% for {self.MEMORY_DURATION}s' reason = f'RAM >{self.MEMORY_WARNING}% sustained for {actual_duration}s'
else: else:
status = 'OK' status = 'OK'
reason = None reason = None
@@ -1088,7 +1124,7 @@ class HealthMonitor:
swap_total_gb = round(swap.total / (1024**3), 2) swap_total_gb = round(swap.total / (1024**3), 2)
# Determine per-sub-check status # Determine per-sub-check status
ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK') ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning_count >= MEM_WARNING_MIN_SAMPLES else 'OK')
swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK' swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK'
result = { result = {