Update health monitor

MacRimi
2026-02-16 15:48:41 +01:00
parent dcbc52efc6
commit 2ee5be7402
5 changed files with 1153 additions and 324 deletions

View File

@@ -40,7 +40,7 @@ export default function Home() {
authenticated,
})
} catch (error) {
console.error("[v0] Failed to check auth status:", error)
console.error("Failed to check auth status:", error)
setAuthStatus({
loading: false,
authEnabled: false,

View File

@@ -2,7 +2,7 @@
import type React from "react"
import { useState, useEffect } from "react"
import { useState, useEffect, useCallback } from "react"
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
import { Badge } from "@/components/ui/badge"
import { Button } from "@/components/ui/button"
@@ -11,6 +11,7 @@ import {
CheckCircle2,
AlertTriangle,
XCircle,
Info,
Activity,
Cpu,
MemoryStick,
@@ -23,16 +24,30 @@ import {
RefreshCw,
Shield,
X,
Clock,
BellOff,
ChevronRight,
} from "lucide-react"
interface CategoryCheck {
status: string
reason?: string
details?: any
checks?: Record<string, { status: string; detail: string; [key: string]: any }>
dismissable?: boolean
[key: string]: any
}
interface DismissedError {
error_key: string
category: string
severity: string
reason: string
dismissed: boolean
suppression_remaining_hours: number
resolved_at: string
}
interface HealthDetails {
overall: string
summary: string
@@ -51,6 +66,13 @@ interface HealthDetails {
timestamp: string
}
interface FullHealthData {
health: HealthDetails
active_errors: any[]
dismissed: DismissedError[]
timestamp: string
}
interface HealthStatusModalProps {
open: boolean
onOpenChange: (open: boolean) => void
@@ -73,7 +95,41 @@ const CATEGORIES = [
export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatusModalProps) {
const [loading, setLoading] = useState(true)
const [healthData, setHealthData] = useState<HealthDetails | null>(null)
const [dismissedItems, setDismissedItems] = useState<DismissedError[]>([])
const [error, setError] = useState<string | null>(null)
const [dismissingKey, setDismissingKey] = useState<string | null>(null)
const [expandedCategories, setExpandedCategories] = useState<Set<string>>(new Set())
const fetchHealthDetails = useCallback(async () => {
setLoading(true)
setError(null)
try {
// Use the new combined endpoint for fewer round-trips
const response = await fetch(getApiUrl("/api/health/full"))
if (!response.ok) {
// Fallback to legacy endpoint
const legacyResponse = await fetch(getApiUrl("/api/health/details"))
if (!legacyResponse.ok) throw new Error("Failed to fetch health details")
const data = await legacyResponse.json()
setHealthData(data)
setDismissedItems([])
} else {
const fullData: FullHealthData = await response.json()
setHealthData(fullData.health)
setDismissedItems(fullData.dismissed || [])
}
const event = new CustomEvent("healthStatusUpdated", {
detail: { status: healthData?.overall || "OK" },
})
window.dispatchEvent(event)
} catch (err) {
setError(err instanceof Error ? err.message : "Unknown error")
} finally {
setLoading(false)
}
}, [getApiUrl, healthData?.overall])
useEffect(() => {
if (open) {
@@ -81,42 +137,46 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
}
}, [open])
const fetchHealthDetails = async () => {
setLoading(true)
setError(null)
try {
const response = await fetch(getApiUrl("/api/health/details"))
if (!response.ok) {
throw new Error("Failed to fetch health details")
// Auto-expand non-OK categories when data loads
useEffect(() => {
if (healthData?.details) {
const nonOkCategories = new Set<string>()
CATEGORIES.forEach(({ key }) => {
const cat = healthData.details[key as keyof typeof healthData.details]
if (cat && cat.status?.toUpperCase() !== "OK") {
nonOkCategories.add(key)
}
const data = await response.json()
console.log("[v0] Health data received:", data)
setHealthData(data)
const event = new CustomEvent("healthStatusUpdated", {
detail: { status: data.overall },
})
window.dispatchEvent(event)
} catch (err) {
console.error("[v0] Error fetching health data:", err)
setError(err instanceof Error ? err.message : "Unknown error")
} finally {
setLoading(false)
setExpandedCategories(nonOkCategories)
}
}, [healthData])
const toggleCategory = (key: string) => {
setExpandedCategories(prev => {
const next = new Set(prev)
if (next.has(key)) {
next.delete(key)
} else {
next.add(key)
}
return next
})
}
const getStatusIcon = (status: string) => {
const getStatusIcon = (status: string, size: "sm" | "md" = "md") => {
const statusUpper = status?.toUpperCase()
const cls = size === "sm" ? "h-4 w-4" : "h-5 w-5"
switch (statusUpper) {
case "OK":
return <CheckCircle2 className="h-5 w-5 text-green-500" />
return <CheckCircle2 className={`${cls} text-green-500`} />
case "INFO":
return <Info className={`${cls} text-blue-500`} />
case "WARNING":
return <AlertTriangle className="h-5 w-5 text-yellow-500" />
return <AlertTriangle className={`${cls} text-yellow-500`} />
case "CRITICAL":
return <XCircle className="h-5 w-5 text-red-500" />
return <XCircle className={`${cls} text-red-500`} />
default:
return <Activity className="h-5 w-5 text-gray-500" />
return <Activity className={`${cls} text-muted-foreground`} />
}
}
@@ -125,6 +185,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
switch (statusUpper) {
case "OK":
return <Badge className="bg-green-500 text-white hover:bg-green-500">OK</Badge>
case "INFO":
return <Badge className="bg-blue-500 text-white hover:bg-blue-500">Info</Badge>
case "WARNING":
return <Badge className="bg-yellow-500 text-white hover:bg-yellow-500">Warning</Badge>
case "CRITICAL":
@@ -136,10 +198,11 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const getHealthStats = () => {
if (!healthData?.details) {
return { total: 0, healthy: 0, warnings: 0, critical: 0 }
return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0 }
}
let healthy = 0
let info = 0
let warnings = 0
let critical = 0
@@ -148,22 +211,22 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
if (categoryData) {
const status = categoryData.status?.toUpperCase()
if (status === "OK") healthy++
else if (status === "INFO") info++
else if (status === "WARNING") warnings++
else if (status === "CRITICAL") critical++
}
})
return { total: CATEGORIES.length, healthy, warnings, critical }
return { total: CATEGORIES.length, healthy, info, warnings, critical }
}
const stats = getHealthStats()
const handleCategoryClick = (categoryKey: string, status: string) => {
if (status === "OK") return // Do not navigate if the category is OK
if (status === "OK" || status === "INFO") return
onOpenChange(false) // Close the modal
onOpenChange(false)
// Map categories to tabs
const categoryToTab: Record<string, string> = {
storage: "storage",
disks: "storage",
@@ -176,43 +239,156 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const targetTab = categoryToTab[categoryKey]
if (targetTab) {
// Dispatch an event to switch tabs
const event = new CustomEvent("changeTab", { detail: { tab: targetTab } })
window.dispatchEvent(event)
}
}
const handleAcknowledge = async (errorKey: string, e: React.MouseEvent) => {
e.stopPropagation() // Prevent navigation
console.log("[v0] Dismissing error:", errorKey)
e.stopPropagation()
setDismissingKey(errorKey)
try {
const response = await fetch(getApiUrl("/api/health/acknowledge"), {
method: "POST",
headers: {
"Content-Type": "application/json",
},
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ error_key: errorKey }),
})
if (!response.ok) {
const errorData = await response.json()
console.error("[v0] Acknowledge failed:", errorData)
throw new Error(errorData.error || "Failed to acknowledge error")
throw new Error(errorData.error || "Failed to dismiss error")
}
const result = await response.json()
console.log("[v0] Acknowledge success:", result)
// Refresh health data
await fetchHealthDetails()
} catch (err) {
console.error("[v0] Error acknowledging:", err)
alert("Failed to dismiss error. Please try again.")
console.error("Error dismissing:", err)
} finally {
setDismissingKey(null)
}
}
const getTimeSinceCheck = () => {
if (!healthData?.timestamp) return null
const checkTime = new Date(healthData.timestamp)
const now = new Date()
const diffMs = now.getTime() - checkTime.getTime()
const diffMin = Math.floor(diffMs / 60000)
if (diffMin < 1) return "just now"
if (diffMin === 1) return "1 minute ago"
if (diffMin < 60) return `${diffMin} minutes ago`
const diffHours = Math.floor(diffMin / 60)
return `${diffHours}h ${diffMin % 60}m ago`
}
const getCategoryRowStyle = (status: string) => {
const s = status?.toUpperCase()
if (s === "CRITICAL") return "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer"
if (s === "WARNING") return "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer"
if (s === "INFO") return "bg-blue-500/5 border-blue-500/20 hover:bg-blue-500/10"
return "bg-card border-border hover:bg-muted/30"
}
const getOutlineBadgeStyle = (status: string) => {
const s = status?.toUpperCase()
if (s === "OK") return "border-green-500 text-green-500 bg-transparent"
if (s === "INFO") return "border-blue-500 text-blue-500 bg-blue-500/5"
if (s === "WARNING") return "border-yellow-500 text-yellow-500 bg-yellow-500/5"
if (s === "CRITICAL") return "border-red-500 text-red-500 bg-red-500/5"
return ""
}
const formatCheckLabel = (key: string): string => {
const labels: Record<string, string> = {
cpu_usage: "CPU Usage",
cpu_temperature: "Temperature",
ram_usage: "RAM Usage",
swap_usage: "Swap Usage",
root_filesystem: "Root Filesystem",
lvm_check: "LVM Status",
connectivity: "Connectivity",
all_vms_cts: "VMs & Containers",
cluster_mode: "Cluster Mode",
error_cascade: "Error Cascade",
error_spike: "Error Spike",
persistent_errors: "Persistent Errors",
critical_errors: "Critical Errors",
security_updates: "Security Updates",
system_age: "System Age",
pending_updates: "Pending Updates",
kernel_pve: "Kernel / PVE",
uptime: "Uptime",
certificates: "Certificates",
login_attempts: "Login Attempts",
fail2ban: "Fail2Ban",
}
if (labels[key]) return labels[key]
// Convert snake_case or camelCase to Title Case
return key
.replace(/_/g, " ")
.replace(/([a-z])([A-Z])/g, "$1 $2")
.replace(/\b\w/g, (c) => c.toUpperCase())
}
const renderChecks = (
checks: Record<string, { status: string; detail: string; dismissable?: boolean; thresholds?: string; [key: string]: any }>,
categoryKey: string
) => {
if (!checks || Object.keys(checks).length === 0) return null
return (
<div className="mt-2 space-y-0.5">
{Object.entries(checks).map(([checkKey, checkData]) => {
const isDismissable = checkData.dismissable === true
const checkStatus = checkData.status?.toUpperCase() || "OK"
return (
<div
key={checkKey}
className="flex items-center justify-between gap-2 text-xs py-1.5 px-3 rounded-md hover:bg-muted/40 transition-colors"
>
<div className="flex items-center gap-2 min-w-0 flex-1">
{getStatusIcon(checkData.status, "sm")}
<span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
<span className="text-muted-foreground truncate">{checkData.detail}</span>
</div>
<div className="flex items-center gap-1.5 shrink-0">
{checkData.thresholds && (
<span className="text-[10px] text-muted-foreground/60 hidden sm:inline">
({checkData.thresholds})
</span>
)}
{(checkStatus === "WARNING" || checkStatus === "CRITICAL") && isDismissable && (
<Button
size="sm"
variant="outline"
className="h-5 px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]"
disabled={dismissingKey === checkKey}
onClick={(e) => {
e.stopPropagation()
handleAcknowledge(checkKey, e)
}}
>
{dismissingKey === checkKey ? (
<Loader2 className="h-3 w-3 animate-spin" />
) : (
<>
<X className="h-3 w-3 mr-0.5" />
Dismiss
</>
)}
</Button>
)}
</div>
</div>
)
})}
</div>
)
}
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent className="max-w-3xl max-h-[85vh] overflow-y-auto">
@@ -224,7 +400,15 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{healthData && <div className="ml-2">{getStatusBadge(healthData.overall)}</div>}
</DialogTitle>
</div>
<DialogDescription>Detailed health checks for all system components</DialogDescription>
<DialogDescription className="flex items-center gap-2">
Detailed health checks for all system components
{getTimeSinceCheck() && (
<span className="inline-flex items-center gap-1 text-xs text-muted-foreground">
<Clock className="h-3 w-3" />
Last check: {getTimeSinceCheck()}
</span>
)}
</DialogDescription>
</DialogHeader>
{loading && (
@@ -243,15 +427,21 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{healthData && !loading && (
<div className="space-y-4">
{/* Overall Stats Summary */}
<div className="grid grid-cols-4 gap-3 p-4 rounded-lg bg-muted/30 border">
<div className={`grid gap-3 p-4 rounded-lg bg-muted/30 border ${stats.info > 0 ? "grid-cols-5" : "grid-cols-4"}`}>
<div className="text-center">
<div className="text-2xl font-bold">{stats.total}</div>
<div className="text-xs text-muted-foreground">Total Checks</div>
<div className="text-xs text-muted-foreground">Total</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-green-500">{stats.healthy}</div>
<div className="text-xs text-muted-foreground">Healthy</div>
</div>
{stats.info > 0 && (
<div className="text-center">
<div className="text-2xl font-bold text-blue-500">{stats.info}</div>
<div className="text-xs text-muted-foreground">Info</div>
</div>
)}
<div className="text-center">
<div className="text-2xl font-bold text-yellow-500">{stats.warnings}</div>
<div className="text-xs text-muted-foreground">Warnings</div>
@@ -268,90 +458,116 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
</div>
)}
{/* Category List */}
<div className="space-y-2">
{CATEGORIES.map(({ key, label, Icon }) => {
const categoryData = healthData.details[key as keyof typeof healthData.details]
const status = categoryData?.status || "UNKNOWN"
const reason = categoryData?.reason
const details = categoryData?.details
const checks = categoryData?.checks
const isExpanded = expandedCategories.has(key)
const hasChecks = checks && Object.keys(checks).length > 0
return (
<div
key={key}
onClick={() => handleCategoryClick(key, status)}
className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${
status === "OK"
? "bg-card border-border hover:bg-muted/30"
: status === "WARNING"
? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer"
: status === "CRITICAL"
? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer"
: "bg-muted/30 hover:bg-muted/50"
}`}
className={`rounded-lg border transition-colors overflow-hidden ${getCategoryRowStyle(status)}`}
>
<div className="mt-0.5 flex-shrink-0 flex items-center gap-2">
{/* Clickable header row */}
<div
className="flex items-center gap-3 p-3 cursor-pointer select-none"
onClick={() => toggleCategory(key)}
>
<div className="flex-shrink-0 flex items-center gap-2">
<Icon className="h-4 w-4 text-blue-500" />
{getStatusIcon(status)}
</div>
<div className="flex-1 min-w-0">
<div className="flex items-center justify-between gap-2 mb-1">
<div className="flex items-center gap-2">
<p className="font-medium text-sm">{label}</p>
<Badge
variant="outline"
className={`shrink-0 text-xs ${
status === "OK"
? "border-green-500 text-green-500 bg-transparent"
: status === "WARNING"
? "border-yellow-500 text-yellow-500 bg-yellow-500/5"
: status === "CRITICAL"
? "border-red-500 text-red-500 bg-red-500/5"
: ""
}`}
>
{hasChecks && (
<span className="text-[10px] text-muted-foreground">
({Object.keys(checks).length} checks)
</span>
)}
</div>
{reason && !isExpanded && (
<p className="text-xs text-muted-foreground mt-0.5 truncate">{reason}</p>
)}
</div>
<div className="flex items-center gap-2 shrink-0">
<Badge variant="outline" className={`text-xs ${getOutlineBadgeStyle(status)}`}>
{status}
</Badge>
<ChevronRight
className={`h-4 w-4 text-muted-foreground transition-transform duration-200 ${
isExpanded ? "rotate-90" : ""
}`}
/>
</div>
</div>
{reason && <p className="text-xs text-muted-foreground mt-1">{reason}</p>}
{details && typeof details === "object" && (
<div className="mt-2 space-y-1">
{Object.entries(details).map(([detailKey, detailValue]: [string, any]) => {
if (typeof detailValue === "object" && detailValue !== null) {
const isDismissable = detailValue.dismissable !== false
return (
{/* Expandable checks section */}
{isExpanded && (
<div className="border-t border-border/50 bg-muted/5 px-2 py-1.5">
{reason && (
<p className="text-xs text-muted-foreground px-3 py-1.5 mb-1">{reason}</p>
)}
{hasChecks ? (
renderChecks(checks, key)
) : (
<div className="flex items-center gap-2 text-xs text-muted-foreground px-3 py-2">
<CheckCircle2 className="h-3.5 w-3.5 text-green-500" />
No issues detected
</div>
)}
</div>
)}
</div>
)
})}
</div>
{/* Dismissed Items Section */}
{dismissedItems.length > 0 && (
<div className="space-y-2">
<div className="flex items-center gap-2 text-sm font-medium text-muted-foreground pt-2">
<BellOff className="h-4 w-4" />
Dismissed Items ({dismissedItems.length})
</div>
{dismissedItems.map((item) => (
<div
key={detailKey}
className="flex items-start justify-between gap-2 text-xs pl-3 border-l-2 border-muted py-1"
key={item.error_key}
className="flex items-start gap-3 p-3 rounded-lg border bg-muted/10 border-muted opacity-75"
>
<div className="flex-1">
<span className="font-medium">{detailKey}:</span>
{detailValue.reason && (
<span className="ml-1 text-muted-foreground">{detailValue.reason}</span>
)}
<div className="mt-0.5 flex-shrink-0 flex items-center gap-2">
<BellOff className="h-4 w-4 text-muted-foreground" />
{getStatusIcon("INFO")}
</div>
{(status === "WARNING" || status === "CRITICAL") && isDismissable && (
<Button
size="sm"
variant="outline"
className="h-6 px-2 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent"
onClick={(e) => handleAcknowledge(detailKey, e)}
>
<X className="h-3 w-3 mr-1" />
<span className="text-xs">Dismiss</span>
</Button>
)}
<div className="flex-1 min-w-0">
<div className="flex items-center justify-between gap-2 mb-1">
<p className="font-medium text-sm text-muted-foreground">{item.reason}</p>
<div className="flex items-center gap-1.5 shrink-0">
<Badge variant="outline" className="text-xs border-blue-500/50 text-blue-500/70 bg-transparent">
Dismissed
</Badge>
<Badge variant="outline" className={`text-xs ${getOutlineBadgeStyle(item.severity)}`}>
was {item.severity}
</Badge>
</div>
)
}
return null
})}
</div>
<p className="text-xs text-muted-foreground flex items-center gap-1">
<Clock className="h-3 w-3" />
Suppressed for {item.suppression_remaining_hours < 24
? `${Math.round(item.suppression_remaining_hours)}h`
: `${Math.round(item.suppression_remaining_hours / 24)} days`
} more
</p>
</div>
</div>
))}
</div>
)}
</div>
</div>
)
})}
</div>
{healthData.timestamp && (
<div className="text-xs text-muted-foreground text-center pt-2">

View File

@@ -51,15 +51,45 @@ def get_system_info():
@health_bp.route('/api/health/acknowledge', methods=['POST'])
def acknowledge_error():
"""Acknowledge an error manually (user dismissed it)"""
"""
Acknowledge/dismiss an error manually.
Returns details about the acknowledged error including original severity
and suppression period info.
"""
try:
data = request.get_json()
if not data or 'error_key' not in data:
return jsonify({'error': 'error_key is required'}), 400
error_key = data['error_key']
health_persistence.acknowledge_error(error_key)
return jsonify({'success': True, 'message': 'Error acknowledged'})
result = health_persistence.acknowledge_error(error_key)
if result.get('success'):
# Determine suppression period for the response
category = result.get('category', '')
if category == 'updates':
suppression_hours = 180 * 24 # 180 days in hours
suppression_label = '6 months'
else:
suppression_hours = 24
suppression_label = '24 hours'
return jsonify({
'success': True,
'message': f'Error dismissed for {suppression_label}',
'error_key': error_key,
'original_severity': result.get('original_severity', 'WARNING'),
'category': category,
'suppression_hours': suppression_hours,
'suppression_label': suppression_label,
'acknowledged_at': result.get('acknowledged_at')
})
else:
return jsonify({
'success': False,
'message': 'Error not found or already dismissed',
'error_key': error_key
}), 404
except Exception as e:
return jsonify({'error': str(e)}), 500
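# Minimal client sketch for the dismiss flow above. The host and port are
# assumptions; 'security_fail2ban_ban' is one of the error keys recorded by the
# health monitor. The response values depend on the dismissed error.
def _example_dismiss_request():
    import requests  # assumed to be available in the client environment
    resp = requests.post(
        'http://proxmenux-host:8008/api/health/acknowledge',  # hypothetical URL
        json={'error_key': 'security_fail2ban_ban'},
        timeout=5,
    )
    # Expected success payload (illustrative values):
    # {'success': True, 'message': 'Error dismissed for 24 hours',
    #  'error_key': 'security_fail2ban_ban', 'original_severity': 'WARNING',
    #  'category': 'security', 'suppression_hours': 24,
    #  'suppression_label': '24 hours', 'acknowledged_at': '...'}
    return resp.json()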
@@ -72,3 +102,67 @@ def get_active_errors():
return jsonify({'errors': errors})
except Exception as e:
return jsonify({'error': str(e)}), 500
@health_bp.route('/api/health/dismissed', methods=['GET'])
def get_dismissed_errors():
"""
Get dismissed errors that are still within their suppression period.
These are shown as INFO items with a 'Dismissed' badge in the frontend.
"""
try:
dismissed = health_persistence.get_dismissed_errors()
return jsonify({'dismissed': dismissed})
except Exception as e:
return jsonify({'error': str(e)}), 500
@health_bp.route('/api/health/full', methods=['GET'])
def get_full_health():
"""
Get complete health data in a single request: detailed status + active errors + dismissed.
Reduces frontend round-trips.
"""
try:
details = health_monitor.get_detailed_status()
active_errors = health_persistence.get_active_errors()
dismissed = health_persistence.get_dismissed_errors()
return jsonify({
'health': details,
'active_errors': active_errors,
'dismissed': dismissed,
'timestamp': details.get('timestamp')
})
except Exception as e:
return jsonify({'error': str(e)}), 500
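# Illustrative shape of the combined payload (values are made-up examples);
# the keys mirror the FullHealthData interface consumed by the frontend modal:
#
# {
#   "health": {
#     "overall": "WARNING",
#     "summary": "Updates: 3 security update(s) pending",
#     "details": {"cpu": {...}, "memory": {...}, "...": {...}},
#     "timestamp": "2026-02-16T15:48:41"
#   },
#   "active_errors": [ ... ],
#   "dismissed": [
#     {"error_key": "updates_548days", "category": "updates",
#      "severity": "CRITICAL", "dismissed": true,
#      "suppression_remaining_hours": 4272.0, "resolved_at": "..."}
#   ],
#   "timestamp": "2026-02-16T15:48:41"
# }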
@health_bp.route('/api/health/pending-notifications', methods=['GET'])
def get_pending_notifications():
"""
Get events pending notification (for future Telegram/Gotify/Discord integration).
This endpoint will be consumed by the Notification Service (Bloque A).
"""
try:
pending = health_persistence.get_pending_notifications()
return jsonify({'pending': pending, 'count': len(pending)})
except Exception as e:
return jsonify({'error': str(e)}), 500
@health_bp.route('/api/health/mark-notified', methods=['POST'])
def mark_events_notified():
"""
Mark events as notified after notification was sent successfully.
Used by the Notification Service (Bloque A) after sending alerts.
"""
try:
data = request.get_json()
if not data or 'event_ids' not in data:
return jsonify({'error': 'event_ids array is required'}), 400
event_ids = data['event_ids']
if not isinstance(event_ids, list):
return jsonify({'error': 'event_ids must be an array'}), 400
health_persistence.mark_events_notified(event_ids)
return jsonify({'success': True, 'marked_count': len(event_ids)})
except Exception as e:
return jsonify({'error': str(e)}), 500
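# One possible consumer loop for the two notification endpoints above; this is
# a sketch only. send_alert() and the base URL are assumptions, and it assumes
# each pending event row exposes its 'id' column.
def _example_notification_cycle(base_url, send_alert):
    import requests  # assumed to be available in the notifier environment
    pending = requests.get(f'{base_url}/api/health/pending-notifications', timeout=5).json()
    sent_ids = []
    for event in pending.get('pending', []):
        send_alert(event)          # e.g. push to Telegram/Gotify/Discord
        sent_ids.append(event['id'])
    if sent_ids:
        requests.post(f'{base_url}/api/health/mark-notified',
                      json={'event_ids': sent_ids}, timeout=5)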

View File

@@ -373,6 +373,44 @@ class HealthMonitor:
overall = 'OK'
summary = 'All systems operational'
# --- Emit events for state changes (Bloque A: Notification prep) ---
try:
previous_overall = getattr(self, '_last_overall_status', None)
if previous_overall and previous_overall != overall:
# Overall status changed - emit event
health_persistence.emit_event(
event_type='state_change',
category='overall',
severity=overall,
data={
'previous': previous_overall,
'current': overall,
'summary': summary
}
)
# Track per-category state changes
previous_details = getattr(self, '_last_category_statuses', {})
for cat_key, cat_data in details.items():
cat_status = cat_data.get('status', 'OK')
prev_status = previous_details.get(cat_key, 'OK')
if prev_status != cat_status and cat_status in ('WARNING', 'CRITICAL'):
health_persistence.emit_event(
event_type='state_change',
category=cat_key,
severity=cat_status,
data={
'previous': prev_status,
'current': cat_status,
'reason': cat_data.get('reason', '')
}
)
self._last_overall_status = overall
self._last_category_statuses = {k: v.get('status', 'OK') for k, v in details.items()}
except Exception:
pass # Event emission should never break health checks
return {
'overall': overall,
'summary': summary,
@@ -445,6 +483,30 @@ class HealthMonitor:
result['status'] = 'WARNING'
result['reason'] = temp_status.get('reason')
# Build checks dict for frontend expandable section
checks = {
'cpu_usage': {
'status': status,
'detail': f'{round(cpu_percent, 1)}% ({psutil.cpu_count()} cores)',
'value': round(cpu_percent, 1),
'thresholds': f'Warning >{self.CPU_WARNING}%, Critical >{self.CPU_CRITICAL}%'
}
}
if temp_status and temp_status.get('status') != 'UNKNOWN':
temp_val = temp_status.get('value', 'N/A')
checks['cpu_temperature'] = {
'status': temp_status.get('status', 'OK'),
'detail': f'{temp_val}°C' if isinstance(temp_val, (int, float)) else str(temp_val),
'value': temp_val,
'thresholds': 'Warning >80°C sustained >3min'
}
else:
checks['cpu_temperature'] = {
'status': 'OK',
'detail': 'Sensor not available',
}
result['checks'] = checks
return result
except Exception as e:
@@ -617,12 +679,35 @@ class HealthMonitor:
status = 'OK'
reason = None
ram_avail_gb = round(memory.available / (1024**3), 2)
ram_total_gb = round(memory.total / (1024**3), 2)
swap_used_gb = round(swap.used / (1024**3), 2)
swap_total_gb = round(swap.total / (1024**3), 2)
# Determine per-sub-check status
ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK')
swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK'
result = {
'status': status,
'ram_percent': round(mem_percent, 1),
'ram_available_gb': round(memory.available / (1024**3), 2),
'ram_available_gb': ram_avail_gb,
'swap_percent': round(swap_percent, 1),
'swap_used_gb': round(swap.used / (1024**3), 2)
'swap_used_gb': swap_used_gb,
'checks': {
'ram_usage': {
'status': ram_status,
'detail': f'{round(mem_percent, 1)}% used ({ram_avail_gb} GB free of {ram_total_gb} GB)',
'value': round(mem_percent, 1),
'thresholds': f'Warning >{self.MEMORY_WARNING}%, Critical >90%'
},
'swap_usage': {
'status': swap_status,
'detail': f'{round(swap_percent, 1)}% used ({swap_used_gb} GB of {swap_total_gb} GB)' if swap.total > 0 else 'No swap configured',
'value': round(swap_percent, 1),
'thresholds': 'Critical when swap >20% of RAM'
}
}
}
if reason:
@@ -706,8 +791,28 @@ class HealthMonitor:
issues.append(f"LVM check: {lvm_status.get('reason')}")
storage_details['lvm_check'] = lvm_status
# Check dmesg for real-time I/O errors (dmesg-based, complements journalctl SMART checks)
dmesg_io_result = self._check_disks_optimized()
if dmesg_io_result.get('status') != 'OK':
dmesg_details = dmesg_io_result.get('details', {})
for disk_path, disk_info in dmesg_details.items():
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
storage_details[disk_path] = disk_info
# Build checks dict from storage_details, adding OK entries for items with no issues
checks = {}
for key, val in storage_details.items():
checks[key] = {
'status': val.get('status', 'OK'),
'detail': val.get('reason', 'OK'),
**{k: v for k, v in val.items() if k not in ('status', 'reason')}
}
if not issues:
return {'status': 'OK'}
# Add a summary OK entry if nothing specific
checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Root filesystem healthy'})
return {'status': 'OK', 'checks': checks}
# Determine overall status
has_critical = any(d.get('status') == 'CRITICAL' for d in storage_details.values())
@@ -715,7 +820,8 @@ class HealthMonitor:
return {
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': storage_details
'details': storage_details,
'checks': checks
}
def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
@@ -1025,19 +1131,42 @@ class HealthMonitor:
# Check connectivity (latency)
latency_status = self._check_network_latency()
if latency_status and latency_status.get('status') not in ['OK', 'INFO', 'UNKNOWN']:
issues.append(latency_status.get('reason', 'Network latency issue'))
if latency_status:
latency_ms = latency_status.get('latency_ms', 'N/A')
latency_sev = latency_status.get('status', 'OK')
interface_details['connectivity'] = latency_status
connectivity_check = {
'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
'detail': f'Latency {latency_ms}ms to 1.1.1.1' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
}
if latency_sev not in ['OK', 'INFO', 'UNKNOWN']:
issues.append(latency_status.get('reason', 'Network latency issue'))
else:
connectivity_check = {'status': 'OK', 'detail': 'Not tested'}
# Build checks dict
checks = {}
for iface in active_interfaces:
checks[iface] = {'status': 'OK', 'detail': 'UP'}
for iface, detail in interface_details.items():
if iface != 'connectivity':
checks[iface] = {
'status': detail.get('status', 'OK'),
'detail': detail.get('reason', 'DOWN'),
'dismissable': detail.get('dismissable', False)
}
checks['connectivity'] = connectivity_check
if not issues:
return {'status': 'OK'}
return {'status': 'OK', 'checks': checks}
has_critical = any(d.get('status') == 'CRITICAL' for d in interface_details.values())
return {
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:2]),
'details': interface_details
'details': interface_details,
'checks': checks
}
except Exception:
@@ -1348,26 +1477,51 @@ class HealthMonitor:
'type': vm_type
}
# Build checks dict from vm_details
checks = {}
for key, val in vm_details.items():
vm_label = f"{val.get('type', 'VM')} {val.get('id', key)}"
checks[vm_label] = {
'status': val.get('status', 'WARNING'),
'detail': val.get('reason', 'Error'),
'dismissable': True
}
if not issues:
return {'status': 'OK'}
checks['all_vms_cts'] = {'status': 'OK', 'detail': 'No issues detected in logs'}
return {'status': 'OK', 'checks': checks}
has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())
return {
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': vm_details
'details': vm_details,
'checks': checks
}
except Exception:
return {'status': 'OK'}
return {'status': 'OK', 'checks': {}}
def _check_pve_services(self) -> Dict[str, Any]:
"""Check critical Proxmox services"""
"""
Check critical Proxmox services with persistence tracking.
- Checks the base PVE_SERVICES list
- Dynamically adds corosync if a cluster config exists
- Records failed services in persistence for tracking/dismiss
- Auto-clears when services recover
"""
try:
failed_services = []
# Build service list: base PVE services + corosync if clustered
services_to_check = list(self.PVE_SERVICES)
is_cluster = os.path.exists('/etc/corosync/corosync.conf')
if is_cluster and 'corosync' not in services_to_check:
services_to_check.append('corosync')
for service in self.PVE_SERVICES:
failed_services = []
service_details = {}
for service in services_to_check:
try:
result = subprocess.run(
['systemctl', 'is-active', service],
@@ -1376,23 +1530,79 @@ class HealthMonitor:
timeout=2
)
if result.returncode != 0 or result.stdout.strip() != 'active':
status = result.stdout.strip()
if result.returncode != 0 or status != 'active':
failed_services.append(service)
service_details[service] = status or 'inactive'
except Exception:
# If systemctl fails (e.g., command not found or service doesn't exist), treat as failed
failed_services.append(service)
service_details[service] = 'error'
if failed_services:
return {
# Build checks dict with status per service
checks = {}
for svc in services_to_check:
if svc in failed_services:
state = service_details.get(svc, 'inactive')
checks[svc] = {
'status': 'CRITICAL',
'reason': f'Services inactive: {", ".join(failed_services)}',
'failed': failed_services
'detail': f'Service is {state}',
}
else:
checks[svc] = {
'status': 'OK',
'detail': 'Active',
}
return {'status': 'OK'}
if is_cluster:
checks['cluster_mode'] = {
'status': 'OK',
'detail': 'Cluster detected (corosync.conf present)',
}
if failed_services:
reason = f'Services inactive: {", ".join(failed_services)}'
# Record each failed service in persistence
for svc in failed_services:
error_key = f'pve_service_{svc}'
health_persistence.record_error(
error_key=error_key,
category='services',
severity='CRITICAL',
reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}',
details={'service': svc, 'state': service_details.get(svc, 'inactive')}
)
# Auto-clear services that recovered
for svc in services_to_check:
if svc not in failed_services:
error_key = f'pve_service_{svc}'
if health_persistence.is_error_active(error_key):
health_persistence.clear_error(error_key)
return {
'status': 'CRITICAL',
'reason': reason,
'failed': failed_services,
'is_cluster': is_cluster,
'services_checked': len(services_to_check),
'checks': checks
}
# All OK - clear any previously tracked service errors
for svc in services_to_check:
error_key = f'pve_service_{svc}'
if health_persistence.is_error_active(error_key):
health_persistence.clear_error(error_key)
return {
'status': 'OK',
'is_cluster': is_cluster,
'services_checked': len(services_to_check),
'checks': checks
}
except Exception as e:
# If the entire systemctl check fails
return {
'status': 'WARNING',
'reason': f'Service check command failed: {str(e)}'
@@ -1620,7 +1830,31 @@ class HealthMonitor:
status = 'OK'
reason = None
log_result = {'status': status}
# Build checks dict for log sub-items
log_checks = {
'error_cascade': {
'status': 'WARNING' if cascade_count > 0 else 'OK',
'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
'dismissable': True
},
'error_spike': {
'status': 'WARNING' if spike_count > 0 else 'OK',
'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
'dismissable': True
},
'persistent_errors': {
'status': 'WARNING' if persistent_count > 0 else 'OK',
'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
'dismissable': True
},
'critical_errors': {
'status': 'CRITICAL' if unique_critical_count > 0 else 'OK',
'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors',
'dismissable': True
}
}
log_result = {'status': status, 'checks': log_checks}
if reason:
log_result['reason'] = reason
@@ -1629,7 +1863,12 @@ class HealthMonitor:
return log_result
# If journalctl command failed or returned no data
ok_result = {'status': 'OK'}
ok_result = {'status': 'OK', 'checks': {
'error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
'error_spike': {'status': 'OK', 'detail': 'No error spikes'},
'persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
'critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
}}
self.cached_results[cache_key] = ok_result
self.last_check_times[cache_key] = current_time
return ok_result
@@ -1662,9 +1901,9 @@ class HealthMonitor:
def _check_updates(self) -> Optional[Dict[str, Any]]:
"""
Check for pending system updates.
- WARNING: If security updates are available.
- CRITICAL: If system not updated in >2 years.
- INFO: If 1-2 years without updates, or many non-security updates.
- WARNING: Security updates available, or system not updated >1 year (365 days).
- CRITICAL: System not updated >18 months (548 days).
- INFO: Kernel/PVE updates available, or >50 non-security updates pending.
"""
cache_key = 'updates_check'
current_time = time.time()
@@ -1730,12 +1969,12 @@ class HealthMonitor:
reason=reason,
details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5]}
)
elif last_update_days and last_update_days >= 730:
# 2+ years without updates - CRITICAL
elif last_update_days and last_update_days >= 548:
# 18+ months without updates - CRITICAL
status = 'CRITICAL'
reason = f'System not updated in {last_update_days} days (>2 years)'
reason = f'System not updated in {last_update_days} days (>18 months)'
health_persistence.record_error(
error_key='updates_730days',
error_key='updates_548days',
category='updates',
severity='CRITICAL',
reason=reason,
@@ -1766,14 +2005,40 @@ class HealthMonitor:
status = 'WARNING'
reason = 'Failed to check for updates (apt-get error)'
# Build checks dict for updates sub-items
update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK')
sec_status = 'WARNING' if security_updates_packages else 'OK'
kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK'
checks = {
'security_updates': {
'status': sec_status,
'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending',
},
'system_age': {
'status': update_age_status,
'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
'thresholds': 'Warning >365 days, Critical >548 days'
},
'pending_updates': {
'status': 'INFO' if update_count > 50 else 'OK',
'detail': f'{update_count} package(s) pending',
},
'kernel_pve': {
'status': kernel_status,
'detail': f'{len(kernel_pve_updates_packages)} kernel/PVE update(s)' if kernel_pve_updates_packages else 'Kernel/PVE up to date',
}
}
# Construct result dictionary
update_result = {
'status': status,
'count': update_count
'count': update_count,
'checks': checks
}
if reason:
update_result['reason'] = reason
if last_update_days is not None: # Only add if we could determine days_since_update
if last_update_days is not None:
update_result['days_since_update'] = last_update_days
self.cached_results[cache_key] = update_result
@@ -1782,39 +2047,188 @@ class HealthMonitor:
except Exception as e:
print(f"[HealthMonitor] Error checking updates: {e}")
# Return OK on exception to avoid false alerts
return {'status': 'OK', 'count': 0}
return {'status': 'OK', 'count': 0, 'checks': {}}
def _check_fail2ban_bans(self) -> Dict[str, Any]:
"""
Check if fail2ban is installed and if there are currently banned IPs.
Cached for 60 seconds to avoid hammering fail2ban-client.
Returns:
{'installed': bool, 'active': bool, 'status': str, 'detail': str,
'banned_count': int, 'jails': [...], 'banned_ips': [...]}
"""
cache_key = 'fail2ban_bans'
current_time = time.time()
if cache_key in self.last_check_times:
if current_time - self.last_check_times[cache_key] < 60:
return self.cached_results.get(cache_key, {'installed': False, 'status': 'OK', 'detail': 'Not installed'})
result = {'installed': False, 'active': False, 'status': 'OK', 'detail': 'Not installed', 'banned_count': 0, 'jails': [], 'banned_ips': []}
try:
# Check if fail2ban-client exists
which_result = subprocess.run(
['which', 'fail2ban-client'],
capture_output=True, text=True, timeout=2
)
if which_result.returncode != 0:
self.cached_results[cache_key] = result
self.last_check_times[cache_key] = current_time
return result
result['installed'] = True
# Check if fail2ban service is active
active_check = subprocess.run(
['systemctl', 'is-active', 'fail2ban'],
capture_output=True, text=True, timeout=2
)
if active_check.stdout.strip() != 'active':
result['detail'] = 'Fail2Ban installed but service not active'
self.cached_results[cache_key] = result
self.last_check_times[cache_key] = current_time
return result
result['active'] = True
# Get list of active jails
jails_result = subprocess.run(
['fail2ban-client', 'status'],
capture_output=True, text=True, timeout=3
)
jails = []
if jails_result.returncode == 0:
for line in jails_result.stdout.split('\n'):
if 'Jail list:' in line:
jail_str = line.split('Jail list:')[1].strip()
jails = [j.strip() for j in jail_str.split(',') if j.strip()]
break
if not jails:
result['detail'] = 'Fail2Ban active, no jails configured'
self.cached_results[cache_key] = result
self.last_check_times[cache_key] = current_time
return result
result['jails'] = jails
# Check each jail for banned IPs
total_banned = 0
all_banned_ips = []
jails_with_bans = []
for jail in jails:
try:
jail_result = subprocess.run(
['fail2ban-client', 'status', jail],
capture_output=True, text=True, timeout=2
)
if jail_result.returncode == 0:
for line in jail_result.stdout.split('\n'):
if 'Currently banned:' in line:
try:
count = int(line.split('Currently banned:')[1].strip())
if count > 0:
total_banned += count
jails_with_bans.append(jail)
except (ValueError, IndexError):
pass
elif 'Banned IP list:' in line:
ips_str = line.split('Banned IP list:')[1].strip()
if ips_str:
ips = [ip.strip() for ip in ips_str.split() if ip.strip()]
all_banned_ips.extend(ips[:10]) # Limit to 10 IPs per jail
except Exception:
pass
result['banned_count'] = total_banned
result['banned_ips'] = all_banned_ips[:20] # Max 20 total
if total_banned > 0:
jails_str = ', '.join(jails_with_bans)
msg = f'{total_banned} IP(s) currently banned by Fail2Ban (jails: {jails_str})'
result['status'] = 'WARNING'
result['detail'] = msg
# Record in persistence (dismissable)
health_persistence.record_error(
error_key='security_fail2ban_ban',
category='security',
severity='WARNING',
reason=msg,
details={
'banned_count': total_banned,
'jails': jails_with_bans,
'banned_ips': all_banned_ips[:5],
'dismissable': True
}
)
else:
result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
# Auto-resolve if previously banned IPs are now gone
if health_persistence.is_error_active('security_fail2ban_ban'):
health_persistence.clear_error('security_fail2ban_ban')
except Exception as e:
result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}'
self.cached_results[cache_key] = result
self.last_check_times[cache_key] = current_time
return result
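# The jail parser above keys off two lines of `fail2ban-client status <jail>`.
# Typical output looks roughly like this (format can vary slightly between
# fail2ban versions; the IPs below are documentation addresses):
#
#   Status for the jail: sshd
#   |- Filter
#   |  |- Currently failed: 1
#   |  |- Total failed:     37
#   |  `- File list:        /var/log/auth.log
#   `- Actions
#      |- Currently banned: 2
#      |- Total banned:     5
#      `- Banned IP list:   192.0.2.10 198.51.100.7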
def _check_security(self) -> Dict[str, Any]:
"""
Check security-related items:
- Uptime > 1 year (indicates potential kernel vulnerability if not updated)
- SSL certificate expiration (non-INFO certs)
- Excessive failed login attempts
Check security-related items with detailed sub-item breakdown:
- Uptime check: >1 year without kernel update indicates vulnerability
- SSL certificates: PVE certificate expiration
- Login attempts: Excessive failed logins (brute force detection)
- Fail2Ban: Currently banned IPs (if fail2ban is installed)
Returns a result with 'checks' dict containing per-item status.
"""
try:
issues = []
checks = {
'uptime': {'status': 'OK', 'detail': ''},
'certificates': {'status': 'OK', 'detail': ''},
'login_attempts': {'status': 'OK', 'detail': ''},
'fail2ban': {'status': 'OK', 'detail': 'Not installed'}
}
# Check uptime for potential kernel vulnerabilities (if not updated)
# Sub-check 1: Uptime for potential kernel vulnerabilities
try:
uptime_seconds = time.time() - psutil.boot_time()
uptime_days = uptime_seconds / 86400
# If uptime is over a year and no recent updates, it's a warning
if uptime_days > 365:
# Check if updates check shows recent activity
updates_data = self.cached_results.get('updates_check')
if updates_data and updates_data.get('days_since_update', 9999) > 365:
issues.append(f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)')
msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)'
issues.append(msg)
checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days)}
else:
checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'}
else:
checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days'}
except Exception:
pass # Ignore if uptime calculation fails
checks['uptime'] = {'status': 'OK', 'detail': 'Unable to determine uptime'}
# Check SSL certificates (only report non-OK statuses)
# Sub-check 2: SSL certificates
cert_status = self._check_certificates()
if cert_status and cert_status.get('status') not in ['OK', 'INFO']:
issues.append(cert_status.get('reason', 'Certificate issue'))
if cert_status:
cert_sev = cert_status.get('status', 'OK')
cert_reason = cert_status.get('reason', '')
checks['certificates'] = {
'status': cert_sev,
'detail': cert_reason if cert_reason else 'Certificate valid'
}
if cert_sev not in ['OK', 'INFO']:
issues.append(cert_reason or 'Certificate issue')
# Check for excessive failed login attempts in the last 24 hours
# Sub-check 3: Failed login attempts (brute force detection)
try:
result = subprocess.run(
['journalctl', '--since', '24 hours ago', '--no-pager'],
@@ -1823,29 +2237,57 @@ class HealthMonitor:
timeout=3
)
if result.returncode == 0:
failed_logins = 0
if result.returncode == 0:
for line in result.stdout.split('\n'):
# Common patterns for failed logins in journald
if 'authentication failure' in line.lower() or 'failed password' in line.lower() or 'invalid user' in line.lower():
line_lower = line.lower()
if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower:
failed_logins += 1
if failed_logins > 50: # Threshold for significant failed attempts
issues.append(f'{failed_logins} failed login attempts in 24h')
if failed_logins > 50:
msg = f'{failed_logins} failed login attempts in 24h'
issues.append(msg)
checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins}
elif failed_logins > 0:
checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins}
else:
checks['login_attempts'] = {'status': 'OK', 'detail': 'No failed login attempts in 24h', 'count': 0}
except Exception:
pass # Ignore if journalctl fails
checks['login_attempts'] = {'status': 'OK', 'detail': 'Unable to check login attempts'}
# Sub-check 4: Fail2Ban ban detection
try:
f2b = self._check_fail2ban_bans()
checks['fail2ban'] = {
'status': f2b.get('status', 'OK'),
'detail': f2b.get('detail', ''),
'installed': f2b.get('installed', False),
'banned_count': f2b.get('banned_count', 0)
}
if f2b.get('status') == 'WARNING':
issues.append(f2b.get('detail', 'Fail2Ban bans detected'))
except Exception:
checks['fail2ban'] = {'status': 'OK', 'detail': 'Unable to check Fail2Ban'}
# Determine overall security status
if issues:
# Check if any sub-check is CRITICAL
has_critical = any(c.get('status') == 'CRITICAL' for c in checks.values())
overall_status = 'CRITICAL' if has_critical else 'WARNING'
return {
'status': 'WARNING', # Security issues are typically warnings
'reason': '; '.join(issues[:2]) # Show up to 2 issues
'status': overall_status,
'reason': '; '.join(issues[:2]),
'checks': checks
}
return {'status': 'OK'}
return {
'status': 'OK',
'checks': checks
}
except Exception as e:
print(f"[HealthMonitor] Error checking security: {e}")
return {'status': 'OK'}
return {'status': 'OK', 'checks': {}}
def _check_certificates(self) -> Optional[Dict[str, Any]]:
"""
@@ -2138,141 +2580,7 @@ class HealthMonitor:
'timestamp': datetime.now().isoformat()
}
# This is a duplicate of the get_detailed_status method at the top of the file.
# It's likely an oversight from copy-pasting. One of them should be removed or renamed.
# Keeping both for now to match the provided structure, but in a refactor, this would be cleaned up.
def get_detailed_status(self) -> Dict[str, Any]:
"""
Get comprehensive health status with all checks.
Returns JSON structure with ALL 10 categories always present.
Now includes persistent error tracking.
"""
active_errors = health_persistence.get_active_errors()
# No need to create persistent_issues dict here, it's implicitly handled by the checks
details = {
'cpu': {'status': 'OK'},
'memory': {'status': 'OK'},
'storage': {'status': 'OK'}, # This will be overwritten by specific storage checks
'disks': {'status': 'OK'}, # This will be overwritten by disk/filesystem checks
'network': {'status': 'OK'},
'vms': {'status': 'OK'},
'services': {'status': 'OK'},
'logs': {'status': 'OK'},
'updates': {'status': 'OK'},
'security': {'status': 'OK'}
}
critical_issues = []
warning_issues = []
info_issues = [] # Added info_issues to track INFO separately
# --- Priority Order of Checks ---
# Priority 1: Critical PVE Services
services_status = self._check_pve_services()
details['services'] = services_status
if services_status['status'] == 'CRITICAL':
critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}")
elif services_status['status'] == 'WARNING':
warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}")
# Priority 1.5: Proxmox Storage Check (External Module)
proxmox_storage_result = self._check_proxmox_storage()
if proxmox_storage_result: # Only process if the check ran (module available)
details['storage'] = proxmox_storage_result
if proxmox_storage_result.get('status') == 'CRITICAL':
critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable'))
elif proxmox_storage_result.get('status') == 'WARNING':
warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue'))
# Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors)
storage_status = self._check_storage_optimized()
details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues
if storage_status.get('status') == 'CRITICAL':
critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}")
elif storage_status.get('status') == 'WARNING':
warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}")
# Priority 3: VMs/CTs Status (with persistence)
vms_status = self._check_vms_cts_with_persistence()
details['vms'] = vms_status
if vms_status.get('status') == 'CRITICAL':
critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}")
elif vms_status.get('status') == 'WARNING':
warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}")
# Priority 4: Network Connectivity
network_status = self._check_network_optimized()
details['network'] = network_status
if network_status.get('status') == 'CRITICAL':
critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}")
elif network_status.get('status') == 'WARNING':
warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}")
# Priority 5: CPU Usage (with hysteresis)
cpu_status = self._check_cpu_with_hysteresis()
details['cpu'] = cpu_status
if cpu_status.get('status') == 'CRITICAL':
critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}")
elif cpu_status.get('status') == 'WARNING':
warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}")
# Priority 6: Memory Usage (RAM and Swap)
memory_status = self._check_memory_comprehensive()
details['memory'] = memory_status
if memory_status.get('status') == 'CRITICAL':
critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}")
elif memory_status.get('status') == 'WARNING':
warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}")
# Priority 7: Log Analysis (with persistence)
logs_status = self._check_logs_with_persistence()
details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL':
critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}")
elif logs_status.get('status') == 'WARNING':
warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}")
# Priority 8: System Updates
updates_status = self._check_updates()
details['updates'] = updates_status
if updates_status.get('status') == 'CRITICAL':
critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}")
elif updates_status.get('status') == 'WARNING':
warning_issues.append(f"Updates: {updates_status.get('reason', 'Updates pending')}")
elif updates_status.get('status') == 'INFO':
info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}")
# Priority 9: Security Checks
security_status = self._check_security()
details['security'] = security_status
if security_status.get('status') == 'WARNING':
warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}")
elif security_status.get('status') == 'INFO':
info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")
# --- Determine Overall Status ---
# Use a fixed order of severity: CRITICAL > WARNING > INFO > OK
if critical_issues:
overall = 'CRITICAL'
summary = '; '.join(critical_issues[:3]) # Limit summary to 3 issues
elif warning_issues:
overall = 'WARNING'
summary = '; '.join(warning_issues[:3])
elif info_issues:
overall = 'OK' # INFO statuses don't degrade overall health
summary = '; '.join(info_issues[:3])
else:
overall = 'OK'
summary = 'All systems operational'
return {
'overall': overall,
'summary': summary,
'details': details,
'timestamp': datetime.now().isoformat()
}
# Duplicate get_detailed_status was removed during refactor (v1.1)
# Global instance

View File

@@ -1,7 +1,8 @@
"""
Health Monitor Persistence Module
Manages persistent error tracking across AppImage updates using SQLite.
Stores errors in /root/.config/proxmenux-monitor/health_monitor.db
Stores errors in /usr/local/share/proxmenux/health_monitor.db
(same directory as monitor.db for temperature history)
Features:
- Persistent error storage (survives AppImage updates)
@@ -10,7 +11,7 @@ Features:
- Manual acknowledgment support
Author: MacRimi
Version: 1.0
Version: 1.1
"""
import sqlite3
@@ -30,8 +31,8 @@ class HealthPersistence:
UPDATES_SUPPRESSION = 180 * 24 * 3600 # 180 days (6 months)
def __init__(self):
"""Initialize persistence with database in config directory"""
self.data_dir = Path('/root/.config/proxmenux-monitor')
"""Initialize persistence with database in shared ProxMenux data directory"""
self.data_dir = Path('/usr/local/share/proxmenux')
self.data_dir.mkdir(parents=True, exist_ok=True)
self.db_path = self.data_dir / 'health_monitor.db'
@@ -186,26 +187,104 @@ class HealthPersistence:
conn.commit()
conn.close()
def acknowledge_error(self, error_key: str):
def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
"""
Manually acknowledge an error (won't notify again or re-appear for 24h).
Also marks as resolved so it disappears from active errors.
Check if an error is currently active (unresolved and not acknowledged).
Used by checks to avoid re-recording errors that are already tracked.
"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
if category:
cursor.execute('''
SELECT COUNT(*) FROM errors
WHERE error_key = ? AND category = ?
AND resolved_at IS NULL AND acknowledged = 0
''', (error_key, category))
else:
cursor.execute('''
SELECT COUNT(*) FROM errors
WHERE error_key = ?
AND resolved_at IS NULL AND acknowledged = 0
''', (error_key,))
count = cursor.fetchone()[0]
conn.close()
return count > 0
def clear_error(self, error_key: str):
"""
Remove/resolve a specific error immediately.
Used when the condition that caused the error no longer exists
(e.g., storage became available again).
"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
now = datetime.now().isoformat()
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE error_key = ? AND resolved_at IS NULL
''', (now, error_key))
if cursor.rowcount > 0:
self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
conn.commit()
conn.close()
def acknowledge_error(self, error_key: str) -> Dict[str, Any]:
"""
Manually acknowledge an error (dismiss).
- Marks as acknowledged so it won't re-appear during the suppression period
- Stores the original severity for reference
- Returns info about the acknowledged error
Suppression periods:
- updates category: 180 days (6 months)
- other categories: 24 hours
"""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
now = datetime.now().isoformat()
# Get current error info before acknowledging
cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
row = cursor.fetchone()
result = {'success': False, 'error_key': error_key}
if row:
error_dict = dict(row)
original_severity = error_dict.get('severity', 'WARNING')
category = error_dict.get('category', '')
cursor.execute('''
UPDATE errors
SET acknowledged = 1, resolved_at = ?
WHERE error_key = ?
''', (now, error_key))
self._record_event(cursor, 'acknowledged', error_key, {})
self._record_event(cursor, 'acknowledged', error_key, {
'original_severity': original_severity,
'category': category
})
result = {
'success': True,
'error_key': error_key,
'original_severity': original_severity,
'category': category,
'acknowledged_at': now
}
conn.commit()
conn.close()
return result
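# Minimal usage sketch: dismissing a tracked error returns its original
# severity and category so the API layer can pick the suppression window
# (values below are illustrative).
#
#   result = health_persistence.acknowledge_error('security_fail2ban_ban')
#   # {'success': True, 'error_key': 'security_fail2ban_ban',
#   #  'original_severity': 'WARNING', 'category': 'security',
#   #  'acknowledged_at': '2026-02-16T15:48:41'}
#   # The routes layer maps category 'updates' to 180 days, everything else to 24 h.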
def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get all active (unresolved) errors, optionally filtered by category"""
@@ -315,6 +394,138 @@ class HealthPersistence:
except Exception:
return False
def get_dismissed_errors(self) -> List[Dict[str, Any]]:
"""
Get errors that were acknowledged/dismissed but still within suppression period.
These are shown as INFO in the frontend with a 'Dismissed' badge.
"""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute('''
SELECT * FROM errors
WHERE acknowledged = 1 AND resolved_at IS NOT NULL
ORDER BY resolved_at DESC
''')
rows = cursor.fetchall()
conn.close()
dismissed = []
now = datetime.now()
for row in rows:
error_dict = dict(row)
if error_dict.get('details'):
try:
error_dict['details'] = json.loads(error_dict['details'])
except (json.JSONDecodeError, TypeError):
pass
# Check if still within suppression period
try:
resolved_dt = datetime.fromisoformat(error_dict['resolved_at'])
elapsed_seconds = (now - resolved_dt).total_seconds()
if error_dict.get('category') == 'updates':
suppression = self.UPDATES_SUPPRESSION
else:
suppression = 24 * 3600 # 24 hours
if elapsed_seconds < suppression:
error_dict['dismissed'] = True
error_dict['suppression_remaining_hours'] = round(
(suppression - elapsed_seconds) / 3600, 1
)
dismissed.append(error_dict)
except (ValueError, TypeError):
pass
return dismissed
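# Worked example of the suppression arithmetic above: an 'updates' error
# dismissed 2 days ago is still suppressed,
#   elapsed   = 2 * 24 * 3600                         = 172800 s
#   window    = UPDATES_SUPPRESSION = 180 * 24 * 3600 = 15552000 s
#   remaining = (15552000 - 172800) / 3600            = 4272.0 h  -> suppression_remaining_hours
# while a non-'updates' error dismissed 30 hours ago has outlived its 24 h
# window and is no longer returned here.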
def emit_event(self, event_type: str, category: str, severity: str,
data: Optional[Dict] = None) -> int:
"""
Emit a health event for the notification system.
Returns the event ID.
Event types:
- 'state_change': severity changed (OK->WARNING, WARNING->CRITICAL, etc.)
- 'new_error': new error detected
- 'resolved': error resolved
- 'escalated': severity increased
"""
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
event_data = data or {}
event_data['category'] = category
event_data['severity'] = severity
event_data['needs_notification'] = True
cursor.execute('''
INSERT INTO events (event_type, error_key, timestamp, data)
VALUES (?, ?, ?, ?)
''', (event_type, f'{category}_{severity}', datetime.now().isoformat(),
json.dumps(event_data)))
event_id = cursor.lastrowid
conn.commit()
conn.close()
return event_id
def get_pending_notifications(self) -> List[Dict[str, Any]]:
"""
Get events that need notification (for future Telegram/Gotify integration).
Groups by severity for batch notification sending.
"""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute('''
SELECT e.*, err.category as error_category, err.reason as error_reason
FROM events e
LEFT JOIN errors err ON e.error_key = err.error_key
WHERE json_extract(e.data, '$.needs_notification') = 1
ORDER BY e.timestamp DESC
LIMIT 100
''')
rows = cursor.fetchall()
conn.close()
events = []
for row in rows:
event_dict = dict(row)
if event_dict.get('data'):
try:
event_dict['data'] = json.loads(event_dict['data'])
except (json.JSONDecodeError, TypeError):
pass
events.append(event_dict)
return events
def mark_events_notified(self, event_ids: List[int]):
"""Mark events as notified (notification was sent successfully)"""
if not event_ids:
return
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
for event_id in event_ids:
cursor.execute('''
UPDATE events
SET data = json_set(COALESCE(data, '{}'), '$.needs_notification', 0, '$.notified_at', ?)
WHERE id = ?
''', (datetime.now().isoformat(), event_id))
conn.commit()
conn.close()
def _record_event(self, cursor, event_type: str, error_key: str, data: Dict):
"""Internal: Record an event"""
cursor.execute('''