From 2ee5be7402a4208b6455bf7bd54918e1e7fc8295 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 16 Feb 2026 15:48:41 +0100 Subject: [PATCH] Update health monitor --- AppImage/app/page.tsx | 2 +- AppImage/components/health-status-modal.tsx | 448 +++++++++---- AppImage/scripts/flask_health_routes.py | 100 ++- AppImage/scripts/health_monitor.py | 696 ++++++++++++++------ AppImage/scripts/health_persistence.py | 231 ++++++- 5 files changed, 1153 insertions(+), 324 deletions(-) diff --git a/AppImage/app/page.tsx b/AppImage/app/page.tsx index 029d59c1..36f64735 100644 --- a/AppImage/app/page.tsx +++ b/AppImage/app/page.tsx @@ -40,7 +40,7 @@ export default function Home() { authenticated, }) } catch (error) { - console.error("[v0] Failed to check auth status:", error) + console.error("Failed to check auth status:", error) setAuthStatus({ loading: false, authEnabled: false, diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index d1968e84..3f59f5e4 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -2,7 +2,7 @@ import type React from "react" -import { useState, useEffect } from "react" +import { useState, useEffect, useCallback } from "react" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" import { Badge } from "@/components/ui/badge" import { Button } from "@/components/ui/button" @@ -11,6 +11,7 @@ import { CheckCircle2, AlertTriangle, XCircle, + Info, Activity, Cpu, MemoryStick, @@ -23,16 +24,30 @@ import { RefreshCw, Shield, X, + Clock, + BellOff, + ChevronRight, } from "lucide-react" interface CategoryCheck { status: string reason?: string details?: any + checks?: Record dismissable?: boolean [key: string]: any } +interface DismissedError { + error_key: string + category: string + severity: string + reason: string + dismissed: boolean + suppression_remaining_hours: number + resolved_at: string +} + interface HealthDetails { overall: string summary: string @@ -51,6 +66,13 @@ interface HealthDetails { timestamp: string } +interface FullHealthData { + health: HealthDetails + active_errors: any[] + dismissed: DismissedError[] + timestamp: string +} + interface HealthStatusModalProps { open: boolean onOpenChange: (open: boolean) => void @@ -73,7 +95,41 @@ const CATEGORIES = [ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatusModalProps) { const [loading, setLoading] = useState(true) const [healthData, setHealthData] = useState(null) + const [dismissedItems, setDismissedItems] = useState([]) const [error, setError] = useState(null) + const [dismissingKey, setDismissingKey] = useState(null) + const [expandedCategories, setExpandedCategories] = useState>(new Set()) + + const fetchHealthDetails = useCallback(async () => { + setLoading(true) + setError(null) + + try { + // Use the new combined endpoint for fewer round-trips + const response = await fetch(getApiUrl("/api/health/full")) + if (!response.ok) { + // Fallback to legacy endpoint + const legacyResponse = await fetch(getApiUrl("/api/health/details")) + if (!legacyResponse.ok) throw new Error("Failed to fetch health details") + const data = await legacyResponse.json() + setHealthData(data) + setDismissedItems([]) + } else { + const fullData: FullHealthData = await response.json() + setHealthData(fullData.health) + setDismissedItems(fullData.dismissed || []) + } + + const event = new CustomEvent("healthStatusUpdated", { + detail: { status: 
healthData?.overall || "OK" }, + }) + window.dispatchEvent(event) + } catch (err) { + setError(err instanceof Error ? err.message : "Unknown error") + } finally { + setLoading(false) + } + }, [getApiUrl, healthData?.overall]) useEffect(() => { if (open) { @@ -81,42 +137,46 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu } }, [open]) - const fetchHealthDetails = async () => { - setLoading(true) - setError(null) - - try { - const response = await fetch(getApiUrl("/api/health/details")) - if (!response.ok) { - throw new Error("Failed to fetch health details") - } - const data = await response.json() - console.log("[v0] Health data received:", data) - setHealthData(data) - - const event = new CustomEvent("healthStatusUpdated", { - detail: { status: data.overall }, + // Auto-expand non-OK categories when data loads + useEffect(() => { + if (healthData?.details) { + const nonOkCategories = new Set() + CATEGORIES.forEach(({ key }) => { + const cat = healthData.details[key as keyof typeof healthData.details] + if (cat && cat.status?.toUpperCase() !== "OK") { + nonOkCategories.add(key) + } }) - window.dispatchEvent(event) - } catch (err) { - console.error("[v0] Error fetching health data:", err) - setError(err instanceof Error ? err.message : "Unknown error") - } finally { - setLoading(false) + setExpandedCategories(nonOkCategories) } + }, [healthData]) + + const toggleCategory = (key: string) => { + setExpandedCategories(prev => { + const next = new Set(prev) + if (next.has(key)) { + next.delete(key) + } else { + next.add(key) + } + return next + }) } - const getStatusIcon = (status: string) => { + const getStatusIcon = (status: string, size: "sm" | "md" = "md") => { const statusUpper = status?.toUpperCase() + const cls = size === "sm" ? 
"h-4 w-4" : "h-5 w-5" switch (statusUpper) { case "OK": - return + return + case "INFO": + return case "WARNING": - return + return case "CRITICAL": - return + return default: - return + return } } @@ -125,6 +185,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu switch (statusUpper) { case "OK": return OK + case "INFO": + return Info case "WARNING": return Warning case "CRITICAL": @@ -136,10 +198,11 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const getHealthStats = () => { if (!healthData?.details) { - return { total: 0, healthy: 0, warnings: 0, critical: 0 } + return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0 } } let healthy = 0 + let info = 0 let warnings = 0 let critical = 0 @@ -148,22 +211,22 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu if (categoryData) { const status = categoryData.status?.toUpperCase() if (status === "OK") healthy++ + else if (status === "INFO") info++ else if (status === "WARNING") warnings++ else if (status === "CRITICAL") critical++ } }) - return { total: CATEGORIES.length, healthy, warnings, critical } + return { total: CATEGORIES.length, healthy, info, warnings, critical } } const stats = getHealthStats() const handleCategoryClick = (categoryKey: string, status: string) => { - if (status === "OK") return // No navegar si está OK + if (status === "OK" || status === "INFO") return - onOpenChange(false) // Cerrar el modal + onOpenChange(false) - // Mapear categorías a tabs const categoryToTab: Record = { storage: "storage", disks: "storage", @@ -176,43 +239,156 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const targetTab = categoryToTab[categoryKey] if (targetTab) { - // Disparar evento para cambiar tab const event = new CustomEvent("changeTab", { detail: { tab: targetTab } }) window.dispatchEvent(event) } } const handleAcknowledge = async (errorKey: string, e: React.MouseEvent) => { - e.stopPropagation() // Prevent navigation - - console.log("[v0] Dismissing error:", errorKey) + e.stopPropagation() + setDismissingKey(errorKey) try { const response = await fetch(getApiUrl("/api/health/acknowledge"), { method: "POST", - headers: { - "Content-Type": "application/json", - }, + headers: { "Content-Type": "application/json" }, body: JSON.stringify({ error_key: errorKey }), }) if (!response.ok) { const errorData = await response.json() - console.error("[v0] Acknowledge failed:", errorData) - throw new Error(errorData.error || "Failed to acknowledge error") + throw new Error(errorData.error || "Failed to dismiss error") } - const result = await response.json() - console.log("[v0] Acknowledge success:", result) - - // Refresh health data await fetchHealthDetails() } catch (err) { - console.error("[v0] Error acknowledging:", err) - alert("Failed to dismiss error. 
Please try again.") + console.error("Error dismissing:", err) + } finally { + setDismissingKey(null) } } + const getTimeSinceCheck = () => { + if (!healthData?.timestamp) return null + const checkTime = new Date(healthData.timestamp) + const now = new Date() + const diffMs = now.getTime() - checkTime.getTime() + const diffMin = Math.floor(diffMs / 60000) + if (diffMin < 1) return "just now" + if (diffMin === 1) return "1 minute ago" + if (diffMin < 60) return `${diffMin} minutes ago` + const diffHours = Math.floor(diffMin / 60) + return `${diffHours}h ${diffMin % 60}m ago` + } + + const getCategoryRowStyle = (status: string) => { + const s = status?.toUpperCase() + if (s === "CRITICAL") return "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer" + if (s === "WARNING") return "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer" + if (s === "INFO") return "bg-blue-500/5 border-blue-500/20 hover:bg-blue-500/10" + return "bg-card border-border hover:bg-muted/30" + } + + const getOutlineBadgeStyle = (status: string) => { + const s = status?.toUpperCase() + if (s === "OK") return "border-green-500 text-green-500 bg-transparent" + if (s === "INFO") return "border-blue-500 text-blue-500 bg-blue-500/5" + if (s === "WARNING") return "border-yellow-500 text-yellow-500 bg-yellow-500/5" + if (s === "CRITICAL") return "border-red-500 text-red-500 bg-red-500/5" + return "" + } + + const formatCheckLabel = (key: string): string => { + const labels: Record = { + cpu_usage: "CPU Usage", + cpu_temperature: "Temperature", + ram_usage: "RAM Usage", + swap_usage: "Swap Usage", + root_filesystem: "Root Filesystem", + lvm_check: "LVM Status", + connectivity: "Connectivity", + all_vms_cts: "VMs & Containers", + cluster_mode: "Cluster Mode", + error_cascade: "Error Cascade", + error_spike: "Error Spike", + persistent_errors: "Persistent Errors", + critical_errors: "Critical Errors", + security_updates: "Security Updates", + system_age: "System Age", + pending_updates: "Pending Updates", + kernel_pve: "Kernel / PVE", + uptime: "Uptime", + certificates: "Certificates", + login_attempts: "Login Attempts", + fail2ban: "Fail2Ban", + } + if (labels[key]) return labels[key] + // Convert snake_case or camelCase to Title Case + return key + .replace(/_/g, " ") + .replace(/([a-z])([A-Z])/g, "$1 $2") + .replace(/\b\w/g, (c) => c.toUpperCase()) + } + + const renderChecks = ( + checks: Record, + categoryKey: string + ) => { + if (!checks || Object.keys(checks).length === 0) return null + + return ( +
+ {Object.entries(checks).map(([checkKey, checkData]) => { + const isDismissable = checkData.dismissable === true + const checkStatus = checkData.status?.toUpperCase() || "OK" + + return ( +
+
+ {getStatusIcon(checkData.status, "sm")} + {formatCheckLabel(checkKey)} + {checkData.detail} +
+
+ {checkData.thresholds && ( + + ({checkData.thresholds}) + + )} + {(checkStatus === "WARNING" || checkStatus === "CRITICAL") && isDismissable && ( + + )} +
+
+ ) + })} +
+ ) + } + + + return ( @@ -224,7 +400,15 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu {healthData &&
{getStatusBadge(healthData.overall)}
} - Detailed health checks for all system components + + Detailed health checks for all system components + {getTimeSinceCheck() && ( + + + Last check: {getTimeSinceCheck()} + + )} + {loading && ( @@ -243,15 +427,21 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu {healthData && !loading && (
{/* Overall Stats Summary */} -
+
0 ? "grid-cols-5" : "grid-cols-4"}`}>
{stats.total}
-
Total Checks
+
Total
{stats.healthy}
Healthy
+ {stats.info > 0 && ( +
+
{stats.info}
+
Info
+
+ )}
{stats.warnings}
Warnings
@@ -268,91 +458,117 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
)} + {/* Category List */}
{CATEGORIES.map(({ key, label, Icon }) => { const categoryData = healthData.details[key as keyof typeof healthData.details] const status = categoryData?.status || "UNKNOWN" const reason = categoryData?.reason - const details = categoryData?.details + const checks = categoryData?.checks + const isExpanded = expandedCategories.has(key) + const hasChecks = checks && Object.keys(checks).length > 0 return (
handleCategoryClick(key, status)} - className={`flex items-start gap-3 p-3 rounded-lg border transition-colors ${ - status === "OK" - ? "bg-card border-border hover:bg-muted/30" - : status === "WARNING" - ? "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer" - : status === "CRITICAL" - ? "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer" - : "bg-muted/30 hover:bg-muted/50" - }`} + className={`rounded-lg border transition-colors overflow-hidden ${getCategoryRowStyle(status)}`} > -
- - {getStatusIcon(status)} -
-
-
-

{label}

- + {/* Clickable header row */} +
toggleCategory(key)} + > +
+ + {getStatusIcon(status)} +
+
+
+

{label}

+ {hasChecks && ( + + ({Object.keys(checks).length} checks) + + )} +
+ {reason && !isExpanded && ( +

{reason}

+ )} +
+
+ {status} +
- {reason &&

{reason}

} - {details && typeof details === "object" && ( -
- {Object.entries(details).map(([detailKey, detailValue]: [string, any]) => { - if (typeof detailValue === "object" && detailValue !== null) { - const isDismissable = detailValue.dismissable !== false - - return ( -
-
- {detailKey}: - {detailValue.reason && ( - {detailValue.reason} - )} -
- {(status === "WARNING" || status === "CRITICAL") && isDismissable && ( - - )} -
- ) - } - return null - })} -
- )}
+ + {/* Expandable checks section */} + {isExpanded && ( +
+ {reason && ( +

{reason}

+ )} + {hasChecks ? ( + renderChecks(checks, key) + ) : ( +
+ + No issues detected +
+ )} +
+ )}
) })}
+ {/* Dismissed Items Section */} + {dismissedItems.length > 0 && ( +
+
+ + Dismissed Items ({dismissedItems.length}) +
+ {dismissedItems.map((item) => ( +
+
+ + {getStatusIcon("INFO")} +
+
+
+

{item.reason}

+
+ + Dismissed + + + was {item.severity} + +
+
+

+ + Suppressed for {item.suppression_remaining_hours < 24 + ? `${Math.round(item.suppression_remaining_hours)}h` + : `${Math.round(item.suppression_remaining_hours / 24)} days` + } more +

+
+
+ ))} +
+ )} + {healthData.timestamp && (
Last updated: {new Date(healthData.timestamp).toLocaleString()} diff --git a/AppImage/scripts/flask_health_routes.py b/AppImage/scripts/flask_health_routes.py index 31e41503..a89300dc 100644 --- a/AppImage/scripts/flask_health_routes.py +++ b/AppImage/scripts/flask_health_routes.py @@ -51,15 +51,45 @@ def get_system_info(): @health_bp.route('/api/health/acknowledge', methods=['POST']) def acknowledge_error(): - """Acknowledge an error manually (user dismissed it)""" + """ + Acknowledge/dismiss an error manually. + Returns details about the acknowledged error including original severity + and suppression period info. + """ try: data = request.get_json() if not data or 'error_key' not in data: return jsonify({'error': 'error_key is required'}), 400 error_key = data['error_key'] - health_persistence.acknowledge_error(error_key) - return jsonify({'success': True, 'message': 'Error acknowledged'}) + result = health_persistence.acknowledge_error(error_key) + + if result.get('success'): + # Determine suppression period for the response + category = result.get('category', '') + if category == 'updates': + suppression_hours = 180 * 24 # 180 days in hours + suppression_label = '6 months' + else: + suppression_hours = 24 + suppression_label = '24 hours' + + return jsonify({ + 'success': True, + 'message': f'Error dismissed for {suppression_label}', + 'error_key': error_key, + 'original_severity': result.get('original_severity', 'WARNING'), + 'category': category, + 'suppression_hours': suppression_hours, + 'suppression_label': suppression_label, + 'acknowledged_at': result.get('acknowledged_at') + }) + else: + return jsonify({ + 'success': False, + 'message': 'Error not found or already dismissed', + 'error_key': error_key + }), 404 except Exception as e: return jsonify({'error': str(e)}), 500 @@ -72,3 +102,67 @@ def get_active_errors(): return jsonify({'errors': errors}) except Exception as e: return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/dismissed', methods=['GET']) +def get_dismissed_errors(): + """ + Get dismissed errors that are still within their suppression period. + These are shown as INFO items with a 'Dismissed' badge in the frontend. + """ + try: + dismissed = health_persistence.get_dismissed_errors() + return jsonify({'dismissed': dismissed}) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/full', methods=['GET']) +def get_full_health(): + """ + Get complete health data in a single request: detailed status + active errors + dismissed. + Reduces frontend round-trips. + """ + try: + details = health_monitor.get_detailed_status() + active_errors = health_persistence.get_active_errors() + dismissed = health_persistence.get_dismissed_errors() + + return jsonify({ + 'health': details, + 'active_errors': active_errors, + 'dismissed': dismissed, + 'timestamp': details.get('timestamp') + }) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/pending-notifications', methods=['GET']) +def get_pending_notifications(): + """ + Get events pending notification (for future Telegram/Gotify/Discord integration). + This endpoint will be consumed by the Notification Service (Bloque A). 
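+
+    Response shape (illustrative; 'pending' and 'count' are built in this route,
+    the per-event fields come from the events table plus the JOIN in
+    health_persistence.get_pending_notifications):
+        {"pending": [ ...event rows, with the JSON 'data' column parsed to a dict... ],
+         "count": <number of pending events>}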
+ """ + try: + pending = health_persistence.get_pending_notifications() + return jsonify({'pending': pending, 'count': len(pending)}) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@health_bp.route('/api/health/mark-notified', methods=['POST']) +def mark_events_notified(): + """ + Mark events as notified after notification was sent successfully. + Used by the Notification Service (Bloque A) after sending alerts. + """ + try: + data = request.get_json() + if not data or 'event_ids' not in data: + return jsonify({'error': 'event_ids array is required'}), 400 + + event_ids = data['event_ids'] + if not isinstance(event_ids, list): + return jsonify({'error': 'event_ids must be an array'}), 400 + + health_persistence.mark_events_notified(event_ids) + return jsonify({'success': True, 'marked_count': len(event_ids)}) + except Exception as e: + return jsonify({'error': str(e)}), 500 diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 723d6e61..a87aeea3 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -373,6 +373,44 @@ class HealthMonitor: overall = 'OK' summary = 'All systems operational' + # --- Emit events for state changes (Bloque A: Notification prep) --- + try: + previous_overall = getattr(self, '_last_overall_status', None) + if previous_overall and previous_overall != overall: + # Overall status changed - emit event + health_persistence.emit_event( + event_type='state_change', + category='overall', + severity=overall, + data={ + 'previous': previous_overall, + 'current': overall, + 'summary': summary + } + ) + + # Track per-category state changes + previous_details = getattr(self, '_last_category_statuses', {}) + for cat_key, cat_data in details.items(): + cat_status = cat_data.get('status', 'OK') + prev_status = previous_details.get(cat_key, 'OK') + if prev_status != cat_status and cat_status in ('WARNING', 'CRITICAL'): + health_persistence.emit_event( + event_type='state_change', + category=cat_key, + severity=cat_status, + data={ + 'previous': prev_status, + 'current': cat_status, + 'reason': cat_data.get('reason', '') + } + ) + + self._last_overall_status = overall + self._last_category_statuses = {k: v.get('status', 'OK') for k, v in details.items()} + except Exception: + pass # Event emission should never break health checks + return { 'overall': overall, 'summary': summary, @@ -445,6 +483,30 @@ class HealthMonitor: result['status'] = 'WARNING' result['reason'] = temp_status.get('reason') + # Build checks dict for frontend expandable section + checks = { + 'cpu_usage': { + 'status': status, + 'detail': f'{round(cpu_percent, 1)}% ({psutil.cpu_count()} cores)', + 'value': round(cpu_percent, 1), + 'thresholds': f'Warning >{self.CPU_WARNING}%, Critical >{self.CPU_CRITICAL}%' + } + } + if temp_status and temp_status.get('status') != 'UNKNOWN': + temp_val = temp_status.get('value', 'N/A') + checks['cpu_temperature'] = { + 'status': temp_status.get('status', 'OK'), + 'detail': f'{temp_val}°C' if isinstance(temp_val, (int, float)) else str(temp_val), + 'value': temp_val, + 'thresholds': 'Warning >80°C sustained >3min' + } + else: + checks['cpu_temperature'] = { + 'status': 'OK', + 'detail': 'Sensor not available', + } + + result['checks'] = checks return result except Exception as e: @@ -617,12 +679,35 @@ class HealthMonitor: status = 'OK' reason = None + ram_avail_gb = round(memory.available / (1024**3), 2) + ram_total_gb = round(memory.total / (1024**3), 2) + swap_used_gb = round(swap.used 
/ (1024**3), 2) + swap_total_gb = round(swap.total / (1024**3), 2) + + # Determine per-sub-check status + ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK') + swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK' + result = { 'status': status, 'ram_percent': round(mem_percent, 1), - 'ram_available_gb': round(memory.available / (1024**3), 2), + 'ram_available_gb': ram_avail_gb, 'swap_percent': round(swap_percent, 1), - 'swap_used_gb': round(swap.used / (1024**3), 2) + 'swap_used_gb': swap_used_gb, + 'checks': { + 'ram_usage': { + 'status': ram_status, + 'detail': f'{round(mem_percent, 1)}% used ({ram_avail_gb} GB free of {ram_total_gb} GB)', + 'value': round(mem_percent, 1), + 'thresholds': f'Warning >{self.MEMORY_WARNING}%, Critical >90%' + }, + 'swap_usage': { + 'status': swap_status, + 'detail': f'{round(swap_percent, 1)}% used ({swap_used_gb} GB of {swap_total_gb} GB)' if swap.total > 0 else 'No swap configured', + 'value': round(swap_percent, 1), + 'thresholds': 'Critical when swap >20% of RAM' + } + } } if reason: @@ -706,8 +791,28 @@ class HealthMonitor: issues.append(f"LVM check: {lvm_status.get('reason')}") storage_details['lvm_check'] = lvm_status + # Check dmesg for real-time I/O errors (dmesg-based, complements journalctl SMART checks) + dmesg_io_result = self._check_disks_optimized() + if dmesg_io_result.get('status') != 'OK': + dmesg_details = dmesg_io_result.get('details', {}) + for disk_path, disk_info in dmesg_details.items(): + if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK': + issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}') + storage_details[disk_path] = disk_info + + # Build checks dict from storage_details, adding OK entries for items with no issues + checks = {} + for key, val in storage_details.items(): + checks[key] = { + 'status': val.get('status', 'OK'), + 'detail': val.get('reason', 'OK'), + **{k: v for k, v in val.items() if k not in ('status', 'reason')} + } + if not issues: - return {'status': 'OK'} + # Add a summary OK entry if nothing specific + checks['root_filesystem'] = checks.get('/', {'status': 'OK', 'detail': 'Root filesystem healthy'}) + return {'status': 'OK', 'checks': checks} # Determine overall status has_critical = any(d.get('status') == 'CRITICAL' for d in storage_details.values()) @@ -715,7 +820,8 @@ class HealthMonitor: return { 'status': 'CRITICAL' if has_critical else 'WARNING', 'reason': '; '.join(issues[:3]), - 'details': storage_details + 'details': storage_details, + 'checks': checks } def _check_filesystem(self, mount_point: str) -> Dict[str, Any]: @@ -1025,19 +1131,42 @@ class HealthMonitor: # Check connectivity (latency) latency_status = self._check_network_latency() - if latency_status and latency_status.get('status') not in ['OK', 'INFO', 'UNKNOWN']: - issues.append(latency_status.get('reason', 'Network latency issue')) + if latency_status: + latency_ms = latency_status.get('latency_ms', 'N/A') + latency_sev = latency_status.get('status', 'OK') interface_details['connectivity'] = latency_status + connectivity_check = { + 'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK', + 'detail': f'Latency {latency_ms}ms to 1.1.1.1' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'), + } + if latency_sev not in ['OK', 'INFO', 'UNKNOWN']: + issues.append(latency_status.get('reason', 'Network latency issue')) + else: + 
connectivity_check = {'status': 'OK', 'detail': 'Not tested'} + + # Build checks dict + checks = {} + for iface in active_interfaces: + checks[iface] = {'status': 'OK', 'detail': 'UP'} + for iface, detail in interface_details.items(): + if iface != 'connectivity': + checks[iface] = { + 'status': detail.get('status', 'OK'), + 'detail': detail.get('reason', 'DOWN'), + 'dismissable': detail.get('dismissable', False) + } + checks['connectivity'] = connectivity_check if not issues: - return {'status': 'OK'} + return {'status': 'OK', 'checks': checks} has_critical = any(d.get('status') == 'CRITICAL' for d in interface_details.values()) return { 'status': 'CRITICAL' if has_critical else 'WARNING', 'reason': '; '.join(issues[:2]), - 'details': interface_details + 'details': interface_details, + 'checks': checks } except Exception: @@ -1348,26 +1477,51 @@ class HealthMonitor: 'type': vm_type } + # Build checks dict from vm_details + checks = {} + for key, val in vm_details.items(): + vm_label = f"{val.get('type', 'VM')} {val.get('id', key)}" + checks[vm_label] = { + 'status': val.get('status', 'WARNING'), + 'detail': val.get('reason', 'Error'), + 'dismissable': True + } + if not issues: - return {'status': 'OK'} + checks['all_vms_cts'] = {'status': 'OK', 'detail': 'No issues detected in logs'} + return {'status': 'OK', 'checks': checks} has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values()) return { 'status': 'CRITICAL' if has_critical else 'WARNING', 'reason': '; '.join(issues[:3]), - 'details': vm_details + 'details': vm_details, + 'checks': checks } except Exception: - return {'status': 'OK'} + return {'status': 'OK', 'checks': {}} def _check_pve_services(self) -> Dict[str, Any]: - """Check critical Proxmox services""" + """ + Check critical Proxmox services with persistence tracking. 
+ - Checks the base PVE_SERVICES list + - Dynamically adds corosync if a cluster config exists + - Records failed services in persistence for tracking/dismiss + - Auto-clears when services recover + """ try: - failed_services = [] + # Build service list: base PVE services + corosync if clustered + services_to_check = list(self.PVE_SERVICES) + is_cluster = os.path.exists('/etc/corosync/corosync.conf') + if is_cluster and 'corosync' not in services_to_check: + services_to_check.append('corosync') - for service in self.PVE_SERVICES: + failed_services = [] + service_details = {} + + for service in services_to_check: try: result = subprocess.run( ['systemctl', 'is-active', service], @@ -1376,23 +1530,79 @@ class HealthMonitor: timeout=2 ) - if result.returncode != 0 or result.stdout.strip() != 'active': + status = result.stdout.strip() + if result.returncode != 0 or status != 'active': failed_services.append(service) + service_details[service] = status or 'inactive' except Exception: - # If systemctl fails (e.g., command not found or service doesn't exist), treat as failed failed_services.append(service) + service_details[service] = 'error' - if failed_services: - return { - 'status': 'CRITICAL', - 'reason': f'Services inactive: {", ".join(failed_services)}', - 'failed': failed_services + # Build checks dict with status per service + checks = {} + for svc in services_to_check: + if svc in failed_services: + state = service_details.get(svc, 'inactive') + checks[svc] = { + 'status': 'CRITICAL', + 'detail': f'Service is {state}', + } + else: + checks[svc] = { + 'status': 'OK', + 'detail': 'Active', + } + + if is_cluster: + checks['cluster_mode'] = { + 'status': 'OK', + 'detail': 'Cluster detected (corosync.conf present)', } - return {'status': 'OK'} + if failed_services: + reason = f'Services inactive: {", ".join(failed_services)}' + + # Record each failed service in persistence + for svc in failed_services: + error_key = f'pve_service_{svc}' + health_persistence.record_error( + error_key=error_key, + category='services', + severity='CRITICAL', + reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}', + details={'service': svc, 'state': service_details.get(svc, 'inactive')} + ) + + # Auto-clear services that recovered + for svc in services_to_check: + if svc not in failed_services: + error_key = f'pve_service_{svc}' + if health_persistence.is_error_active(error_key): + health_persistence.clear_error(error_key) + + return { + 'status': 'CRITICAL', + 'reason': reason, + 'failed': failed_services, + 'is_cluster': is_cluster, + 'services_checked': len(services_to_check), + 'checks': checks + } + + # All OK - clear any previously tracked service errors + for svc in services_to_check: + error_key = f'pve_service_{svc}' + if health_persistence.is_error_active(error_key): + health_persistence.clear_error(error_key) + + return { + 'status': 'OK', + 'is_cluster': is_cluster, + 'services_checked': len(services_to_check), + 'checks': checks + } except Exception as e: - # If the entire systemctl check fails return { 'status': 'WARNING', 'reason': f'Service check command failed: {str(e)}' @@ -1620,7 +1830,31 @@ class HealthMonitor: status = 'OK' reason = None - log_result = {'status': status} + # Build checks dict for log sub-items + log_checks = { + 'error_cascade': { + 'status': 'WARNING' if cascade_count > 0 else 'OK', + 'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors', + 'dismissable': True + }, + 'error_spike': { + 'status': 
'WARNING' if spike_count > 0 else 'OK', + 'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes', + 'dismissable': True + }, + 'persistent_errors': { + 'status': 'WARNING' if persistent_count > 0 else 'OK', + 'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns', + 'dismissable': True + }, + 'critical_errors': { + 'status': 'CRITICAL' if unique_critical_count > 0 else 'OK', + 'detail': f'{unique_critical_count} critical error(s) found' if unique_critical_count > 0 else 'No critical errors', + 'dismissable': True + } + } + + log_result = {'status': status, 'checks': log_checks} if reason: log_result['reason'] = reason @@ -1629,7 +1863,12 @@ class HealthMonitor: return log_result # If journalctl command failed or returned no data - ok_result = {'status': 'OK'} + ok_result = {'status': 'OK', 'checks': { + 'error_cascade': {'status': 'OK', 'detail': 'No cascading errors'}, + 'error_spike': {'status': 'OK', 'detail': 'No error spikes'}, + 'persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'}, + 'critical_errors': {'status': 'OK', 'detail': 'No critical errors'} + }} self.cached_results[cache_key] = ok_result self.last_check_times[cache_key] = current_time return ok_result @@ -1662,9 +1901,9 @@ class HealthMonitor: def _check_updates(self) -> Optional[Dict[str, Any]]: """ Check for pending system updates. - - WARNING: If security updates are available. - - CRITICAL: If system not updated in >2 years. - - INFO: If 1-2 years without updates, or many non-security updates. + - WARNING: Security updates available, or system not updated >1 year (365 days). + - CRITICAL: System not updated >18 months (548 days). + - INFO: Kernel/PVE updates available, or >50 non-security updates pending. 
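+    Note: dismissed findings in the 'updates' category are suppressed for 180 days
+    (UPDATES_SUPPRESSION in health_persistence), versus 24 hours for other categories.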
""" cache_key = 'updates_check' current_time = time.time() @@ -1730,12 +1969,12 @@ class HealthMonitor: reason=reason, details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5]} ) - elif last_update_days and last_update_days >= 730: - # 2+ years without updates - CRITICAL + elif last_update_days and last_update_days >= 548: + # 18+ months without updates - CRITICAL status = 'CRITICAL' - reason = f'System not updated in {last_update_days} days (>2 years)' + reason = f'System not updated in {last_update_days} days (>18 months)' health_persistence.record_error( - error_key='updates_730days', + error_key='updates_548days', category='updates', severity='CRITICAL', reason=reason, @@ -1766,14 +2005,40 @@ class HealthMonitor: status = 'WARNING' reason = 'Failed to check for updates (apt-get error)' + # Build checks dict for updates sub-items + update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK') + sec_status = 'WARNING' if security_updates_packages else 'OK' + kernel_status = 'INFO' if kernel_pve_updates_packages else 'OK' + + checks = { + 'security_updates': { + 'status': sec_status, + 'detail': f'{len(security_updates_packages)} security update(s) pending' if security_updates_packages else 'No security updates pending', + }, + 'system_age': { + 'status': update_age_status, + 'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown', + 'thresholds': 'Warning >365 days, Critical >548 days' + }, + 'pending_updates': { + 'status': 'INFO' if update_count > 50 else 'OK', + 'detail': f'{update_count} package(s) pending', + }, + 'kernel_pve': { + 'status': kernel_status, + 'detail': f'{len(kernel_pve_updates_packages)} kernel/PVE update(s)' if kernel_pve_updates_packages else 'Kernel/PVE up to date', + } + } + # Construct result dictionary update_result = { 'status': status, - 'count': update_count + 'count': update_count, + 'checks': checks } if reason: update_result['reason'] = reason - if last_update_days is not None: # Only add if we could determine days_since_update + if last_update_days is not None: update_result['days_since_update'] = last_update_days self.cached_results[cache_key] = update_result @@ -1782,39 +2047,188 @@ class HealthMonitor: except Exception as e: print(f"[HealthMonitor] Error checking updates: {e}") - # Return OK on exception to avoid false alerts - return {'status': 'OK', 'count': 0} + return {'status': 'OK', 'count': 0, 'checks': {}} + + def _check_fail2ban_bans(self) -> Dict[str, Any]: + """ + Check if fail2ban is installed and if there are currently banned IPs. + Cached for 60 seconds to avoid hammering fail2ban-client. 
+ + Returns: + {'installed': bool, 'active': bool, 'status': str, 'detail': str, + 'banned_count': int, 'jails': [...], 'banned_ips': [...]} + """ + cache_key = 'fail2ban_bans' + current_time = time.time() + + if cache_key in self.last_check_times: + if current_time - self.last_check_times[cache_key] < 60: + return self.cached_results.get(cache_key, {'installed': False, 'status': 'OK', 'detail': 'Not installed'}) + + result = {'installed': False, 'active': False, 'status': 'OK', 'detail': 'Not installed', 'banned_count': 0, 'jails': [], 'banned_ips': []} + + try: + # Check if fail2ban-client exists + which_result = subprocess.run( + ['which', 'fail2ban-client'], + capture_output=True, text=True, timeout=2 + ) + if which_result.returncode != 0: + self.cached_results[cache_key] = result + self.last_check_times[cache_key] = current_time + return result + + result['installed'] = True + + # Check if fail2ban service is active + active_check = subprocess.run( + ['systemctl', 'is-active', 'fail2ban'], + capture_output=True, text=True, timeout=2 + ) + if active_check.stdout.strip() != 'active': + result['detail'] = 'Fail2Ban installed but service not active' + self.cached_results[cache_key] = result + self.last_check_times[cache_key] = current_time + return result + + result['active'] = True + + # Get list of active jails + jails_result = subprocess.run( + ['fail2ban-client', 'status'], + capture_output=True, text=True, timeout=3 + ) + + jails = [] + if jails_result.returncode == 0: + for line in jails_result.stdout.split('\n'): + if 'Jail list:' in line: + jail_str = line.split('Jail list:')[1].strip() + jails = [j.strip() for j in jail_str.split(',') if j.strip()] + break + + if not jails: + result['detail'] = 'Fail2Ban active, no jails configured' + self.cached_results[cache_key] = result + self.last_check_times[cache_key] = current_time + return result + + result['jails'] = jails + + # Check each jail for banned IPs + total_banned = 0 + all_banned_ips = [] + jails_with_bans = [] + + for jail in jails: + try: + jail_result = subprocess.run( + ['fail2ban-client', 'status', jail], + capture_output=True, text=True, timeout=2 + ) + if jail_result.returncode == 0: + for line in jail_result.stdout.split('\n'): + if 'Currently banned:' in line: + try: + count = int(line.split('Currently banned:')[1].strip()) + if count > 0: + total_banned += count + jails_with_bans.append(jail) + except (ValueError, IndexError): + pass + elif 'Banned IP list:' in line: + ips_str = line.split('Banned IP list:')[1].strip() + if ips_str: + ips = [ip.strip() for ip in ips_str.split() if ip.strip()] + all_banned_ips.extend(ips[:10]) # Limit to 10 IPs per jail + except Exception: + pass + + result['banned_count'] = total_banned + result['banned_ips'] = all_banned_ips[:20] # Max 20 total + + if total_banned > 0: + jails_str = ', '.join(jails_with_bans) + msg = f'{total_banned} IP(s) currently banned by Fail2Ban (jails: {jails_str})' + result['status'] = 'WARNING' + result['detail'] = msg + + # Record in persistence (dismissable) + health_persistence.record_error( + error_key='security_fail2ban_ban', + category='security', + severity='WARNING', + reason=msg, + details={ + 'banned_count': total_banned, + 'jails': jails_with_bans, + 'banned_ips': all_banned_ips[:5], + 'dismissable': True + } + ) + else: + result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)' + # Auto-resolve if previously banned IPs are now gone + if health_persistence.is_error_active('security_fail2ban_ban'): + 
health_persistence.clear_error('security_fail2ban_ban') + + except Exception as e: + result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}' + + self.cached_results[cache_key] = result + self.last_check_times[cache_key] = current_time + return result def _check_security(self) -> Dict[str, Any]: """ - Check security-related items: - - Uptime > 1 year (indicates potential kernel vulnerability if not updated) - - SSL certificate expiration (non-INFO certs) - - Excessive failed login attempts + Check security-related items with detailed sub-item breakdown: + - Uptime check: >1 year without kernel update indicates vulnerability + - SSL certificates: PVE certificate expiration + - Login attempts: Excessive failed logins (brute force detection) + - Fail2Ban: Currently banned IPs (if fail2ban is installed) + + Returns a result with 'checks' dict containing per-item status. """ try: issues = [] + checks = { + 'uptime': {'status': 'OK', 'detail': ''}, + 'certificates': {'status': 'OK', 'detail': ''}, + 'login_attempts': {'status': 'OK', 'detail': ''}, + 'fail2ban': {'status': 'OK', 'detail': 'Not installed'} + } - # Check uptime for potential kernel vulnerabilities (if not updated) + # Sub-check 1: Uptime for potential kernel vulnerabilities try: uptime_seconds = time.time() - psutil.boot_time() uptime_days = uptime_seconds / 86400 - # If uptime is over a year and no recent updates, it's a warning if uptime_days > 365: - # Check if updates check shows recent activity updates_data = self.cached_results.get('updates_check') if updates_data and updates_data.get('days_since_update', 9999) > 365: - issues.append(f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)') + msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)' + issues.append(msg) + checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days)} + else: + checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'} + else: + checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days'} except Exception: - pass # Ignore if uptime calculation fails + checks['uptime'] = {'status': 'OK', 'detail': 'Unable to determine uptime'} - # Check SSL certificates (only report non-OK statuses) + # Sub-check 2: SSL certificates cert_status = self._check_certificates() - if cert_status and cert_status.get('status') not in ['OK', 'INFO']: - issues.append(cert_status.get('reason', 'Certificate issue')) + if cert_status: + cert_sev = cert_status.get('status', 'OK') + cert_reason = cert_status.get('reason', '') + checks['certificates'] = { + 'status': cert_sev, + 'detail': cert_reason if cert_reason else 'Certificate valid' + } + if cert_sev not in ['OK', 'INFO']: + issues.append(cert_reason or 'Certificate issue') - # Check for excessive failed login attempts in the last 24 hours + # Sub-check 3: Failed login attempts (brute force detection) try: result = subprocess.run( ['journalctl', '--since', '24 hours ago', '--no-pager'], @@ -1823,29 +2237,57 @@ class HealthMonitor: timeout=3 ) + failed_logins = 0 if result.returncode == 0: - failed_logins = 0 for line in result.stdout.split('\n'): - # Common patterns for failed logins in journald - if 'authentication failure' in line.lower() or 'failed password' in line.lower() or 'invalid user' in line.lower(): + line_lower = line.lower() + if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower: failed_logins += 1 - if failed_logins 
> 50: # Threshold for significant failed attempts - issues.append(f'{failed_logins} failed login attempts in 24h') + if failed_logins > 50: + msg = f'{failed_logins} failed login attempts in 24h' + issues.append(msg) + checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins} + elif failed_logins > 0: + checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins} + else: + checks['login_attempts'] = {'status': 'OK', 'detail': 'No failed login attempts in 24h', 'count': 0} except Exception: - pass # Ignore if journalctl fails + checks['login_attempts'] = {'status': 'OK', 'detail': 'Unable to check login attempts'} + # Sub-check 4: Fail2Ban ban detection + try: + f2b = self._check_fail2ban_bans() + checks['fail2ban'] = { + 'status': f2b.get('status', 'OK'), + 'detail': f2b.get('detail', ''), + 'installed': f2b.get('installed', False), + 'banned_count': f2b.get('banned_count', 0) + } + if f2b.get('status') == 'WARNING': + issues.append(f2b.get('detail', 'Fail2Ban bans detected')) + except Exception: + checks['fail2ban'] = {'status': 'OK', 'detail': 'Unable to check Fail2Ban'} + + # Determine overall security status if issues: + # Check if any sub-check is CRITICAL + has_critical = any(c.get('status') == 'CRITICAL' for c in checks.values()) + overall_status = 'CRITICAL' if has_critical else 'WARNING' return { - 'status': 'WARNING', # Security issues are typically warnings - 'reason': '; '.join(issues[:2]) # Show up to 2 issues + 'status': overall_status, + 'reason': '; '.join(issues[:2]), + 'checks': checks } - return {'status': 'OK'} + return { + 'status': 'OK', + 'checks': checks + } except Exception as e: print(f"[HealthMonitor] Error checking security: {e}") - return {'status': 'OK'} + return {'status': 'OK', 'checks': {}} def _check_certificates(self) -> Optional[Dict[str, Any]]: """ @@ -2138,141 +2580,7 @@ class HealthMonitor: 'timestamp': datetime.now().isoformat() } - # This is a duplicate of the get_detailed_status method at the top of the file. - # It's likely an oversight from copy-pasting. One of them should be removed or renamed. - # Keeping both for now to match the provided structure, but in a refactor, this would be cleaned up. - def get_detailed_status(self) -> Dict[str, Any]: - """ - Get comprehensive health status with all checks. - Returns JSON structure with ALL 10 categories always present. - Now includes persistent error tracking. 
- """ - active_errors = health_persistence.get_active_errors() - # No need to create persistent_issues dict here, it's implicitly handled by the checks - - details = { - 'cpu': {'status': 'OK'}, - 'memory': {'status': 'OK'}, - 'storage': {'status': 'OK'}, # This will be overwritten by specific storage checks - 'disks': {'status': 'OK'}, # This will be overwritten by disk/filesystem checks - 'network': {'status': 'OK'}, - 'vms': {'status': 'OK'}, - 'services': {'status': 'OK'}, - 'logs': {'status': 'OK'}, - 'updates': {'status': 'OK'}, - 'security': {'status': 'OK'} - } - - critical_issues = [] - warning_issues = [] - info_issues = [] # Added info_issues to track INFO separately - - # --- Priority Order of Checks --- - - # Priority 1: Critical PVE Services - services_status = self._check_pve_services() - details['services'] = services_status - if services_status['status'] == 'CRITICAL': - critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}") - elif services_status['status'] == 'WARNING': - warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}") - - # Priority 1.5: Proxmox Storage Check (External Module) - proxmox_storage_result = self._check_proxmox_storage() - if proxmox_storage_result: # Only process if the check ran (module available) - details['storage'] = proxmox_storage_result - if proxmox_storage_result.get('status') == 'CRITICAL': - critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable')) - elif proxmox_storage_result.get('status') == 'WARNING': - warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue')) - - # Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors) - storage_status = self._check_storage_optimized() - details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues - if storage_status.get('status') == 'CRITICAL': - critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}") - elif storage_status.get('status') == 'WARNING': - warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}") - - # Priority 3: VMs/CTs Status (with persistence) - vms_status = self._check_vms_cts_with_persistence() - details['vms'] = vms_status - if vms_status.get('status') == 'CRITICAL': - critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}") - elif vms_status.get('status') == 'WARNING': - warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}") - - # Priority 4: Network Connectivity - network_status = self._check_network_optimized() - details['network'] = network_status - if network_status.get('status') == 'CRITICAL': - critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}") - elif network_status.get('status') == 'WARNING': - warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}") - - # Priority 5: CPU Usage (with hysteresis) - cpu_status = self._check_cpu_with_hysteresis() - details['cpu'] = cpu_status - if cpu_status.get('status') == 'CRITICAL': - critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}") - elif cpu_status.get('status') == 'WARNING': - warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}") - - # Priority 6: Memory Usage (RAM and Swap) - memory_status = self._check_memory_comprehensive() - details['memory'] = memory_status - if memory_status.get('status') == 'CRITICAL': - 
critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}") - elif memory_status.get('status') == 'WARNING': - warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}") - - # Priority 7: Log Analysis (with persistence) - logs_status = self._check_logs_with_persistence() - details['logs'] = logs_status - if logs_status.get('status') == 'CRITICAL': - critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}") - elif logs_status.get('status') == 'WARNING': - warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}") - - # Priority 8: System Updates - updates_status = self._check_updates() - details['updates'] = updates_status - if updates_status.get('status') == 'CRITICAL': - critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}") - elif updates_status.get('status') == 'WARNING': - warning_issues.append(f"Updates: {updates_status.get('reason', 'Updates pending')}") - elif updates_status.get('status') == 'INFO': - info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}") - - # Priority 9: Security Checks - security_status = self._check_security() - details['security'] = security_status - if security_status.get('status') == 'WARNING': - warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}") - elif security_status.get('status') == 'INFO': - info_issues.append(f"Security: {security_status.get('reason', 'Security information')}") - - # --- Determine Overall Status --- - # Use a fixed order of severity: CRITICAL > WARNING > INFO > OK - if critical_issues: - overall = 'CRITICAL' - summary = '; '.join(critical_issues[:3]) # Limit summary to 3 issues - elif warning_issues: - overall = 'WARNING' - summary = '; '.join(warning_issues[:3]) - elif info_issues: - overall = 'OK' # INFO statuses don't degrade overall health - summary = '; '.join(info_issues[:3]) - else: - overall = 'OK' - summary = 'All systems operational' - - return { - 'overall': overall, - 'summary': summary, - 'details': details, - 'timestamp': datetime.now().isoformat() - } + # Duplicate get_detailed_status was removed during refactor (v1.1) # Global instance diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 639d1081..973bbe9c 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -1,7 +1,8 @@ """ Health Monitor Persistence Module Manages persistent error tracking across AppImage updates using SQLite. 
-Stores errors in /root/.config/proxmenux-monitor/health_monitor.db +Stores errors in /usr/local/share/proxmenux/health_monitor.db +(same directory as monitor.db for temperature history) Features: - Persistent error storage (survives AppImage updates) @@ -10,7 +11,7 @@ Features: - Manual acknowledgment support Author: MacRimi -Version: 1.0 +Version: 1.1 """ import sqlite3 @@ -30,8 +31,8 @@ class HealthPersistence: UPDATES_SUPPRESSION = 180 * 24 * 3600 # 180 days (6 months) def __init__(self): - """Initialize persistence with database in config directory""" - self.data_dir = Path('/root/.config/proxmenux-monitor') + """Initialize persistence with database in shared ProxMenux data directory""" + self.data_dir = Path('/usr/local/share/proxmenux') self.data_dir.mkdir(parents=True, exist_ok=True) self.db_path = self.data_dir / 'health_monitor.db' @@ -186,10 +187,36 @@ class HealthPersistence: conn.commit() conn.close() - def acknowledge_error(self, error_key: str): + def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool: """ - Manually acknowledge an error (won't notify again or re-appear for 24h). - Also marks as resolved so it disappears from active errors. + Check if an error is currently active (unresolved and not acknowledged). + Used by checks to avoid re-recording errors that are already tracked. + """ + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + if category: + cursor.execute(''' + SELECT COUNT(*) FROM errors + WHERE error_key = ? AND category = ? + AND resolved_at IS NULL AND acknowledged = 0 + ''', (error_key, category)) + else: + cursor.execute(''' + SELECT COUNT(*) FROM errors + WHERE error_key = ? + AND resolved_at IS NULL AND acknowledged = 0 + ''', (error_key,)) + + count = cursor.fetchone()[0] + conn.close() + return count > 0 + + def clear_error(self, error_key: str): + """ + Remove/resolve a specific error immediately. + Used when the condition that caused the error no longer exists + (e.g., storage became available again). """ conn = sqlite3.connect(str(self.db_path)) cursor = conn.cursor() @@ -198,15 +225,67 @@ class HealthPersistence: cursor.execute(''' UPDATE errors - SET acknowledged = 1, resolved_at = ? - WHERE error_key = ? + SET resolved_at = ? + WHERE error_key = ? AND resolved_at IS NULL ''', (now, error_key)) - self._record_event(cursor, 'acknowledged', error_key, {}) + if cursor.rowcount > 0: + self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'}) conn.commit() conn.close() + def acknowledge_error(self, error_key: str) -> Dict[str, Any]: + """ + Manually acknowledge an error (dismiss). + - Marks as acknowledged so it won't re-appear during the suppression period + - Stores the original severity for reference + - Returns info about the acknowledged error + + Suppression periods: + - updates category: 180 days (6 months) + - other categories: 24 hours + """ + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + now = datetime.now().isoformat() + + # Get current error info before acknowledging + cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,)) + row = cursor.fetchone() + + result = {'success': False, 'error_key': error_key} + + if row: + error_dict = dict(row) + original_severity = error_dict.get('severity', 'WARNING') + category = error_dict.get('category', '') + + cursor.execute(''' + UPDATE errors + SET acknowledged = 1, resolved_at = ? + WHERE error_key = ? 
+ ''', (now, error_key)) + + self._record_event(cursor, 'acknowledged', error_key, { + 'original_severity': original_severity, + 'category': category + }) + + result = { + 'success': True, + 'error_key': error_key, + 'original_severity': original_severity, + 'category': category, + 'acknowledged_at': now + } + + conn.commit() + conn.close() + return result + def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]: """Get all active (unresolved) errors, optionally filtered by category""" conn = sqlite3.connect(str(self.db_path)) @@ -315,6 +394,138 @@ class HealthPersistence: except Exception: return False + def get_dismissed_errors(self) -> List[Dict[str, Any]]: + """ + Get errors that were acknowledged/dismissed but still within suppression period. + These are shown as INFO in the frontend with a 'Dismissed' badge. + """ + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + cursor.execute(''' + SELECT * FROM errors + WHERE acknowledged = 1 AND resolved_at IS NOT NULL + ORDER BY resolved_at DESC + ''') + + rows = cursor.fetchall() + conn.close() + + dismissed = [] + now = datetime.now() + + for row in rows: + error_dict = dict(row) + if error_dict.get('details'): + try: + error_dict['details'] = json.loads(error_dict['details']) + except (json.JSONDecodeError, TypeError): + pass + + # Check if still within suppression period + try: + resolved_dt = datetime.fromisoformat(error_dict['resolved_at']) + elapsed_seconds = (now - resolved_dt).total_seconds() + + if error_dict.get('category') == 'updates': + suppression = self.UPDATES_SUPPRESSION + else: + suppression = 24 * 3600 # 24 hours + + if elapsed_seconds < suppression: + error_dict['dismissed'] = True + error_dict['suppression_remaining_hours'] = round( + (suppression - elapsed_seconds) / 3600, 1 + ) + dismissed.append(error_dict) + except (ValueError, TypeError): + pass + + return dismissed + + def emit_event(self, event_type: str, category: str, severity: str, + data: Optional[Dict] = None) -> int: + """ + Emit a health event for the notification system. + Returns the event ID. + + Event types: + - 'state_change': severity changed (OK->WARNING, WARNING->CRITICAL, etc.) + - 'new_error': new error detected + - 'resolved': error resolved + - 'escalated': severity increased + """ + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + event_data = data or {} + event_data['category'] = category + event_data['severity'] = severity + event_data['needs_notification'] = True + + cursor.execute(''' + INSERT INTO events (event_type, error_key, timestamp, data) + VALUES (?, ?, ?, ?) + ''', (event_type, f'{category}_{severity}', datetime.now().isoformat(), + json.dumps(event_data))) + + event_id = cursor.lastrowid + conn.commit() + conn.close() + return event_id + + def get_pending_notifications(self) -> List[Dict[str, Any]]: + """ + Get events that need notification (for future Telegram/Gotify integration). + Groups by severity for batch notification sending. 
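+        The result is a flat list ordered by timestamp (newest first); callers that
+        want per-severity batches can group on data['severity'] themselves.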
+ """ + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + cursor.execute(''' + SELECT e.*, err.category as error_category, err.reason as error_reason + FROM events e + LEFT JOIN errors err ON e.error_key = err.error_key + WHERE json_extract(e.data, '$.needs_notification') = 1 + ORDER BY e.timestamp DESC + LIMIT 100 + ''') + + rows = cursor.fetchall() + conn.close() + + events = [] + for row in rows: + event_dict = dict(row) + if event_dict.get('data'): + try: + event_dict['data'] = json.loads(event_dict['data']) + except (json.JSONDecodeError, TypeError): + pass + events.append(event_dict) + + return events + + def mark_events_notified(self, event_ids: List[int]): + """Mark events as notified (notification was sent successfully)""" + if not event_ids: + return + + conn = sqlite3.connect(str(self.db_path)) + cursor = conn.cursor() + + for event_id in event_ids: + cursor.execute(''' + UPDATE events + SET data = json_set(COALESCE(data, '{}'), '$.needs_notification', 0, '$.notified_at', ?) + WHERE id = ? + ''', (datetime.now().isoformat(), event_id)) + + conn.commit() + conn.close() + def _record_event(self, cursor, event_type: str, error_key: str, data: Dict): """Internal: Record an event""" cursor.execute('''