From 0eebb7743846330c9cc0d0ba3b96c012a37f65dc Mon Sep 17 00:00:00 2001 From: MacRimi Date: Thu, 27 Nov 2025 11:58:20 +0100 Subject: [PATCH] Update AppImage --- AppImage/scripts/build_appimage.sh | 1 + AppImage/scripts/flask_server.py | 30 +- AppImage/scripts/health_monitor.py | 794 ++++++++++++++------ AppImage/scripts/proxmox_storage_monitor.py | 202 +++++ 4 files changed, 793 insertions(+), 234 deletions(-) create mode 100644 AppImage/scripts/proxmox_storage_monitor.py diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh index 2b0b7ed..a107b9d 100644 --- a/AppImage/scripts/build_appimage.sh +++ b/AppImage/scripts/build_appimage.sh @@ -87,6 +87,7 @@ cp "$SCRIPT_DIR/flask_health_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo cp "$SCRIPT_DIR/flask_proxmenux_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_proxmenux_routes.py not found" cp "$SCRIPT_DIR/flask_terminal_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_terminal_routes.py not found" cp "$SCRIPT_DIR/hardware_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ hardware_monitor.py not found" +cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_storage_monitor.py not found" echo "📋 Adding translation support..." cat > "$APP_DIR/usr/bin/translate_cli.py" << 'PYEOF' diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 0ca5842..c901f22 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -36,6 +36,7 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) if BASE_DIR not in sys.path: sys.path.insert(0, BASE_DIR) +from proxmox_storage_monitor import proxmox_storage_monitor from flask_terminal_routes import terminal_bp, init_terminal_routes # noqa: E402 from flask_health_routes import health_bp # noqa: E402 from flask_auth_routes import auth_bp # noqa: E402 @@ -1758,18 +1759,7 @@ def get_proxmox_storage(): pass continue - # If total is 0, it means there is a connection error or the datastore is not available - if total == 0: - # print(f"[v0] Skipping storage {name} - invalid data (total=0, likely connection error)") - pass - continue - - # If the status is "inactive", we also skip it - if status.lower() != "available": - # print(f"[v0] Skipping storage {name} - status is not available: {status}") - pass - continue - + # Do not filter out unavailable storages - keep them so errors can be shown # Calculate percentage percent = (used / total * 100) if total > 0 else 0.0 @@ -1778,10 +1768,18 @@ def get_proxmox_storage(): used_gb = round(used / (1024**3), 2) available_gb = round(available / (1024**3), 2) + # Determine storage status + if total == 0: + storage_status = 'error' + elif status.lower() != "available": + storage_status = 'error' + else: + storage_status = 'active' + storage_info = { 'name': name, 'type': storage_type, - 'status': 'active', # Normalize status for frontend compatibility + 'status': storage_status, # Use the determined status (active or error) 'total': total_gb, 'used': used_gb, 'available': available_gb, @@ -1792,6 +1790,12 @@ def get_proxmox_storage(): storage_list.append(storage_info) + # Get unavailable storages from monitor + storage_status_data = proxmox_storage_monitor.get_storage_status() + unavailable_storages = storage_status_data.get('unavailable', []) + + # Add unavailable storages to the list + storage_list.extend(unavailable_storages) return {'storage': storage_list} diff --git a/AppImage/scripts/health_monitor.py 
b/AppImage/scripts/health_monitor.py index 075ea66..02953e3 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -19,6 +19,12 @@ import re from health_persistence import health_persistence +try: + from proxmox_storage_monitor import proxmox_storage_monitor + PROXMOX_STORAGE_AVAILABLE = True +except ImportError: + PROXMOX_STORAGE_AVAILABLE = False + class HealthMonitor: """ Monitors system health across multiple components with minimal impact. @@ -244,23 +250,32 @@ class HealthMonitor: elif services_status['status'] == 'WARNING': warning_issues.append(services_status.get('reason', 'Service issue')) - # Priority 2: Storage + # Priority 1.5: Proxmox Storage Check (uses external monitor) + proxmox_storage_result = self._check_proxmox_storage() + if proxmox_storage_result: + details['storage'] = proxmox_storage_result + if proxmox_storage_result.get('status') == 'CRITICAL': + critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable')) + elif proxmox_storage_result.get('status') == 'WARNING': + warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue')) + + # Priority 2: Storage (filesystem usage, ZFS, SMART etc.) storage_status = self._check_storage_optimized() if storage_status: - details['storage'] = storage_status + details['disks'] = storage_status # Rename from 'storage' to 'disks' for clarity if storage_status.get('status') == 'CRITICAL': - critical_issues.append(storage_status.get('reason', 'Storage failure')) + critical_issues.append(storage_status.get('reason', 'Disk/Storage failure')) elif storage_status.get('status') == 'WARNING': - warning_issues.append(storage_status.get('reason', 'Storage issue')) + warning_issues.append(storage_status.get('reason', 'Disk/Storage issue')) - # Priority 3: Disks - disks_status = self._check_disks_optimized() - if disks_status: - details['disks'] = disks_status - if disks_status.get('status') == 'CRITICAL': - critical_issues.append(disks_status.get('reason', 'Disk failure')) - elif disks_status.get('status') == 'WARNING': - warning_issues.append(disks_status.get('reason', 'Disk issue')) + # Priority 3: Disks (redundant with storage_optimized, but keeping for now) + # disks_status = self._check_disks_optimized() # This is now covered by _check_storage_optimized + # if disks_status: + # details['disks'] = disks_status + # if disks_status.get('status') == 'CRITICAL': + # critical_issues.append(disks_status.get('reason', 'Disk failure')) + # elif disks_status.get('status') == 'WARNING': + # warning_issues.append(disks_status.get('reason', 'Disk issue')) # Priority 4: VMs/CTs - now with persistence vms_status = self._check_vms_cts_with_persistence() @@ -578,49 +593,7 @@ class HealthMonitor: issues = [] storage_details = {} - try: - result = subprocess.run( - ['pvesm', 'status'], - capture_output=True, - text=True, - timeout=5 - ) - - if result.returncode == 0: - lines = result.stdout.strip().split('\n')[1:] # Skip header - for line in lines: - parts = line.split() - if len(parts) >= 4: - storage_name = parts[0] - storage_type = parts[1] - enabled = parts[2] - active = parts[3] - - if enabled == '1' and active == '0': - issues.append(f'{storage_name}: Inactive') - storage_details[storage_name] = { - 'status': 'CRITICAL', - 'reason': 'Storage inactive', - 'type': storage_type - } - except Exception as e: - # If pvesm not available, skip silently - pass - - # Check ZFS pool health status - zfs_pool_issues = self._check_zfs_pool_health() - if zfs_pool_issues: - for 
pool_name, pool_info in zfs_pool_issues.items(): - issues.append(f'{pool_name}: {pool_info["reason"]}') - storage_details[pool_name] = pool_info - - # Check disk health from Proxmox task log or system logs - disk_health_issues = self._check_disk_health_from_events() - if disk_health_issues: - for disk, issue in disk_health_issues.items(): - issues.append(f'{disk}: {issue["reason"]}') - storage_details[disk] = issue - + # Check disk usage and mount status first for critical mounts critical_mounts = ['/'] for mount_point in critical_mounts: @@ -660,7 +633,30 @@ class HealthMonitor: issues.append(f"{mount_point}: {fs_status['reason']}") storage_details[mount_point] = fs_status except Exception: - pass + pass # Silently skip if mountpoint check fails + + # Check ZFS pool health status + zfs_pool_issues = self._check_zfs_pool_health() + if zfs_pool_issues: + for pool_name, pool_info in zfs_pool_issues.items(): + issues.append(f'{pool_name}: {pool_info["reason"]}') + storage_details[pool_name] = pool_info + + # Check disk health from Proxmox task log or system logs (SMART, etc.) + disk_health_issues = self._check_disk_health_from_events() + if disk_health_issues: + for disk, issue in disk_health_issues.items(): + # Only add if not already covered by critical mountpoint issues + if disk not in storage_details or storage_details[disk].get('status') == 'OK': + issues.append(f'{disk}: {issue["reason"]}') + storage_details[disk] = issue + + # Check LVM status + lvm_status = self._check_lvm() + if lvm_status.get('status') == 'WARNING': + # LVM volumes might be okay but indicate potential issues + issues.append(f"LVM check: {lvm_status.get('reason')}") + storage_details['lvm_check'] = lvm_status if not issues: return {'status': 'OK'} @@ -709,6 +705,16 @@ class HealthMonitor: def _check_lvm(self) -> Dict[str, Any]: """Check LVM volumes - improved detection""" try: + # Check if lvs command is available + result_which = subprocess.run( + ['which', 'lvs'], + capture_output=True, + text=True, + timeout=1 + ) + if result_which.returncode != 0: + return {'status': 'OK'} # LVM not installed + result = subprocess.run( ['lvs', '--noheadings', '--options', 'lv_name,vg_name,lv_attr'], capture_output=True, @@ -717,23 +723,40 @@ class HealthMonitor: ) if result.returncode != 0: - return {'status': 'OK'} + return {'status': 'WARNING', 'reason': 'lvs command failed'} volumes = [] - for line in result.stdout.strip().split('\n'): if line.strip(): parts = line.split() if len(parts) >= 2: lv_name = parts[0].strip() vg_name = parts[1].strip() - volumes.append(f'{vg_name}/{lv_name}') + # Check for 'a' attribute indicating active/available + if 'a' in parts[2]: + volumes.append(f'{vg_name}/{lv_name}') + + # If LVM is configured but no active volumes are found, it might be an issue or just not used + if not volumes: + # Check if any VGs exist to determine if LVM is truly unconfigured or just inactive + vg_result = subprocess.run( + ['vgs', '--noheadings', '--options', 'vg_name'], + capture_output=True, + text=True, + timeout=3 + ) + if vg_result.returncode == 0 and vg_result.stdout.strip(): + return {'status': 'WARNING', 'reason': 'No active LVM volumes detected'} + else: + return {'status': 'OK'} # No VGs found, LVM not in use return {'status': 'OK', 'volumes': len(volumes)} except Exception: return {'status': 'OK'} + # This function is no longer used in get_detailed_status, but kept for reference if needed. + # The new _check_proxmox_storage function handles this logic better. 
def _check_proxmox_storages(self) -> Dict[str, Any]: """Check Proxmox-specific storages (only report problems)""" storages = {} @@ -748,7 +771,9 @@ class HealthMonitor: line = line.strip() if line.startswith('dir:') or line.startswith('nfs:') or \ - line.startswith('cifs:') or line.startswith('pbs:'): + line.startswith('cifs:') or line.startswith('pbs:') or \ + line.startswith('rbd:') or line.startswith('cephfs:') or \ + line.startswith('zfs:') or line.startswith('zfs-send:'): parts = line.split(':', 1) storage_type = parts[0] current_storage = parts[1].strip() @@ -774,12 +799,16 @@ class HealthMonitor: def _check_disks_optimized(self) -> Dict[str, Any]: """ Optimized disk check - always returns status. + Checks dmesg for I/O errors and SMART status. + NOTE: This function is now largely covered by _check_storage_optimized, + but kept for potential specific disk-level reporting if needed. + Currently, its primary function is to detect recent I/O errors. """ current_time = time.time() disk_issues = {} try: - # Check dmesg for I/O errors + # Check dmesg for I/O errors in the last 5 minutes result = subprocess.run( ['dmesg', '-T', '--level=err,warn', '--since', '5 minutes ago'], capture_output=True, @@ -790,13 +819,14 @@ class HealthMonitor: if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() - if any(keyword in line_lower for keyword in ['i/o error', 'ata error', 'scsi error']): - for part in line.split(): - if part.startswith('sd') or part.startswith('nvme') or part.startswith('hd'): - disk_name = part.rstrip(':,') - self.io_error_history[disk_name].append(current_time) + if any(keyword in line_lower for keyword in ['i/o error', 'ata error', 'scsi error', 'medium error']): + # Try to extract disk name + disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line) + if disk_match: + disk_name = disk_match.group(1) + self.io_error_history[disk_name].append(current_time) - # Clean old history + # Clean old history (keep errors from last 5 minutes) for disk in list(self.io_error_history.keys()): self.io_error_history[disk] = [ t for t in self.io_error_history[disk] @@ -805,6 +835,7 @@ class HealthMonitor: error_count = len(self.io_error_history[disk]) + # Report based on recent error count if error_count >= 3: disk_issues[f'/dev/{disk}'] = { 'status': 'CRITICAL', @@ -823,16 +854,18 @@ class HealthMonitor: return { 'status': 'CRITICAL' if has_critical else 'WARNING', - 'reason': f"{len(disk_issues)} disk(s) with errors", + 'reason': f"{len(disk_issues)} disk(s) with recent errors", 'details': disk_issues } except Exception: + # If dmesg check fails, return OK as it's not a critical system failure return {'status': 'OK'} def _check_network_optimized(self) -> Dict[str, Any]: """ Optimized network check - always returns status. + Checks interface status and basic latency. 
""" try: issues = [] @@ -846,16 +879,17 @@ class HealthMonitor: # Check if important interface is down if not stats.isup: - if interface.startswith('vmbr') or interface.startswith('eth') or interface.startswith('ens'): + # Consider common PVE bridge interfaces and physical NICs as important + if interface.startswith('vmbr') or interface.startswith('eth') or interface.startswith('ens') or interface.startswith('enp'): issues.append(f'{interface} is DOWN') interface_details[interface] = { 'status': 'CRITICAL', 'reason': 'Interface DOWN' } - # Check connectivity + # Check connectivity (latency) latency_status = self._check_network_latency() - if latency_status and latency_status.get('status') not in ['OK', 'UNKNOWN']: + if latency_status and latency_status.get('status') not in ['OK', 'INFO', 'UNKNOWN']: issues.append(latency_status.get('reason', 'Network latency issue')) interface_details['connectivity'] = latency_status @@ -920,16 +954,17 @@ class HealthMonitor: except: pass + # If ping failed (timeout, unreachable) packet_loss_result = { 'status': 'CRITICAL', - 'reason': 'Packet loss or timeout' + 'reason': 'Packet loss or timeout to 1.1.1.1' } self.cached_results[cache_key] = packet_loss_result self.last_check_times[cache_key] = current_time return packet_loss_result except Exception: - return None + return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} def _check_vms_cts_optimized(self) -> Dict[str, Any]: """ @@ -1060,10 +1095,11 @@ class HealthMonitor: error_key = error['error_key'] if error_key.startswith('vm_') or error_key.startswith('ct_'): vm_id = error_key.split('_')[1] + # Check if VM is running using persistence helper if health_persistence.check_vm_running(vm_id): - continue # Error auto-resolved + continue # Error auto-resolved if VM is now running - # Still active + # Still active, add to details vm_details[error_key] = { 'status': error['severity'], 'reason': error['reason'], @@ -1074,6 +1110,7 @@ class HealthMonitor: issues.append(f"{error.get('details', {}).get('type', 'VM')} {error.get('details', {}).get('id', '')}: {error['reason']}") # Check for new errors in logs + # Using 'warning' priority to catch potential startup issues result = subprocess.run( ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'], capture_output=True, @@ -1108,7 +1145,7 @@ class HealthMonitor: } continue - # Container errors + # Container errors (including startup issues via vzstart) vzstart_match = re.search(r'vzstart:(\d+):', line) if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower): ctid = vzstart_match.group(1) @@ -1139,6 +1176,41 @@ class HealthMonitor: 'id': ctid, 'type': 'CT' } + + # Generic failed to start for VMs and CTs + if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']): + id_match = re.search(r'\b(\d{3,5})\b', line) # Increased digit count for wider match + if id_match: + vmid_ctid = id_match.group(1) + # Determine if it's a VM or CT based on context, if possible + if 'vm' in line_lower or 'qemu' in line_lower: + error_key = f'vm_{vmid_ctid}' + vm_type = 'VM' + elif 'ct' in line_lower or 'lxc' in line_lower: + error_key = f'ct_{vmid_ctid}' + vm_type = 'CT' + else: + # Fallback if type is unclear + error_key = f'vmct_{vmid_ctid}' + vm_type = 'VM/CT' + + if error_key not in vm_details: + reason = 'Failed to start' + # Record persistent error + health_persistence.record_error( + error_key=error_key, + category='vms', + severity='CRITICAL', + 
reason=reason, + details={'id': vmid_ctid, 'type': vm_type} + ) + issues.append(f'{vm_type} {vmid_ctid}: {reason}') + vm_details[error_key] = { + 'status': 'CRITICAL', + 'reason': reason, + 'id': vmid_ctid, + 'type': vm_type + } if not issues: return {'status': 'OK'} @@ -1171,6 +1243,7 @@ class HealthMonitor: if result.returncode != 0 or result.stdout.strip() != 'active': failed_services.append(service) except Exception: + # If systemctl fails (e.g., command not found or service doesn't exist), treat as failed failed_services.append(service) if failed_services: @@ -1183,9 +1256,10 @@ class HealthMonitor: return {'status': 'OK'} except Exception as e: + # If the entire systemctl check fails return { 'status': 'WARNING', - 'reason': f'Service check failed: {str(e)}' + 'reason': f'Service check command failed: {str(e)}' } def _is_benign_error(self, line: str) -> bool: @@ -1199,7 +1273,7 @@ class HealthMonitor: def _classify_log_severity(self, line: str) -> Optional[str]: """ Classify log line severity intelligently. - Returns: 'CRITICAL', 'WARNING', or None (benign) + Returns: 'CRITICAL', 'WARNING', or None (benign/info) """ line_lower = line.lower() @@ -1217,42 +1291,38 @@ class HealthMonitor: if re.search(keyword, line_lower): return 'WARNING' - # Generic error/warning classification - if 'critical' in line_lower or 'fatal' in line_lower: + # Generic error/warning classification based on common terms + if 'critical' in line_lower or 'fatal' in line_lower or 'panic' in line_lower: return 'CRITICAL' - elif 'error' in line_lower: + elif 'error' in line_lower or 'fail' in line_lower: return 'WARNING' elif 'warning' in line_lower or 'warn' in line_lower: - return None # Generic warnings are benign + return None # Generic warnings are often informational and not critical return None def _check_logs_with_persistence(self) -> Dict[str, Any]: """ - Intelligent log checking with cascade detection. - Only alerts when there's a real problem (error cascade), not normal background warnings. - - Logic: - - Looks at last 3 minutes (not 10) for immediate issues - - Detects cascades: ≥5 errors of same type in 3 min = problem - - Compares to previous period to detect spikes - - Whitelists known benign Proxmox warnings + Intelligent log checking with cascade detection and persistence. + Focuses on detecting significant error patterns rather than transient warnings. 
""" cache_key = 'logs_analysis' current_time = time.time() - # Cache for 5 minutes + # Cache the result for 5 minutes to avoid excessive journalctl calls if cache_key in self.last_check_times: if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL: + # Check persistent log errors recorded by health_persistence persistent_errors = health_persistence.get_active_errors('logs') if persistent_errors: return { - 'status': 'WARNING', - 'reason': f'{len(persistent_errors)} persistent log issues' + 'status': 'WARNING', # Or CRITICAL depending on severity of persistent errors + 'reason': f'{len(persistent_errors)} persistent log issues detected' } return self.cached_results.get(cache_key, {'status': 'OK'}) try: + # Fetch logs from the last 3 minutes for immediate issue detection result_recent = subprocess.run( ['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'], capture_output=True, @@ -1260,6 +1330,7 @@ class HealthMonitor: timeout=3 ) + # Fetch logs from the previous 3-minute interval to detect spikes/cascades result_previous = subprocess.run( ['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'], capture_output=True, @@ -1273,7 +1344,7 @@ class HealthMonitor: recent_patterns = defaultdict(int) previous_patterns = defaultdict(int) - critical_errors = {} + critical_errors_found = {} # To store unique critical error lines for persistence for line in recent_lines: if not line.strip(): @@ -1286,25 +1357,26 @@ class HealthMonitor: # Classify severity severity = self._classify_log_severity(line) - if severity is None: + if severity is None: # Skip informational or classified benign lines continue - # Normalize to pattern + # Normalize to a pattern for grouping pattern = self._normalize_log_pattern(line) if severity == 'CRITICAL': - if pattern not in critical_errors: - critical_errors[pattern] = line - - # Record persistent error - error_key = f'log_critical_{abs(hash(pattern)) % 10000}' - health_persistence.record_error( - error_key=error_key, - category='logs', - severity='CRITICAL', - reason=line[:100], - details={'pattern': pattern} - ) + # If this critical pattern is new or we haven't logged it recently + error_key = f'log_critical_{abs(hash(pattern)) % 10000}' + if pattern not in critical_errors_found: + critical_errors_found[pattern] = line + # Record persistent error if it's not already active and within recent persistence + if not health_persistence.is_error_active(error_key, category='logs'): + health_persistence.record_error( + error_key=error_key, + category='logs', + severity='CRITICAL', + reason=line[:100], # Truncate reason for brevity + details={'pattern': pattern} + ) recent_patterns[pattern] += 1 @@ -1319,25 +1391,28 @@ class HealthMonitor: pattern = self._normalize_log_pattern(line) previous_patterns[pattern] += 1 + # Detect cascades: ≥10 errors of same type in 3 min cascading_errors = { pattern: count for pattern, count in recent_patterns.items() if count >= 10 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL'] } + # Detect spikes: ≥3 errors now AND ≥3x increase from previous period spike_errors = {} for pattern, recent_count in recent_patterns.items(): prev_count = previous_patterns.get(pattern, 0) - # Spike if: ≥3 errors now AND ≥3x increase if recent_count >= 3 and recent_count >= prev_count * 3: spike_errors[pattern] = recent_count - unique_critical = len(critical_errors) + unique_critical_count = len(critical_errors_found) cascade_count = len(cascading_errors) spike_count 
= len(spike_errors) - if unique_critical > 0: + if unique_critical_count > 0: status = 'CRITICAL' - reason = f'{unique_critical} critical error(s): cascade detected' + # Get a representative critical error reason + representative_error = next(iter(critical_errors_found.values())) + reason = f'Critical error detected: {representative_error[:100]}' elif cascade_count > 0: status = 'WARNING' reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥10 times in 3min' @@ -1345,7 +1420,7 @@ class HealthMonitor: status = 'WARNING' reason = f'Error spike detected: {spike_count} pattern(s) increased 3x' else: - # Normal background warnings, no alert + # No significant issues found status = 'OK' reason = None @@ -1357,12 +1432,15 @@ class HealthMonitor: self.last_check_times[cache_key] = current_time return log_result + # If journalctl command failed or returned no data ok_result = {'status': 'OK'} self.cached_results[cache_key] = ok_result self.last_check_times[cache_key] = current_time return ok_result - except Exception: + except Exception as e: + # Log the exception but return OK to avoid alert storms on check failure + print(f"[HealthMonitor] Error checking logs: {e}") return {'status': 'OK'} def _normalize_log_pattern(self, line: str) -> str: @@ -1370,25 +1448,32 @@ class HealthMonitor: Normalize log line to a pattern for grouping similar errors. Removes timestamps, PIDs, IDs, paths, and other variables. """ - pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', line) # Remove dates + # Remove standard syslog timestamp and process info if present + pattern = re.sub(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+(\s+\[\d+\])?:\s+', '', line) + + pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', pattern) # Remove dates pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern) # Remove times pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower()) # Normalize PIDs - pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs - pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize devices - pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern) # Normalize paths - pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern) # Normalize hex + pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern) # Normalize IDs (common for container/VM IDs) + pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern) # Normalize device paths + pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern) # Normalize general paths + pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern) # Normalize hex values + pattern = re.sub(r'\b(uuid|guid|hash)[:=]\s*[\w-]+\b', r'\1=XXX', pattern.lower()) # Normalize UUIDs/GUIDs pattern = re.sub(r'\s+', ' ', pattern).strip() # Normalize whitespace - return pattern[:150] # Keep first 150 chars + + return pattern[:150] # Keep first 150 characters to avoid overly long patterns def _check_updates(self) -> Optional[Dict[str, Any]]: """ - Check for pending system updates with intelligence. - Now only warns after 365 days without updates. - Critical security updates and kernel updates trigger INFO status immediately. + Check for pending system updates. + - WARNING: If security updates are available. + - CRITICAL: If system not updated in >2 years. + - INFO: If 1-2 years without updates, or many non-security updates. 
""" cache_key = 'updates_check' current_time = time.time() + # Cache for 10 minutes if cache_key in self.last_check_times: if current_time - self.last_check_times[cache_key] < 600: return self.cached_results.get(cache_key) @@ -1403,48 +1488,51 @@ class HealthMonitor: days_since_update = (current_time - mtime) / 86400 last_update_days = int(days_since_update) except Exception: - pass + pass # Ignore if mtime fails + # Perform a dry run of apt-get upgrade to see pending packages result = subprocess.run( ['apt-get', 'upgrade', '--dry-run'], capture_output=True, text=True, - timeout=5 + timeout=5 # Increased timeout for safety ) + status = 'OK' + reason = None + update_count = 0 + security_updates_packages = [] + kernel_pve_updates_packages = [] + if result.returncode == 0: lines = result.stdout.strip().split('\n') - # Count total updates - update_count = 0 - security_updates = [] - kernel_updates = [] - for line in lines: + # 'Inst' indicates a package will be installed/upgraded if line.startswith('Inst '): update_count += 1 line_lower = line.lower() + package_name = line.split()[1].split(':')[0] # Get package name, strip arch if present - # Check for security updates + # Check for security updates (common pattern in repo names) if 'security' in line_lower or 'debian-security' in line_lower: - package_name = line.split()[1] - security_updates.append(package_name) + security_updates_packages.append(package_name) # Check for kernel or critical PVE updates - if any(pkg in line_lower for pkg in ['linux-image', 'pve-kernel', 'pve-manager', 'proxmox-ve']): - package_name = line.split()[1] - kernel_updates.append(package_name) + if any(pkg in line_lower for pkg in ['linux-image', 'pve-kernel', 'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-api-core']): + kernel_pve_updates_packages.append(package_name) - if security_updates: + # Determine overall status based on findings + if security_updates_packages: status = 'WARNING' - reason = f'{len(security_updates)} security update(s) available' - # Record persistent error for security updates + reason = f'{len(security_updates_packages)} security update(s) available' + # Record persistent error for security updates to ensure it's visible health_persistence.record_error( error_key='updates_security', category='updates', severity='WARNING', reason=reason, - details={'count': len(security_updates), 'packages': security_updates[:5]} + details={'count': len(security_updates_packages), 'packages': security_updates_packages[:5]} ) elif last_update_days and last_update_days >= 730: # 2+ years without updates - CRITICAL @@ -1468,57 +1556,69 @@ class HealthMonitor: reason=reason, details={'days': last_update_days, 'update_count': update_count} ) - elif kernel_updates: + elif kernel_pve_updates_packages: + # Informational: Kernel or critical PVE components need update status = 'INFO' - reason = f'{len(kernel_updates)} kernel/PVE update(s) available' + reason = f'{len(kernel_pve_updates_packages)} kernel/PVE update(s) available' elif update_count > 50: + # Informational: Large number of pending updates status = 'INFO' reason = f'{update_count} updates pending (consider maintenance window)' - else: - status = 'OK' - reason = None - - update_result = { - 'status': status, - 'count': update_count - } - if reason: - update_result['reason'] = reason - if last_update_days: - update_result['days_since_update'] = last_update_days - - self.cached_results[cache_key] = update_result - self.last_check_times[cache_key] = current_time - return update_result - return {'status': 
'OK', 'count': 0} + # If apt-get upgrade --dry-run failed + elif result.returncode != 0: + status = 'WARNING' + reason = 'Failed to check for updates (apt-get error)' + + # Construct result dictionary + update_result = { + 'status': status, + 'count': update_count + } + if reason: + update_result['reason'] = reason + if last_update_days is not None: # Only add if we could determine days_since_update + update_result['days_since_update'] = last_update_days + + self.cached_results[cache_key] = update_result + self.last_check_times[cache_key] = current_time + return update_result except Exception as e: + print(f"[HealthMonitor] Error checking updates: {e}") + # Return OK on exception to avoid false alerts return {'status': 'OK', 'count': 0} def _check_security(self) -> Dict[str, Any]: """ Check security-related items: - - SSL certificate validity and expiration - - Failed login attempts - - Excessive uptime (>365 days = kernel vulnerabilities) + - Uptime > 1 year (indicates potential kernel vulnerability if not updated) + - SSL certificate expiration (non-INFO certs) + - Excessive failed login attempts """ try: issues = [] + # Check uptime for potential kernel vulnerabilities (if not updated) try: uptime_seconds = time.time() - psutil.boot_time() uptime_days = uptime_seconds / 86400 + # If uptime is over a year and no recent updates, it's a warning if uptime_days > 365: - issues.append(f'Uptime {int(uptime_days)} days (>1 year, kernel updates needed)') + # Check if updates check shows recent activity + updates_data = self.cached_results.get('updates_check') + if updates_data and updates_data.get('days_since_update', 9999) > 365: + issues.append(f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)') except Exception: - pass + pass # Ignore if uptime calculation fails + # Check SSL certificates (only report non-OK statuses) cert_status = self._check_certificates() if cert_status and cert_status.get('status') not in ['OK', 'INFO']: issues.append(cert_status.get('reason', 'Certificate issue')) + # Check for excessive failed login attempts in the last 24 hours try: result = subprocess.run( ['journalctl', '--since', '24 hours ago', '--no-pager'], @@ -1530,28 +1630,30 @@ class HealthMonitor: if result.returncode == 0: failed_logins = 0 for line in result.stdout.split('\n'): - if 'authentication failure' in line.lower() or 'failed password' in line.lower(): + # Common patterns for failed logins in journald + if 'authentication failure' in line.lower() or 'failed password' in line.lower() or 'invalid user' in line.lower(): failed_logins += 1 - if failed_logins > 50: + if failed_logins > 50: # Threshold for significant failed attempts issues.append(f'{failed_logins} failed login attempts in 24h') except Exception: - pass + pass # Ignore if journalctl fails if issues: return { - 'status': 'WARNING', - 'reason': '; '.join(issues[:2]) + 'status': 'WARNING', # Security issues are typically warnings + 'reason': '; '.join(issues[:2]) # Show up to 2 issues } return {'status': 'OK'} - except Exception: + except Exception as e: + print(f"[HealthMonitor] Error checking security: {e}") return {'status': 'OK'} def _check_certificates(self) -> Optional[Dict[str, Any]]: """ - Check SSL certificate expiration. + Check SSL certificate expiration for PVE's default certificate. 
INFO: Self-signed or no cert configured (normal for internal servers) WARNING: Expires <30 days CRITICAL: Expired @@ -1559,6 +1661,7 @@ class HealthMonitor: cache_key = 'certificates' current_time = time.time() + # Cache for 1 day (86400 seconds) if cache_key in self.last_check_times: if current_time - self.last_check_times[cache_key] < 86400: return self.cached_results.get(cache_key) @@ -1569,12 +1672,13 @@ class HealthMonitor: if not os.path.exists(cert_path): cert_result = { 'status': 'INFO', - 'reason': 'Self-signed or default certificate' + 'reason': 'Self-signed or default PVE certificate' } self.cached_results[cache_key] = cert_result self.last_check_times[cache_key] = current_time return cert_result + # Use openssl to get the expiry date result = subprocess.run( ['openssl', 'x509', '-enddate', '-noout', '-in', cert_path], capture_output=True, @@ -1586,43 +1690,62 @@ class HealthMonitor: date_str = result.stdout.strip().replace('notAfter=', '') try: - from datetime import datetime - exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y %Z') - days_until_expiry = (exp_date - datetime.now()).days - - if days_until_expiry < 0: - status = 'CRITICAL' - reason = 'Certificate expired' - elif days_until_expiry < 30: - status = 'WARNING' - reason = f'Certificate expires in {days_until_expiry} days' - else: - status = 'OK' - reason = None - - cert_result = {'status': status} - if reason: - cert_result['reason'] = reason - - self.cached_results[cache_key] = cert_result - self.last_check_times[cache_key] = current_time - return cert_result - except Exception: - pass + # Parse the date string (format can vary, e.g., 'Jun 15 10:00:00 2024 GMT') + # Attempt common formats + exp_date = None + try: + # Try more detailed format first + exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y %Z') + except ValueError: + # Fallback to simpler format if needed + try: + exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y') + except ValueError: + # Fallback for "notAfter=..." string itself being the issue + if 'notAfter=' in date_str: # If it's the raw string itself + pass # Will result in 'INFO' status + + if exp_date: + days_until_expiry = (exp_date - datetime.now()).days + + if days_until_expiry < 0: + status = 'CRITICAL' + reason = 'Certificate expired' + elif days_until_expiry < 30: + status = 'WARNING' + reason = f'Certificate expires in {days_until_expiry} days' + else: + status = 'OK' + reason = None + + cert_result = {'status': status} + if reason: + cert_result['reason'] = reason + + self.cached_results[cache_key] = cert_result + self.last_check_times[cache_key] = current_time + return cert_result + except Exception as e: + print(f"[HealthMonitor] Error parsing certificate expiry date '{date_str}': {e}") + # Fall through to return INFO if parsing fails + # If openssl command failed or date parsing failed return {'status': 'INFO', 'reason': 'Certificate check inconclusive'} - except Exception: - return {'status': 'OK'} + except Exception as e: + print(f"[HealthMonitor] Error checking certificates: {e}") + return {'status': 'OK'} # Return OK on exception def _check_disk_health_from_events(self) -> Dict[str, Any]: """ - Check for disk health warnings from Proxmox task log and system logs. + Check for disk health warnings/errors from system logs (journalctl). + Looks for SMART warnings and specific disk errors. Returns dict of disk issues found. 
""" disk_issues = {} try: + # Check journalctl for warnings/errors related to disks in the last hour result = subprocess.run( ['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning'], capture_output=True, @@ -1634,54 +1757,58 @@ class HealthMonitor: for line in result.stdout.split('\n'): line_lower = line.lower() - # Check for SMART warnings + # Check for SMART warnings/errors if 'smart' in line_lower and ('warning' in line_lower or 'error' in line_lower or 'fail' in line_lower): - # Extract disk name - disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line) + # Extract disk name using regex for common disk identifiers + disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+|hd\d+)', line) if disk_match: disk_name = disk_match.group(1) - disk_issues[f'/dev/{disk_name}'] = { - 'status': 'WARNING', - 'reason': 'SMART warning detected' - } + # Prioritize CRITICAL if already warned, otherwise set to WARNING + if disk_name not in disk_issues or disk_issues[f'/dev/{disk_name}']['status'] != 'CRITICAL': + disk_issues[f'/dev/{disk_name}'] = { + 'status': 'WARNING', + 'reason': 'SMART warning detected' + } - # Check for disk errors - if any(keyword in line_lower for keyword in ['disk error', 'ata error', 'medium error']): - disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line) + # Check for specific disk I/O or medium errors + if any(keyword in line_lower for keyword in ['disk error', 'ata error', 'medium error', 'io error']): + disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+|hd\d+)', line) if disk_match: disk_name = disk_match.group(1) disk_issues[f'/dev/{disk_name}'] = { 'status': 'CRITICAL', 'reason': 'Disk error detected' } - except Exception: + except Exception as e: + print(f"[HealthMonitor] Error checking disk health from events: {e}") + # Return empty dict on error, as this check isn't system-critical itself pass return disk_issues def _check_zfs_pool_health(self) -> Dict[str, Any]: """ - Check ZFS pool health status using zpool status command. - Returns dict of pools with non-ONLINE status (DEGRADED, FAULTED, UNAVAIL, etc.) + Check ZFS pool health status using 'zpool status' command. + Returns dict of pools with non-ONLINE status (DEGRADED, FAULTED, UNAVAIL, etc.). """ zfs_issues = {} try: - # First check if zpool command exists - result = subprocess.run( + # First check if 'zpool' command exists to avoid errors on non-ZFS systems + result_which = subprocess.run( ['which', 'zpool'], capture_output=True, text=True, timeout=1 ) - if result.returncode != 0: - # ZFS not installed, return empty + if result_which.returncode != 0: + # ZFS is not installed or 'zpool' command not in PATH, so no ZFS issues to report. return zfs_issues - # Get list of all pools + # Get list of all pools and their health status result = subprocess.run( - ['zpool', 'list', '-H', '-o', 'name,health'], + ['zpool', 'list', '-H', '-o', 'name,health'], # -H for no header capture_output=True, text=True, timeout=5 @@ -1696,11 +1823,12 @@ class HealthMonitor: parts = line.split() if len(parts) >= 2: pool_name = parts[0] - pool_health = parts[1].upper() + pool_health = parts[1].upper() # Ensure uppercase for consistent comparison - # ONLINE is healthy, anything else is a problem + # 'ONLINE' is the healthy state. Any other status indicates a problem. 
if pool_health != 'ONLINE': if pool_health in ['DEGRADED', 'FAULTED', 'UNAVAIL', 'REMOVED']: + # These are critical states status = 'CRITICAL' reason = f'ZFS pool {pool_health.lower()}' else: @@ -1708,18 +1836,242 @@ class HealthMonitor: status = 'WARNING' reason = f'ZFS pool status: {pool_health.lower()}' + # Use a unique key for each pool issue zfs_issues[f'zpool_{pool_name}'] = { 'status': status, 'reason': reason, 'pool_name': pool_name, 'health': pool_health } - except Exception: - # If zpool command fails, silently ignore + except Exception as e: + print(f"[HealthMonitor] Error checking ZFS pool health: {e}") + # If 'zpool status' command itself fails, we can't report ZFS issues. + # Return empty dict as no specific ZFS issues were detected by this check. pass return zfs_issues + def _check_proxmox_storage(self) -> Optional[Dict[str, Any]]: + """ + Check Proxmox storage status using the proxmox_storage_monitor module. + Detects unavailable storages configured in PVE. + Returns CRITICAL if any configured storage is unavailable. + Returns None if the module is not available. + """ + if not PROXMOX_STORAGE_AVAILABLE: + return None + + try: + # Reload configuration to ensure we have the latest storage definitions + proxmox_storage_monitor.reload_configuration() + + # Get the current status of all configured storages + storage_status = proxmox_storage_monitor.get_storage_status() + unavailable_storages = storage_status.get('unavailable', []) + + if not unavailable_storages: + # All storages are available. We should also clear any previously recorded storage errors. + active_errors = health_persistence.get_active_errors() + for error in active_errors: + # Target errors related to storage unavailability + if error.get('category') == 'storage' and error.get('error_key', '').startswith('storage_unavailable_'): + health_persistence.clear_error(error['error_key']) + return {'status': 'OK'} + + # If there are unavailable storages, record them as persistent errors and report. + storage_issues_details = [] + for storage in unavailable_storages: + storage_name = storage['name'] + error_key = f'storage_unavailable_{storage_name}' + status_detail = storage.get('status_detail', 'unavailable') # e.g., 'not_found', 'connection_error' + + # Formulate a descriptive reason for the issue + if status_detail == 'not_found': + reason = f"Storage '{storage_name}' is configured but not found on the server." + elif status_detail == 'unavailable': + reason = f"Storage '{storage_name}' is not available (connection error or backend issue)." + else: + reason = f"Storage '{storage_name}' has status: {status_detail}." 
+ + # Record a persistent CRITICAL error for each unavailable storage + health_persistence.record_error( + error_key=error_key, + category='storage', # Category for persistence lookup + severity='CRITICAL', # Storage unavailability is always critical + reason=reason, + details={ + 'storage_name': storage_name, + 'storage_type': storage.get('type', 'unknown'), + 'status_detail': status_detail, + 'dismissable': False # Storage errors are not dismissable as they impact operations + } + ) + storage_issues_details.append(reason) # Collect reasons for the summary + + return { + 'status': 'CRITICAL', + 'reason': f'{len(unavailable_storages)} Proxmox storage(s) unavailable', + 'details': { + 'unavailable_storages': unavailable_storages, + 'issues': storage_issues_details + } + } + + except Exception as e: + print(f"[HealthMonitor] Error checking Proxmox storage: {e}") + # Return None on exception to indicate the check could not be performed, not necessarily a failure. + return None + + def get_health_status(self) -> Dict[str, Any]: + """ + Main function to get the comprehensive health status. + This function orchestrates all individual checks and aggregates results. + """ + # Trigger all checks, including those with caching + detailed_status = self.get_detailed_status() + overall_status = self.get_overall_status() + system_info = self.get_system_info() + + return { + 'system_info': system_info, + 'overall_health': overall_status, + 'detailed_health': detailed_status, + 'timestamp': datetime.now().isoformat() + } + + def get_detailed_status(self) -> Dict[str, Any]: + """ + Get comprehensive health status with all checks. + Returns JSON structure with ALL 10 categories always present. + Now includes persistent error tracking. + """ + active_errors = health_persistence.get_active_errors() + # No need to create persistent_issues dict here, it's implicitly handled by the checks + + details = { + 'cpu': {'status': 'OK'}, + 'memory': {'status': 'OK'}, + 'storage': {'status': 'OK'}, # This will be overwritten by specific storage checks + 'disks': {'status': 'OK'}, # This will be overwritten by disk/filesystem checks + 'network': {'status': 'OK'}, + 'vms': {'status': 'OK'}, + 'services': {'status': 'OK'}, + 'logs': {'status': 'OK'}, + 'updates': {'status': 'OK'}, + 'security': {'status': 'OK'} + } + + critical_issues = [] + warning_issues = [] + info_issues = [] # Added info_issues to track INFO separately + + # --- Priority Order of Checks --- + + # Priority 1: Critical PVE Services + services_status = self._check_pve_services() + details['services'] = services_status + if services_status['status'] == 'CRITICAL': + critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}") + elif services_status['status'] == 'WARNING': + warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}") + + # Priority 1.5: Proxmox Storage Check (External Module) + proxmox_storage_result = self._check_proxmox_storage() + if proxmox_storage_result: # Only process if the check ran (module available) + details['storage'] = proxmox_storage_result + if proxmox_storage_result.get('status') == 'CRITICAL': + critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable')) + elif proxmox_storage_result.get('status') == 'WARNING': + warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue')) + + # Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors) + storage_status = 
self._check_storage_optimized() + details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues + if storage_status.get('status') == 'CRITICAL': + critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}") + elif storage_status.get('status') == 'WARNING': + warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}") + + # Priority 3: VMs/CTs Status (with persistence) + vms_status = self._check_vms_cts_with_persistence() + details['vms'] = vms_status + if vms_status.get('status') == 'CRITICAL': + critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}") + elif vms_status.get('status') == 'WARNING': + warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}") + + # Priority 4: Network Connectivity + network_status = self._check_network_optimized() + details['network'] = network_status + if network_status.get('status') == 'CRITICAL': + critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}") + elif network_status.get('status') == 'WARNING': + warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}") + + # Priority 5: CPU Usage (with hysteresis) + cpu_status = self._check_cpu_with_hysteresis() + details['cpu'] = cpu_status + if cpu_status.get('status') == 'CRITICAL': + critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}") + elif cpu_status.get('status') == 'WARNING': + warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}") + + # Priority 6: Memory Usage (RAM and Swap) + memory_status = self._check_memory_comprehensive() + details['memory'] = memory_status + if memory_status.get('status') == 'CRITICAL': + critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}") + elif memory_status.get('status') == 'WARNING': + warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}") + + # Priority 7: Log Analysis (with persistence) + logs_status = self._check_logs_with_persistence() + details['logs'] = logs_status + if logs_status.get('status') == 'CRITICAL': + critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}") + elif logs_status.get('status') == 'WARNING': + warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}") + + # Priority 8: System Updates + updates_status = self._check_updates() + details['updates'] = updates_status + if updates_status.get('status') == 'CRITICAL': + critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}") + elif updates_status.get('status') == 'WARNING': + warning_issues.append(f"Updates: {updates_status.get('reason', 'Updates pending')}") + elif updates_status.get('status') == 'INFO': + info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}") + + # Priority 9: Security Checks + security_status = self._check_security() + details['security'] = security_status + if security_status.get('status') == 'WARNING': + warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}") + elif security_status.get('status') == 'INFO': + info_issues.append(f"Security: {security_status.get('reason', 'Security information')}") + + # --- Determine Overall Status --- + # Use a fixed order of severity: CRITICAL > WARNING > INFO > OK + if critical_issues: + overall = 'CRITICAL' + summary = '; '.join(critical_issues[:3]) # Limit summary to 3 issues + elif warning_issues: + overall = 'WARNING' + 
summary = '; '.join(warning_issues[:3]) + elif info_issues: + overall = 'OK' # INFO statuses don't degrade overall health + summary = '; '.join(info_issues[:3]) + else: + overall = 'OK' + summary = 'All systems operational' + + return { + 'overall': overall, + 'summary': summary, + 'details': details, + 'timestamp': datetime.now().isoformat() + } + # Global instance health_monitor = HealthMonitor() diff --git a/AppImage/scripts/proxmox_storage_monitor.py b/AppImage/scripts/proxmox_storage_monitor.py new file mode 100644 index 0000000..10cdcaf --- /dev/null +++ b/AppImage/scripts/proxmox_storage_monitor.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +ProxMenux - Proxmox Storage Monitor +Monitors configured Proxmox storages and tracks unavailable storages +""" + +import json +import subprocess +import socket +from typing import Dict, List, Any, Optional + + +class ProxmoxStorageMonitor: + """Monitor Proxmox storage configuration and status""" + + def __init__(self): + self.configured_storages: Dict[str, Dict[str, Any]] = {} + self._load_configured_storages() + + def _get_node_name(self) -> str: + """Get current Proxmox node name""" + try: + result = subprocess.run( + ['pvesh', 'get', '/nodes', '--output-format', 'json'], + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0: + nodes = json.loads(result.stdout) + hostname = socket.gethostname() + for node in nodes: + if node.get('node') == hostname: + return hostname + if nodes: + return nodes[0].get('node', hostname) + return socket.gethostname() + except Exception: + return socket.gethostname() + + def _load_configured_storages(self) -> None: + """Load configured storages from Proxmox configuration""" + try: + local_node = self._get_node_name() + + # Read storage configuration from pvesh + result = subprocess.run( + ['pvesh', 'get', '/storage', '--output-format', 'json'], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode != 0: + return + + storages = json.loads(result.stdout) + + for storage in storages: + storage_id = storage.get('storage') + if not storage_id: + continue + + # Check if storage is enabled for this node + nodes = storage.get('nodes') + if nodes and local_node not in nodes.split(','): + continue + + disabled = storage.get('disable', 0) + if disabled == 1: + continue + + self.configured_storages[storage_id] = { + 'name': storage_id, + 'type': storage.get('type', 'unknown'), + 'content': storage.get('content', ''), + 'path': storage.get('path', ''), + 'enabled': True + } + + except Exception: + pass + + def get_storage_status(self) -> Dict[str, List[Dict[str, Any]]]: + """ + Get storage status, including unavailable storages + + Returns: + { + 'available': [...], + 'unavailable': [...] 
+ } + """ + try: + local_node = self._get_node_name() + + # Get current storage status from pvesh + result = subprocess.run( + ['pvesh', 'get', '/cluster/resources', '--type', 'storage', '--output-format', 'json'], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode != 0: + return {'available': [], 'unavailable': list(self.configured_storages.values())} + + resources = json.loads(result.stdout) + + # Track which configured storages are available + available_storages = [] + unavailable_storages = [] + seen_storage_names = set() + + for resource in resources: + node = resource.get('node', '') + + # Filter only local node storages + if node != local_node: + continue + + name = resource.get('storage', 'unknown') + seen_storage_names.add(name) + storage_type = resource.get('plugintype', 'unknown') + status = resource.get('status', 'unknown') + + try: + total = int(resource.get('maxdisk', 0)) + used = int(resource.get('disk', 0)) + available = total - used if total > 0 else 0 + except (ValueError, TypeError): + total = 0 + used = 0 + available = 0 + + # Calculate percentage + percent = (used / total * 100) if total > 0 else 0.0 + + # Convert bytes to GB + total_gb = round(total / (1024**3), 2) + used_gb = round(used / (1024**3), 2) + available_gb = round(available / (1024**3), 2) + + storage_info = { + 'name': name, + 'type': storage_type, + 'total': total_gb, + 'used': used_gb, + 'available': available_gb, + 'percent': round(percent, 2), + 'node': node + } + + # Check if storage is available + if total == 0 or status.lower() != "available": + storage_info['status'] = 'error' + storage_info['status_detail'] = 'unavailable' if total == 0 else status + unavailable_storages.append(storage_info) + else: + storage_info['status'] = 'active' + available_storages.append(storage_info) + + # Check for configured storages that are completely missing + for storage_name, storage_config in self.configured_storages.items(): + if storage_name not in seen_storage_names: + unavailable_storages.append({ + 'name': storage_name, + 'type': storage_config['type'], + 'status': 'error', + 'status_detail': 'not_found', + 'total': 0, + 'used': 0, + 'available': 0, + 'percent': 0, + 'node': local_node + }) + + return { + 'available': available_storages, + 'unavailable': unavailable_storages + } + + except Exception: + return { + 'available': [], + 'unavailable': list(self.configured_storages.values()) + } + + def get_unavailable_count(self) -> int: + """Get count of unavailable storages""" + status = self.get_storage_status() + return len(status['unavailable']) + + def reload_configuration(self) -> None: + """Reload storage configuration from Proxmox""" + self.configured_storages.clear() + self._load_configured_storages() + + +# Global instance +proxmox_storage_monitor = ProxmoxStorageMonitor()
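The sketch below is a minimal standalone check of the new module, not part of this patch. It assumes it runs on the PVE node next to proxmox_storage_monitor.py (so the import resolves) and that pvesh is on PATH; the script name check_storage.py and its output layout are illustrative only, and it touches nothing beyond the public methods defined above (reload_configuration, get_storage_status, get_unavailable_count).

#!/usr/bin/env python3
# check_storage.py - illustrative smoke test for proxmox_storage_monitor (sketch,
# not part of this patch). Assumes the module is importable and pvesh is on PATH.
from proxmox_storage_monitor import proxmox_storage_monitor


def main() -> int:
    # Pick up any storage.cfg changes made after the module was first imported
    proxmox_storage_monitor.reload_configuration()

    status = proxmox_storage_monitor.get_storage_status()

    for storage in status['available']:
        print(f"OK     {storage['name']:<20} {storage['type']:<10} "
              f"{storage['used']}/{storage['total']} GB ({storage['percent']}%)")

    for storage in status['unavailable']:
        # 'status_detail' is 'not_found' for configured-but-missing storages,
        # otherwise the raw pvesh status (or 'unavailable' when maxdisk is 0)
        print(f"ERROR  {storage['name']:<20} {storage.get('type', 'unknown'):<10} "
              f"detail={storage.get('status_detail', 'unavailable')}")

    # Exit non-zero when any configured storage is unavailable; equivalent to
    # proxmox_storage_monitor.get_unavailable_count() > 0, and mirrors the
    # CRITICAL status that health_monitor._check_proxmox_storage() reports
    return 1 if len(status['unavailable']) > 0 else 0


if __name__ == '__main__':
    raise SystemExit(main())

In the patch itself the same data is consumed in two places: get_proxmox_storage() in flask_server.py appends the 'unavailable' entries to the storage list returned to the frontend, and _check_proxmox_storage() in health_monitor.py records them as persistent CRITICAL health errors.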