From adde2ce5b9ed15c171bef8bcdc33fc8c0d92f03a Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 6 Apr 2026 12:02:05 +0200 Subject: [PATCH] update health_persistence.py --- AppImage/components/system-overview.tsx | 28 +-- AppImage/scripts/flask_server.py | 3 +- AppImage/scripts/health_monitor.py | 57 ++++-- AppImage/scripts/health_persistence.py | 230 ++++++++++++++++-------- AppImage/scripts/notification_events.py | 31 +++- 5 files changed, 245 insertions(+), 104 deletions(-) diff --git a/AppImage/components/system-overview.tsx b/AppImage/components/system-overview.tsx index d8f6cb5e..b1907209 100644 --- a/AppImage/components/system-overview.tsx +++ b/AppImage/components/system-overview.tsx @@ -111,9 +111,9 @@ const fetchSystemData = async (retries = 3, delayMs = 500): Promise("/api/system") return data - } catch (error) { + } catch { if (attempt === retries - 1) { - console.error("[v0] Failed to fetch system data after retries:", error) + // Silent fail - API not available (expected in preview environment) return null } // Wait before retry @@ -127,8 +127,8 @@ const fetchVMData = async (): Promise => { try { const data = await fetchApi("/api/vms") return Array.isArray(data) ? 
data : data.vms || [] - } catch (error) { - console.error("[v0] Failed to fetch VM data:", error) + } catch { + // Silent fail - API not available return [] } } @@ -137,8 +137,7 @@ const fetchStorageData = async (): Promise => { try { const data = await fetchApi("/api/storage/summary") return data - } catch (error) { - console.log("[v0] Storage API not available (this is normal if not configured)") + } catch { return null } } @@ -146,13 +145,12 @@ const fetchStorageData = async (): Promise => { const fetchNetworkData = async (): Promise => { try { const data = await fetchApi("/api/network/summary") - return data - } catch (error) { - console.log("[v0] Network API not available (this is normal if not configured)") - return null + return data + } catch { + return null } } const fetchProxmoxStorageData = async (): Promise => { try { const data = await fetchApi("/api/proxmox-storage") diff --git a/AppImage/scripts/flask_server.py index 2279827e..4cd385d2 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -915,8 +915,9 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str: return "" # Capture recent journal entries matching keywords + # Use -b 0 to only include logs from the current boot cmd = ( - f"journalctl --since='10 minutes ago' --no-pager -n 500 2>/dev/null | " + f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | " f"grep -iE '{pattern}' | tail -n 30" ) diff --git a/AppImage/scripts/health_monitor.py index 231687ca..72b863bd 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -150,7 +150,7 @@ class HealthMonitor: r'zfs.*scrub (started|finished|in progress)', r'zpool.*resilver', - # ���─ 
LXC/Container normal operations ── + # ── LXC/Container normal operations ── r'lxc.*monitor', r'systemd\[1\]: (started|stopped) .*\.scope', @@ -184,13 +184,21 @@ class HealthMonitor: ] CRITICAL_LOG_KEYWORDS = [ - 'out of memory', 'oom_kill', 'kernel panic', - 'filesystem read-only', 'cannot mount', - 'raid.*failed', 'md.*device failed', - 'ext4-fs error', 'xfs.*corruption', - 'lvm activation failed', + # OOM and memory errors + 'out of memory', 'oom_kill', 'oom-kill', 'invoked oom-killer', + 'memory cgroup out of memory', 'cannot allocate memory', 'oom_reaper', + # Kernel panics and critical faults + 'kernel panic', 'general protection fault', 'trap invalid opcode', + # Filesystem critical errors + 'filesystem read-only', 'read-only file system', 'cannot mount', + 'ext4-fs error', 'ext4_abort', 'xfs.*corruption', 'btrfs.*error', + # RAID/Storage critical + 'raid.*failed', 'md.*device failed', 'lvm activation failed', + 'zpool.*faulted', 'state: faulted', + # Hardware errors 'hardware error', 'mce:', - 'general protection fault', + # Cluster critical + 'quorum lost', 'split brain', ] # Segfault is WARNING, not CRITICAL -- only PVE-critical process @@ -202,11 +210,20 @@ class HealthMonitor: } WARNING_LOG_KEYWORDS = [ - 'i/o error', 'ata error', 'scsi error', - 'task hung', 'blocked for more than', - 'failed to start', 'service.*failed', + # Storage I/O errors + 'i/o error', 'buffer i/o error', 'ata error', 'scsi error', 'disk.*offline', 'disk.*removed', - 'segfault', # WARNING by default; escalated to CRITICAL only for PVE processes + # CPU/IO blocking + 'task hung', 'blocked for more than', 'soft lockup', + # Service failures + 'failed to start', 'service.*failed', + 'entering failed state', 'code=exited, status=', 'code=killed', + # Process crashes (WARNING by default; escalated to CRITICAL for PVE processes) + 'segfault', + # Cluster/Network warnings + 'corosync.*failed', 'corosync.*timeout', + 'connection lost', 'totem.*failed', + 'entered disabled state', 
'entered blocking state', ] # PVE Critical Services @@ -769,12 +786,30 @@ class HealthMonitor: if len(critical_samples) >= 3: status = 'CRITICAL' reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s' + # Record the error + health_persistence.record_error( + error_key='cpu_usage', + category='cpu', + severity='CRITICAL', + reason=reason, + details={'cpu_percent': cpu_percent} + ) elif len(warning_samples) >= 3 and len(recovery_samples) < 2: status = 'WARNING' reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s' + # Record the warning + health_persistence.record_error( + error_key='cpu_usage', + category='cpu', + severity='WARNING', + reason=reason, + details={'cpu_percent': cpu_percent} + ) else: status = 'OK' reason = None + # CPU is normal - auto-resolve any existing CPU errors + health_persistence.resolve_error('cpu_usage', 'CPU usage returned to normal') temp_status = self._check_cpu_temperature() diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index c95db4bf..948ac701 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -967,10 +967,12 @@ class HealthPersistence: cutoff_events = (now - timedelta(days=30)).isoformat() cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,)) - # ── Auto-resolve transient errors after system stabilizes ── - # Transient errors (OOM, high CPU, service failures) resolve themselves. - # If the system has been up for >10 minutes and these errors haven't recurred, - # they are stale and should be auto-resolved. + # ══════════════════════════════════════════════════════════════════════ + # SMART AUTO-RESOLVE: Based on system state, not hardcoded patterns + # ══════════════════════════════════════════════════════════════════════ + # Logic: If an error hasn't been seen recently AND the system is healthy, + # the error is stale and should be auto-resolved. 
+ # This works for ANY error pattern, not just predefined ones. try: import psutil # Get system uptime @@ -979,9 +981,13 @@ class HealthPersistence: # Only auto-resolve if system has been stable for at least 10 minutes if uptime_seconds > 600: # 10 minutes - stale_cutoff = (now - timedelta(minutes=10)).isoformat() + current_cpu = psutil.cpu_percent(interval=0.1) + current_mem = psutil.virtual_memory().percent - # 1. Resolve transient log errors (OOM, service failures) + # ── 1. LOGS category: Auto-resolve if not seen in 15 minutes ── + # Log errors are transient - if journalctl hasn't reported them recently, + # they are from a previous state and should be resolved. + stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat() cursor.execute(''' UPDATE errors SET resolved_at = ? @@ -989,49 +995,69 @@ class HealthPersistence: AND resolved_at IS NULL AND acknowledged = 0 AND last_seen < ? - AND (error_key LIKE 'log_critical_%' - OR error_key LIKE 'log_persistent_%' - OR reason LIKE '%Out of memory%' - OR reason LIKE '%Recurring error%' - OR reason LIKE '%service%Failed%' - OR reason LIKE '%timeout%' - OR reason LIKE '%critical error%') - ''', (now_iso, stale_cutoff)) + ''', (now_iso, stale_logs_cutoff)) - # 2. Auto-resolve CPU errors if current CPU is normal (<75%) - try: - current_cpu = psutil.cpu_percent(interval=0.1) - if current_cpu < 75: - cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE category = 'temperature' - AND resolved_at IS NULL - AND acknowledged = 0 - AND last_seen < ? - AND (error_key = 'cpu_usage' - OR reason LIKE '%CPU >%sustained%' - OR reason LIKE '%Sustained high CPU%') - ''', (now_iso, stale_cutoff)) - except Exception: - pass + # ── 2. CPU category: Auto-resolve if CPU is normal (<75%) ── + if current_cpu < 75: + stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? 
+ WHERE (category = 'cpu' OR category = 'temperature') + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%') + ''', (now_iso, stale_cpu_cutoff)) - # 3. Auto-resolve memory errors if current memory is normal (<80%) - try: - current_mem = psutil.virtual_memory().percent - if current_mem < 80: - cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE category = 'memory' - AND resolved_at IS NULL - AND acknowledged = 0 - AND last_seen < ? - AND (reason LIKE '%Memory >%' - OR reason LIKE '%RAM usage%') - ''', (now_iso, stale_cutoff)) - except Exception: - pass + # ── 3. MEMORY category: Auto-resolve if memory is normal (<80%) ── + if current_mem < 80: + stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE (category = 'memory' OR category = 'logs') + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND (error_key LIKE '%oom%' + OR error_key LIKE '%memory%' + OR reason LIKE '%memory%' + OR reason LIKE '%OOM%' + OR reason LIKE '%killed%process%') + ''', (now_iso, stale_mem_cutoff)) + + # ── 4. VMS category: Auto-resolve if VM/CT is now running ── + # Check all active VM/CT errors and resolve if the VM/CT is now running + cursor.execute(''' + SELECT error_key, category FROM errors + WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%' OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%') + AND resolved_at IS NULL + AND acknowledged = 0 + ''') + vm_errors = cursor.fetchall() + for error_key, cat in vm_errors: + # Extract VM/CT ID from error_key + import re + vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key) + if vmid_match: + vmid = vmid_match.group(1) + # Check if running - this auto-resolves if so + self.check_vm_running(vmid) + + # ── 5. 
GENERIC: Any error not seen in 30 minutes while system is healthy ── + # If CPU < 80% and Memory < 85% and error hasn't been seen in 30 min, + # the system has recovered and the error is stale. + if current_cpu < 80 and current_mem < 85: + stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND category NOT IN ('disks', 'storage') + ''', (now_iso, stale_generic_cutoff)) except Exception: pass # If we can't read uptime, skip this cleanup @@ -1166,9 +1192,20 @@ class HealthPersistence: """Extract VM/CT ID from error message or key.""" if not text: return None - # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc. - match = re.search(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', text, re.IGNORECASE) - return match.group(1) if match else None + # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", "VM/CT 100", "qemu/100", "lxc/100", etc. 
+ patterns = [ + r'(?:VM|CT|VMID|CTID|vm_|ct_|vmct_)[\s_]?(\d{3,})', # VM 100, ct_100 + r'VM/CT[\s_]?(\d{3,})', # VM/CT 100 + r'(?:qemu|lxc)[/\\](\d{3,})', # qemu/100, lxc/100 + r'process.*kvm.*?(\d{3,})', # process kvm with vmid + r'Failed to start.*?(\d{3,})', # Failed to start VM/CT + r'starting.*?(\d{3,}).*failed', # starting 100 failed + ] + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) + return None def get_age_hours(timestamp_str): """Get age in hours from ISO timestamp string.""" @@ -1189,11 +1226,20 @@ class HealthPersistence: # === VM/CT ERRORS === # Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys) - if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))): - vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason) - if vmid and not check_vm_ct_cached(vmid): + # Also check if the reason mentions a VM/CT that no longer exists + vmid_from_key = extract_vmid_from_text(error_key) if error_key else None + vmid_from_reason = extract_vmid_from_text(reason) if reason else None + vmid = vmid_from_key or vmid_from_reason + + if vmid and not check_vm_ct_cached(vmid): + # VM/CT doesn't exist - resolve regardless of category + should_resolve = True + resolution_reason = f'VM/CT {vmid} deleted' + elif category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))): + # VM/CT category but ID couldn't be extracted - resolve if stale + if not vmid and last_seen_hours > 1: should_resolve = True - resolution_reason = 'VM/CT deleted' + resolution_reason = 'VM/CT error stale (>1h, ID not found)' # === DISK ERRORS === # Check if disk device or ZFS pool still exists @@ -1360,8 +1406,17 @@ class HealthPersistence: def check_vm_running(self, vm_id: str) -> bool: """ - Check if a VM/CT is running 
and resolve error if so. + Check if a VM/CT is running and resolve TRANSIENT errors if so. Also resolves error if VM/CT no longer exists. + + Only resolves errors that are likely to be fixed by a restart: + - QMP command failures + - Startup failures (generic) + + Does NOT resolve persistent configuration errors like: + - Device missing + - Permission issues + Returns True if running/resolved, False otherwise. """ import subprocess @@ -1369,6 +1424,8 @@ class HealthPersistence: try: vm_exists = False ct_exists = False + is_running = False + vm_type = None # Check qm status for VMs result_vm = subprocess.run( @@ -1380,32 +1437,59 @@ class HealthPersistence: if result_vm.returncode == 0: vm_exists = True + vm_type = 'vm' if 'running' in result_vm.stdout.lower(): - self.resolve_error(f'vm_{vm_id}', 'VM started') - self.resolve_error(f'vmct_{vm_id}', 'VM started') - return True + is_running = True # Check pct status for containers - result_ct = subprocess.run( - ['pct', 'status', vm_id], - capture_output=True, - text=True, - timeout=2 - ) + if not vm_exists: + result_ct = subprocess.run( + ['pct', 'status', vm_id], + capture_output=True, + text=True, + timeout=2 + ) + + if result_ct.returncode == 0: + ct_exists = True + vm_type = 'ct' + if 'running' in result_ct.stdout.lower(): + is_running = True - if result_ct.returncode == 0: - ct_exists = True - if 'running' in result_ct.stdout.lower(): - self.resolve_error(f'ct_{vm_id}', 'Container started') - self.resolve_error(f'vmct_{vm_id}', 'Container started') - return True - - # If neither VM nor CT exists, resolve all related errors + # If neither VM nor CT exists, resolve ALL related errors if not vm_exists and not ct_exists: self.resolve_error(f'vm_{vm_id}', 'VM/CT deleted') self.resolve_error(f'ct_{vm_id}', 'VM/CT deleted') self.resolve_error(f'vmct_{vm_id}', 'VM/CT deleted') - return True # Error resolved because resource doesn't exist + return True + + # If running, only resolve TRANSIENT errors (QMP, startup) + # Do 
NOT resolve persistent config errors (device missing, permissions) + if is_running: + conn = self._get_conn() + cursor = conn.cursor() + + # Get the error details to check if it's a persistent config error + for prefix in (f'{vm_type}_{vm_id}', f'vmct_{vm_id}'): + cursor.execute(''' + SELECT error_key, reason FROM errors + WHERE error_key = ? AND resolved_at IS NULL + ''', (prefix,)) + row = cursor.fetchone() + if row: + reason = (row[1] or '').lower() + # Check if this is a persistent config error that won't be fixed by restart + is_persistent_config = any(indicator in reason for indicator in [ + 'device', 'missing', 'does not exist', 'permission', + 'not found', 'no such', 'invalid' + ]) + + if not is_persistent_config: + # Transient error - resolve it + self.resolve_error(prefix, f'{vm_type.upper()} started successfully') + + conn.close() + return True return False diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 5393e23c..cc00e136 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -174,8 +174,9 @@ def capture_journal_context(keywords: list, lines: int = 30, return "" # Use journalctl with grep to filter relevant lines + # Use -b 0 to only include logs from the current boot (not previous boots) cmd = ( - f"journalctl --since='{since}' --no-pager -n 500 2>/dev/null | " + f"journalctl -b 0 --since='{since}' --no-pager -n 500 2>/dev/null | " f"grep -iE '{pattern}' | tail -n {lines}" ) @@ -1800,6 +1801,8 @@ class PollingCollector: # Key = health_persistence category name # Value = minimum seconds between notifications for the same error_key _CATEGORY_COOLDOWNS = { + # Category cooldown: minimum time between DIFFERENT errors of the same category + # This prevents notification storms when multiple issues arise together 'disks': 86400, # 24h - I/O errors are persistent hardware issues 'smart': 86400, # 24h - SMART errors same as I/O 'zfs': 86400, # 24h - ZFS pool 
issues are persistent @@ -1809,6 +1812,7 @@ class PollingCollector: 'temperature': 3600, # 1h - temp can fluctuate near thresholds 'logs': 3600, # 1h - repeated log patterns 'vms': 1800, # 30m - VM state oscillation + 'vmct': 1800, # 30m - VM/CT state oscillation 'security': 3600, # 1h - auth failures tend to be bursty 'cpu': 1800, # 30m - CPU spikes can be transient 'memory': 1800, # 30m - memory pressure oscillation @@ -1816,6 +1820,10 @@ class PollingCollector: 'updates': 86400, # 24h - update info doesn't change fast } + # Global cooldown: minimum time before the SAME error can be re-notified + # This is independent of category - same error_key cannot repeat before this time + SAME_ERROR_COOLDOWN = 86400 # 24 hours + _ENTITY_MAP = { 'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''), 'load': ('node', ''), @@ -2032,15 +2040,20 @@ class PollingCollector: # Determine if we should notify is_new = error_key not in self._known_errors last_sent = self._last_notified.get(error_key, 0) - cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL) - is_due = (now - last_sent) >= cat_cooldown + time_since_last = now - last_sent + + # ── SAME ERROR COOLDOWN (24h) ── + # The SAME error_key cannot be re-notified before 24 hours. + # This is the PRIMARY deduplication mechanism. + if time_since_last < self.SAME_ERROR_COOLDOWN: + continue + + # ── CATEGORY COOLDOWN (varies) ── + # DIFFERENT errors within the same category respect category cooldown. + # This prevents notification storms when multiple issues arise together. + cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL) + is_due = time_since_last >= cat_cooldown - # Anti-oscillation: even if "new" (resolved then reappeared), - # respect the per-category cooldown interval. This prevents - # "semi-cascades" where the same root cause generates multiple - # slightly different notifications across health check cycles. 
- # Each category has its own appropriate cooldown (30m for network, - # 24h for disks, 1h for temperature, etc.). if not is_due: continue