From 1317c5bddc6a3674257015a2065b18cb7d2420c6 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Tue, 17 Feb 2026 20:01:45 +0100 Subject: [PATCH] Update health monitor --- AppImage/components/health-status-modal.tsx | 21 ++- AppImage/scripts/flask_server.py | 48 +++++ AppImage/scripts/health_monitor.py | 189 ++++++++++++++++---- AppImage/scripts/health_persistence.py | 127 ++++++++++--- 4 files changed, 318 insertions(+), 67 deletions(-) diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index d189d3e9..b6da6742 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -28,6 +28,7 @@ import { BellOff, ChevronRight, Settings2, + HelpCircle, } from "lucide-react" interface CategoryCheck { @@ -207,6 +208,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu return case "CRITICAL": return + case "UNKNOWN": + return default: return } @@ -223,6 +226,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu return Warning case "CRITICAL": return Critical + case "UNKNOWN": + return UNKNOWN default: return Unknown } @@ -230,13 +235,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const getHealthStats = () => { if (!healthData?.details) { - return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0 } + return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0, unknown: 0 } } let healthy = 0 let info = 0 let warnings = 0 let critical = 0 + let unknown = 0 CATEGORIES.forEach(({ key }) => { const categoryData = healthData.details[key as keyof typeof healthData.details] @@ -246,10 +252,11 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu else if (status === "INFO") info++ else if (status === "WARNING") warnings++ else if (status === "CRITICAL") critical++ + else if (status === "UNKNOWN") unknown++ } }) - return { total: CATEGORIES.length, healthy, info, warnings, critical } + return { total: CATEGORIES.length, healthy, info, warnings, critical, unknown } } const stats = getHealthStats() @@ -317,16 +324,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const s = status?.toUpperCase() if (s === "CRITICAL") return "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer" if (s === "WARNING") return "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer" + if (s === "UNKNOWN") return "bg-amber-500/5 border-amber-500/20 hover:bg-amber-500/10 cursor-pointer" if (s === "INFO") return "bg-blue-500/5 border-blue-500/20 hover:bg-blue-500/10" return "bg-card border-border hover:bg-muted/30" } - + const getOutlineBadgeStyle = (status: string) => { const s = status?.toUpperCase() if (s === "OK") return "border-green-500 text-green-500 bg-transparent" if (s === "INFO") return "border-blue-500 text-blue-500 bg-blue-500/5" if (s === "WARNING") return "border-yellow-500 text-yellow-500 bg-yellow-500/5" if (s === "CRITICAL") return "border-red-500 text-red-500 bg-red-500/5" + if (s === "UNKNOWN") return "border-amber-400 text-amber-400 bg-amber-500/5" return "" } @@ -502,6 +511,12 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{stats.critical}
Critical
+ {stats.unknown > 0 && ( +
+
{stats.unknown}
+
Unknown
+
+ )} {healthData.summary && healthData.summary !== "All systems operational" && ( diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 23103b1e..003d15f4 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -600,6 +600,47 @@ def _health_collector_loop(): time.sleep(300) # Every 5 minutes +def _vital_signs_sampler(): + """Dedicated thread for rapid CPU & temperature sampling. + + Runs independently of the 5-min health collector loop. + - CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis) + - Temperature: sampled every 10s (18 samples in 3 min for temporal logic) + Uses time.monotonic() to avoid drift. + """ + from health_monitor import health_monitor + + # Wait 15s after startup for sensors to be ready + time.sleep(15) + + TEMP_INTERVAL = 10 # seconds + CPU_INTERVAL = 30 # seconds + + next_temp = time.monotonic() + next_cpu = time.monotonic() + + print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)") + + while True: + try: + now = time.monotonic() + + if now >= next_temp: + health_monitor._sample_cpu_temperature() + next_temp = now + TEMP_INTERVAL + + if now >= next_cpu: + health_monitor._sample_cpu_usage() + next_cpu = now + CPU_INTERVAL + + # Sleep until the next earliest event (with 0.5s min to avoid busy-loop) + sleep_until = min(next_temp, next_cpu) - time.monotonic() + time.sleep(max(sleep_until, 0.5)) + except Exception as e: + print(f"[ProxMenux] Vital signs sampler error: {e}") + time.sleep(10) + + def get_uptime(): """Get system uptime in a human-readable format.""" try: @@ -7046,6 +7087,13 @@ if __name__ == '__main__': except Exception as e: print(f"[ProxMenux] Background health monitor failed to start: {e}") + # ── Vital Signs Sampler (rapid CPU + Temperature) ── + try: + vital_thread = threading.Thread(target=_vital_signs_sampler, daemon=True) + vital_thread.start() + except Exception as e: + print(f"[ProxMenux] Vital signs sampler failed to start: {e}") + # Check for SSL configuration ssl_ctx = None try: diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index c90be6c4..ef381192 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -60,7 +60,7 @@ class HealthMonitor: # Network Thresholds NETWORK_LATENCY_WARNING = 100 NETWORK_LATENCY_CRITICAL = 300 - NETWORK_TIMEOUT = 0.9 + NETWORK_TIMEOUT = 2 NETWORK_INACTIVE_DURATION = 600 # Log Thresholds @@ -142,6 +142,7 @@ class HealthMonitor: self.io_error_history = defaultdict(list) self.failed_vm_history = set() # Track VMs that failed to start self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0}) + self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) # SMART detection still uses filesystem check on init (lightweight) @@ -153,6 +154,63 @@ class HealthMonitor: except Exception as e: print(f"[HealthMonitor] Cleanup warning: {e}") + # ─── Lightweight sampling methods for the dedicated vital-signs thread ─── + # These ONLY append data to state_history without triggering evaluation, + # persistence, or subprocess-heavy operations. + + def _sample_cpu_usage(self): + """Lightweight CPU sample: read usage % and append to history. 
~30ms cost.""" + try: + cpu_percent = psutil.cpu_percent(interval=0) + current_time = time.time() + state_key = 'cpu_usage' + self.state_history[state_key].append({ + 'value': cpu_percent, + 'time': current_time + }) + # Prune entries older than 6 minutes + self.state_history[state_key] = [ + e for e in self.state_history[state_key] + if current_time - e['time'] < 360 + ] + except Exception: + pass # Sampling must never crash the thread + + def _sample_cpu_temperature(self): + """Lightweight temperature sample: read sensor and append to history. ~50ms cost.""" + try: + result = subprocess.run( + ['sensors', '-A', '-u'], + capture_output=True, text=True, timeout=2 + ) + if result.returncode != 0: + return + + temps = [] + for line in result.stdout.split('\n'): + if 'temp' in line.lower() and '_input' in line: + try: + temp = float(line.split(':')[1].strip()) + temps.append(temp) + except Exception: + continue + + if temps: + max_temp = max(temps) + current_time = time.time() + state_key = 'cpu_temp_history' + self.state_history[state_key].append({ + 'value': max_temp, + 'time': current_time + }) + # Prune entries older than 4 minutes + self.state_history[state_key] = [ + e for e in self.state_history[state_key] + if current_time - e['time'] < 240 + ] + except Exception: + pass # Sampling must never crash the thread + def get_system_info(self) -> Dict[str, Any]: """ Get lightweight system info for header display. @@ -377,14 +435,34 @@ class HealthMonitor: elif security_status.get('status') == 'INFO': info_issues.append(f"Security: {security_status.get('reason', 'Security information')}") + # --- Track UNKNOWN counts and persist if >= 3 consecutive cycles --- + unknown_issues = [] + for cat_key, cat_data in details.items(): + cat_status = cat_data.get('status', 'OK') + if cat_status == 'UNKNOWN': + count = self._unknown_counts.get(cat_key, 0) + 1 + self._unknown_counts[cat_key] = min(count, 10) # Cap to avoid unbounded growth + unknown_issues.append(f"{cat_key}: {cat_data.get('reason', 'Check unavailable')}") + if count == 3: # Only persist on the exact 3rd cycle, not every cycle after + try: + health_persistence.record_unknown_persistent( + cat_key, cat_data.get('reason', 'Check unavailable')) + except Exception: + pass + else: + self._unknown_counts[cat_key] = 0 + # --- Determine Overall Status --- - # Use a fixed order of severity: CRITICAL > WARNING > INFO > OK + # Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK if critical_issues: overall = 'CRITICAL' - summary = '; '.join(critical_issues[:3]) # Limit summary to 3 issues + summary = '; '.join(critical_issues[:3]) elif warning_issues: overall = 'WARNING' summary = '; '.join(warning_issues[:3]) + elif unknown_issues: + overall = 'WARNING' # UNKNOWN caps at WARNING, never escalates to CRITICAL + summary = '; '.join(unknown_issues[:3]) elif info_issues: overall = 'OK' # INFO statuses don't degrade overall health summary = '; '.join(info_issues[:3]) @@ -444,13 +522,17 @@ class HealthMonitor: current_time = time.time() state_key = 'cpu_usage' + # Add this reading as well (supplements the sampler thread) self.state_history[state_key].append({ 'value': cpu_percent, 'time': current_time }) + # Snapshot the list for thread-safe reading (sampler may append concurrently) + cpu_snapshot = list(self.state_history[state_key]) + # Prune old entries via snapshot replacement (atomic assignment) self.state_history[state_key] = [ - entry for entry in self.state_history[state_key] + entry for entry in cpu_snapshot if current_time - 
entry['time'] < 360 ] @@ -517,8 +599,8 @@ class HealthMonitor: } else: checks['cpu_temperature'] = { - 'status': 'OK', - 'detail': 'Sensor not available', + 'status': 'INFO', + 'detail': 'No temperature sensor detected - install lm-sensors if hardware supports it', } result['checks'] = checks @@ -564,14 +646,16 @@ class HealthMonitor: max_temp = max(temps) state_key = 'cpu_temp_history' + # Add this reading (supplements the sampler thread) self.state_history[state_key].append({ 'value': max_temp, 'time': current_time }) - # Keep last 4 minutes of data (240 seconds) + # Snapshot for thread-safe reading, then atomic prune + temp_snapshot = list(self.state_history[state_key]) self.state_history[state_key] = [ - entry for entry in self.state_history[state_key] + entry for entry in temp_snapshot if current_time - entry['time'] < 240 ] @@ -1058,9 +1142,10 @@ class HealthMonitor: 'reason': f"{len(disk_issues)} disk(s) with recent errors", 'details': disk_issues } - - except Exception: - return {'status': 'OK'} + + except Exception as e: + print(f"[HealthMonitor] Disk/IO check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Disk check unavailable: {str(e)}', 'checks': {}} def _check_network_optimized(self) -> Dict[str, Any]: """ @@ -1186,9 +1271,10 @@ class HealthMonitor: 'details': interface_details, 'checks': checks } - - except Exception: - return {'status': 'OK'} + + except Exception as e: + print(f"[HealthMonitor] Network check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}} def _check_network_latency(self) -> Optional[Dict[str, Any]]: """Check network latency to 1.1.1.1 (cached)""" @@ -1237,15 +1323,31 @@ class HealthMonitor: except: pass - # If ping failed (timeout, unreachable) + # If ping failed (timeout, unreachable) - distinguish the reason + stderr_lower = (result.stderr or '').lower() if hasattr(result, 'stderr') else '' + if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower: + fail_reason = 'Network unreachable - no route to 1.1.1.1' + elif result.returncode == 1: + fail_reason = 'Packet loss to 1.1.1.1 (100% loss)' + else: + fail_reason = f'Ping failed (exit code {result.returncode})' + packet_loss_result = { 'status': 'CRITICAL', - 'reason': 'Packet loss or timeout to 1.1.1.1' + 'reason': fail_reason } self.cached_results[cache_key] = packet_loss_result self.last_check_times[cache_key] = current_time return packet_loss_result + except subprocess.TimeoutExpired: + timeout_result = { + 'status': 'WARNING', + 'reason': f'Ping timeout (>{self.NETWORK_TIMEOUT}s) - possible high latency' + } + self.cached_results[cache_key] = timeout_result + self.last_check_times[cache_key] = current_time + return timeout_result except Exception: return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} @@ -1356,9 +1458,10 @@ class HealthMonitor: 'reason': '; '.join(issues[:3]), 'details': vm_details } - - except Exception: - return {'status': 'OK'} + + except Exception as e: + print(f"[HealthMonitor] VMs/CTs check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}} # Modified to use persistence def _check_vms_cts_with_persistence(self) -> Dict[str, Any]: @@ -1462,7 +1565,12 @@ class HealthMonitor: # Generic failed to start for VMs and CTs if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']): - id_match = re.search(r'\b(\d{3,5})\b', line) # Increased digit count for wider match + # Try 
contextual VMID patterns first (more precise), then fallback to generic + id_match = ( + re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or + re.search(r'\b(\d{3,5})\.conf\b', line) or + re.search(r'\b(\d{3,5})\b', line) + ) if id_match: vmid_ctid = id_match.group(1) # Determine if it's a VM or CT based on context, if possible @@ -1521,8 +1629,9 @@ class HealthMonitor: 'checks': checks } - except Exception: - return {'status': 'OK', 'checks': {}} + except Exception as e: + print(f"[HealthMonitor] VMs/CTs persistence check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}} def _check_pve_services(self) -> Dict[str, Any]: """ @@ -1588,7 +1697,7 @@ class HealthMonitor: error_key = f'pve_service_{svc}' health_persistence.record_error( error_key=error_key, - category='services', + category='pve_services', severity='CRITICAL', reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}', details={'service': svc, 'state': service_details.get(svc, 'inactive')} @@ -1932,9 +2041,8 @@ class HealthMonitor: return ok_result except Exception as e: - # Log the exception but return OK to avoid alert storms on check failure - print(f"[HealthMonitor] Error checking logs: {e}") - return {'status': 'OK'} + print(f"[HealthMonitor] Log check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Log check unavailable: {str(e)}', 'checks': {}} def _normalize_log_pattern(self, line: str) -> str: """ @@ -1984,12 +2092,21 @@ class HealthMonitor: pass # Ignore if mtime fails # Perform a dry run of apt-get upgrade to see pending packages - result = subprocess.run( - ['apt-get', 'upgrade', '--dry-run'], - capture_output=True, - text=True, - timeout=5 # Increased timeout for safety - ) + try: + result = subprocess.run( + ['apt-get', 'upgrade', '--dry-run'], + capture_output=True, + text=True, + timeout=10 + ) + except subprocess.TimeoutExpired: + print("[HealthMonitor] apt-get upgrade --dry-run timed out") + return { + 'status': 'UNKNOWN', + 'reason': 'apt-get timed out - repository may be unreachable', + 'count': 0, + 'checks': {} + } status = 'OK' reason = None @@ -2112,8 +2229,8 @@ class HealthMonitor: return update_result except Exception as e: - print(f"[HealthMonitor] Error checking updates: {e}") - return {'status': 'OK', 'count': 0, 'checks': {}} + print(f"[HealthMonitor] Updates check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Updates check unavailable: {str(e)}', 'count': 0, 'checks': {}} def _check_fail2ban_bans(self) -> Dict[str, Any]: """ @@ -2356,8 +2473,8 @@ class HealthMonitor: } except Exception as e: - print(f"[HealthMonitor] Error checking security: {e}") - return {'status': 'OK', 'checks': {}} + print(f"[HealthMonitor] Security check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Security check unavailable: {str(e)}', 'checks': {}} def _check_certificates(self) -> Optional[Dict[str, Any]]: """ diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index ea6179b1..377f71da 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -17,6 +17,7 @@ Version: 1.1 import sqlite3 import json import os +import threading from datetime import datetime, timedelta from typing import Dict, List, Any, Optional from pathlib import Path @@ -52,11 +53,19 @@ class HealthPersistence: self.data_dir.mkdir(parents=True, exist_ok=True) self.db_path = self.data_dir / 'health_monitor.db' + self._db_lock = threading.Lock() 
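+ # In-process lock serializes the write paths (record/resolve/acknowledge/cleanup/settings); WAL and busy_timeout in _get_conn() handle contention between separate connections.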
self._init_database() + def _get_conn(self) -> sqlite3.Connection: + """Get a SQLite connection with timeout and WAL mode for safe concurrency.""" + conn = sqlite3.connect(str(self.db_path), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + return conn + def _init_database(self): """Initialize SQLite database with required tables""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() # Errors table @@ -126,7 +135,11 @@ class HealthPersistence: Record or update an error. Returns event info (new_error, updated, etc.) """ - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._record_error_impl(error_key, category, severity, reason, details) + + def _record_error_impl(self, error_key, category, severity, reason, details): + conn = self._get_conn() cursor = conn.cursor() now = datetime.now().isoformat() @@ -262,7 +275,11 @@ class HealthPersistence: def resolve_error(self, error_key: str, reason: str = 'auto-resolved'): """Mark an error as resolved""" - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._resolve_error_impl(error_key, reason) + + def _resolve_error_impl(self, error_key, reason): + conn = self._get_conn() cursor = conn.cursor() now = datetime.now().isoformat() @@ -284,7 +301,7 @@ class HealthPersistence: Check if an error is currently active (unresolved and not acknowledged). Used by checks to avoid re-recording errors that are already tracked. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() if category: @@ -314,7 +331,7 @@ class HealthPersistence: we delete the record entirely so it can re-trigger as a fresh event if the condition returns later. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() now = datetime.now().isoformat() @@ -353,7 +370,11 @@ class HealthPersistence: - Stores suppression_hours on the error record (snapshot at dismiss time) - Marks as acknowledged so it won't re-appear during the suppression period """ - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._acknowledge_error_impl(error_key) + + def _acknowledge_error_impl(self, error_key): + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -408,7 +429,7 @@ class HealthPersistence: def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]: """Get all active (unresolved) errors, optionally filtered by category""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -439,7 +460,11 @@ class HealthPersistence: def cleanup_old_errors(self): """Clean up old resolved errors and auto-resolve stale errors""" - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._cleanup_old_errors_impl() + + def _cleanup_old_errors_impl(self): + conn = self._get_conn() cursor = conn.cursor() now = datetime.now() @@ -519,7 +544,7 @@ class HealthPersistence: Get errors that were acknowledged/dismissed but still within suppression period. These are shown as INFO in the frontend with a 'Dismissed' badge. 
""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -584,7 +609,7 @@ class HealthPersistence: - 'resolved': error resolved - 'escalated': severity increased """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() event_data = data or {} @@ -608,7 +633,7 @@ class HealthPersistence: Get events that need notification (for future Telegram/Gotify integration). Groups by severity for batch notification sending. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -641,7 +666,7 @@ class HealthPersistence: if not event_ids: return - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() for event_id in event_ids: @@ -663,7 +688,7 @@ class HealthPersistence: def get_unnotified_errors(self) -> List[Dict[str, Any]]: """Get errors that need Telegram notification""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -689,7 +714,7 @@ class HealthPersistence: def mark_notified(self, error_key: str): """Mark error as notified""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute(''' @@ -708,7 +733,7 @@ class HealthPersistence: Get a cached system capability value. Returns None if not yet detected. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute( 'SELECT cap_value FROM system_capabilities WHERE cap_key = ?', @@ -720,7 +745,7 @@ class HealthPersistence: def set_capability(self, cap_key: str, cap_value: str): """Store a system capability value (detected once, cached forever).""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute(''' INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at) @@ -731,7 +756,7 @@ class HealthPersistence: def get_all_capabilities(self) -> Dict[str, str]: """Get all cached system capabilities as a dict.""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute('SELECT cap_key, cap_value FROM system_capabilities') rows = cursor.fetchall() @@ -747,7 +772,7 @@ class HealthPersistence: def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]: """Get a user setting value by key.""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute( 'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,) @@ -758,18 +783,19 @@ class HealthPersistence: def set_setting(self, key: str, value: str): """Store a user setting value.""" - conn = sqlite3.connect(str(self.db_path)) - cursor = conn.cursor() - cursor.execute(''' - INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) - VALUES (?, ?, ?) - ''', (key, value, datetime.now().isoformat())) - conn.commit() - conn.close() + with self._db_lock: + conn = self._get_conn() + cursor = conn.cursor() + cursor.execute(''' + INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) + VALUES (?, ?, ?) 
+ ''', (key, value, datetime.now().isoformat())) + conn.commit() + conn.close() def get_all_settings(self, prefix: Optional[str] = None) -> Dict[str, str]: """Get all user settings, optionally filtered by key prefix.""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() if prefix: cursor.execute( @@ -791,7 +817,7 @@ class HealthPersistence: For each dismissed error, looks up its category's configured hours and updates the suppression_hours column to match. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() # Build reverse map: category -> setting_key @@ -882,6 +908,51 @@ class HealthPersistence: """ all_cats = self.get_suppression_categories() return [c for c in all_cats if c['hours'] != self.DEFAULT_SUPPRESSION_HOURS] + + def record_unknown_persistent(self, category: str, reason: str): + """ + Record a persistent UNKNOWN event when a health check has been + unable to verify for >= 3 consecutive cycles (~15 min). + Avoids duplicates by only recording once per 30 min per category. + """ + with self._db_lock: + self._record_unknown_persistent_impl(category, reason) + + def _record_unknown_persistent_impl(self, category, reason): + try: + event_key = f'unknown_persistent_{category}' + now = datetime.now().isoformat() + + conn = self._get_conn() + cursor = conn.cursor() + + # Check if we already recorded this within the last 30 minutes + # Note: events table has columns (id, event_type, error_key, timestamp, data) + # We use error_key for deduplication since it contains the category + cursor.execute(''' + SELECT MAX(timestamp) FROM events + WHERE event_type = ? AND error_key = ? + ''', ('unknown_persistent', event_key)) + row = cursor.fetchone() + if row and row[0]: + try: + last_recorded = datetime.fromisoformat(row[0]) + if (datetime.now() - last_recorded).total_seconds() < 1800: + conn.close() + return # Already recorded recently + except (ValueError, TypeError): + pass # If timestamp is malformed, proceed with recording + + cursor.execute(''' + INSERT INTO events (event_type, error_key, timestamp, data) + VALUES (?, ?, ?, ?) + ''', ('unknown_persistent', event_key, now, + json.dumps({'category': category, 'reason': reason}))) + + conn.commit() + conn.close() + except Exception as e: + print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}") # Global instance
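
Note (not part of the patch): a minimal, self-contained sketch of the consecutive-UNKNOWN hysteresis that evaluate() now applies — count UNKNOWN cycles per category, persist only on the exact third consecutive cycle, reset on any other status, and cap the counter. The UnknownTracker class, PERSIST_AFTER, and the usage loop are illustrative names only, not code from this diff.

# Illustrative only: standalone model of the _unknown_counts logic in health_monitor.py
from collections import defaultdict

PERSIST_AFTER = 3   # consecutive UNKNOWN cycles (~15 min at one 5-min collector cycle each)
COUNT_CAP = 10      # cap to avoid unbounded growth, as in the patch

class UnknownTracker:
    def __init__(self):
        self._counts = defaultdict(int)

    def observe(self, category: str, status: str) -> bool:
        """Return True exactly once, on the cycle the UNKNOWN should be persisted."""
        if status != 'UNKNOWN':
            self._counts[category] = 0
            return False
        self._counts[category] = min(self._counts[category] + 1, COUNT_CAP)
        return self._counts[category] == PERSIST_AFTER

# Persisting only on the third consecutive UNKNOWN keeps transient check
# failures (one or two cycles) out of the events table.
tracker = UnknownTracker()
for cycle_status in ['UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'OK']:
    if tracker.observe('network', cycle_status):
        print('persist unknown_persistent_network')   # fires once, on cycle 3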
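
Note (not part of the patch): a small standalone harness showing the effect of the pragmas set by the new _get_conn() helper. The temp file path, table name, and writer() function are throwaway examples; the point is only that WAL plus a busy timeout lets two threads (e.g. the vital-signs sampler and a Flask request handler) write through separate connections by waiting on the write lock instead of failing with "database is locked".

# Illustrative only: two concurrent writers, each with its own WAL connection
import os
import sqlite3
import tempfile
import threading

db = os.path.join(tempfile.mkdtemp(), 'demo.db')

def get_conn():
    # Mirrors the pragmas used by HealthPersistence._get_conn()
    conn = sqlite3.connect(db, timeout=10)
    conn.execute('PRAGMA journal_mode=WAL')   # readers proceed while a writer commits
    conn.execute('PRAGMA busy_timeout=5000')  # contending writer waits up to 5s
    return conn

conn = get_conn()
conn.execute('CREATE TABLE IF NOT EXISTS samples (ts REAL, value REAL)')
conn.commit()
conn.close()

def writer(n):
    c = get_conn()  # sqlite3 connections are per-thread; each writer opens its own
    for i in range(n):
        c.execute('INSERT INTO samples VALUES (?, ?)', (i, i * 0.5))
        c.commit()
    c.close()

threads = [threading.Thread(target=writer, args=(200,)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

c = get_conn()
print(c.execute('SELECT COUNT(*) FROM samples').fetchone()[0])   # expect 400
c.close()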