From 1317c5bddc6a3674257015a2065b18cb7d2420c6 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Tue, 17 Feb 2026 20:01:45 +0100 Subject: [PATCH] Update health monitor --- AppImage/components/health-status-modal.tsx | 21 ++- AppImage/scripts/flask_server.py | 48 +++++ AppImage/scripts/health_monitor.py | 189 ++++++++++++++++---- AppImage/scripts/health_persistence.py | 127 ++++++++++--- 4 files changed, 318 insertions(+), 67 deletions(-) diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index d189d3e9..b6da6742 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -28,6 +28,7 @@ import { BellOff, ChevronRight, Settings2, + HelpCircle, } from "lucide-react" interface CategoryCheck { @@ -207,6 +208,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu return case "CRITICAL": return + case "UNKNOWN": + return default: return } @@ -223,6 +226,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu return Warning case "CRITICAL": return Critical + case "UNKNOWN": + return UNKNOWN default: return Unknown } @@ -230,13 +235,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const getHealthStats = () => { if (!healthData?.details) { - return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0 } + return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0, unknown: 0 } } let healthy = 0 let info = 0 let warnings = 0 let critical = 0 + let unknown = 0 CATEGORIES.forEach(({ key }) => { const categoryData = healthData.details[key as keyof typeof healthData.details] @@ -246,10 +252,11 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu else if (status === "INFO") info++ else if (status === "WARNING") warnings++ else if (status === "CRITICAL") critical++ + else if (status === "UNKNOWN") unknown++ } }) - return { total: CATEGORIES.length, healthy, info, warnings, critical } + return { total: CATEGORIES.length, healthy, info, warnings, critical, unknown } } const stats = getHealthStats() @@ -317,16 +324,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const s = status?.toUpperCase() if (s === "CRITICAL") return "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer" if (s === "WARNING") return "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer" + if (s === "UNKNOWN") return "bg-amber-500/5 border-amber-500/20 hover:bg-amber-500/10 cursor-pointer" if (s === "INFO") return "bg-blue-500/5 border-blue-500/20 hover:bg-blue-500/10" return "bg-card border-border hover:bg-muted/30" } - + const getOutlineBadgeStyle = (status: string) => { const s = status?.toUpperCase() if (s === "OK") return "border-green-500 text-green-500 bg-transparent" if (s === "INFO") return "border-blue-500 text-blue-500 bg-blue-500/5" if (s === "WARNING") return "border-yellow-500 text-yellow-500 bg-yellow-500/5" if (s === "CRITICAL") return "border-red-500 text-red-500 bg-red-500/5" + if (s === "UNKNOWN") return "border-amber-400 text-amber-400 bg-amber-500/5" return "" } @@ -502,6 +511,12 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{stats.critical}
Critical
+ {stats.unknown > 0 && ( +
+
{stats.unknown}
+
Unknown
+
+ )} {healthData.summary && healthData.summary !== "All systems operational" && ( diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 23103b1e..003d15f4 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -600,6 +600,47 @@ def _health_collector_loop(): time.sleep(300) # Every 5 minutes +def _vital_signs_sampler(): + """Dedicated thread for rapid CPU & temperature sampling. + + Runs independently of the 5-min health collector loop. + - CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis) + - Temperature: sampled every 10s (18 samples in 3 min for temporal logic) + Uses time.monotonic() to avoid drift. + """ + from health_monitor import health_monitor + + # Wait 15s after startup for sensors to be ready + time.sleep(15) + + TEMP_INTERVAL = 10 # seconds + CPU_INTERVAL = 30 # seconds + + next_temp = time.monotonic() + next_cpu = time.monotonic() + + print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)") + + while True: + try: + now = time.monotonic() + + if now >= next_temp: + health_monitor._sample_cpu_temperature() + next_temp = now + TEMP_INTERVAL + + if now >= next_cpu: + health_monitor._sample_cpu_usage() + next_cpu = now + CPU_INTERVAL + + # Sleep until the next earliest event (with 0.5s min to avoid busy-loop) + sleep_until = min(next_temp, next_cpu) - time.monotonic() + time.sleep(max(sleep_until, 0.5)) + except Exception as e: + print(f"[ProxMenux] Vital signs sampler error: {e}") + time.sleep(10) + + def get_uptime(): """Get system uptime in a human-readable format.""" try: @@ -7046,6 +7087,13 @@ if __name__ == '__main__': except Exception as e: print(f"[ProxMenux] Background health monitor failed to start: {e}") + # ── Vital Signs Sampler (rapid CPU + Temperature) ── + try: + vital_thread = threading.Thread(target=_vital_signs_sampler, daemon=True) + vital_thread.start() + except Exception as e: + print(f"[ProxMenux] Vital signs sampler failed to start: {e}") + # Check for SSL configuration ssl_ctx = None try: diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index c90be6c4..ef381192 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -60,7 +60,7 @@ class HealthMonitor: # Network Thresholds NETWORK_LATENCY_WARNING = 100 NETWORK_LATENCY_CRITICAL = 300 - NETWORK_TIMEOUT = 0.9 + NETWORK_TIMEOUT = 2 NETWORK_INACTIVE_DURATION = 600 # Log Thresholds @@ -142,6 +142,7 @@ class HealthMonitor: self.io_error_history = defaultdict(list) self.failed_vm_history = set() # Track VMs that failed to start self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0}) + self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) # SMART detection still uses filesystem check on init (lightweight) @@ -153,6 +154,63 @@ class HealthMonitor: except Exception as e: print(f"[HealthMonitor] Cleanup warning: {e}") + # ─── Lightweight sampling methods for the dedicated vital-signs thread ─── + # These ONLY append data to state_history without triggering evaluation, + # persistence, or subprocess-heavy operations. + + def _sample_cpu_usage(self): + """Lightweight CPU sample: read usage % and append to history. 
~30ms cost.""" + try: + cpu_percent = psutil.cpu_percent(interval=0) + current_time = time.time() + state_key = 'cpu_usage' + self.state_history[state_key].append({ + 'value': cpu_percent, + 'time': current_time + }) + # Prune entries older than 6 minutes + self.state_history[state_key] = [ + e for e in self.state_history[state_key] + if current_time - e['time'] < 360 + ] + except Exception: + pass # Sampling must never crash the thread + + def _sample_cpu_temperature(self): + """Lightweight temperature sample: read sensor and append to history. ~50ms cost.""" + try: + result = subprocess.run( + ['sensors', '-A', '-u'], + capture_output=True, text=True, timeout=2 + ) + if result.returncode != 0: + return + + temps = [] + for line in result.stdout.split('\n'): + if 'temp' in line.lower() and '_input' in line: + try: + temp = float(line.split(':')[1].strip()) + temps.append(temp) + except Exception: + continue + + if temps: + max_temp = max(temps) + current_time = time.time() + state_key = 'cpu_temp_history' + self.state_history[state_key].append({ + 'value': max_temp, + 'time': current_time + }) + # Prune entries older than 4 minutes + self.state_history[state_key] = [ + e for e in self.state_history[state_key] + if current_time - e['time'] < 240 + ] + except Exception: + pass # Sampling must never crash the thread + def get_system_info(self) -> Dict[str, Any]: """ Get lightweight system info for header display. @@ -377,14 +435,34 @@ class HealthMonitor: elif security_status.get('status') == 'INFO': info_issues.append(f"Security: {security_status.get('reason', 'Security information')}") + # --- Track UNKNOWN counts and persist if >= 3 consecutive cycles --- + unknown_issues = [] + for cat_key, cat_data in details.items(): + cat_status = cat_data.get('status', 'OK') + if cat_status == 'UNKNOWN': + count = self._unknown_counts.get(cat_key, 0) + 1 + self._unknown_counts[cat_key] = min(count, 10) # Cap to avoid unbounded growth + unknown_issues.append(f"{cat_key}: {cat_data.get('reason', 'Check unavailable')}") + if count == 3: # Only persist on the exact 3rd cycle, not every cycle after + try: + health_persistence.record_unknown_persistent( + cat_key, cat_data.get('reason', 'Check unavailable')) + except Exception: + pass + else: + self._unknown_counts[cat_key] = 0 + # --- Determine Overall Status --- - # Use a fixed order of severity: CRITICAL > WARNING > INFO > OK + # Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK if critical_issues: overall = 'CRITICAL' - summary = '; '.join(critical_issues[:3]) # Limit summary to 3 issues + summary = '; '.join(critical_issues[:3]) elif warning_issues: overall = 'WARNING' summary = '; '.join(warning_issues[:3]) + elif unknown_issues: + overall = 'WARNING' # UNKNOWN caps at WARNING, never escalates to CRITICAL + summary = '; '.join(unknown_issues[:3]) elif info_issues: overall = 'OK' # INFO statuses don't degrade overall health summary = '; '.join(info_issues[:3]) @@ -444,13 +522,17 @@ class HealthMonitor: current_time = time.time() state_key = 'cpu_usage' + # Add this reading as well (supplements the sampler thread) self.state_history[state_key].append({ 'value': cpu_percent, 'time': current_time }) + # Snapshot the list for thread-safe reading (sampler may append concurrently) + cpu_snapshot = list(self.state_history[state_key]) + # Prune old entries via snapshot replacement (atomic assignment) self.state_history[state_key] = [ - entry for entry in self.state_history[state_key] + entry for entry in cpu_snapshot if current_time - 
entry['time'] < 360 ] @@ -517,8 +599,8 @@ class HealthMonitor: } else: checks['cpu_temperature'] = { - 'status': 'OK', - 'detail': 'Sensor not available', + 'status': 'INFO', + 'detail': 'No temperature sensor detected - install lm-sensors if hardware supports it', } result['checks'] = checks @@ -564,14 +646,16 @@ class HealthMonitor: max_temp = max(temps) state_key = 'cpu_temp_history' + # Add this reading (supplements the sampler thread) self.state_history[state_key].append({ 'value': max_temp, 'time': current_time }) - # Keep last 4 minutes of data (240 seconds) + # Snapshot for thread-safe reading, then atomic prune + temp_snapshot = list(self.state_history[state_key]) self.state_history[state_key] = [ - entry for entry in self.state_history[state_key] + entry for entry in temp_snapshot if current_time - entry['time'] < 240 ] @@ -1058,9 +1142,10 @@ class HealthMonitor: 'reason': f"{len(disk_issues)} disk(s) with recent errors", 'details': disk_issues } - - except Exception: - return {'status': 'OK'} + + except Exception as e: + print(f"[HealthMonitor] Disk/IO check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Disk check unavailable: {str(e)}', 'checks': {}} def _check_network_optimized(self) -> Dict[str, Any]: """ @@ -1186,9 +1271,10 @@ class HealthMonitor: 'details': interface_details, 'checks': checks } - - except Exception: - return {'status': 'OK'} + + except Exception as e: + print(f"[HealthMonitor] Network check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}} def _check_network_latency(self) -> Optional[Dict[str, Any]]: """Check network latency to 1.1.1.1 (cached)""" @@ -1237,15 +1323,31 @@ class HealthMonitor: except: pass - # If ping failed (timeout, unreachable) + # If ping failed (timeout, unreachable) - distinguish the reason + stderr_lower = (result.stderr or '').lower() if hasattr(result, 'stderr') else '' + if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower: + fail_reason = 'Network unreachable - no route to 1.1.1.1' + elif result.returncode == 1: + fail_reason = 'Packet loss to 1.1.1.1 (100% loss)' + else: + fail_reason = f'Ping failed (exit code {result.returncode})' + packet_loss_result = { 'status': 'CRITICAL', - 'reason': 'Packet loss or timeout to 1.1.1.1' + 'reason': fail_reason } self.cached_results[cache_key] = packet_loss_result self.last_check_times[cache_key] = current_time return packet_loss_result + except subprocess.TimeoutExpired: + timeout_result = { + 'status': 'WARNING', + 'reason': f'Ping timeout (>{self.NETWORK_TIMEOUT}s) - possible high latency' + } + self.cached_results[cache_key] = timeout_result + self.last_check_times[cache_key] = current_time + return timeout_result except Exception: return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} @@ -1356,9 +1458,10 @@ class HealthMonitor: 'reason': '; '.join(issues[:3]), 'details': vm_details } - - except Exception: - return {'status': 'OK'} + + except Exception as e: + print(f"[HealthMonitor] VMs/CTs check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}} # Modified to use persistence def _check_vms_cts_with_persistence(self) -> Dict[str, Any]: @@ -1462,7 +1565,12 @@ class HealthMonitor: # Generic failed to start for VMs and CTs if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']): - id_match = re.search(r'\b(\d{3,5})\b', line) # Increased digit count for wider match + # Try 
contextual VMID patterns first (more precise), then fallback to generic + id_match = ( + re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or + re.search(r'\b(\d{3,5})\.conf\b', line) or + re.search(r'\b(\d{3,5})\b', line) + ) if id_match: vmid_ctid = id_match.group(1) # Determine if it's a VM or CT based on context, if possible @@ -1521,8 +1629,9 @@ class HealthMonitor: 'checks': checks } - except Exception: - return {'status': 'OK', 'checks': {}} + except Exception as e: + print(f"[HealthMonitor] VMs/CTs persistence check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}} def _check_pve_services(self) -> Dict[str, Any]: """ @@ -1588,7 +1697,7 @@ class HealthMonitor: error_key = f'pve_service_{svc}' health_persistence.record_error( error_key=error_key, - category='services', + category='pve_services', severity='CRITICAL', reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}', details={'service': svc, 'state': service_details.get(svc, 'inactive')} @@ -1932,9 +2041,8 @@ class HealthMonitor: return ok_result except Exception as e: - # Log the exception but return OK to avoid alert storms on check failure - print(f"[HealthMonitor] Error checking logs: {e}") - return {'status': 'OK'} + print(f"[HealthMonitor] Log check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Log check unavailable: {str(e)}', 'checks': {}} def _normalize_log_pattern(self, line: str) -> str: """ @@ -1984,12 +2092,21 @@ class HealthMonitor: pass # Ignore if mtime fails # Perform a dry run of apt-get upgrade to see pending packages - result = subprocess.run( - ['apt-get', 'upgrade', '--dry-run'], - capture_output=True, - text=True, - timeout=5 # Increased timeout for safety - ) + try: + result = subprocess.run( + ['apt-get', 'upgrade', '--dry-run'], + capture_output=True, + text=True, + timeout=10 + ) + except subprocess.TimeoutExpired: + print("[HealthMonitor] apt-get upgrade --dry-run timed out") + return { + 'status': 'UNKNOWN', + 'reason': 'apt-get timed out - repository may be unreachable', + 'count': 0, + 'checks': {} + } status = 'OK' reason = None @@ -2112,8 +2229,8 @@ class HealthMonitor: return update_result except Exception as e: - print(f"[HealthMonitor] Error checking updates: {e}") - return {'status': 'OK', 'count': 0, 'checks': {}} + print(f"[HealthMonitor] Updates check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Updates check unavailable: {str(e)}', 'count': 0, 'checks': {}} def _check_fail2ban_bans(self) -> Dict[str, Any]: """ @@ -2356,8 +2473,8 @@ class HealthMonitor: } except Exception as e: - print(f"[HealthMonitor] Error checking security: {e}") - return {'status': 'OK', 'checks': {}} + print(f"[HealthMonitor] Security check failed: {e}") + return {'status': 'UNKNOWN', 'reason': f'Security check unavailable: {str(e)}', 'checks': {}} def _check_certificates(self) -> Optional[Dict[str, Any]]: """ diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index ea6179b1..377f71da 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -17,6 +17,7 @@ Version: 1.1 import sqlite3 import json import os +import threading from datetime import datetime, timedelta from typing import Dict, List, Any, Optional from pathlib import Path @@ -52,11 +53,19 @@ class HealthPersistence: self.data_dir.mkdir(parents=True, exist_ok=True) self.db_path = self.data_dir / 'health_monitor.db' + self._db_lock = threading.Lock() 
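+ # In-process lock serializes the write paths (record/resolve/acknowledge/cleanup/settings); WAL and busy_timeout in _get_conn() handle contention between separate connections.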
self._init_database() + def _get_conn(self) -> sqlite3.Connection: + """Get a SQLite connection with timeout and WAL mode for safe concurrency.""" + conn = sqlite3.connect(str(self.db_path), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + return conn + def _init_database(self): """Initialize SQLite database with required tables""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() # Errors table @@ -126,7 +135,11 @@ class HealthPersistence: Record or update an error. Returns event info (new_error, updated, etc.) """ - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._record_error_impl(error_key, category, severity, reason, details) + + def _record_error_impl(self, error_key, category, severity, reason, details): + conn = self._get_conn() cursor = conn.cursor() now = datetime.now().isoformat() @@ -262,7 +275,11 @@ class HealthPersistence: def resolve_error(self, error_key: str, reason: str = 'auto-resolved'): """Mark an error as resolved""" - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._resolve_error_impl(error_key, reason) + + def _resolve_error_impl(self, error_key, reason): + conn = self._get_conn() cursor = conn.cursor() now = datetime.now().isoformat() @@ -284,7 +301,7 @@ class HealthPersistence: Check if an error is currently active (unresolved and not acknowledged). Used by checks to avoid re-recording errors that are already tracked. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() if category: @@ -314,7 +331,7 @@ class HealthPersistence: we delete the record entirely so it can re-trigger as a fresh event if the condition returns later. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() now = datetime.now().isoformat() @@ -353,7 +370,11 @@ class HealthPersistence: - Stores suppression_hours on the error record (snapshot at dismiss time) - Marks as acknowledged so it won't re-appear during the suppression period """ - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._acknowledge_error_impl(error_key) + + def _acknowledge_error_impl(self, error_key): + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -408,7 +429,7 @@ class HealthPersistence: def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]: """Get all active (unresolved) errors, optionally filtered by category""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -439,7 +460,11 @@ class HealthPersistence: def cleanup_old_errors(self): """Clean up old resolved errors and auto-resolve stale errors""" - conn = sqlite3.connect(str(self.db_path)) + with self._db_lock: + return self._cleanup_old_errors_impl() + + def _cleanup_old_errors_impl(self): + conn = self._get_conn() cursor = conn.cursor() now = datetime.now() @@ -519,7 +544,7 @@ class HealthPersistence: Get errors that were acknowledged/dismissed but still within suppression period. These are shown as INFO in the frontend with a 'Dismissed' badge. 
""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -584,7 +609,7 @@ class HealthPersistence: - 'resolved': error resolved - 'escalated': severity increased """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() event_data = data or {} @@ -608,7 +633,7 @@ class HealthPersistence: Get events that need notification (for future Telegram/Gotify integration). Groups by severity for batch notification sending. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -641,7 +666,7 @@ class HealthPersistence: if not event_ids: return - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() for event_id in event_ids: @@ -663,7 +688,7 @@ class HealthPersistence: def get_unnotified_errors(self) -> List[Dict[str, Any]]: """Get errors that need Telegram notification""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() conn.row_factory = sqlite3.Row cursor = conn.cursor() @@ -689,7 +714,7 @@ class HealthPersistence: def mark_notified(self, error_key: str): """Mark error as notified""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute(''' @@ -708,7 +733,7 @@ class HealthPersistence: Get a cached system capability value. Returns None if not yet detected. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute( 'SELECT cap_value FROM system_capabilities WHERE cap_key = ?', @@ -720,7 +745,7 @@ class HealthPersistence: def set_capability(self, cap_key: str, cap_value: str): """Store a system capability value (detected once, cached forever).""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute(''' INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at) @@ -731,7 +756,7 @@ class HealthPersistence: def get_all_capabilities(self) -> Dict[str, str]: """Get all cached system capabilities as a dict.""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute('SELECT cap_key, cap_value FROM system_capabilities') rows = cursor.fetchall() @@ -747,7 +772,7 @@ class HealthPersistence: def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]: """Get a user setting value by key.""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() cursor.execute( 'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,) @@ -758,18 +783,19 @@ class HealthPersistence: def set_setting(self, key: str, value: str): """Store a user setting value.""" - conn = sqlite3.connect(str(self.db_path)) - cursor = conn.cursor() - cursor.execute(''' - INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) - VALUES (?, ?, ?) - ''', (key, value, datetime.now().isoformat())) - conn.commit() - conn.close() + with self._db_lock: + conn = self._get_conn() + cursor = conn.cursor() + cursor.execute(''' + INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) + VALUES (?, ?, ?) 
+ ''', (key, value, datetime.now().isoformat())) + conn.commit() + conn.close() def get_all_settings(self, prefix: Optional[str] = None) -> Dict[str, str]: """Get all user settings, optionally filtered by key prefix.""" - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() if prefix: cursor.execute( @@ -791,7 +817,7 @@ class HealthPersistence: For each dismissed error, looks up its category's configured hours and updates the suppression_hours column to match. """ - conn = sqlite3.connect(str(self.db_path)) + conn = self._get_conn() cursor = conn.cursor() # Build reverse map: category -> setting_key @@ -882,6 +908,51 @@ class HealthPersistence: """ all_cats = self.get_suppression_categories() return [c for c in all_cats if c['hours'] != self.DEFAULT_SUPPRESSION_HOURS] + + def record_unknown_persistent(self, category: str, reason: str): + """ + Record a persistent UNKNOWN event when a health check has been + unable to verify for >= 3 consecutive cycles (~15 min). + Avoids duplicates by only recording once per 30 min per category. + """ + with self._db_lock: + self._record_unknown_persistent_impl(category, reason) + + def _record_unknown_persistent_impl(self, category, reason): + try: + event_key = f'unknown_persistent_{category}' + now = datetime.now().isoformat() + + conn = self._get_conn() + cursor = conn.cursor() + + # Check if we already recorded this within the last 30 minutes + # Note: events table has columns (id, event_type, error_key, timestamp, data) + # We use error_key for deduplication since it contains the category + cursor.execute(''' + SELECT MAX(timestamp) FROM events + WHERE event_type = ? AND error_key = ? + ''', ('unknown_persistent', event_key)) + row = cursor.fetchone() + if row and row[0]: + try: + last_recorded = datetime.fromisoformat(row[0]) + if (datetime.now() - last_recorded).total_seconds() < 1800: + conn.close() + return # Already recorded recently + except (ValueError, TypeError): + pass # If timestamp is malformed, proceed with recording + + cursor.execute(''' + INSERT INTO events (event_type, error_key, timestamp, data) + VALUES (?, ?, ?, ?) + ''', ('unknown_persistent', event_key, now, + json.dumps({'category': category, 'reason': reason}))) + + conn.commit() + conn.close() + except Exception as e: + print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}") # Global instance
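
Note (not part of the patch): a minimal, self-contained sketch of the consecutive-UNKNOWN hysteresis that evaluate() now applies — count UNKNOWN cycles per category, persist only on the exact third consecutive cycle, reset on any other status, and cap the counter. The UnknownTracker class, PERSIST_AFTER, and the usage loop are illustrative names only, not code from this diff.

# Illustrative only: standalone model of the _unknown_counts logic in health_monitor.py
from collections import defaultdict

PERSIST_AFTER = 3   # consecutive UNKNOWN cycles (~15 min at one 5-min collector cycle each)
COUNT_CAP = 10      # cap to avoid unbounded growth, as in the patch

class UnknownTracker:
    def __init__(self):
        self._counts = defaultdict(int)

    def observe(self, category: str, status: str) -> bool:
        """Return True exactly once, on the cycle the UNKNOWN should be persisted."""
        if status != 'UNKNOWN':
            self._counts[category] = 0
            return False
        self._counts[category] = min(self._counts[category] + 1, COUNT_CAP)
        return self._counts[category] == PERSIST_AFTER

# Persisting only on the third consecutive UNKNOWN keeps transient check
# failures (one or two cycles) out of the events table.
tracker = UnknownTracker()
for cycle_status in ['UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'OK']:
    if tracker.observe('network', cycle_status):
        print('persist unknown_persistent_network')   # fires once, on cycle 3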
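
Note (not part of the patch): a small standalone harness showing the effect of the pragmas set by the new _get_conn() helper. The temp file path, table name, and writer() function are throwaway examples; the point is only that WAL plus a busy timeout lets two threads (e.g. the vital-signs sampler and a Flask request handler) write through separate connections by waiting on the write lock instead of failing with "database is locked".

# Illustrative only: two concurrent writers, each with its own WAL connection
import os
import sqlite3
import tempfile
import threading

db = os.path.join(tempfile.mkdtemp(), 'demo.db')

def get_conn():
    # Mirrors the pragmas used by HealthPersistence._get_conn()
    conn = sqlite3.connect(db, timeout=10)
    conn.execute('PRAGMA journal_mode=WAL')   # readers proceed while a writer commits
    conn.execute('PRAGMA busy_timeout=5000')  # contending writer waits up to 5s
    return conn

conn = get_conn()
conn.execute('CREATE TABLE IF NOT EXISTS samples (ts REAL, value REAL)')
conn.commit()
conn.close()

def writer(n):
    c = get_conn()  # sqlite3 connections are per-thread; each writer opens its own
    for i in range(n):
        c.execute('INSERT INTO samples VALUES (?, ?)', (i, i * 0.5))
        c.commit()
    c.close()

threads = [threading.Thread(target=writer, args=(200,)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

c = get_conn()
print(c.execute('SELECT COUNT(*) FROM samples').fetchone()[0])   # expect 400
c.close()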