Update health monitor

MacRimi
2026-02-17 20:01:45 +01:00
parent 74b6f565e9
commit 1317c5bddc
4 changed files with 318 additions and 67 deletions

View File

@@ -28,6 +28,7 @@ import {
   BellOff,
   ChevronRight,
   Settings2,
+  HelpCircle,
 } from "lucide-react"

 interface CategoryCheck {
@@ -207,6 +208,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
         return <AlertTriangle className={`${cls} text-yellow-500`} />
       case "CRITICAL":
         return <XCircle className={`${cls} text-red-500`} />
+      case "UNKNOWN":
+        return <HelpCircle className={`${cls} text-amber-400`} />
       default:
         return <Activity className={`${cls} text-muted-foreground`} />
     }
@@ -223,6 +226,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
         return <Badge className="bg-yellow-500 text-white hover:bg-yellow-500">Warning</Badge>
       case "CRITICAL":
         return <Badge className="bg-red-500 text-white hover:bg-red-500">Critical</Badge>
+      case "UNKNOWN":
+        return <Badge className="bg-amber-500 text-white hover:bg-amber-500">UNKNOWN</Badge>
       default:
         return <Badge>Unknown</Badge>
     }
@@ -230,13 +235,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
   const getHealthStats = () => {
     if (!healthData?.details) {
-      return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0 }
+      return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0, unknown: 0 }
     }

     let healthy = 0
     let info = 0
     let warnings = 0
     let critical = 0
+    let unknown = 0

     CATEGORIES.forEach(({ key }) => {
       const categoryData = healthData.details[key as keyof typeof healthData.details]
@@ -246,10 +252,11 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
         else if (status === "INFO") info++
         else if (status === "WARNING") warnings++
         else if (status === "CRITICAL") critical++
+        else if (status === "UNKNOWN") unknown++
       }
     })

-    return { total: CATEGORIES.length, healthy, info, warnings, critical }
+    return { total: CATEGORIES.length, healthy, info, warnings, critical, unknown }
   }

   const stats = getHealthStats()
@@ -317,16 +324,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
     const s = status?.toUpperCase()
     if (s === "CRITICAL") return "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer"
     if (s === "WARNING") return "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer"
+    if (s === "UNKNOWN") return "bg-amber-500/5 border-amber-500/20 hover:bg-amber-500/10 cursor-pointer"
     if (s === "INFO") return "bg-blue-500/5 border-blue-500/20 hover:bg-blue-500/10"
     return "bg-card border-border hover:bg-muted/30"
   }

   const getOutlineBadgeStyle = (status: string) => {
     const s = status?.toUpperCase()
     if (s === "OK") return "border-green-500 text-green-500 bg-transparent"
     if (s === "INFO") return "border-blue-500 text-blue-500 bg-blue-500/5"
     if (s === "WARNING") return "border-yellow-500 text-yellow-500 bg-yellow-500/5"
     if (s === "CRITICAL") return "border-red-500 text-red-500 bg-red-500/5"
+    if (s === "UNKNOWN") return "border-amber-400 text-amber-400 bg-amber-500/5"
     return ""
   }
@@ -502,6 +511,12 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
<div className="text-lg sm:text-2xl font-bold text-red-500">{stats.critical}</div> <div className="text-lg sm:text-2xl font-bold text-red-500">{stats.critical}</div>
<div className="text-[10px] sm:text-xs text-muted-foreground">Critical</div> <div className="text-[10px] sm:text-xs text-muted-foreground">Critical</div>
</div> </div>
{stats.unknown > 0 && (
<div className="text-center">
<div className="text-lg sm:text-2xl font-bold text-amber-400">{stats.unknown}</div>
<div className="text-[10px] sm:text-xs text-muted-foreground">Unknown</div>
</div>
)}
</div> </div>
{healthData.summary && healthData.summary !== "All systems operational" && ( {healthData.summary && healthData.summary !== "All systems operational" && (

View File

@@ -600,6 +600,47 @@ def _health_collector_loop():
             time.sleep(300)  # Every 5 minutes


+def _vital_signs_sampler():
+    """Dedicated thread for rapid CPU & temperature sampling.
+
+    Runs independently of the 5-min health collector loop.
+    - CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis)
+    - Temperature: sampled every 10s (18 samples in 3 min for temporal logic)
+    Uses time.monotonic() to avoid drift.
+    """
+    from health_monitor import health_monitor
+
+    # Wait 15s after startup for sensors to be ready
+    time.sleep(15)
+
+    TEMP_INTERVAL = 10  # seconds
+    CPU_INTERVAL = 30   # seconds
+    next_temp = time.monotonic()
+    next_cpu = time.monotonic()
+
+    print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)")
+
+    while True:
+        try:
+            now = time.monotonic()
+
+            if now >= next_temp:
+                health_monitor._sample_cpu_temperature()
+                next_temp = now + TEMP_INTERVAL
+
+            if now >= next_cpu:
+                health_monitor._sample_cpu_usage()
+                next_cpu = now + CPU_INTERVAL
+
+            # Sleep until the next earliest event (with 0.5s min to avoid busy-loop)
+            sleep_until = min(next_temp, next_cpu) - time.monotonic()
+            time.sleep(max(sleep_until, 0.5))
+        except Exception as e:
+            print(f"[ProxMenux] Vital signs sampler error: {e}")
+            time.sleep(10)
+
+
 def get_uptime():
     """Get system uptime in a human-readable format."""
     try:
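The sampler keeps two independent periods on a single thread by tracking one monotonic deadline per task. A minimal standalone sketch of that scheduling pattern, with stub callbacks and shortened intervals purely for illustration (not part of the commit):

import time

def run_sampler(sample_temp, sample_cpu, temp_interval=10, cpu_interval=30, cycles=None):
    """Interleave two periodic callbacks on one thread using monotonic deadlines."""
    next_temp = next_cpu = time.monotonic()
    done = 0
    while cycles is None or done < cycles:
        now = time.monotonic()
        if now >= next_temp:
            sample_temp()
            next_temp = now + temp_interval
        if now >= next_cpu:
            sample_cpu()
            next_cpu = now + cpu_interval
        # Sleep until the earliest deadline, but at least 0.5s to avoid a busy loop
        time.sleep(max(min(next_temp, next_cpu) - time.monotonic(), 0.5))
        done += 1

# Shortened intervals so the demo finishes in a few seconds:
run_sampler(lambda: print("temp sample"), lambda: print("cpu sample"),
            temp_interval=1, cpu_interval=2, cycles=6)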
@@ -7046,6 +7087,13 @@ if __name__ == '__main__':
     except Exception as e:
         print(f"[ProxMenux] Background health monitor failed to start: {e}")

+    # ── Vital Signs Sampler (rapid CPU + Temperature) ──
+    try:
+        vital_thread = threading.Thread(target=_vital_signs_sampler, daemon=True)
+        vital_thread.start()
+    except Exception as e:
+        print(f"[ProxMenux] Vital signs sampler failed to start: {e}")
+
     # Check for SSL configuration
     ssl_ctx = None
     try:

View File

@@ -60,7 +60,7 @@ class HealthMonitor:
     # Network Thresholds
     NETWORK_LATENCY_WARNING = 100
     NETWORK_LATENCY_CRITICAL = 300
-    NETWORK_TIMEOUT = 0.9
+    NETWORK_TIMEOUT = 2
     NETWORK_INACTIVE_DURATION = 600

     # Log Thresholds
@@ -142,6 +142,7 @@ class HealthMonitor:
         self.io_error_history = defaultdict(list)
         self.failed_vm_history = set()  # Track VMs that failed to start
         self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
+        self._unknown_counts = {}  # Track consecutive UNKNOWN cycles per category

         # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
         # SMART detection still uses filesystem check on init (lightweight)
@@ -153,6 +154,63 @@ class HealthMonitor:
         except Exception as e:
             print(f"[HealthMonitor] Cleanup warning: {e}")

+    # ─── Lightweight sampling methods for the dedicated vital-signs thread ───
+    # These ONLY append data to state_history without triggering evaluation,
+    # persistence, or subprocess-heavy operations.
+
+    def _sample_cpu_usage(self):
+        """Lightweight CPU sample: read usage % and append to history. ~30ms cost."""
+        try:
+            cpu_percent = psutil.cpu_percent(interval=0)
+            current_time = time.time()
+            state_key = 'cpu_usage'
+            self.state_history[state_key].append({
+                'value': cpu_percent,
+                'time': current_time
+            })
+            # Prune entries older than 6 minutes
+            self.state_history[state_key] = [
+                e for e in self.state_history[state_key]
+                if current_time - e['time'] < 360
+            ]
+        except Exception:
+            pass  # Sampling must never crash the thread
+
+    def _sample_cpu_temperature(self):
+        """Lightweight temperature sample: read sensor and append to history. ~50ms cost."""
+        try:
+            result = subprocess.run(
+                ['sensors', '-A', '-u'],
+                capture_output=True, text=True, timeout=2
+            )
+            if result.returncode != 0:
+                return
+
+            temps = []
+            for line in result.stdout.split('\n'):
+                if 'temp' in line.lower() and '_input' in line:
+                    try:
+                        temp = float(line.split(':')[1].strip())
+                        temps.append(temp)
+                    except Exception:
+                        continue
+
+            if temps:
+                max_temp = max(temps)
+                current_time = time.time()
+                state_key = 'cpu_temp_history'
+                self.state_history[state_key].append({
+                    'value': max_temp,
+                    'time': current_time
+                })
+                # Prune entries older than 4 minutes
+                self.state_history[state_key] = [
+                    e for e in self.state_history[state_key]
+                    if current_time - e['time'] < 240
+                ]
+        except Exception:
+            pass  # Sampling must never crash the thread
+
     def get_system_info(self) -> Dict[str, Any]:
         """
         Get lightweight system info for header display.
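_sample_cpu_temperature() relies on the raw (-u) output format of lm-sensors, where each temperature reading appears as a tempN_input: line. A rough illustration of what the filter keeps; the sample output below is made up and abbreviated, and real chip names, labels, and values vary by host:

# Hypothetical `sensors -A -u` output, for illustration only
sample_output = """\
coretemp-isa-0000
Package id 0:
  temp1_input: 48.000
  temp1_max: 100.000
Core 0:
  temp2_input: 45.000
  temp2_max: 100.000
"""

temps = []
for line in sample_output.split('\n'):
    # Same filter as the sampler: only the raw *_input readings count
    if 'temp' in line.lower() and '_input' in line:
        try:
            temps.append(float(line.split(':')[1].strip()))
        except (IndexError, ValueError):
            continue

print(temps)       # [48.0, 45.0]
print(max(temps))  # 48.0 -> the value appended to cpu_temp_history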
@@ -377,14 +435,34 @@ class HealthMonitor:
         elif security_status.get('status') == 'INFO':
             info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")

+        # --- Track UNKNOWN counts and persist if >= 3 consecutive cycles ---
+        unknown_issues = []
+        for cat_key, cat_data in details.items():
+            cat_status = cat_data.get('status', 'OK')
+            if cat_status == 'UNKNOWN':
+                count = self._unknown_counts.get(cat_key, 0) + 1
+                self._unknown_counts[cat_key] = min(count, 10)  # Cap to avoid unbounded growth
+                unknown_issues.append(f"{cat_key}: {cat_data.get('reason', 'Check unavailable')}")
+                if count == 3:  # Only persist on the exact 3rd cycle, not every cycle after
+                    try:
+                        health_persistence.record_unknown_persistent(
+                            cat_key, cat_data.get('reason', 'Check unavailable'))
+                    except Exception:
+                        pass
+            else:
+                self._unknown_counts[cat_key] = 0
+
         # --- Determine Overall Status ---
-        # Use a fixed order of severity: CRITICAL > WARNING > INFO > OK
+        # Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK
         if critical_issues:
             overall = 'CRITICAL'
-            summary = '; '.join(critical_issues[:3])  # Limit summary to 3 issues
+            summary = '; '.join(critical_issues[:3])
         elif warning_issues:
             overall = 'WARNING'
             summary = '; '.join(warning_issues[:3])
+        elif unknown_issues:
+            overall = 'WARNING'  # UNKNOWN caps at WARNING, never escalates to CRITICAL
+            summary = '; '.join(unknown_issues[:3])
         elif info_issues:
             overall = 'OK'  # INFO statuses don't degrade overall health
             summary = '; '.join(info_issues[:3])
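A minimal standalone sketch of the counting rule above, replaying one category over several collector cycles; a stub stands in for health_persistence.record_unknown_persistent. It shows that persistence fires exactly once, on the third consecutive UNKNOWN, and that any other status resets the streak:

def track_unknown(statuses, persist):
    """Replay a sequence of per-cycle statuses for a single category."""
    count = 0
    for cycle, status in enumerate(statuses, start=1):
        if status == 'UNKNOWN':
            count = min(count + 1, 10)   # capped, as in HealthMonitor
            if count == 3:               # persist on the exact 3rd cycle only
                persist(cycle)
        else:
            count = 0

track_unknown(
    ['UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'OK', 'UNKNOWN', 'UNKNOWN'],
    persist=lambda cycle: print(f"persisted at cycle {cycle}"),
)
# Output: "persisted at cycle 3" -- the OK at cycle 5 resets the streak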
@@ -444,13 +522,17 @@ class HealthMonitor:
             current_time = time.time()
             state_key = 'cpu_usage'

+            # Add this reading as well (supplements the sampler thread)
             self.state_history[state_key].append({
                 'value': cpu_percent,
                 'time': current_time
             })

+            # Snapshot the list for thread-safe reading (sampler may append concurrently)
+            cpu_snapshot = list(self.state_history[state_key])
+            # Prune old entries via snapshot replacement (atomic assignment)
             self.state_history[state_key] = [
-                entry for entry in self.state_history[state_key]
+                entry for entry in cpu_snapshot
                 if current_time - entry['time'] < 360
             ]
@@ -517,8 +599,8 @@ class HealthMonitor:
                 }
             else:
                 checks['cpu_temperature'] = {
-                    'status': 'OK',
-                    'detail': 'Sensor not available',
+                    'status': 'INFO',
+                    'detail': 'No temperature sensor detected - install lm-sensors if hardware supports it',
                 }

             result['checks'] = checks
@@ -564,14 +646,16 @@ class HealthMonitor:
                 max_temp = max(temps)
                 state_key = 'cpu_temp_history'

+                # Add this reading (supplements the sampler thread)
                 self.state_history[state_key].append({
                     'value': max_temp,
                     'time': current_time
                 })

-                # Keep last 4 minutes of data (240 seconds)
+                # Snapshot for thread-safe reading, then atomic prune
+                temp_snapshot = list(self.state_history[state_key])
                 self.state_history[state_key] = [
-                    entry for entry in self.state_history[state_key]
+                    entry for entry in temp_snapshot
                     if current_time - entry['time'] < 240
                 ]
@@ -1058,9 +1142,10 @@ class HealthMonitor:
'reason': f"{len(disk_issues)} disk(s) with recent errors", 'reason': f"{len(disk_issues)} disk(s) with recent errors",
'details': disk_issues 'details': disk_issues
} }
except Exception: except Exception as e:
return {'status': 'OK'} print(f"[HealthMonitor] Disk/IO check failed: {e}")
return {'status': 'UNKNOWN', 'reason': f'Disk check unavailable: {str(e)}', 'checks': {}}
def _check_network_optimized(self) -> Dict[str, Any]: def _check_network_optimized(self) -> Dict[str, Any]:
""" """
@@ -1186,9 +1271,10 @@ class HealthMonitor:
                 'details': interface_details,
                 'checks': checks
             }
-        except Exception:
-            return {'status': 'OK'}
+        except Exception as e:
+            print(f"[HealthMonitor] Network check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}}

     def _check_network_latency(self) -> Optional[Dict[str, Any]]:
         """Check network latency to 1.1.1.1 (cached)"""
@@ -1237,15 +1323,31 @@ class HealthMonitor:
                 except:
                     pass

-            # If ping failed (timeout, unreachable)
+            # If ping failed (timeout, unreachable) - distinguish the reason
+            stderr_lower = (result.stderr or '').lower() if hasattr(result, 'stderr') else ''
+            if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower:
+                fail_reason = 'Network unreachable - no route to 1.1.1.1'
+            elif result.returncode == 1:
+                fail_reason = 'Packet loss to 1.1.1.1 (100% loss)'
+            else:
+                fail_reason = f'Ping failed (exit code {result.returncode})'
+
             packet_loss_result = {
                 'status': 'CRITICAL',
-                'reason': 'Packet loss or timeout to 1.1.1.1'
+                'reason': fail_reason
             }
             self.cached_results[cache_key] = packet_loss_result
             self.last_check_times[cache_key] = current_time
             return packet_loss_result

+        except subprocess.TimeoutExpired:
+            timeout_result = {
+                'status': 'WARNING',
+                'reason': f'Ping timeout (>{self.NETWORK_TIMEOUT}s) - possible high latency'
+            }
+            self.cached_results[cache_key] = timeout_result
+            self.last_check_times[cache_key] = current_time
+            return timeout_result
         except Exception:
             return {'status': 'UNKNOWN', 'reason': 'Ping command failed'}
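A rough standalone sketch of the new failure classification, using the conventional iputils-ping exit codes (0 = replies received, 1 = transmitted but no reply, 2 = other error such as no route); the PingResult stand-in and the sample inputs are illustrative, not part of the module:

from dataclasses import dataclass

@dataclass
class PingResult:              # stand-in for subprocess.CompletedProcess
    returncode: int
    stderr: str = ''

def classify_failure(result, target='1.1.1.1'):
    stderr_lower = (result.stderr or '').lower()
    if 'unreachable' in stderr_lower:
        return f'Network unreachable - no route to {target}'
    if result.returncode == 1:
        return f'Packet loss to {target} (100% loss)'
    return f'Ping failed (exit code {result.returncode})'

print(classify_failure(PingResult(2, 'connect: Network is unreachable')))
print(classify_failure(PingResult(1)))                    # 100% packet loss
print(classify_failure(PingResult(2, 'ping: unknown host')))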
@@ -1356,9 +1458,10 @@ class HealthMonitor:
                 'reason': '; '.join(issues[:3]),
                 'details': vm_details
             }
-        except Exception:
-            return {'status': 'OK'}
+        except Exception as e:
+            print(f"[HealthMonitor] VMs/CTs check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}}

     # Modified to use persistence
     def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
@@ -1462,7 +1565,12 @@ class HealthMonitor:
                 # Generic failed to start for VMs and CTs
                 if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
-                    id_match = re.search(r'\b(\d{3,5})\b', line)  # Increased digit count for wider match
+                    # Try contextual VMID patterns first (more precise), then fallback to generic
+                    id_match = (
+                        re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or
+                        re.search(r'\b(\d{3,5})\.conf\b', line) or
+                        re.search(r'\b(\d{3,5})\b', line)
+                    )
                     if id_match:
                         vmid_ctid = id_match.group(1)
                         # Determine if it's a VM or CT based on context, if possible
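The three patterns are tried in decreasing order of precision, so a bare 3-5 digit number only wins when neither a contextual keyword nor a .conf reference is present. A quick check against made-up journal lines (hypothetical, for illustration only):

import re

def extract_vmid(line):
    """Contextual VMID patterns first, then a bare 3-5 digit number as a last resort."""
    m = (
        re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or
        re.search(r'\b(\d{3,5})\.conf\b', line) or
        re.search(r'\b(\d{3,5})\b', line)
    )
    return m.group(1) if m else None

# Hypothetical log lines:
print(extract_vmid("VM 104 failed to start"))                        # 104  (keyword pattern)
print(extract_vmid("activation failed for 1205.conf"))               # 1205 (.conf pattern)
print(extract_vmid("start error: task 31022 exited with code 255"))  # 31022 (generic fallback)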
@@ -1521,8 +1629,9 @@ class HealthMonitor:
                 'checks': checks
             }
-        except Exception:
-            return {'status': 'OK', 'checks': {}}
+        except Exception as e:
+            print(f"[HealthMonitor] VMs/CTs persistence check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}}

     def _check_pve_services(self) -> Dict[str, Any]:
         """
@@ -1588,7 +1697,7 @@ class HealthMonitor:
                     error_key = f'pve_service_{svc}'
                     health_persistence.record_error(
                         error_key=error_key,
-                        category='services',
+                        category='pve_services',
                         severity='CRITICAL',
                         reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}',
                         details={'service': svc, 'state': service_details.get(svc, 'inactive')}
@@ -1932,9 +2041,8 @@ class HealthMonitor:
             return ok_result

         except Exception as e:
-            # Log the exception but return OK to avoid alert storms on check failure
-            print(f"[HealthMonitor] Error checking logs: {e}")
-            return {'status': 'OK'}
+            print(f"[HealthMonitor] Log check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Log check unavailable: {str(e)}', 'checks': {}}

     def _normalize_log_pattern(self, line: str) -> str:
         """
@@ -1984,12 +2092,21 @@ class HealthMonitor:
                     pass  # Ignore if mtime fails

             # Perform a dry run of apt-get upgrade to see pending packages
-            result = subprocess.run(
-                ['apt-get', 'upgrade', '--dry-run'],
-                capture_output=True,
-                text=True,
-                timeout=5  # Increased timeout for safety
-            )
+            try:
+                result = subprocess.run(
+                    ['apt-get', 'upgrade', '--dry-run'],
+                    capture_output=True,
+                    text=True,
+                    timeout=10
+                )
+            except subprocess.TimeoutExpired:
+                print("[HealthMonitor] apt-get upgrade --dry-run timed out")
+                return {
+                    'status': 'UNKNOWN',
+                    'reason': 'apt-get timed out - repository may be unreachable',
+                    'count': 0,
+                    'checks': {}
+                }

             status = 'OK'
             reason = None
@@ -2112,8 +2229,8 @@ class HealthMonitor:
             return update_result

         except Exception as e:
-            print(f"[HealthMonitor] Error checking updates: {e}")
-            return {'status': 'OK', 'count': 0, 'checks': {}}
+            print(f"[HealthMonitor] Updates check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Updates check unavailable: {str(e)}', 'count': 0, 'checks': {}}

     def _check_fail2ban_bans(self) -> Dict[str, Any]:
         """
@@ -2356,8 +2473,8 @@ class HealthMonitor:
             }

         except Exception as e:
-            print(f"[HealthMonitor] Error checking security: {e}")
-            return {'status': 'OK', 'checks': {}}
+            print(f"[HealthMonitor] Security check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Security check unavailable: {str(e)}', 'checks': {}}

     def _check_certificates(self) -> Optional[Dict[str, Any]]:
         """

View File

@@ -17,6 +17,7 @@ Version: 1.1
 import sqlite3
 import json
 import os
+import threading
 from datetime import datetime, timedelta
 from typing import Dict, List, Any, Optional
 from pathlib import Path
@@ -52,11 +53,19 @@ class HealthPersistence:
         self.data_dir.mkdir(parents=True, exist_ok=True)
         self.db_path = self.data_dir / 'health_monitor.db'
+        self._db_lock = threading.Lock()
         self._init_database()

+    def _get_conn(self) -> sqlite3.Connection:
+        """Get a SQLite connection with timeout and WAL mode for safe concurrency."""
+        conn = sqlite3.connect(str(self.db_path), timeout=10)
+        conn.execute('PRAGMA journal_mode=WAL')
+        conn.execute('PRAGMA busy_timeout=5000')
+        return conn
+
     def _init_database(self):
         """Initialize SQLite database with required tables"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         # Errors table
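The WAL journal plus a busy timeout is what lets the vital-signs sampler, the collector loop, and API requests touch the same database without immediate "database is locked" errors. A small self-contained sketch of the same connection setup, exercised from two writer threads against a throwaway database (the path and row counts are illustrative):

import os
import sqlite3
import tempfile
import threading

db_path = os.path.join(tempfile.mkdtemp(), 'demo.db')

def get_conn():
    conn = sqlite3.connect(db_path, timeout=10)
    conn.execute('PRAGMA journal_mode=WAL')   # readers no longer block writers
    conn.execute('PRAGMA busy_timeout=5000')  # wait up to 5s instead of failing immediately
    return conn

setup = get_conn()
setup.execute('CREATE TABLE IF NOT EXISTS events (id INTEGER PRIMARY KEY, note TEXT)')
setup.commit()
setup.close()

def writer(label):
    # Each thread opens its own connections; WAL + busy_timeout absorb contention
    for i in range(50):
        conn = get_conn()
        conn.execute('INSERT INTO events (note) VALUES (?)', (f'{label}-{i}',))
        conn.commit()
        conn.close()

threads = [threading.Thread(target=writer, args=(f'thread{n}',)) for n in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

check = get_conn()
print(check.execute('SELECT COUNT(*) FROM events').fetchone()[0])  # 100, with no lock errors
check.close()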
@@ -126,7 +135,11 @@ class HealthPersistence:
         Record or update an error.
         Returns event info (new_error, updated, etc.)
         """
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._record_error_impl(error_key, category, severity, reason, details)
+
+    def _record_error_impl(self, error_key, category, severity, reason, details):
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now().isoformat()
@@ -262,7 +275,11 @@ class HealthPersistence:
     def resolve_error(self, error_key: str, reason: str = 'auto-resolved'):
         """Mark an error as resolved"""
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._resolve_error_impl(error_key, reason)
+
+    def _resolve_error_impl(self, error_key, reason):
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now().isoformat()
@@ -284,7 +301,7 @@ class HealthPersistence:
         Check if an error is currently active (unresolved and not acknowledged).
         Used by checks to avoid re-recording errors that are already tracked.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         if category:
@@ -314,7 +331,7 @@ class HealthPersistence:
         we delete the record entirely so it can re-trigger as a fresh
         event if the condition returns later.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now().isoformat()
@@ -353,7 +370,11 @@ class HealthPersistence:
         - Stores suppression_hours on the error record (snapshot at dismiss time)
         - Marks as acknowledged so it won't re-appear during the suppression period
         """
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._acknowledge_error_impl(error_key)
+
+    def _acknowledge_error_impl(self, error_key):
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
@@ -408,7 +429,7 @@ class HealthPersistence:
     def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
         """Get all active (unresolved) errors, optionally filtered by category"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
@@ -439,7 +460,11 @@ class HealthPersistence:
     def cleanup_old_errors(self):
         """Clean up old resolved errors and auto-resolve stale errors"""
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._cleanup_old_errors_impl()
+
+    def _cleanup_old_errors_impl(self):
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now()
@@ -519,7 +544,7 @@ class HealthPersistence:
         Get errors that were acknowledged/dismissed but still within suppression period.
         These are shown as INFO in the frontend with a 'Dismissed' badge.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
@@ -584,7 +609,7 @@ class HealthPersistence:
         - 'resolved': error resolved
         - 'escalated': severity increased
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         event_data = data or {}
@@ -608,7 +633,7 @@ class HealthPersistence:
         Get events that need notification (for future Telegram/Gotify integration).
         Groups by severity for batch notification sending.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
@@ -641,7 +666,7 @@ class HealthPersistence:
         if not event_ids:
             return

-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         for event_id in event_ids:
@@ -663,7 +688,7 @@ class HealthPersistence:
     def get_unnotified_errors(self) -> List[Dict[str, Any]]:
         """Get errors that need Telegram notification"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()
@@ -689,7 +714,7 @@ class HealthPersistence:
     def mark_notified(self, error_key: str):
         """Mark error as notified"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         cursor.execute('''
@@ -708,7 +733,7 @@ class HealthPersistence:
         Get a cached system capability value.
         Returns None if not yet detected.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute(
             'SELECT cap_value FROM system_capabilities WHERE cap_key = ?',
@@ -720,7 +745,7 @@ class HealthPersistence:
     def set_capability(self, cap_key: str, cap_value: str):
         """Store a system capability value (detected once, cached forever)."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute('''
             INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at)
@@ -731,7 +756,7 @@ class HealthPersistence:
     def get_all_capabilities(self) -> Dict[str, str]:
         """Get all cached system capabilities as a dict."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute('SELECT cap_key, cap_value FROM system_capabilities')
         rows = cursor.fetchall()
@@ -747,7 +772,7 @@ class HealthPersistence:
     def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
         """Get a user setting value by key."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute(
             'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
@@ -758,18 +783,19 @@ class HealthPersistence:
     def set_setting(self, key: str, value: str):
         """Store a user setting value."""
-        conn = sqlite3.connect(str(self.db_path))
-        cursor = conn.cursor()
-        cursor.execute('''
-            INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
-            VALUES (?, ?, ?)
-        ''', (key, value, datetime.now().isoformat()))
-        conn.commit()
-        conn.close()
+        with self._db_lock:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+            cursor.execute('''
+                INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
+                VALUES (?, ?, ?)
+            ''', (key, value, datetime.now().isoformat()))
+            conn.commit()
+            conn.close()

     def get_all_settings(self, prefix: Optional[str] = None) -> Dict[str, str]:
         """Get all user settings, optionally filtered by key prefix."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         if prefix:
             cursor.execute(
@@ -791,7 +817,7 @@ class HealthPersistence:
         For each dismissed error, looks up its category's configured hours
         and updates the suppression_hours column to match.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         # Build reverse map: category -> setting_key
@@ -882,6 +908,51 @@ class HealthPersistence:
""" """
all_cats = self.get_suppression_categories() all_cats = self.get_suppression_categories()
return [c for c in all_cats if c['hours'] != self.DEFAULT_SUPPRESSION_HOURS] return [c for c in all_cats if c['hours'] != self.DEFAULT_SUPPRESSION_HOURS]
def record_unknown_persistent(self, category: str, reason: str):
"""
Record a persistent UNKNOWN event when a health check has been
unable to verify for >= 3 consecutive cycles (~15 min).
Avoids duplicates by only recording once per 30 min per category.
"""
with self._db_lock:
self._record_unknown_persistent_impl(category, reason)
def _record_unknown_persistent_impl(self, category, reason):
try:
event_key = f'unknown_persistent_{category}'
now = datetime.now().isoformat()
conn = self._get_conn()
cursor = conn.cursor()
# Check if we already recorded this within the last 30 minutes
# Note: events table has columns (id, event_type, error_key, timestamp, data)
# We use error_key for deduplication since it contains the category
cursor.execute('''
SELECT MAX(timestamp) FROM events
WHERE event_type = ? AND error_key = ?
''', ('unknown_persistent', event_key))
row = cursor.fetchone()
if row and row[0]:
try:
last_recorded = datetime.fromisoformat(row[0])
if (datetime.now() - last_recorded).total_seconds() < 1800:
conn.close()
return # Already recorded recently
except (ValueError, TypeError):
pass # If timestamp is malformed, proceed with recording
cursor.execute('''
INSERT INTO events (event_type, error_key, timestamp, data)
VALUES (?, ?, ?, ?)
''', ('unknown_persistent', event_key, now,
json.dumps({'category': category, 'reason': reason})))
conn.commit()
conn.close()
except Exception as e:
print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
# Global instance # Global instance
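Callers only go through the public method; the 30-minute window means repeated UNKNOWN cycles for the same category collapse into a single event row. A hedged usage sketch, assuming the module-level instance referenced above is importable as health_persistence (the exact module and instance names are an assumption):

from health_persistence import health_persistence  # assumed module-level instance

# Two consecutive collector cycles both report the network check as UNKNOWN:
health_persistence.record_unknown_persistent('network', 'Network check unavailable: timeout')
health_persistence.record_unknown_persistent('network', 'Network check unavailable: timeout')
# Only the first call inserts an 'unknown_persistent' event; the second finds a
# matching event newer than 30 minutes and returns without writing.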