Mirror of https://github.com/MacRimi/ProxMenux.git (synced 2026-02-18 16:36:27 +00:00)

Commit: Update health monitor
@@ -28,6 +28,7 @@ import {
   BellOff,
   ChevronRight,
   Settings2,
+  HelpCircle,
 } from "lucide-react"

 interface CategoryCheck {
@@ -207,6 +208,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
       return <AlertTriangle className={`${cls} text-yellow-500`} />
     case "CRITICAL":
       return <XCircle className={`${cls} text-red-500`} />
+    case "UNKNOWN":
+      return <HelpCircle className={`${cls} text-amber-400`} />
     default:
       return <Activity className={`${cls} text-muted-foreground`} />
   }
@@ -223,6 +226,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
       return <Badge className="bg-yellow-500 text-white hover:bg-yellow-500">Warning</Badge>
     case "CRITICAL":
       return <Badge className="bg-red-500 text-white hover:bg-red-500">Critical</Badge>
+    case "UNKNOWN":
+      return <Badge className="bg-amber-500 text-white hover:bg-amber-500">UNKNOWN</Badge>
     default:
       return <Badge>Unknown</Badge>
   }
@@ -230,13 +235,14 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu

   const getHealthStats = () => {
     if (!healthData?.details) {
-      return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0 }
+      return { total: 0, healthy: 0, info: 0, warnings: 0, critical: 0, unknown: 0 }
     }

     let healthy = 0
     let info = 0
     let warnings = 0
     let critical = 0
+    let unknown = 0

     CATEGORIES.forEach(({ key }) => {
       const categoryData = healthData.details[key as keyof typeof healthData.details]
@@ -246,10 +252,11 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
         else if (status === "INFO") info++
         else if (status === "WARNING") warnings++
         else if (status === "CRITICAL") critical++
+        else if (status === "UNKNOWN") unknown++
       }
     })

-    return { total: CATEGORIES.length, healthy, info, warnings, critical }
+    return { total: CATEGORIES.length, healthy, info, warnings, critical, unknown }
   }

   const stats = getHealthStats()
@@ -317,16 +324,18 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
     const s = status?.toUpperCase()
     if (s === "CRITICAL") return "bg-red-500/5 border-red-500/20 hover:bg-red-500/10 cursor-pointer"
     if (s === "WARNING") return "bg-yellow-500/5 border-yellow-500/20 hover:bg-yellow-500/10 cursor-pointer"
+    if (s === "UNKNOWN") return "bg-amber-500/5 border-amber-500/20 hover:bg-amber-500/10 cursor-pointer"
     if (s === "INFO") return "bg-blue-500/5 border-blue-500/20 hover:bg-blue-500/10"
     return "bg-card border-border hover:bg-muted/30"
   }

   const getOutlineBadgeStyle = (status: string) => {
     const s = status?.toUpperCase()
     if (s === "OK") return "border-green-500 text-green-500 bg-transparent"
     if (s === "INFO") return "border-blue-500 text-blue-500 bg-blue-500/5"
     if (s === "WARNING") return "border-yellow-500 text-yellow-500 bg-yellow-500/5"
     if (s === "CRITICAL") return "border-red-500 text-red-500 bg-red-500/5"
+    if (s === "UNKNOWN") return "border-amber-400 text-amber-400 bg-amber-500/5"
     return ""
   }

@@ -502,6 +511,12 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
             <div className="text-lg sm:text-2xl font-bold text-red-500">{stats.critical}</div>
             <div className="text-[10px] sm:text-xs text-muted-foreground">Critical</div>
           </div>
+          {stats.unknown > 0 && (
+            <div className="text-center">
+              <div className="text-lg sm:text-2xl font-bold text-amber-400">{stats.unknown}</div>
+              <div className="text-[10px] sm:text-xs text-muted-foreground">Unknown</div>
+            </div>
+          )}
         </div>

         {healthData.summary && healthData.summary !== "All systems operational" && (
@@ -600,6 +600,47 @@ def _health_collector_loop():
             time.sleep(300)  # Every 5 minutes


+def _vital_signs_sampler():
+    """Dedicated thread for rapid CPU & temperature sampling.
+
+    Runs independently of the 5-min health collector loop.
+    - CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis)
+    - Temperature: sampled every 10s (18 samples in 3 min for temporal logic)
+    Uses time.monotonic() to avoid drift.
+    """
+    from health_monitor import health_monitor
+
+    # Wait 15s after startup for sensors to be ready
+    time.sleep(15)
+
+    TEMP_INTERVAL = 10  # seconds
+    CPU_INTERVAL = 30   # seconds
+
+    next_temp = time.monotonic()
+    next_cpu = time.monotonic()
+
+    print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)")
+
+    while True:
+        try:
+            now = time.monotonic()
+
+            if now >= next_temp:
+                health_monitor._sample_cpu_temperature()
+                next_temp = now + TEMP_INTERVAL
+
+            if now >= next_cpu:
+                health_monitor._sample_cpu_usage()
+                next_cpu = now + CPU_INTERVAL
+
+            # Sleep until the next earliest event (with 0.5s min to avoid busy-loop)
+            sleep_until = min(next_temp, next_cpu) - time.monotonic()
+            time.sleep(max(sleep_until, 0.5))
+        except Exception as e:
+            print(f"[ProxMenux] Vital signs sampler error: {e}")
+            time.sleep(10)
+
+
 def get_uptime():
     """Get system uptime in a human-readable format."""
     try:
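A minimal, self-contained sketch of the deadline-based scheduling pattern the new sampler uses; the sample_* functions below are hypothetical stand-ins for the health_monitor calls, and the intervals mirror the diff. Re-arming each deadline from "now" rather than from the old deadline means a slow iteration simply delays the next sample instead of producing a catch-up burst:

    import time

    def sample_temp():          # hypothetical stand-in for health_monitor._sample_cpu_temperature()
        print("temp sample")

    def sample_cpu():           # hypothetical stand-in for health_monitor._sample_cpu_usage()
        print("cpu sample")

    def run_sampler(temp_interval=10, cpu_interval=30):
        # Deadlines live in time.monotonic() terms, so wall-clock jumps (NTP, DST)
        # can neither drift the schedule nor double-fire it.
        next_temp = next_cpu = time.monotonic()
        while True:
            now = time.monotonic()
            if now >= next_temp:
                sample_temp()
                next_temp = now + temp_interval   # re-arm from "now", not the old deadline
            if now >= next_cpu:
                sample_cpu()
                next_cpu = now + cpu_interval
            # Sleep only until the earliest pending deadline; the 0.5s floor avoids a busy loop.
            time.sleep(max(min(next_temp, next_cpu) - time.monotonic(), 0.5))

    # run_sampler()  # runs forever; in the commit this lives in a daemon thread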
@@ -7046,6 +7087,13 @@ if __name__ == '__main__':
     except Exception as e:
         print(f"[ProxMenux] Background health monitor failed to start: {e}")

+    # ── Vital Signs Sampler (rapid CPU + Temperature) ──
+    try:
+        vital_thread = threading.Thread(target=_vital_signs_sampler, daemon=True)
+        vital_thread.start()
+    except Exception as e:
+        print(f"[ProxMenux] Vital signs sampler failed to start: {e}")
+
     # Check for SSL configuration
     ssl_ctx = None
     try:
@@ -60,7 +60,7 @@ class HealthMonitor:
     # Network Thresholds
     NETWORK_LATENCY_WARNING = 100
     NETWORK_LATENCY_CRITICAL = 300
-    NETWORK_TIMEOUT = 0.9
+    NETWORK_TIMEOUT = 2
     NETWORK_INACTIVE_DURATION = 600

     # Log Thresholds
@@ -142,6 +142,7 @@ class HealthMonitor:
         self.io_error_history = defaultdict(list)
         self.failed_vm_history = set()  # Track VMs that failed to start
         self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
+        self._unknown_counts = {}  # Track consecutive UNKNOWN cycles per category

         # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
         # SMART detection still uses filesystem check on init (lightweight)
@@ -153,6 +154,63 @@ class HealthMonitor:
         except Exception as e:
             print(f"[HealthMonitor] Cleanup warning: {e}")

+    # ─── Lightweight sampling methods for the dedicated vital-signs thread ───
+    # These ONLY append data to state_history without triggering evaluation,
+    # persistence, or subprocess-heavy operations.
+
+    def _sample_cpu_usage(self):
+        """Lightweight CPU sample: read usage % and append to history. ~30ms cost."""
+        try:
+            cpu_percent = psutil.cpu_percent(interval=0)
+            current_time = time.time()
+            state_key = 'cpu_usage'
+            self.state_history[state_key].append({
+                'value': cpu_percent,
+                'time': current_time
+            })
+            # Prune entries older than 6 minutes
+            self.state_history[state_key] = [
+                e for e in self.state_history[state_key]
+                if current_time - e['time'] < 360
+            ]
+        except Exception:
+            pass  # Sampling must never crash the thread
+
+    def _sample_cpu_temperature(self):
+        """Lightweight temperature sample: read sensor and append to history. ~50ms cost."""
+        try:
+            result = subprocess.run(
+                ['sensors', '-A', '-u'],
+                capture_output=True, text=True, timeout=2
+            )
+            if result.returncode != 0:
+                return
+
+            temps = []
+            for line in result.stdout.split('\n'):
+                if 'temp' in line.lower() and '_input' in line:
+                    try:
+                        temp = float(line.split(':')[1].strip())
+                        temps.append(temp)
+                    except Exception:
+                        continue
+
+            if temps:
+                max_temp = max(temps)
+                current_time = time.time()
+                state_key = 'cpu_temp_history'
+                self.state_history[state_key].append({
+                    'value': max_temp,
+                    'time': current_time
+                })
+                # Prune entries older than 4 minutes
+                self.state_history[state_key] = [
+                    e for e in self.state_history[state_key]
+                    if current_time - e['time'] < 240
+                ]
+        except Exception:
+            pass  # Sampling must never crash the thread
+
     def get_system_info(self) -> Dict[str, Any]:
         """
         Get lightweight system info for header display.
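Both sampler methods above keep short rolling windows rather than unbounded lists. A sketch of that append-then-prune idiom, with the 360-second window taken from the CPU history above and the other names purely illustrative:

    import time

    def append_sample(history, value, window_seconds=360):
        """Append one reading and drop everything older than the window."""
        now = time.time()
        history.append({'value': value, 'time': now})
        # Rebuilding the list keeps the prune a single assignment for the caller.
        return [e for e in history if now - e['time'] < window_seconds]

    cpu_history = []
    for reading in (12.5, 47.0, 33.1):
        cpu_history = append_sample(cpu_history, reading)
    print(len(cpu_history), max(e['value'] for e in cpu_history))   # 3 47.0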
@@ -377,14 +435,34 @@ class HealthMonitor:
             elif security_status.get('status') == 'INFO':
                 info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")

+        # --- Track UNKNOWN counts and persist if >= 3 consecutive cycles ---
+        unknown_issues = []
+        for cat_key, cat_data in details.items():
+            cat_status = cat_data.get('status', 'OK')
+            if cat_status == 'UNKNOWN':
+                count = self._unknown_counts.get(cat_key, 0) + 1
+                self._unknown_counts[cat_key] = min(count, 10)  # Cap to avoid unbounded growth
+                unknown_issues.append(f"{cat_key}: {cat_data.get('reason', 'Check unavailable')}")
+                if count == 3:  # Only persist on the exact 3rd cycle, not every cycle after
+                    try:
+                        health_persistence.record_unknown_persistent(
+                            cat_key, cat_data.get('reason', 'Check unavailable'))
+                    except Exception:
+                        pass
+            else:
+                self._unknown_counts[cat_key] = 0
+
         # --- Determine Overall Status ---
-        # Use a fixed order of severity: CRITICAL > WARNING > INFO > OK
+        # Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK
         if critical_issues:
             overall = 'CRITICAL'
-            summary = '; '.join(critical_issues[:3])  # Limit summary to 3 issues
+            summary = '; '.join(critical_issues[:3])
         elif warning_issues:
             overall = 'WARNING'
             summary = '; '.join(warning_issues[:3])
+        elif unknown_issues:
+            overall = 'WARNING'  # UNKNOWN caps at WARNING, never escalates to CRITICAL
+            summary = '; '.join(unknown_issues[:3])
         elif info_issues:
             overall = 'OK'  # INFO statuses don't degrade overall health
             summary = '; '.join(info_issues[:3])
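A condensed sketch of the aggregation rules this hunk introduces: an UNKNOWN category is counted per cycle, persisted only on its third consecutive cycle, and can degrade the overall status to WARNING at most. The names and the persist callback below are illustrative, not the module's real API:

    unknown_counts = {}

    def track_unknown(category, status, persist):
        """Persist an UNKNOWN only on its 3rd consecutive cycle (~15 min at 5-min cycles)."""
        if status == 'UNKNOWN':
            unknown_counts[category] = min(unknown_counts.get(category, 0) + 1, 10)
            if unknown_counts[category] == 3:
                persist(category)
        else:
            unknown_counts[category] = 0

    def overall_status(per_category):
        """Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK."""
        statuses = set(per_category.values())
        if 'CRITICAL' in statuses:
            return 'CRITICAL'
        if 'WARNING' in statuses or 'UNKNOWN' in statuses:   # UNKNOWN never escalates past WARNING
            return 'WARNING'
        return 'OK'   # INFO alone does not degrade overall health

    print(overall_status({'disk': 'OK', 'network': 'UNKNOWN'}))   # WARNING
    print(overall_status({'disk': 'INFO', 'network': 'OK'}))      # OK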
@@ -444,13 +522,17 @@ class HealthMonitor:
             current_time = time.time()

             state_key = 'cpu_usage'
+            # Add this reading as well (supplements the sampler thread)
             self.state_history[state_key].append({
                 'value': cpu_percent,
                 'time': current_time
             })

+            # Snapshot the list for thread-safe reading (sampler may append concurrently)
+            cpu_snapshot = list(self.state_history[state_key])
+            # Prune old entries via snapshot replacement (atomic assignment)
             self.state_history[state_key] = [
-                entry for entry in self.state_history[state_key]
+                entry for entry in cpu_snapshot
                 if current_time - entry['time'] < 360
             ]

@@ -517,8 +599,8 @@ class HealthMonitor:
                 }
             else:
                 checks['cpu_temperature'] = {
-                    'status': 'OK',
-                    'detail': 'Sensor not available',
+                    'status': 'INFO',
+                    'detail': 'No temperature sensor detected - install lm-sensors if hardware supports it',
                 }

         result['checks'] = checks
@@ -564,14 +646,16 @@ class HealthMonitor:
                 max_temp = max(temps)

                 state_key = 'cpu_temp_history'
+                # Add this reading (supplements the sampler thread)
                 self.state_history[state_key].append({
                     'value': max_temp,
                     'time': current_time
                 })

-                # Keep last 4 minutes of data (240 seconds)
+                # Snapshot for thread-safe reading, then atomic prune
+                temp_snapshot = list(self.state_history[state_key])
                 self.state_history[state_key] = [
-                    entry for entry in self.state_history[state_key]
+                    entry for entry in temp_snapshot
                     if current_time - entry['time'] < 240
                 ]

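Both pruning sites above switch to a snapshot-then-reassign idiom because the new sampler thread may append while the 5-minute collector prunes. A rough sketch of the idea, assuming CPython, where list.append and rebinding a name are each effectively atomic under the GIL; a handful of appends landing between the snapshot and the rebind may be dropped, which the monitor tolerates:

    import threading
    import time

    history = []

    def sampler():
        # Concurrent appender, as the vital-signs thread does for state_history.
        for i in range(1000):
            history.append({'value': i, 'time': time.time()})

    t = threading.Thread(target=sampler, daemon=True)
    t.start()

    now = time.time()
    snapshot = list(history)                                    # consistent copy while appends continue
    history = [e for e in snapshot if now - e['time'] < 360]    # one atomic rebind
    t.join()
    print(len(history) <= 1000)   # True; late appends either go to the new list or are dropped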
@@ -1058,9 +1142,10 @@ class HealthMonitor:
                     'reason': f"{len(disk_issues)} disk(s) with recent errors",
                     'details': disk_issues
                 }

-        except Exception:
-            return {'status': 'OK'}
+        except Exception as e:
+            print(f"[HealthMonitor] Disk/IO check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Disk check unavailable: {str(e)}', 'checks': {}}

     def _check_network_optimized(self) -> Dict[str, Any]:
         """
@@ -1186,9 +1271,10 @@ class HealthMonitor:
                 'details': interface_details,
                 'checks': checks
             }

-        except Exception:
-            return {'status': 'OK'}
+        except Exception as e:
+            print(f"[HealthMonitor] Network check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}}

     def _check_network_latency(self) -> Optional[Dict[str, Any]]:
         """Check network latency to 1.1.1.1 (cached)"""
@@ -1237,15 +1323,31 @@ class HealthMonitor:
             except:
                 pass

-            # If ping failed (timeout, unreachable)
+            # If ping failed (timeout, unreachable) - distinguish the reason
+            stderr_lower = (result.stderr or '').lower() if hasattr(result, 'stderr') else ''
+            if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower:
+                fail_reason = 'Network unreachable - no route to 1.1.1.1'
+            elif result.returncode == 1:
+                fail_reason = 'Packet loss to 1.1.1.1 (100% loss)'
+            else:
+                fail_reason = f'Ping failed (exit code {result.returncode})'
+
             packet_loss_result = {
                 'status': 'CRITICAL',
-                'reason': 'Packet loss or timeout to 1.1.1.1'
+                'reason': fail_reason
             }
             self.cached_results[cache_key] = packet_loss_result
             self.last_check_times[cache_key] = current_time
             return packet_loss_result

+        except subprocess.TimeoutExpired:
+            timeout_result = {
+                'status': 'WARNING',
+                'reason': f'Ping timeout (>{self.NETWORK_TIMEOUT}s) - possible high latency'
+            }
+            self.cached_results[cache_key] = timeout_result
+            self.last_check_times[cache_key] = current_time
+            return timeout_result
         except Exception:
             return {'status': 'UNKNOWN', 'reason': 'Ping command failed'}

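A sketch of the ping-outcome classification added above. The exact ping invocation is not shown in the diff, so the flags below (-c count, -W per-reply wait, from iputils ping) are an assumption; the status strings and the returncode-1 interpretation mirror the hunk:

    import subprocess

    def classify_ping(target='1.1.1.1', timeout=2):
        try:
            result = subprocess.run(
                ['ping', '-c', '1', '-W', '1', target],   # flags assumed, not taken from the diff
                capture_output=True, text=True, timeout=timeout
            )
        except subprocess.TimeoutExpired:
            return 'WARNING', f'Ping timeout (>{timeout}s) - possible high latency'
        if result.returncode == 0:
            return 'OK', 'reply received'
        stderr_lower = (result.stderr or '').lower()
        if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower:
            return 'CRITICAL', f'Network unreachable - no route to {target}'
        if result.returncode == 1:   # iputils: packets sent but no reply
            return 'CRITICAL', f'Packet loss to {target} (100% loss)'
        return 'CRITICAL', f'Ping failed (exit code {result.returncode})'

    print(classify_ping())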
@@ -1356,9 +1458,10 @@ class HealthMonitor:
                 'reason': '; '.join(issues[:3]),
                 'details': vm_details
             }

-        except Exception:
-            return {'status': 'OK'}
+        except Exception as e:
+            print(f"[HealthMonitor] VMs/CTs check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}}

     # Modified to use persistence
     def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
@@ -1462,7 +1565,12 @@ class HealthMonitor:

                 # Generic failed to start for VMs and CTs
                 if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
-                    id_match = re.search(r'\b(\d{3,5})\b', line)  # Increased digit count for wider match
+                    # Try contextual VMID patterns first (more precise), then fallback to generic
+                    id_match = (
+                        re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or
+                        re.search(r'\b(\d{3,5})\.conf\b', line) or
+                        re.search(r'\b(\d{3,5})\b', line)
+                    )
                     if id_match:
                         vmid_ctid = id_match.group(1)
                         # Determine if it's a VM or CT based on context, if possible
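The new ID extraction tries context-anchored patterns before the bare 3-5 digit fallback. A small demonstration on made-up log lines of why the ordering matters:

    import re

    def extract_vmid(line):
        match = (
            re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or
            re.search(r'\b(\d{3,5})\.conf\b', line) or   # e.g. .../qemu-server/105.conf
            re.search(r'\b(\d{3,5})\b', line)            # last resort: any 3-5 digit number
        )
        return match.group(1) if match else None

    # Illustrative (made-up) log lines:
    print(extract_vmid("start error after 3000 ms for VMID 105"))        # 105, not 3000
    print(extract_vmid("activation failed for volume backing 305.conf"))  # 305 via the .conf rule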
@@ -1521,8 +1629,9 @@ class HealthMonitor:
                 'checks': checks
             }

-        except Exception:
-            return {'status': 'OK', 'checks': {}}
+        except Exception as e:
+            print(f"[HealthMonitor] VMs/CTs persistence check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}}

     def _check_pve_services(self) -> Dict[str, Any]:
         """
@@ -1588,7 +1697,7 @@ class HealthMonitor:
                     error_key = f'pve_service_{svc}'
                     health_persistence.record_error(
                         error_key=error_key,
-                        category='services',
+                        category='pve_services',
                         severity='CRITICAL',
                         reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}',
                         details={'service': svc, 'state': service_details.get(svc, 'inactive')}
@@ -1932,9 +2041,8 @@ class HealthMonitor:
             return ok_result

         except Exception as e:
-            # Log the exception but return OK to avoid alert storms on check failure
-            print(f"[HealthMonitor] Error checking logs: {e}")
-            return {'status': 'OK'}
+            print(f"[HealthMonitor] Log check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Log check unavailable: {str(e)}', 'checks': {}}

     def _normalize_log_pattern(self, line: str) -> str:
         """
@@ -1984,12 +2092,21 @@ class HealthMonitor:
                 pass  # Ignore if mtime fails

             # Perform a dry run of apt-get upgrade to see pending packages
-            result = subprocess.run(
-                ['apt-get', 'upgrade', '--dry-run'],
-                capture_output=True,
-                text=True,
-                timeout=5  # Increased timeout for safety
-            )
+            try:
+                result = subprocess.run(
+                    ['apt-get', 'upgrade', '--dry-run'],
+                    capture_output=True,
+                    text=True,
+                    timeout=10
+                )
+            except subprocess.TimeoutExpired:
+                print("[HealthMonitor] apt-get upgrade --dry-run timed out")
+                return {
+                    'status': 'UNKNOWN',
+                    'reason': 'apt-get timed out - repository may be unreachable',
+                    'count': 0,
+                    'checks': {}
+                }

             status = 'OK'
             reason = None
@@ -2112,8 +2229,8 @@ class HealthMonitor:
             return update_result

         except Exception as e:
-            print(f"[HealthMonitor] Error checking updates: {e}")
-            return {'status': 'OK', 'count': 0, 'checks': {}}
+            print(f"[HealthMonitor] Updates check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Updates check unavailable: {str(e)}', 'count': 0, 'checks': {}}

     def _check_fail2ban_bans(self) -> Dict[str, Any]:
         """
@@ -2356,8 +2473,8 @@ class HealthMonitor:
             }

         except Exception as e:
-            print(f"[HealthMonitor] Error checking security: {e}")
-            return {'status': 'OK', 'checks': {}}
+            print(f"[HealthMonitor] Security check failed: {e}")
+            return {'status': 'UNKNOWN', 'reason': f'Security check unavailable: {str(e)}', 'checks': {}}

     def _check_certificates(self) -> Optional[Dict[str, Any]]:
         """
@@ -17,6 +17,7 @@ Version: 1.1
 import sqlite3
 import json
 import os
+import threading
 from datetime import datetime, timedelta
 from typing import Dict, List, Any, Optional
 from pathlib import Path
@@ -52,11 +53,19 @@ class HealthPersistence:
         self.data_dir.mkdir(parents=True, exist_ok=True)

         self.db_path = self.data_dir / 'health_monitor.db'
+        self._db_lock = threading.Lock()
         self._init_database()

+    def _get_conn(self) -> sqlite3.Connection:
+        """Get a SQLite connection with timeout and WAL mode for safe concurrency."""
+        conn = sqlite3.connect(str(self.db_path), timeout=10)
+        conn.execute('PRAGMA journal_mode=WAL')
+        conn.execute('PRAGMA busy_timeout=5000')
+        return conn
+
     def _init_database(self):
         """Initialize SQLite database with required tables"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         # Errors table
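The _get_conn() helper turns on WAL and a busy timeout so the collector thread, the new sampler-driven writes, and API handlers can touch the database concurrently without failing fast with "database is locked". A self-contained sketch of the same connection settings; the file path and table are illustrative, not the ones the commit uses:

    import sqlite3
    import threading

    DB = '/tmp/wal_demo.db'   # illustrative path

    def get_conn():
        conn = sqlite3.connect(DB, timeout=10)
        conn.execute('PRAGMA journal_mode=WAL')    # readers no longer block the writer
        conn.execute('PRAGMA busy_timeout=5000')   # wait up to 5s instead of failing immediately
        return conn

    def writer():
        conn = get_conn()
        conn.execute('CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)')
        for i in range(100):
            conn.execute('INSERT OR REPLACE INTO kv VALUES (?, ?)', ('counter', str(i)))
            conn.commit()
        conn.close()

    w = threading.Thread(target=writer)
    w.start()
    w.join()

    reader = get_conn()   # separate connection, as each HealthPersistence call opens its own
    print(reader.execute('SELECT v FROM kv WHERE k = ?', ('counter',)).fetchone())   # ('99',)
    reader.close()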
@@ -126,7 +135,11 @@ class HealthPersistence:
         Record or update an error.
         Returns event info (new_error, updated, etc.)
         """
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._record_error_impl(error_key, category, severity, reason, details)
+
+    def _record_error_impl(self, error_key, category, severity, reason, details):
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now().isoformat()
@@ -262,7 +275,11 @@ class HealthPersistence:

     def resolve_error(self, error_key: str, reason: str = 'auto-resolved'):
         """Mark an error as resolved"""
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._resolve_error_impl(error_key, reason)
+
+    def _resolve_error_impl(self, error_key, reason):
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now().isoformat()
@@ -284,7 +301,7 @@ class HealthPersistence:
         Check if an error is currently active (unresolved and not acknowledged).
         Used by checks to avoid re-recording errors that are already tracked.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         if category:
@@ -314,7 +331,7 @@ class HealthPersistence:
         we delete the record entirely so it can re-trigger as a fresh
         event if the condition returns later.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now().isoformat()
@@ -353,7 +370,11 @@ class HealthPersistence:
         - Stores suppression_hours on the error record (snapshot at dismiss time)
         - Marks as acknowledged so it won't re-appear during the suppression period
         """
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._acknowledge_error_impl(error_key)
+
+    def _acknowledge_error_impl(self, error_key):
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()

@@ -408,7 +429,7 @@ class HealthPersistence:

     def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
         """Get all active (unresolved) errors, optionally filtered by category"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()

@@ -439,7 +460,11 @@ class HealthPersistence:

     def cleanup_old_errors(self):
         """Clean up old resolved errors and auto-resolve stale errors"""
-        conn = sqlite3.connect(str(self.db_path))
+        with self._db_lock:
+            return self._cleanup_old_errors_impl()
+
+    def _cleanup_old_errors_impl(self):
+        conn = self._get_conn()
         cursor = conn.cursor()

         now = datetime.now()
@@ -519,7 +544,7 @@ class HealthPersistence:
         Get errors that were acknowledged/dismissed but still within suppression period.
         These are shown as INFO in the frontend with a 'Dismissed' badge.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()

@@ -584,7 +609,7 @@ class HealthPersistence:
         - 'resolved': error resolved
         - 'escalated': severity increased
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         event_data = data or {}
@@ -608,7 +633,7 @@ class HealthPersistence:
         Get events that need notification (for future Telegram/Gotify integration).
         Groups by severity for batch notification sending.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()

@@ -641,7 +666,7 @@ class HealthPersistence:
         if not event_ids:
             return

-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         for event_id in event_ids:
@@ -663,7 +688,7 @@ class HealthPersistence:

     def get_unnotified_errors(self) -> List[Dict[str, Any]]:
         """Get errors that need Telegram notification"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         conn.row_factory = sqlite3.Row
         cursor = conn.cursor()

@@ -689,7 +714,7 @@ class HealthPersistence:

     def mark_notified(self, error_key: str):
         """Mark error as notified"""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         cursor.execute('''
@@ -708,7 +733,7 @@ class HealthPersistence:
         Get a cached system capability value.
         Returns None if not yet detected.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute(
             'SELECT cap_value FROM system_capabilities WHERE cap_key = ?',
@@ -720,7 +745,7 @@ class HealthPersistence:

     def set_capability(self, cap_key: str, cap_value: str):
         """Store a system capability value (detected once, cached forever)."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute('''
             INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at)
@@ -731,7 +756,7 @@ class HealthPersistence:

     def get_all_capabilities(self) -> Dict[str, str]:
         """Get all cached system capabilities as a dict."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute('SELECT cap_key, cap_value FROM system_capabilities')
         rows = cursor.fetchall()
@@ -747,7 +772,7 @@ class HealthPersistence:

     def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
         """Get a user setting value by key."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         cursor.execute(
             'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
@@ -758,18 +783,19 @@ class HealthPersistence:

     def set_setting(self, key: str, value: str):
         """Store a user setting value."""
-        conn = sqlite3.connect(str(self.db_path))
-        cursor = conn.cursor()
-        cursor.execute('''
-            INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
-            VALUES (?, ?, ?)
-        ''', (key, value, datetime.now().isoformat()))
-        conn.commit()
-        conn.close()
+        with self._db_lock:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+            cursor.execute('''
+                INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
+                VALUES (?, ?, ?)
+            ''', (key, value, datetime.now().isoformat()))
+            conn.commit()
+            conn.close()

     def get_all_settings(self, prefix: Optional[str] = None) -> Dict[str, str]:
         """Get all user settings, optionally filtered by key prefix."""
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()
         if prefix:
             cursor.execute(
@@ -791,7 +817,7 @@ class HealthPersistence:
         For each dismissed error, looks up its category's configured hours
         and updates the suppression_hours column to match.
         """
-        conn = sqlite3.connect(str(self.db_path))
+        conn = self._get_conn()
         cursor = conn.cursor()

         # Build reverse map: category -> setting_key
@@ -882,6 +908,51 @@ class HealthPersistence:
         """
         all_cats = self.get_suppression_categories()
         return [c for c in all_cats if c['hours'] != self.DEFAULT_SUPPRESSION_HOURS]

+    def record_unknown_persistent(self, category: str, reason: str):
+        """
+        Record a persistent UNKNOWN event when a health check has been
+        unable to verify for >= 3 consecutive cycles (~15 min).
+        Avoids duplicates by only recording once per 30 min per category.
+        """
+        with self._db_lock:
+            self._record_unknown_persistent_impl(category, reason)
+
+    def _record_unknown_persistent_impl(self, category, reason):
+        try:
+            event_key = f'unknown_persistent_{category}'
+            now = datetime.now().isoformat()
+
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            # Check if we already recorded this within the last 30 minutes
+            # Note: events table has columns (id, event_type, error_key, timestamp, data)
+            # We use error_key for deduplication since it contains the category
+            cursor.execute('''
+                SELECT MAX(timestamp) FROM events
+                WHERE event_type = ? AND error_key = ?
+            ''', ('unknown_persistent', event_key))
+            row = cursor.fetchone()
+            if row and row[0]:
+                try:
+                    last_recorded = datetime.fromisoformat(row[0])
+                    if (datetime.now() - last_recorded).total_seconds() < 1800:
+                        conn.close()
+                        return  # Already recorded recently
+                except (ValueError, TypeError):
+                    pass  # If timestamp is malformed, proceed with recording
+
+            cursor.execute('''
+                INSERT INTO events (event_type, error_key, timestamp, data)
+                VALUES (?, ?, ?, ?)
+            ''', ('unknown_persistent', event_key, now,
+                  json.dumps({'category': category, 'reason': reason})))
+
+            conn.commit()
+            conn.close()
+        except Exception as e:
+            print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")


 # Global instance