mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 08:56:21 +00:00
update health_monitor.py
This commit is contained in:
@@ -74,6 +74,7 @@ export function ProxmoxDashboard() {
|
|||||||
serverName: "Loading...",
|
serverName: "Loading...",
|
||||||
nodeId: "Loading...",
|
nodeId: "Loading...",
|
||||||
})
|
})
|
||||||
|
const [isInitialLoading, setIsInitialLoading] = useState(true)
|
||||||
const [isRefreshing, setIsRefreshing] = useState(false)
|
const [isRefreshing, setIsRefreshing] = useState(false)
|
||||||
const [isServerConnected, setIsServerConnected] = useState(true)
|
const [isServerConnected, setIsServerConnected] = useState(true)
|
||||||
const [componentKey, setComponentKey] = useState(0)
|
const [componentKey, setComponentKey] = useState(0)
|
||||||
@@ -192,10 +193,10 @@ export function ProxmoxDashboard() {
|
|||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
// Siempre fetch inicial
|
// Siempre fetch inicial — mark loading done when system data arrives
|
||||||
fetchSystemData()
|
fetchSystemData().finally(() => setIsInitialLoading(false))
|
||||||
fetchHealthInfoCount() // Fetch info count on initial load
|
fetchHealthInfoCount()
|
||||||
fetchUpdateStatus() // Fetch ProxMenux update status on initial load
|
fetchUpdateStatus()
|
||||||
|
|
||||||
// En overview: cada 30 segundos para actualización frecuente del estado de salud
|
// En overview: cada 30 segundos para actualización frecuente del estado de salud
|
||||||
// En otras tabs: cada 60 segundos para reducir carga
|
// En otras tabs: cada 60 segundos para reducir carga
|
||||||
@@ -372,6 +373,21 @@ export function ProxmoxDashboard() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isInitialLoading) {
|
||||||
|
return (
|
||||||
|
<div className="min-h-screen bg-background flex items-center justify-center">
|
||||||
|
<div className="flex flex-col items-center gap-4">
|
||||||
|
<div className="relative">
|
||||||
|
<div className="h-12 w-12 rounded-full border-2 border-muted"></div>
|
||||||
|
<div className="absolute inset-0 h-12 w-12 rounded-full border-2 border-transparent border-t-primary animate-spin"></div>
|
||||||
|
</div>
|
||||||
|
<div className="text-sm font-medium text-foreground">Loading ProxMenux Monitor...</div>
|
||||||
|
<p className="text-xs text-muted-foreground">Connecting to server and fetching system status</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen bg-background">
|
<div className="min-h-screen bg-background">
|
||||||
<OnboardingCarousel />
|
<OnboardingCarousel />
|
||||||
|
|||||||
@@ -272,6 +272,8 @@ class HealthMonitor:
|
|||||||
# SMART check cache - reduces disk queries from every 5 min to every 30 min
|
# SMART check cache - reduces disk queries from every 5 min to every 30 min
|
||||||
self._smart_cache = {} # {disk_name: {'result': 'PASSED', 'time': timestamp}}
|
self._smart_cache = {} # {disk_name: {'result': 'PASSED', 'time': timestamp}}
|
||||||
self._SMART_CACHE_TTL = 1620 # 27 min - offset to avoid sync with other processes
|
self._SMART_CACHE_TTL = 1620 # 27 min - offset to avoid sync with other processes
|
||||||
|
# Disk identity cache - avoids repeated smartctl -i calls for serial/model
|
||||||
|
self._disk_identity_cache: Dict[str, Dict[str, str]] = {} # {disk_name: {'serial': ..., 'model': ...}}
|
||||||
|
|
||||||
# Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
|
# Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
|
||||||
self._journalctl_24h_cache = {'count': 0, 'time': 0}
|
self._journalctl_24h_cache = {'count': 0, 'time': 0}
|
||||||
@@ -280,11 +282,14 @@ class HealthMonitor:
|
|||||||
# Journalctl 10min cache - shared across checks to avoid duplicate calls
|
# Journalctl 10min cache - shared across checks to avoid duplicate calls
|
||||||
# Multiple checks (cpu_temp, vms_cts) use the same journalctl query
|
# Multiple checks (cpu_temp, vms_cts) use the same journalctl query
|
||||||
self._journalctl_10min_cache = {'output': '', 'time': 0}
|
self._journalctl_10min_cache = {'output': '', 'time': 0}
|
||||||
self._JOURNALCTL_10MIN_CACHE_TTL = 60 # 1 minute - fresh enough for health checks
|
self._JOURNALCTL_10MIN_CACHE_TTL = 120 # 2 minutes - covers full health check cycle
|
||||||
|
|
||||||
# Journalctl 1hour cache - for disk health events (SMART warnings, I/O errors)
|
# Journalctl 1hour cache - for disk health events (SMART warnings, I/O errors)
|
||||||
self._journalctl_1hour_cache = {'output': '', 'time': 0}
|
self._journalctl_1hour_cache = {'output': '', 'time': 0}
|
||||||
self._JOURNALCTL_1HOUR_CACHE_TTL = 300 # 5 min cache - disk events don't need real-time
|
self._JOURNALCTL_1HOUR_CACHE_TTL = 300 # 5 min cache - disk events don't need real-time
|
||||||
|
# Timestamp watermark: track last successfully processed journalctl entry
|
||||||
|
# to avoid re-processing old entries on subsequent checks
|
||||||
|
self._disk_journal_last_ts: Optional[str] = None
|
||||||
|
|
||||||
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
|
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
|
||||||
# SMART detection still uses filesystem check on init (lightweight)
|
# SMART detection still uses filesystem check on init (lightweight)
|
||||||
@@ -316,7 +321,7 @@ class HealthMonitor:
|
|||||||
['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
|
['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=20
|
timeout=10
|
||||||
)
|
)
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
cache['output'] = result.stdout
|
cache['output'] = result.stdout
|
||||||
@@ -330,37 +335,49 @@ class HealthMonitor:
|
|||||||
return cache.get('output', '') # Return stale cache on error
|
return cache.get('output', '') # Return stale cache on error
|
||||||
|
|
||||||
def _get_journalctl_1hour_warnings(self) -> str:
|
def _get_journalctl_1hour_warnings(self) -> str:
|
||||||
"""Get journalctl warnings from last 1 hour, cached for disk health checks.
|
"""Get journalctl warnings since last check, cached for disk health checks.
|
||||||
|
|
||||||
Used by _check_disk_health_from_events for SMART warnings and I/O errors.
|
Used by _check_disk_health_from_events for SMART warnings and I/O errors.
|
||||||
|
Uses a timestamp watermark (_disk_journal_last_ts) to only read NEW entries
|
||||||
|
since the last successful check, preventing re-processing of old errors.
|
||||||
|
On first run (no watermark), reads the last 10 minutes to catch recent events
|
||||||
|
without pulling in stale history.
|
||||||
Cached for 5 minutes since disk events don't require real-time detection.
|
Cached for 5 minutes since disk events don't require real-time detection.
|
||||||
"""
|
"""
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
cache = self._journalctl_1hour_cache
|
cache = self._journalctl_1hour_cache
|
||||||
|
|
||||||
# Return cached result if fresh
|
# Return cached result if fresh
|
||||||
if cache['output'] and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
|
if cache['output'] is not None and cache['time'] > 0 and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
|
||||||
return cache['output']
|
return cache['output']
|
||||||
|
|
||||||
# Execute journalctl and cache result
|
# Determine --since value: use watermark if available, otherwise 10 minutes
|
||||||
# Use -b 0 to only include logs from the current boot
|
if self._disk_journal_last_ts:
|
||||||
|
since_arg = self._disk_journal_last_ts
|
||||||
|
else:
|
||||||
|
since_arg = '10 minutes ago'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
|
['journalctl', '-b', '0', '--since', since_arg, '--no-pager', '-p', 'warning',
|
||||||
'--output=short-precise'],
|
'--output=short-precise'],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=15
|
timeout=15
|
||||||
)
|
)
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
cache['output'] = result.stdout
|
output = result.stdout
|
||||||
|
cache['output'] = output
|
||||||
cache['time'] = current_time
|
cache['time'] = current_time
|
||||||
return cache['output']
|
# Advance watermark to "now" so next check only gets new entries
|
||||||
|
from datetime import datetime as _dt
|
||||||
|
self._disk_journal_last_ts = _dt.now().strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
return output
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
print("[HealthMonitor] journalctl 1hour cache: timeout")
|
print("[HealthMonitor] journalctl disk cache: timeout")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[HealthMonitor] journalctl 1hour cache error: {e}")
|
print(f"[HealthMonitor] journalctl disk cache error: {e}")
|
||||||
|
|
||||||
return cache.get('output', '') # Return stale cache on error
|
return cache.get('output', '') # Return stale cache on error
|
||||||
|
|
||||||
# ─── Lightweight sampling methods for the dedicated vital-signs thread ───
|
# ─── Lightweight sampling methods for the dedicated vital-signs thread ───
|
||||||
@@ -1260,21 +1277,10 @@ class HealthMonitor:
|
|||||||
reason = f'{disk}: {issue["reason"]}'
|
reason = f'{disk}: {issue["reason"]}'
|
||||||
severity = issue.get('status', 'WARNING')
|
severity = issue.get('status', 'WARNING')
|
||||||
|
|
||||||
# Get serial for this disk to properly track it (important for USB disks)
|
# Get serial for this disk (cached to avoid repeated smartctl calls)
|
||||||
disk_serial = ''
|
disk_id = self._get_disk_identity(device)
|
||||||
disk_model = ''
|
disk_serial = disk_id['serial']
|
||||||
try:
|
disk_model = disk_id['model']
|
||||||
smart_result = subprocess.run(
|
|
||||||
['smartctl', '-i', '-j', f'/dev/{device}'],
|
|
||||||
capture_output=True, text=True, timeout=5
|
|
||||||
)
|
|
||||||
if smart_result.returncode in (0, 4):
|
|
||||||
import json
|
|
||||||
smart_data = json.loads(smart_result.stdout)
|
|
||||||
disk_serial = smart_data.get('serial_number', '')
|
|
||||||
disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if (not health_persistence.is_error_active(io_error_key, category='disks') and
|
if (not health_persistence.is_error_active(io_error_key, category='disks') and
|
||||||
@@ -1323,21 +1329,10 @@ class HealthMonitor:
|
|||||||
device = disk_path.replace('/dev/', '')
|
device = disk_path.replace('/dev/', '')
|
||||||
io_severity = disk_info.get('status', 'WARNING').lower()
|
io_severity = disk_info.get('status', 'WARNING').lower()
|
||||||
|
|
||||||
# Get serial for proper disk tracking (important for USB)
|
# Get serial for proper disk tracking (cached)
|
||||||
io_serial = ''
|
io_id = self._get_disk_identity(device)
|
||||||
io_model = ''
|
io_serial = io_id['serial']
|
||||||
try:
|
io_model = io_id['model']
|
||||||
smart_result = subprocess.run(
|
|
||||||
['smartctl', '-i', '-j', f'/dev/{device}'],
|
|
||||||
capture_output=True, text=True, timeout=5
|
|
||||||
)
|
|
||||||
if smart_result.returncode in (0, 4):
|
|
||||||
import json
|
|
||||||
smart_data = json.loads(smart_result.stdout)
|
|
||||||
io_serial = smart_data.get('serial_number', '')
|
|
||||||
io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Register the disk for observation tracking (worst_health no longer used)
|
# Register the disk for observation tracking (worst_health no longer used)
|
||||||
try:
|
try:
|
||||||
@@ -1946,20 +1941,53 @@ class HealthMonitor:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
def _get_disk_identity(self, disk_name: str) -> Dict[str, str]:
|
||||||
|
"""Get disk serial/model with caching. Avoids repeated smartctl -i calls.
|
||||||
|
|
||||||
|
Returns {'serial': '...', 'model': '...'} or empty values on failure.
|
||||||
|
Cache persists for the lifetime of the monitor (serial/model don't change).
|
||||||
|
"""
|
||||||
|
if disk_name in self._disk_identity_cache:
|
||||||
|
return self._disk_identity_cache[disk_name]
|
||||||
|
|
||||||
|
result = {'serial': '', 'model': ''}
|
||||||
|
try:
|
||||||
|
dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
|
||||||
|
proc = subprocess.run(
|
||||||
|
['smartctl', '-i', '-j', dev_path],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if proc.returncode in (0, 4):
|
||||||
|
import json as _json
|
||||||
|
data = _json.loads(proc.stdout)
|
||||||
|
result['serial'] = data.get('serial_number', '')
|
||||||
|
result['model'] = data.get('model_name', '') or data.get('model_family', '')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
self._disk_identity_cache[disk_name] = result
|
||||||
|
return result
|
||||||
|
|
||||||
def _quick_smart_health(self, disk_name: str) -> str:
|
def _quick_smart_health(self, disk_name: str) -> str:
|
||||||
"""Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.
|
"""Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.
|
||||||
|
|
||||||
Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
|
Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
|
||||||
"""
|
"""
|
||||||
if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
|
if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
|
||||||
return 'UNKNOWN'
|
return 'UNKNOWN'
|
||||||
|
|
||||||
# Check cache first
|
# Check cache first (and evict stale entries periodically)
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
cache_key = disk_name
|
cache_key = disk_name
|
||||||
cached = self._smart_cache.get(cache_key)
|
cached = self._smart_cache.get(cache_key)
|
||||||
if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
|
if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
|
||||||
return cached['result']
|
return cached['result']
|
||||||
|
# Evict expired entries to prevent unbounded growth
|
||||||
|
if len(self._smart_cache) > 50:
|
||||||
|
self._smart_cache = {
|
||||||
|
k: v for k, v in self._smart_cache.items()
|
||||||
|
if current_time - v['time'] < self._SMART_CACHE_TTL * 2
|
||||||
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
|
dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
|
||||||
@@ -2130,7 +2158,11 @@ class HealthMonitor:
|
|||||||
t for t in self.io_error_history[disk]
|
t for t in self.io_error_history[disk]
|
||||||
if current_time - t < 300
|
if current_time - t < 300
|
||||||
]
|
]
|
||||||
|
# Remove empty entries to prevent unbounded dict growth
|
||||||
|
if not self.io_error_history[disk]:
|
||||||
|
del self.io_error_history[disk]
|
||||||
|
continue
|
||||||
|
|
||||||
error_count = len(self.io_error_history[disk])
|
error_count = len(self.io_error_history[disk])
|
||||||
error_key = f'disk_{disk}'
|
error_key = f'disk_{disk}'
|
||||||
sample = disk_samples.get(disk, '')
|
sample = disk_samples.get(disk, '')
|
||||||
@@ -4662,19 +4694,9 @@ class HealthMonitor:
|
|||||||
|
|
||||||
obs_sig = f'{sig_base}_{disk_name}'
|
obs_sig = f'{sig_base}_{disk_name}'
|
||||||
|
|
||||||
# Try to get serial for proper cross-referencing
|
# Get serial for proper cross-referencing (cached)
|
||||||
obs_serial = None
|
obs_id = self._get_disk_identity(disk_name)
|
||||||
try:
|
obs_serial = obs_id['serial'] or None
|
||||||
sm = subprocess.run(
|
|
||||||
['smartctl', '-i', dev_path],
|
|
||||||
capture_output=True, text=True, timeout=3)
|
|
||||||
if sm.returncode in (0, 4):
|
|
||||||
for sline in sm.stdout.split('\n'):
|
|
||||||
if 'Serial Number' in sline or 'Serial number' in sline:
|
|
||||||
obs_serial = sline.split(':')[-1].strip()
|
|
||||||
break
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
health_persistence.record_disk_observation(
|
health_persistence.record_disk_observation(
|
||||||
device_name=disk_name,
|
device_name=disk_name,
|
||||||
|
|||||||
@@ -54,7 +54,12 @@ class HealthPersistence:
|
|||||||
self._init_database()
|
self._init_database()
|
||||||
|
|
||||||
def _get_conn(self) -> sqlite3.Connection:
|
def _get_conn(self) -> sqlite3.Connection:
|
||||||
"""Get a SQLite connection with timeout and WAL mode for safe concurrency."""
|
"""Get a SQLite connection with timeout and WAL mode for safe concurrency.
|
||||||
|
|
||||||
|
IMPORTANT: Always close the connection when done, preferably using
|
||||||
|
the _db_connection() context manager. If not closed explicitly,
|
||||||
|
Python's GC will close it, but this is unreliable under load.
|
||||||
|
"""
|
||||||
conn = sqlite3.connect(str(self.db_path), timeout=30)
|
conn = sqlite3.connect(str(self.db_path), timeout=30)
|
||||||
conn.execute('PRAGMA journal_mode=WAL')
|
conn.execute('PRAGMA journal_mode=WAL')
|
||||||
conn.execute('PRAGMA busy_timeout=10000')
|
conn.execute('PRAGMA busy_timeout=10000')
|
||||||
@@ -332,7 +337,32 @@ class HealthPersistence:
|
|||||||
print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}")
|
print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}")
|
||||||
else:
|
else:
|
||||||
print(f"[HealthPersistence] Database initialized with {len(tables)} tables")
|
print(f"[HealthPersistence] Database initialized with {len(tables)} tables")
|
||||||
|
|
||||||
|
# ─── Startup migration: clean stale looping disk I/O errors ───
|
||||||
|
# Previous versions had a bug where journal-based disk errors were
|
||||||
|
# re-processed every cycle, causing infinite notification loops.
|
||||||
|
# On upgrade, clean up any stale disk errors that are stuck in the
|
||||||
|
# active state from the old buggy behavior.
|
||||||
|
try:
|
||||||
|
cursor = conn.cursor()
|
||||||
|
# Delete active (unresolved) disk errors from journal that are
|
||||||
|
# older than 2 hours — these are leftovers from the feedback loop.
|
||||||
|
# Real new errors will be re-detected from fresh journal entries.
|
||||||
|
cutoff = (datetime.now() - timedelta(hours=2)).isoformat()
|
||||||
|
cursor.execute('''
|
||||||
|
DELETE FROM errors
|
||||||
|
WHERE error_key LIKE 'smart_%'
|
||||||
|
AND resolved_at IS NULL
|
||||||
|
AND acknowledged = 0
|
||||||
|
AND last_seen < ?
|
||||||
|
''', (cutoff,))
|
||||||
|
cleaned = cursor.rowcount
|
||||||
|
if cleaned > 0:
|
||||||
|
conn.commit()
|
||||||
|
print(f"[HealthPersistence] Startup cleanup: removed {cleaned} stale disk error(s) from previous bug")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[HealthPersistence] Startup cleanup warning: {e}")
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
def record_error(self, error_key: str, category: str, severity: str,
|
def record_error(self, error_key: str, category: str, severity: str,
|
||||||
@@ -345,21 +375,17 @@ class HealthPersistence:
|
|||||||
return self._record_error_impl(error_key, category, severity, reason, details)
|
return self._record_error_impl(error_key, category, severity, reason, details)
|
||||||
|
|
||||||
def _record_error_impl(self, error_key, category, severity, reason, details):
|
def _record_error_impl(self, error_key, category, severity, reason, details):
|
||||||
# === RESOURCE EXISTENCE CHECK ===
|
# === RESOURCE EXISTENCE CHECK (before DB access) ===
|
||||||
# Skip recording errors for resources that no longer exist
|
# Skip recording errors for resources that no longer exist
|
||||||
# This prevents "ghost" errors from stale journal entries
|
|
||||||
|
|
||||||
# Check VM/CT existence
|
|
||||||
if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))):
|
if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))):
|
||||||
import re
|
import re
|
||||||
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
|
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
|
||||||
if vmid_match:
|
if vmid_match:
|
||||||
vmid = vmid_match.group(1)
|
vmid = vmid_match.group(1)
|
||||||
if not self._check_vm_ct_exists(vmid):
|
if not self._check_vm_ct_exists(vmid):
|
||||||
return {'type': 'skipped', 'needs_notification': False,
|
return {'type': 'skipped', 'needs_notification': False,
|
||||||
'reason': f'VM/CT {vmid} no longer exists'}
|
'reason': f'VM/CT {vmid} no longer exists'}
|
||||||
|
|
||||||
# Check disk existence
|
|
||||||
if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')):
|
if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')):
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
@@ -370,161 +396,151 @@ class HealthPersistence:
|
|||||||
if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'):
|
if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'):
|
||||||
return {'type': 'skipped', 'needs_notification': False,
|
return {'type': 'skipped', 'needs_notification': False,
|
||||||
'reason': f'Disk /dev/{disk_name} no longer exists'}
|
'reason': f'Disk /dev/{disk_name} no longer exists'}
|
||||||
|
|
||||||
conn = self._get_conn()
|
conn = self._get_conn()
|
||||||
cursor = conn.cursor()
|
try:
|
||||||
|
cursor = conn.cursor()
|
||||||
now = datetime.now().isoformat()
|
now = datetime.now().isoformat()
|
||||||
details_json = json.dumps(details) if details else None
|
details_json = json.dumps(details) if details else None
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT id, acknowledged, resolved_at, category, severity, first_seen,
|
SELECT id, acknowledged, resolved_at, category, severity, first_seen,
|
||||||
notification_sent, suppression_hours
|
notification_sent, suppression_hours
|
||||||
FROM errors WHERE error_key = ?
|
FROM errors WHERE error_key = ?
|
||||||
''', (error_key,))
|
''', (error_key,))
|
||||||
existing = cursor.fetchone()
|
existing = cursor.fetchone()
|
||||||
|
|
||||||
event_info = {'type': 'updated', 'needs_notification': False}
|
event_info = {'type': 'updated', 'needs_notification': False}
|
||||||
|
|
||||||
if existing:
|
if existing:
|
||||||
err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing
|
err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing
|
||||||
|
|
||||||
if ack == 1:
|
if ack == 1:
|
||||||
# SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
|
# SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
|
||||||
# regardless of any dismiss/permanent setting (hardware protection)
|
if error_key == 'cpu_temperature' and severity == 'CRITICAL':
|
||||||
if error_key == 'cpu_temperature' and severity == 'CRITICAL':
|
|
||||||
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
|
||||||
cursor.execute('''
|
|
||||||
INSERT INTO errors
|
|
||||||
(error_key, category, severity, reason, details, first_seen, last_seen)
|
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
||||||
''', (error_key, category, severity, reason, details_json, now, now))
|
|
||||||
event_info = {'type': 'new', 'needs_notification': True}
|
|
||||||
self._record_event(cursor, 'new', error_key,
|
|
||||||
{'severity': severity, 'reason': reason,
|
|
||||||
'note': 'CRITICAL temperature override - safety alert'})
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return event_info
|
|
||||||
|
|
||||||
# Check suppression: use per-record stored hours (set at dismiss time)
|
|
||||||
sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS
|
|
||||||
|
|
||||||
# Permanent dismiss (sup_hours == -1): always suppress
|
|
||||||
if sup_hours == -1:
|
|
||||||
conn.close()
|
|
||||||
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
|
||||||
|
|
||||||
# Time-limited suppression
|
|
||||||
still_suppressed = False
|
|
||||||
if resolved_at:
|
|
||||||
try:
|
|
||||||
resolved_dt = datetime.fromisoformat(resolved_at)
|
|
||||||
elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600
|
|
||||||
still_suppressed = elapsed_hours < sup_hours
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if still_suppressed:
|
|
||||||
conn.close()
|
|
||||||
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
|
||||||
else:
|
|
||||||
# Suppression expired.
|
|
||||||
# For log-based errors (spike, persistent, cascade),
|
|
||||||
# do NOT re-trigger. The journal always contains old
|
|
||||||
# messages, so re-creating the error would cause an
|
|
||||||
# infinite notification cycle. Instead, just delete
|
|
||||||
# the stale record so it stops appearing in the UI.
|
|
||||||
is_log_error = (
|
|
||||||
error_key.startswith('log_persistent_')
|
|
||||||
or error_key.startswith('log_spike_')
|
|
||||||
or error_key.startswith('log_cascade_')
|
|
||||||
or error_key.startswith('log_critical_')
|
|
||||||
or category == 'logs'
|
|
||||||
)
|
|
||||||
if is_log_error:
|
|
||||||
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return {'type': 'skipped_expired_log', 'needs_notification': False}
|
|
||||||
|
|
||||||
# For non-log errors (hardware, services, etc.),
|
|
||||||
# re-triggering is correct -- the condition is real
|
|
||||||
# and still present.
|
|
||||||
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
|
||||||
cursor.execute('''
|
|
||||||
INSERT INTO errors
|
|
||||||
(error_key, category, severity, reason, details, first_seen, last_seen)
|
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
||||||
''', (error_key, category, severity, reason, details_json, now, now))
|
|
||||||
event_info = {'type': 'new', 'needs_notification': True}
|
|
||||||
self._record_event(cursor, 'new', error_key,
|
|
||||||
{'severity': severity, 'reason': reason,
|
|
||||||
'note': 'Re-triggered after suppression expired'})
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return event_info
|
|
||||||
|
|
||||||
# Not acknowledged - update existing active error
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE errors
|
|
||||||
SET last_seen = ?, severity = ?, reason = ?, details = ?
|
|
||||||
WHERE error_key = ? AND acknowledged = 0
|
|
||||||
''', (now, severity, reason, details_json, error_key))
|
|
||||||
|
|
||||||
# Check if severity escalated
|
|
||||||
if old_severity == 'WARNING' and severity == 'CRITICAL':
|
|
||||||
event_info['type'] = 'escalated'
|
|
||||||
event_info['needs_notification'] = True
|
|
||||||
else:
|
|
||||||
# Insert new error
|
|
||||||
cursor.execute('''
|
|
||||||
INSERT INTO errors
|
|
||||||
(error_key, category, severity, reason, details, first_seen, last_seen)
|
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
||||||
''', (error_key, category, severity, reason, details_json, now, now))
|
|
||||||
|
|
||||||
event_info['type'] = 'new'
|
|
||||||
event_info['needs_notification'] = True
|
|
||||||
|
|
||||||
# ─── Auto-suppress: if the category has a non-default setting, ───
|
|
||||||
# auto-dismiss immediately so the user never sees it as active.
|
|
||||||
# Exception: CRITICAL CPU temperature is never auto-suppressed.
|
|
||||||
if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
|
|
||||||
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
|
||||||
if setting_key:
|
|
||||||
# P4 fix: use _get_setting_impl with existing connection to avoid deadlock
|
|
||||||
stored = self._get_setting_impl(conn, setting_key)
|
|
||||||
if stored is not None:
|
|
||||||
configured_hours = int(stored)
|
|
||||||
if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
|
|
||||||
# Non-default setting found: auto-acknowledge
|
|
||||||
# Mark as acknowledged but DO NOT set resolved_at - error remains active
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
INSERT INTO errors
|
||||||
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
|
(error_key, category, severity, reason, details, first_seen, last_seen)
|
||||||
WHERE error_key = ? AND acknowledged = 0
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
''', (now, configured_hours, error_key))
|
''', (error_key, category, severity, reason, details_json, now, now))
|
||||||
|
event_info = {'type': 'new', 'needs_notification': True}
|
||||||
if cursor.rowcount > 0:
|
self._record_event(cursor, 'new', error_key,
|
||||||
self._record_event(cursor, 'auto_suppressed', error_key, {
|
{'severity': severity, 'reason': reason,
|
||||||
'severity': severity,
|
'note': 'CRITICAL temperature override - safety alert'})
|
||||||
'reason': reason,
|
conn.commit()
|
||||||
'suppression_hours': configured_hours,
|
return event_info
|
||||||
'note': 'Auto-suppressed by user settings'
|
|
||||||
})
|
# Check suppression: use per-record stored hours (set at dismiss time)
|
||||||
event_info['type'] = 'auto_suppressed'
|
sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS
|
||||||
event_info['needs_notification'] = False
|
|
||||||
|
# Permanent dismiss (sup_hours == -1): always suppress
|
||||||
|
if sup_hours == -1:
|
||||||
|
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
||||||
|
|
||||||
|
# Time-limited suppression
|
||||||
|
still_suppressed = False
|
||||||
|
if resolved_at:
|
||||||
|
try:
|
||||||
|
resolved_dt = datetime.fromisoformat(resolved_at)
|
||||||
|
elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600
|
||||||
|
still_suppressed = elapsed_hours < sup_hours
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if still_suppressed:
|
||||||
|
return {'type': 'skipped_acknowledged', 'needs_notification': False}
|
||||||
|
else:
|
||||||
|
# Suppression expired.
|
||||||
|
# Journal-sourced errors (logs AND disk I/O) should NOT
|
||||||
|
# re-trigger after suppression. The journal always contains
|
||||||
|
# old messages, so re-creating the error causes an infinite
|
||||||
|
# notification loop. Delete the stale record instead.
|
||||||
|
is_journal_error = (
|
||||||
|
error_key.startswith('log_persistent_')
|
||||||
|
or error_key.startswith('log_spike_')
|
||||||
|
or error_key.startswith('log_cascade_')
|
||||||
|
or error_key.startswith('log_critical_')
|
||||||
|
or error_key.startswith('smart_')
|
||||||
|
or error_key.startswith('disk_')
|
||||||
|
or error_key.startswith('io_error_')
|
||||||
|
or category == 'logs'
|
||||||
|
)
|
||||||
|
if is_journal_error:
|
||||||
|
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
return {'type': 'skipped_expired_journal', 'needs_notification': False}
|
||||||
return event_info
|
|
||||||
|
# For non-log errors (hardware, services, etc.),
|
||||||
# Record event
|
# re-triggering is correct -- the condition is real and still present.
|
||||||
self._record_event(cursor, event_info['type'], error_key,
|
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
||||||
{'severity': severity, 'reason': reason})
|
cursor.execute('''
|
||||||
|
INSERT INTO errors
|
||||||
conn.commit()
|
(error_key, category, severity, reason, details, first_seen, last_seen)
|
||||||
conn.close()
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
''', (error_key, category, severity, reason, details_json, now, now))
|
||||||
|
event_info = {'type': 'new', 'needs_notification': True}
|
||||||
|
self._record_event(cursor, 'new', error_key,
|
||||||
|
{'severity': severity, 'reason': reason,
|
||||||
|
'note': 'Re-triggered after suppression expired'})
|
||||||
|
conn.commit()
|
||||||
|
return event_info
|
||||||
|
|
||||||
|
# Not acknowledged - update existing active error
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET last_seen = ?, severity = ?, reason = ?, details = ?
|
||||||
|
WHERE error_key = ? AND acknowledged = 0
|
||||||
|
''', (now, severity, reason, details_json, error_key))
|
||||||
|
|
||||||
|
# Check if severity escalated
|
||||||
|
if old_severity == 'WARNING' and severity == 'CRITICAL':
|
||||||
|
event_info['type'] = 'escalated'
|
||||||
|
event_info['needs_notification'] = True
|
||||||
|
else:
|
||||||
|
# Insert new error
|
||||||
|
cursor.execute('''
|
||||||
|
INSERT INTO errors
|
||||||
|
(error_key, category, severity, reason, details, first_seen, last_seen)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
''', (error_key, category, severity, reason, details_json, now, now))
|
||||||
|
|
||||||
|
event_info['type'] = 'new'
|
||||||
|
event_info['needs_notification'] = True
|
||||||
|
|
||||||
|
# ─── Auto-suppress: if the category has a non-default setting, ───
|
||||||
|
if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
|
||||||
|
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
||||||
|
if setting_key:
|
||||||
|
stored = self._get_setting_impl(conn, setting_key)
|
||||||
|
if stored is not None:
|
||||||
|
configured_hours = int(stored)
|
||||||
|
if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
|
||||||
|
WHERE error_key = ? AND acknowledged = 0
|
||||||
|
''', (now, configured_hours, error_key))
|
||||||
|
|
||||||
|
if cursor.rowcount > 0:
|
||||||
|
self._record_event(cursor, 'auto_suppressed', error_key, {
|
||||||
|
'severity': severity,
|
||||||
|
'reason': reason,
|
||||||
|
'suppression_hours': configured_hours,
|
||||||
|
'note': 'Auto-suppressed by user settings'
|
||||||
|
})
|
||||||
|
event_info['type'] = 'auto_suppressed'
|
||||||
|
event_info['needs_notification'] = False
|
||||||
|
conn.commit()
|
||||||
|
return event_info
|
||||||
|
|
||||||
|
# Record event
|
||||||
|
self._record_event(cursor, event_info['type'], error_key,
|
||||||
|
{'severity': severity, 'reason': reason})
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
return event_info
|
return event_info
|
||||||
|
|
||||||
@@ -534,22 +550,20 @@ class HealthPersistence:
|
|||||||
return self._resolve_error_impl(error_key, reason)
|
return self._resolve_error_impl(error_key, reason)
|
||||||
|
|
||||||
def _resolve_error_impl(self, error_key, reason):
|
def _resolve_error_impl(self, error_key, reason):
|
||||||
conn = self._get_conn()
|
with self._db_connection() as conn:
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
now = datetime.now().isoformat()
|
||||||
now = datetime.now().isoformat()
|
|
||||||
|
cursor.execute('''
|
||||||
cursor.execute('''
|
UPDATE errors
|
||||||
UPDATE errors
|
SET resolved_at = ?
|
||||||
SET resolved_at = ?
|
WHERE error_key = ? AND resolved_at IS NULL
|
||||||
WHERE error_key = ? AND resolved_at IS NULL
|
''', (now, error_key))
|
||||||
''', (now, error_key))
|
|
||||||
|
if cursor.rowcount > 0:
|
||||||
if cursor.rowcount > 0:
|
self._record_event(cursor, 'resolved', error_key, {'reason': reason})
|
||||||
self._record_event(cursor, 'resolved', error_key, {'reason': reason})
|
|
||||||
|
conn.commit()
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
|
def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
|
||||||
"""
|
"""
|
||||||
@@ -626,37 +640,34 @@ class HealthPersistence:
|
|||||||
we delete the record entirely so it can re-trigger as a fresh
|
we delete the record entirely so it can re-trigger as a fresh
|
||||||
event if the condition returns later.
|
event if the condition returns later.
|
||||||
"""
|
"""
|
||||||
conn = self._get_conn()
|
with self._db_connection() as conn:
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
now = datetime.now().isoformat()
|
||||||
now = datetime.now().isoformat()
|
|
||||||
|
# Check if this error was acknowledged (dismissed)
|
||||||
# Check if this error was acknowledged (dismissed)
|
|
||||||
cursor.execute('''
|
|
||||||
SELECT acknowledged FROM errors WHERE error_key = ?
|
|
||||||
''', (error_key,))
|
|
||||||
row = cursor.fetchone()
|
|
||||||
|
|
||||||
if row and row[0] == 1:
|
|
||||||
# Dismissed error that naturally resolved - delete entirely
|
|
||||||
# so it can re-trigger as a new event if it happens again
|
|
||||||
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
|
||||||
if cursor.rowcount > 0:
|
|
||||||
self._record_event(cursor, 'cleared', error_key,
|
|
||||||
{'reason': 'condition_resolved_after_dismiss'})
|
|
||||||
else:
|
|
||||||
# Normal active error - mark as resolved
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
SELECT acknowledged FROM errors WHERE error_key = ?
|
||||||
SET resolved_at = ?
|
''', (error_key,))
|
||||||
WHERE error_key = ? AND resolved_at IS NULL
|
row = cursor.fetchone()
|
||||||
''', (now, error_key))
|
|
||||||
|
if row and row[0] == 1:
|
||||||
if cursor.rowcount > 0:
|
# Dismissed error that naturally resolved - delete entirely
|
||||||
self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
|
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
|
||||||
|
if cursor.rowcount > 0:
|
||||||
conn.commit()
|
self._record_event(cursor, 'cleared', error_key,
|
||||||
conn.close()
|
{'reason': 'condition_resolved_after_dismiss'})
|
||||||
|
else:
|
||||||
|
# Normal active error - mark as resolved
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET resolved_at = ?
|
||||||
|
WHERE error_key = ? AND resolved_at IS NULL
|
||||||
|
''', (now, error_key))
|
||||||
|
|
||||||
|
if cursor.rowcount > 0:
|
||||||
|
self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
def acknowledge_error(self, error_key: str) -> Dict[str, Any]:
|
def acknowledge_error(self, error_key: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
@@ -671,17 +682,19 @@ class HealthPersistence:
|
|||||||
def _acknowledge_error_impl(self, error_key):
|
def _acknowledge_error_impl(self, error_key):
|
||||||
conn = self._get_conn()
|
conn = self._get_conn()
|
||||||
conn.row_factory = sqlite3.Row
|
conn.row_factory = sqlite3.Row
|
||||||
cursor = conn.cursor()
|
category = ''
|
||||||
|
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
now = datetime.now().isoformat()
|
try:
|
||||||
|
cursor = conn.cursor()
|
||||||
# Get current error info before acknowledging
|
now = datetime.now().isoformat()
|
||||||
cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
|
|
||||||
row = cursor.fetchone()
|
# Get current error info before acknowledging
|
||||||
|
cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
|
||||||
result = {'success': False, 'error_key': error_key}
|
row = cursor.fetchone()
|
||||||
|
|
||||||
if not row:
|
result = {'success': False, 'error_key': error_key}
|
||||||
|
|
||||||
|
if not row:
|
||||||
# Error not in DB yet -- create a minimal record so the dismiss persists.
|
# Error not in DB yet -- create a minimal record so the dismiss persists.
|
||||||
# Try to infer category from the error_key prefix.
|
# Try to infer category from the error_key prefix.
|
||||||
category = ''
|
category = ''
|
||||||
@@ -738,15 +751,6 @@ class HealthPersistence:
|
|||||||
'acknowledged_at': now
|
'acknowledged_at': now
|
||||||
}
|
}
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
|
||||||
|
|
||||||
# ── Clear cooldowns for newly dismissed errors too ──
|
|
||||||
if sup_hours != -1:
|
|
||||||
if category == 'disks':
|
|
||||||
self._clear_disk_io_cooldown(error_key)
|
|
||||||
else:
|
|
||||||
self._clear_notification_cooldown(error_key)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
if row:
|
if row:
|
||||||
@@ -809,21 +813,17 @@ class HealthPersistence:
|
|||||||
'suppression_hours': sup_hours
|
'suppression_hours': sup_hours
|
||||||
}
|
}
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
# ── Coordinate with notification cooldowns ──
|
# ── Coordinate with notification cooldowns ──
|
||||||
# When an error is dismissed with non-permanent suppression,
|
|
||||||
# clear the corresponding cooldown in notification_last_sent
|
|
||||||
# so it can re-notify after the suppression period expires.
|
|
||||||
# This applies to ALL categories, not just disks.
|
|
||||||
if sup_hours != -1:
|
if sup_hours != -1:
|
||||||
if category == 'disks':
|
if category == 'disks':
|
||||||
self._clear_disk_io_cooldown(error_key)
|
self._clear_disk_io_cooldown(error_key)
|
||||||
else:
|
else:
|
||||||
# For non-disk categories, clear the PollingCollector cooldown
|
|
||||||
self._clear_notification_cooldown(error_key)
|
self._clear_notification_cooldown(error_key)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def is_error_acknowledged(self, error_key: str) -> bool:
|
def is_error_acknowledged(self, error_key: str) -> bool:
|
||||||
@@ -928,12 +928,13 @@ class HealthPersistence:
|
|||||||
|
|
||||||
def _cleanup_old_errors_impl(self):
|
def _cleanup_old_errors_impl(self):
|
||||||
conn = self._get_conn()
|
conn = self._get_conn()
|
||||||
cursor = conn.cursor()
|
try:
|
||||||
|
cursor = conn.cursor()
|
||||||
now = datetime.now()
|
|
||||||
now_iso = now.isoformat()
|
now = datetime.now()
|
||||||
|
now_iso = now.isoformat()
|
||||||
# Delete resolved errors older than 7 days
|
|
||||||
|
# Delete resolved errors older than 7 days
|
||||||
cutoff_resolved = (now - timedelta(days=7)).isoformat()
|
cutoff_resolved = (now - timedelta(days=7)).isoformat()
|
||||||
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
|
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
|
||||||
|
|
||||||
@@ -1127,12 +1128,13 @@ class HealthPersistence:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass # If we can't read uptime, skip this cleanup
|
pass # If we can't read uptime, skip this cleanup
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
# Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
|
# Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
|
||||||
self._cleanup_stale_resources()
|
self._cleanup_stale_resources()
|
||||||
|
|
||||||
# Clean up disk observations for devices that no longer exist
|
# Clean up disk observations for devices that no longer exist
|
||||||
self.cleanup_orphan_observations()
|
self.cleanup_orphan_observations()
|
||||||
|
|
||||||
@@ -2174,6 +2176,8 @@ class HealthPersistence:
|
|||||||
last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'
|
last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'
|
||||||
|
|
||||||
# Upsert observation: if same (disk, type, signature), bump count + update last timestamp
|
# Upsert observation: if same (disk, type, signature), bump count + update last timestamp
|
||||||
|
# IMPORTANT: Do NOT reset dismissed — if the user dismissed this observation,
|
||||||
|
# re-detecting the same journal entry must not un-dismiss it.
|
||||||
cursor.execute(f'''
|
cursor.execute(f'''
|
||||||
INSERT INTO disk_observations
|
INSERT INTO disk_observations
|
||||||
(disk_registry_id, {type_col}, error_signature, {first_col},
|
(disk_registry_id, {type_col}, error_signature, {first_col},
|
||||||
@@ -2182,8 +2186,7 @@ class HealthPersistence:
|
|||||||
ON CONFLICT(disk_registry_id, {type_col}, error_signature) DO UPDATE SET
|
ON CONFLICT(disk_registry_id, {type_col}, error_signature) DO UPDATE SET
|
||||||
{last_col} = excluded.{last_col},
|
{last_col} = excluded.{last_col},
|
||||||
occurrence_count = occurrence_count + 1,
|
occurrence_count = occurrence_count + 1,
|
||||||
severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
|
severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END
|
||||||
dismissed = 0
|
|
||||||
''', (disk_id, error_type, error_signature, now, now, raw_message, severity))
|
''', (disk_id, error_type, error_signature, now, now, raw_message, severity))
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|||||||
Reference in New Issue
Block a user