diff --git a/AppImage/components/proxmox-dashboard.tsx b/AppImage/components/proxmox-dashboard.tsx
index a4b23d44..8e00cfbd 100644
--- a/AppImage/components/proxmox-dashboard.tsx
+++ b/AppImage/components/proxmox-dashboard.tsx
@@ -74,6 +74,7 @@ export function ProxmoxDashboard() {
serverName: "Loading...",
nodeId: "Loading...",
})
+ const [isInitialLoading, setIsInitialLoading] = useState(true)
const [isRefreshing, setIsRefreshing] = useState(false)
const [isServerConnected, setIsServerConnected] = useState(true)
const [componentKey, setComponentKey] = useState(0)
@@ -192,10 +193,10 @@ export function ProxmoxDashboard() {
}, [])
useEffect(() => {
- // Siempre fetch inicial
- fetchSystemData()
- fetchHealthInfoCount() // Fetch info count on initial load
- fetchUpdateStatus() // Fetch ProxMenux update status on initial load
+    // Always perform the initial fetch — mark loading done when system data arrives
+ fetchSystemData().finally(() => setIsInitialLoading(false))
+ fetchHealthInfoCount()
+ fetchUpdateStatus()
// En overview: cada 30 segundos para actualización frecuente del estado de salud
// En otras tabs: cada 60 segundos para reducir carga
@@ -372,6 +373,21 @@ export function ProxmoxDashboard() {
}
}
+ if (isInitialLoading) {
+ return (
+
+
+
+
Loading ProxMenux Monitor...
+
Connecting to server and fetching system status
+
+
+ )
+ }
+
return (
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index 4d475179..2e3cdd7c 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -272,6 +272,8 @@ class HealthMonitor:
# SMART check cache - reduces disk queries from every 5 min to every 30 min
self._smart_cache = {} # {disk_name: {'result': 'PASSED', 'time': timestamp}}
self._SMART_CACHE_TTL = 1620 # 27 min - offset to avoid sync with other processes
+ # Disk identity cache - avoids repeated smartctl -i calls for serial/model
+ self._disk_identity_cache: Dict[str, Dict[str, str]] = {} # {disk_name: {'serial': ..., 'model': ...}}
# Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
self._journalctl_24h_cache = {'count': 0, 'time': 0}
@@ -280,11 +282,14 @@ class HealthMonitor:
# Journalctl 10min cache - shared across checks to avoid duplicate calls
# Multiple checks (cpu_temp, vms_cts) use the same journalctl query
self._journalctl_10min_cache = {'output': '', 'time': 0}
- self._JOURNALCTL_10MIN_CACHE_TTL = 60 # 1 minute - fresh enough for health checks
+ self._JOURNALCTL_10MIN_CACHE_TTL = 120 # 2 minutes - covers full health check cycle
# Journalctl 1hour cache - for disk health events (SMART warnings, I/O errors)
self._journalctl_1hour_cache = {'output': '', 'time': 0}
self._JOURNALCTL_1HOUR_CACHE_TTL = 300 # 5 min cache - disk events don't need real-time
+ # Timestamp watermark: track last successfully processed journalctl entry
+ # to avoid re-processing old entries on subsequent checks
+ self._disk_journal_last_ts: Optional[str] = None
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
# SMART detection still uses filesystem check on init (lightweight)
@@ -316,7 +321,7 @@ class HealthMonitor:
['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True,
text=True,
- timeout=20
+ timeout=10
)
if result.returncode == 0:
cache['output'] = result.stdout
@@ -330,37 +335,49 @@ class HealthMonitor:
return cache.get('output', '') # Return stale cache on error
def _get_journalctl_1hour_warnings(self) -> str:
- """Get journalctl warnings from last 1 hour, cached for disk health checks.
-
+ """Get journalctl warnings since last check, cached for disk health checks.
+
Used by _check_disk_health_from_events for SMART warnings and I/O errors.
+ Uses a timestamp watermark (_disk_journal_last_ts) to only read NEW entries
+ since the last successful check, preventing re-processing of old errors.
+ On first run (no watermark), reads the last 10 minutes to catch recent events
+ without pulling in stale history.
Cached for 5 minutes since disk events don't require real-time detection.
"""
current_time = time.time()
cache = self._journalctl_1hour_cache
-
+
# Return cached result if fresh
- if cache['output'] and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
+ if cache['output'] is not None and cache['time'] > 0 and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
return cache['output']
-
- # Execute journalctl and cache result
- # Use -b 0 to only include logs from the current boot
+
+ # Determine --since value: use watermark if available, otherwise 10 minutes
+ if self._disk_journal_last_ts:
+ since_arg = self._disk_journal_last_ts
+ else:
+ since_arg = '10 minutes ago'
+
try:
result = subprocess.run(
- ['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
+ ['journalctl', '-b', '0', '--since', since_arg, '--no-pager', '-p', 'warning',
'--output=short-precise'],
capture_output=True,
text=True,
timeout=15
)
if result.returncode == 0:
- cache['output'] = result.stdout
+ output = result.stdout
+ cache['output'] = output
cache['time'] = current_time
- return cache['output']
+ # Advance watermark to "now" so next check only gets new entries
+ from datetime import datetime as _dt
+ self._disk_journal_last_ts = _dt.now().strftime('%Y-%m-%d %H:%M:%S')
+ return output
except subprocess.TimeoutExpired:
- print("[HealthMonitor] journalctl 1hour cache: timeout")
+ print("[HealthMonitor] journalctl disk cache: timeout")
except Exception as e:
- print(f"[HealthMonitor] journalctl 1hour cache error: {e}")
-
+ print(f"[HealthMonitor] journalctl disk cache error: {e}")
+
return cache.get('output', '') # Return stale cache on error
# ─── Lightweight sampling methods for the dedicated vital-signs thread ───
@@ -1260,21 +1277,10 @@ class HealthMonitor:
reason = f'{disk}: {issue["reason"]}'
severity = issue.get('status', 'WARNING')
- # Get serial for this disk to properly track it (important for USB disks)
- disk_serial = ''
- disk_model = ''
- try:
- smart_result = subprocess.run(
- ['smartctl', '-i', '-j', f'/dev/{device}'],
- capture_output=True, text=True, timeout=5
- )
- if smart_result.returncode in (0, 4):
- import json
- smart_data = json.loads(smart_result.stdout)
- disk_serial = smart_data.get('serial_number', '')
- disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
- except Exception:
- pass
+ # Get serial for this disk (cached to avoid repeated smartctl calls)
+ disk_id = self._get_disk_identity(device)
+ disk_serial = disk_id['serial']
+ disk_model = disk_id['model']
try:
if (not health_persistence.is_error_active(io_error_key, category='disks') and
@@ -1323,21 +1329,10 @@ class HealthMonitor:
device = disk_path.replace('/dev/', '')
io_severity = disk_info.get('status', 'WARNING').lower()
- # Get serial for proper disk tracking (important for USB)
- io_serial = ''
- io_model = ''
- try:
- smart_result = subprocess.run(
- ['smartctl', '-i', '-j', f'/dev/{device}'],
- capture_output=True, text=True, timeout=5
- )
- if smart_result.returncode in (0, 4):
- import json
- smart_data = json.loads(smart_result.stdout)
- io_serial = smart_data.get('serial_number', '')
- io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
- except Exception:
- pass
+ # Get serial for proper disk tracking (cached)
+ io_id = self._get_disk_identity(device)
+ io_serial = io_id['serial']
+ io_model = io_id['model']
# Register the disk for observation tracking (worst_health no longer used)
try:
@@ -1946,20 +1941,53 @@ class HealthMonitor:
except Exception:
return ''
+ def _get_disk_identity(self, disk_name: str) -> Dict[str, str]:
+ """Get disk serial/model with caching. Avoids repeated smartctl -i calls.
+
+ Returns {'serial': '...', 'model': '...'} or empty values on failure.
+ Cache persists for the lifetime of the monitor (serial/model don't change).
+ """
+ if disk_name in self._disk_identity_cache:
+ return self._disk_identity_cache[disk_name]
+
+ result = {'serial': '', 'model': ''}
+ try:
+ dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
+ proc = subprocess.run(
+ ['smartctl', '-i', '-j', dev_path],
+ capture_output=True, text=True, timeout=5
+ )
+ if proc.returncode in (0, 4):
+ import json as _json
+ data = _json.loads(proc.stdout)
+ result['serial'] = data.get('serial_number', '')
+ result['model'] = data.get('model_name', '') or data.get('model_family', '')
+ except Exception:
+ pass
+
+ self._disk_identity_cache[disk_name] = result
+ return result
+
def _quick_smart_health(self, disk_name: str) -> str:
"""Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.
-
+
Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
"""
if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
return 'UNKNOWN'
- # Check cache first
+ # Check cache first (and evict stale entries periodically)
current_time = time.time()
cache_key = disk_name
cached = self._smart_cache.get(cache_key)
if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
return cached['result']
+ # Evict expired entries to prevent unbounded growth
+ if len(self._smart_cache) > 50:
+ self._smart_cache = {
+ k: v for k, v in self._smart_cache.items()
+ if current_time - v['time'] < self._SMART_CACHE_TTL * 2
+ }
try:
dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
@@ -2130,7 +2158,11 @@ class HealthMonitor:
t for t in self.io_error_history[disk]
if current_time - t < 300
]
-
+ # Remove empty entries to prevent unbounded dict growth
+ if not self.io_error_history[disk]:
+ del self.io_error_history[disk]
+ continue
+
error_count = len(self.io_error_history[disk])
error_key = f'disk_{disk}'
sample = disk_samples.get(disk, '')
@@ -4662,19 +4694,9 @@ class HealthMonitor:
obs_sig = f'{sig_base}_{disk_name}'
- # Try to get serial for proper cross-referencing
- obs_serial = None
- try:
- sm = subprocess.run(
- ['smartctl', '-i', dev_path],
- capture_output=True, text=True, timeout=3)
- if sm.returncode in (0, 4):
- for sline in sm.stdout.split('\n'):
- if 'Serial Number' in sline or 'Serial number' in sline:
- obs_serial = sline.split(':')[-1].strip()
- break
- except Exception:
- pass
+ # Get serial for proper cross-referencing (cached)
+ obs_id = self._get_disk_identity(disk_name)
+ obs_serial = obs_id['serial'] or None
health_persistence.record_disk_observation(
device_name=disk_name,
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index f98c34b9..21ab723c 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -54,7 +54,12 @@ class HealthPersistence:
self._init_database()
def _get_conn(self) -> sqlite3.Connection:
- """Get a SQLite connection with timeout and WAL mode for safe concurrency."""
+ """Get a SQLite connection with timeout and WAL mode for safe concurrency.
+
+ IMPORTANT: Always close the connection when done, preferably using
+ the _db_connection() context manager. If not closed explicitly,
+ Python's GC will close it, but this is unreliable under load.
+ """
conn = sqlite3.connect(str(self.db_path), timeout=30)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=10000')
@@ -332,7 +337,32 @@ class HealthPersistence:
print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}")
else:
print(f"[HealthPersistence] Database initialized with {len(tables)} tables")
-
+
+ # ─── Startup migration: clean stale looping disk I/O errors ───
+ # Previous versions had a bug where journal-based disk errors were
+ # re-processed every cycle, causing infinite notification loops.
+ # On upgrade, clean up any stale disk errors that are stuck in the
+ # active state from the old buggy behavior.
+ try:
+ cursor = conn.cursor()
+ # Delete active (unresolved) disk errors from journal that are
+ # older than 2 hours — these are leftovers from the feedback loop.
+ # Real new errors will be re-detected from fresh journal entries.
+ cutoff = (datetime.now() - timedelta(hours=2)).isoformat()
+ cursor.execute('''
+ DELETE FROM errors
+ WHERE error_key LIKE 'smart_%'
+ AND resolved_at IS NULL
+ AND acknowledged = 0
+ AND last_seen < ?
+ ''', (cutoff,))
+ cleaned = cursor.rowcount
+ if cleaned > 0:
+ conn.commit()
+ print(f"[HealthPersistence] Startup cleanup: removed {cleaned} stale disk error(s) from previous bug")
+ except Exception as e:
+ print(f"[HealthPersistence] Startup cleanup warning: {e}")
+
conn.close()
def record_error(self, error_key: str, category: str, severity: str,
@@ -345,21 +375,17 @@ class HealthPersistence:
return self._record_error_impl(error_key, category, severity, reason, details)
def _record_error_impl(self, error_key, category, severity, reason, details):
- # === RESOURCE EXISTENCE CHECK ===
+ # === RESOURCE EXISTENCE CHECK (before DB access) ===
# Skip recording errors for resources that no longer exist
- # This prevents "ghost" errors from stale journal entries
-
- # Check VM/CT existence
if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))):
import re
vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
if vmid_match:
vmid = vmid_match.group(1)
if not self._check_vm_ct_exists(vmid):
- return {'type': 'skipped', 'needs_notification': False,
+ return {'type': 'skipped', 'needs_notification': False,
'reason': f'VM/CT {vmid} no longer exists'}
-
- # Check disk existence
+
if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')):
import re
import os
@@ -370,161 +396,151 @@ class HealthPersistence:
if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'):
return {'type': 'skipped', 'needs_notification': False,
'reason': f'Disk /dev/{disk_name} no longer exists'}
-
+
conn = self._get_conn()
- cursor = conn.cursor()
-
- now = datetime.now().isoformat()
- details_json = json.dumps(details) if details else None
-
- cursor.execute('''
- SELECT id, acknowledged, resolved_at, category, severity, first_seen,
- notification_sent, suppression_hours
- FROM errors WHERE error_key = ?
- ''', (error_key,))
- existing = cursor.fetchone()
-
- event_info = {'type': 'updated', 'needs_notification': False}
-
- if existing:
- err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing
-
- if ack == 1:
- # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
- # regardless of any dismiss/permanent setting (hardware protection)
- if error_key == 'cpu_temperature' and severity == 'CRITICAL':
- cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
- cursor.execute('''
- INSERT INTO errors
- (error_key, category, severity, reason, details, first_seen, last_seen)
- VALUES (?, ?, ?, ?, ?, ?, ?)
- ''', (error_key, category, severity, reason, details_json, now, now))
- event_info = {'type': 'new', 'needs_notification': True}
- self._record_event(cursor, 'new', error_key,
- {'severity': severity, 'reason': reason,
- 'note': 'CRITICAL temperature override - safety alert'})
- conn.commit()
- conn.close()
- return event_info
-
- # Check suppression: use per-record stored hours (set at dismiss time)
- sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS
-
- # Permanent dismiss (sup_hours == -1): always suppress
- if sup_hours == -1:
- conn.close()
- return {'type': 'skipped_acknowledged', 'needs_notification': False}
-
- # Time-limited suppression
- still_suppressed = False
- if resolved_at:
- try:
- resolved_dt = datetime.fromisoformat(resolved_at)
- elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600
- still_suppressed = elapsed_hours < sup_hours
- except Exception:
- pass
-
- if still_suppressed:
- conn.close()
- return {'type': 'skipped_acknowledged', 'needs_notification': False}
- else:
- # Suppression expired.
- # For log-based errors (spike, persistent, cascade),
- # do NOT re-trigger. The journal always contains old
- # messages, so re-creating the error would cause an
- # infinite notification cycle. Instead, just delete
- # the stale record so it stops appearing in the UI.
- is_log_error = (
- error_key.startswith('log_persistent_')
- or error_key.startswith('log_spike_')
- or error_key.startswith('log_cascade_')
- or error_key.startswith('log_critical_')
- or category == 'logs'
- )
- if is_log_error:
+ try:
+ cursor = conn.cursor()
+ now = datetime.now().isoformat()
+ details_json = json.dumps(details) if details else None
+
+ cursor.execute('''
+ SELECT id, acknowledged, resolved_at, category, severity, first_seen,
+ notification_sent, suppression_hours
+ FROM errors WHERE error_key = ?
+ ''', (error_key,))
+ existing = cursor.fetchone()
+
+ event_info = {'type': 'updated', 'needs_notification': False}
+
+ if existing:
+ err_id, ack, resolved_at, old_cat, old_severity, first_seen, notif_sent, stored_suppression = existing
+
+ if ack == 1:
+ # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
+ if error_key == 'cpu_temperature' and severity == 'CRITICAL':
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
- conn.commit()
- conn.close()
- return {'type': 'skipped_expired_log', 'needs_notification': False}
-
- # For non-log errors (hardware, services, etc.),
- # re-triggering is correct -- the condition is real
- # and still present.
- cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
- cursor.execute('''
- INSERT INTO errors
- (error_key, category, severity, reason, details, first_seen, last_seen)
- VALUES (?, ?, ?, ?, ?, ?, ?)
- ''', (error_key, category, severity, reason, details_json, now, now))
- event_info = {'type': 'new', 'needs_notification': True}
- self._record_event(cursor, 'new', error_key,
- {'severity': severity, 'reason': reason,
- 'note': 'Re-triggered after suppression expired'})
- conn.commit()
- conn.close()
- return event_info
-
- # Not acknowledged - update existing active error
- cursor.execute('''
- UPDATE errors
- SET last_seen = ?, severity = ?, reason = ?, details = ?
- WHERE error_key = ? AND acknowledged = 0
- ''', (now, severity, reason, details_json, error_key))
-
- # Check if severity escalated
- if old_severity == 'WARNING' and severity == 'CRITICAL':
- event_info['type'] = 'escalated'
- event_info['needs_notification'] = True
- else:
- # Insert new error
- cursor.execute('''
- INSERT INTO errors
- (error_key, category, severity, reason, details, first_seen, last_seen)
- VALUES (?, ?, ?, ?, ?, ?, ?)
- ''', (error_key, category, severity, reason, details_json, now, now))
-
- event_info['type'] = 'new'
- event_info['needs_notification'] = True
-
- # ─── Auto-suppress: if the category has a non-default setting, ───
- # auto-dismiss immediately so the user never sees it as active.
- # Exception: CRITICAL CPU temperature is never auto-suppressed.
- if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
- setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
- if setting_key:
- # P4 fix: use _get_setting_impl with existing connection to avoid deadlock
- stored = self._get_setting_impl(conn, setting_key)
- if stored is not None:
- configured_hours = int(stored)
- if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
- # Non-default setting found: auto-acknowledge
- # Mark as acknowledged but DO NOT set resolved_at - error remains active
cursor.execute('''
- UPDATE errors
- SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
- WHERE error_key = ? AND acknowledged = 0
- ''', (now, configured_hours, error_key))
-
- if cursor.rowcount > 0:
- self._record_event(cursor, 'auto_suppressed', error_key, {
- 'severity': severity,
- 'reason': reason,
- 'suppression_hours': configured_hours,
- 'note': 'Auto-suppressed by user settings'
- })
- event_info['type'] = 'auto_suppressed'
- event_info['needs_notification'] = False
+ INSERT INTO errors
+ (error_key, category, severity, reason, details, first_seen, last_seen)
+ VALUES (?, ?, ?, ?, ?, ?, ?)
+ ''', (error_key, category, severity, reason, details_json, now, now))
+ event_info = {'type': 'new', 'needs_notification': True}
+ self._record_event(cursor, 'new', error_key,
+ {'severity': severity, 'reason': reason,
+ 'note': 'CRITICAL temperature override - safety alert'})
+ conn.commit()
+ return event_info
+
+ # Check suppression: use per-record stored hours (set at dismiss time)
+ sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS
+
+ # Permanent dismiss (sup_hours == -1): always suppress
+ if sup_hours == -1:
+ return {'type': 'skipped_acknowledged', 'needs_notification': False}
+
+ # Time-limited suppression
+ still_suppressed = False
+ if resolved_at:
+ try:
+ resolved_dt = datetime.fromisoformat(resolved_at)
+ elapsed_hours = (datetime.now() - resolved_dt).total_seconds() / 3600
+ still_suppressed = elapsed_hours < sup_hours
+ except Exception:
+ pass
+
+ if still_suppressed:
+ return {'type': 'skipped_acknowledged', 'needs_notification': False}
+ else:
+ # Suppression expired.
+ # Journal-sourced errors (logs AND disk I/O) should NOT
+ # re-trigger after suppression. The journal always contains
+ # old messages, so re-creating the error causes an infinite
+ # notification loop. Delete the stale record instead.
+ is_journal_error = (
+ error_key.startswith('log_persistent_')
+ or error_key.startswith('log_spike_')
+ or error_key.startswith('log_cascade_')
+ or error_key.startswith('log_critical_')
+ or error_key.startswith('smart_')
+ or error_key.startswith('disk_')
+ or error_key.startswith('io_error_')
+ or category == 'logs'
+ )
+ if is_journal_error:
+ cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
conn.commit()
- conn.close()
- return event_info
-
- # Record event
- self._record_event(cursor, event_info['type'], error_key,
- {'severity': severity, 'reason': reason})
-
- conn.commit()
- conn.close()
+ return {'type': 'skipped_expired_journal', 'needs_notification': False}
+
+ # For non-log errors (hardware, services, etc.),
+ # re-triggering is correct -- the condition is real and still present.
+ cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
+ cursor.execute('''
+ INSERT INTO errors
+ (error_key, category, severity, reason, details, first_seen, last_seen)
+ VALUES (?, ?, ?, ?, ?, ?, ?)
+ ''', (error_key, category, severity, reason, details_json, now, now))
+ event_info = {'type': 'new', 'needs_notification': True}
+ self._record_event(cursor, 'new', error_key,
+ {'severity': severity, 'reason': reason,
+ 'note': 'Re-triggered after suppression expired'})
+ conn.commit()
+ return event_info
+
+ # Not acknowledged - update existing active error
+ cursor.execute('''
+ UPDATE errors
+ SET last_seen = ?, severity = ?, reason = ?, details = ?
+ WHERE error_key = ? AND acknowledged = 0
+ ''', (now, severity, reason, details_json, error_key))
+
+ # Check if severity escalated
+ if old_severity == 'WARNING' and severity == 'CRITICAL':
+ event_info['type'] = 'escalated'
+ event_info['needs_notification'] = True
+ else:
+ # Insert new error
+ cursor.execute('''
+ INSERT INTO errors
+ (error_key, category, severity, reason, details, first_seen, last_seen)
+ VALUES (?, ?, ?, ?, ?, ?, ?)
+ ''', (error_key, category, severity, reason, details_json, now, now))
+
+ event_info['type'] = 'new'
+ event_info['needs_notification'] = True
+
+ # ─── Auto-suppress: if the category has a non-default setting, ───
+ if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
+ setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
+ if setting_key:
+ stored = self._get_setting_impl(conn, setting_key)
+ if stored is not None:
+ configured_hours = int(stored)
+ if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
+ cursor.execute('''
+ UPDATE errors
+ SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
+ WHERE error_key = ? AND acknowledged = 0
+ ''', (now, configured_hours, error_key))
+
+ if cursor.rowcount > 0:
+ self._record_event(cursor, 'auto_suppressed', error_key, {
+ 'severity': severity,
+ 'reason': reason,
+ 'suppression_hours': configured_hours,
+ 'note': 'Auto-suppressed by user settings'
+ })
+ event_info['type'] = 'auto_suppressed'
+ event_info['needs_notification'] = False
+ conn.commit()
+ return event_info
+
+ # Record event
+ self._record_event(cursor, event_info['type'], error_key,
+ {'severity': severity, 'reason': reason})
+
+ conn.commit()
+ finally:
+ conn.close()
return event_info
@@ -534,22 +550,20 @@ class HealthPersistence:
return self._resolve_error_impl(error_key, reason)
def _resolve_error_impl(self, error_key, reason):
- conn = self._get_conn()
- cursor = conn.cursor()
-
- now = datetime.now().isoformat()
-
- cursor.execute('''
- UPDATE errors
- SET resolved_at = ?
- WHERE error_key = ? AND resolved_at IS NULL
- ''', (now, error_key))
-
- if cursor.rowcount > 0:
- self._record_event(cursor, 'resolved', error_key, {'reason': reason})
-
- conn.commit()
- conn.close()
+ with self._db_connection() as conn:
+ cursor = conn.cursor()
+ now = datetime.now().isoformat()
+
+ cursor.execute('''
+ UPDATE errors
+ SET resolved_at = ?
+ WHERE error_key = ? AND resolved_at IS NULL
+ ''', (now, error_key))
+
+ if cursor.rowcount > 0:
+ self._record_event(cursor, 'resolved', error_key, {'reason': reason})
+
+ conn.commit()
def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
"""
@@ -626,37 +640,34 @@ class HealthPersistence:
we delete the record entirely so it can re-trigger as a fresh
event if the condition returns later.
"""
- conn = self._get_conn()
- cursor = conn.cursor()
-
- now = datetime.now().isoformat()
-
- # Check if this error was acknowledged (dismissed)
- cursor.execute('''
- SELECT acknowledged FROM errors WHERE error_key = ?
- ''', (error_key,))
- row = cursor.fetchone()
-
- if row and row[0] == 1:
- # Dismissed error that naturally resolved - delete entirely
- # so it can re-trigger as a new event if it happens again
- cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
- if cursor.rowcount > 0:
- self._record_event(cursor, 'cleared', error_key,
- {'reason': 'condition_resolved_after_dismiss'})
- else:
- # Normal active error - mark as resolved
+ with self._db_connection() as conn:
+ cursor = conn.cursor()
+ now = datetime.now().isoformat()
+
+ # Check if this error was acknowledged (dismissed)
cursor.execute('''
- UPDATE errors
- SET resolved_at = ?
- WHERE error_key = ? AND resolved_at IS NULL
- ''', (now, error_key))
-
- if cursor.rowcount > 0:
- self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
-
- conn.commit()
- conn.close()
+ SELECT acknowledged FROM errors WHERE error_key = ?
+ ''', (error_key,))
+ row = cursor.fetchone()
+
+ if row and row[0] == 1:
+ # Dismissed error that naturally resolved - delete entirely
+ cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
+ if cursor.rowcount > 0:
+ self._record_event(cursor, 'cleared', error_key,
+ {'reason': 'condition_resolved_after_dismiss'})
+ else:
+ # Normal active error - mark as resolved
+ cursor.execute('''
+ UPDATE errors
+ SET resolved_at = ?
+ WHERE error_key = ? AND resolved_at IS NULL
+ ''', (now, error_key))
+
+ if cursor.rowcount > 0:
+ self._record_event(cursor, 'cleared', error_key, {'reason': 'condition_resolved'})
+
+ conn.commit()
def acknowledge_error(self, error_key: str) -> Dict[str, Any]:
"""
@@ -671,17 +682,19 @@ class HealthPersistence:
def _acknowledge_error_impl(self, error_key):
conn = self._get_conn()
conn.row_factory = sqlite3.Row
- cursor = conn.cursor()
-
- now = datetime.now().isoformat()
-
- # Get current error info before acknowledging
- cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
- row = cursor.fetchone()
-
- result = {'success': False, 'error_key': error_key}
-
- if not row:
+ category = ''
+ sup_hours = self.DEFAULT_SUPPRESSION_HOURS
+ try:
+ cursor = conn.cursor()
+ now = datetime.now().isoformat()
+
+ # Get current error info before acknowledging
+ cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
+ row = cursor.fetchone()
+
+ result = {'success': False, 'error_key': error_key}
+
+ if not row:
# Error not in DB yet -- create a minimal record so the dismiss persists.
# Try to infer category from the error_key prefix.
category = ''
@@ -738,15 +751,6 @@ class HealthPersistence:
'acknowledged_at': now
}
conn.commit()
- conn.close()
-
- # ── Clear cooldowns for newly dismissed errors too ──
- if sup_hours != -1:
- if category == 'disks':
- self._clear_disk_io_cooldown(error_key)
- else:
- self._clear_notification_cooldown(error_key)
-
return result
if row:
@@ -809,21 +813,17 @@ class HealthPersistence:
'suppression_hours': sup_hours
}
- conn.commit()
- conn.close()
-
+ conn.commit()
+ finally:
+ conn.close()
+
# ── Coordinate with notification cooldowns ──
- # When an error is dismissed with non-permanent suppression,
- # clear the corresponding cooldown in notification_last_sent
- # so it can re-notify after the suppression period expires.
- # This applies to ALL categories, not just disks.
if sup_hours != -1:
if category == 'disks':
self._clear_disk_io_cooldown(error_key)
else:
- # For non-disk categories, clear the PollingCollector cooldown
self._clear_notification_cooldown(error_key)
-
+
return result
def is_error_acknowledged(self, error_key: str) -> bool:
@@ -928,12 +928,13 @@ class HealthPersistence:
def _cleanup_old_errors_impl(self):
conn = self._get_conn()
- cursor = conn.cursor()
-
- now = datetime.now()
- now_iso = now.isoformat()
-
- # Delete resolved errors older than 7 days
+ try:
+ cursor = conn.cursor()
+
+ now = datetime.now()
+ now_iso = now.isoformat()
+
+ # Delete resolved errors older than 7 days
cutoff_resolved = (now - timedelta(days=7)).isoformat()
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
@@ -1127,12 +1128,13 @@ class HealthPersistence:
except Exception:
pass # If we can't read uptime, skip this cleanup
- conn.commit()
- conn.close()
-
+ conn.commit()
+ finally:
+ conn.close()
+
# Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
self._cleanup_stale_resources()
-
+
# Clean up disk observations for devices that no longer exist
self.cleanup_orphan_observations()
@@ -2174,6 +2176,8 @@ class HealthPersistence:
last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'
# Upsert observation: if same (disk, type, signature), bump count + update last timestamp
+ # IMPORTANT: Do NOT reset dismissed — if the user dismissed this observation,
+ # re-detecting the same journal entry must not un-dismiss it.
cursor.execute(f'''
INSERT INTO disk_observations
(disk_registry_id, {type_col}, error_signature, {first_col},
@@ -2182,8 +2186,7 @@ class HealthPersistence:
ON CONFLICT(disk_registry_id, {type_col}, error_signature) DO UPDATE SET
{last_col} = excluded.{last_col},
occurrence_count = occurrence_count + 1,
- severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
- dismissed = 0
+ severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END
''', (disk_id, error_type, error_signature, now, now, raw_message, severity))
conn.commit()