diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh
index 849b5f97..0f540547 100644
--- a/AppImage/scripts/build_appimage.sh
+++ b/AppImage/scripts/build_appimage.sh
@@ -90,7 +90,7 @@ cp "$SCRIPT_DIR/hardware_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "
 cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_storage_monitor.py not found"
 cp "$SCRIPT_DIR/flask_script_runner.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_script_runner.py not found"
 cp "$SCRIPT_DIR/security_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ security_manager.py not found"
-cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠��� flask_security_routes.py not found"
+cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_security_routes.py not found"
 cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_manager.py not found"
 cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found"
 cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found"
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index c4f1a148..eee9dc6b 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -33,6 +33,19 @@ except ImportError:
 # ============================================================================
 
 DEBUG_PERF = False
+# Startup grace period: suppress transient issues during boot
+# This is set when the module loads (service start)
+_MODULE_START_TIME = time.time()
+_STARTUP_HEALTH_GRACE_SECONDS = 300 # 5 minutes
+
+def _is_startup_health_grace() -> bool:
+    """Check if we're within the startup health grace period (5 min). 
+
+    Used to downgrade transient errors (high latency, storage not ready)
+    to INFO level during system boot, preventing false CRITICAL alerts.
+    """
+    return (time.time() - _MODULE_START_TIME) < _STARTUP_HEALTH_GRACE_SECONDS
+
 def _perf_log(section: str, elapsed_ms: float):
     """Log performance timing for a section. Only logs if DEBUG_PERF is True."""
     if DEBUG_PERF:
@@ -2512,12 +2525,24 @@ class HealthMonitor:
             return loss_result
 
         # Evaluate latency thresholds
+        # During startup grace period, downgrade CRITICAL/WARNING to INFO
+        # to avoid false alerts from transient boot-time latency spikes
+        in_grace_period = _is_startup_health_grace()
+
         if avg_latency > self.NETWORK_LATENCY_CRITICAL:
-            status = 'CRITICAL'
-            reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
+            if in_grace_period:
+                status = 'INFO'
+                reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
+            else:
+                status = 'CRITICAL'
+                reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
         elif avg_latency > self.NETWORK_LATENCY_WARNING:
-            status = 'WARNING'
-            reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
+            if in_grace_period:
+                status = 'INFO'
+                reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
+            else:
+                status = 'WARNING'
+                reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
         else:
             status = 'OK'
             reason = None
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py
index 715f7ce8..180a487f 100644
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -221,7 +221,7 @@ def capture_journal_context(keywords: list, lines: int = 30,
     return ""
 
 
-# ─── Journal Watcher (Real-time) ─────────────────────────────────
+# ─── Journal Watcher (Real-time) ─────────────────────────────────
 
 class JournalWatcher:
     """Watches journald in real-time for critical system events. 
@@ -1640,13 +1640,9 @@ class TaskWatcher:
         # let PollingCollector emit one "System startup: X VMs, Y CTs started".
         _STARTUP_EVENTS = {'vm_start', 'ct_start'}
         if event_type in _STARTUP_EVENTS and not is_error:
-            is_startup = _shared_state.is_startup_period()
-            elapsed = time.time() - _shared_state._startup_time
-            print(f"[TaskWatcher] {event_type} for {vmid}: is_startup_period={is_startup}, elapsed={elapsed:.1f}s")
-            if is_startup:
+            if _shared_state.is_startup_period():
                 vm_type = 'ct' if event_type == 'ct_start' else 'vm'
                 _shared_state.add_startup_vm(vmid, vmname or f'ID {vmid}', vm_type)
-                print(f"[TaskWatcher] Aggregated {event_type} for {vmid}, total pending: {len(_shared_state._startup_vms)}")
                 return
 
         self._queue.put(NotificationEvent(
@@ -2189,16 +2185,11 @@ class PollingCollector:
         if _shared_state.was_startup_aggregated():
             return
 
-        print(f"[PollingCollector] Startup period ended, checking for aggregated VMs...")
-
         # Get all collected startup VMs/CTs
        startup_items = _shared_state.get_and_clear_startup_vms()
         if not startup_items:
-            print(f"[PollingCollector] No VMs/CTs collected during startup period")
             return
 
-        print(f"[PollingCollector] Emitting aggregated startup notification for {len(startup_items)} items")
-
         # Count VMs and CTs
         vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
         cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
@@ -2289,7 +2280,7 @@ class PollingCollector:
         if total == 0:
             return
 
-        # ── Parse every Inst line ────────────────��─────────────
+        # ── Parse every Inst line ──────────────────────────────
         all_pkgs: list[dict] = [] # {name, cur, new}
         security_pkgs: list[dict] = []
         pve_pkgs: list[dict] = []