From d33741a90d8e25614b2d11252c24060d94b46ea5 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Sun, 22 Mar 2026 14:20:47 +0100 Subject: [PATCH] Update notification service --- AppImage/scripts/flask_server.py | 268 ++++++++++++++++++------ AppImage/scripts/health_monitor.py | 74 +++++-- AppImage/scripts/notification_events.py | 8 +- 3 files changed, 258 insertions(+), 92 deletions(-) diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 644d55cf..bbb44703 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -1045,6 +1045,150 @@ _system_info_cache = { } _SYSTEM_INFO_CACHE_TTL = 21600 # 6 hours - update notifications are sent once per 24h +# Cache for pvesh cluster resources (reduces repeated API calls) +_pvesh_cache = { + 'cluster_resources_vm': None, + 'cluster_resources_vm_time': 0, + 'cluster_resources_storage': None, + 'cluster_resources_storage_time': 0, + 'storage_list': None, + 'storage_list_time': 0, +} +_PVESH_CACHE_TTL = 30 # 30 seconds - balances freshness with performance + +# Cache for sensors output (temperature readings) +_sensors_cache = { + 'output': None, + 'time': 0, +} +_SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly + +# Cache for hardware info (lspci, dmidecode, lsblk) +_hardware_cache = { + 'lspci': None, + 'lspci_time': 0, + 'dmidecode': None, + 'dmidecode_time': 0, + 'lsblk': None, + 'lsblk_time': 0, +} +_HARDWARE_CACHE_TTL = 300 # 5 minutes - hardware doesn't change + + +def get_cached_pvesh_cluster_resources_vm(): + """Get cluster VM resources with 30s cache.""" + global _pvesh_cache + now = time.time() + + if _pvesh_cache['cluster_resources_vm'] is not None and \ + now - _pvesh_cache['cluster_resources_vm_time'] < _PVESH_CACHE_TTL: + return _pvesh_cache['cluster_resources_vm'] + + try: + result = subprocess.run( + ['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], + capture_output=True, text=True, timeout=10 + ) + if result.returncode == 0: + data = json.loads(result.stdout) + _pvesh_cache['cluster_resources_vm'] = data + _pvesh_cache['cluster_resources_vm_time'] = now + return data + except Exception: + pass + return _pvesh_cache['cluster_resources_vm'] or [] + + +def get_cached_sensors_output(): + """Get sensors output with 10s cache.""" + global _sensors_cache + now = time.time() + + if _sensors_cache['output'] is not None and \ + now - _sensors_cache['time'] < _SENSORS_CACHE_TTL: + return _sensors_cache['output'] + + try: + result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5) + if result.returncode == 0: + _sensors_cache['output'] = result.stdout + _sensors_cache['time'] = now + return result.stdout + except Exception: + pass + return _sensors_cache['output'] or '' + + +def get_cached_lspci(): + """Get lspci output with 5 minute cache.""" + global _hardware_cache + now = time.time() + + if _hardware_cache['lspci'] is not None and \ + now - _hardware_cache['lspci_time'] < _HARDWARE_CACHE_TTL: + return _hardware_cache['lspci'] + + try: + result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=10) + if result.returncode == 0: + _hardware_cache['lspci'] = result.stdout + _hardware_cache['lspci_time'] = now + return result.stdout + except Exception: + pass + return _hardware_cache['lspci'] or '' + + +def get_cached_lspci_vmm(): + """Get lspci -vmm output with 5 minute cache.""" + global _hardware_cache + now = time.time() + + cache_key = 'lspci_vmm' + if cache_key not in _hardware_cache: + _hardware_cache[cache_key] = None + _hardware_cache[cache_key + '_time'] = 0 + + if _hardware_cache[cache_key] is not None and \ + now - _hardware_cache[cache_key + '_time'] < _HARDWARE_CACHE_TTL: + return _hardware_cache[cache_key] + + try: + result = subprocess.run(['lspci', '-vmm'], capture_output=True, text=True, timeout=10) + if result.returncode == 0: + _hardware_cache[cache_key] = result.stdout + _hardware_cache[cache_key + '_time'] = now + return result.stdout + except Exception: + pass + return _hardware_cache[cache_key] or '' + + +def get_cached_lspci_k(): + """Get lspci -k output with 5 minute cache.""" + global _hardware_cache + now = time.time() + + cache_key = 'lspci_k' + if cache_key not in _hardware_cache: + _hardware_cache[cache_key] = None + _hardware_cache[cache_key + '_time'] = 0 + + if _hardware_cache[cache_key] is not None and \ + now - _hardware_cache[cache_key + '_time'] < _HARDWARE_CACHE_TTL: + return _hardware_cache[cache_key] + + try: + result = subprocess.run(['lspci', '-k'], capture_output=True, text=True, timeout=10) + if result.returncode == 0: + _hardware_cache[cache_key] = result.stdout + _hardware_cache[cache_key + '_time'] = now + return result.stdout + except Exception: + pass + return _hardware_cache[cache_key] or '' + + def get_proxmox_version(): """Get Proxmox version if available. Cached for 6 hours.""" global _system_info_cache @@ -1237,11 +1381,8 @@ def get_vm_lxc_names(): # local_node = socket.gethostname() local_node = get_proxmox_node_name() - result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10) - - if result.returncode == 0: - resources = json.loads(result.stdout) + resources = get_cached_pvesh_cluster_resources_vm() + if resources: for resource in resources: node = resource.get('node', '') if node != local_node: @@ -3247,18 +3388,15 @@ def get_proxmox_vms(): # local_node = socket.gethostname() local_node = get_proxmox_node_name() - # print(f"[v0] Local node detected: {local_node}") - pass - - result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10) - - if result.returncode == 0: - resources = json.loads(result.stdout) - for resource in resources: - node = resource.get('node', '') - if node != local_node: - # print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}") + # print(f"[v0] Local node detected: {local_node}") + pass + + resources = get_cached_pvesh_cluster_resources_vm() + if resources: + for resource in resources: + node = resource.get('node', '') + if node != local_node: + # print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}") pass continue @@ -3696,13 +3834,13 @@ def get_temperature_info(): power_meter = None try: - result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5) - if result.returncode == 0: + sensors_output = get_cached_sensors_output() + if sensors_output: current_adapter = None current_chip = None current_sensor = None - for line in result.stdout.split('\n'): + for line in sensors_output.split('\n'): line = line.strip() if not line: continue @@ -4931,9 +5069,9 @@ def get_gpu_info(): gpus = [] try: - result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=5) - if result.returncode == 0: - for line in result.stdout.split('\n'): + lspci_output = get_cached_lspci() + if lspci_output: + for line in lspci_output.split('\n'): # Match VGA, 3D, Display controllers if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']): @@ -4984,11 +5122,11 @@ def get_gpu_info(): pass try: - result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5) - if result.returncode == 0: + sensors_output = get_cached_sensors_output() + if sensors_output: current_adapter = None - for line in result.stdout.split('\n'): + for line in sensors_output.split('\n'): line = line.strip() if not line: continue @@ -5399,9 +5537,9 @@ def get_hardware_info(): }) # Always check lspci for all GPUs (integrated and discrete) - result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=5) - if result.returncode == 0: - for line in result.stdout.split('\n'): + lspci_output = get_cached_lspci() + if lspci_output: + for line in lspci_output.split('\n'): # Match VGA, 3D, Display controllers if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']): parts = line.split(':', 2) @@ -5453,10 +5591,10 @@ def get_hardware_info(): # print("[v0] Getting PCI devices with driver information...") pass # First get basic device info with lspci -vmm - result = subprocess.run(['lspci', '-vmm'], capture_output=True, text=True, timeout=10) - if result.returncode == 0: + lspci_vmm_output = get_cached_lspci_vmm() + if lspci_vmm_output: current_device = {} - for line in result.stdout.split('\n'): + for line in lspci_vmm_output.split('\n'): line = line.strip() if not line: @@ -5523,13 +5661,13 @@ def get_hardware_info(): current_device[key.strip()] = value.strip() # Now get driver information with lspci -k - result_k = subprocess.run(['lspci', '-k'], capture_output=True, text=True, timeout=10) - if result_k.returncode == 0: + lspci_k_output = get_cached_lspci_k() + if lspci_k_output: current_slot = None current_driver = None current_module = None - for line in result_k.stdout.split('\n'): + for line in lspci_k_output.split('\n'): # Match PCI slot line (e.g., "00:1f.2 SATA controller: ...") if line and not line.startswith('\t'): parts = line.split(' ', 1) @@ -5579,17 +5717,17 @@ def get_hardware_info(): 'critical': entry.critical if entry.critical else 0 }) - # print(f"[v0] Temperature sensors: {len(hardware_data['sensors']['temperatures'])} found") - pass - - try: - result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5) - if result.returncode == 0: - current_adapter = None - current_chip = None # Add chip name tracking - fans = [] - - for line in result.stdout.split('\n'): + # print(f"[v0] Temperature sensors: {len(hardware_data['sensors']['temperatures'])} found") + pass + + try: + sensors_output = get_cached_sensors_output() + if sensors_output: + current_adapter = None + current_chip = None # Add chip name tracking + fans = [] + + for line in sensors_output.split('\n'): line = line.strip() if not line: continue @@ -6634,10 +6772,8 @@ def api_create_backup(vmid): # Try to find VM in cluster resources try: - result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10) - if result.returncode == 0: - vms = json.loads(result.stdout) + vms = get_cached_pvesh_cluster_resources_vm() + if vms: for vm in vms: if vm.get('vmid') == vmid: node = vm.get('node') @@ -7484,11 +7620,9 @@ def api_vm_logs(vmid): """Download real logs for a specific VM/LXC (not task history)""" try: # Get VM type and node - result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10) + resources = get_cached_pvesh_cluster_resources_vm() - if result.returncode == 0: - resources = json.loads(result.stdout) + if resources: vm_info = None for resource in resources: if resource.get('vmid') == vmid: @@ -7536,17 +7670,15 @@ def api_vm_control(vmid): data = request.get_json() action = data.get('action') # start, stop, shutdown, reboot - if action not in ['start', 'stop', 'shutdown', 'reboot']: - return jsonify({'error': 'Invalid action'}), 400 - - # Get VM type and node - result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10) - - if result.returncode == 0: - resources = json.loads(result.stdout) - vm_info = None - for resource in resources: + if action not in ['start', 'stop', 'shutdown', 'reboot']: + return jsonify({'error': 'Invalid action'}), 400 + + # Get VM type and node + resources = get_cached_pvesh_cluster_resources_vm() + + if resources: + vm_info = None + for resource in resources: if resource.get('vmid') == vmid: vm_info = resource break @@ -7590,11 +7722,9 @@ def api_vm_config_update(vmid): description = data.get('description', '') # Get VM type and node - result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10) + resources = get_cached_pvesh_cluster_resources_vm() - if result.returncode == 0: - resources = json.loads(result.stdout) + if resources: vm_info = None for resource in resources: if resource.get('vmid') == vmid: diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index b93ebed5..eb032961 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -215,6 +215,14 @@ class HealthMonitor: self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls + # SMART check cache - reduces disk queries from every 5 min to every 30 min + self._smart_cache = {} # {disk_name: {'result': 'PASSED', 'time': timestamp}} + self._SMART_CACHE_TTL = 1800 # 30 minutes - disk health changes slowly + + # Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour + self._journalctl_24h_cache = {'count': 0, 'time': 0} + self._JOURNALCTL_24H_CACHE_TTL = 3600 # 1 hour - login attempts aggregate slowly + # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) # SMART detection still uses filesystem check on init (lightweight) has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl') @@ -1758,9 +1766,20 @@ class HealthMonitor: return '' def _quick_smart_health(self, disk_name: str) -> str: - """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.""" + """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'. + + Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes. + """ if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'): return 'UNKNOWN' + + # Check cache first + current_time = time.time() + cache_key = disk_name + cached = self._smart_cache.get(cache_key) + if cached and current_time - cached['time'] < self._SMART_CACHE_TTL: + return cached['result'] + try: dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name result = subprocess.run( @@ -1771,10 +1790,15 @@ class HealthMonitor: data = _json.loads(result.stdout) passed = data.get('smart_status', {}).get('passed', None) if passed is True: - return 'PASSED' + smart_result = 'PASSED' elif passed is False: - return 'FAILED' - return 'UNKNOWN' + smart_result = 'FAILED' + else: + smart_result = 'UNKNOWN' + + # Cache the result + self._smart_cache[cache_key] = {'result': smart_result, 'time': current_time} + return smart_result except Exception: return 'UNKNOWN' @@ -3960,24 +3984,36 @@ class HealthMonitor: issues.append(cert_reason or 'Certificate issue') # Sub-check 3: Failed login attempts (brute force detection) + # Cached for 1 hour to avoid reading 24h of logs every 5 minutes try: - result = subprocess.run( - ['journalctl', '--since', '24 hours ago', '--no-pager', - '-g', 'authentication failure|failed password|invalid user', - '--output=cat', '-n', '5000'], - capture_output=True, - text=True, - timeout=20 - ) + current_time = time.time() - failed_logins = 0 - if result.returncode == 0: - for line in result.stdout.split('\n'): - line_lower = line.lower() - if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower: - failed_logins += 1 + # Check if we have a valid cached result + if self._journalctl_24h_cache['time'] > 0 and \ + current_time - self._journalctl_24h_cache['time'] < self._JOURNALCTL_24H_CACHE_TTL: + failed_logins = self._journalctl_24h_cache['count'] + else: + # Cache expired or first run - read full 24h logs + result = subprocess.run( + ['journalctl', '--since', '24 hours ago', '--no-pager', + '-g', 'authentication failure|failed password|invalid user', + '--output=cat', '-n', '5000'], + capture_output=True, + text=True, + timeout=20 + ) - if failed_logins > 50: + failed_logins = 0 + if result.returncode == 0: + for line in result.stdout.split('\n'): + line_lower = line.lower() + if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower: + failed_logins += 1 + + # Cache the result + self._journalctl_24h_cache = {'count': failed_logins, 'time': current_time} + + if failed_logins > 50: msg = f'{failed_logins} failed login attempts in 24h' issues.append(msg) checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True} diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index a52e4a3e..2ea148f3 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -1396,10 +1396,10 @@ class TaskWatcher: # 2. Check active file for newly started tasks (backup start) self._check_active_tasks() - except Exception as e: - print(f"[TaskWatcher] Error reading task log: {e}") - - time.sleep(2) # Check every 2 seconds + except Exception as e: + print(f"[TaskWatcher] Error reading task log: {e}") + + time.sleep(5) # Check every 5 seconds (reduced from 2s for efficiency) def _check_active_tasks(self): """Scan /var/log/pve/tasks/active to track vzdump for VM suppression.