From d33741a90d8e25614b2d11252c24060d94b46ea5 Mon Sep 17 00:00:00 2001
From: MacRimi <ricoextincion@gmail.com>
Date: Sun, 22 Mar 2026 14:20:47 +0100
Subject: [PATCH] Update notification service

---
 AppImage/scripts/flask_server.py        | 268 ++++++++++++++++++------
 AppImage/scripts/health_monitor.py      |  74 +++++--
 AppImage/scripts/notification_events.py |   8 +-
 3 files changed, 258 insertions(+), 92 deletions(-)

diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py
index 644d55cf..bbb44703 100644
--- a/AppImage/scripts/flask_server.py
+++ b/AppImage/scripts/flask_server.py
@@ -1045,6 +1045,150 @@ _system_info_cache = {
 }
 _SYSTEM_INFO_CACHE_TTL = 21600  # 6 hours - update notifications are sent once per 24h
 
+# Cache for pvesh cluster resources (reduces repeated API calls)
+_pvesh_cache = {
+    'cluster_resources_vm': None,
+    'cluster_resources_vm_time': 0,
+    'cluster_resources_storage': None,
+    'cluster_resources_storage_time': 0,
+    'storage_list': None,
+    'storage_list_time': 0,
+}
+_PVESH_CACHE_TTL = 30  # 30 seconds - balances freshness with performance
+
+# Cache for sensors output (temperature readings)
+_sensors_cache = {
+    'output': None,
+    'time': 0,
+}
+_SENSORS_CACHE_TTL = 10  # 10 seconds - temperature changes slowly
+
+# Cache for hardware info (lspci, dmidecode, lsblk)
+_hardware_cache = {
+    'lspci': None,
+    'lspci_time': 0,
+    'dmidecode': None,
+    'dmidecode_time': 0,
+    'lsblk': None,
+    'lsblk_time': 0,
+}
+_HARDWARE_CACHE_TTL = 300  # 5 minutes - hardware doesn't change
+
+
+def get_cached_pvesh_cluster_resources_vm():
+    """Get cluster VM resources with 30s cache."""
+    global _pvesh_cache
+    now = time.time()
+    
+    if _pvesh_cache['cluster_resources_vm'] is not None and \
+       now - _pvesh_cache['cluster_resources_vm_time'] < _PVESH_CACHE_TTL:
+        return _pvesh_cache['cluster_resources_vm']
+    
+    try:
+        result = subprocess.run(
+            ['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
+            capture_output=True, text=True, timeout=10
+        )
+        if result.returncode == 0:
+            data = json.loads(result.stdout)
+            _pvesh_cache['cluster_resources_vm'] = data
+            _pvesh_cache['cluster_resources_vm_time'] = now
+            return data
+    except Exception:
+        pass
+    return _pvesh_cache['cluster_resources_vm'] or []
+
+
+def get_cached_sensors_output():
+    """Get sensors output with 10s cache."""
+    global _sensors_cache
+    now = time.time()
+    
+    if _sensors_cache['output'] is not None and \
+       now - _sensors_cache['time'] < _SENSORS_CACHE_TTL:
+        return _sensors_cache['output']
+    
+    try:
+        result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
+        if result.returncode == 0:
+            _sensors_cache['output'] = result.stdout
+            _sensors_cache['time'] = now
+            return result.stdout
+    except Exception:
+        pass
+    return _sensors_cache['output'] or ''
+
+
+def get_cached_lspci():
+    """Get lspci output with 5 minute cache."""
+    global _hardware_cache
+    now = time.time()
+    
+    if _hardware_cache['lspci'] is not None and \
+       now - _hardware_cache['lspci_time'] < _HARDWARE_CACHE_TTL:
+        return _hardware_cache['lspci']
+    
+    try:
+        result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=10)
+        if result.returncode == 0:
+            _hardware_cache['lspci'] = result.stdout
+            _hardware_cache['lspci_time'] = now
+            return result.stdout
+    except Exception:
+        pass
+    return _hardware_cache['lspci'] or ''
+
+
+def get_cached_lspci_vmm():
+    """Get lspci -vmm output with 5 minute cache."""
+    global _hardware_cache
+    now = time.time()
+    
+    cache_key = 'lspci_vmm'
+    if cache_key not in _hardware_cache:
+        _hardware_cache[cache_key] = None
+        _hardware_cache[cache_key + '_time'] = 0
+    
+    if _hardware_cache[cache_key] is not None and \
+       now - _hardware_cache[cache_key + '_time'] < _HARDWARE_CACHE_TTL:
+        return _hardware_cache[cache_key]
+    
+    try:
+        result = subprocess.run(['lspci', '-vmm'], capture_output=True, text=True, timeout=10)
+        if result.returncode == 0:
+            _hardware_cache[cache_key] = result.stdout
+            _hardware_cache[cache_key + '_time'] = now
+            return result.stdout
+    except Exception:
+        pass
+    return _hardware_cache[cache_key] or ''
+
+
+def get_cached_lspci_k():
+    """Get lspci -k output with 5 minute cache."""
+    global _hardware_cache
+    now = time.time()
+    
+    cache_key = 'lspci_k'
+    if cache_key not in _hardware_cache:
+        _hardware_cache[cache_key] = None
+        _hardware_cache[cache_key + '_time'] = 0
+    
+    if _hardware_cache[cache_key] is not None and \
+       now - _hardware_cache[cache_key + '_time'] < _HARDWARE_CACHE_TTL:
+        return _hardware_cache[cache_key]
+    
+    try:
+        result = subprocess.run(['lspci', '-k'], capture_output=True, text=True, timeout=10)
+        if result.returncode == 0:
+            _hardware_cache[cache_key] = result.stdout
+            _hardware_cache[cache_key + '_time'] = now
+            return result.stdout
+    except Exception:
+        pass
+    return _hardware_cache[cache_key] or ''
+
+
 def get_proxmox_version():
     """Get Proxmox version if available. Cached for 6 hours."""
     global _system_info_cache
@@ -1237,11 +1381,8 @@ def get_vm_lxc_names():
         # local_node = socket.gethostname()
         local_node = get_proxmox_node_name()
         
-        result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], 
-                              capture_output=True, text=True, timeout=10)
-        
-        if result.returncode == 0:
-            resources = json.loads(result.stdout)
+        resources = get_cached_pvesh_cluster_resources_vm()
+        if resources:
             for resource in resources:
                 node = resource.get('node', '')
                 if node != local_node:
@@ -3247,18 +3388,15 @@ def get_proxmox_vms():
             # local_node = socket.gethostname()
             local_node = get_proxmox_node_name()
 
-            # print(f"[v0] Local node detected: {local_node}")
-            pass
-            
-            result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], 
-                                  capture_output=True, text=True, timeout=10)
-            
-            if result.returncode == 0:
-                resources = json.loads(result.stdout)
-                for resource in resources:
-                    node = resource.get('node', '')
-                    if node != local_node:
-                        # print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}")
+        # print(f"[v0] Local node detected: {local_node}")
+        pass
+        
+        resources = get_cached_pvesh_cluster_resources_vm()
+        if resources:
+            for resource in resources:
+                node = resource.get('node', '')
+                if node != local_node:
+                    # print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}")
                         pass
                         continue
                     
@@ -3696,13 +3834,13 @@ def get_temperature_info():
     power_meter = None
     
     try:
-        result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
-        if result.returncode == 0:
+        sensors_output = get_cached_sensors_output()
+        if sensors_output:
             current_adapter = None
             current_chip = None
             current_sensor = None
             
-            for line in result.stdout.split('\n'):
+            for line in sensors_output.split('\n'):
                 line = line.strip()
                 if not line:
                     continue
@@ -4931,9 +5069,9 @@ def get_gpu_info():
     gpus = []
     
     try:
-        result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=5)
-        if result.returncode == 0:
-            for line in result.stdout.split('\n'):
+        lspci_output = get_cached_lspci()
+        if lspci_output:
+            for line in lspci_output.split('\n'):
                 # Match VGA, 3D, Display controllers
                 if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
 
@@ -4984,11 +5122,11 @@ def get_gpu_info():
         pass
     
     try:
-        result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
-        if result.returncode == 0:
+        sensors_output = get_cached_sensors_output()
+        if sensors_output:
             current_adapter = None
             
-            for line in result.stdout.split('\n'):
+            for line in sensors_output.split('\n'):
                 line = line.strip()
                 if not line:
                     continue
@@ -5399,9 +5537,9 @@ def get_hardware_info():
                             })
             
             # Always check lspci for all GPUs (integrated and discrete)
-            result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=5)
-            if result.returncode == 0:
-                for line in result.stdout.split('\n'):
+            lspci_output = get_cached_lspci()
+            if lspci_output:
+                for line in lspci_output.split('\n'):
                     # Match VGA, 3D, Display controllers
                     if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
                         parts = line.split(':', 2)
@@ -5453,10 +5591,10 @@ def get_hardware_info():
             # print("[v0] Getting PCI devices with driver information...")
             pass
             # First get basic device info with lspci -vmm
-            result = subprocess.run(['lspci', '-vmm'], capture_output=True, text=True, timeout=10)
-            if result.returncode == 0:
+            lspci_vmm_output = get_cached_lspci_vmm()
+            if lspci_vmm_output:
                 current_device = {}
-                for line in result.stdout.split('\n'):
+                for line in lspci_vmm_output.split('\n'):
                     line = line.strip()
                     
                     if not line:
@@ -5523,13 +5661,13 @@ def get_hardware_info():
                         current_device[key.strip()] = value.strip()
             
             # Now get driver information with lspci -k
-            result_k = subprocess.run(['lspci', '-k'], capture_output=True, text=True, timeout=10)
-            if result_k.returncode == 0:
+            lspci_k_output = get_cached_lspci_k()
+            if lspci_k_output:
                 current_slot = None
                 current_driver = None
                 current_module = None
                 
-                for line in result_k.stdout.split('\n'):
+                for line in lspci_k_output.split('\n'):
                     # Match PCI slot line (e.g., "00:1f.2 SATA controller: ...")
                     if line and not line.startswith('\t'):
                         parts = line.split(' ', 1)
@@ -5579,17 +5717,17 @@ def get_hardware_info():
                                 'critical': entry.critical if entry.critical else 0
                             })
                     
-                    # print(f"[v0] Temperature sensors: {len(hardware_data['sensors']['temperatures'])} found")
-                    pass
-            
-            try:
-                result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
-                if result.returncode == 0:
-                    current_adapter = None
-                    current_chip = None  # Add chip name tracking
-                    fans = []
-                    
-                    for line in result.stdout.split('\n'):
+        # print(f"[v0] Temperature sensors: {len(hardware_data['sensors']['temperatures'])} found")
+        pass
+        
+        try:
+            sensors_output = get_cached_sensors_output()
+            if sensors_output:
+                current_adapter = None
+                current_chip = None  # Add chip name tracking
+                fans = []
+                
+                for line in sensors_output.split('\n'):
                         line = line.strip()
                         if not line:
                             continue
@@ -6634,10 +6772,8 @@ def api_create_backup(vmid):
         
         # Try to find VM in cluster resources
         try:
-            result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
-                                  capture_output=True, text=True, timeout=10)
-            if result.returncode == 0:
-                vms = json.loads(result.stdout)
+            vms = get_cached_pvesh_cluster_resources_vm()
+            if vms:
                 for vm in vms:
                     if vm.get('vmid') == vmid:
                         node = vm.get('node')
@@ -7484,11 +7620,9 @@ def api_vm_logs(vmid):
     """Download real logs for a specific VM/LXC (not task history)"""
     try:
         # Get VM type and node
-        result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], 
-                              capture_output=True, text=True, timeout=10)
+        resources = get_cached_pvesh_cluster_resources_vm()
         
-        if result.returncode == 0:
-            resources = json.loads(result.stdout)
+        if resources:
             vm_info = None
             for resource in resources:
                 if resource.get('vmid') == vmid:
@@ -7536,17 +7670,15 @@ def api_vm_control(vmid):
         data = request.get_json()
         action = data.get('action')  # start, stop, shutdown, reboot
         
-        if action not in ['start', 'stop', 'shutdown', 'reboot']:
-            return jsonify({'error': 'Invalid action'}), 400
-        
-        # Get VM type and node
-        result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], 
-                              capture_output=True, text=True, timeout=10)
-        
-        if result.returncode == 0:
-            resources = json.loads(result.stdout)
-            vm_info = None
-            for resource in resources:
+    if action not in ['start', 'stop', 'shutdown', 'reboot']:
+        return jsonify({'error': 'Invalid action'}), 400
+    
+    # Get VM type and node
+    resources = get_cached_pvesh_cluster_resources_vm()
+    
+    if resources:
+        vm_info = None
+        for resource in resources:
                 if resource.get('vmid') == vmid:
                     vm_info = resource
                     break
@@ -7590,11 +7722,9 @@ def api_vm_config_update(vmid):
         description = data.get('description', '')
         
         # Get VM type and node
-        result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], 
-                              capture_output=True, text=True, timeout=10)
+        resources = get_cached_pvesh_cluster_resources_vm()
         
-        if result.returncode == 0:
-            resources = json.loads(result.stdout)
+        if resources:
             vm_info = None
             for resource in resources:
                 if resource.get('vmid') == vmid:
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index b93ebed5..eb032961 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -215,6 +215,14 @@ class HealthMonitor:
         self._unknown_counts = {}  # Track consecutive UNKNOWN cycles per category
         self._last_cleanup_time = 0  # Throttle cleanup_old_errors calls
         
+        # SMART check cache - reduces disk queries from every 5 min to every 30 min
+        self._smart_cache = {}  # {disk_name: {'result': 'PASSED', 'time': timestamp}}
+        self._SMART_CACHE_TTL = 1800  # 30 minutes - disk health changes slowly
+        
+        # Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
+        self._journalctl_24h_cache = {'count': 0, 'time': 0}
+        self._JOURNALCTL_24H_CACHE_TTL = 3600  # 1 hour - login attempts aggregate slowly
+        
         # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
         # SMART detection still uses filesystem check on init (lightweight)
         has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl')
@@ -1758,9 +1766,20 @@ class HealthMonitor:
             return ''
     
     def _quick_smart_health(self, disk_name: str) -> str:
-        """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'."""
+        """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.
+        
+        Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
+        """
         if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
             return 'UNKNOWN'
+        
+        # Check cache first
+        current_time = time.time()
+        cache_key = disk_name
+        cached = self._smart_cache.get(cache_key)
+        if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
+            return cached['result']
+        
         try:
             dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
             result = subprocess.run(
@@ -1771,10 +1790,15 @@ class HealthMonitor:
             data = _json.loads(result.stdout)
             passed = data.get('smart_status', {}).get('passed', None)
             if passed is True:
-                return 'PASSED'
+                smart_result = 'PASSED'
             elif passed is False:
-                return 'FAILED'
-            return 'UNKNOWN'
+                smart_result = 'FAILED'
+            else:
+                smart_result = 'UNKNOWN'
+            
+            # Cache the result
+            self._smart_cache[cache_key] = {'result': smart_result, 'time': current_time}
+            return smart_result
         except Exception:
             return 'UNKNOWN'
 
@@ -3960,24 +3984,36 @@ class HealthMonitor:
                     issues.append(cert_reason or 'Certificate issue')
             
             # Sub-check 3: Failed login attempts (brute force detection)
+            # Cached for 1 hour to avoid reading 24h of logs every 5 minutes
             try:
-                result = subprocess.run(
-                    ['journalctl', '--since', '24 hours ago', '--no-pager',
-                     '-g', 'authentication failure|failed password|invalid user',
-                     '--output=cat', '-n', '5000'],
-                    capture_output=True,
-                    text=True,
-                    timeout=20
-                )
+                current_time = time.time()
                 
-                failed_logins = 0
-                if result.returncode == 0:
-                    for line in result.stdout.split('\n'):
-                        line_lower = line.lower()
-                        if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower:
-                            failed_logins += 1
+                # Check if we have a valid cached result
+                if self._journalctl_24h_cache['time'] > 0 and \
+                   current_time - self._journalctl_24h_cache['time'] < self._JOURNALCTL_24H_CACHE_TTL:
+                    failed_logins = self._journalctl_24h_cache['count']
+                else:
+                    # Cache expired or first run - read full 24h logs
+                    result = subprocess.run(
+                        ['journalctl', '--since', '24 hours ago', '--no-pager',
+                         '-g', 'authentication failure|failed password|invalid user',
+                         '--output=cat', '-n', '5000'],
+                        capture_output=True,
+                        text=True,
+                        timeout=20
+                    )
                     
-                    if failed_logins > 50:
+                    failed_logins = 0
+                    if result.returncode == 0:
+                        for line in result.stdout.split('\n'):
+                            line_lower = line.lower()
+                            if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower:
+                                failed_logins += 1
+                    
+                    # Cache the result
+                    self._journalctl_24h_cache = {'count': failed_logins, 'time': current_time}
+                
+                if failed_logins > 50:
                         msg = f'{failed_logins} failed login attempts in 24h'
                         issues.append(msg)
                         checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py
index a52e4a3e..2ea148f3 100644
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -1396,10 +1396,10 @@ class TaskWatcher:
                 # 2. Check active file for newly started tasks (backup start)
                 self._check_active_tasks()
                 
-            except Exception as e:
-                print(f"[TaskWatcher] Error reading task log: {e}")
-            
-            time.sleep(2)  # Check every 2 seconds
+                except Exception as e:
+                    print(f"[TaskWatcher] Error reading task log: {e}")
+                
+                time.sleep(5)  # Check every 5 seconds (reduced from 2s for efficiency)
     
     def _check_active_tasks(self):
         """Scan /var/log/pve/tasks/active to track vzdump for VM suppression.