Update notification service

2026-08-01 13:26:21 +00:00 · 2026-03-08 10:15:55 +01:00
parent f8b5e07518
commit 17e4227978
3 changed files with 182 additions and 180 deletions
@@ -631,39 +631,39 @@ def init_latency_db():
        return False

 def _measure_latency(target_ip: str) -> dict:
-  """Ping a target and return latency stats. Uses 3 pings and returns avg, min, max for full visibility."""
-  try:
-    result = subprocess.run(
-      ['ping', '-c', '3', '-W', '2', target_ip],
-      capture_output=True, text=True, timeout=10
-    )
-    
-    if result.returncode == 0:
-      latencies = []
-      for line in result.stdout.split('\n'):
-        if 'time=' in line:
-          try:
-            latency_str = line.split('time=')[1].split()[0]
-            latencies.append(float(latency_str))
-          except:
-            pass
-      
-      if latencies:
-        return {
-          'success': True,
-          'avg': round(sum(latencies) / len(latencies), 1),
-          'min': round(min(latencies), 1),
-          'max': round(max(latencies), 1),
-          'packet_loss': round((3 - len(latencies)) / 3 * 100, 1)
-        }
-    
-    # Ping failed - 100% packet loss
-    return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
-  except Exception:
-    return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
+    """Ping a target and return latency stats."""
+    try:
+        result = subprocess.run(
+            ['ping', '-c', '3', '-W', '2', target_ip],
+            capture_output=True, text=True, timeout=10
+        )
+        
+        if result.returncode == 0:
+            latencies = []
+            for line in result.stdout.split('\n'):
+                if 'time=' in line:
+                    try:
+                        latency_str = line.split('time=')[1].split()[0]
+                        latencies.append(float(latency_str))
+                    except:
+                        pass
+            
+            if latencies:
+                return {
+                    'success': True,
+                    'avg': round(sum(latencies) / len(latencies), 1),
+                    'min': round(min(latencies), 1),
+                    'max': round(max(latencies), 1),
+                    'packet_loss': round((3 - len(latencies)) / 3 * 100, 1)
+                }
+        
+        # Ping failed - 100% packet loss
+        return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
+    except Exception:
+        return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}

 def _record_latency():
-    """Record latency to the default gateway. Only stores the average of 3 pings."""
+    """Record latency to the default gateway."""
    try:
        gateway = _get_default_gateway()
        stats = _measure_latency(gateway)
@@ -671,9 +671,9 @@ def _record_latency():
        conn = _get_temp_db()
        conn.execute(
            """INSERT INTO latency_history 
-               (timestamp, target, latency_avg, packet_loss) 
-               VALUES (?, ?, ?, ?)""",
-            (int(time.time()), 'gateway', stats['avg'], stats['packet_loss'])
+               (timestamp, target, latency_avg, latency_min, latency_max, packet_loss) 
+               VALUES (?, ?, ?, ?, ?, ?)""",
+            (int(time.time()), 'gateway', stats['avg'], stats['min'], stats['max'], stats['packet_loss'])
        )
        conn.commit()
        conn.close()
@@ -718,18 +718,20 @@ def get_latency_history(target='gateway', timeframe='hour'):
        
        if interval is None:
            cursor = conn.execute(
-                """SELECT timestamp, latency_avg, packet_loss 
+                """SELECT timestamp, latency_avg, latency_min, latency_max, packet_loss 
                   FROM latency_history 
                   WHERE timestamp >= ? AND target = ? 
                   ORDER BY timestamp ASC""",
                (since, target)
            )
            rows = cursor.fetchall()
-            data = [{"timestamp": r[0], "value": r[1], "packet_loss": r[2]} for r in rows if r[1] is not None]
+            data = [{"timestamp": r[0], "value": r[1], "min": r[2], "max": r[3], "packet_loss": r[4]} for r in rows if r[1] is not None]
        else:
            cursor = conn.execute(
                """SELECT (timestamp / ?) * ? as bucket, 
                          ROUND(AVG(latency_avg), 1) as avg_val,
+                          ROUND(MIN(latency_min), 1) as min_val,
+                          ROUND(MAX(latency_max), 1) as max_val,
                          ROUND(AVG(packet_loss), 1) as avg_loss
                   FROM latency_history 
                   WHERE timestamp >= ? AND target = ?
@@ -738,20 +740,32 @@ def get_latency_history(target='gateway', timeframe='hour'):
                (interval, interval, since, target)
            )
            rows = cursor.fetchall()
-            data = [{"timestamp": r[0], "value": r[1], "packet_loss": r[2]} for r in rows if r[1] is not None]
+            data = [{"timestamp": r[0], "value": r[1], "min": r[2], "max": r[3], "packet_loss": r[4]} for r in rows if r[1] is not None]
        
        conn.close()
        
-        # Compute stats using the averaged values shown in the graph
+        # Compute stats
        if data:
            values = [d["value"] for d in data if d["value"] is not None]
            if values:
-                stats = {
-                    "min": round(min(values), 1),
-                    "max": round(max(values), 1),
-                    "avg": round(sum(values) / len(values), 1),
-                    "current": values[-1] if values else 0
-                }
+                # For gateway, use min/max of the averages (values) so stats match the graph
+                # For other targets (realtime), use actual min/max from individual pings
+                if target == 'gateway':
+                    stats = {
+                        "min": round(min(values), 1),
+                        "max": round(max(values), 1),
+                        "avg": round(sum(values) / len(values), 1),
+                        "current": values[-1] if values else 0
+                    }
+                else:
+                    mins = [d["min"] for d in data if d.get("min") is not None]
+                    maxs = [d["max"] for d in data if d.get("max") is not None]
+                    stats = {
+                        "min": round(min(mins) if mins else min(values), 1),
+                        "max": round(max(maxs) if maxs else max(values), 1),
+                        "avg": round(sum(values) / len(values), 1),
+                        "current": values[-1] if values else 0
+                    }
            else:
                stats = {"min": 0, "max": 0, "avg": 0, "current": 0}
        else:
@@ -761,69 +775,27 @@ def get_latency_history(target='gateway', timeframe='hour'):
    except Exception as e:
        return {"data": [], "stats": {"min": 0, "max": 0, "avg": 0, "current": 0}, "target": target}

-def get_latest_gateway_latency():
-    """Get the most recent gateway latency from the database (no new ping).
-    Used by health monitor to avoid duplicate pings."""
-    try:
-        conn = _get_temp_db()
-        cursor = conn.execute(
-            """SELECT timestamp, latency_avg, packet_loss FROM latency_history
-               WHERE target = 'gateway' ORDER BY timestamp DESC LIMIT 1"""
-        )
-        row = cursor.fetchone()
-        conn.close()
-        
-        if row:
-            timestamp, latency_avg, packet_loss = row
-            age = int(time.time()) - timestamp
-            # Only return if data is fresh (less than 2 minutes old)
-            if age <= 120:
-                return {
-                    'latency_avg': latency_avg,
-                    'packet_loss': packet_loss,
-                    'timestamp': timestamp,
-                    'age_seconds': age,
-                    'fresh': True
-                }
-        return {'fresh': False, 'latency_avg': None}
-    except Exception:
-        return {'fresh': False, 'latency_avg': None}
-
 def get_current_latency(target='gateway'):
-    """Get the most recent latency measurement for a target.
-    For gateway: reads from database (already monitored every 60s).
-    For cloudflare/google: does a fresh ping (on-demand only)."""
+    """Get the most recent latency measurement for a target."""
    try:
-        # Gateway uses stored data to avoid duplicate pings
+        # If gateway, resolve to actual IP
        if target == 'gateway':
            target_ip = _get_default_gateway()
-            stored = get_latest_gateway_latency()
-            if stored.get('fresh'):
-                return {
-                    'target': target,
-                    'target_ip': target_ip,
-                    'latency_avg': stored['latency_avg'],
-                    'packet_loss': stored['packet_loss'],
-                    'status': 'ok' if stored['latency_avg'] and stored['latency_avg'] < 100 else 'warning'
-                }
-            # Fallback: do fresh measurement if no stored data
-            stats = _measure_latency(target_ip)
        else:
-    # Cloudflare/Google: fresh ping (not continuously monitored)
-      target_ip = LATENCY_TARGETS.get(target, target)
-      stats = _measure_latency(target_ip)
-    
-    return {
-      'target': target,
-      'target_ip': target_ip,
-      'latency_avg': stats['avg'],
-      'latency_min': stats.get('min'),
-      'latency_max': stats.get('max'),
-      'packet_loss': stats['packet_loss'],
-      'status': 'ok' if stats['success'] and stats['avg'] and stats['avg'] < 100 else 'warning' if stats['success'] else 'error'
-    }
-  except Exception:
-    return {'target': target, 'latency_avg': None, 'latency_min': None, 'latency_max': None, 'status': 'error'}
+            target_ip = LATENCY_TARGETS.get(target, target)
+        
+        stats = _measure_latency(target_ip)
+        return {
+            'target': target,
+            'target_ip': target_ip,
+            'latency_avg': stats['avg'],
+            'latency_min': stats['min'],
+            'latency_max': stats['max'],
+            'packet_loss': stats['packet_loss'],
+            'status': 'ok' if stats['success'] and stats['avg'] and stats['avg'] < 100 else 'warning' if stats['success'] else 'error'
+        }
+    except Exception:
+        return {'target': target, 'latency_avg': None, 'status': 'error'}


 def _health_collector_loop():
@@ -1966,12 +1966,11 @@ class HealthMonitor:
            if latency_status:
                latency_ms = latency_status.get('latency_ms', 'N/A')
                latency_sev = latency_status.get('status', 'OK')
-                interface_details['connectivity'] = latency_status
-                target_display = latency_status.get('target', 'gateway')
-                connectivity_check = {
-                    'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
-                    'detail': f'Latency {latency_ms}ms to {target_display}' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
-                }
+            interface_details['connectivity'] = latency_status
+            connectivity_check = {
+                'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
+                'detail': f'Latency {latency_ms}ms to gateway' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
+            }
                if latency_sev not in ['OK', 'INFO', 'UNKNOWN']:
                    issues.append(latency_status.get('reason', 'Network latency issue'))
            else:
@@ -2007,48 +2006,74 @@ class HealthMonitor:
            return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}}
    
    def _check_network_latency(self) -> Optional[Dict[str, Any]]:
-        """Check network latency using the gateway latency already monitored.
+        """Check network latency by reading from the gateway latency monitor database.
        
-        Uses the latency data from flask_server's continuous monitoring (every 60s)
-        instead of doing a separate ping to avoid duplicate network checks.
-        The gateway latency is based on the average of 3 pings to avoid false positives.
+        Reads the most recent gateway latency measurement from the SQLite database
+        that is updated every 60 seconds by the latency monitor thread.
+        This avoids redundant ping operations and uses the existing monitoring data.
        """
        cache_key = 'network_latency'
        current_time = time.time()
        
-        # Use shorter cache since we're reading from DB (no network overhead)
        if cache_key in self.last_check_times:
-            if current_time - self.last_check_times[cache_key] < 30:
+            if current_time - self.last_check_times[cache_key] < 60:
                return self.cached_results.get(cache_key)
        
        try:
-            # Import and use the stored gateway latency from flask_server
-            import sys
-            import os
-            sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-            from flask_server import get_latest_gateway_latency, _get_default_gateway
+            import sqlite3
+            db_path = "/usr/local/share/proxmenux/monitor.db"
            
-            stored = get_latest_gateway_latency()
-            gateway_ip = _get_default_gateway() or 'gateway'
+            # Check if database exists
+            if not os.path.exists(db_path):
+                return {'status': 'UNKNOWN', 'reason': 'Latency monitor database not available'}
            
-            if stored.get('fresh') and stored.get('latency_avg') is not None:
-                avg_latency = stored['latency_avg']
-                packet_loss = stored.get('packet_loss', 0)
+            conn = sqlite3.connect(db_path, timeout=5)
+            cursor = conn.execute(
+                """SELECT latency_avg, latency_min, latency_max, packet_loss, timestamp
+                   FROM latency_history 
+                   WHERE target = 'gateway' 
+                   ORDER BY timestamp DESC 
+                   LIMIT 1"""
+            )
+            row = cursor.fetchone()
+            conn.close()
+            
+            if row and row[0] is not None:
+                avg_latency = row[0]
+                min_latency = row[1]
+                max_latency = row[2]
+                packet_loss = row[3] or 0
+                data_age = current_time - row[4]
+                
+                # If data is older than 2 minutes, consider it stale
+                if data_age > 120:
+                    stale_result = {
+                        'status': 'UNKNOWN',
+                        'reason': 'Latency data is stale (>2 min old)'
+                    }
+                    self.cached_results[cache_key] = stale_result
+                    self.last_check_times[cache_key] = current_time
+                    return stale_result
                
                # Check for packet loss first
-                if packet_loss is not None and packet_loss >= 50:
+                if packet_loss >= 100:
+                    loss_result = {
+                        'status': 'CRITICAL',
+                        'reason': 'Packet loss to gateway (100% loss)',
+                        'latency_ms': None,
+                        'packet_loss': packet_loss
+                    }
+                    self.cached_results[cache_key] = loss_result
+                    self.last_check_times[cache_key] = current_time
+                    return loss_result
+                
+                # Evaluate latency thresholds
+                if avg_latency > self.NETWORK_LATENCY_CRITICAL:
                    status = 'CRITICAL'
-                    reason = f'High packet loss ({packet_loss:.0f}%) to gateway'
-                elif packet_loss is not None and packet_loss > 0:
-                    status = 'WARNING'
-                    reason = f'Packet loss ({packet_loss:.0f}%) to gateway'
-                # Check latency thresholds
-                elif avg_latency > self.NETWORK_LATENCY_CRITICAL:
-                    status = 'CRITICAL'
-                    reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_CRITICAL}ms to gateway'
+                    reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
                elif avg_latency > self.NETWORK_LATENCY_WARNING:
                    status = 'WARNING'
-                    reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_WARNING}ms to gateway'
+                    reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
                else:
                    status = 'OK'
                    reason = None
@@ -2056,7 +2081,9 @@ class HealthMonitor:
                latency_result = {
                    'status': status,
                    'latency_ms': round(avg_latency, 1),
-                    'target': gateway_ip,
+                    'latency_min': round(min_latency, 1) if min_latency else None,
+                    'latency_max': round(max_latency, 1) if max_latency else None,
+                    'packet_loss': packet_loss,
                }
                if reason:
                    latency_result['reason'] = reason
@@ -2065,11 +2092,17 @@ class HealthMonitor:
                self.last_check_times[cache_key] = current_time
                return latency_result
            
-            # No fresh data available - return unknown (monitoring may not be running)
-            return {'status': 'UNKNOWN', 'reason': 'No recent latency data available'}
+            # No data in database yet
+            no_data_result = {
+                'status': 'UNKNOWN',
+                'reason': 'No gateway latency data available yet'
+            }
+            self.cached_results[cache_key] = no_data_result
+            self.last_check_times[cache_key] = current_time
+            return no_data_result
            
        except Exception as e:
-            return {'status': 'UNKNOWN', 'reason': f'Latency check unavailable: {str(e)}'}
+            return {'status': 'UNKNOWN', 'reason': f'Latency check failed: {str(e)}'}
    
    def _is_vzdump_active(self) -> bool:
        """Check if a vzdump (backup) job is currently running."""