From 56dab535c31a527134e494e77858156e77966b6e Mon Sep 17 00:00:00 2001 From: MacRimi Date: Sun, 8 Mar 2026 00:53:35 +0100 Subject: [PATCH] Update notification service --- AppImage/scripts/flask_server.py | 52 +++++++++-- AppImage/scripts/health_monitor.py | 141 +++++++++++------------------ 2 files changed, 101 insertions(+), 92 deletions(-) diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 58c55b64..9b905390 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -759,22 +759,62 @@ def get_latency_history(target='gateway', timeframe='hour'): except Exception as e: return {"data": [], "stats": {"min": 0, "max": 0, "avg": 0, "current": 0}, "target": target} -def get_current_latency(target='gateway'): - """Get the most recent latency measurement for a target.""" +def get_latest_gateway_latency(): + """Get the most recent gateway latency from the database (no new ping). + Used by health monitor to avoid duplicate pings.""" try: - # If gateway, resolve to actual IP + conn = _get_temp_db() + cursor = conn.execute( + """SELECT timestamp, latency_avg, packet_loss FROM latency_history + WHERE target = 'gateway' ORDER BY timestamp DESC LIMIT 1""" + ) + row = cursor.fetchone() + conn.close() + + if row: + timestamp, latency_avg, packet_loss = row + age = int(time.time()) - timestamp + # Only return if data is fresh (less than 2 minutes old) + if age <= 120: + return { + 'latency_avg': latency_avg, + 'packet_loss': packet_loss, + 'timestamp': timestamp, + 'age_seconds': age, + 'fresh': True + } + return {'fresh': False, 'latency_avg': None} + except Exception: + return {'fresh': False, 'latency_avg': None} + +def get_current_latency(target='gateway'): + """Get the most recent latency measurement for a target. + For gateway: reads from database (already monitored every 60s). + For cloudflare/google: does a fresh ping (on-demand only).""" + try: + # Gateway uses stored data to avoid duplicate pings if target == 'gateway': target_ip = _get_default_gateway() + stored = get_latest_gateway_latency() + if stored.get('fresh'): + return { + 'target': target, + 'target_ip': target_ip, + 'latency_avg': stored['latency_avg'], + 'packet_loss': stored['packet_loss'], + 'status': 'ok' if stored['latency_avg'] and stored['latency_avg'] < 100 else 'warning' + } + # Fallback: do fresh measurement if no stored data + stats = _measure_latency(target_ip) else: + # Cloudflare/Google: fresh ping (not continuously monitored) target_ip = LATENCY_TARGETS.get(target, target) + stats = _measure_latency(target_ip) - stats = _measure_latency(target_ip) return { 'target': target, 'target_ip': target_ip, 'latency_avg': stats['avg'], - 'latency_min': stats['min'], - 'latency_max': stats['max'], 'packet_loss': stats['packet_loss'], 'status': 'ok' if stats['success'] and stats['avg'] and stats['avg'] < 100 else 'warning' if stats['success'] else 'error' } diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index bb835e37..3e8f684d 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -1967,10 +1967,11 @@ class HealthMonitor: latency_ms = latency_status.get('latency_ms', 'N/A') latency_sev = latency_status.get('status', 'OK') interface_details['connectivity'] = latency_status - connectivity_check = { - 'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK', - 'detail': f'Latency {latency_ms}ms to 1.1.1.1' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'), - } + target_display = latency_status.get('target', 'gateway') + connectivity_check = { + 'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK', + 'detail': f'Latency {latency_ms}ms to {target_display}' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'), + } if latency_sev not in ['OK', 'INFO', 'UNKNOWN']: issues.append(latency_status.get('reason', 'Network latency issue')) else: @@ -2006,101 +2007,69 @@ class HealthMonitor: return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}} def _check_network_latency(self) -> Optional[Dict[str, Any]]: - """Check network latency to 1.1.1.1 using 3 consecutive pings. + """Check network latency using the gateway latency already monitored. - Uses 3 pings to avoid false positives from transient network spikes. - Reports the average latency and only warns if all 3 exceed threshold. + Uses the latency data from flask_server's continuous monitoring (every 60s) + instead of doing a separate ping to avoid duplicate network checks. + The gateway latency is based on the average of 3 pings to avoid false positives. """ cache_key = 'network_latency' current_time = time.time() + # Use shorter cache since we're reading from DB (no network overhead) if cache_key in self.last_check_times: - if current_time - self.last_check_times[cache_key] < 60: + if current_time - self.last_check_times[cache_key] < 30: return self.cached_results.get(cache_key) try: - # Use 3 pings to get reliable latency measurement - result = subprocess.run( - ['ping', '-c', '3', '-W', '2', '1.1.1.1'], - capture_output=True, - text=True, - timeout=self.NETWORK_TIMEOUT + 6 # Allow time for 3 pings - ) + # Import and use the stored gateway latency from flask_server + import sys + import os + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from flask_server import get_latest_gateway_latency, _get_default_gateway - if result.returncode == 0: - # Parse individual ping times - latencies = [] - for line in result.stdout.split('\n'): - if 'time=' in line: - try: - latency_str = line.split('time=')[1].split()[0] - latencies.append(float(latency_str)) - except: - pass + stored = get_latest_gateway_latency() + gateway_ip = _get_default_gateway() or 'gateway' + + if stored.get('fresh') and stored.get('latency_avg') is not None: + avg_latency = stored['latency_avg'] + packet_loss = stored.get('packet_loss', 0) - if latencies: - # Calculate average latency - avg_latency = sum(latencies) / len(latencies) - max_latency = max(latencies) - min_latency = min(latencies) - - # Count how many pings exceeded thresholds - critical_count = sum(1 for l in latencies if l > self.NETWORK_LATENCY_CRITICAL) - warning_count = sum(1 for l in latencies if l > self.NETWORK_LATENCY_WARNING) - - # Only report WARNING/CRITICAL if majority of pings exceed threshold - # This prevents false positives from single transient spikes - if critical_count >= 2: # 2 or more of 3 pings are critical - status = 'CRITICAL' - reason = f'Latency {avg_latency:.1f}ms avg >{self.NETWORK_LATENCY_CRITICAL}ms (min:{min_latency:.0f} max:{max_latency:.0f})' - elif warning_count >= 2: # 2 or more of 3 pings exceed warning - status = 'WARNING' - reason = f'Latency {avg_latency:.1f}ms avg >{self.NETWORK_LATENCY_WARNING}ms (min:{min_latency:.0f} max:{max_latency:.0f})' - else: - status = 'OK' - reason = None - - latency_result = { - 'status': status, - 'latency_ms': round(avg_latency, 1), - 'latency_min': round(min_latency, 1), - 'latency_max': round(max_latency, 1), - 'samples': len(latencies), - } - if reason: - latency_result['reason'] = reason - - self.cached_results[cache_key] = latency_result - self.last_check_times[cache_key] = current_time - return latency_result + # Check for packet loss first + if packet_loss is not None and packet_loss >= 50: + status = 'CRITICAL' + reason = f'High packet loss ({packet_loss:.0f}%) to gateway' + elif packet_loss is not None and packet_loss > 0: + status = 'WARNING' + reason = f'Packet loss ({packet_loss:.0f}%) to gateway' + # Check latency thresholds + elif avg_latency > self.NETWORK_LATENCY_CRITICAL: + status = 'CRITICAL' + reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_CRITICAL}ms to gateway' + elif avg_latency > self.NETWORK_LATENCY_WARNING: + status = 'WARNING' + reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_WARNING}ms to gateway' + else: + status = 'OK' + reason = None + + latency_result = { + 'status': status, + 'latency_ms': round(avg_latency, 1), + 'target': gateway_ip, + } + if reason: + latency_result['reason'] = reason + + self.cached_results[cache_key] = latency_result + self.last_check_times[cache_key] = current_time + return latency_result - # If ping failed (timeout, unreachable) - distinguish the reason - stderr_lower = (result.stderr or '').lower() if hasattr(result, 'stderr') else '' - if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower: - fail_reason = 'Network unreachable - no route to 1.1.1.1' - elif result.returncode == 1: - fail_reason = 'Packet loss to 1.1.1.1 (100% loss)' - else: - fail_reason = f'Ping failed (exit code {result.returncode})' + # No fresh data available - return unknown (monitoring may not be running) + return {'status': 'UNKNOWN', 'reason': 'No recent latency data available'} - packet_loss_result = { - 'status': 'CRITICAL', - 'reason': fail_reason - } - self.cached_results[cache_key] = packet_loss_result - self.last_check_times[cache_key] = current_time - return packet_loss_result - - except subprocess.TimeoutExpired: - timeout_result = { - 'status': 'WARNING', - 'reason': f'Ping timeout (>{self.NETWORK_TIMEOUT}s) - possible high latency' - } - self.cached_results[cache_key] = timeout_result - self.last_check_times[cache_key] = current_time - return timeout_result - except Exception: - return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} + except Exception as e: + return {'status': 'UNKNOWN', 'reason': f'Latency check unavailable: {str(e)}'} def _is_vzdump_active(self) -> bool: """Check if a vzdump (backup) job is currently running."""