Update notification service
@@ -759,22 +759,62 @@ def get_latency_history(target='gateway', timeframe='hour'):
     except Exception as e:
         return {"data": [], "stats": {"min": 0, "max": 0, "avg": 0, "current": 0}, "target": target}
 
-def get_current_latency(target='gateway'):
-    """Get the most recent latency measurement for a target."""
+def get_latest_gateway_latency():
+    """Get the most recent gateway latency from the database (no new ping).
+    Used by health monitor to avoid duplicate pings."""
     try:
-        # If gateway, resolve to actual IP
+        conn = _get_temp_db()
+        cursor = conn.execute(
+            """SELECT timestamp, latency_avg, packet_loss FROM latency_history
+            WHERE target = 'gateway' ORDER BY timestamp DESC LIMIT 1"""
+        )
+        row = cursor.fetchone()
+        conn.close()
+
+        if row:
+            timestamp, latency_avg, packet_loss = row
+            age = int(time.time()) - timestamp
+            # Only return if data is fresh (less than 2 minutes old)
+            if age <= 120:
+                return {
+                    'latency_avg': latency_avg,
+                    'packet_loss': packet_loss,
+                    'timestamp': timestamp,
+                    'age_seconds': age,
+                    'fresh': True
+                }
+        return {'fresh': False, 'latency_avg': None}
+    except Exception:
+        return {'fresh': False, 'latency_avg': None}
+
+
+def get_current_latency(target='gateway'):
+    """Get the most recent latency measurement for a target.
+    For gateway: reads from database (already monitored every 60s).
+    For cloudflare/google: does a fresh ping (on-demand only)."""
+    try:
+        # Gateway uses stored data to avoid duplicate pings
         if target == 'gateway':
             target_ip = _get_default_gateway()
+            stored = get_latest_gateway_latency()
+            if stored.get('fresh'):
+                return {
+                    'target': target,
+                    'target_ip': target_ip,
+                    'latency_avg': stored['latency_avg'],
+                    'packet_loss': stored['packet_loss'],
+                    'status': 'ok' if stored['latency_avg'] and stored['latency_avg'] < 100 else 'warning'
+                }
+            # Fallback: do fresh measurement if no stored data
+            stats = _measure_latency(target_ip)
         else:
+            # Cloudflare/Google: fresh ping (not continuously monitored)
             target_ip = LATENCY_TARGETS.get(target, target)
+            stats = _measure_latency(target_ip)
 
-        stats = _measure_latency(target_ip)
         return {
             'target': target,
             'target_ip': target_ip,
             'latency_avg': stats['avg'],
-            'latency_min': stats['min'],
-            'latency_max': stats['max'],
             'packet_loss': stats['packet_loss'],
             'status': 'ok' if stats['success'] and stats['avg'] and stats['avg'] < 100 else 'warning' if stats['success'] else 'error'
         }
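For context, the new read path can be exercised in isolation. A minimal sketch of the freshness rule, assuming the latency_history schema implied by the SELECT above (target, Unix-epoch timestamp, latency_avg, packet_loss); the real connection comes from _get_temp_db(), and an in-memory database stands in for it here:

    import sqlite3
    import time

    # Stand-in for _get_temp_db(); schema inferred from the SELECT in this hunk.
    conn = sqlite3.connect(':memory:')
    conn.execute(
        'CREATE TABLE latency_history '
        '(target TEXT, timestamp INTEGER, latency_avg REAL, packet_loss REAL)'
    )
    # A 45-second-old gateway sample: inside the 120 s freshness window.
    conn.execute(
        "INSERT INTO latency_history VALUES ('gateway', ?, 1.8, 0.0)",
        (int(time.time()) - 45,),
    )

    row = conn.execute(
        "SELECT timestamp, latency_avg, packet_loss FROM latency_history "
        "WHERE target = 'gateway' ORDER BY timestamp DESC LIMIT 1"
    ).fetchone()
    timestamp, latency_avg, packet_loss = row
    age = int(time.time()) - timestamp
    # Same rule as get_latest_gateway_latency(): data older than 2 minutes is stale.
    print({'fresh': age <= 120, 'latency_avg': latency_avg, 'age_seconds': age})

Note that the 120-second window is twice the 60-second sampling cadence mentioned in the docstring, which presumably tolerates one missed sample before the reading counts as stale.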
@@ -1967,10 +1967,11 @@ class HealthMonitor:
                     latency_ms = latency_status.get('latency_ms', 'N/A')
                     latency_sev = latency_status.get('status', 'OK')
                     interface_details['connectivity'] = latency_status
+                    target_display = latency_status.get('target', 'gateway')
                     connectivity_check = {
                         'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
-                        'detail': f'Latency {latency_ms}ms to 1.1.1.1' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
+                        'detail': f'Latency {latency_ms}ms to {target_display}' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
                     }
                     if latency_sev not in ['OK', 'INFO', 'UNKNOWN']:
                         issues.append(latency_status.get('reason', 'Network latency issue'))
                     else:
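The practical effect, shown as a small hedged sketch (field names taken from the hunk, sample values invented): the detail string now names the monitored target instead of the hard-coded 1.1.1.1.

    latency_status = {'status': 'OK', 'latency_ms': 1.8, 'target': '192.168.1.1'}

    latency_ms = latency_status.get('latency_ms', 'N/A')
    latency_sev = latency_status.get('status', 'OK')
    target_display = latency_status.get('target', 'gateway')
    connectivity_check = {
        'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
        'detail': f'Latency {latency_ms}ms to {target_display}' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
    }
    print(connectivity_check)  # {'status': 'OK', 'detail': 'Latency 1.8ms to 192.168.1.1'}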
@@ -2006,101 +2007,69 @@ class HealthMonitor:
             return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}}
 
     def _check_network_latency(self) -> Optional[Dict[str, Any]]:
-        """Check network latency to 1.1.1.1 using 3 consecutive pings.
+        """Check network latency using the gateway latency already monitored.
 
-        Uses 3 pings to avoid false positives from transient network spikes.
-        Reports the average latency and only warns if all 3 exceed threshold.
+        Uses the latency data from flask_server's continuous monitoring (every 60s)
+        instead of doing a separate ping to avoid duplicate network checks.
+        The gateway latency is based on the average of 3 pings to avoid false positives.
         """
         cache_key = 'network_latency'
         current_time = time.time()
 
+        # Use shorter cache since we're reading from DB (no network overhead)
         if cache_key in self.last_check_times:
-            if current_time - self.last_check_times[cache_key] < 60:
+            if current_time - self.last_check_times[cache_key] < 30:
                 return self.cached_results.get(cache_key)
 
         try:
-            # Use 3 pings to get reliable latency measurement
-            result = subprocess.run(
-                ['ping', '-c', '3', '-W', '2', '1.1.1.1'],
-                capture_output=True,
-                text=True,
-                timeout=self.NETWORK_TIMEOUT + 6  # Allow time for 3 pings
-            )
-
-            if result.returncode == 0:
-                # Parse individual ping times
-                latencies = []
-                for line in result.stdout.split('\n'):
-                    if 'time=' in line:
-                        try:
-                            latency_str = line.split('time=')[1].split()[0]
-                            latencies.append(float(latency_str))
-                        except:
-                            pass
-
-                if latencies:
-                    # Calculate average latency
-                    avg_latency = sum(latencies) / len(latencies)
-                    max_latency = max(latencies)
-                    min_latency = min(latencies)
-
-                    # Count how many pings exceeded thresholds
-                    critical_count = sum(1 for l in latencies if l > self.NETWORK_LATENCY_CRITICAL)
-                    warning_count = sum(1 for l in latencies if l > self.NETWORK_LATENCY_WARNING)
-
-                    # Only report WARNING/CRITICAL if majority of pings exceed threshold
-                    # This prevents false positives from single transient spikes
-                    if critical_count >= 2:  # 2 or more of 3 pings are critical
-                        status = 'CRITICAL'
-                        reason = f'Latency {avg_latency:.1f}ms avg >{self.NETWORK_LATENCY_CRITICAL}ms (min:{min_latency:.0f} max:{max_latency:.0f})'
-                    elif warning_count >= 2:  # 2 or more of 3 pings exceed warning
-                        status = 'WARNING'
-                        reason = f'Latency {avg_latency:.1f}ms avg >{self.NETWORK_LATENCY_WARNING}ms (min:{min_latency:.0f} max:{max_latency:.0f})'
-                    else:
-                        status = 'OK'
-                        reason = None
-
-                    latency_result = {
-                        'status': status,
-                        'latency_ms': round(avg_latency, 1),
-                        'latency_min': round(min_latency, 1),
-                        'latency_max': round(max_latency, 1),
-                        'samples': len(latencies),
-                    }
-                    if reason:
-                        latency_result['reason'] = reason
-
-                    self.cached_results[cache_key] = latency_result
-                    self.last_check_times[cache_key] = current_time
-                    return latency_result
-
-            # If ping failed (timeout, unreachable) - distinguish the reason
-            stderr_lower = (result.stderr or '').lower() if hasattr(result, 'stderr') else ''
-            if 'unreachable' in stderr_lower or 'network is unreachable' in stderr_lower:
-                fail_reason = 'Network unreachable - no route to 1.1.1.1'
-            elif result.returncode == 1:
-                fail_reason = 'Packet loss to 1.1.1.1 (100% loss)'
-            else:
-                fail_reason = f'Ping failed (exit code {result.returncode})'
-
-            packet_loss_result = {
-                'status': 'CRITICAL',
-                'reason': fail_reason
-            }
-            self.cached_results[cache_key] = packet_loss_result
-            self.last_check_times[cache_key] = current_time
-            return packet_loss_result
-
-        except subprocess.TimeoutExpired:
-            timeout_result = {
-                'status': 'WARNING',
-                'reason': f'Ping timeout (>{self.NETWORK_TIMEOUT}s) - possible high latency'
-            }
-            self.cached_results[cache_key] = timeout_result
-            self.last_check_times[cache_key] = current_time
-            return timeout_result
-        except Exception:
-            return {'status': 'UNKNOWN', 'reason': 'Ping command failed'}
+            # Import and use the stored gateway latency from flask_server
+            import sys
+            import os
+            sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+            from flask_server import get_latest_gateway_latency, _get_default_gateway
+
+            stored = get_latest_gateway_latency()
+            gateway_ip = _get_default_gateway() or 'gateway'
+
+            if stored.get('fresh') and stored.get('latency_avg') is not None:
+                avg_latency = stored['latency_avg']
+                packet_loss = stored.get('packet_loss', 0)
+
+                # Check for packet loss first
+                if packet_loss is not None and packet_loss >= 50:
+                    status = 'CRITICAL'
+                    reason = f'High packet loss ({packet_loss:.0f}%) to gateway'
+                elif packet_loss is not None and packet_loss > 0:
+                    status = 'WARNING'
+                    reason = f'Packet loss ({packet_loss:.0f}%) to gateway'
+                # Check latency thresholds
+                elif avg_latency > self.NETWORK_LATENCY_CRITICAL:
+                    status = 'CRITICAL'
+                    reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_CRITICAL}ms to gateway'
+                elif avg_latency > self.NETWORK_LATENCY_WARNING:
+                    status = 'WARNING'
+                    reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_WARNING}ms to gateway'
+                else:
+                    status = 'OK'
+                    reason = None
+
+                latency_result = {
+                    'status': status,
+                    'latency_ms': round(avg_latency, 1),
+                    'target': gateway_ip,
+                }
+                if reason:
+                    latency_result['reason'] = reason
+
+                self.cached_results[cache_key] = latency_result
+                self.last_check_times[cache_key] = current_time
+                return latency_result
+
+            # No fresh data available - return unknown (monitoring may not be running)
+            return {'status': 'UNKNOWN', 'reason': 'No recent latency data available'}
+
+        except Exception as e:
+            return {'status': 'UNKNOWN', 'reason': f'Latency check unavailable: {str(e)}'}
+
 
     def _is_vzdump_active(self) -> bool:
         """Check if a vzdump (backup) job is currently running."""
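The severity ladder in the rewritten check is easiest to read in isolation: packet loss outranks latency, and the latency thresholds are only consulted when loss is zero. A minimal standalone sketch, with assumed threshold values (the real NETWORK_LATENCY_WARNING and NETWORK_LATENCY_CRITICAL constants are defined elsewhere on HealthMonitor):

    # Assumed values for illustration; the real constants live on HealthMonitor.
    NETWORK_LATENCY_WARNING = 100   # ms
    NETWORK_LATENCY_CRITICAL = 300  # ms

    def classify(avg_latency, packet_loss):
        """Mirrors the hunk's ordering: loss checks first, then latency thresholds."""
        if packet_loss is not None and packet_loss >= 50:
            return 'CRITICAL', f'High packet loss ({packet_loss:.0f}%) to gateway'
        if packet_loss is not None and packet_loss > 0:
            return 'WARNING', f'Packet loss ({packet_loss:.0f}%) to gateway'
        if avg_latency > NETWORK_LATENCY_CRITICAL:
            return 'CRITICAL', f'Latency {avg_latency:.1f}ms >{NETWORK_LATENCY_CRITICAL}ms to gateway'
        if avg_latency > NETWORK_LATENCY_WARNING:
            return 'WARNING', f'Latency {avg_latency:.1f}ms >{NETWORK_LATENCY_WARNING}ms to gateway'
        return 'OK', None

    print(classify(1.8, 0.0))    # ('OK', None)
    print(classify(140.0, 0.0))  # ('WARNING', 'Latency 140.0ms >100ms to gateway')
    print(classify(20.0, 66.7))  # ('CRITICAL', 'High packet loss (67%) to gateway')

One caveat visible in the diff itself: with any non-zero packet loss the elif chain never reaches the latency branches, so a link with 10% loss and 500 ms latency reports only the loss. That matches the code as written.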