mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 08:56:21 +00:00
Update notification service
This commit is contained in:
@@ -631,39 +631,39 @@ def init_latency_db():
|
||||
return False
|
||||
|
||||
def _measure_latency(target_ip: str) -> dict:
|
||||
"""Ping a target and return latency stats. Uses 3 pings and returns avg, min, max for full visibility."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['ping', '-c', '3', '-W', '2', target_ip],
|
||||
capture_output=True, text=True, timeout=10
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
latencies = []
|
||||
for line in result.stdout.split('\n'):
|
||||
if 'time=' in line:
|
||||
try:
|
||||
latency_str = line.split('time=')[1].split()[0]
|
||||
latencies.append(float(latency_str))
|
||||
except:
|
||||
pass
|
||||
|
||||
if latencies:
|
||||
return {
|
||||
'success': True,
|
||||
'avg': round(sum(latencies) / len(latencies), 1),
|
||||
'min': round(min(latencies), 1),
|
||||
'max': round(max(latencies), 1),
|
||||
'packet_loss': round((3 - len(latencies)) / 3 * 100, 1)
|
||||
}
|
||||
|
||||
# Ping failed - 100% packet loss
|
||||
return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
|
||||
except Exception:
|
||||
return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
|
||||
"""Ping a target and return latency stats."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['ping', '-c', '3', '-W', '2', target_ip],
|
||||
capture_output=True, text=True, timeout=10
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
latencies = []
|
||||
for line in result.stdout.split('\n'):
|
||||
if 'time=' in line:
|
||||
try:
|
||||
latency_str = line.split('time=')[1].split()[0]
|
||||
latencies.append(float(latency_str))
|
||||
except:
|
||||
pass
|
||||
|
||||
if latencies:
|
||||
return {
|
||||
'success': True,
|
||||
'avg': round(sum(latencies) / len(latencies), 1),
|
||||
'min': round(min(latencies), 1),
|
||||
'max': round(max(latencies), 1),
|
||||
'packet_loss': round((3 - len(latencies)) / 3 * 100, 1)
|
||||
}
|
||||
|
||||
# Ping failed - 100% packet loss
|
||||
return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
|
||||
except Exception:
|
||||
return {'success': False, 'avg': None, 'min': None, 'max': None, 'packet_loss': 100.0}
|
||||
|
||||
def _record_latency():
|
||||
"""Record latency to the default gateway. Only stores the average of 3 pings."""
|
||||
"""Record latency to the default gateway."""
|
||||
try:
|
||||
gateway = _get_default_gateway()
|
||||
stats = _measure_latency(gateway)
|
||||
@@ -671,9 +671,9 @@ def _record_latency():
|
||||
conn = _get_temp_db()
|
||||
conn.execute(
|
||||
"""INSERT INTO latency_history
|
||||
(timestamp, target, latency_avg, packet_loss)
|
||||
VALUES (?, ?, ?, ?)""",
|
||||
(int(time.time()), 'gateway', stats['avg'], stats['packet_loss'])
|
||||
(timestamp, target, latency_avg, latency_min, latency_max, packet_loss)
|
||||
VALUES (?, ?, ?, ?, ?, ?)""",
|
||||
(int(time.time()), 'gateway', stats['avg'], stats['min'], stats['max'], stats['packet_loss'])
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@@ -718,18 +718,20 @@ def get_latency_history(target='gateway', timeframe='hour'):
|
||||
|
||||
if interval is None:
|
||||
cursor = conn.execute(
|
||||
"""SELECT timestamp, latency_avg, packet_loss
|
||||
"""SELECT timestamp, latency_avg, latency_min, latency_max, packet_loss
|
||||
FROM latency_history
|
||||
WHERE timestamp >= ? AND target = ?
|
||||
ORDER BY timestamp ASC""",
|
||||
(since, target)
|
||||
)
|
||||
rows = cursor.fetchall()
|
||||
data = [{"timestamp": r[0], "value": r[1], "packet_loss": r[2]} for r in rows if r[1] is not None]
|
||||
data = [{"timestamp": r[0], "value": r[1], "min": r[2], "max": r[3], "packet_loss": r[4]} for r in rows if r[1] is not None]
|
||||
else:
|
||||
cursor = conn.execute(
|
||||
"""SELECT (timestamp / ?) * ? as bucket,
|
||||
ROUND(AVG(latency_avg), 1) as avg_val,
|
||||
ROUND(MIN(latency_min), 1) as min_val,
|
||||
ROUND(MAX(latency_max), 1) as max_val,
|
||||
ROUND(AVG(packet_loss), 1) as avg_loss
|
||||
FROM latency_history
|
||||
WHERE timestamp >= ? AND target = ?
|
||||
@@ -738,20 +740,32 @@ def get_latency_history(target='gateway', timeframe='hour'):
|
||||
(interval, interval, since, target)
|
||||
)
|
||||
rows = cursor.fetchall()
|
||||
data = [{"timestamp": r[0], "value": r[1], "packet_loss": r[2]} for r in rows if r[1] is not None]
|
||||
data = [{"timestamp": r[0], "value": r[1], "min": r[2], "max": r[3], "packet_loss": r[4]} for r in rows if r[1] is not None]
|
||||
|
||||
conn.close()
|
||||
|
||||
# Compute stats using the averaged values shown in the graph
|
||||
# Compute stats
|
||||
if data:
|
||||
values = [d["value"] for d in data if d["value"] is not None]
|
||||
if values:
|
||||
stats = {
|
||||
"min": round(min(values), 1),
|
||||
"max": round(max(values), 1),
|
||||
"avg": round(sum(values) / len(values), 1),
|
||||
"current": values[-1] if values else 0
|
||||
}
|
||||
# For gateway, use min/max of the averages (values) so stats match the graph
|
||||
# For other targets (realtime), use actual min/max from individual pings
|
||||
if target == 'gateway':
|
||||
stats = {
|
||||
"min": round(min(values), 1),
|
||||
"max": round(max(values), 1),
|
||||
"avg": round(sum(values) / len(values), 1),
|
||||
"current": values[-1] if values else 0
|
||||
}
|
||||
else:
|
||||
mins = [d["min"] for d in data if d.get("min") is not None]
|
||||
maxs = [d["max"] for d in data if d.get("max") is not None]
|
||||
stats = {
|
||||
"min": round(min(mins) if mins else min(values), 1),
|
||||
"max": round(max(maxs) if maxs else max(values), 1),
|
||||
"avg": round(sum(values) / len(values), 1),
|
||||
"current": values[-1] if values else 0
|
||||
}
|
||||
else:
|
||||
stats = {"min": 0, "max": 0, "avg": 0, "current": 0}
|
||||
else:
|
||||
@@ -761,69 +775,27 @@ def get_latency_history(target='gateway', timeframe='hour'):
|
||||
except Exception as e:
|
||||
return {"data": [], "stats": {"min": 0, "max": 0, "avg": 0, "current": 0}, "target": target}
|
||||
|
||||
def get_latest_gateway_latency():
    """Return the newest stored gateway latency sample without pinging.

    Reads the most recent ``target = 'gateway'`` row of the
    ``latency_history`` table through ``_get_temp_db()``. The health monitor
    uses this instead of issuing its own ping.

    Returns:
        If the newest row is at most 120 seconds old, a dict with the keys
        ``latency_avg``, ``packet_loss``, ``timestamp``, ``age_seconds`` and
        ``fresh`` (True). Otherwise (no row, stale row, or any error while
        reading) ``{'fresh': False, 'latency_avg': None}``.
    """
    try:
        conn = _get_temp_db()
        try:
            row = conn.execute(
                """SELECT timestamp, latency_avg, packet_loss FROM latency_history
                   WHERE target = 'gateway' ORDER BY timestamp DESC LIMIT 1"""
            ).fetchone()
        finally:
            # Release the connection even when the query itself fails.
            conn.close()

        if row:
            timestamp, latency_avg, packet_loss = row
            age = int(time.time()) - timestamp
            # Only report data that is less than 2 minutes old.
            if age <= 120:
                return {
                    'latency_avg': latency_avg,
                    'packet_loss': packet_loss,
                    'timestamp': timestamp,
                    'age_seconds': age,
                    'fresh': True,
                }
        return {'fresh': False, 'latency_avg': None}
    except Exception:
        # Best effort: callers treat "not fresh" as "no usable data".
        return {'fresh': False, 'latency_avg': None}
|
||||
|
||||
def get_current_latency(target='gateway'):
    """Measure the current latency to a target with a fresh ping.

    Args:
        target: ``'gateway'`` (resolved through ``_get_default_gateway()``),
            a key of ``LATENCY_TARGETS``, or any other value, which is passed
            to the ping helper unchanged.

    Returns:
        A dict with the keys ``target``, ``target_ip``, ``latency_avg``,
        ``latency_min``, ``latency_max``, ``packet_loss`` and ``status``.
        ``status`` is ``'error'`` when the measurement failed, ``'ok'`` when
        it succeeded with an average below 100, and ``'warning'`` otherwise.
        If anything raises, a reduced dict with ``target``, the three latency
        keys set to None and ``status`` ``'error'`` is returned.
    """
    try:
        # If gateway, resolve to the actual IP; otherwise look up the alias.
        if target == 'gateway':
            target_ip = _get_default_gateway()
        else:
            target_ip = LATENCY_TARGETS.get(target, target)

        stats = _measure_latency(target_ip)
        avg = stats['avg']

        if not stats['success']:
            status = 'error'
        elif avg is not None and avg < 100:
            # Compare against None explicitly: an average that rounds to 0.0
            # is a valid (very fast) reply, not a warning.
            status = 'ok'
        else:
            status = 'warning'

        return {
            'target': target,
            'target_ip': target_ip,
            'latency_avg': avg,
            'latency_min': stats.get('min'),
            'latency_max': stats.get('max'),
            'packet_loss': stats['packet_loss'],
            'status': status,
        }
    except Exception:
        return {'target': target, 'latency_avg': None, 'latency_min': None, 'latency_max': None, 'status': 'error'}
|
||||
|
||||
|
||||
def _health_collector_loop():
|
||||
|
||||
@@ -1966,12 +1966,11 @@ class HealthMonitor:
|
||||
if latency_status:
|
||||
latency_ms = latency_status.get('latency_ms', 'N/A')
|
||||
latency_sev = latency_status.get('status', 'OK')
|
||||
interface_details['connectivity'] = latency_status
|
||||
target_display = latency_status.get('target', 'gateway')
|
||||
connectivity_check = {
|
||||
'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
|
||||
'detail': f'Latency {latency_ms}ms to {target_display}' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
|
||||
}
|
||||
interface_details['connectivity'] = latency_status
|
||||
connectivity_check = {
|
||||
'status': latency_sev if latency_sev not in ['UNKNOWN'] else 'OK',
|
||||
'detail': f'Latency {latency_ms}ms to gateway' if isinstance(latency_ms, (int, float)) else latency_status.get('reason', 'Unknown'),
|
||||
}
|
||||
if latency_sev not in ['OK', 'INFO', 'UNKNOWN']:
|
||||
issues.append(latency_status.get('reason', 'Network latency issue'))
|
||||
else:
|
||||
@@ -2007,48 +2006,74 @@ class HealthMonitor:
|
||||
return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}}
|
||||
|
||||
def _check_network_latency(self) -> Optional[Dict[str, Any]]:
|
||||
"""Check network latency using the gateway latency already monitored.
|
||||
"""Check network latency by reading from the gateway latency monitor database.
|
||||
|
||||
Uses the latency data from flask_server's continuous monitoring (every 60s)
|
||||
instead of doing a separate ping to avoid duplicate network checks.
|
||||
The gateway latency is based on the average of 3 pings to avoid false positives.
|
||||
Reads the most recent gateway latency measurement from the SQLite database
|
||||
that is updated every 60 seconds by the latency monitor thread.
|
||||
This avoids redundant ping operations and uses the existing monitoring data.
|
||||
"""
|
||||
cache_key = 'network_latency'
|
||||
current_time = time.time()
|
||||
|
||||
# Use shorter cache since we're reading from DB (no network overhead)
|
||||
if cache_key in self.last_check_times:
|
||||
if current_time - self.last_check_times[cache_key] < 30:
|
||||
if current_time - self.last_check_times[cache_key] < 60:
|
||||
return self.cached_results.get(cache_key)
|
||||
|
||||
try:
|
||||
# Import and use the stored gateway latency from flask_server
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from flask_server import get_latest_gateway_latency, _get_default_gateway
|
||||
import sqlite3
|
||||
db_path = "/usr/local/share/proxmenux/monitor.db"
|
||||
|
||||
stored = get_latest_gateway_latency()
|
||||
gateway_ip = _get_default_gateway() or 'gateway'
|
||||
# Check if database exists
|
||||
if not os.path.exists(db_path):
|
||||
return {'status': 'UNKNOWN', 'reason': 'Latency monitor database not available'}
|
||||
|
||||
if stored.get('fresh') and stored.get('latency_avg') is not None:
|
||||
avg_latency = stored['latency_avg']
|
||||
packet_loss = stored.get('packet_loss', 0)
|
||||
conn = sqlite3.connect(db_path, timeout=5)
|
||||
cursor = conn.execute(
|
||||
"""SELECT latency_avg, latency_min, latency_max, packet_loss, timestamp
|
||||
FROM latency_history
|
||||
WHERE target = 'gateway'
|
||||
ORDER BY timestamp DESC
|
||||
LIMIT 1"""
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if row and row[0] is not None:
|
||||
avg_latency = row[0]
|
||||
min_latency = row[1]
|
||||
max_latency = row[2]
|
||||
packet_loss = row[3] or 0
|
||||
data_age = current_time - row[4]
|
||||
|
||||
# If data is older than 2 minutes, consider it stale
|
||||
if data_age > 120:
|
||||
stale_result = {
|
||||
'status': 'UNKNOWN',
|
||||
'reason': 'Latency data is stale (>2 min old)'
|
||||
}
|
||||
self.cached_results[cache_key] = stale_result
|
||||
self.last_check_times[cache_key] = current_time
|
||||
return stale_result
|
||||
|
||||
# Check for packet loss first
|
||||
if packet_loss is not None and packet_loss >= 50:
|
||||
if packet_loss >= 100:
|
||||
loss_result = {
|
||||
'status': 'CRITICAL',
|
||||
'reason': 'Packet loss to gateway (100% loss)',
|
||||
'latency_ms': None,
|
||||
'packet_loss': packet_loss
|
||||
}
|
||||
self.cached_results[cache_key] = loss_result
|
||||
self.last_check_times[cache_key] = current_time
|
||||
return loss_result
|
||||
|
||||
# Evaluate latency thresholds
|
||||
if avg_latency > self.NETWORK_LATENCY_CRITICAL:
|
||||
status = 'CRITICAL'
|
||||
reason = f'High packet loss ({packet_loss:.0f}%) to gateway'
|
||||
elif packet_loss is not None and packet_loss > 0:
|
||||
status = 'WARNING'
|
||||
reason = f'Packet loss ({packet_loss:.0f}%) to gateway'
|
||||
# Check latency thresholds
|
||||
elif avg_latency > self.NETWORK_LATENCY_CRITICAL:
|
||||
status = 'CRITICAL'
|
||||
reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_CRITICAL}ms to gateway'
|
||||
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
|
||||
elif avg_latency > self.NETWORK_LATENCY_WARNING:
|
||||
status = 'WARNING'
|
||||
reason = f'Latency {avg_latency:.1f}ms >{self.NETWORK_LATENCY_WARNING}ms to gateway'
|
||||
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
|
||||
else:
|
||||
status = 'OK'
|
||||
reason = None
|
||||
@@ -2056,7 +2081,9 @@ class HealthMonitor:
|
||||
latency_result = {
|
||||
'status': status,
|
||||
'latency_ms': round(avg_latency, 1),
|
||||
'target': gateway_ip,
|
||||
'latency_min': round(min_latency, 1) if min_latency else None,
|
||||
'latency_max': round(max_latency, 1) if max_latency else None,
|
||||
'packet_loss': packet_loss,
|
||||
}
|
||||
if reason:
|
||||
latency_result['reason'] = reason
|
||||
@@ -2065,11 +2092,17 @@ class HealthMonitor:
|
||||
self.last_check_times[cache_key] = current_time
|
||||
return latency_result
|
||||
|
||||
# No fresh data available - return unknown (monitoring may not be running)
|
||||
return {'status': 'UNKNOWN', 'reason': 'No recent latency data available'}
|
||||
# No data in database yet
|
||||
no_data_result = {
|
||||
'status': 'UNKNOWN',
|
||||
'reason': 'No gateway latency data available yet'
|
||||
}
|
||||
self.cached_results[cache_key] = no_data_result
|
||||
self.last_check_times[cache_key] = current_time
|
||||
return no_data_result
|
||||
|
||||
except Exception as e:
|
||||
return {'status': 'UNKNOWN', 'reason': f'Latency check unavailable: {str(e)}'}
|
||||
return {'status': 'UNKNOWN', 'reason': f'Latency check failed: {str(e)}'}
|
||||
|
||||
def _is_vzdump_active(self) -> bool:
|
||||
"""Check if a vzdump (backup) job is currently running."""
|
||||
|
||||
Reference in New Issue
Block a user