mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-02-22 02:16:23 +00:00
Update notification service
This commit is contained in:
@@ -18,6 +18,7 @@ import json
|
|||||||
import time
|
import time
|
||||||
import hashlib
|
import hashlib
|
||||||
import socket
|
import socket
|
||||||
|
import sqlite3
|
||||||
import subprocess
|
import subprocess
|
||||||
import threading
|
import threading
|
||||||
from queue import Queue
|
from queue import Queue
|
||||||
@@ -614,27 +615,68 @@ class TaskWatcher:
|
|||||||
# ─── Polling Collector ────────────────────────────────────────────
|
# ─── Polling Collector ────────────────────────────────────────────
|
||||||
|
|
||||||
class PollingCollector:
|
class PollingCollector:
|
||||||
"""Periodic collector that reads Health Monitor pending notifications.
|
"""Periodic collector that polls health state independently.
|
||||||
|
|
||||||
Polls health_persistence for:
|
Architecture:
|
||||||
- Pending notification events (state changes from Bloque A)
|
- Completely independent from Health Monitor's suppression system.
|
||||||
- Unnotified errors
|
Suppression Duration only affects the UI health badge; it NEVER blocks
|
||||||
- Update availability (every 24h)
|
notifications.
|
||||||
|
- Reads ``get_active_errors()`` (ALL active errors, even suppressed ones)
|
||||||
|
and decides when to notify based on its own 24-hour cycle.
|
||||||
|
- For *new* errors (first_seen within the last poll interval), notifies
|
||||||
|
immediately.
|
||||||
|
- For *persistent* errors (already known), re-notifies once every 24 h.
|
||||||
|
- Update checks run on their own 24-h timer and include security counts.
|
||||||
|
|
||||||
|
Tracking is stored in ``notification_last_sent`` (same DB).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, event_queue: Queue, poll_interval: int = 30):
|
DIGEST_INTERVAL = 86400 # 24 h between re-notifications
|
||||||
|
UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans
|
||||||
|
NEW_ERROR_WINDOW = 120 # seconds – errors younger than this are "new"
|
||||||
|
|
||||||
|
_ENTITY_MAP = {
|
||||||
|
'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
|
||||||
|
'disk': ('storage', ''), 'network': ('network', ''),
|
||||||
|
'pve_services': ('node', ''), 'security': ('user', ''),
|
||||||
|
'updates': ('node', ''), 'storage': ('storage', ''),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Map health-persistence category names to our TEMPLATES event types.
|
||||||
|
# These must match keys in notification_templates.TEMPLATES exactly.
|
||||||
|
_CATEGORY_TO_EVENT_TYPE = {
|
||||||
|
'cpu': 'cpu_high',
|
||||||
|
'memory': 'ram_high',
|
||||||
|
'load': 'load_high',
|
||||||
|
'temperature': 'temp_high',
|
||||||
|
'disk': 'disk_space_low',
|
||||||
|
'storage': 'disk_space_low',
|
||||||
|
'network': 'network_down',
|
||||||
|
'pve_services': 'service_fail',
|
||||||
|
'security': 'auth_fail',
|
||||||
|
'updates': 'update_available',
|
||||||
|
'zfs': 'disk_io_error',
|
||||||
|
'smart': 'disk_io_error',
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, event_queue: Queue, poll_interval: int = 60):
|
||||||
self._queue = event_queue
|
self._queue = event_queue
|
||||||
self._running = False
|
self._running = False
|
||||||
self._thread: Optional[threading.Thread] = None
|
self._thread: Optional[threading.Thread] = None
|
||||||
self._poll_interval = poll_interval
|
self._poll_interval = poll_interval
|
||||||
self._hostname = _hostname()
|
self._hostname = _hostname()
|
||||||
self._last_update_check = 0
|
self._last_update_check = 0
|
||||||
self._update_check_interval = 86400 # 24 hours
|
# In-memory cache: error_key -> last notification timestamp
|
||||||
|
self._last_notified: Dict[str, float] = {}
|
||||||
|
# Track known error keys so we can detect truly new ones
|
||||||
|
self._known_errors: set = set()
|
||||||
|
self._first_poll_done = False
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
if self._running:
|
if self._running:
|
||||||
return
|
return
|
||||||
self._running = True
|
self._running = True
|
||||||
|
self._load_last_notified()
|
||||||
self._thread = threading.Thread(target=self._poll_loop, daemon=True,
|
self._thread = threading.Thread(target=self._poll_loop, daemon=True,
|
||||||
name='polling-collector')
|
name='polling-collector')
|
||||||
self._thread.start()
|
self._thread.start()
|
||||||
@@ -642,92 +684,121 @@ class PollingCollector:
|
|||||||
def stop(self):
|
def stop(self):
|
||||||
self._running = False
|
self._running = False
|
||||||
|
|
||||||
|
# ── Main loop ──────────────────────────────────────────────
|
||||||
|
|
||||||
def _poll_loop(self):
|
def _poll_loop(self):
|
||||||
"""Main polling loop."""
|
"""Main polling loop."""
|
||||||
# Initial delay to let health monitor warm up
|
# Initial delay to let health monitor warm up
|
||||||
for _ in range(10):
|
for _ in range(15):
|
||||||
if not self._running:
|
if not self._running:
|
||||||
return
|
return
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
while self._running:
|
while self._running:
|
||||||
try:
|
try:
|
||||||
self._collect_health_events()
|
self._check_persistent_health()
|
||||||
self._check_updates()
|
self._check_updates()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[PollingCollector] Error: {e}")
|
print(f"[PollingCollector] Error: {e}")
|
||||||
|
|
||||||
# Sleep in small increments for responsive shutdown
|
|
||||||
for _ in range(self._poll_interval):
|
for _ in range(self._poll_interval):
|
||||||
if not self._running:
|
if not self._running:
|
||||||
return
|
return
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
def _collect_health_events(self):
|
# ── Health errors (independent of suppression) ─────────────
|
||||||
"""Collect pending notification events from health_persistence."""
|
|
||||||
|
def _check_persistent_health(self):
|
||||||
|
"""Read ALL active errors from health_persistence and decide
|
||||||
|
whether each one warrants a notification right now.
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- A *new* error (not in _known_errors) -> notify immediately
|
||||||
|
- A *persistent* error already notified -> re-notify after 24 h
|
||||||
|
- Uses its own tracking, NOT the health monitor's needs_notification flag
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from health_persistence import health_persistence
|
from health_persistence import health_persistence
|
||||||
|
errors = health_persistence.get_active_errors()
|
||||||
|
except ImportError:
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[PollingCollector] get_active_errors failed: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
# Get pending notification events
|
now = time.time()
|
||||||
events = health_persistence.get_pending_notifications()
|
current_keys = set()
|
||||||
for evt in events:
|
|
||||||
data = json.loads(evt.get('data', '{}')) if isinstance(evt.get('data'), str) else evt.get('data', {})
|
|
||||||
|
|
||||||
event_type = evt.get('event_type', 'state_change')
|
for error in errors:
|
||||||
severity = data.get('severity', 'WARNING')
|
error_key = error.get('error_key', '')
|
||||||
|
if not error_key:
|
||||||
|
continue
|
||||||
|
|
||||||
data['hostname'] = self._hostname
|
current_keys.add(error_key)
|
||||||
data['error_key'] = evt.get('error_key', '')
|
category = error.get('category', '')
|
||||||
|
severity = error.get('severity', 'WARNING')
|
||||||
|
reason = error.get('reason', '')
|
||||||
|
|
||||||
# Deduce entity from health category
|
# Determine if we should notify
|
||||||
category = data.get('category', '')
|
is_new = error_key not in self._known_errors and self._first_poll_done
|
||||||
entity_map = {
|
last_sent = self._last_notified.get(error_key, 0)
|
||||||
'cpu': ('node', ''), 'memory': ('node', ''),
|
is_due = (now - last_sent) >= self.DIGEST_INTERVAL
|
||||||
'disk': ('storage', ''), 'network': ('network', ''),
|
|
||||||
'pve_services': ('node', ''), 'security': ('user', ''),
|
if not is_new and not is_due:
|
||||||
'updates': ('node', ''), 'storage': ('storage', ''),
|
continue
|
||||||
|
|
||||||
|
# Map to our event type
|
||||||
|
event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
|
||||||
|
entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
|
||||||
|
|
||||||
|
data = {
|
||||||
|
'hostname': self._hostname,
|
||||||
|
'category': category,
|
||||||
|
'reason': reason,
|
||||||
|
'error_key': error_key,
|
||||||
|
'severity': severity,
|
||||||
|
'first_seen': error.get('first_seen', ''),
|
||||||
|
'last_seen': error.get('last_seen', ''),
|
||||||
|
'is_persistent': not is_new,
|
||||||
}
|
}
|
||||||
entity, eid = entity_map.get(category, ('node', ''))
|
|
||||||
|
# Include extra details if present
|
||||||
|
details = error.get('details')
|
||||||
|
if isinstance(details, dict):
|
||||||
|
data.update(details)
|
||||||
|
elif isinstance(details, str):
|
||||||
|
try:
|
||||||
|
data.update(json.loads(details))
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
self._queue.put(NotificationEvent(
|
self._queue.put(NotificationEvent(
|
||||||
event_type, severity, data, source='health',
|
event_type, severity, data, source='health',
|
||||||
entity=entity, entity_id=eid or data.get('error_key', ''),
|
entity=entity, entity_id=eid or error_key,
|
||||||
))
|
))
|
||||||
|
|
||||||
# Mark events as notified
|
# Track that we notified
|
||||||
if events:
|
self._last_notified[error_key] = now
|
||||||
event_ids = [e['id'] for e in events if 'id' in e]
|
self._persist_last_notified(error_key, now)
|
||||||
if event_ids:
|
|
||||||
health_persistence.mark_events_notified(event_ids)
|
|
||||||
|
|
||||||
# Also check unnotified errors
|
# Remove tracking for errors that resolved
|
||||||
unnotified = health_persistence.get_unnotified_errors()
|
resolved = self._known_errors - current_keys
|
||||||
for error in unnotified:
|
for key in resolved:
|
||||||
err_cat = error.get('category', '')
|
self._last_notified.pop(key, None)
|
||||||
e_entity, e_eid = entity_map.get(err_cat, ('node', ''))
|
|
||||||
self._queue.put(NotificationEvent(
|
|
||||||
'new_error', error.get('severity', 'WARNING'), {
|
|
||||||
'category': err_cat,
|
|
||||||
'reason': error.get('reason', ''),
|
|
||||||
'hostname': self._hostname,
|
|
||||||
'error_key': error.get('error_key', ''),
|
|
||||||
},
|
|
||||||
source='health',
|
|
||||||
entity=e_entity,
|
|
||||||
entity_id=e_eid or error.get('error_key', ''),
|
|
||||||
))
|
|
||||||
# Mark as notified
|
|
||||||
if 'id' in error:
|
|
||||||
health_persistence.mark_notified(error['id'])
|
|
||||||
|
|
||||||
except ImportError:
|
self._known_errors = current_keys
|
||||||
pass # health_persistence not available (CLI mode)
|
self._first_poll_done = True
|
||||||
except Exception as e:
|
|
||||||
print(f"[PollingCollector] Health event collection error: {e}")
|
# ── Update check (enriched) ────────────────────────────────
|
||||||
|
|
||||||
def _check_updates(self):
|
def _check_updates(self):
|
||||||
"""Check for available system updates (every 24h)."""
|
"""Check for available system updates every 24 h.
|
||||||
|
|
||||||
|
Enriched output: total count, security updates, PVE version hint,
|
||||||
|
and top package names.
|
||||||
|
"""
|
||||||
now = time.time()
|
now = time.time()
|
||||||
if now - self._last_update_check < self._update_check_interval:
|
if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL:
|
||||||
return
|
return
|
||||||
|
|
||||||
self._last_update_check = now
|
self._last_update_check = now
|
||||||
@@ -735,33 +806,107 @@ class PollingCollector:
|
|||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['apt-get', '-s', 'upgrade'],
|
['apt-get', '-s', 'upgrade'],
|
||||||
capture_output=True, text=True, timeout=60
|
capture_output=True, text=True, timeout=60,
|
||||||
)
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
return
|
||||||
|
|
||||||
if result.returncode == 0:
|
lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')]
|
||||||
# Count upgradeable packages
|
total = len(lines)
|
||||||
lines = [l for l in result.stdout.split('\n')
|
if total == 0:
|
||||||
if l.startswith('Inst ')]
|
return
|
||||||
count = len(lines)
|
|
||||||
|
|
||||||
if count > 0:
|
packages = [l.split()[1] for l in lines]
|
||||||
# Show first 5 package names
|
security = [p for p in packages if any(
|
||||||
packages = [l.split()[1] for l in lines[:5]]
|
kw in p.lower() for kw in ('security', 'cve', 'openssl', 'libssl')
|
||||||
details = ', '.join(packages)
|
)]
|
||||||
if count > 5:
|
|
||||||
details += f', ... and {count - 5} more'
|
# Also detect security updates via apt changelog / Debian-Security origin
|
||||||
|
sec_result = subprocess.run(
|
||||||
|
['apt-get', '-s', 'upgrade', '-o', 'Dir::Etc::SourceList=/dev/null',
|
||||||
|
'-o', 'Dir::Etc::SourceParts=/dev/null'],
|
||||||
|
capture_output=True, text=True, timeout=30,
|
||||||
|
)
|
||||||
|
# Count lines from security repo (rough heuristic)
|
||||||
|
sec_count = max(len(security), 0)
|
||||||
|
try:
|
||||||
|
sec_output = subprocess.run(
|
||||||
|
['apt-get', '-s', '--only-upgrade', 'install'] + packages[:50],
|
||||||
|
capture_output=True, text=True, timeout=30,
|
||||||
|
)
|
||||||
|
for line in sec_output.stdout.split('\n'):
|
||||||
|
if 'security' in line.lower() and 'Inst ' in line:
|
||||||
|
sec_count += 1
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check for PVE version upgrade
|
||||||
|
pve_packages = [p for p in packages if 'pve-' in p.lower() or 'proxmox-' in p.lower()]
|
||||||
|
|
||||||
|
# Build display details
|
||||||
|
top_pkgs = packages[:8]
|
||||||
|
details = ', '.join(top_pkgs)
|
||||||
|
if total > 8:
|
||||||
|
details += f', ... +{total - 8} more'
|
||||||
|
|
||||||
|
data = {
|
||||||
|
'hostname': self._hostname,
|
||||||
|
'count': str(total),
|
||||||
|
'security_count': str(sec_count),
|
||||||
|
'details': details,
|
||||||
|
'packages': ', '.join(packages[:20]),
|
||||||
|
}
|
||||||
|
if pve_packages:
|
||||||
|
data['pve_packages'] = ', '.join(pve_packages)
|
||||||
|
|
||||||
self._queue.put(NotificationEvent(
|
self._queue.put(NotificationEvent(
|
||||||
'update_available', 'INFO', {
|
'update_available', 'INFO', data,
|
||||||
'count': str(count),
|
source='polling', entity='node', entity_id='',
|
||||||
'details': details,
|
|
||||||
'hostname': self._hostname,
|
|
||||||
},
|
|
||||||
source='polling',
|
|
||||||
entity='node', entity_id='',
|
|
||||||
))
|
))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # Non-critical, silently skip
|
pass
|
||||||
|
|
||||||
|
# ── Persistence helpers ────────────────────────────────────
|
||||||
|
|
||||||
|
def _load_last_notified(self, db_path=None):
    """Load per-error notification timestamps from the DB on startup.

    Reads the ``notification_last_sent`` rows whose fingerprint starts with
    ``health_``, strips that prefix to recover the error key, and seeds
    ``self._last_notified`` (error_key -> last_sent_ts) and
    ``self._known_errors`` with the result.

    Args:
        db_path: Optional location of the SQLite database. When omitted,
            ``/usr/local/share/proxmenux/health_monitor.db`` is used.

    Returns:
        None. This is best-effort: a missing database file is a silent
        no-op, and any other failure is printed and swallowed.
    """
    prefix = 'health_'
    conn = None
    try:
        if db_path is None:
            db_path = '/usr/local/share/proxmenux/health_monitor.db'
        db_path = Path(db_path)
        if not db_path.exists():
            return

        conn = sqlite3.connect(str(db_path), timeout=10)
        conn.execute('PRAGMA journal_mode=WAL')
        rows = conn.execute(
            "SELECT fingerprint, last_sent_ts FROM notification_last_sent "
            "WHERE fingerprint LIKE 'health_%'"
        ).fetchall()

        for fingerprint, last_sent_ts in rows:
            # In SQL LIKE, '_' matches ANY single character, so the query can
            # return rows such as 'healthXfoo'. Keep only true prefix matches
            # and strip the prefix by position, not by substring search.
            if not fingerprint.startswith(prefix):
                continue
            error_key = fingerprint[len(prefix):]
            self._last_notified[error_key] = last_sent_ts
            self._known_errors.add(error_key)
    except Exception as e:
        print(f"[PollingCollector] Failed to load last_notified: {e}")
    finally:
        # Close on every path so a failed query does not leak the handle.
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
|
def _persist_last_notified(self, error_key: str, ts: float, db_path=None):
    """Save the per-error notification timestamp to the DB.

    Upserts the row ``health_<error_key>`` in ``notification_last_sent``:
    ``last_sent_ts`` is set to ``int(ts)`` and ``count`` becomes the previous
    count plus one, or 1 when no row existed yet.

    Args:
        error_key: Health error key; stored with a ``health_`` prefix.
        ts: Timestamp of the notification; truncated to an integer.
        db_path: Optional location of the SQLite database. When omitted,
            ``/usr/local/share/proxmenux/health_monitor.db`` is used.

    Returns:
        None. This is best-effort: failures are printed and swallowed so a
        database problem never breaks the caller.
    """
    conn = None
    try:
        if db_path is None:
            db_path = '/usr/local/share/proxmenux/health_monitor.db'
        conn = sqlite3.connect(str(Path(db_path)), timeout=10)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA busy_timeout=5000')

        fingerprint = f'health_{error_key}'
        # The sub-select reads the old count before REPLACE removes the row.
        conn.execute('''
            INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count)
            VALUES (?, ?, COALESCE(
                (SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1
            ))
        ''', (fingerprint, int(ts), fingerprint))
        conn.commit()
    except Exception as e:
        # Report instead of hiding the failure, matching the loader's style.
        print(f"[PollingCollector] Failed to persist last_notified: {e}")
    finally:
        # Close on every path so a failed write does not leak the handle.
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
|
|
||||||
# ─── Proxmox Webhook Receiver ───────────────────────────────────
|
# ─── Proxmox Webhook Receiver ───────────────────────────────────
|
||||||
@@ -801,6 +946,11 @@ class ProxmoxHookWatcher:
|
|||||||
event_type, entity, entity_id = self._classify(
|
event_type, entity, entity_id = self._classify(
|
||||||
notification_type, source_component, title, body, payload
|
notification_type, source_component, title, body, payload
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Discard meta-events (overall status changes, update status, etc.)
|
||||||
|
if event_type == '_skip':
|
||||||
|
return {'accepted': False, 'skipped': True, 'reason': 'Meta-event filtered'}
|
||||||
|
|
||||||
severity = self._map_severity(severity_raw)
|
severity = self._map_severity(severity_raw)
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
@@ -830,11 +980,28 @@ class ProxmoxHookWatcher:
|
|||||||
|
|
||||||
def _classify(self, ntype: str, component: str, title: str,
|
def _classify(self, ntype: str, component: str, title: str,
|
||||||
body: str, payload: dict) -> tuple:
|
body: str, payload: dict) -> tuple:
|
||||||
"""Classify webhook payload into (event_type, entity, entity_id)."""
|
"""Classify webhook payload into (event_type, entity, entity_id).
|
||||||
|
|
||||||
|
Returns ('_skip', '', '') for meta-events we should discard.
|
||||||
|
"""
|
||||||
title_lower = (title or '').lower()
|
title_lower = (title or '').lower()
|
||||||
body_lower = (body or '').lower()
|
body_lower = (body or '').lower()
|
||||||
component_lower = (component or '').lower()
|
component_lower = (component or '').lower()
|
||||||
|
|
||||||
|
# ── Skip PVE meta-events ──
|
||||||
|
# PVE sends "overall status changed from OK to WARNING" which is a meta
|
||||||
|
# aggregation event. Our own health monitor handles the underlying issues
|
||||||
|
# with better granularity, so we skip these to avoid noise/duplicates.
|
||||||
|
if 'overall' in title_lower and ('changed' in title_lower or 'status' in title_lower):
|
||||||
|
return '_skip', '', ''
|
||||||
|
|
||||||
|
# ── Skip "updates changed" status events ──
|
||||||
|
# PVE sends "updates status changed from OK to WARNING" when apt updates
|
||||||
|
# are available. Our PollingCollector already handles update checks with
|
||||||
|
# proper detail (security count, package list) on a 24h cycle.
|
||||||
|
if 'updates' in title_lower and ('changed' in title_lower or 'status' in title_lower):
|
||||||
|
return '_skip', '', ''
|
||||||
|
|
||||||
# VM / CT lifecycle events (if sent via webhook)
|
# VM / CT lifecycle events (if sent via webhook)
|
||||||
vmid = str(payload.get('vmid', ''))
|
vmid = str(payload.get('vmid', ''))
|
||||||
if any(k in component_lower for k in ('qemu', 'lxc', 'vm', 'ct', 'container')):
|
if any(k in component_lower for k in ('qemu', 'lxc', 'vm', 'ct', 'container')):
|
||||||
@@ -872,8 +1039,8 @@ class ProxmoxHookWatcher:
|
|||||||
if 'replication' in component_lower or 'replication' in title_lower:
|
if 'replication' in component_lower or 'replication' in title_lower:
|
||||||
vmid = str(payload.get('vmid', ''))
|
vmid = str(payload.get('vmid', ''))
|
||||||
if 'fail' in title_lower or 'error' in body_lower:
|
if 'fail' in title_lower or 'error' in body_lower:
|
||||||
return 'vm_fail', 'vm', vmid
|
return 'replication_fail', 'vm', vmid
|
||||||
return 'migration_complete', 'vm', vmid
|
return 'replication_complete', 'vm', vmid
|
||||||
|
|
||||||
# PBS (Proxmox Backup Server)
|
# PBS (Proxmox Backup Server)
|
||||||
if 'pbs' in component_lower or 'backup' in component_lower:
|
if 'pbs' in component_lower or 'backup' in component_lower:
|
||||||
@@ -901,8 +1068,10 @@ class ProxmoxHookWatcher:
|
|||||||
if 'network' in component_lower:
|
if 'network' in component_lower:
|
||||||
return 'network_down', 'network', ''
|
return 'network_down', 'network', ''
|
||||||
|
|
||||||
# Security
|
# Security -- distinguish firewall from auth
|
||||||
if any(k in component_lower for k in ('auth', 'firewall', 'security')):
|
if 'firewall' in component_lower or 'firewall' in title_lower:
|
||||||
|
return 'firewall_issue', 'node', ''
|
||||||
|
if any(k in component_lower for k in ('auth', 'security', 'pam', 'sshd')):
|
||||||
return 'auth_fail', 'user', ''
|
return 'auth_fail', 'user', ''
|
||||||
|
|
||||||
# Fallback: system_problem generic
|
# Fallback: system_problem generic
|
||||||
|
|||||||
@@ -495,10 +495,23 @@ class NotificationManager:
|
|||||||
self._dispatch_event(event)
|
self._dispatch_event(event)
|
||||||
|
|
||||||
def _process_event_direct(self, event: NotificationEvent):
|
def _process_event_direct(self, event: NotificationEvent):
|
||||||
"""Process a burst summary event. Bypasses aggregator but applies all other filters."""
|
"""Process a burst summary event. Bypasses aggregator but applies ALL other filters."""
|
||||||
if not self._enabled:
|
if not self._enabled:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Check group filter (same as _process_event)
|
||||||
|
template = TEMPLATES.get(event.event_type, {})
|
||||||
|
event_group = template.get('group', 'system')
|
||||||
|
group_setting = f'events.{event_group}'
|
||||||
|
if self._config.get(group_setting, 'true') == 'false':
|
||||||
|
return
|
||||||
|
|
||||||
|
# Check per-event filter (same as _process_event)
|
||||||
|
default_enabled = 'true' if template.get('default_enabled', True) else 'false'
|
||||||
|
event_specific = f'event.{event.event_type}'
|
||||||
|
if self._config.get(event_specific, default_enabled) == 'false':
|
||||||
|
return
|
||||||
|
|
||||||
# Check severity filter (same mapping as _process_event)
|
# Check severity filter (same mapping as _process_event)
|
||||||
severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'}
|
severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'}
|
||||||
raw_filter = self._config.get('severity_filter', 'all')
|
raw_filter = self._config.get('severity_filter', 'all')
|
||||||
|
|||||||
@@ -45,11 +45,14 @@ SEVERITY_ICONS_DISCORD = {
|
|||||||
|
|
||||||
TEMPLATES = {
|
TEMPLATES = {
|
||||||
# ── Health Monitor state changes ──
|
# ── Health Monitor state changes ──
|
||||||
|
# NOTE: state_change is disabled by default -- it fires on every
|
||||||
|
# status oscillation (OK->WARNING->OK) which creates noise.
|
||||||
|
# The health_persistent and new_error templates cover this better.
|
||||||
'state_change': {
|
'state_change': {
|
||||||
'title': '{hostname}: {category} changed to {current}',
|
'title': '{hostname}: {category} changed to {current}',
|
||||||
'body': '{category} status changed from {previous} to {current}.\n{reason}',
|
'body': '{category} status changed from {previous} to {current}.\n{reason}',
|
||||||
'group': 'system',
|
'group': 'system',
|
||||||
'default_enabled': True,
|
'default_enabled': False,
|
||||||
},
|
},
|
||||||
'new_error': {
|
'new_error': {
|
||||||
'title': '{hostname}: New {severity} - {category}',
|
'title': '{hostname}: New {severity} - {category}',
|
||||||
@@ -137,6 +140,18 @@ TEMPLATES = {
|
|||||||
'group': 'vm_ct',
|
'group': 'vm_ct',
|
||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
|
'replication_fail': {
|
||||||
|
'title': '{hostname}: Replication FAILED - {vmid}',
|
||||||
|
'body': 'Replication of {vmname} ({vmid}) has failed.\n{reason}',
|
||||||
|
'group': 'vm_ct',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
|
'replication_complete': {
|
||||||
|
'title': '{hostname}: Replication complete - {vmid}',
|
||||||
|
'body': 'Replication of {vmname} ({vmid}) completed successfully.',
|
||||||
|
'group': 'vm_ct',
|
||||||
|
'default_enabled': False,
|
||||||
|
},
|
||||||
|
|
||||||
# ── Backup / Snapshot events ──
|
# ── Backup / Snapshot events ──
|
||||||
'backup_start': {
|
'backup_start': {
|
||||||
@@ -314,6 +329,40 @@ TEMPLATES = {
|
|||||||
'default_enabled': False,
|
'default_enabled': False,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
# ── Persistent Health Issues (daily digest) ──
|
||||||
|
'health_persistent': {
|
||||||
|
'title': '{hostname}: {count} active health issue(s)',
|
||||||
|
'body': 'The following health issues remain active:\n{issue_list}\n\nThis digest is sent once every 24 hours while issues persist.',
|
||||||
|
'group': 'system',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
|
'health_issue_new': {
|
||||||
|
'title': '{hostname}: New health issue - {category}',
|
||||||
|
'body': 'New {severity} issue detected:\n{reason}',
|
||||||
|
'group': 'system',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
|
'health_issue_resolved': {
|
||||||
|
'title': '{hostname}: Resolved - {category}',
|
||||||
|
'body': '{category} issue has been resolved.\n{reason}\nDuration: {duration}',
|
||||||
|
'group': 'system',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
|
|
||||||
|
# ── Update notifications (enriched) ──
|
||||||
|
'update_summary': {
|
||||||
|
'title': '{hostname}: {total_count} updates available',
|
||||||
|
'body': '{security_count} security update(s), {total_count} total.\n{package_list}',
|
||||||
|
'group': 'system',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
|
'pve_update': {
|
||||||
|
'title': '{hostname}: PVE update available ({version})',
|
||||||
|
'body': 'Proxmox VE update available: {version}\n{details}',
|
||||||
|
'group': 'system',
|
||||||
|
'default_enabled': True,
|
||||||
|
},
|
||||||
|
|
||||||
# ── Burst aggregation summaries ──
|
# ── Burst aggregation summaries ──
|
||||||
'burst_auth_fail': {
|
'burst_auth_fail': {
|
||||||
'title': '{hostname}: {count} auth failures in {window}',
|
'title': '{hostname}: {count} auth failures in {window}',
|
||||||
@@ -407,6 +456,9 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
'used': '', 'total': '', 'available': '', 'cores': '',
|
'used': '', 'total': '', 'available': '', 'cores': '',
|
||||||
'count': '', 'size': '', 'snapshot_name': '', 'jail': '',
|
'count': '', 'size': '', 'snapshot_name': '', 'jail': '',
|
||||||
'failures': '', 'quorum': '', 'change_details': '', 'message': '',
|
'failures': '', 'quorum': '', 'change_details': '', 'message': '',
|
||||||
|
'security_count': '0', 'total_count': '0', 'package_list': '',
|
||||||
|
'packages': '', 'pve_packages': '', 'version': '',
|
||||||
|
'issue_list': '', 'error_key': '',
|
||||||
}
|
}
|
||||||
variables.update(data)
|
variables.update(data)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user