From 688ca8a604bc530edbb328c73af478e807fd36c6 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 2 Mar 2026 17:16:22 +0100 Subject: [PATCH] Update notification service --- AppImage/components/proxmox-dashboard.tsx | 2 +- AppImage/lib/api-config.ts | 52 +++------ AppImage/scripts/health_monitor.py | 14 +++ AppImage/scripts/health_persistence.py | 23 +++- AppImage/scripts/notification_events.py | 122 ++++++++++++++------- AppImage/scripts/notification_templates.py | 10 +- 6 files changed, 141 insertions(+), 82 deletions(-) diff --git a/AppImage/components/proxmox-dashboard.tsx b/AppImage/components/proxmox-dashboard.tsx index 20794015..8c342c2b 100644 --- a/AppImage/components/proxmox-dashboard.tsx +++ b/AppImage/components/proxmox-dashboard.tsx @@ -110,7 +110,7 @@ export function ProxmoxDashboard() { }) setIsServerConnected(true) } catch (error) { - console.error("[v0] Failed to fetch system data from Flask server:", error) + // Expected to fail in v0 preview (no Flask server) setIsServerConnected(false) setSystemStatus((prev) => ({ diff --git a/AppImage/lib/api-config.ts b/AppImage/lib/api-config.ts index 34175c9e..3bb0a36b 100644 --- a/AppImage/lib/api-config.ts +++ b/AppImage/lib/api-config.ts @@ -19,29 +19,19 @@ export const API_PORT = process.env.NEXT_PUBLIC_API_PORT || "8008" */ export function getApiBaseUrl(): string { if (typeof window === "undefined") { - console.log("[v0] getApiBaseUrl: Running on server (SSR)") return "" } const { protocol, hostname, port } = window.location - console.log("[v0] getApiBaseUrl - protocol:", protocol, "hostname:", hostname, "port:", port) - // If accessing via standard ports (80/443) or no port, assume we're behind a proxy // In this case, use relative URLs so the proxy handles routing const isStandardPort = port === "" || port === "80" || port === "443" - console.log("[v0] getApiBaseUrl - isStandardPort:", isStandardPort) - if (isStandardPort) { - // Behind a proxy - use relative URL - console.log("[v0] getApiBaseUrl: 
Detected proxy access, using relative URLs") return "" } else { - // Direct access - use explicit API port - const baseUrl = `${protocol}//${hostname}:${API_PORT}` - console.log("[v0] getApiBaseUrl: Direct access detected, using:", baseUrl) - return baseUrl + return `${protocol}//${hostname}:${API_PORT}` } } @@ -69,12 +59,7 @@ export function getAuthToken(): string | null { if (typeof window === "undefined") { return null } - const token = localStorage.getItem("proxmenux-auth-token") - console.log( - "[v0] getAuthToken called:", - token ? `Token found (length: ${token.length})` : "No token found in localStorage", - ) - return token + return localStorage.getItem("proxmenux-auth-token") } /** @@ -96,31 +81,20 @@ export async function fetchApi(endpoint: string, options?: RequestInit): Prom if (token) { headers["Authorization"] = `Bearer ${token}` - console.log("[v0] fetchApi:", endpoint, "- Authorization header ADDED") - } else { - console.log("[v0] fetchApi:", endpoint, "- NO TOKEN - Request will fail if endpoint is protected") } - try { - const response = await fetch(url, { - ...options, - headers, - cache: "no-store", - }) + const response = await fetch(url, { + ...options, + headers, + cache: "no-store", + }) - console.log("[v0] fetchApi:", endpoint, "- Response status:", response.status) - - if (!response.ok) { - if (response.status === 401) { - console.error("[v0] fetchApi: 401 UNAUTHORIZED -", endpoint, "- Token present:", !!token) - throw new Error(`Unauthorized: ${endpoint}`) - } - throw new Error(`API request failed: ${response.status} ${response.statusText}`) + if (!response.ok) { + if (response.status === 401) { + throw new Error(`Unauthorized: ${endpoint}`) } - - return response.json() - } catch (error) { - console.error("[v0] fetchApi error for", endpoint, ":", error) - throw error + throw new Error(`API request failed: ${response.status} ${response.statusText}`) } + + return response.json() } diff --git a/AppImage/scripts/health_monitor.py 
b/AppImage/scripts/health_monitor.py index 7430665b..cbb31b23 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -141,6 +141,20 @@ class HealthMonitor: r'ata\d+.*hard resetting link', r'ata\d+.*link is slow', r'ata\d+.*COMRESET', + + # ── ProxMenux self-referential noise ── + # The monitor reporting its OWN service failures is circular -- + # it cannot meaningfully alert about itself. + r'proxmenux-monitor\.service.*Failed', + r'proxmenux-monitor\.service.*exit-code', + r'ProxMenux-Monitor.*Failed at step EXEC', + + # ── PVE scheduler operational noise ── + # pvescheduler emits "could not update job state" every minute + # when a scheduled job reference is stale. This is cosmetic, + # not a system problem. + r'pvescheduler.*could not update job state', + r'pvescheduler.*no such task', ] CRITICAL_LOG_KEYWORDS = [ diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 9e4b0085..241534e3 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -221,7 +221,28 @@ class HealthPersistence: conn.close() return {'type': 'skipped_acknowledged', 'needs_notification': False} else: - # Suppression expired - reset as a NEW event + # Suppression expired. + # For log-based errors (spike, persistent, cascade), + # do NOT re-trigger. The journal always contains old + # messages, so re-creating the error would cause an + # infinite notification cycle. Instead, just delete + # the stale record so it stops appearing in the UI. 
+ is_log_error = ( + error_key.startswith('log_persistent_') + or error_key.startswith('log_spike_') + or error_key.startswith('log_cascade_') + or error_key.startswith('log_critical_') + or category == 'logs' + ) + if is_log_error: + cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) + conn.commit() + conn.close() + return {'type': 'skipped_expired_log', 'needs_notification': False} + + # For non-log errors (hardware, services, etc.), + # re-triggering is correct -- the condition is real + # and still present. cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) cursor.execute(''' INSERT INTO errors diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 8c4f381e..a95c405d 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -505,6 +505,7 @@ class JournalWatcher: r'user-runtime-dir@\d+', # User runtime dirs r'systemd-coredump@', # Coredump handlers (transient) r'run-.*\.mount', # Transient mounts + r'proxmenux-monitor', # Self-referential: monitor can't alert about itself ] for noise in _NOISE_PATTERNS: if re.search(noise, msg) or re.search(noise, unit): @@ -741,17 +742,20 @@ class JournalWatcher: def _check_backup_start(self, msg: str, syslog_id: str): """Detect backup job start from journal messages. - Matches multiple formats: - - pvedaemon: "INFO: starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ..." - - pvesh: "INFO: starting new backup job: vzdump 104 --mode stop --storage PBS-Cloud ..." - - vzdump: "starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ..." - - vzdump: "INFO: Starting Backup of VM 110 (qemu)" (per-guest fallback) + The message "starting new backup job: vzdump ..." is unique and + definitive -- only a real vzdump invocation produces it. 
We match + purely on message content, regardless of which service emitted it, + because PVE uses different syslog identifiers depending on how the + backup was triggered: + - pvescheduler (scheduled backups via /etc/pve/jobs.cfg) + - pvedaemon (GUI-triggered backups) + - pvesh (CLI / API-triggered backups) + - vzdump (per-guest "Starting Backup of VM ..." lines) - PVE emits from pvedaemon for scheduled backups, from pvesh for - API/CLI-triggered backups, and from vzdump for the per-guest lines. + Trying to maintain a whitelist of syslog_ids is fragile -- new PVE + versions or plugins may introduce more. The message pattern itself + is the reliable indicator. """ - if syslog_id not in ('pvedaemon', 'pvesh', 'vzdump', ''): - return # Primary pattern: full vzdump command with all arguments # Matches both "INFO: starting new backup job: vzdump ..." and @@ -887,49 +891,45 @@ class JournalWatcher: }, entity='cluster', entity_id=node_name) def _check_system_shutdown(self, msg: str, syslog_id: str): - """Detect system shutdown/reboot. + """Detect full-node shutdown or reboot. - Matches multiple systemd signals that indicate the node is going down: - - "Shutting down." (systemd PID 1) - - "System is powering off." / "System is rebooting." - - "Reached target Shutdown." / "Reached target Reboot." - - "Journal stopped" (very late in shutdown) - - "The system will reboot now!" / "The system will power off now!" + ONLY matches definitive signals from systemd (PID 1) or systemd-logind + that prove the entire node is going down -- NOT individual service restarts. + + Severity is INFO, not CRITICAL, because: + - A planned shutdown/reboot is an administrative action, not an emergency. + - If the node truly crashes, the monitor dies before it can send anything. + - Proxmox itself treats these as informational notifications. """ - msg_lower = msg.lower() + # Strict syslog_id filter: only systemd PID 1 and systemd-logind + # emit authoritative node-level shutdown messages. 
+ if syslog_id not in ('systemd', 'systemd-logind'): + return - # Only process systemd / logind messages - if not any(s in syslog_id for s in ('systemd', 'logind', '')): - if 'systemd' not in msg_lower: - return + msg_lower = msg.lower() is_reboot = False is_shutdown = False - # Detect reboot signals + # Reboot signals -- only definitive whole-system messages reboot_signals = [ 'system is rebooting', - 'reached target reboot', 'the system will reboot now', - 'starting reboot', ] for sig in reboot_signals: if sig in msg_lower: is_reboot = True break - # Detect shutdown/poweroff signals + # Shutdown/poweroff signals -- only definitive whole-system messages. + # "shutting down" is deliberately EXCLUDED because many services emit + # it during normal restarts (e.g. "Shutting down proxy server..."). + # "journal stopped" is EXCLUDED because journald can restart independently. if not is_reboot: shutdown_signals = [ 'system is powering off', 'system is halting', - 'shutting down', - 'reached target shutdown', - 'reached target halt', 'the system will power off now', - 'starting power-off', - 'journal stopped', - 'stopping journal service', ] for sig in shutdown_signals: if sig in msg_lower: @@ -937,13 +937,13 @@ class JournalWatcher: break if is_reboot: - self._emit('system_reboot', 'CRITICAL', { - 'reason': msg[:200], + self._emit('system_reboot', 'INFO', { + 'reason': 'The system is rebooting.', 'hostname': self._hostname, }, entity='node', entity_id='') elif is_shutdown: - self._emit('system_shutdown', 'CRITICAL', { - 'reason': msg[:200], + self._emit('system_shutdown', 'INFO', { + 'reason': 'The system is shutting down.', 'hostname': self._hostname, }, entity='node', entity_id='') @@ -1832,11 +1832,36 @@ class ProxmoxHookWatcher: 'hostname': pve_hostname, 'pve_type': pve_type, 'pve_message': message, - 'pve_title': title, - 'title': title, + 'pve_title': title or event_type, + 'title': title or event_type, 'job_id': pve_job_id, } + # ── Extract clean reason for 
system-mail events ── + # smartd and other system mail contains verbose boilerplate. + # Extract just the actionable warning/error lines. + if pve_type == 'system-mail' and message: + clean_lines = [] + for line in message.split('\n'): + stripped = line.strip() + # Skip boilerplate lines + if not stripped: + continue + if stripped.startswith('This message was generated'): + continue + if stripped.startswith('For details see'): + continue + if stripped.startswith('You can also use'): + continue + if stripped.startswith('The original message'): + continue + if stripped.startswith('Another message will'): + continue + if stripped.startswith('host name:') or stripped.startswith('DNS domain:'): + continue + clean_lines.append(stripped) + data['reason'] = '\n'.join(clean_lines).strip() if clean_lines else message.strip()[:500] + # Extract VMID and VM name from message for vzdump events if pve_type == 'vzdump' and message: # PVE vzdump messages contain lines like: @@ -1902,8 +1927,27 @@ class ProxmoxHookWatcher: if pve_type == 'package-updates': return 'update_available', 'node', '' - if pve_type == 'system-mail': - return 'system_mail', 'node', '' + if pve_type == 'system-mail': + # Parse smartd messages to extract useful info and filter noise. + # smartd sends system-mail when it detects SMART issues. + msg_lower = (message or '').lower() + title_lower_sm = (title or '').lower() + + # ── Filter smartd noise ── + # FailedReadSmartErrorLog: smartd can't read the error log -- this is + # a firmware quirk on some WD/Seagate drives, NOT a disk failure. + # FailedReadSmartData: similar firmware issue. + # These should NOT generate notifications. + smartd_noise = [ + 'failedreadsmarterrorlog', + 'failedreadsmartdata', + 'failedopendevice', # drive was temporarily unavailable + ] + for noise in smartd_noise: + if noise in title_lower_sm or noise in msg_lower: + return '_skip', '', '' + + return 'system_mail', 'node', '' # ── Fallback for unknown/empty pve_type ── # (e.g. 
test notifications, future PVE event types) diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 2d59dc65..34e624e2 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -561,13 +561,13 @@ TEMPLATES = { # ── System events ── 'system_shutdown': { 'title': '{hostname}: System shutting down', - 'body': 'The system is shutting down.\n{reason}', + 'body': '{reason}', 'group': 'system', 'default_enabled': True, }, 'system_reboot': { 'title': '{hostname}: System rebooting', - 'body': 'The system is rebooting.\n{reason}', + 'body': '{reason}', 'group': 'system', 'default_enabled': True, }, @@ -583,6 +583,12 @@ TEMPLATES = { 'group': 'system', 'default_enabled': True, }, + 'system_mail': { + 'title': '{hostname}: {pve_title}', + 'body': '{reason}', + 'group': 'system', + 'default_enabled': True, + }, 'update_available': { 'title': '{hostname}: Updates available', 'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',