From 688ca8a604bc530edbb328c73af478e807fd36c6 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 2 Mar 2026 17:16:22 +0100 Subject: [PATCH] Update notification service --- AppImage/components/proxmox-dashboard.tsx | 2 +- AppImage/lib/api-config.ts | 52 +++------ AppImage/scripts/health_monitor.py | 14 +++ AppImage/scripts/health_persistence.py | 23 +++- AppImage/scripts/notification_events.py | 122 ++++++++++++++------- AppImage/scripts/notification_templates.py | 10 +- 6 files changed, 141 insertions(+), 82 deletions(-) diff --git a/AppImage/components/proxmox-dashboard.tsx b/AppImage/components/proxmox-dashboard.tsx index 20794015..8c342c2b 100644 --- a/AppImage/components/proxmox-dashboard.tsx +++ b/AppImage/components/proxmox-dashboard.tsx @@ -110,7 +110,7 @@ export function ProxmoxDashboard() { }) setIsServerConnected(true) } catch (error) { - console.error("[v0] Failed to fetch system data from Flask server:", error) + // Expected to fail in v0 preview (no Flask server) setIsServerConnected(false) setSystemStatus((prev) => ({ diff --git a/AppImage/lib/api-config.ts b/AppImage/lib/api-config.ts index 34175c9e..3bb0a36b 100644 --- a/AppImage/lib/api-config.ts +++ b/AppImage/lib/api-config.ts @@ -19,29 +19,19 @@ export const API_PORT = process.env.NEXT_PUBLIC_API_PORT || "8008" */ export function getApiBaseUrl(): string { if (typeof window === "undefined") { - console.log("[v0] getApiBaseUrl: Running on server (SSR)") return "" } const { protocol, hostname, port } = window.location - console.log("[v0] getApiBaseUrl - protocol:", protocol, "hostname:", hostname, "port:", port) - // If accessing via standard ports (80/443) or no port, assume we're behind a proxy // In this case, use relative URLs so the proxy handles routing const isStandardPort = port === "" || port === "80" || port === "443" - console.log("[v0] getApiBaseUrl - isStandardPort:", isStandardPort) - if (isStandardPort) { - // Behind a proxy - use relative URL - console.log("[v0] getApiBaseUrl: 
Detected proxy access, using relative URLs") return "" } else { - // Direct access - use explicit API port - const baseUrl = `${protocol}//${hostname}:${API_PORT}` - console.log("[v0] getApiBaseUrl: Direct access detected, using:", baseUrl) - return baseUrl + return `${protocol}//${hostname}:${API_PORT}` } } @@ -69,12 +59,7 @@ export function getAuthToken(): string | null { if (typeof window === "undefined") { return null } - const token = localStorage.getItem("proxmenux-auth-token") - console.log( - "[v0] getAuthToken called:", - token ? `Token found (length: ${token.length})` : "No token found in localStorage", - ) - return token + return localStorage.getItem("proxmenux-auth-token") } /** @@ -96,31 +81,20 @@ export async function fetchApi(endpoint: string, options?: RequestInit): Prom if (token) { headers["Authorization"] = `Bearer ${token}` - console.log("[v0] fetchApi:", endpoint, "- Authorization header ADDED") - } else { - console.log("[v0] fetchApi:", endpoint, "- NO TOKEN - Request will fail if endpoint is protected") } - try { - const response = await fetch(url, { - ...options, - headers, - cache: "no-store", - }) + const response = await fetch(url, { + ...options, + headers, + cache: "no-store", + }) - console.log("[v0] fetchApi:", endpoint, "- Response status:", response.status) - - if (!response.ok) { - if (response.status === 401) { - console.error("[v0] fetchApi: 401 UNAUTHORIZED -", endpoint, "- Token present:", !!token) - throw new Error(`Unauthorized: ${endpoint}`) - } - throw new Error(`API request failed: ${response.status} ${response.statusText}`) + if (!response.ok) { + if (response.status === 401) { + throw new Error(`Unauthorized: ${endpoint}`) } - - return response.json() - } catch (error) { - console.error("[v0] fetchApi error for", endpoint, ":", error) - throw error + throw new Error(`API request failed: ${response.status} ${response.statusText}`) } + + return response.json() } diff --git a/AppImage/scripts/health_monitor.py 
b/AppImage/scripts/health_monitor.py index 7430665b..cbb31b23 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -141,6 +141,20 @@ class HealthMonitor: r'ata\d+.*hard resetting link', r'ata\d+.*link is slow', r'ata\d+.*COMRESET', + + # ── ProxMenux self-referential noise ── + # The monitor reporting its OWN service failures is circular -- + # it cannot meaningfully alert about itself. + r'proxmenux-monitor\.service.*Failed', + r'proxmenux-monitor\.service.*exit-code', + r'ProxMenux-Monitor.*Failed at step EXEC', + + # ── PVE scheduler operational noise ── + # pvescheduler emits "could not update job state" every minute + # when a scheduled job reference is stale. This is cosmetic, + # not a system problem. + r'pvescheduler.*could not update job state', + r'pvescheduler.*no such task', ] CRITICAL_LOG_KEYWORDS = [ diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 9e4b0085..241534e3 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -221,7 +221,28 @@ class HealthPersistence: conn.close() return {'type': 'skipped_acknowledged', 'needs_notification': False} else: - # Suppression expired - reset as a NEW event + # Suppression expired. + # For log-based errors (spike, persistent, cascade), + # do NOT re-trigger. The journal always contains old + # messages, so re-creating the error would cause an + # infinite notification cycle. Instead, just delete + # the stale record so it stops appearing in the UI. 
+ is_log_error = ( + error_key.startswith('log_persistent_') + or error_key.startswith('log_spike_') + or error_key.startswith('log_cascade_') + or error_key.startswith('log_critical_') + or category == 'logs' + ) + if is_log_error: + cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) + conn.commit() + conn.close() + return {'type': 'skipped_expired_log', 'needs_notification': False} + + # For non-log errors (hardware, services, etc.), + # re-triggering is correct -- the condition is real + # and still present. cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) cursor.execute(''' INSERT INTO errors diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 8c4f381e..a95c405d 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -505,6 +505,7 @@ class JournalWatcher: r'user-runtime-dir@\d+', # User runtime dirs r'systemd-coredump@', # Coredump handlers (transient) r'run-.*\.mount', # Transient mounts + r'proxmenux-monitor', # Self-referential: monitor can't alert about itself ] for noise in _NOISE_PATTERNS: if re.search(noise, msg) or re.search(noise, unit): @@ -741,17 +742,20 @@ class JournalWatcher: def _check_backup_start(self, msg: str, syslog_id: str): """Detect backup job start from journal messages. - Matches multiple formats: - - pvedaemon: "INFO: starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ..." - - pvesh: "INFO: starting new backup job: vzdump 104 --mode stop --storage PBS-Cloud ..." - - vzdump: "starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ..." - - vzdump: "INFO: Starting Backup of VM 110 (qemu)" (per-guest fallback) + The message "starting new backup job: vzdump ..." is unique and + definitive -- only a real vzdump invocation produces it. 
We match + purely on message content, regardless of which service emitted it, + because PVE uses different syslog identifiers depending on how the + backup was triggered: + - pvescheduler (scheduled backups via /etc/pve/jobs.cfg) + - pvedaemon (GUI-triggered backups) + - pvesh (CLI / API-triggered backups) + - vzdump (per-guest "Starting Backup of VM ..." lines) - PVE emits from pvedaemon for scheduled backups, from pvesh for - API/CLI-triggered backups, and from vzdump for the per-guest lines. + Trying to maintain a whitelist of syslog_ids is fragile -- new PVE + versions or plugins may introduce more. The message pattern itself + is the reliable indicator. """ - if syslog_id not in ('pvedaemon', 'pvesh', 'vzdump', ''): - return # Primary pattern: full vzdump command with all arguments # Matches both "INFO: starting new backup job: vzdump ..." and @@ -887,49 +891,45 @@ class JournalWatcher: }, entity='cluster', entity_id=node_name) def _check_system_shutdown(self, msg: str, syslog_id: str): - """Detect system shutdown/reboot. + """Detect full-node shutdown or reboot. - Matches multiple systemd signals that indicate the node is going down: - - "Shutting down." (systemd PID 1) - - "System is powering off." / "System is rebooting." - - "Reached target Shutdown." / "Reached target Reboot." - - "Journal stopped" (very late in shutdown) - - "The system will reboot now!" / "The system will power off now!" + ONLY matches definitive signals from systemd (PID 1) or systemd-logind + that prove the entire node is going down -- NOT individual service restarts. + + Severity is INFO, not CRITICAL, because: + - A planned shutdown/reboot is an administrative action, not an emergency. + - If the node truly crashes, the monitor dies before it can send anything. + - Proxmox itself treats these as informational notifications. """ - msg_lower = msg.lower() + # Strict syslog_id filter: only systemd PID 1 and systemd-logind + # emit authoritative node-level shutdown messages. 
+ if syslog_id not in ('systemd', 'systemd-logind'): + return - # Only process systemd / logind messages - if not any(s in syslog_id for s in ('systemd', 'logind', '')): - if 'systemd' not in msg_lower: - return + msg_lower = msg.lower() is_reboot = False is_shutdown = False - # Detect reboot signals + # Reboot signals -- only definitive whole-system messages reboot_signals = [ 'system is rebooting', - 'reached target reboot', 'the system will reboot now', - 'starting reboot', ] for sig in reboot_signals: if sig in msg_lower: is_reboot = True break - # Detect shutdown/poweroff signals + # Shutdown/poweroff signals -- only definitive whole-system messages. + # "shutting down" is deliberately EXCLUDED because many services emit + # it during normal restarts (e.g. "Shutting down proxy server..."). + # "journal stopped" is EXCLUDED because journald can restart independently. if not is_reboot: shutdown_signals = [ 'system is powering off', 'system is halting', - 'shutting down', - 'reached target shutdown', - 'reached target halt', 'the system will power off now', - 'starting power-off', - 'journal stopped', - 'stopping journal service', ] for sig in shutdown_signals: if sig in msg_lower: @@ -937,13 +937,13 @@ class JournalWatcher: break if is_reboot: - self._emit('system_reboot', 'CRITICAL', { - 'reason': msg[:200], + self._emit('system_reboot', 'INFO', { + 'reason': 'The system is rebooting.', 'hostname': self._hostname, }, entity='node', entity_id='') elif is_shutdown: - self._emit('system_shutdown', 'CRITICAL', { - 'reason': msg[:200], + self._emit('system_shutdown', 'INFO', { + 'reason': 'The system is shutting down.', 'hostname': self._hostname, }, entity='node', entity_id='') @@ -1832,11 +1832,36 @@ class ProxmoxHookWatcher: 'hostname': pve_hostname, 'pve_type': pve_type, 'pve_message': message, - 'pve_title': title, - 'title': title, + 'pve_title': title or event_type, + 'title': title or event_type, 'job_id': pve_job_id, } + # ── Extract clean reason for 
system-mail events ── + # smartd and other system mail contains verbose boilerplate. + # Extract just the actionable warning/error lines. + if pve_type == 'system-mail' and message: + clean_lines = [] + for line in message.split('\n'): + stripped = line.strip() + # Skip boilerplate lines + if not stripped: + continue + if stripped.startswith('This message was generated'): + continue + if stripped.startswith('For details see'): + continue + if stripped.startswith('You can also use'): + continue + if stripped.startswith('The original message'): + continue + if stripped.startswith('Another message will'): + continue + if stripped.startswith('host name:') or stripped.startswith('DNS domain:'): + continue + clean_lines.append(stripped) + data['reason'] = '\n'.join(clean_lines).strip() if clean_lines else message.strip()[:500] + # Extract VMID and VM name from message for vzdump events if pve_type == 'vzdump' and message: # PVE vzdump messages contain lines like: @@ -1902,8 +1927,27 @@ class ProxmoxHookWatcher: if pve_type == 'package-updates': return 'update_available', 'node', '' - if pve_type == 'system-mail': - return 'system_mail', 'node', '' + if pve_type == 'system-mail': + # Parse smartd messages to extract useful info and filter noise. + # smartd sends system-mail when it detects SMART issues. + msg_lower = (message or '').lower() + title_lower_sm = (title or '').lower() + + # ── Filter smartd noise ── + # FailedReadSmartErrorLog: smartd can't read the error log -- this is + # a firmware quirk on some WD/Seagate drives, NOT a disk failure. + # FailedReadSmartData: similar firmware issue. + # These should NOT generate notifications. + smartd_noise = [ + 'failedreadsmarterrorlog', + 'failedreadsmartdata', + 'failedopendevice', # drive was temporarily unavailable + ] + for noise in smartd_noise: + if noise in title_lower_sm or noise in msg_lower: + return '_skip', '', '' + + return 'system_mail', 'node', '' # ── Fallback for unknown/empty pve_type ── # (e.g. 
test notifications, future PVE event types) diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 2d59dc65..34e624e2 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -561,13 +561,13 @@ TEMPLATES = { # ── System events ── 'system_shutdown': { 'title': '{hostname}: System shutting down', - 'body': 'The system is shutting down.\n{reason}', + 'body': '{reason}', 'group': 'system', 'default_enabled': True, }, 'system_reboot': { 'title': '{hostname}: System rebooting', - 'body': 'The system is rebooting.\n{reason}', + 'body': '{reason}', 'group': 'system', 'default_enabled': True, }, @@ -583,6 +583,12 @@ TEMPLATES = { 'group': 'system', 'default_enabled': True, }, + 'system_mail': { + 'title': '{hostname}: {pve_title}', + 'body': '{reason}', + 'group': 'system', + 'default_enabled': True, + }, 'update_available': { 'title': '{hostname}: Updates available', 'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',