Update notification service

This commit is contained in:
MacRimi
2026-03-02 17:16:22 +01:00
parent 9fe58935c4
commit 688ca8a604
6 changed files with 141 additions and 82 deletions

View File

@@ -110,7 +110,7 @@ export function ProxmoxDashboard() {
}) })
setIsServerConnected(true) setIsServerConnected(true)
} catch (error) { } catch (error) {
console.error("[v0] Failed to fetch system data from Flask server:", error) // Expected to fail in v0 preview (no Flask server)
setIsServerConnected(false) setIsServerConnected(false)
setSystemStatus((prev) => ({ setSystemStatus((prev) => ({

View File

@@ -19,29 +19,19 @@ export const API_PORT = process.env.NEXT_PUBLIC_API_PORT || "8008"
*/ */
export function getApiBaseUrl(): string { export function getApiBaseUrl(): string {
if (typeof window === "undefined") { if (typeof window === "undefined") {
console.log("[v0] getApiBaseUrl: Running on server (SSR)")
return "" return ""
} }
const { protocol, hostname, port } = window.location const { protocol, hostname, port } = window.location
console.log("[v0] getApiBaseUrl - protocol:", protocol, "hostname:", hostname, "port:", port)
// If accessing via standard ports (80/443) or no port, assume we're behind a proxy // If accessing via standard ports (80/443) or no port, assume we're behind a proxy
// In this case, use relative URLs so the proxy handles routing // In this case, use relative URLs so the proxy handles routing
const isStandardPort = port === "" || port === "80" || port === "443" const isStandardPort = port === "" || port === "80" || port === "443"
console.log("[v0] getApiBaseUrl - isStandardPort:", isStandardPort)
if (isStandardPort) { if (isStandardPort) {
// Behind a proxy - use relative URL
console.log("[v0] getApiBaseUrl: Detected proxy access, using relative URLs")
return "" return ""
} else { } else {
// Direct access - use explicit API port return `${protocol}//${hostname}:${API_PORT}`
const baseUrl = `${protocol}//${hostname}:${API_PORT}`
console.log("[v0] getApiBaseUrl: Direct access detected, using:", baseUrl)
return baseUrl
} }
} }
@@ -69,12 +59,7 @@ export function getAuthToken(): string | null {
if (typeof window === "undefined") { if (typeof window === "undefined") {
return null return null
} }
const token = localStorage.getItem("proxmenux-auth-token") return localStorage.getItem("proxmenux-auth-token")
console.log(
"[v0] getAuthToken called:",
token ? `Token found (length: ${token.length})` : "No token found in localStorage",
)
return token
} }
/** /**
@@ -96,31 +81,20 @@ export async function fetchApi<T>(endpoint: string, options?: RequestInit): Prom
if (token) { if (token) {
headers["Authorization"] = `Bearer ${token}` headers["Authorization"] = `Bearer ${token}`
console.log("[v0] fetchApi:", endpoint, "- Authorization header ADDED")
} else {
console.log("[v0] fetchApi:", endpoint, "- NO TOKEN - Request will fail if endpoint is protected")
} }
try { const response = await fetch(url, {
const response = await fetch(url, { ...options,
...options, headers,
headers, cache: "no-store",
cache: "no-store", })
})
console.log("[v0] fetchApi:", endpoint, "- Response status:", response.status) if (!response.ok) {
if (response.status === 401) {
if (!response.ok) { throw new Error(`Unauthorized: ${endpoint}`)
if (response.status === 401) {
console.error("[v0] fetchApi: 401 UNAUTHORIZED -", endpoint, "- Token present:", !!token)
throw new Error(`Unauthorized: ${endpoint}`)
}
throw new Error(`API request failed: ${response.status} ${response.statusText}`)
} }
throw new Error(`API request failed: ${response.status} ${response.statusText}`)
return response.json()
} catch (error) {
console.error("[v0] fetchApi error for", endpoint, ":", error)
throw error
} }
return response.json()
} }

View File

@@ -141,6 +141,20 @@ class HealthMonitor:
r'ata\d+.*hard resetting link', r'ata\d+.*hard resetting link',
r'ata\d+.*link is slow', r'ata\d+.*link is slow',
r'ata\d+.*COMRESET', r'ata\d+.*COMRESET',
# ── ProxMenux self-referential noise ──
# The monitor reporting its OWN service failures is circular --
# it cannot meaningfully alert about itself.
r'proxmenux-monitor\.service.*Failed',
r'proxmenux-monitor\.service.*exit-code',
r'ProxMenux-Monitor.*Failed at step EXEC',
# ── PVE scheduler operational noise ──
# pvescheduler emits "could not update job state" every minute
# when a scheduled job reference is stale. This is cosmetic,
# not a system problem.
r'pvescheduler.*could not update job state',
r'pvescheduler.*no such task',
] ]
CRITICAL_LOG_KEYWORDS = [ CRITICAL_LOG_KEYWORDS = [

View File

@@ -221,7 +221,28 @@ class HealthPersistence:
conn.close() conn.close()
return {'type': 'skipped_acknowledged', 'needs_notification': False} return {'type': 'skipped_acknowledged', 'needs_notification': False}
else: else:
# Suppression expired - reset as a NEW event # Suppression expired.
# For log-based errors (spike, persistent, cascade),
# do NOT re-trigger. The journal always contains old
# messages, so re-creating the error would cause an
# infinite notification cycle. Instead, just delete
# the stale record so it stops appearing in the UI.
is_log_error = (
error_key.startswith('log_persistent_')
or error_key.startswith('log_spike_')
or error_key.startswith('log_cascade_')
or error_key.startswith('log_critical_')
or category == 'logs'
)
if is_log_error:
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
conn.commit()
conn.close()
return {'type': 'skipped_expired_log', 'needs_notification': False}
# For non-log errors (hardware, services, etc.),
# re-triggering is correct -- the condition is real
# and still present.
cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,)) cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
cursor.execute(''' cursor.execute('''
INSERT INTO errors INSERT INTO errors

View File

@@ -505,6 +505,7 @@ class JournalWatcher:
r'user-runtime-dir@\d+', # User runtime dirs r'user-runtime-dir@\d+', # User runtime dirs
r'systemd-coredump@', # Coredump handlers (transient) r'systemd-coredump@', # Coredump handlers (transient)
r'run-.*\.mount', # Transient mounts r'run-.*\.mount', # Transient mounts
r'proxmenux-monitor', # Self-referential: monitor can't alert about itself
] ]
for noise in _NOISE_PATTERNS: for noise in _NOISE_PATTERNS:
if re.search(noise, msg) or re.search(noise, unit): if re.search(noise, msg) or re.search(noise, unit):
@@ -741,17 +742,20 @@ class JournalWatcher:
def _check_backup_start(self, msg: str, syslog_id: str): def _check_backup_start(self, msg: str, syslog_id: str):
"""Detect backup job start from journal messages. """Detect backup job start from journal messages.
Matches multiple formats: The message "starting new backup job: vzdump ..." is unique and
- pvedaemon: "INFO: starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ..." definitive -- only a real vzdump invocation produces it. We match
- pvesh: "INFO: starting new backup job: vzdump 104 --mode stop --storage PBS-Cloud ..." purely on message content, regardless of which service emitted it,
- vzdump: "starting new backup job: vzdump 110 --storage PBS-Cloud --mode stop ..." because PVE uses different syslog identifiers depending on how the
- vzdump: "INFO: Starting Backup of VM 110 (qemu)" (per-guest fallback) backup was triggered:
- pvescheduler (scheduled backups via /etc/pve/jobs.cfg)
- pvedaemon (GUI-triggered backups)
- pvesh (CLI / API-triggered backups)
- vzdump (per-guest "Starting Backup of VM ..." lines)
PVE emits from pvedaemon for scheduled backups, from pvesh for Trying to maintain a whitelist of syslog_ids is fragile -- new PVE
API/CLI-triggered backups, and from vzdump for the per-guest lines. versions or plugins may introduce more. The message pattern itself
is the reliable indicator.
""" """
if syslog_id not in ('pvedaemon', 'pvesh', 'vzdump', ''):
return
# Primary pattern: full vzdump command with all arguments # Primary pattern: full vzdump command with all arguments
# Matches both "INFO: starting new backup job: vzdump ..." and # Matches both "INFO: starting new backup job: vzdump ..." and
@@ -887,49 +891,45 @@ class JournalWatcher:
}, entity='cluster', entity_id=node_name) }, entity='cluster', entity_id=node_name)
def _check_system_shutdown(self, msg: str, syslog_id: str): def _check_system_shutdown(self, msg: str, syslog_id: str):
"""Detect system shutdown/reboot. """Detect full-node shutdown or reboot.
Matches multiple systemd signals that indicate the node is going down: ONLY matches definitive signals from PID 1 (systemd) that prove
- "Shutting down." (systemd PID 1) the entire node is going down -- NOT individual service restarts.
- "System is powering off." / "System is rebooting."
- "Reached target Shutdown." / "Reached target Reboot." Severity is INFO, not CRITICAL, because:
- "Journal stopped" (very late in shutdown) - A planned shutdown/reboot is an administrative action, not an emergency.
- "The system will reboot now!" / "The system will power off now!" - If the node truly crashes, the monitor dies before it can send anything.
- Proxmox itself treats these as informational notifications.
""" """
msg_lower = msg.lower() # Strict syslog_id filter: only systemd PID 1 and systemd-logind
# emit authoritative node-level shutdown messages.
if syslog_id not in ('systemd', 'systemd-logind'):
return
# Only process systemd / logind messages msg_lower = msg.lower()
if not any(s in syslog_id for s in ('systemd', 'logind', '')):
if 'systemd' not in msg_lower:
return
is_reboot = False is_reboot = False
is_shutdown = False is_shutdown = False
# Detect reboot signals # Reboot signals -- only definitive whole-system messages
reboot_signals = [ reboot_signals = [
'system is rebooting', 'system is rebooting',
'reached target reboot',
'the system will reboot now', 'the system will reboot now',
'starting reboot',
] ]
for sig in reboot_signals: for sig in reboot_signals:
if sig in msg_lower: if sig in msg_lower:
is_reboot = True is_reboot = True
break break
# Detect shutdown/poweroff signals # Shutdown/poweroff signals -- only definitive whole-system messages.
# "shutting down" is deliberately EXCLUDED because many services emit
# it during normal restarts (e.g. "Shutting down proxy server...").
# "journal stopped" is EXCLUDED because journald can restart independently.
if not is_reboot: if not is_reboot:
shutdown_signals = [ shutdown_signals = [
'system is powering off', 'system is powering off',
'system is halting', 'system is halting',
'shutting down',
'reached target shutdown',
'reached target halt',
'the system will power off now', 'the system will power off now',
'starting power-off',
'journal stopped',
'stopping journal service',
] ]
for sig in shutdown_signals: for sig in shutdown_signals:
if sig in msg_lower: if sig in msg_lower:
@@ -937,13 +937,13 @@ class JournalWatcher:
break break
if is_reboot: if is_reboot:
self._emit('system_reboot', 'CRITICAL', { self._emit('system_reboot', 'INFO', {
'reason': msg[:200], 'reason': 'The system is rebooting.',
'hostname': self._hostname, 'hostname': self._hostname,
}, entity='node', entity_id='') }, entity='node', entity_id='')
elif is_shutdown: elif is_shutdown:
self._emit('system_shutdown', 'CRITICAL', { self._emit('system_shutdown', 'INFO', {
'reason': msg[:200], 'reason': 'The system is shutting down.',
'hostname': self._hostname, 'hostname': self._hostname,
}, entity='node', entity_id='') }, entity='node', entity_id='')
@@ -1832,11 +1832,36 @@ class ProxmoxHookWatcher:
'hostname': pve_hostname, 'hostname': pve_hostname,
'pve_type': pve_type, 'pve_type': pve_type,
'pve_message': message, 'pve_message': message,
'pve_title': title, 'pve_title': title or event_type,
'title': title, 'title': title or event_type,
'job_id': pve_job_id, 'job_id': pve_job_id,
} }
# ── Extract clean reason for system-mail events ──
# smartd and other system mail contains verbose boilerplate.
# Extract just the actionable warning/error lines.
if pve_type == 'system-mail' and message:
clean_lines = []
for line in message.split('\n'):
stripped = line.strip()
# Skip boilerplate lines
if not stripped:
continue
if stripped.startswith('This message was generated'):
continue
if stripped.startswith('For details see'):
continue
if stripped.startswith('You can also use'):
continue
if stripped.startswith('The original message'):
continue
if stripped.startswith('Another message will'):
continue
if stripped.startswith('host name:') or stripped.startswith('DNS domain:'):
continue
clean_lines.append(stripped)
data['reason'] = '\n'.join(clean_lines).strip() if clean_lines else message.strip()[:500]
# Extract VMID and VM name from message for vzdump events # Extract VMID and VM name from message for vzdump events
if pve_type == 'vzdump' and message: if pve_type == 'vzdump' and message:
# PVE vzdump messages contain lines like: # PVE vzdump messages contain lines like:
@@ -1902,8 +1927,27 @@ class ProxmoxHookWatcher:
if pve_type == 'package-updates': if pve_type == 'package-updates':
return 'update_available', 'node', '' return 'update_available', 'node', ''
if pve_type == 'system-mail': if pve_type == 'system-mail':
return 'system_mail', 'node', '' # Parse smartd messages to extract useful info and filter noise.
# smartd sends system-mail when it detects SMART issues.
msg_lower = (message or '').lower()
title_lower_sm = (title or '').lower()
# ── Filter smartd noise ──
# FailedReadSmartErrorLog: smartd can't read the error log -- this is
# a firmware quirk on some WD/Seagate drives, NOT a disk failure.
# FailedReadSmartData: similar firmware issue.
# These should NOT generate notifications.
smartd_noise = [
'failedreadsmarterrorlog',
'failedreadsmartdata',
'failedopendevice', # drive was temporarily unavailable
]
for noise in smartd_noise:
if noise in title_lower_sm or noise in msg_lower:
return '_skip', '', ''
return 'system_mail', 'node', ''
# ── Fallback for unknown/empty pve_type ── # ── Fallback for unknown/empty pve_type ──
# (e.g. test notifications, future PVE event types) # (e.g. test notifications, future PVE event types)

View File

@@ -561,13 +561,13 @@ TEMPLATES = {
# ── System events ── # ── System events ──
'system_shutdown': { 'system_shutdown': {
'title': '{hostname}: System shutting down', 'title': '{hostname}: System shutting down',
'body': 'The system is shutting down.\n{reason}', 'body': '{reason}',
'group': 'system', 'group': 'system',
'default_enabled': True, 'default_enabled': True,
}, },
'system_reboot': { 'system_reboot': {
'title': '{hostname}: System rebooting', 'title': '{hostname}: System rebooting',
'body': 'The system is rebooting.\n{reason}', 'body': '{reason}',
'group': 'system', 'group': 'system',
'default_enabled': True, 'default_enabled': True,
}, },
@@ -583,6 +583,12 @@ TEMPLATES = {
'group': 'system', 'group': 'system',
'default_enabled': True, 'default_enabled': True,
}, },
'system_mail': {
'title': '{hostname}: {pve_title}',
'body': '{reason}',
'group': 'system',
'default_enabled': True,
},
'update_available': { 'update_available': {
'title': '{hostname}: Updates available', 'title': '{hostname}: Updates available',
'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}', 'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',