diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index b6da6742..439b946f 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -3,6 +3,7 @@ import type React from "react" import { useState, useEffect, useCallback } from "react" +import { fetchApi, getApiUrl, getAuthToken } from "@/lib/api-config" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" import { Badge } from "@/components/ui/badge" import { Button } from "@/components/ui/button" @@ -122,10 +123,16 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu let newOverallStatus = "OK" // Use the new combined endpoint for fewer round-trips - const response = await fetch(getApiUrl("/api/health/full")) + const token = getAuthToken() + const authHeaders: Record = {} + if (token) { + authHeaders["Authorization"] = `Bearer ${token}` + } + + const response = await fetch(getApiUrl("/api/health/full"), { headers: authHeaders }) if (!response.ok) { // Fallback to legacy endpoint - const legacyResponse = await fetch(getApiUrl("/api/health/details")) + const legacyResponse = await fetch(getApiUrl("/api/health/details"), { headers: authHeaders }) if (!legacyResponse.ok) throw new Error("Failed to fetch health details") const data = await legacyResponse.json() setHealthData(data) @@ -288,15 +295,22 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu setDismissingKey(errorKey) try { - const response = await fetch(getApiUrl("/api/health/acknowledge"), { + const url = getApiUrl("/api/health/acknowledge") + const token = getAuthToken() + const headers: Record = { "Content-Type": "application/json" } + if (token) { + headers["Authorization"] = `Bearer ${token}` + } + + const response = await fetch(url, { method: "POST", - headers: { "Content-Type": "application/json" }, + headers, body: JSON.stringify({ 
error_key: errorKey }), }) if (!response.ok) { - const errorData = await response.json() - throw new Error(errorData.error || "Failed to dismiss error") + const errorData = await response.json().catch(() => ({})) + throw new Error(errorData.error || `Failed to dismiss error (${response.status})`) } await fetchHealthDetails() @@ -408,10 +422,10 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu key={checkKey} className="flex items-center justify-between gap-1.5 sm:gap-2 text-[10px] sm:text-xs py-1.5 px-2 sm:px-3 rounded-md hover:bg-muted/40 transition-colors" > -
- {getStatusIcon(checkData.status, "sm")} +
+ {getStatusIcon(checkData.status, "sm")} {formatCheckLabel(checkKey)} - {checkData.detail} + {checkData.detail} {checkData.dismissed && ( Dismissed @@ -520,8 +534,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{healthData.summary && healthData.summary !== "All systems operational" && ( -
-

{healthData.summary}

+
+

{healthData.summary}

)} @@ -559,7 +573,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu )}
{reason && !isExpanded && ( -

{reason}

+

{reason}

)}
@@ -578,7 +592,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu {isExpanded && (
{reason && ( -

{reason}

+

{reason}

)} {hasChecks ? ( renderChecks(checks, key) diff --git a/AppImage/components/notification-settings.tsx b/AppImage/components/notification-settings.tsx new file mode 100644 index 00000000..6d720da3 --- /dev/null +++ b/AppImage/components/notification-settings.tsx @@ -0,0 +1,1511 @@ +"use client" + +import { useState, useEffect, useCallback } from "react" +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "./ui/card" +import { Tabs, TabsList, TabsTrigger, TabsContent } from "./ui/tabs" +import { Input } from "./ui/input" +import { Label } from "./ui/label" +import { Badge } from "./ui/badge" + +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "./ui/select" +import { fetchApi } from "../lib/api-config" +import { + Bell, BellOff, Send, CheckCircle2, XCircle, Loader2, + AlertTriangle, Info, Settings2, Zap, Eye, EyeOff, + Trash2, ChevronDown, ChevronUp, ChevronRight, TestTube2, Mail, Webhook, + Copy, Server, Shield +} from "lucide-react" + +interface ChannelConfig { + enabled: boolean + bot_token?: string + chat_id?: string + url?: string + token?: string + webhook_url?: string + // Email channel fields + host?: string + port?: string + username?: string + password?: string + tls_mode?: string + from_address?: string + to_addresses?: string + subject_prefix?: string +} + +interface EventTypeInfo { + type: string + title: string + default_enabled: boolean +} + +interface NotificationConfig { + enabled: boolean + channels: Record + severity_filter: string + event_categories: Record + event_toggles: Record + event_types_by_group: Record + ai_enabled: boolean + ai_provider: string + ai_api_key: string + ai_model: string + hostname: string + webhook_secret: string + webhook_allowed_ips: string + pbs_host: string + pve_host: string + pbs_trusted_sources: string +} + +interface ServiceStatus { + enabled: boolean + running: boolean + channels: Record + queue_size: number + last_sent: string | null + total_sent_24h: number +} + 
+interface HistoryEntry { + id: number + event_type: string + channel: string + title: string + severity: string + sent_at: string + success: boolean + error_message: string | null +} + +const SEVERITY_OPTIONS = [ + { value: "critical", label: "Critical only" }, + { value: "warning", label: "Warning + Critical" }, + { value: "info", label: "All (Info + Warning + Critical)" }, +] + +const EVENT_CATEGORIES = [ + { key: "system", label: "System", desc: "Startup, shutdown, kernel events" }, + { key: "vm_ct", label: "VM / CT", desc: "Start, stop, crash, migration" }, + { key: "backup", label: "Backups", desc: "Backup start, complete, fail" }, + { key: "resources", label: "Resources", desc: "CPU, memory, temperature" }, + { key: "storage", label: "Storage", desc: "Disk space, I/O errors, SMART" }, + { key: "network", label: "Network", desc: "Connectivity, bond, latency" }, + { key: "security", label: "Security", desc: "Auth failures, fail2ban, firewall" }, + { key: "cluster", label: "Cluster", desc: "Quorum, split-brain, HA fencing" }, +] + +const AI_PROVIDERS = [ + { value: "openai", label: "OpenAI" }, + { value: "groq", label: "Groq" }, +] + +const DEFAULT_CONFIG: NotificationConfig = { + enabled: false, + channels: { + telegram: { enabled: false }, + gotify: { enabled: false }, + discord: { enabled: false }, + email: { enabled: false }, + }, + severity_filter: "all", + event_categories: { + system: true, vm_ct: true, backup: true, resources: true, + storage: true, network: true, security: true, cluster: true, + }, + event_toggles: {}, + event_types_by_group: {}, + ai_enabled: false, + ai_provider: "openai", + ai_api_key: "", + ai_model: "", + hostname: "", + webhook_secret: "", + webhook_allowed_ips: "", + pbs_host: "", + pve_host: "", + pbs_trusted_sources: "", +} + +export function NotificationSettings() { + const [config, setConfig] = useState(DEFAULT_CONFIG) + const [status, setStatus] = useState(null) + const [history, setHistory] = useState([]) + const [loading, 
setLoading] = useState(true) + const [saving, setSaving] = useState(false) + const [saved, setSaved] = useState(false) + const [testing, setTesting] = useState(null) + const [testResult, setTestResult] = useState<{ channel: string; success: boolean; message: string } | null>(null) + const [showHistory, setShowHistory] = useState(false) + const [showAdvanced, setShowAdvanced] = useState(false) + const [showSecrets, setShowSecrets] = useState>({}) + const [editMode, setEditMode] = useState(false) + const [hasChanges, setHasChanges] = useState(false) + const [expandedCategories, setExpandedCategories] = useState>(new Set()) + const [originalConfig, setOriginalConfig] = useState(DEFAULT_CONFIG) + const [webhookSetup, setWebhookSetup] = useState<{ + status: "idle" | "running" | "success" | "failed" + fallback_commands: string[] + error: string + }>({ status: "idle", fallback_commands: [], error: "" }) + + const loadConfig = useCallback(async () => { + try { + const data = await fetchApi<{ success: boolean; config: NotificationConfig }>("/api/notifications/settings") + if (data.success && data.config) { + setConfig(data.config) + setOriginalConfig(data.config) + } + } catch (err) { + console.error("Failed to load notification settings:", err) + } finally { + setLoading(false) + } + }, []) + + const loadStatus = useCallback(async () => { + try { + const data = await fetchApi<{ success: boolean } & ServiceStatus>("/api/notifications/status") + if (data.success) { + setStatus(data) + } + } catch { + // Service may not be running yet + } + }, []) + + const loadHistory = useCallback(async () => { + try { + const data = await fetchApi<{ success: boolean; history: HistoryEntry[]; total: number }>("/api/notifications/history?limit=20") + if (data.success) { + setHistory(data.history || []) + } + } catch { + // Ignore + } + }, []) + + useEffect(() => { + loadConfig() + loadStatus() + }, [loadConfig, loadStatus]) + + useEffect(() => { + if (showHistory) loadHistory() + }, 
[showHistory, loadHistory]) + + const updateConfig = (updater: (prev: NotificationConfig) => NotificationConfig) => { + setConfig(prev => { + const next = updater(prev) + setHasChanges(true) + return next + }) + } + + const updateChannel = (channel: string, field: string, value: string | boolean) => { + updateConfig(prev => ({ + ...prev, + channels: { + ...prev.channels, + [channel]: { ...prev.channels[channel], [field]: value }, + }, + })) + } + + /** Flatten the nested NotificationConfig into the flat key-value map the backend expects. */ + const flattenConfig = (cfg: NotificationConfig): Record => { + const flat: Record = { + enabled: String(cfg.enabled), + severity_filter: cfg.severity_filter, + ai_enabled: String(cfg.ai_enabled), + ai_provider: cfg.ai_provider, + ai_api_key: cfg.ai_api_key, + ai_model: cfg.ai_model, + hostname: cfg.hostname, + webhook_secret: cfg.webhook_secret, + webhook_allowed_ips: cfg.webhook_allowed_ips, + pbs_host: cfg.pbs_host, + pve_host: cfg.pve_host, + pbs_trusted_sources: cfg.pbs_trusted_sources, + } + // Flatten channels: { telegram: { enabled, bot_token, chat_id } } -> telegram.enabled, telegram.bot_token, ... + for (const [chName, chCfg] of Object.entries(cfg.channels)) { + for (const [field, value] of Object.entries(chCfg)) { + flat[`${chName}.${field}`] = String(value ?? "") + } + } + // Flatten event_categories: { system: true, backups: false } -> events.system, events.backups + for (const [cat, enabled] of Object.entries(cfg.event_categories)) { + flat[`events.${cat}`] = String(enabled) + } + // Flatten event_toggles: { vm_start: true, vm_stop: false } -> event.vm_start, event.vm_stop + // Always write ALL toggles to DB so the backend has an explicit record. + // This ensures default_enabled changes in templates don't get overridden by stale DB values. 
+ if (cfg.event_toggles) { + for (const [evt, enabled] of Object.entries(cfg.event_toggles)) { + flat[`event.${evt}`] = String(enabled) + } + } + // Also write any events NOT in event_toggles using their template defaults. + // This covers newly added templates whose default_enabled may be false. + if (cfg.event_types_by_group) { + for (const events of Object.values(cfg.event_types_by_group)) { + for (const evt of (events as Array<{type: string, default_enabled: boolean}>)) { + const key = `event.${evt.type}` + if (!(key in flat)) { + flat[key] = String(evt.default_enabled) + } + } + } + } + return flat + } + + const handleSave = async () => { + setSaving(true) + try { + // If notifications are being disabled, clean up PVE webhook first + const wasEnabled = originalConfig.enabled + const isNowDisabled = !config.enabled + + if (wasEnabled && isNowDisabled) { + try { + await fetchApi("/api/notifications/proxmox/cleanup-webhook", { method: "POST" }) + } catch { + // Non-fatal: webhook cleanup failed but we still save settings + } + } + + const payload = flattenConfig(config) + await fetchApi("/api/notifications/settings", { + method: "POST", + body: JSON.stringify(payload), + }) + setOriginalConfig(config) + setHasChanges(false) + setEditMode(false) + setSaved(true) + setTimeout(() => setSaved(false), 3000) + loadStatus() + } catch (err) { + console.error("Failed to save notification settings:", err) + } finally { + setSaving(false) + } + } + + const handleCancel = () => { + setConfig(originalConfig) + setHasChanges(false) + setEditMode(false) + } + + const handleTest = async (channel: string) => { + setTesting(channel) + setTestResult(null) + try { + // Auto-save current config before testing so backend has latest channel data + const payload = flattenConfig(config) + await fetchApi("/api/notifications/settings", { + method: "POST", + body: JSON.stringify(payload), + }) + setOriginalConfig(config) + setHasChanges(false) + + const data = await fetchApi<{ + success: 
boolean + message?: string + error?: string + results?: Record + }>("/api/notifications/test", { + method: "POST", + body: JSON.stringify({ channel }), + }) + + // Extract message from the results object if present + let message = data.message || "" + if (!message && data.results) { + const channelResult = data.results[channel] + if (channelResult) { + message = channelResult.success + ? "Test notification sent successfully" + : channelResult.error || "Test failed" + } + } + if (!message && data.error) { + message = data.error + } + if (!message) { + message = data.success ? "Test notification sent successfully" : "Test failed" + } + + setTestResult({ channel, success: data.success, message }) + } catch (err) { + setTestResult({ channel, success: false, message: String(err) }) + } finally { + setTesting(null) + setTimeout(() => setTestResult(null), 8000) + } + } + + const handleClearHistory = async () => { + try { + await fetchApi("/api/notifications/history", { method: "DELETE" }) + setHistory([]) + } catch { + // Ignore + } + } + + const toggleSecret = (key: string) => { + setShowSecrets(prev => ({ ...prev, [key]: !prev[key] })) + } + + if (loading) { + return ( + + +
+ + Notifications +
+
+ +
+
+
+ + + ) + } + + const activeChannels = Object.entries(config.channels).filter(([, ch]) => ch.enabled).length + + const handleEnable = async () => { + setSaving(true) + setWebhookSetup({ status: "running", fallback_commands: [], error: "" }) + try { + // 1) Save enabled=true + const newConfig = { ...config, enabled: true } + await fetchApi("/api/notifications/settings", { + method: "POST", + body: JSON.stringify(newConfig), + }) + setConfig(newConfig) + setOriginalConfig(newConfig) + + // 2) Auto-configure PVE webhook + try { + const setup = await fetchApi<{ + configured: boolean + secret?: string + fallback_commands?: string[] + error?: string + }>("/api/notifications/proxmox/setup-webhook", { method: "POST" }) + + if (setup.configured) { + setWebhookSetup({ status: "success", fallback_commands: [], error: "" }) + // Update secret in local config if one was generated + if (setup.secret) { + const updated = { ...newConfig, webhook_secret: setup.secret } + setConfig(updated) + setOriginalConfig(updated) + } + } else { + setWebhookSetup({ + status: "failed", + fallback_commands: setup.fallback_commands || [], + error: setup.error || "Unknown error", + }) + } + } catch { + setWebhookSetup({ + status: "failed", + fallback_commands: [], + error: "Could not reach setup endpoint", + }) + } + + setEditMode(true) + loadStatus() + } catch (err) { + console.error("Failed to enable notifications:", err) + setWebhookSetup({ status: "idle", fallback_commands: [], error: "" }) + } finally { + setSaving(false) + } + } + + // ── Disabled state: show activation card ── + if (!config.enabled && !editMode) { + return ( + + +
+ + Notifications + + Disabled + +
+ + Get real-time alerts about your Proxmox environment via Telegram, Discord, Gotify, or Email. + +
+ +
+
+
+ +
+

Enable notification service

+

+ Monitor system health, VM/CT events, backups, security alerts, and cluster status. + PVE webhook integration is configured automatically. +

+
+
+
+ +
+ + {/* Webhook setup result */} + {webhookSetup.status === "success" && ( +
+ +

+ PVE webhook configured automatically. Proxmox will send notifications to ProxMenux. +

+
+ )} + {webhookSetup.status === "failed" && ( +
+
+ +
+

+ Automatic PVE configuration failed: {webhookSetup.error} +

+

+ Notifications are enabled. Run the commands below on the PVE host to complete webhook setup. +

+
+
+ {webhookSetup.fallback_commands.length > 0 && ( +
+{webhookSetup.fallback_commands.join('\n')}
+                    
+ )} +
+ )} +
+ + {/* PBS manual section (collapsible) */} +
+ + + + Configure PBS notifications (manual) + +
+
+

+ PVE backups launched from the PVE interface are covered automatically by the PVE webhook above. +

+

+ However, PBS has its own internal jobs (Verify, Prune, GC, Sync) that generate + separate notifications. These must be configured directly on the PBS server. +

+
+
+

+ Append to /etc/proxmox-backup/notifications.cfg on the PBS host: +

+
+{`webhook: proxmenux-webhook
+\tmethod post
+\turl http://:8008/api/notifications/webhook
+
+matcher: proxmenux-pbs
+\ttarget proxmenux-webhook
+\tmatch-severity warning,error`}
+                  
+
+
+ +

+ {"Replace with the IP of this PVE node (not 127.0.0.1, unless PBS runs on the same host). Append at the end -- do not delete existing content."} +

+
+
+
+
+
+
+ ) + } + + return ( + + +
+
+ + Notifications + {config.enabled && ( + + Active + + )} +
+
+ {saved && ( + + + Saved + + )} + {editMode ? ( + <> + + + + ) : ( + + )} +
+
+ + Configure notification channels and event filters. Receive alerts via Telegram, Gotify, Discord, or Email. + +
+ + + {/* ── Service Status ── */} + {status && ( +
+
+
+ + {status.running ? "Service running" : "Service stopped"} + + {status.total_sent_24h > 0 && ( + + {status.total_sent_24h} sent in last 24h + + )} +
+ {activeChannels > 0 && ( + + {activeChannels} channel{activeChannels > 1 ? "s" : ""} + + )} +
+ )} + + {/* ── Enable/Disable ── */} +
+
+ {config.enabled ? ( + + ) : ( + + )} +
+ Enable Notifications +

Activate the notification service

+
+
+ +
+ + {config.enabled && ( + <> + {/* ── Channel Configuration ── */} +
+
+ + Channels +
+ +
+ + + + Telegram + + + Gotify + + + Discord + + + Email + + + + {/* Telegram */} + +
+ + +
+ {config.channels.telegram?.enabled && ( + <> +
+ +
+ updateChannel("telegram", "bot_token", e.target.value)} + /> + +
+
+
+ + updateChannel("telegram", "chat_id", e.target.value)} + /> +
+ {/* Per-channel action bar */} +
+ + +
+ + )} +
+ + {/* Gotify */} + +
+ + +
+ {config.channels.gotify?.enabled && ( + <> +
+ + updateChannel("gotify", "url", e.target.value)} + /> +
+
+ +
+ updateChannel("gotify", "token", e.target.value)} + /> + +
+
+ {/* Per-channel action bar */} +
+ + +
+ + )} +
+ + {/* Discord */} + +
+ + +
+ {config.channels.discord?.enabled && ( + <> +
+ +
+ updateChannel("discord", "webhook_url", e.target.value)} + /> + +
+
+ {/* Per-channel action bar */} +
+ + +
+ + )} +
+ + {/* Email */} + +
+ + +
+ {config.channels.email?.enabled && ( + <> +
+
+ + updateChannel("email", "host", e.target.value)} + /> +
+
+ + updateChannel("email", "port", e.target.value)} + /> +
+
+
+ + +
+
+
+ + updateChannel("email", "username", e.target.value)} + /> +
+
+ +
+ updateChannel("email", "password", e.target.value)} + /> + +
+
+
+
+ + updateChannel("email", "from_address", e.target.value)} + /> +
+
+ + updateChannel("email", "to_addresses", e.target.value)} + /> +
+
+ + updateChannel("email", "subject_prefix", e.target.value)} + /> +
+
+ +

+ Leave SMTP Host empty to use local sendmail (must be installed on the server). + For Gmail, use an App Password instead of your account password. +

+
+ {/* Per-channel action bar */} +
+ + +
+ + )} +
+
+ + {/* Test Result */} + {testResult && ( +
+ {testResult.success ? ( + + ) : ( + + )} + {testResult.message} +
+ )} +
{/* close bordered channel container */} +
+ + {/* ── Filters ── */} +
+
+ + Filters & Events +
+
+ {/* Severity */} +
+ + +
+ + {/* Event Categories */} +
+ +
+ {EVENT_CATEGORIES.map(cat => { + const isEnabled = config.event_categories[cat.key] ?? true + const isExpanded = expandedCategories.has(cat.key) + const eventsForGroup = config.event_types_by_group?.[cat.key] || [] + const enabledCount = eventsForGroup.filter(e => config.event_toggles?.[e.type] ?? e.default_enabled).length + + return ( +
+ {/* Category header row */} +
+ {/* Expand/collapse button */} + + + {/* Label + description */} +
+ + {cat.label} + + {cat.desc} +
+ + {/* Count badge */} + {isEnabled && eventsForGroup.length > 0 && ( + + {enabledCount}/{eventsForGroup.length} + + )} + + {/* Category toggle */} + +
+ + {/* Per-event toggles (expanded) */} + {isEnabled && isExpanded && eventsForGroup.length > 0 && ( +
+ {eventsForGroup.map(evt => { + const evtEnabled = config.event_toggles?.[evt.type] ?? evt.default_enabled + return ( +
+ + {evt.title} + + +
+ ) + })} +
+ )} +
+ ) + })} +
+
+
{/* close bordered filters container */} +
+ + {/* ── Proxmox Webhook ── */} +
+
+ + Proxmox Webhook +
+
+
+
+ PVE Webhook Configuration +
+ {!editMode && ( + + )} +
+ + {/* Setup status inline */} + {webhookSetup.status === "success" && ( +
+ +

PVE webhook configured successfully.

+
+ )} + {webhookSetup.status === "failed" && ( +
+
+ +

PVE auto-config failed: {webhookSetup.error}

+
+ {webhookSetup.fallback_commands.length > 0 && ( +
+{webhookSetup.fallback_commands.join('\n')}
+                    
+ )} +
+ )} + +
+ +
+ updateConfig(p => ({ ...p, webhook_secret: e.target.value }))} + disabled={!editMode} + /> + +
+

+ {"Used for remote connections only (e.g. PBS on another host). Local PVE webhook runs on localhost and does not need this header."} +

+
+
+ + updateConfig(p => ({ ...p, webhook_allowed_ips: e.target.value }))} + disabled={!editMode} + /> +

+ {"Localhost (127.0.0.1) is always allowed. This restricts remote callers only."} +

+
+
{/* close bordered webhook container */} + + {/* PBS manual guide (collapsible) */} +
+ + + Configure PBS notifications (manual) + +
+

+ Backups launched from PVE are covered by the PVE webhook. PBS internal jobs + (Verify, Prune, GC, Sync) require separate configuration on the PBS server. +

+

+ Append to /etc/proxmox-backup/notifications.cfg: +

+
+{`webhook: proxmenux-webhook
+\tmethod post
+\turl http://:8008/api/notifications/webhook
+
+matcher: proxmenux-pbs
+\ttarget proxmenux-webhook
+\tmatch-severity warning,error`}
+                  
+

+ {"Replace with this node's IP. Append at the end -- do not delete existing content."} +

+
+
+
+ + {/* ── Advanced: AI Enhancement ── */} +
+ + + {showAdvanced && ( +
+
+
+ AI-Enhanced Messages +

Use AI to generate contextual notification messages

+
+ +
+ + {config.ai_enabled && ( + <> +
+ + +
+
+ +
+ updateConfig(p => ({ ...p, ai_api_key: e.target.value }))} + disabled={!editMode} + /> + +
+
+
+ + updateConfig(p => ({ ...p, ai_model: e.target.value }))} + disabled={!editMode} + /> +
+
+ +

+ AI enhancement is optional. When enabled, notifications include contextual analysis and recommended actions. If the AI service is unavailable, standard templates are used as fallback. +

+
+ + )} +
+ )} +
+ + {/* ── Notification History ── */} +
+ + + {showHistory && ( +
+ {history.length === 0 ? ( +

No notifications sent yet

+ ) : ( + <> +
+ +
+
+ {history.map(entry => ( +
+ {entry.success ? ( + + ) : ( + + )} +
+ {entry.title || entry.event_type} + + {entry.channel} - {new Date(entry.sent_at).toLocaleString()} + +
+ + {entry.severity} + +
+ ))} +
+ + )} +
+ )} +
+ + )} + + {/* ── Footer info ── */} +
+ +

+ {config.enabled + ? "Notifications are active. Events matching your severity filter and category selection will be sent to configured channels." + : "Enable notifications to receive alerts about system events, health status changes, and security incidents via Telegram, Gotify, Discord, or Email."} +

+
+ + + ) +} diff --git a/AppImage/components/settings.tsx b/AppImage/components/settings.tsx index 4f037221..f2631177 100644 --- a/AppImage/components/settings.tsx +++ b/AppImage/components/settings.tsx @@ -3,6 +3,7 @@ import { useState, useEffect } from "react" import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "./ui/card" import { Wrench, Package, Ruler, HeartPulse, Cpu, MemoryStick, HardDrive, CircleDot, Network, Server, Settings2, FileText, RefreshCw, Shield, AlertTriangle, Info, Loader2, Check } from "lucide-react" +import { NotificationSettings } from "./notification-settings" import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "./ui/select" import { Input } from "./ui/input" import { Badge } from "./ui/badge" @@ -438,6 +439,9 @@ export function Settings() { + {/* Notification Settings */} + + {/* ProxMenux Optimizations */} diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx index a9a4cf79..648993b9 100644 --- a/AppImage/components/storage-overview.tsx +++ b/AppImage/components/storage-overview.tsx @@ -34,6 +34,12 @@ interface DiskInfo { wear_leveling_count?: number // SSD: Wear Leveling Count total_lbas_written?: number // SSD/NVMe: Total LBAs Written (GB) ssd_life_left?: number // SSD: SSD Life Left percentage + io_errors?: { + count: number + severity: string + sample: string + reason: string + } } interface ZFSPool { @@ -776,6 +782,17 @@ export function StorageOverview() {
+ {disk.io_errors && disk.io_errors.count > 0 && ( +
+ + {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min +
+ )} +
{disk.size_formatted && (
@@ -841,6 +858,22 @@ export function StorageOverview() {
+ {disk.io_errors && disk.io_errors.count > 0 && ( +
+ +
+ {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min + {disk.io_errors.sample && ( +

{disk.io_errors.sample}

+ )} +
+
+ )} +
{disk.size_formatted && (
diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh index 447dd60e..3b5f3090 100644 --- a/AppImage/scripts/build_appimage.sh +++ b/AppImage/scripts/build_appimage.sh @@ -91,6 +91,11 @@ cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || e cp "$SCRIPT_DIR/flask_script_runner.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_script_runner.py not found" cp "$SCRIPT_DIR/security_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ security_manager.py not found" cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_security_routes.py not found" +cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_manager.py not found" +cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found" +cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found" +cp "$SCRIPT_DIR/notification_events.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_events.py not found" +cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_notification_routes.py not found" echo "📋 Adding translation support..." cat > "$APP_DIR/usr/bin/translate_cli.py" << 'PYEOF' diff --git a/AppImage/scripts/flask_notification_routes.py b/AppImage/scripts/flask_notification_routes.py new file mode 100644 index 00000000..c0a79c63 --- /dev/null +++ b/AppImage/scripts/flask_notification_routes.py @@ -0,0 +1,695 @@ +""" +Flask routes for notification service configuration and management. +Blueprint pattern matching flask_health_routes.py / flask_security_routes.py. 
+""" + +import hmac +import time +import hashlib +from collections import deque +from flask import Blueprint, jsonify, request +from notification_manager import notification_manager + + +# ─── Webhook Hardening Helpers ─────────────────────────────────── + +class WebhookRateLimiter: + """Simple sliding-window rate limiter for the webhook endpoint.""" + + def __init__(self, max_requests: int = 60, window_seconds: int = 60): + self._max = max_requests + self._window = window_seconds + self._timestamps: deque = deque() + + def allow(self) -> bool: + now = time.time() + # Prune entries outside the window + while self._timestamps and now - self._timestamps[0] > self._window: + self._timestamps.popleft() + if len(self._timestamps) >= self._max: + return False + self._timestamps.append(now) + return True + + +class ReplayCache: + """Bounded in-memory cache of recently seen request signatures (60s TTL).""" + + _MAX_SIZE = 2000 # Hard cap to prevent memory growth + + def __init__(self, ttl: int = 60): + self._ttl = ttl + self._seen: dict = {} # signature -> timestamp + + def check_and_record(self, signature: str) -> bool: + """Return True if this signature was already seen (replay). 
Records it otherwise.""" + now = time.time() + # Periodic cleanup + if len(self._seen) > self._MAX_SIZE // 2: + cutoff = now - self._ttl + self._seen = {k: v for k, v in self._seen.items() if v > cutoff} + if signature in self._seen and now - self._seen[signature] < self._ttl: + return True # Replay detected + self._seen[signature] = now + return False + + +# Module-level singletons (one per process) +_webhook_limiter = WebhookRateLimiter(max_requests=60, window_seconds=60) +_replay_cache = ReplayCache(ttl=60) + +# Timestamp validation window (seconds) +_TIMESTAMP_MAX_DRIFT = 60 + +notification_bp = Blueprint('notifications', __name__) + + +@notification_bp.route('/api/notifications/settings', methods=['GET']) +def get_notification_settings(): + """Get all notification settings for the UI.""" + try: + settings = notification_manager.get_settings() + return jsonify(settings) + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@notification_bp.route('/api/notifications/settings', methods=['POST']) +def save_notification_settings(): + """Save notification settings from the UI.""" + try: + payload = request.get_json() + if not payload: + return jsonify({'error': 'No data provided'}), 400 + + result = notification_manager.save_settings(payload) + return jsonify(result) + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@notification_bp.route('/api/notifications/test', methods=['POST']) +def test_notification(): + """Send a test notification to one or all channels.""" + try: + data = request.get_json() or {} + channel = data.get('channel', 'all') + + result = notification_manager.test_channel(channel) + return jsonify(result) + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@notification_bp.route('/api/notifications/status', methods=['GET']) +def get_notification_status(): + """Get notification service status.""" + try: + status = notification_manager.get_status() + return jsonify(status) + except Exception as 
e: + return jsonify({'error': str(e)}), 500 + + +@notification_bp.route('/api/notifications/history', methods=['GET']) +def get_notification_history(): + """Get notification history with optional filters.""" + try: + limit = request.args.get('limit', 100, type=int) + offset = request.args.get('offset', 0, type=int) + severity = request.args.get('severity', '') + channel = request.args.get('channel', '') + + result = notification_manager.get_history(limit, offset, severity, channel) + return jsonify(result) + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@notification_bp.route('/api/notifications/history', methods=['DELETE']) +def clear_notification_history(): + """Clear all notification history.""" + try: + result = notification_manager.clear_history() + return jsonify(result) + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@notification_bp.route('/api/notifications/send', methods=['POST']) +def send_notification(): + """Send a notification via API (for testing or external triggers).""" + try: + data = request.get_json() + if not data: + return jsonify({'error': 'No data provided'}), 400 + + result = notification_manager.send_notification( + event_type=data.get('event_type', 'custom'), + severity=data.get('severity', 'INFO'), + title=data.get('title', ''), + message=data.get('message', ''), + data=data.get('data', {}), + source='api' + ) + return jsonify(result) + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +# ── PVE config constants ── +_PVE_ENDPOINT_ID = 'proxmenux-webhook' +_PVE_MATCHER_ID = 'proxmenux-default' +_PVE_WEBHOOK_URL = 'http://127.0.0.1:8008/api/notifications/webhook' +_PVE_NOTIFICATIONS_CFG = '/etc/pve/notifications.cfg' +_PVE_PRIV_CFG = '/etc/pve/priv/notifications.cfg' +_PVE_OUR_HEADERS = { + f'webhook: {_PVE_ENDPOINT_ID}', + f'matcher: {_PVE_MATCHER_ID}', +} + + +def _pve_read_file(path): + """Read file, return (content, error). 
Content is '' if missing.""" + try: + with open(path, 'r') as f: + return f.read(), None + except FileNotFoundError: + return '', None + except PermissionError: + return None, f'Permission denied reading {path}' + except Exception as e: + return None, str(e) + + +def _pve_backup_file(path): + """Create timestamped backup if file exists. Never fails fatally.""" + import os, shutil + from datetime import datetime + try: + if os.path.exists(path): + ts = datetime.now().strftime('%Y%m%d_%H%M%S') + backup = f"{path}.proxmenux_backup_{ts}" + shutil.copy2(path, backup) + except Exception: + pass + + +def _pve_remove_our_blocks(text, headers_to_remove): + """Remove only blocks whose header line matches one of ours. + + Preserves ALL other content byte-for-byte. + A block = header line + indented continuation lines + trailing blank line. + """ + lines = text.splitlines(keepends=True) + cleaned = [] + skip_block = False + + for line in lines: + stripped = line.strip() + + if stripped and not line[0:1].isspace() and ':' in stripped: + if stripped in headers_to_remove: + skip_block = True + continue + else: + skip_block = False + + if skip_block: + if not stripped: + skip_block = False + continue + elif line[0:1].isspace(): + continue + else: + skip_block = False + + cleaned.append(line) + + return ''.join(cleaned) + + +def _build_webhook_fallback(): + """Build fallback manual commands for webhook setup.""" + import base64 + body_tpl = '{"title":"{{ escape title }}","message":"{{ escape message }}","severity":"{{ severity }}","timestamp":"{{ timestamp }}","fields":{{ json fields }}}' + body_b64 = base64.b64encode(body_tpl.encode()).decode() + return [ + "# 1. Append to END of /etc/pve/notifications.cfg", + "# (do NOT delete existing content):", + "", + f"webhook: {_PVE_ENDPOINT_ID}", + f"\tbody {body_b64}", + f"\tmethod post", + f"\turl {_PVE_WEBHOOK_URL}", + "", + f"matcher: {_PVE_MATCHER_ID}", + f"\ttarget {_PVE_ENDPOINT_ID}", + "\tmode all", + "", + "# 2. 
Append to /etc/pve/priv/notifications.cfg :", + f"webhook: {_PVE_ENDPOINT_ID}", + ] + + +def setup_pve_webhook_core() -> dict: + """Core logic to configure PVE webhook. Callable from anywhere. + + Returns dict with 'configured', 'error', 'fallback_commands' keys. + Idempotent: safe to call multiple times. + """ + import secrets as secrets_mod + + result = { + 'configured': False, + 'endpoint_id': _PVE_ENDPOINT_ID, + 'matcher_id': _PVE_MATCHER_ID, + 'url': _PVE_WEBHOOK_URL, + 'fallback_commands': [], + 'error': None, + } + + try: + # ── Step 1: Ensure webhook secret exists (for our own internal use) ── + secret = notification_manager.get_webhook_secret() + if not secret: + secret = secrets_mod.token_urlsafe(32) + notification_manager._save_setting('webhook_secret', secret) + + # ── Step 2: Read main config ── + cfg_text, err = _pve_read_file(_PVE_NOTIFICATIONS_CFG) + if err: + result['error'] = err + result['fallback_commands'] = _build_webhook_fallback() + return result + + # ── Step 3: Read priv config (to clean up any broken blocks we wrote before) ── + priv_text, err = _pve_read_file(_PVE_PRIV_CFG) + if err: + priv_text = None + + # ── Step 4: Create backups before ANY modification ── + _pve_backup_file(_PVE_NOTIFICATIONS_CFG) + if priv_text is not None: + _pve_backup_file(_PVE_PRIV_CFG) + + # ── Step 5: Remove any previous proxmenux blocks from BOTH files ── + cleaned_cfg = _pve_remove_our_blocks(cfg_text, _PVE_OUR_HEADERS) + + if priv_text is not None: + cleaned_priv = _pve_remove_our_blocks(priv_text, _PVE_OUR_HEADERS) + + # ── Step 6: Build new blocks ── + # Exact format from a real working PVE server: + # webhook: name + # \tmethod post + # \turl http://... + # + # NO header lines -- localhost webhook doesn't need them. + # PVE header format is: header name=X-Key,value= + # PVE secret format is: secret name=key,value= + # Neither is needed for localhost calls. + + # PVE stores body as base64 in the config file. 
+ # {{ escape title/message }} -- JSON-safe escaping of quotes/newlines. + # {{ json fields }} -- renders ALL PVE metadata as a JSON object + # (type, hostname, job-id). This is a single Handlebars helper + # that always works, even if fields is empty (renders {}). + import base64 + body_template = '{"title":"{{ escape title }}","message":"{{ escape message }}","severity":"{{ severity }}","timestamp":"{{ timestamp }}","fields":{{ json fields }}}' + body_b64 = base64.b64encode(body_template.encode()).decode() + + endpoint_block = ( + f"webhook: {_PVE_ENDPOINT_ID}\n" + f"\tbody {body_b64}\n" + f"\tmethod post\n" + f"\turl {_PVE_WEBHOOK_URL}\n" + ) + + matcher_block = ( + f"matcher: {_PVE_MATCHER_ID}\n" + f"\ttarget {_PVE_ENDPOINT_ID}\n" + f"\tmode all\n" + ) + + # ── Step 7: Append our blocks to cleaned main config ── + if cleaned_cfg and not cleaned_cfg.endswith('\n'): + cleaned_cfg += '\n' + if cleaned_cfg and not cleaned_cfg.endswith('\n\n'): + cleaned_cfg += '\n' + + new_cfg = cleaned_cfg + endpoint_block + '\n' + matcher_block + + # ── Step 8: Write main config ── + try: + with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: + f.write(new_cfg) + except PermissionError: + result['error'] = f'Permission denied writing {_PVE_NOTIFICATIONS_CFG}' + result['fallback_commands'] = _build_webhook_fallback() + return result + except Exception as e: + try: + with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: + f.write(cfg_text) + except Exception: + pass + result['error'] = str(e) + result['fallback_commands'] = _build_webhook_fallback() + return result + + # ── Step 9: Write priv config with our webhook entry ── + # PVE REQUIRES a matching block in priv/notifications.cfg for every + # webhook endpoint, even if it has no secrets. 
Without it PVE throws: + # "Could not instantiate endpoint: private config does not exist" + priv_block = ( + f"webhook: {_PVE_ENDPOINT_ID}\n" + ) + + if priv_text is not None: + # Start from cleaned priv (our old blocks removed) + if cleaned_priv and not cleaned_priv.endswith('\n'): + cleaned_priv += '\n' + if cleaned_priv and not cleaned_priv.endswith('\n\n'): + cleaned_priv += '\n' + new_priv = cleaned_priv + priv_block + else: + new_priv = priv_block + + try: + with open(_PVE_PRIV_CFG, 'w') as f: + f.write(new_priv) + except PermissionError: + result['error'] = f'Permission denied writing {_PVE_PRIV_CFG}' + result['fallback_commands'] = _build_webhook_fallback() + return result + except Exception: + pass + + result['configured'] = True + result['secret'] = secret + return result + + except Exception as e: + result['error'] = str(e) + result['fallback_commands'] = _build_webhook_fallback() + return result + + +@notification_bp.route('/api/notifications/proxmox/setup-webhook', methods=['POST']) +def setup_proxmox_webhook(): + """HTTP endpoint wrapper for webhook setup.""" + return jsonify(setup_pve_webhook_core()), 200 + + +def cleanup_pve_webhook_core() -> dict: + """Core logic to remove PVE webhook blocks. Callable from anywhere. + + Returns dict with 'cleaned', 'error' keys. + Only removes blocks named 'proxmenux-webhook' / 'proxmenux-default'. 
+ """ + result = {'cleaned': False, 'error': None} + + try: + # Read both files + cfg_text, err = _pve_read_file(_PVE_NOTIFICATIONS_CFG) + if err: + result['error'] = err + return result + + priv_text, err = _pve_read_file(_PVE_PRIV_CFG) + if err: + priv_text = None + + # Check if our blocks actually exist before doing anything + has_our_blocks = any( + h in cfg_text for h in [f'webhook: {_PVE_ENDPOINT_ID}', f'matcher: {_PVE_MATCHER_ID}'] + ) + has_priv_blocks = priv_text and f'webhook: {_PVE_ENDPOINT_ID}' in priv_text + + if not has_our_blocks and not has_priv_blocks: + result['cleaned'] = True + return result + + # Backup before modification + _pve_backup_file(_PVE_NOTIFICATIONS_CFG) + if priv_text is not None: + _pve_backup_file(_PVE_PRIV_CFG) + + # Remove our blocks + if has_our_blocks: + cleaned_cfg = _pve_remove_our_blocks(cfg_text, _PVE_OUR_HEADERS) + try: + with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: + f.write(cleaned_cfg) + except PermissionError: + result['error'] = f'Permission denied writing {_PVE_NOTIFICATIONS_CFG}' + return result + except Exception as e: + # Rollback + try: + with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: + f.write(cfg_text) + except Exception: + pass + result['error'] = str(e) + return result + + if has_priv_blocks and priv_text is not None: + cleaned_priv = _pve_remove_our_blocks(priv_text, _PVE_OUR_HEADERS) + try: + with open(_PVE_PRIV_CFG, 'w') as f: + f.write(cleaned_priv) + except Exception: + pass # Best-effort + + result['cleaned'] = True + return result + + except Exception as e: + result['error'] = str(e) + return result + + +@notification_bp.route('/api/notifications/proxmox/cleanup-webhook', methods=['POST']) +def cleanup_proxmox_webhook(): + """HTTP endpoint wrapper for webhook cleanup.""" + return jsonify(cleanup_pve_webhook_core()), 200 + + +@notification_bp.route('/api/notifications/proxmox/read-cfg', methods=['GET']) +def read_pve_notification_cfg(): + """Diagnostic: return raw content of PVE notification config 
files. + + GET /api/notifications/proxmox/read-cfg + Returns both notifications.cfg and priv/notifications.cfg content. + """ + import os + + files = { + 'notifications_cfg': '/etc/pve/notifications.cfg', + 'priv_cfg': '/etc/pve/priv/notifications.cfg', + } + + # Also look for any backups we created + backup_dir = '/etc/pve' + priv_backup_dir = '/etc/pve/priv' + + result = {} + for key, path in files.items(): + try: + with open(path, 'r') as f: + result[key] = { + 'path': path, + 'content': f.read(), + 'size': os.path.getsize(path), + 'error': None, + } + except FileNotFoundError: + result[key] = {'path': path, 'content': None, 'size': 0, 'error': 'file_not_found'} + except PermissionError: + result[key] = {'path': path, 'content': None, 'size': 0, 'error': 'permission_denied'} + except Exception as e: + result[key] = {'path': path, 'content': None, 'size': 0, 'error': str(e)} + + # Find backups + backups = [] + for d in [backup_dir, priv_backup_dir]: + try: + for fname in sorted(os.listdir(d)): + if 'proxmenux_backup' in fname: + fpath = os.path.join(d, fname) + try: + with open(fpath, 'r') as f: + backups.append({ + 'path': fpath, + 'content': f.read(), + 'size': os.path.getsize(fpath), + }) + except Exception: + backups.append({'path': fpath, 'content': None, 'error': 'read_failed'}) + except Exception: + pass + + result['backups'] = backups + return jsonify(result), 200 + + +@notification_bp.route('/api/notifications/proxmox/restore-cfg', methods=['POST']) +def restore_pve_notification_cfg(): + """Restore PVE notification config from our backup. + + POST /api/notifications/proxmox/restore-cfg + Finds the most recent proxmenux_backup and restores it. 
+ """ + import os + import shutil + + files_to_restore = { + '/etc/pve': '/etc/pve/notifications.cfg', + '/etc/pve/priv': '/etc/pve/priv/notifications.cfg', + } + + restored = [] + errors = [] + + for search_dir, target_path in files_to_restore.items(): + try: + candidates = sorted([ + f for f in os.listdir(search_dir) + if 'proxmenux_backup' in f and f.startswith('notifications.cfg') + ], reverse=True) + + if candidates: + backup_path = os.path.join(search_dir, candidates[0]) + shutil.copy2(backup_path, target_path) + restored.append({'target': target_path, 'from_backup': backup_path}) + else: + errors.append({'target': target_path, 'error': 'no_backup_found'}) + except Exception as e: + errors.append({'target': target_path, 'error': str(e)}) + + return jsonify({ + 'restored': restored, + 'errors': errors, + 'success': len(errors) == 0 and len(restored) > 0, + }), 200 + + +@notification_bp.route('/api/notifications/webhook', methods=['POST']) +def proxmox_webhook(): + """Receive native Proxmox VE notification webhooks (hardened). + + Security layers: + Localhost (127.0.0.1 / ::1): rate limiting only. + PVE calls us on localhost and cannot send custom auth headers, + so we trust the loopback interface (only local processes can reach it). + Remote: rate limiting + shared secret + timestamp + replay + IP allowlist. 
+ """ + _reject = lambda code, error, status: (jsonify({'accepted': False, 'error': error}), status) + + client_ip = request.remote_addr or '' + is_localhost = client_ip in ('127.0.0.1', '::1') + + # ── Layer 1: Rate limiting (always) ── + if not _webhook_limiter.allow(): + resp = jsonify({'accepted': False, 'error': 'rate_limited'}) + resp.headers['Retry-After'] = '60' + return resp, 429 + + # ── Layers 2-5: Remote-only checks ── + if not is_localhost: + # Layer 2: Shared secret + try: + configured_secret = notification_manager.get_webhook_secret() + except Exception: + configured_secret = '' + + if configured_secret: + request_secret = request.headers.get('X-Webhook-Secret', '') + if not request_secret: + return _reject(401, 'missing_secret', 401) + if not hmac.compare_digest(configured_secret, request_secret): + return _reject(401, 'invalid_secret', 401) + + # Layer 3: Anti-replay timestamp + ts_header = request.headers.get('X-ProxMenux-Timestamp', '') + if not ts_header: + return _reject(401, 'missing_timestamp', 401) + try: + ts_value = int(ts_header) + except (ValueError, TypeError): + return _reject(401, 'invalid_timestamp', 401) + if abs(time.time() - ts_value) > _TIMESTAMP_MAX_DRIFT: + return _reject(401, 'timestamp_expired', 401) + + # Layer 4: Replay cache + raw_body = request.get_data(as_text=True) or '' + signature = hashlib.sha256(f"{ts_value}:{raw_body}".encode(errors='replace')).hexdigest() + if _replay_cache.check_and_record(signature): + return _reject(409, 'replay_detected', 409) + + # Layer 5: IP allowlist + try: + allowed_ips = notification_manager.get_webhook_allowed_ips() + if allowed_ips and client_ip not in allowed_ips: + return _reject(403, 'forbidden_ip', 403) + except Exception: + pass + + # ── Parse and process payload ── + try: + content_type = request.content_type or '' + raw_data = request.get_data(as_text=True) or '' + + # Try JSON first + payload = request.get_json(silent=True) or {} + + # If not JSON, try form data + if not 
payload: + payload = dict(request.form) + + # If still empty, try parsing raw data as JSON (PVE may not set Content-Type) + if not payload and raw_data: + import json + try: + payload = json.loads(raw_data) + except (json.JSONDecodeError, ValueError): + # PVE's {{ message }} may contain unescaped newlines/quotes + # that break JSON. Try to repair common issues. + try: + repaired = raw_data.replace('\n', '\\n').replace('\r', '\\r') + payload = json.loads(repaired) + except (json.JSONDecodeError, ValueError): + # Try to extract fields with regex from broken JSON + import re + title_m = re.search(r'"title"\s*:\s*"([^"]*)"', raw_data) + sev_m = re.search(r'"severity"\s*:\s*"([^"]*)"', raw_data) + if title_m: + payload = { + 'title': title_m.group(1), + 'body': raw_data[:1000], + 'severity': sev_m.group(1) if sev_m else 'info', + 'source': 'proxmox_hook', + } + + # If still empty, try to salvage data from raw body + if not payload: + if raw_data: + # Last resort: treat raw text as the message body + payload = { + 'title': 'PVE Notification', + 'body': raw_data[:1000], + 'severity': 'info', + 'source': 'proxmox_hook', + } + else: + return _reject(400, 'empty_payload', 400) + + result = notification_manager.process_webhook(payload) + # Always return 200 to PVE -- a non-200 makes PVE report the webhook as broken. + # The 'accepted' field in the JSON body indicates actual processing status. 
+ return jsonify(result), 200 + except Exception as e: + # Still return 200 to avoid PVE flagging the webhook as broken + return jsonify({'accepted': False, 'error': 'internal_error', 'detail': str(e)}), 200 diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 003d15f4..dd38090e 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -23,6 +23,7 @@ import time import threading import urllib.parse import hardware_monitor +import health_persistence import xml.etree.ElementTree as ET from datetime import datetime, timedelta from functools import wraps @@ -46,6 +47,8 @@ from flask_health_routes import health_bp # noqa: E402 from flask_auth_routes import auth_bp # noqa: E402 from flask_proxmenux_routes import proxmenux_bp # noqa: E402 from flask_security_routes import security_bp # noqa: E402 +from flask_notification_routes import notification_bp # noqa: E402 +from notification_manager import notification_manager # noqa: E402 from jwt_middleware import require_auth # noqa: E402 import auth_manager # noqa: E402 @@ -120,6 +123,7 @@ app.register_blueprint(auth_bp) app.register_blueprint(health_bp) app.register_blueprint(proxmenux_bp) app.register_blueprint(security_bp) +app.register_blueprint(notification_bp) # Initialize terminal / WebSocket routes init_terminal_routes(app) @@ -1156,19 +1160,66 @@ def get_storage_info(): 'ssd_life_left': smart_data.get('ssd_life_left') # Added } - storage_data['disk_count'] += 1 - health = smart_data.get('health', 'unknown').lower() - if health == 'healthy': - storage_data['healthy_disks'] += 1 - elif health == 'warning': - storage_data['warning_disks'] += 1 - elif health in ['critical', 'failed']: - storage_data['critical_disks'] += 1 - except Exception as e: - # print(f"Error getting disk list: {e}") pass + # Enrich physical disks with active I/O errors from health_persistence. 
+ # This is the single source of truth -- health_monitor detects ATA/SCSI/IO + # errors via dmesg, records them in health_persistence, and we read them here. + try: + active_disk_errors = health_persistence.get_active_errors(category='disks') + for err in active_disk_errors: + details = err.get('details', {}) + if isinstance(details, str): + try: + details = json.loads(details) + except (json.JSONDecodeError, TypeError): + details = {} + + err_device = details.get('disk', '') + error_count = details.get('error_count', 0) + sample = details.get('sample', '') + severity = err.get('severity', 'WARNING') + + # Match error to physical disk. + # err_device can be 'sda', 'nvme0n1', or 'ata8' (if resolution failed) + matched_disk = None + if err_device in physical_disks: + matched_disk = err_device + else: + # Try partial match: 'sda' matches disk 'sda' + for dk in physical_disks: + if dk == err_device or err_device.startswith(dk): + matched_disk = dk + break + + if matched_disk: + physical_disks[matched_disk]['io_errors'] = { + 'count': error_count, + 'severity': severity, + 'sample': sample, + 'reason': err.get('reason', ''), + } + # Override health status if I/O errors are more severe + current_health = physical_disks[matched_disk].get('health', 'unknown').lower() + if severity == 'CRITICAL' and current_health != 'critical': + physical_disks[matched_disk]['health'] = 'critical' + elif severity == 'WARNING' and current_health in ('healthy', 'unknown'): + physical_disks[matched_disk]['health'] = 'warning' + except Exception: + pass + + # Count disk health states AFTER I/O error enrichment + for disk_name, disk_info in physical_disks.items(): + storage_data['disk_count'] += 1 + health = disk_info.get('health', 'unknown').lower() + if health == 'healthy': + storage_data['healthy_disks'] += 1 + elif health == 'warning': + storage_data['warning_disks'] += 1 + elif health in ['critical', 'failed']: + storage_data['critical_disks'] += 1 + storage_data['total'] = 
round(total_disk_size_bytes / (1024**4), 1) # Get disk usage for mounted partitions @@ -7094,6 +7145,16 @@ if __name__ == '__main__': except Exception as e: print(f"[ProxMenux] Vital signs sampler failed to start: {e}") + # ── Notification Service ── + try: + notification_manager.start() + if notification_manager._enabled: + print(f"[ProxMenux] Notification service started (channels: {list(notification_manager._channels.keys())})") + else: + print("[ProxMenux] Notification service loaded (disabled - configure in Settings)") + except Exception as e: + print(f"[ProxMenux] Notification service failed to start: {e}") + # Check for SSL configuration ssl_ctx = None try: diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index ef381192..30eb884b 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -324,6 +324,13 @@ class HealthMonitor: Returns JSON structure with ALL 10 categories always present. Now includes persistent error tracking. """ + # Run cleanup on every status check so stale errors are auto-resolved + # using the user-configured Suppression Duration (single source of truth). + try: + health_persistence.cleanup_old_errors() + except Exception: + pass + active_errors = health_persistence.get_active_errors() # No need to create persistent_issues dict here, it's implicitly handled by the checks @@ -821,8 +828,20 @@ class HealthMonitor: issues = [] storage_details = {} - # Check disk usage and mount status first for critical mounts - critical_mounts = ['/'] + # Check disk usage and mount status for important mounts. + # We detect actual mountpoints dynamically rather than hard-coding. 
+ critical_mounts = set() + critical_mounts.add('/') + try: + for part in psutil.disk_partitions(all=False): + mp = part.mountpoint + # Include standard system mounts and PVE storage + if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \ + mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'): + critical_mounts.add(mp) + except Exception: + pass + critical_mounts = sorted(critical_mounts) for mount_point in critical_mounts: try: @@ -857,9 +876,32 @@ class HealthMonitor: # Check filesystem usage only if not already flagged as critical if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK': fs_status = self._check_filesystem(mount_point) + error_key = f'disk_space_{mount_point}' if fs_status['status'] != 'OK': issues.append(f"{mount_point}: {fs_status['reason']}") storage_details[mount_point] = fs_status + # Record persistent error for notifications + usage = psutil.disk_usage(mount_point) + avail_gb = usage.free / (1024**3) + if avail_gb >= 1: + avail_str = f"{avail_gb:.1f} GiB" + else: + avail_str = f"{usage.free / (1024**2):.0f} MiB" + health_persistence.record_error( + error_key=error_key, + category='disk', + severity=fs_status['status'], + reason=f'{mount_point}: {fs_status["reason"]}', + details={ + 'mount': mount_point, + 'used': str(round(usage.percent, 1)), + 'available': avail_str, + 'dismissable': False, + } + ) + else: + # Space recovered -- clear any previous alert + health_persistence.clear_error(error_key) except Exception: pass # Silently skip if mountpoint check fails @@ -1052,16 +1094,67 @@ class HealthMonitor: return storages + def _resolve_ata_to_disk(self, ata_port: str) -> str: + """Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda'). + + Uses /sys/class/ata_port/ symlinks and /sys/block/ to find the mapping. + Falls back to parsing dmesg for 'ata8: SATA link up' -> 'sd 7:0:0:0: [sda]'. 
+ """ + if not ata_port or not ata_port.startswith('ata'): + return ata_port + + port_num = ata_port.replace('ata', '') + + # Method 1: Walk /sys/class/ata_port/ -> host -> target -> block + try: + ata_path = f'/sys/class/ata_port/{ata_port}' + if os.path.exists(ata_path): + device_path = os.path.realpath(ata_path) + # Walk up to find the SCSI host, then find block devices + # Path: /sys/devices/.../ataX/hostY/targetY:0:0/Y:0:0:0/block/sdZ + for root, dirs, files in os.walk(os.path.dirname(device_path)): + if 'block' in dirs: + block_path = os.path.join(root, 'block') + devs = os.listdir(block_path) + if devs: + return devs[0] # e.g. 'sda' + except (OSError, IOError): + pass + + # Method 2: Parse dmesg for ATA link messages + try: + result = subprocess.run( + ['dmesg', '--notime'], + capture_output=True, text=True, timeout=2 + ) + if result.returncode == 0: + # Look for "ata8: SATA link up" followed by "sd X:0:0:0: [sda]" + lines = result.stdout.split('\n') + host_num = None + for line in lines: + m = re.search(rf'{ata_port}:\s+SATA link', line) + if m: + # ata port number maps to host(N-1) typically + host_num = int(port_num) - 1 + if host_num is not None: + m2 = re.search(rf'sd\s+{host_num}:\d+:\d+:\d+:\s+\[(\w+)\]', line) + if m2: + return m2.group(1) + except (OSError, subprocess.TimeoutExpired): + pass + + return ata_port # Return original if resolution fails + def _check_disks_optimized(self) -> Dict[str, Any]: """ - Optimized disk check - always returns status. - Checks dmesg for I/O errors and SMART status. - NOTE: This function is now largely covered by _check_storage_optimized, - but kept for potential specific disk-level reporting if needed. - Currently, its primary function is to detect recent I/O errors. + Disk I/O error check -- the SINGLE source of truth for disk errors. + + Reads dmesg for I/O/ATA/SCSI errors, counts per device, records in + health_persistence, and returns status for the health dashboard. 
+ Resolves ATA controller names (ata8) to physical disks (sda). """ current_time = time.time() - disk_issues = {} + disk_results = {} # Single dict for both WARNING and CRITICAL try: # Check dmesg for I/O errors in the last 5 minutes @@ -1072,17 +1165,52 @@ class HealthMonitor: timeout=2 ) + # Collect a sample line per device for richer error messages + disk_samples = {} + if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() - if any(keyword in line_lower for keyword in ['i/o error', 'ata error', 'scsi error', 'medium error']): - # Try to extract disk name - disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line) - if disk_match: - disk_name = disk_match.group(1) + # Detect various disk error formats + is_disk_error = any(kw in line_lower for kw in [ + 'i/o error', 'scsi error', 'medium error', + 'failed command:', 'exception emask', + ]) + ata_match = re.search(r'(ata\d+)[\.\d]*:.*(?:error|failed|exception)', line_lower) + if ata_match: + is_disk_error = True + + if is_disk_error: + # Extract device from multiple formats + raw_device = None + for dev_re in [ + r'dev\s+(sd[a-z]+)', # dev sdb + r'\[(sd[a-z]+)\]', # [sda] + r'/dev/(sd[a-z]+)', # /dev/sda + r'(nvme\d+n\d+)', # nvme0n1 + r'device\s+(sd[a-z]+\d*)', # device sda1 + r'(ata\d+)', # ata8 (ATA controller) + ]: + dm = re.search(dev_re, line) + if dm: + raw_device = dm.group(1) + break + + if raw_device: + # Resolve ATA port to physical disk name + if raw_device.startswith('ata'): + resolved = self._resolve_ata_to_disk(raw_device) + disk_name = resolved + else: + disk_name = raw_device.rstrip('0123456789') if raw_device.startswith('sd') else raw_device + self.io_error_history[disk_name].append(current_time) + if disk_name not in disk_samples: + # Clean the sample: strip dmesg timestamp prefix + clean = re.sub(r'^\[.*?\]\s*', '', line.strip()) + disk_samples[disk_name] = clean[:200] - # Clean old history (keep errors from the last 5 minutes) + # Clean old history and 
evaluate per-disk status for disk in list(self.io_error_history.keys()): self.io_error_history[disk] = [ t for t in self.io_error_history[disk] @@ -1090,57 +1218,67 @@ class HealthMonitor: ] error_count = len(self.io_error_history[disk]) + error_key = f'disk_{disk}' + sample = disk_samples.get(disk, '') + display = f'/dev/{disk}' if not disk.startswith('/') else disk - # Report based on recent error count if error_count >= 3: - error_key = f'disk_{disk}' severity = 'CRITICAL' - reason = f'{error_count} I/O errors in 5 minutes' + reason = f'{display}: {error_count} I/O errors in 5 min' + if sample: + reason += f'\n{sample}' health_persistence.record_error( error_key=error_key, category='disks', severity=severity, reason=reason, - details={'disk': disk, 'error_count': error_count, 'dismissable': False} + details={'disk': disk, 'device': display, + 'error_count': error_count, + 'sample': sample, 'dismissable': False} ) - - disk_details[disk] = { + disk_results[display] = { 'status': severity, 'reason': reason, - 'dismissable': False + 'device': disk, + 'error_count': error_count, + 'dismissable': False, } elif error_count >= 1: - error_key = f'disk_{disk}' severity = 'WARNING' - reason = f'{error_count} I/O error(s) in 5 minutes' + reason = f'{display}: {error_count} I/O error(s) in 5 min' + if sample: + reason += f'\n{sample}' - health_persistence.record_error( + rec_result = health_persistence.record_error( error_key=error_key, category='disks', severity=severity, reason=reason, - details={'disk': disk, 'error_count': error_count, 'dismissable': True} + details={'disk': disk, 'device': display, + 'error_count': error_count, + 'sample': sample, 'dismissable': True} ) - - disk_issues[f'/dev/{disk}'] = { - 'status': severity, - 'reason': reason, - 'dismissable': True - } + if not rec_result or rec_result.get('type') != 'skipped_acknowledged': + disk_results[display] = { + 'status': severity, + 'reason': reason, + 'device': disk, + 'error_count': error_count, + 
'dismissable': True, + } else: - error_key = f'disk_{disk}' health_persistence.resolve_error(error_key, 'Disk errors cleared') - if not disk_issues: + if not disk_results: return {'status': 'OK'} - has_critical = any(d.get('status') == 'CRITICAL' for d in disk_issues.values()) + has_critical = any(d.get('status') == 'CRITICAL' for d in disk_results.values()) return { 'status': 'CRITICAL' if has_critical else 'WARNING', - 'reason': f"{len(disk_issues)} disk(s) with recent errors", - 'details': disk_issues + 'reason': f"{len(disk_results)} disk(s) with recent errors", + 'details': disk_results } except Exception as e: @@ -1351,12 +1489,51 @@ class HealthMonitor: except Exception: return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} + def _is_vzdump_active(self) -> bool: + """Check if a vzdump (backup) job is currently running.""" + try: + with open('/var/log/pve/tasks/active', 'r') as f: + for line in f: + if ':vzdump:' in line: + return True + except (OSError, IOError): + pass + return False + + def _resolve_vm_name(self, vmid: str) -> str: + """Resolve VMID to guest name from PVE config files.""" + if not vmid: + return '' + for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']: + conf = os.path.join(base, f'{vmid}.conf') + try: + with open(conf) as f: + for line in f: + if line.startswith('hostname:') or line.startswith('name:'): + return line.split(':', 1)[1].strip() + except (OSError, IOError): + continue + return '' + def _check_vms_cts_optimized(self) -> Dict[str, Any]: """ Optimized VM/CT check - detects qmp failures and startup errors from logs. Improved detection of container and VM errors from journalctl. """ try: + # First: auto-resolve any persisted VM/CT errors where the guest + # is now running. This clears stale "Failed to start" / QMP + # errors that are no longer relevant. 
+ try: + active_vm_errors = health_persistence.get_active_errors('vms') + for err in active_vm_errors: + details = err.get('details') or {} + vmid = details.get('id', '') + if vmid: + health_persistence.check_vm_running(vmid) + except Exception: + pass + issues = [] vm_details = {} @@ -1367,20 +1544,28 @@ class HealthMonitor: timeout=3 ) + # Check if vzdump is running -- QMP timeouts during backup are normal + _vzdump_running = self._is_vzdump_active() + if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) if vm_qmp_match: + if _vzdump_running: + continue # Normal during backup vmid = vm_qmp_match.group(1) + vm_name = self._resolve_vm_name(vmid) + display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}" key = f'vm_{vmid}' if key not in vm_details: - issues.append(f'VM {vmid}: Communication issue') + issues.append(f'{display}: QMP communication issue') vm_details[key] = { 'status': 'WARNING', - 'reason': 'QMP command timeout', + 'reason': f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}', 'id': vmid, + 'vmname': vm_name, 'type': 'VM' } continue @@ -1401,11 +1586,15 @@ class HealthMonitor: else: reason = 'Container error' - issues.append(f'CT {ctid}: {reason}') + ct_name = self._resolve_vm_name(ctid) + display = f"CT {ctid} ({ct_name})" if ct_name else f"CT {ctid}" + full_reason = f'{display}: {reason}\n{line.strip()[:200]}' + issues.append(f'{display}: {reason}') vm_details[key] = { 'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL', - 'reason': reason, + 'reason': full_reason, 'id': ctid, + 'vmname': ct_name, 'type': 'CT' } continue @@ -1440,11 +1629,15 @@ class HealthMonitor: vmid = id_match.group(1) key = f'vmct_{vmid}' if key not in vm_details: - issues.append(f'VM/CT {vmid}: Failed to start') + vm_name = self._resolve_vm_name(vmid) + display = f"VM/CT {vmid} ({vm_name})" if vm_name else 
f"VM/CT {vmid}" + full_reason = f'{display}: Failed to start\n{line.strip()[:200]}' + issues.append(f'{display}: Failed to start') vm_details[key] = { 'status': 'CRITICAL', - 'reason': 'Failed to start', + 'reason': full_reason, 'id': vmid, + 'vmname': vm_name, 'type': 'VM/CT' } @@ -1504,31 +1697,38 @@ class HealthMonitor: timeout=3 ) + _vzdump_running = self._is_vzdump_active() + if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() - # VM QMP errors + # VM QMP errors (skip during active backup -- normal behavior) vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) if vm_qmp_match: + if _vzdump_running: + continue # Normal during backup vmid = vm_qmp_match.group(1) + vm_name = self._resolve_vm_name(vmid) + display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}" error_key = f'vm_{vmid}' if error_key not in vm_details: - # Record persistent error - health_persistence.record_error( + rec_result = health_persistence.record_error( error_key=error_key, category='vms', severity='WARNING', - reason='QMP command timeout', - details={'id': vmid, 'type': 'VM'} + reason=f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}', + details={'id': vmid, 'vmname': vm_name, 'type': 'VM'} ) - issues.append(f'VM {vmid}: Communication issue') - vm_details[error_key] = { - 'status': 'WARNING', - 'reason': 'QMP command timeout', - 'id': vmid, - 'type': 'VM' - } + if not rec_result or rec_result.get('type') != 'skipped_acknowledged': + issues.append(f'{display}: QMP communication issue') + vm_details[error_key] = { + 'status': 'WARNING', + 'reason': f'{display}: QMP command failed or timed out', + 'id': vmid, + 'vmname': vm_name, + 'type': 'VM' + } continue # Container errors (including startup issues via vzstart) @@ -1548,20 +1748,21 @@ class HealthMonitor: reason = 'Startup error' # Record persistent error - health_persistence.record_error( + rec_result = 
health_persistence.record_error( error_key=error_key, category='vms', severity='WARNING', reason=reason, details={'id': ctid, 'type': 'CT'} ) - issues.append(f'CT {ctid}: {reason}') - vm_details[error_key] = { - 'status': 'WARNING', - 'reason': reason, - 'id': ctid, - 'type': 'CT' - } + if not rec_result or rec_result.get('type') != 'skipped_acknowledged': + issues.append(f'CT {ctid}: {reason}') + vm_details[error_key] = { + 'status': 'WARNING', + 'reason': reason, + 'id': ctid, + 'type': 'CT' + } # Generic failed to start for VMs and CTs if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']): @@ -1586,22 +1787,28 @@ class HealthMonitor: vm_type = 'VM/CT' if error_key not in vm_details: - reason = 'Failed to start' + vm_name = self._resolve_vm_name(vmid_ctid) + display = f"{vm_type} {vmid_ctid}" + if vm_name: + display = f"{vm_type} {vmid_ctid} ({vm_name})" + reason = f'{display}: Failed to start\n{line.strip()[:200]}' # Record persistent error - health_persistence.record_error( + rec_result = health_persistence.record_error( error_key=error_key, category='vms', severity='CRITICAL', reason=reason, - details={'id': vmid_ctid, 'type': vm_type} + details={'id': vmid_ctid, 'vmname': vm_name, 'type': vm_type} ) - issues.append(f'{vm_type} {vmid_ctid}: {reason}') - vm_details[error_key] = { - 'status': 'CRITICAL', - 'reason': reason, - 'id': vmid_ctid, - 'type': vm_type - } + if not rec_result or rec_result.get('type') != 'skipped_acknowledged': + issues.append(f'{display}: Failed to start') + vm_details[error_key] = { + 'status': 'CRITICAL', + 'reason': reason, + 'id': vmid_ctid, + 'vmname': vm_name, + 'type': vm_type + } # Build checks dict from vm_details checks = {} @@ -1692,16 +1899,23 @@ class HealthMonitor: if failed_services: reason = f'Services inactive: {", ".join(failed_services)}' - # Record each failed service in persistence + # Record each failed service in persistence, respecting dismiss + 
active_failed = [] for svc in failed_services: error_key = f'pve_service_{svc}' - health_persistence.record_error( + rec_result = health_persistence.record_error( error_key=error_key, category='pve_services', severity='CRITICAL', reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}', details={'service': svc, 'state': service_details.get(svc, 'inactive')} ) + if rec_result and rec_result.get('type') == 'skipped_acknowledged': + # Mark as dismissed in checks for frontend + if svc in checks: + checks[svc]['dismissed'] = True + else: + active_failed.append(svc) # Auto-clear services that recovered for svc in services_to_check: @@ -1710,10 +1924,21 @@ class HealthMonitor: if health_persistence.is_error_active(error_key): health_persistence.clear_error(error_key) + # If all failed services are dismissed, return OK + if not active_failed: + return { + 'status': 'OK', + 'reason': None, + 'failed': [], + 'is_cluster': is_cluster, + 'services_checked': len(services_to_check), + 'checks': checks + } + return { 'status': 'CRITICAL', - 'reason': reason, - 'failed': failed_services, + 'reason': f'Services inactive: {", ".join(active_failed)}', + 'failed': active_failed, 'is_cluster': is_cluster, 'services_checked': len(services_to_check), 'checks': checks @@ -1871,7 +2096,8 @@ class HealthMonitor: self.persistent_log_patterns[pattern] = { 'count': 1, 'first_seen': current_time, - 'last_seen': current_time + 'last_seen': current_time, + 'sample': line.strip()[:200], # Original line for display } for line in previous_lines: @@ -1903,6 +2129,18 @@ class HealthMonitor: if recent_count >= 5 and recent_count >= prev_count * 4: spike_errors[pattern] = recent_count + # Helper: get human-readable samples from normalized patterns + def _get_samples(error_dict, max_items=3): + """Return list of readable sample lines for error patterns.""" + samples = [] + for pattern in list(error_dict.keys())[:max_items]: + pdata = self.persistent_log_patterns.get(pattern, {}) + sample = 
pdata.get('sample', pattern) + # Trim timestamp prefix if present (e.g. "Feb 27 16:03:35 host ") + clean = re.sub(r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample) + samples.append(clean[:120]) + return samples + persistent_errors = {} for pattern, data in self.persistent_log_patterns.items(): time_span = current_time - data['first_seen'] @@ -1913,12 +2151,16 @@ class HealthMonitor: pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8] error_key = f'log_persistent_{pattern_hash}' if not health_persistence.is_error_active(error_key, category='logs'): + # Use the original sample line for the notification, + # not the normalized pattern (which has IDs replaced). + sample = data.get('sample', pattern) health_persistence.record_error( error_key=error_key, category='logs', severity='WARNING', - reason=f'Persistent error pattern detected: {pattern[:80]}', - details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']} + reason=f'Recurring error ({data["count"]}x): {sample[:150]}', + details={'pattern': pattern, 'sample': sample, + 'dismissable': True, 'occurrences': data['count']} ) patterns_to_remove = [ @@ -1940,26 +2182,33 @@ class HealthMonitor: reason = f'Critical error detected: {representative_error[:100]}' elif cascade_count > 0: status = 'WARNING' - reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥15 times in 3min' + samples = _get_samples(cascading_errors, 3) + reason = f'Error cascade ({cascade_count} patterns repeating):\n' + '\n'.join(f' - {s}' for s in samples) elif spike_count > 0: status = 'WARNING' - reason = f'Error spike detected: {spike_count} pattern(s) increased 4x' + samples = _get_samples(spike_errors, 3) + reason = f'Error spike ({spike_count} patterns with 4x increase):\n' + '\n'.join(f' - {s}' for s in samples) elif persistent_count > 0: status = 'WARNING' - reason = f'Persistent errors: {persistent_count} pattern(s) recurring over 15+ minutes' + samples = _get_samples(persistent_errors, 3) + 
reason = f'Persistent errors ({persistent_count} patterns over 15+ min):\n' + '\n'.join(f' - {s}' for s in samples) else: # No significant issues found status = 'OK' reason = None # Record/clear persistent errors for each log sub-check so Dismiss works + cascade_samples = _get_samples(cascading_errors, 2) if cascade_count else [] + spike_samples = _get_samples(spike_errors, 2) if spike_count else [] + persist_samples = _get_samples(persistent_errors, 2) if persistent_count else [] + log_sub_checks = { 'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING', - 'reason': f'{cascade_count} pattern(s) repeating >=15 times'}, + 'reason': f'{cascade_count} pattern(s) repeating >=15 times:\n' + '\n'.join(f' - {s}' for s in cascade_samples) if cascade_count else ''}, 'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING', - 'reason': f'{spike_count} pattern(s) with 4x increase'}, + 'reason': f'{spike_count} pattern(s) with 4x increase:\n' + '\n'.join(f' - {s}' for s in spike_samples) if spike_count else ''}, 'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING', - 'reason': f'{persistent_count} recurring pattern(s) over 15+ min'}, + 'reason': f'{persistent_count} recurring pattern(s) over 15+ min:\n' + '\n'.join(f' - {s}' for s in persist_samples) if persistent_count else ''}, 'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL', 'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False}, } @@ -2335,20 +2584,7 @@ class HealthMonitor: msg = f'{total_banned} IP(s) currently banned by Fail2Ban (jails: {jails_str})' result['status'] = 'WARNING' result['detail'] = msg - - # Record in persistence (dismissable) - health_persistence.record_error( - error_key='fail2ban', - category='security', - severity='WARNING', - reason=msg, - details={ - 'banned_count': total_banned, - 'jails': jails_with_bans, - 'banned_ips': all_banned_ips[:5], - 'dismissable': True - } - ) + # 
Persistence handled by _check_security caller via security_fail2ban key else: result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)' # Auto-resolve if previously banned IPs are now gone @@ -2456,14 +2692,60 @@ class HealthMonitor: except Exception: pass - # Determine overall security status - if issues: - # Check if any sub-check is CRITICAL - has_critical = any(c.get('status') == 'CRITICAL' for c in checks.values()) + # Persist errors and respect dismiss for each sub-check + dismissed_keys = set() + security_sub_checks = { + 'security_login_attempts': checks.get('login_attempts', {}), + 'security_certificates': checks.get('certificates', {}), + 'security_uptime': checks.get('uptime', {}), + 'security_fail2ban': checks.get('fail2ban', {}), + } + + for err_key, check_info in security_sub_checks.items(): + check_status = check_info.get('status', 'OK') + if check_status not in ('OK', 'INFO'): + is_dismissable = check_info.get('dismissable', True) + rec_result = health_persistence.record_error( + error_key=err_key, + category='security', + severity=check_status, + reason=check_info.get('detail', ''), + details={'dismissable': is_dismissable} + ) + if rec_result and rec_result.get('type') == 'skipped_acknowledged': + dismissed_keys.add(err_key) + elif health_persistence.is_error_active(err_key): + health_persistence.clear_error(err_key) + + # Rebuild issues excluding dismissed sub-checks + key_to_check = { + 'security_login_attempts': 'login_attempts', + 'security_certificates': 'certificates', + 'security_uptime': 'uptime', + 'security_fail2ban': 'fail2ban', + } + active_issues = [] + for err_key, check_name in key_to_check.items(): + if err_key in dismissed_keys: + # Mark as dismissed in checks for the frontend + if check_name in checks: + checks[check_name]['dismissed'] = True + continue + check_info = checks.get(check_name, {}) + if check_info.get('status', 'OK') not in ('OK', 'INFO'): + active_issues.append(check_info.get('detail', '')) + + # 
Determine overall security status from non-dismissed issues only + if active_issues: + has_critical = any( + c.get('status') == 'CRITICAL' + for k, c in checks.items() + if f'security_{k}' not in dismissed_keys + ) overall_status = 'CRITICAL' if has_critical else 'WARNING' return { 'status': overall_status, - 'reason': '; '.join(issues[:2]), + 'reason': '; '.join(active_issues[:2]), 'checks': checks } diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 377f71da..fede9b53 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -25,12 +25,8 @@ from pathlib import Path class HealthPersistence: """Manages persistent health error tracking""" - # Error retention periods (seconds) - VM_ERROR_RETENTION = 48 * 3600 # 48 hours - LOG_ERROR_RETENTION = 24 * 3600 # 24 hours - DISK_ERROR_RETENTION = 48 * 3600 # 48 hours - - # Default suppression: 24 hours (user can change per-category in settings) + # Default suppression duration when no user setting exists for a category. + # Users override per-category via the Suppression Duration settings UI. 
DEFAULT_SUPPRESSION_HOURS = 24 # Mapping from error categories to settings keys @@ -114,6 +110,31 @@ class HealthPersistence: ) ''') + # Notification history table (records all sent notifications) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS notification_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + event_type TEXT NOT NULL, + channel TEXT NOT NULL, + title TEXT, + message TEXT, + severity TEXT, + sent_at TEXT NOT NULL, + success INTEGER DEFAULT 1, + error_message TEXT, + source TEXT DEFAULT 'server' + ) + ''') + + # Notification cooldown persistence (survives restarts) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS notification_last_sent ( + fingerprint TEXT PRIMARY KEY, + last_sent_ts INTEGER NOT NULL, + count INTEGER DEFAULT 1 + ) + ''') + # Migration: add suppression_hours column to errors if not present cursor.execute("PRAGMA table_info(errors)") columns = [col[1] for col in cursor.fetchall()] @@ -125,6 +146,9 @@ class HealthPersistence: cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_resolved ON errors(resolved_at)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_sent_at ON notification_history(sent_at)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)') conn.commit() conn.close() @@ -468,32 +492,58 @@ class HealthPersistence: cursor = conn.cursor() now = datetime.now() + now_iso = now.isoformat() # Delete resolved errors older than 7 days cutoff_resolved = (now - timedelta(days=7)).isoformat() cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,)) - # Auto-resolve VM/CT errors older than 48h - cutoff_vm = (now - timedelta(seconds=self.VM_ERROR_RETENTION)).isoformat() - cursor.execute(''' - UPDATE errors - SET 
resolved_at = ? - WHERE category = 'vms' - AND resolved_at IS NULL - AND first_seen < ? - AND acknowledged = 0 - ''', (now.isoformat(), cutoff_vm)) + # ── Auto-resolve stale errors using Suppression Duration settings ── + # Read per-category suppression hours from user_settings. + # If the user hasn't configured a value, use DEFAULT_SUPPRESSION_HOURS. + # This is the SINGLE source of truth for auto-resolution timing. + user_settings = {} + try: + cursor.execute( + 'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?', + ('suppress_%',) + ) + for row in cursor.fetchall(): + user_settings[row[0]] = row[1] + except Exception: + pass - # Auto-resolve log errors older than 24h - cutoff_logs = (now - timedelta(seconds=self.LOG_ERROR_RETENTION)).isoformat() + for category, setting_key in self.CATEGORY_SETTING_MAP.items(): + stored = user_settings.get(setting_key) + try: + hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS + except (ValueError, TypeError): + hours = self.DEFAULT_SUPPRESSION_HOURS + + # -1 means permanently suppressed -- skip auto-resolve + if hours < 0: + continue + + cutoff = (now - timedelta(hours=hours)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE category = ? + AND resolved_at IS NULL + AND last_seen < ? + AND acknowledged = 0 + ''', (now_iso, category, cutoff)) + + # Catch-all: auto-resolve any error from an unmapped category + # whose last_seen exceeds DEFAULT_SUPPRESSION_HOURS. + fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat() cursor.execute(''' - UPDATE errors + UPDATE errors SET resolved_at = ? - WHERE category = 'logs' - AND resolved_at IS NULL - AND first_seen < ? + WHERE resolved_at IS NULL AND acknowledged = 0 - ''', (now.isoformat(), cutoff_logs)) + AND last_seen < ? 
+ ''', (now_iso, fallback_cutoff)) # Delete old events (>30 days) cutoff_events = (now - timedelta(days=30)).isoformat() diff --git a/AppImage/scripts/notification_channels.py b/AppImage/scripts/notification_channels.py new file mode 100644 index 00000000..9cb6255f --- /dev/null +++ b/AppImage/scripts/notification_channels.py @@ -0,0 +1,579 @@ +""" +ProxMenux Notification Channels +Provides transport adapters for Telegram, Gotify, and Discord. + +Each channel implements send() and test() with: +- Retry with exponential backoff (3 attempts) +- Request timeout of 10s +- Rate limiting (max 30 msg/min per channel) + +Author: MacRimi +""" + +import json +import time +import urllib.request +import urllib.error +import urllib.parse +from abc import ABC, abstractmethod +from collections import deque +from typing import Tuple, Optional, Dict, Any + + +# ─── Rate Limiter ──────────────────────────────────────────────── + +class RateLimiter: + """Token-bucket rate limiter: max N messages per window.""" + + def __init__(self, max_calls: int = 30, window_seconds: int = 60): + self.max_calls = max_calls + self.window = window_seconds + self._timestamps: deque = deque() + + def allow(self) -> bool: + now = time.monotonic() + while self._timestamps and now - self._timestamps[0] > self.window: + self._timestamps.popleft() + if len(self._timestamps) >= self.max_calls: + return False + self._timestamps.append(now) + return True + + def wait_time(self) -> float: + if not self._timestamps: + return 0.0 + return max(0.0, self.window - (time.monotonic() - self._timestamps[0])) + + +# ─── Base Channel ──────────────────────────────────────────────── + +class NotificationChannel(ABC): + """Abstract base for all notification channels.""" + + MAX_RETRIES = 3 + RETRY_DELAYS = [2, 4, 8] # exponential backoff seconds + REQUEST_TIMEOUT = 10 + + def __init__(self): + self._rate_limiter = RateLimiter(max_calls=30, window_seconds=60) + + @abstractmethod + def send(self, title: str, message: str, 
severity: str = 'INFO', + data: Optional[Dict] = None) -> Dict[str, Any]: + """Send a notification. Returns {success, error, channel}.""" + pass + + @abstractmethod + def test(self) -> Tuple[bool, str]: + """Send a test message. Returns (success, error_message).""" + pass + + @abstractmethod + def validate_config(self) -> Tuple[bool, str]: + """Check if config is valid without sending. Returns (valid, error).""" + pass + + def _http_request(self, url: str, data: bytes, headers: Dict[str, str], + method: str = 'POST') -> Tuple[int, str]: + """Execute HTTP request with timeout. Returns (status_code, body).""" + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as resp: + body = resp.read().decode('utf-8', errors='replace') + return resp.status, body + except urllib.error.HTTPError as e: + body = e.read().decode('utf-8', errors='replace') if e.fp else str(e) + return e.code, body + except urllib.error.URLError as e: + return 0, str(e.reason) + except Exception as e: + return 0, str(e) + + def _send_with_retry(self, send_fn) -> Dict[str, Any]: + """Wrap a send function with rate limiting and retry logic.""" + if not self._rate_limiter.allow(): + wait = self._rate_limiter.wait_time() + return { + 'success': False, + 'error': f'Rate limited. 
Retry in {wait:.0f}s', + 'rate_limited': True + } + + last_error = '' + for attempt in range(self.MAX_RETRIES): + try: + status, body = send_fn() + if 200 <= status < 300: + return {'success': True, 'error': None} + last_error = f'HTTP {status}: {body[:200]}' + except Exception as e: + last_error = str(e) + + if attempt < self.MAX_RETRIES - 1: + time.sleep(self.RETRY_DELAYS[attempt]) + + return {'success': False, 'error': last_error} + + +# ─── Telegram ──────────────────────────────────────────────────── + +class TelegramChannel(NotificationChannel): + """Telegram Bot API channel using HTML parse mode.""" + + API_BASE = 'https://api.telegram.org/bot{token}/sendMessage' + MAX_LENGTH = 4096 + + SEVERITY_ICONS = { + 'CRITICAL': '\U0001F534', # red circle + 'WARNING': '\U0001F7E1', # yellow circle + 'INFO': '\U0001F535', # blue circle + 'OK': '\U0001F7E2', # green circle + 'UNKNOWN': '\u26AA', # white circle + } + + def __init__(self, bot_token: str, chat_id: str): + super().__init__() + token = bot_token.strip() + # Strip 'bot' prefix if user included it (API_BASE already adds it) + if token.lower().startswith('bot') and ':' in token[3:]: + token = token[3:] + self.bot_token = token + self.chat_id = chat_id.strip() + + def validate_config(self) -> Tuple[bool, str]: + if not self.bot_token: + return False, 'Bot token is required' + if not self.chat_id: + return False, 'Chat ID is required' + if ':' not in self.bot_token: + return False, 'Invalid bot token format (expected BOT_ID:TOKEN)' + return True, '' + + def send(self, title: str, message: str, severity: str = 'INFO', + data: Optional[Dict] = None) -> Dict[str, Any]: + icon = self.SEVERITY_ICONS.get(severity, self.SEVERITY_ICONS['INFO']) + html_msg = f"{icon} {self._escape_html(title)}\n\n{self._escape_html(message)}" + + # Split long messages + chunks = self._split_message(html_msg) + result = {'success': True, 'error': None, 'channel': 'telegram'} + + for chunk in chunks: + res = self._send_with_retry(lambda 
c=chunk: self._post_message(c))
+            if not res['success']:
+                result = {**res, 'channel': 'telegram'}
+                break
+
+        return result
+
+    def test(self) -> Tuple[bool, str]:
+        valid, err = self.validate_config()
+        if not valid:
+            return False, err
+
+        result = self.send(
+            'ProxMenux Test',
+            'Notification service is working correctly.\nThis is a test message from ProxMenux Monitor.',
+            'INFO'
+        )
+        return result['success'], result.get('error', '')
+
+    def _post_message(self, text: str) -> Tuple[int, str]:
+        url = self.API_BASE.format(token=self.bot_token)
+        payload = json.dumps({
+            'chat_id': self.chat_id,
+            'text': text,
+            'parse_mode': 'HTML',
+            'disable_web_page_preview': True,
+        }).encode('utf-8')
+
+        return self._http_request(url, payload, {'Content-Type': 'application/json'})
+
+    def _split_message(self, text: str) -> list:
+        if len(text) <= self.MAX_LENGTH:
+            return [text]
+        chunks = []
+        while text:
+            if len(text) <= self.MAX_LENGTH:
+                chunks.append(text)
+                break
+            split_at = text.rfind('\n', 0, self.MAX_LENGTH)
+            if split_at == -1:
+                split_at = self.MAX_LENGTH
+            chunks.append(text[:split_at])
+            text = text[split_at:].lstrip('\n')
+        return chunks
+
+    @staticmethod
+    def _escape_html(text: str) -> str:
+        return (text
+                .replace('&', '&amp;')
+                .replace('<', '&lt;')
+                .replace('>', '&gt;'))
+
+
+# ─── Gotify ──────────────────────────────────────────────────────
+
+class GotifyChannel(NotificationChannel):
+    """Gotify push notification channel with priority mapping."""
+
+    PRIORITY_MAP = {
+        'OK': 1,
+        'INFO': 2,
+        'UNKNOWN': 3,
+        'WARNING': 5,
+        'CRITICAL': 10,
+    }
+
+    def __init__(self, server_url: str, app_token: str):
+        super().__init__()
+        self.server_url = server_url.rstrip('/').strip()
+        self.app_token = app_token.strip()
+
+    def validate_config(self) -> Tuple[bool, str]:
+        if not self.server_url:
+            return False, 'Server URL is required'
+        if not self.app_token:
+            return False, 'Application token is required'
+        if not self.server_url.startswith(('http://', 'https://')):
+ return False, 'Server URL must start with http:// or https://' + return True, '' + + def send(self, title: str, message: str, severity: str = 'INFO', + data: Optional[Dict] = None) -> Dict[str, Any]: + priority = self.PRIORITY_MAP.get(severity, 2) + + result = self._send_with_retry( + lambda: self._post_message(title, message, priority) + ) + result['channel'] = 'gotify' + return result + + def test(self) -> Tuple[bool, str]: + valid, err = self.validate_config() + if not valid: + return False, err + + result = self.send( + 'ProxMenux Test', + 'Notification service is working correctly.\nThis is a test message from ProxMenux Monitor.', + 'INFO' + ) + return result['success'], result.get('error', '') + + def _post_message(self, title: str, message: str, priority: int) -> Tuple[int, str]: + url = f"{self.server_url}/message?token={self.app_token}" + payload = json.dumps({ + 'title': title, + 'message': message, + 'priority': priority, + 'extras': { + 'client::display': {'contentType': 'text/markdown'} + } + }).encode('utf-8') + + return self._http_request(url, payload, {'Content-Type': 'application/json'}) + + +# ─── Discord ───────────────────────────────────────────────────── + +class DiscordChannel(NotificationChannel): + """Discord webhook channel with color-coded embeds.""" + + MAX_EMBED_DESC = 2048 + + SEVERITY_COLORS = { + 'CRITICAL': 0xED4245, # red + 'WARNING': 0xFEE75C, # yellow + 'INFO': 0x5865F2, # blurple + 'OK': 0x57F287, # green + 'UNKNOWN': 0x99AAB5, # grey + } + + def __init__(self, webhook_url: str): + super().__init__() + self.webhook_url = webhook_url.strip() + + def validate_config(self) -> Tuple[bool, str]: + if not self.webhook_url: + return False, 'Webhook URL is required' + if 'discord.com/api/webhooks/' not in self.webhook_url: + return False, 'Invalid Discord webhook URL' + return True, '' + + def send(self, title: str, message: str, severity: str = 'INFO', + data: Optional[Dict] = None) -> Dict[str, Any]: + color = 
self.SEVERITY_COLORS.get(severity, 0x5865F2) + + desc = message[:self.MAX_EMBED_DESC] if len(message) > self.MAX_EMBED_DESC else message + + embed = { + 'title': title, + 'description': desc, + 'color': color, + 'footer': {'text': 'ProxMenux Monitor'}, + 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()), + } + + # Use structured fields from render_template if available + rendered_fields = (data or {}).get('_rendered_fields', []) + if rendered_fields: + embed['fields'] = [ + {'name': name, 'value': val[:1024], 'inline': True} + for name, val in rendered_fields[:25] # Discord limit: 25 fields + ] + elif data: + fields = [] + if data.get('category'): + fields.append({'name': 'Category', 'value': data['category'], 'inline': True}) + if data.get('hostname'): + fields.append({'name': 'Host', 'value': data['hostname'], 'inline': True}) + if data.get('severity'): + fields.append({'name': 'Severity', 'value': data['severity'], 'inline': True}) + if fields: + embed['fields'] = fields + + result = self._send_with_retry( + lambda: self._post_webhook(embed) + ) + result['channel'] = 'discord' + return result + + def test(self) -> Tuple[bool, str]: + valid, err = self.validate_config() + if not valid: + return False, err + + result = self.send( + 'ProxMenux Test', + 'Notification service is working correctly.\nThis is a test message from ProxMenux Monitor.', + 'INFO' + ) + return result['success'], result.get('error', '') + + def _post_webhook(self, embed: Dict) -> Tuple[int, str]: + payload = json.dumps({ + 'username': 'ProxMenux', + 'embeds': [embed] + }).encode('utf-8') + + return self._http_request( + self.webhook_url, payload, {'Content-Type': 'application/json'} + ) + + +# ─── Email Channel ────────────────────────────────────────────── + +class EmailChannel(NotificationChannel): + """Email notification channel using SMTP (smtplib) or sendmail fallback. 
+ + Config keys: + host, port, username, password, tls_mode (none|starttls|ssl), + from_address, to_addresses (comma-separated), subject_prefix, timeout + """ + + def __init__(self, config: Dict[str, str]): + super().__init__() + self.host = config.get('host', '') + self.port = int(config.get('port', 587) or 587) + self.username = config.get('username', '') + self.password = config.get('password', '') + self.tls_mode = config.get('tls_mode', 'starttls') # none | starttls | ssl + self.from_address = config.get('from_address', '') + self.to_addresses = self._parse_recipients(config.get('to_addresses', '')) + self.subject_prefix = config.get('subject_prefix', '[ProxMenux]') + self.timeout = int(config.get('timeout', 10) or 10) + + @staticmethod + def _parse_recipients(raw) -> list: + if isinstance(raw, list): + return [a.strip() for a in raw if a.strip()] + return [addr.strip() for addr in str(raw).split(',') if addr.strip()] + + def validate_config(self) -> Tuple[bool, str]: + if not self.to_addresses: + return False, 'No recipients configured' + if not self.from_address: + return False, 'No from address configured' + # Must have SMTP host OR local sendmail available + if not self.host: + import os + if not os.path.exists('/usr/sbin/sendmail'): + return False, 'No SMTP host configured and /usr/sbin/sendmail not found' + return True, '' + + def send(self, title: str, message: str, severity: str = 'INFO', + data: Optional[Dict] = None) -> Dict[str, Any]: + subject = f"{self.subject_prefix} [{severity}] {title}" + + def _do_send(): + if self.host: + return self._send_smtp(subject, message, severity) + else: + return self._send_sendmail(subject, message, severity) + + return self._send_with_retry(_do_send) + + def _send_smtp(self, subject: str, body: str, severity: str) -> Tuple[int, str]: + import smtplib + from email.message import EmailMessage + + msg = EmailMessage() + msg['Subject'] = subject + msg['From'] = self.from_address + msg['To'] = ', 
'.join(self.to_addresses) + msg.set_content(body) + + # Add HTML alternative + html_body = self._format_html(subject, body, severity) + if html_body: + msg.add_alternative(html_body, subtype='html') + + try: + if self.tls_mode == 'ssl': + server = smtplib.SMTP_SSL(self.host, self.port, timeout=self.timeout) + else: + server = smtplib.SMTP(self.host, self.port, timeout=self.timeout) + if self.tls_mode == 'starttls': + server.starttls() + + if self.username and self.password: + server.login(self.username, self.password) + + server.send_message(msg) + server.quit() + return 200, 'OK' + except smtplib.SMTPAuthenticationError as e: + return 0, f'SMTP authentication failed: {e}' + except smtplib.SMTPConnectError as e: + return 0, f'SMTP connection failed: {e}' + except smtplib.SMTPException as e: + return 0, f'SMTP error: {e}' + except (OSError, TimeoutError) as e: + return 0, f'Connection error: {e}' + + def _send_sendmail(self, subject: str, body: str, severity: str) -> Tuple[int, str]: + import os + import subprocess + from email.message import EmailMessage + + sendmail = '/usr/sbin/sendmail' + if not os.path.exists(sendmail): + return 0, 'sendmail not found at /usr/sbin/sendmail' + + msg = EmailMessage() + msg['Subject'] = subject + msg['From'] = self.from_address or 'proxmenux@localhost' + msg['To'] = ', '.join(self.to_addresses) + msg.set_content(body) + + try: + proc = subprocess.run( + [sendmail, '-t', '-oi'], + input=msg.as_string(), capture_output=True, text=True, timeout=30 + ) + if proc.returncode == 0: + return 200, 'OK' + return 0, f'sendmail failed (rc={proc.returncode}): {proc.stderr[:200]}' + except subprocess.TimeoutExpired: + return 0, 'sendmail timed out after 30s' + except Exception as e: + return 0, f'sendmail error: {e}' + + @staticmethod + def _format_html(subject: str, body: str, severity: str) -> str: + """Create professional HTML email.""" + import html as html_mod + + severity_colors = {'CRITICAL': '#dc2626', 'WARNING': '#f59e0b', 'INFO': 
'#3b82f6'} + color = severity_colors.get(severity, '#6b7280') + + body_html = ''.join( + f'

{html_mod.escape(line)}

' + for line in body.split('\n') if line.strip() + ) + + return f''' + +
+
+

ProxMenux Monitor

+

{html_mod.escape(severity)} Alert

+
+
+

{html_mod.escape(subject)}

+ {body_html} +
+
+

Sent by ProxMenux Notification Service

+
+
+''' + + def test(self) -> Tuple[bool, str]: + result = self.send( + 'ProxMenux Test Notification', + 'This is a test notification from ProxMenux Monitor.\n' + 'If you received this, your email channel is working correctly.', + 'INFO' + ) + return result.get('success', False), result.get('error', '') + + +# ─── Channel Factory ───────────────────────────────────────────── + +CHANNEL_TYPES = { + 'telegram': { + 'name': 'Telegram', + 'config_keys': ['bot_token', 'chat_id'], + 'class': TelegramChannel, + }, + 'gotify': { + 'name': 'Gotify', + 'config_keys': ['url', 'token'], + 'class': GotifyChannel, + }, + 'discord': { + 'name': 'Discord', + 'config_keys': ['webhook_url'], + 'class': DiscordChannel, + }, + 'email': { + 'name': 'Email (SMTP)', + 'config_keys': ['host', 'port', 'username', 'password', 'tls_mode', + 'from_address', 'to_addresses', 'subject_prefix'], + 'class': EmailChannel, + }, +} + + +def create_channel(channel_type: str, config: Dict[str, str]) -> Optional[NotificationChannel]: + """Create a channel instance from type name and config dict. 
+ + Args: + channel_type: 'telegram', 'gotify', or 'discord' + config: Dict with channel-specific keys (see CHANNEL_TYPES) + + Returns: + Channel instance or None if creation fails + """ + try: + if channel_type == 'telegram': + return TelegramChannel( + bot_token=config.get('bot_token', ''), + chat_id=config.get('chat_id', '') + ) + elif channel_type == 'gotify': + return GotifyChannel( + server_url=config.get('url', ''), + app_token=config.get('token', '') + ) + elif channel_type == 'discord': + return DiscordChannel( + webhook_url=config.get('webhook_url', '') + ) + elif channel_type == 'email': + return EmailChannel(config) + except Exception as e: + print(f"[NotificationChannels] Failed to create {channel_type}: {e}") + return None diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py new file mode 100644 index 00000000..8a47d428 --- /dev/null +++ b/AppImage/scripts/notification_events.py @@ -0,0 +1,1301 @@ +""" +ProxMenux Notification Event Watchers +Detects Proxmox events from journald, PVE task log, and health monitor. + +Architecture: +- JournalWatcher: Real-time stream of journald for critical events +- TaskWatcher: Real-time tail of /var/log/pve/tasks/index for VM/CT/backup events +- PollingCollector: Periodic poll of health_persistence pending notifications + +All watchers put events into a shared Queue consumed by NotificationManager. + +Author: MacRimi +""" + +import os +import re +import json +import time +import hashlib +import socket +import sqlite3 +import subprocess +import threading +from queue import Queue +from typing import Optional, Dict, Any, Tuple +from pathlib import Path + + +# ─── Event Object ───────────────────────────────────────────────── + +class NotificationEvent: + """Represents a detected event ready for notification dispatch. + + Fields: + event_type: Taxonomy key (e.g. 
'vm_fail', 'auth_fail', 'split_brain') + severity: INFO | WARNING | CRITICAL + data: Payload dict with context (hostname, vmid, reason, etc.) + source: Origin: journal | tasks | health | proxmox_hook | cli | api | polling + entity: What is affected: node | vm | ct | storage | disk | network | cluster | user + entity_id: Specific identifier (vmid, IP, device, pool, interface, etc.) + raw: Original payload (webhook JSON or log line), optional + fingerprint: Stable dedup key: hostname:entity:entity_id:event_type + event_id: Short hash of fingerprint for correlation + ts_epoch: time.time() at creation + ts_monotonic: time.monotonic() at creation (drift-safe for cooldown) + """ + + __slots__ = ( + 'event_type', 'severity', 'data', 'timestamp', 'source', + 'entity', 'entity_id', 'raw', + 'fingerprint', 'event_id', 'ts_epoch', 'ts_monotonic', + ) + + def __init__(self, event_type: str, severity: str = 'INFO', + data: Optional[Dict[str, Any]] = None, + source: str = 'watcher', + entity: str = 'node', entity_id: str = '', + raw: Any = None): + self.event_type = event_type + self.severity = severity + self.data = data or {} + self.source = source + self.entity = entity + self.entity_id = entity_id + self.raw = raw + self.ts_epoch = time.time() + self.ts_monotonic = time.monotonic() + self.timestamp = self.ts_epoch # backward compat + + # Build fingerprint for dedup/cooldown + hostname = self.data.get('hostname', _hostname()) + if entity_id: + fp_base = f"{hostname}:{entity}:{entity_id}:{event_type}" + else: + # When entity_id is empty, include a hash of title/body for uniqueness + reason = self.data.get('reason', self.data.get('title', '')) + stable_extra = hashlib.md5(reason.encode(errors='replace')).hexdigest()[:8] if reason else '' + fp_base = f"{hostname}:{entity}:{event_type}:{stable_extra}" + self.fingerprint = fp_base + self.event_id = hashlib.md5(fp_base.encode()).hexdigest()[:12] + + def __repr__(self): + return f"NotificationEvent({self.event_type}, 
{self.severity}, fp={self.fingerprint[:40]})" + + +def _hostname() -> str: + try: + return socket.gethostname().split('.')[0] + except Exception: + return 'proxmox' + + +# ─── Journal Watcher (Real-time) ───────────────────────────────── + +class JournalWatcher: + """Watches journald in real-time for critical system events. + + Uses 'journalctl -f -o json' subprocess to stream entries. + Detects: auth failures, kernel panics, OOM, service crashes, + disk I/O errors, split-brain, node disconnect, system shutdown, + fail2ban bans, firewall blocks, permission changes. + """ + + def __init__(self, event_queue: Queue): + self._queue = event_queue + self._running = False + self._thread: Optional[threading.Thread] = None + self._process: Optional[subprocess.Popen] = None + self._hostname = _hostname() + + # Dedup: track recent events to avoid duplicates + self._recent_events: Dict[str, float] = {} + self._dedup_window = 30 # seconds + + def start(self): + """Start the journal watcher thread.""" + if self._running: + return + self._running = True + self._thread = threading.Thread(target=self._watch_loop, daemon=True, + name='journal-watcher') + self._thread.start() + + def stop(self): + """Stop the journal watcher.""" + self._running = False + if self._process: + try: + self._process.terminate() + self._process.wait(timeout=5) + except Exception: + try: + self._process.kill() + except Exception: + pass + + def _watch_loop(self): + """Main watch loop with auto-restart on failure.""" + while self._running: + try: + self._run_journalctl() + except Exception as e: + print(f"[JournalWatcher] Error: {e}") + if self._running: + time.sleep(5) # Wait before restart + + def _run_journalctl(self): + """Run journalctl -f and process output line by line.""" + cmd = ['journalctl', '-f', '-o', 'json', '--no-pager', + '-n', '0'] # Start from now, don't replay history + + self._process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, + text=True, bufsize=1 + ) 
+ + for line in self._process.stdout: + if not self._running: + break + line = line.strip() + if not line: + continue + try: + entry = json.loads(line) + self._process_entry(entry) + except (json.JSONDecodeError, KeyError): + # Try plain text matching as fallback + self._process_plain(line) + + if self._process: + self._process.wait() + + def _process_entry(self, entry: Dict): + """Process a parsed journald JSON entry.""" + msg = entry.get('MESSAGE', '') + if not msg or not isinstance(msg, str): + return + + unit = entry.get('_SYSTEMD_UNIT', '') + syslog_id = entry.get('SYSLOG_IDENTIFIER', '') + priority = int(entry.get('PRIORITY', 6)) + + self._check_auth_failure(msg, syslog_id, entry) + self._check_fail2ban(msg, syslog_id) + self._check_kernel_critical(msg, syslog_id, priority) + self._check_service_failure(msg, unit) + self._check_disk_io(msg, syslog_id, priority) + self._check_cluster_events(msg, syslog_id) + self._check_system_shutdown(msg, syslog_id) + self._check_permission_change(msg, syslog_id) + self._check_firewall(msg, syslog_id) + + def _process_plain(self, line: str): + """Fallback: process a plain text log line.""" + self._check_auth_failure(line, '', {}) + self._check_fail2ban(line, '') + self._check_kernel_critical(line, '', 6) + self._check_cluster_events(line, '') + self._check_system_shutdown(line, '') + + # ── Detection methods ── + + def _check_auth_failure(self, msg: str, syslog_id: str, entry: Dict): + """Detect authentication failures (SSH, PAM, PVE).""" + patterns = [ + (r'Failed password for (?:invalid user )?(\S+) from (\S+)', 'ssh'), + (r'authentication failure.*rhost=(\S+).*user=(\S+)', 'pam'), + (r'pvedaemon\[.*authentication failure.*rhost=(\S+)', 'pve'), + ] + + for pattern, service in patterns: + match = re.search(pattern, msg, re.IGNORECASE) + if match: + groups = match.groups() + if service == 'ssh': + username, source_ip = groups[0], groups[1] + elif service == 'pam': + source_ip, username = groups[0], groups[1] + else: + 
source_ip = groups[0] + username = 'unknown' + + self._emit('auth_fail', 'WARNING', { + 'source_ip': source_ip, + 'username': username, + 'service': service, + 'hostname': self._hostname, + }, entity='user', entity_id=source_ip) + return + + def _check_fail2ban(self, msg: str, syslog_id: str): + """Detect Fail2Ban IP bans.""" + if 'fail2ban' not in msg.lower() and syslog_id != 'fail2ban-server': + return + + # Ban detected + ban_match = re.search(r'Ban\s+(\S+)', msg) + if ban_match: + ip = ban_match.group(1) + jail_match = re.search(r'\[(\w+)\]', msg) + jail = jail_match.group(1) if jail_match else 'unknown' + + self._emit('ip_block', 'INFO', { + 'source_ip': ip, + 'jail': jail, + 'failures': '', + 'hostname': self._hostname, + }, entity='user', entity_id=ip) + + def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int): + """Detect kernel panics, OOM, segfaults, hardware errors.""" + # Only process messages from kernel or systemd (not app-level logs) + if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''): + return + + # Filter out normal kernel messages that are NOT problems + _KERNEL_NOISE = [ + r'vfio-pci\s+\S+:\s*reset', # PCI passthrough resets (normal during VM start/stop) + r'vfio-pci\s+\S+:\s*resetting', + r'entered\s+(?:promiscuous|allmulticast)\s+mode', # Network bridge ops + r'entered\s+(?:blocking|forwarding|disabled)\s+state', # Bridge STP + r'tap\d+i\d+:', # TAP interface events + r'vmbr\d+:.*port\s+\d+', # Bridge port events + ] + for noise in _KERNEL_NOISE: + if re.search(noise, msg, re.IGNORECASE): + return + + # NOTE: Disk I/O errors (ATA, SCSI, blk_update_request) are NOT handled + # here. They are detected exclusively by HealthMonitor._check_disks_optimized + # which records to health_persistence -> PollingCollector -> notification. + # This avoids duplicate notifications and ensures the health dashboard + # stays in sync with notifications. 
+ # Filesystem errors (EXT4/BTRFS/XFS/ZFS) ARE handled here because they + # indicate corruption, not just hardware I/O problems. + + critical_patterns = { + r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'), + r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'), + r'segfault': ('system_problem', 'WARNING', 'Segmentation fault detected'), + r'BUG:': ('system_problem', 'CRITICAL', 'Kernel BUG detected'), + r'Call Trace:': ('system_problem', 'WARNING', 'Kernel call trace'), + r'EXT4-fs error': ('system_problem', 'CRITICAL', 'Filesystem error'), + r'BTRFS error': ('system_problem', 'CRITICAL', 'Filesystem error'), + r'XFS.*error': ('system_problem', 'CRITICAL', 'Filesystem error'), + r'ZFS.*error': ('system_problem', 'CRITICAL', 'ZFS pool error'), + r'mce:.*Hardware Error': ('system_problem', 'CRITICAL', 'Hardware error (MCE)'), + } + + for pattern, (event_type, severity, reason) in critical_patterns.items(): + if re.search(pattern, msg, re.IGNORECASE): + entity = 'node' + entity_id = '' + + # Build a context-rich reason from the journal message. 
+ enriched = reason + + if 'segfault' in pattern: + m = re.search(r'(\S+)\[(\d+)\].*segfault', msg) + proc_name = m.group(1) if m else '' + proc_pid = m.group(2) if m else '' + lib_match = re.search(r'\bin\s+(\S+)', msg) + lib_name = lib_match.group(1) if lib_match else '' + + parts = [reason] + if proc_name: + parts.append(f"Process: {proc_name}" + (f" (PID {proc_pid})" if proc_pid else '')) + if lib_name: + parts.append(f"Module: {lib_name}") + enriched = '\n'.join(parts) + + elif 'Out of memory' in pattern: + m = re.search(r'Killed process\s+(\d+)\s+\(([^)]+)\)', msg) + if m: + enriched = f"{reason}\nKilled: {m.group(2)} (PID {m.group(1)})" + else: + enriched = f"{reason}\n{msg[:300]}" + + else: + # Generic: include the raw journal message for context + enriched = f"{reason}\n{msg[:300]}" + + data = {'reason': enriched, 'hostname': self._hostname} + + self._emit(event_type, severity, data, entity=entity, entity_id=entity_id) + return + + def _check_service_failure(self, msg: str, unit: str): + """Detect critical service failures with enriched context.""" + # Filter out noise -- these are normal systemd transient units, + # not real service failures worth alerting about. 
+ _NOISE_PATTERNS = [ + r'session-\d+\.scope', # SSH/login sessions + r'user@\d+\.service', # Per-user service managers + r'user-runtime-dir@\d+', # User runtime dirs + r'systemd-coredump@', # Coredump handlers (transient) + r'run-.*\.mount', # Transient mounts + ] + for noise in _NOISE_PATTERNS: + if re.search(noise, msg) or re.search(noise, unit): + return + + service_patterns = [ + r'Failed to start (.+)', + r'Unit (\S+) (?:entered failed state|failed)', + r'(\S+)\.service: (?:Main process exited|Failed with result)', + ] + + for pattern in service_patterns: + match = re.search(pattern, msg) + if match: + service_name = match.group(1) + data = { + 'service_name': service_name, + 'reason': msg[:300], + 'hostname': self._hostname, + } + + # Enrich PVE VM/CT services with guest name and context + # pve-container@101 -> LXC container 101 + # qemu-server@100 -> QEMU VM 100 + pve_match = re.match( + r'(pve-container|qemu-server)@(\d+)', service_name) + if pve_match: + svc_type = pve_match.group(1) + vmid = pve_match.group(2) + vm_name = self._resolve_vm_name(vmid) + + if svc_type == 'pve-container': + guest_type = 'LXC container' + else: + guest_type = 'QEMU VM' + + display = f"{guest_type} {vmid}" + if vm_name: + display = f"{guest_type} {vmid} ({vm_name})" + + data['service_name'] = service_name + data['vmid'] = vmid + data['vmname'] = vm_name + data['guest_type'] = guest_type + data['display_name'] = display + data['reason'] = ( + f"{display} failed to start.\n{msg[:300]}" + ) + + self._emit('service_fail', 'WARNING', data, + entity='node', entity_id=service_name) + return + + def _resolve_vm_name(self, vmid: str) -> str: + """Try to resolve VMID to a guest name from PVE config files.""" + if not vmid: + return '' + # Check QEMU configs + for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']: + conf = os.path.join(base, f'{vmid}.conf') + try: + with open(conf) as f: + for line in f: + if line.startswith('hostname:') or line.startswith('name:'): + return 
line.split(':', 1)[1].strip() + except (OSError, IOError): + continue + return '' + + def _check_disk_io(self, msg: str, syslog_id: str, priority: int): + """Detect disk I/O errors from kernel messages.""" + if syslog_id != 'kernel' and priority > 3: + return + + io_patterns = [ + r'blk_update_request: I/O error.*dev (\S+)', + r'Buffer I/O error on device (\S+)', + r'SCSI error.*sd(\w)', + r'ata\d+.*error', + ] + + for pattern in io_patterns: + match = re.search(pattern, msg) + if match: + device = match.group(1) if match.lastindex else 'unknown' + self._emit('disk_io_error', 'CRITICAL', { + 'device': device, + 'reason': msg[:200], + 'hostname': self._hostname, + }, entity='disk', entity_id=device) + return + + def _check_cluster_events(self, msg: str, syslog_id: str): + """Detect cluster split-brain and node disconnect.""" + msg_lower = msg.lower() + + # Split-brain + if any(p in msg_lower for p in ['split-brain', 'split brain', + 'fencing required', 'cluster partition']): + quorum = 'unknown' + if 'quorum' in msg_lower: + quorum = 'lost' if 'lost' in msg_lower else 'valid' + + self._emit('split_brain', 'CRITICAL', { + 'quorum': quorum, + 'reason': msg[:200], + 'hostname': self._hostname, + }, entity='cluster', entity_id=self._hostname) + return + + # Node disconnect + if (('quorum' in msg_lower and 'lost' in msg_lower) or + ('node' in msg_lower and any(w in msg_lower for w in ['left', 'offline', 'lost']))): + + node_match = re.search(r'[Nn]ode\s+(\S+)', msg) + node_name = node_match.group(1) if node_match else 'unknown' + + self._emit('node_disconnect', 'CRITICAL', { + 'node_name': node_name, + 'hostname': self._hostname, + }, entity='cluster', entity_id=node_name) + + def _check_system_shutdown(self, msg: str, syslog_id: str): + """Detect system shutdown/reboot. + + Matches multiple systemd signals that indicate the node is going down: + - "Shutting down." (systemd PID 1) + - "System is powering off." / "System is rebooting." + - "Reached target Shutdown." 
/ "Reached target Reboot." + - "Journal stopped" (very late in shutdown) + - "The system will reboot now!" / "The system will power off now!" + """ + msg_lower = msg.lower() + + # Only process systemd / logind messages + if not any(s in syslog_id for s in ('systemd', 'logind', '')): + if 'systemd' not in msg_lower: + return + + is_reboot = False + is_shutdown = False + + # Detect reboot signals + reboot_signals = [ + 'system is rebooting', + 'reached target reboot', + 'the system will reboot now', + 'starting reboot', + ] + for sig in reboot_signals: + if sig in msg_lower: + is_reboot = True + break + + # Detect shutdown/poweroff signals + if not is_reboot: + shutdown_signals = [ + 'system is powering off', + 'system is halting', + 'shutting down', + 'reached target shutdown', + 'reached target halt', + 'the system will power off now', + 'starting power-off', + 'journal stopped', + 'stopping journal service', + ] + for sig in shutdown_signals: + if sig in msg_lower: + is_shutdown = True + break + + if is_reboot: + self._emit('system_reboot', 'CRITICAL', { + 'reason': msg[:200], + 'hostname': self._hostname, + }, entity='node', entity_id='') + elif is_shutdown: + self._emit('system_shutdown', 'CRITICAL', { + 'reason': msg[:200], + 'hostname': self._hostname, + }, entity='node', entity_id='') + + def _check_permission_change(self, msg: str, syslog_id: str): + """Detect user permission changes in PVE.""" + permission_patterns = [ + (r'set permissions.*user\s+(\S+)', 'Permission changed'), + (r'user added to group.*?(\S+)', 'Added to group'), + (r'user removed from group.*?(\S+)', 'Removed from group'), + (r'ACL updated.*?(\S+)', 'ACL updated'), + (r'Role assigned.*?(\S+)', 'Role assigned'), + ] + + for pattern, action in permission_patterns: + match = re.search(pattern, msg, re.IGNORECASE) + if match: + username = match.group(1) + self._emit('user_permission_change', 'INFO', { + 'username': username, + 'change_details': action, + 'hostname': self._hostname, + }, 
entity='user', entity_id=username) + return + + def _check_firewall(self, msg: str, syslog_id: str): + """Detect firewall issues (not individual drops, but rule errors).""" + if re.search(r'pve-firewall.*(?:error|failed|unable)', msg, re.IGNORECASE): + self._emit('firewall_issue', 'WARNING', { + 'reason': msg[:200], + 'hostname': self._hostname, + }, entity='network', entity_id='') + + # ── Emit helper ── + + def _emit(self, event_type: str, severity: str, data: Dict, + entity: str = 'node', entity_id: str = ''): + """Emit event to queue with short-term deduplication (30s window).""" + event = NotificationEvent( + event_type, severity, data, source='journal', + entity=entity, entity_id=entity_id, + ) + + now = time.time() + last = self._recent_events.get(event.fingerprint, 0) + if now - last < self._dedup_window: + return # Skip duplicate within 30s window + + self._recent_events[event.fingerprint] = now + + # Cleanup old dedup entries periodically + if len(self._recent_events) > 200: + cutoff = now - self._dedup_window * 2 + self._recent_events = { + k: v for k, v in self._recent_events.items() if v > cutoff + } + + self._queue.put(event) + + +# ─── Task Watcher (Real-time) ──────────────────────────────────── + +class TaskWatcher: + """Watches /var/log/pve/tasks/index for VM/CT and backup events. + + The PVE task index file is appended when tasks start/finish. + Format: UPID:node:pid:pstart:starttime:type:id:user: + Final status is recorded when task completes. 
+ """ + + TASK_LOG = '/var/log/pve/tasks/index' + + # Map PVE task types to our event types + TASK_MAP = { + 'qmstart': ('vm_start', 'INFO'), + 'qmstop': ('vm_stop', 'INFO'), + 'qmshutdown': ('vm_shutdown', 'INFO'), + 'qmreboot': ('vm_restart', 'INFO'), + 'qmreset': ('vm_restart', 'INFO'), + 'vzstart': ('ct_start', 'INFO'), + 'vzstop': ('ct_stop', 'INFO'), + 'vzshutdown': ('ct_shutdown', 'INFO'), + 'vzreboot': ('ct_restart', 'INFO'), + 'vzdump': ('backup_start', 'INFO'), + 'qmsnapshot': ('snapshot_complete', 'INFO'), + 'vzsnapshot': ('snapshot_complete', 'INFO'), + 'qmigrate': ('migration_start', 'INFO'), + 'vzmigrate': ('migration_start', 'INFO'), + } + + def __init__(self, event_queue: Queue): + self._queue = event_queue + self._running = False + self._thread: Optional[threading.Thread] = None + self._hostname = _hostname() + self._last_position = 0 + # Cache for active vzdump detection + self._vzdump_active_cache: float = 0 # timestamp of last positive check + self._vzdump_cache_ttl = 5 # cache result for 5s + + def start(self): + if self._running: + return + self._running = True + + # Start at end of file + if os.path.exists(self.TASK_LOG): + try: + self._last_position = os.path.getsize(self.TASK_LOG) + except OSError: + self._last_position = 0 + + self._thread = threading.Thread(target=self._watch_loop, daemon=True, + name='task-watcher') + self._thread.start() + + def stop(self): + self._running = False + + def _is_vzdump_active(self) -> bool: + """Check if a vzdump (backup) job is currently running. + + Reads /var/log/pve/tasks/active which lists all running PVE tasks. + Also verifies the process is actually alive (PID check). + Result is cached for a few seconds to avoid excessive file reads. 
+ """ + now = time.time() + # Negative cache: if we recently confirmed NO vzdump, skip the check + if hasattr(self, '_vzdump_negative_cache') and \ + now - self._vzdump_negative_cache < self._vzdump_cache_ttl: + return False + # Positive cache + if now - self._vzdump_active_cache < self._vzdump_cache_ttl: + return True + + active_file = '/var/log/pve/tasks/active' + try: + with open(active_file, 'r') as f: + for line in f: + # UPID format: UPID:node:pid:pstart:starttime:type:id:user: + if ':vzdump:' in line: + # Verify the PID is still alive + parts = line.strip().split(':') + if len(parts) >= 3: + try: + pid = int(parts[2]) + os.kill(pid, 0) # Signal 0 = just check existence + self._vzdump_active_cache = now + return True + except (ValueError, ProcessLookupError, PermissionError): + pass # PID not found or not a number -- stale entry + except (OSError, IOError): + pass + + self._vzdump_negative_cache = now + return False + + def _watch_loop(self): + """Poll the task index file for new entries.""" + while self._running: + try: + if os.path.exists(self.TASK_LOG): + current_size = os.path.getsize(self.TASK_LOG) + + if current_size < self._last_position: + # File was truncated/rotated + self._last_position = 0 + + if current_size > self._last_position: + with open(self.TASK_LOG, 'r') as f: + f.seek(self._last_position) + new_lines = f.readlines() + self._last_position = f.tell() + + for line in new_lines: + self._process_task_line(line.strip()) + except Exception as e: + print(f"[TaskWatcher] Error reading task log: {e}") + + time.sleep(2) # Check every 2 seconds + + def _process_task_line(self, line: str): + """Process a single task index line. 
+ + PVE task index format (space-separated): + UPID endtime status + Where UPID = UPID:node:pid:pstart:starttime:type:id:user: + """ + if not line: + return + + parts = line.split() + if not parts: + return + + upid = parts[0] + status = parts[2] if len(parts) >= 3 else '' + + # Parse UPID + upid_parts = upid.split(':') + if len(upid_parts) < 8: + return + + task_type = upid_parts[5] + vmid = upid_parts[6] + user = upid_parts[7] + + # Get VM/CT name + vmname = self._get_vm_name(vmid) if vmid else '' + + # Map to event type + event_info = self.TASK_MAP.get(task_type) + if not event_info: + return + + event_type, default_severity = event_info + + + + # Check if task failed + is_error = status and status != 'OK' and status != '' + + if is_error: + # Override to failure event + if 'start' in event_type: + event_type = event_type.replace('_start', '_fail') + elif 'complete' in event_type: + event_type = event_type.replace('_complete', '_fail') + severity = 'CRITICAL' + elif status == 'OK': + # Task completed successfully + if event_type == 'backup_start': + event_type = 'backup_complete' + elif event_type == 'migration_start': + event_type = 'migration_complete' + severity = 'INFO' + else: + # Task just started (no status yet) + severity = default_severity + + data = { + 'vmid': vmid, + 'vmname': vmname or f'ID {vmid}', + 'hostname': self._hostname, + 'user': user, + 'reason': status if is_error else '', + 'target_node': '', + 'size': '', + 'snapshot_name': '', + } + + # Determine entity type from task type + entity = 'ct' if task_type.startswith('vz') else 'vm' + + # Backup and replication events are handled EXCLUSIVELY by the PVE + # webhook, which delivers much richer data (full logs, sizes, durations, + # filenames). TaskWatcher skips these entirely to avoid duplicates. 
+ _WEBHOOK_EXCLUSIVE = {'backup_complete', 'backup_fail', 'backup_start', + 'replication_complete', 'replication_fail'} + if event_type in _WEBHOOK_EXCLUSIVE: + return + + # Suppress VM/CT start/stop/shutdown while a vzdump is active. + # These are backup-induced operations (mode=stop), not user actions. + # Exception: if a VM/CT FAILS to start after backup, that IS important. + _BACKUP_NOISE = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart', + 'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'} + if event_type in _BACKUP_NOISE and not is_error: + if self._is_vzdump_active(): + return + + self._queue.put(NotificationEvent( + event_type, severity, data, source='tasks', + entity=entity, entity_id=vmid, + )) + + def _get_vm_name(self, vmid: str) -> str: + """Try to resolve VMID to name via config files.""" + if not vmid: + return '' + + # Try QEMU + conf_path = f'/etc/pve/qemu-server/{vmid}.conf' + name = self._read_name_from_conf(conf_path) + if name: + return name + + # Try LXC + conf_path = f'/etc/pve/lxc/{vmid}.conf' + name = self._read_name_from_conf(conf_path) + if name: + return name + + return '' + + @staticmethod + def _read_name_from_conf(path: str) -> str: + """Read 'name:' or 'hostname:' from PVE config file.""" + try: + if not os.path.exists(path): + return '' + with open(path, 'r') as f: + for line in f: + if line.startswith('name:'): + return line.split(':', 1)[1].strip() + if line.startswith('hostname:'): + return line.split(':', 1)[1].strip() + except (IOError, PermissionError): + pass + return '' + + +# ─── Polling Collector ──────────────────────────────────────────── + +class PollingCollector: + """Periodic collector that polls health state independently. + + Architecture: + - Completely independent from Health Monitor's suppression system. + Suppression Duration only affects the UI health badge; it NEVER blocks + notifications. 
+ - Reads ``get_active_errors()`` (ALL active errors, even suppressed ones) + and decides when to notify based on its own 24-hour cycle. + - For *new* errors (first_seen within the last poll interval), notifies + immediately. + - For *persistent* errors (already known), re-notifies once every 24 h. + - Update checks run on their own 24-h timer and include security counts. + + Tracking is stored in ``notification_last_sent`` (same DB). + """ + + DIGEST_INTERVAL = 86400 # 24 h between re-notifications + UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans + NEW_ERROR_WINDOW = 120 # seconds – errors younger than this are "new" + + _ENTITY_MAP = { + 'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''), + 'disk': ('storage', ''), 'network': ('network', ''), + 'pve_services': ('node', ''), 'security': ('user', ''), + 'updates': ('node', ''), 'storage': ('storage', ''), + } + + # Map health-persistence category names to our TEMPLATES event types. + # These must match keys in notification_templates.TEMPLATES exactly. 
+ _CATEGORY_TO_EVENT_TYPE = { + 'cpu': 'cpu_high', + 'memory': 'ram_high', + 'load': 'load_high', + 'temperature': 'temp_high', + 'disk': 'disk_space_low', + 'storage': 'storage_unavailable', + 'network': 'network_down', + 'pve_services': 'service_fail', + 'security': 'auth_fail', + 'updates': 'update_available', + 'zfs': 'disk_io_error', + 'smart': 'disk_io_error', + 'disks': 'disk_io_error', + 'logs': 'system_problem', + 'vms': 'system_problem', + } + + def __init__(self, event_queue: Queue, poll_interval: int = 60): + self._queue = event_queue + self._running = False + self._thread: Optional[threading.Thread] = None + self._poll_interval = poll_interval + self._hostname = _hostname() + self._last_update_check = 0 + # In-memory cache: error_key -> last notification timestamp + self._last_notified: Dict[str, float] = {} + # Track known error keys so we can detect truly new ones + self._known_errors: set = set() + self._first_poll_done = False + + def start(self): + if self._running: + return + self._running = True + self._load_last_notified() + self._thread = threading.Thread(target=self._poll_loop, daemon=True, + name='polling-collector') + self._thread.start() + + def stop(self): + self._running = False + + # ── Main loop ────────────────────────────────────────────── + + def _poll_loop(self): + """Main polling loop.""" + # Initial delay to let health monitor warm up + for _ in range(15): + if not self._running: + return + time.sleep(1) + + while self._running: + try: + self._check_persistent_health() + self._check_updates() + except Exception as e: + print(f"[PollingCollector] Error: {e}") + + for _ in range(self._poll_interval): + if not self._running: + return + time.sleep(1) + + # ── Health errors (independent of suppression) ───────────── + + def _check_persistent_health(self): + """Read ALL active errors from health_persistence and decide + whether each one warrants a notification right now. 
+ + Rules: + - A *new* error (not in _known_errors) -> notify immediately + - A *persistent* error already notified -> re-notify after 24 h + - Uses its own tracking, NOT the health monitor's needs_notification flag + """ + try: + from health_persistence import health_persistence + errors = health_persistence.get_active_errors() + except ImportError: + return + except Exception as e: + print(f"[PollingCollector] get_active_errors failed: {e}") + return + + now = time.time() + current_keys = set() + + for error in errors: + error_key = error.get('error_key', '') + if not error_key: + continue + + current_keys.add(error_key) + category = error.get('category', '') + severity = error.get('severity', 'WARNING') + reason = error.get('reason', '') + + # Determine if we should notify + is_new = error_key not in self._known_errors and self._first_poll_done + last_sent = self._last_notified.get(error_key, 0) + is_due = (now - last_sent) >= self.DIGEST_INTERVAL + + if not is_new and not is_due: + continue + + # Map to our event type + event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem') + entity, eid = self._ENTITY_MAP.get(category, ('node', '')) + + data = { + 'hostname': self._hostname, + 'category': category, + 'reason': reason, + 'error_key': error_key, + 'severity': severity, + 'first_seen': error.get('first_seen', ''), + 'last_seen': error.get('last_seen', ''), + 'is_persistent': not is_new, + } + + # Include extra details if present + details = error.get('details') + if isinstance(details, dict): + data.update(details) + elif isinstance(details, str): + try: + data.update(json.loads(details)) + except (json.JSONDecodeError, TypeError): + pass + + self._queue.put(NotificationEvent( + event_type, severity, data, source='health', + entity=entity, entity_id=eid or error_key, + )) + + # Track that we notified + self._last_notified[error_key] = now + self._persist_last_notified(error_key, now) + + # Remove tracking for errors that resolved + resolved = 
self._known_errors - current_keys + for key in resolved: + self._last_notified.pop(key, None) + + self._known_errors = current_keys + self._first_poll_done = True + + # ── Update check (enriched) ──────────────────────────────── + + def _check_updates(self): + """Check for available system updates every 24 h. + + Enriched output: total count, security updates, PVE version hint, + and top package names. + """ + now = time.time() + if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL: + return + + self._last_update_check = now + + try: + result = subprocess.run( + ['apt-get', '-s', 'upgrade'], + capture_output=True, text=True, timeout=60, + ) + if result.returncode != 0: + return + + lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')] + total = len(lines) + if total == 0: + return + + packages = [l.split()[1] for l in lines] + security = [p for p in packages if any( + kw in p.lower() for kw in ('security', 'cve', 'openssl', 'libssl') + )] + + # Also detect security updates via apt changelog / Debian-Security origin + sec_result = subprocess.run( + ['apt-get', '-s', 'upgrade', '-o', 'Dir::Etc::SourceList=/dev/null', + '-o', 'Dir::Etc::SourceParts=/dev/null'], + capture_output=True, text=True, timeout=30, + ) + # Count lines from security repo (rough heuristic) + sec_count = max(len(security), 0) + try: + sec_output = subprocess.run( + ['apt-get', '-s', '--only-upgrade', 'install'] + packages[:50], + capture_output=True, text=True, timeout=30, + ) + for line in sec_output.stdout.split('\n'): + if 'security' in line.lower() and 'Inst ' in line: + sec_count += 1 + except Exception: + pass + + # Check for PVE version upgrade + pve_packages = [p for p in packages if 'pve-' in p.lower() or 'proxmox-' in p.lower()] + + # Build display details + top_pkgs = packages[:8] + details = ', '.join(top_pkgs) + if total > 8: + details += f', ... 
+{total - 8} more' + + data = { + 'hostname': self._hostname, + 'count': str(total), + 'security_count': str(sec_count), + 'details': details, + 'packages': ', '.join(packages[:20]), + } + if pve_packages: + data['pve_packages'] = ', '.join(pve_packages) + + self._queue.put(NotificationEvent( + 'update_available', 'INFO', data, + source='polling', entity='node', entity_id='', + )) + except Exception: + pass + + # ── Persistence helpers ──────────────────────────────────── + + def _load_last_notified(self): + """Load per-error notification timestamps from DB on startup.""" + try: + db_path = Path('/usr/local/share/proxmenux/health_monitor.db') + if not db_path.exists(): + return + conn = sqlite3.connect(str(db_path), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + cursor = conn.cursor() + cursor.execute( + "SELECT fingerprint, last_sent_ts FROM notification_last_sent " + "WHERE fingerprint LIKE 'health_%'" + ) + for fp, ts in cursor.fetchall(): + error_key = fp.replace('health_', '', 1) + self._last_notified[error_key] = ts + self._known_errors.add(error_key) + conn.close() + except Exception as e: + print(f"[PollingCollector] Failed to load last_notified: {e}") + + def _persist_last_notified(self, error_key: str, ts: float): + """Save per-error notification timestamp to DB.""" + try: + db_path = Path('/usr/local/share/proxmenux/health_monitor.db') + conn = sqlite3.connect(str(db_path), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + fp = f'health_{error_key}' + conn.execute(''' + INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count) + VALUES (?, ?, COALESCE( + (SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1 + )) + ''', (fp, int(ts), fp)) + conn.commit() + conn.close() + except Exception: + pass + + +# ─── Proxmox Webhook Receiver ─────────────────────────────────── + +class ProxmoxHookWatcher: + """Receives native Proxmox VE notifications via local webhook 
endpoint. + + Configured automatically via /etc/pve/notifications.cfg (endpoint + + matcher blocks). The setup-webhook API writes these blocks on first + enable. See flask_notification_routes.py for details. + + Payload varies by source (storage, replication, cluster, PBS, apt). + This class normalizes them into NotificationEvent objects. + """ + + def __init__(self, event_queue: Queue): + self._queue = event_queue + self._hostname = _hostname() + + def process_webhook(self, payload: dict) -> dict: + """Process an incoming Proxmox webhook payload. + + The PVE webhook is the PRIMARY source for vzdump, replication, + fencing, package-updates and system-mail events. PVE sends rich + detail (full logs, sizes, durations) that TaskWatcher cannot match. + + Body template delivers: + {title, message, severity, timestamp, fields: {type, hostname, job-id}} + + Returns: {'accepted': bool, 'event_type': str, 'event_id': str} + """ + if not payload: + return {'accepted': False, 'error': 'Empty payload'} + + # ── Extract structured PVE fields ── + fields = payload.get('fields') or {} + if isinstance(fields, str): + # Edge case: {{ json fields }} rendered as string instead of dict + try: + import json + fields = json.loads(fields) + except (json.JSONDecodeError, ValueError): + fields = {} + + pve_type = fields.get('type', '').lower().strip() + pve_hostname = fields.get('hostname', self._hostname) + pve_job_id = fields.get('job-id', '') + + title = payload.get('title', '') + message = payload.get('message', payload.get('body', '')) + severity_raw = payload.get('severity', 'info').lower().strip() + timestamp = payload.get('timestamp', '') + + # ── Classify by PVE type (direct, no heuristics needed) ── + import re + event_type, entity, entity_id = self._classify_pve( + pve_type, severity_raw, title, message + ) + + # Discard meta-events + if event_type == '_skip': + return {'accepted': False, 'skipped': True, 'reason': 'Meta-event filtered'} + + severity = 
self._map_severity(severity_raw) + + # ── Build rich data dict ── + # For webhook events, PVE's `message` IS the notification body. + # It contains full vzdump logs, package lists, error details, etc. + # We pass it as 'pve_message' so templates can use it directly. + data = { + 'hostname': pve_hostname, + 'pve_type': pve_type, + 'pve_message': message, + 'pve_title': title, + 'title': title, + 'job_id': pve_job_id, + } + + # Extract VMID and VM name from message for vzdump events + if pve_type == 'vzdump' and message: + # PVE vzdump messages contain lines like: + # "INFO: Starting Backup of VM 100 (qemu)" + # "VMID Name Status Time Size Filename" + # "100 arch-linux OK 00:05:30 1.2G /path/to/file" + vmids = re.findall(r'(?:VM|CT)\s+(\d+)', message, re.IGNORECASE) + if vmids: + data['vmid'] = vmids[0] + entity_id = vmids[0] + # Try to extract VM name from the table line + name_m = re.search(r'(\d+)\s+(\S+)\s+(?:OK|ERROR|WARNINGS)', message) + if name_m: + data['vmname'] = name_m.group(2) + # Extract size from "Total size: X" + size_m = re.search(r'Total size:\s*(.+?)(?:\n|$)', message) + if size_m: + data['size'] = size_m.group(1).strip() + # Extract duration from "Total running time: X" + dur_m = re.search(r'Total running time:\s*(.+?)(?:\n|$)', message) + if dur_m: + data['duration'] = dur_m.group(1).strip() + + event = NotificationEvent( + event_type=event_type, + severity=severity, + data=data, + source='proxmox_hook', + entity=entity, + entity_id=entity_id, + raw=payload, + ) + + self._queue.put(event) + return {'accepted': True, 'event_type': event_type, 'event_id': event.event_id} + + def _classify_pve(self, pve_type: str, severity: str, + title: str, message: str) -> tuple: + """Classify using PVE's structured fields.type. + + Returns (event_type, entity, entity_id). 
+ """ + title_lower = (title or '').lower() + + # Skip overall/updates status change meta-events + if 'overall' in title_lower and ('changed' in title_lower or 'status' in title_lower): + return '_skip', '', '' + if 'updates' in title_lower and ('changed' in title_lower or 'status' in title_lower): + return '_skip', '', '' + + # ── Direct classification by PVE type ── + if pve_type == 'vzdump': + if severity in ('error', 'err'): + return 'backup_fail', 'vm', '' + return 'backup_complete', 'vm', '' + + if pve_type == 'fencing': + return 'split_brain', 'node', '' + + if pve_type == 'replication': + return 'replication_fail', 'vm', '' + + if pve_type == 'package-updates': + return 'update_available', 'node', '' + + if pve_type == 'system-mail': + return 'system_mail', 'node', '' + + # ── Fallback for unknown/empty pve_type ── + # (e.g. test notifications, future PVE event types) + msg_lower = (message or '').lower() + text = f"{title_lower} {msg_lower}" + + if 'vzdump' in text or 'backup' in text: + import re + m = re.search(r'(?:vm|ct)\s+(\d+)', text, re.IGNORECASE) + vmid = m.group(1) if m else '' + if any(w in text for w in ('fail', 'error')): + return 'backup_fail', 'vm', vmid + return 'backup_complete', 'vm', vmid + + if 'replication' in text: + return 'replication_fail', 'vm', '' + + # Generic fallback + return 'system_problem', 'node', '' + + # Old _classify removed -- replaced by _classify_pve above. + + @staticmethod + def _map_severity(raw: str) -> str: + raw_l = str(raw).lower() + if raw_l in ('critical', 'emergency', 'alert', 'crit', 'err', 'error'): + return 'CRITICAL' + if raw_l in ('warning', 'warn', 'notice'): + return 'WARNING' + return 'INFO' diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py new file mode 100644 index 00000000..3b2bed92 --- /dev/null +++ b/AppImage/scripts/notification_manager.py @@ -0,0 +1,1283 @@ +""" +ProxMenux Notification Manager +Central orchestrator for the notification service. 
+ +Connects: +- notification_channels.py (transport: Telegram, Gotify, Discord) +- notification_templates.py (message formatting + optional AI) +- notification_events.py (event detection: Journal, Task, Polling watchers) +- health_persistence.py (DB: config storage, notification_history) + +Two interfaces consume this module: +1. Server mode: Flask imports and calls start()/stop()/send_notification() +2. CLI mode: `python3 notification_manager.py --action send --type vm_fail ...` + Scripts .sh in /usr/local/share/proxmenux/scripts call this directly. + +Author: MacRimi +""" + +import json +import os +import sys +import time +import socket +import sqlite3 +import threading +from queue import Queue, Empty +from datetime import datetime +from typing import Dict, Any, List, Optional +from pathlib import Path + +# Ensure local imports work +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +if BASE_DIR not in sys.path: + sys.path.insert(0, BASE_DIR) + +from notification_channels import create_channel, CHANNEL_TYPES +from notification_templates import ( + render_template, format_with_ai, TEMPLATES, + EVENT_GROUPS, get_event_types_by_group, get_default_enabled_events +) +from notification_events import ( + JournalWatcher, TaskWatcher, PollingCollector, NotificationEvent, + ProxmoxHookWatcher, +) + + +# ─── Constants ──────────────────────────────────────────────────── + +DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db') +SETTINGS_PREFIX = 'notification.' 
+ +# Cooldown defaults (seconds) +DEFAULT_COOLDOWNS = { + 'CRITICAL': 60, # 60s minimum (prevents storm, delivers fast) + 'WARNING': 300, # 5 min + 'INFO': 900, # 15 min + 'resources': 900, # 15 min for resource alerts + 'updates': 86400, # 24h for update notifications +} + + +# ─── Storm Protection ──────────────────────────────────────────── + +GROUP_RATE_LIMITS = { + 'security': {'max_per_minute': 5, 'max_per_hour': 30}, + 'storage': {'max_per_minute': 3, 'max_per_hour': 20}, + 'cluster': {'max_per_minute': 5, 'max_per_hour': 20}, + 'network': {'max_per_minute': 3, 'max_per_hour': 15}, + 'resources': {'max_per_minute': 3, 'max_per_hour': 20}, + 'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60}, + 'backup': {'max_per_minute': 5, 'max_per_hour': 30}, + 'system': {'max_per_minute': 5, 'max_per_hour': 30}, +} + + +class GroupRateLimiter: + """Rate limiter per event group. Prevents notification storms.""" + + def __init__(self): + from collections import deque + self._deque = deque + self._minute_counts: Dict[str, Any] = {} # group -> deque[timestamp] + self._hour_counts: Dict[str, Any] = {} # group -> deque[timestamp] + + def allow(self, group: str) -> bool: + """Check if group rate limit allows this event.""" + limits = GROUP_RATE_LIMITS.get(group, GROUP_RATE_LIMITS['system']) + now = time.time() + + # Initialize if needed + if group not in self._minute_counts: + self._minute_counts[group] = self._deque() + self._hour_counts[group] = self._deque() + + # Prune old entries + minute_q = self._minute_counts[group] + hour_q = self._hour_counts[group] + while minute_q and now - minute_q[0] > 60: + minute_q.popleft() + while hour_q and now - hour_q[0] > 3600: + hour_q.popleft() + + # Check limits + if len(minute_q) >= limits['max_per_minute']: + return False + if len(hour_q) >= limits['max_per_hour']: + return False + + # Record + minute_q.append(now) + hour_q.append(now) + return True + + def get_stats(self) -> Dict[str, Dict[str, int]]: + """Return current rate stats 
per group.""" + now = time.time() + stats = {} + for group in self._minute_counts: + minute_q = self._minute_counts.get(group, []) + hour_q = self._hour_counts.get(group, []) + stats[group] = { + 'last_minute': sum(1 for t in minute_q if now - t <= 60), + 'last_hour': sum(1 for t in hour_q if now - t <= 3600), + } + return stats + + +AGGREGATION_RULES = { + 'auth_fail': {'window': 120, 'min_count': 3, 'burst_type': 'burst_auth_fail'}, + 'ip_block': {'window': 120, 'min_count': 3, 'burst_type': 'burst_ip_block'}, + 'disk_io_error': {'window': 60, 'min_count': 3, 'burst_type': 'burst_disk_io'}, + 'split_brain': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'}, + 'node_disconnect': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'}, +} + + +class BurstAggregator: + """Accumulates similar events in a time window, then sends a single summary. + + Examples: + - "Fail2Ban banned 17 IPs in 2 minutes" + - "Disk I/O errors: 34 events on /dev/sdb in 60s" + """ + + def __init__(self): + self._buckets: Dict[str, List] = {} # bucket_key -> [events] + self._deadlines: Dict[str, float] = {} # bucket_key -> flush_deadline + self._lock = threading.Lock() + + def ingest(self, event: NotificationEvent) -> Optional[NotificationEvent]: + """Add event to aggregation. 
Returns: + - None if event is being buffered (wait for window) + - Original event if not eligible for aggregation + """ + rule = AGGREGATION_RULES.get(event.event_type) + if not rule: + return event # Not aggregable, pass through + + bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}" + + with self._lock: + if bucket_key not in self._buckets: + self._buckets[bucket_key] = [] + self._deadlines[bucket_key] = time.time() + rule['window'] + + self._buckets[bucket_key].append(event) + + # First event in bucket: pass through immediately so user gets fast alert + if len(self._buckets[bucket_key]) == 1: + return event + + # Subsequent events: buffer (will be flushed as summary) + return None + + def flush_expired(self) -> List[NotificationEvent]: + """Flush all buckets past their deadline. Returns summary events.""" + now = time.time() + summaries = [] + + with self._lock: + expired_keys = [k for k, d in self._deadlines.items() if now >= d] + + for key in expired_keys: + events = self._buckets.pop(key, []) + del self._deadlines[key] + + if len(events) < 2: + continue # Single event already sent on ingest, no summary needed + + rule_type = key.split(':')[0] + rule = AGGREGATION_RULES.get(rule_type, {}) + min_count = rule.get('min_count', 2) + + if len(events) < min_count: + continue # Not enough events for a summary + + summary = self._create_summary(events, rule) + if summary: + summaries.append(summary) + + return summaries + + def _create_summary(self, events: List[NotificationEvent], + rule: dict) -> Optional[NotificationEvent]: + """Create a single summary event from multiple events.""" + if not events: + return None + + first = events[0] + # Determine highest severity + sev_order = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2} + max_severity = max(events, key=lambda e: sev_order.get(e.severity, 0)).severity + + # Collect unique entity_ids + entity_ids = list(set(e.entity_id for e in events if e.entity_id)) + entity_list = ', '.join(entity_ids[:10]) if 
entity_ids else 'multiple sources' + if len(entity_ids) > 10: + entity_list += f' (+{len(entity_ids) - 10} more)' + + # Calculate window + window_secs = events[-1].ts_epoch - events[0].ts_epoch + if window_secs < 120: + window_str = f'{int(window_secs)}s' + else: + window_str = f'{int(window_secs / 60)}m' + + burst_type = rule.get('burst_type', 'burst_generic') + + data = { + 'hostname': first.data.get('hostname', socket.gethostname()), + 'count': str(len(events)), + 'window': window_str, + 'entity_list': entity_list, + 'event_type': first.event_type, + } + + return NotificationEvent( + event_type=burst_type, + severity=max_severity, + data=data, + source='aggregator', + entity=first.entity, + entity_id='burst', + ) + + +# ─── Notification Manager ───────────────────────────────────────── + +class NotificationManager: + """Central notification orchestrator. + + Manages channels, event watchers, deduplication, and dispatch. + Can run in server mode (background threads) or CLI mode (one-shot). 
+ """ + + def __init__(self): + self._channels: Dict[str, Any] = {} # channel_name -> channel_instance + self._event_queue: Queue = Queue() + self._running = False + self._config: Dict[str, str] = {} + self._enabled = False + self._lock = threading.Lock() + + # Watchers + self._journal_watcher: Optional[JournalWatcher] = None + self._task_watcher: Optional[TaskWatcher] = None + self._polling_collector: Optional[PollingCollector] = None + self._dispatch_thread: Optional[threading.Thread] = None + + # Webhook receiver (no thread, passive) + self._hook_watcher: Optional[ProxmoxHookWatcher] = None + + # Cooldown tracking: {fingerprint: last_sent_timestamp} + self._cooldowns: Dict[str, float] = {} + + # Storm protection + self._group_limiter = GroupRateLimiter() + self._aggregator = BurstAggregator() + self._aggregation_thread: Optional[threading.Thread] = None + + # Stats + self._stats = { + 'started_at': None, + 'total_sent': 0, + 'total_errors': 0, + 'last_sent_at': None, + } + + # ─── Configuration ────────────────────────────────────────── + + def _load_config(self): + """Load notification settings from the shared SQLite database.""" + self._config = {} + try: + if not DB_PATH.exists(): + return + + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + cursor = conn.cursor() + cursor.execute( + 'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?', + (f'{SETTINGS_PREFIX}%',) + ) + for key, value in cursor.fetchall(): + # Strip prefix for internal use + short_key = key[len(SETTINGS_PREFIX):] + self._config[short_key] = value + conn.close() + except Exception as e: + print(f"[NotificationManager] Failed to load config: {e}") + + # Reconcile per-event toggles with current template defaults. + # If a template's default_enabled was changed (e.g. state_change False), + # but the DB has a stale 'true' from a previous default, fix it now. 
    # Only override if the user hasn't explicitly set it (we track this with
    # a sentinel: if the value came from auto-save of defaults, it may be stale).
        for event_type, tmpl in TEMPLATES.items():
            key = f'event.{event_type}'
            if key in self._config:
                db_val = self._config[key] == 'true'
                tmpl_default = tmpl.get('default_enabled', True)
                # If template says disabled but DB says enabled, AND there's no
                # explicit user marker, enforce the template default.
                if not tmpl_default and db_val:
                    # Check if user explicitly enabled it (look for a marker)
                    # NOTE(review): only the 'event_explicit.' sentinel marks a
                    # user choice -- confirm the UI writes it on manual toggles.
                    marker = f'event_explicit.{event_type}'
                    if marker not in self._config:
                        self._config[key] = 'false'

        # Master on/off switch; disabled by default when the key is absent.
        self._enabled = self._config.get('enabled', 'false') == 'true'
        self._rebuild_channels()

    def _save_setting(self, key: str, value: str):
        """Save a single notification setting to the database.

        Stored under the 'notification.' prefix; the in-memory config
        (un-prefixed key) is updated only when the write succeeds.
        """
        full_key = f'{SETTINGS_PREFIX}{key}'
        now = datetime.now().isoformat()
        try:
            conn = sqlite3.connect(str(DB_PATH), timeout=10)
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute('PRAGMA busy_timeout=5000')
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
                VALUES (?, ?, ?)
+ ''', (full_key, value, now)) + conn.commit() + conn.close() + self._config[key] = value + except Exception as e: + print(f"[NotificationManager] Failed to save setting {key}: {e}") + + def _rebuild_channels(self): + """Rebuild channel instances from current config.""" + self._channels = {} + + for ch_type in CHANNEL_TYPES: + enabled_key = f'{ch_type}.enabled' + if self._config.get(enabled_key) != 'true': + continue + + # Gather config keys for this channel + ch_config = {} + for config_key in CHANNEL_TYPES[ch_type]['config_keys']: + full_key = f'{ch_type}.{config_key}' + ch_config[config_key] = self._config.get(full_key, '') + + channel = create_channel(ch_type, ch_config) + if channel: + valid, err = channel.validate_config() + if valid: + self._channels[ch_type] = channel + else: + print(f"[NotificationManager] Channel {ch_type} invalid: {err}") + + def reload_config(self): + """Reload config from DB without restarting.""" + with self._lock: + self._load_config() + return {'success': True, 'channels': list(self._channels.keys())} + + # ─── Server Mode (Background) ────────────────────────────── + + def start(self): + """Start the notification service in server mode. + + Launches watchers and dispatch loop as daemon threads. + Called by flask_server.py on startup. + """ + if self._running: + return + + self._load_config() + self._load_cooldowns_from_db() + + if not self._enabled: + print("[NotificationManager] Service is disabled. 
Skipping start.") + return + + self._running = True + self._stats['started_at'] = datetime.now().isoformat() + + # Ensure PVE webhook is configured (repairs priv config if missing) + try: + from flask_notification_routes import setup_pve_webhook_core + wh_result = setup_pve_webhook_core() + if wh_result.get('configured'): + print("[NotificationManager] PVE webhook configured OK.") + elif wh_result.get('error'): + print(f"[NotificationManager] PVE webhook warning: {wh_result['error']}") + except ImportError: + pass # flask_notification_routes not loaded yet (early startup) + except Exception as e: + print(f"[NotificationManager] PVE webhook setup error: {e}") + + # Start event watchers + self._journal_watcher = JournalWatcher(self._event_queue) + self._task_watcher = TaskWatcher(self._event_queue) + self._polling_collector = PollingCollector(self._event_queue) + + self._journal_watcher.start() + self._task_watcher.start() + self._polling_collector.start() + + # Start dispatch loop + self._dispatch_thread = threading.Thread( + target=self._dispatch_loop, daemon=True, name='notification-dispatch' + ) + self._dispatch_thread.start() + + print(f"[NotificationManager] Started with channels: {list(self._channels.keys())}") + + def stop(self): + """Stop the notification service cleanly.""" + self._running = False + + if self._journal_watcher: + self._journal_watcher.stop() + if self._task_watcher: + self._task_watcher.stop() + if self._polling_collector: + self._polling_collector.stop() + + print("[NotificationManager] Stopped.") + + def _dispatch_loop(self): + """Main dispatch loop: reads queue -> filters -> formats -> sends -> records.""" + last_cleanup = time.monotonic() + last_flush = time.monotonic() + cleanup_interval = 3600 # Cleanup cooldowns every hour + flush_interval = 5 # Flush aggregation buckets every 5s + + while self._running: + try: + event = self._event_queue.get(timeout=2) + except Empty: + # Periodic maintenance during idle + now_mono = time.monotonic() 
+ if now_mono - last_cleanup > cleanup_interval: + self._cleanup_old_cooldowns() + last_cleanup = now_mono + # Flush expired aggregation buckets + if now_mono - last_flush > flush_interval: + self._flush_aggregation() + last_flush = now_mono + continue + + try: + self._process_event(event) + except Exception as e: + print(f"[NotificationManager] Dispatch error: {e}") + + # Also flush aggregation after each event + if time.monotonic() - last_flush > flush_interval: + self._flush_aggregation() + last_flush = time.monotonic() + + def _flush_aggregation(self): + """Flush expired aggregation buckets and dispatch summaries.""" + try: + summaries = self._aggregator.flush_expired() + for summary_event in summaries: + # Burst summaries bypass aggregator but still pass cooldown + rate limit + self._process_event_direct(summary_event) + except Exception as e: + print(f"[NotificationManager] Aggregation flush error: {e}") + + def _process_event(self, event: NotificationEvent): + """Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch.""" + if not self._enabled: + return + + # Check if this event's GROUP is enabled in settings. + # The UI saves categories by group key: events.vm_ct, events.backup, etc. + template = TEMPLATES.get(event.event_type, {}) + event_group = template.get('group', 'system') + group_setting = f'events.{event_group}' + if self._config.get(group_setting, 'true') == 'false': + return + + # Check if this SPECIFIC event type is enabled (granular per-event toggle). + # Key format: event.{event_type} = "true"/"false" + # Default comes from the template's default_enabled field. + default_enabled = 'true' if template.get('default_enabled', True) else 'false' + event_specific = f'event.{event.event_type}' + if self._config.get(event_specific, default_enabled) == 'false': + return + + # Check severity filter. + # The UI saves severity_filter as: "all", "warning", "critical". + # Map to our internal severity names for comparison. 
+ severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'} + raw_filter = self._config.get('severity_filter', 'all') + min_severity = severity_map.get(raw_filter.lower(), 'INFO') + if not self._meets_severity(event.severity, min_severity): + return + + # Try aggregation (may buffer the event) + result = self._aggregator.ingest(event) + if result is None: + return # Buffered, will be flushed as summary later + event = result # Use original event (first in burst passes through) + + # From here, proceed with dispatch (shared with _process_event_direct) + self._dispatch_event(event) + + def _process_event_direct(self, event: NotificationEvent): + """Process a burst summary event. Bypasses aggregator but applies ALL other filters.""" + if not self._enabled: + return + + # Check group filter (same as _process_event) + template = TEMPLATES.get(event.event_type, {}) + event_group = template.get('group', 'system') + group_setting = f'events.{event_group}' + if self._config.get(group_setting, 'true') == 'false': + return + + # Check per-event filter (same as _process_event) + default_enabled = 'true' if template.get('default_enabled', True) else 'false' + event_specific = f'event.{event.event_type}' + if self._config.get(event_specific, default_enabled) == 'false': + return + + # Check severity filter (same mapping as _process_event) + severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'} + raw_filter = self._config.get('severity_filter', 'all') + min_severity = severity_map.get(raw_filter.lower(), 'INFO') + if not self._meets_severity(event.severity, min_severity): + return + + self._dispatch_event(event) + + def _dispatch_event(self, event: NotificationEvent): + """Shared dispatch pipeline: cooldown -> rate limit -> render -> send.""" + # Check cooldown + if not self._check_cooldown(event): + return + + # Check group rate limit + template = TEMPLATES.get(event.event_type, {}) + group = template.get('group', 'system') + if not 
self._group_limiter.allow(group): + return + + # Use the properly mapped severity from the event, not from template defaults. + # event.severity was set by _map_severity which normalises to CRITICAL/WARNING/INFO. + severity = event.severity + + # Inject the canonical severity into data so templates see it too. + event.data['severity'] = severity + + # Render message from template (structured output) + rendered = render_template(event.event_type, event.data) + + # Optional AI enhancement (on text body only) + ai_config = { + 'enabled': self._config.get('ai_enabled', 'false'), + 'provider': self._config.get('ai_provider', ''), + 'api_key': self._config.get('ai_api_key', ''), + 'model': self._config.get('ai_model', ''), + } + body = format_with_ai( + rendered['title'], rendered['body'], severity, ai_config + ) + + # Enrich data with structured fields for channels that support them + enriched_data = dict(event.data) + enriched_data['_rendered_fields'] = rendered.get('fields', []) + enriched_data['_body_html'] = rendered.get('body_html', '') + + # Send through all active channels + self._dispatch_to_channels( + rendered['title'], body, severity, + event.event_type, enriched_data, event.source + ) + + def _dispatch_to_channels(self, title: str, body: str, severity: str, + event_type: str, data: Dict, source: str): + """Send notification through all configured channels.""" + with self._lock: + channels = dict(self._channels) + + for ch_name, channel in channels.items(): + try: + result = channel.send(title, body, severity, data) + self._record_history( + event_type, ch_name, title, body, severity, + result.get('success', False), + result.get('error', ''), + source + ) + + if result.get('success'): + self._stats['total_sent'] += 1 + self._stats['last_sent_at'] = datetime.now().isoformat() + else: + self._stats['total_errors'] += 1 + print(f"[NotificationManager] Send failed ({ch_name}): {result.get('error')}") + + except Exception as e: + self._stats['total_errors'] += 1 + 
self._record_history( + event_type, ch_name, title, body, severity, + False, str(e), source + ) + + # ─── Cooldown / Dedup ─────────────────────────────────────── + + def _check_cooldown(self, event: NotificationEvent) -> bool: + """Check if the event passes cooldown rules.""" + now = time.time() + + # Determine cooldown period + template = TEMPLATES.get(event.event_type, {}) + group = template.get('group', 'system') + + # Priority: per-type config > per-severity > default + cooldown_key = f'cooldown.{event.event_type}' + cooldown_str = self._config.get(cooldown_key) + + if cooldown_str is None: + cooldown_key_group = f'cooldown.{group}' + cooldown_str = self._config.get(cooldown_key_group) + + if cooldown_str is not None: + cooldown = int(cooldown_str) + else: + cooldown = DEFAULT_COOLDOWNS.get(event.severity, 300) + + # CRITICAL events: 60s minimum cooldown (prevents storm, but delivers fast) + if event.severity == 'CRITICAL' and cooldown_str is None: + cooldown = 60 + + # Backup/replication events: each execution is unique and should + # always be delivered. A 10s cooldown prevents exact duplicates + # (webhook + tasks) but allows repeated backup jobs to report. + _ALWAYS_DELIVER = {'backup_complete', 'backup_fail', 'backup_start', + 'replication_complete', 'replication_fail'} + if event.event_type in _ALWAYS_DELIVER and cooldown_str is None: + cooldown = 10 + + # VM/CT state changes are real user actions that should always be + # delivered. Each start/stop/shutdown is a distinct event. A 5s + # cooldown prevents exact duplicates from concurrent watchers. + _STATE_EVENTS = { + 'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart', + 'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart', + 'vm_fail', 'ct_fail', + } + if event.event_type in _STATE_EVENTS and cooldown_str is None: + cooldown = 5 + + # System shutdown/reboot must be delivered immediately -- the node + # is going down and there may be only seconds to send the message. 
+ _URGENT_EVENTS = {'system_shutdown', 'system_reboot'} + if event.event_type in _URGENT_EVENTS and cooldown_str is None: + cooldown = 5 + + # Check against last sent time using stable fingerprint + last_sent = self._cooldowns.get(event.fingerprint, 0) + + if now - last_sent < cooldown: + return False + + self._cooldowns[event.fingerprint] = now + self._persist_cooldown(event.fingerprint, now) + return True + + def _load_cooldowns_from_db(self): + """Load persistent cooldown state from SQLite (up to 48h).""" + try: + if not DB_PATH.exists(): + return + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + cursor = conn.cursor() + cursor.execute('SELECT fingerprint, last_sent_ts FROM notification_last_sent') + now = time.time() + for fp, ts in cursor.fetchall(): + if now - ts < 172800: # 48h window + self._cooldowns[fp] = ts + conn.close() + except Exception as e: + print(f"[NotificationManager] Failed to load cooldowns: {e}") + + def _persist_cooldown(self, fingerprint: str, ts: float): + """Save cooldown timestamp to SQLite for restart persistence.""" + try: + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + conn.execute(''' + INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count) + VALUES (?, ?, COALESCE( + (SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1 + )) + ''', (fingerprint, int(ts), fingerprint)) + conn.commit() + conn.close() + except Exception: + pass # Non-critical, in-memory cooldown still works + + def _cleanup_old_cooldowns(self): + """Remove cooldown entries older than 48h from both memory and DB.""" + cutoff = time.time() - 172800 # 48h + self._cooldowns = {k: v for k, v in self._cooldowns.items() if v > cutoff} + try: + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('DELETE FROM notification_last_sent WHERE 
last_sent_ts < ?', (int(cutoff),)) + conn.commit() + conn.close() + except Exception: + pass + + @staticmethod + def _meets_severity(event_severity: str, min_severity: str) -> bool: + """Check if event severity meets the minimum threshold.""" + levels = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2} + return levels.get(event_severity, 0) >= levels.get(min_severity, 0) + + # ─── History Recording ────────────────────────────────────── + + def _record_history(self, event_type: str, channel: str, title: str, + message: str, severity: str, success: bool, + error_message: str, source: str): + """Record a notification attempt in the history table.""" + try: + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO notification_history + (event_type, channel, title, message, severity, sent_at, success, error_message, source) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', ( + event_type, channel, title, message[:500], severity, + datetime.now().isoformat(), 1 if success else 0, + error_message[:500] if error_message else None, source + )) + conn.commit() + conn.close() + except Exception as e: + print(f"[NotificationManager] History record error: {e}") + + # ─── Public API (used by Flask routes and CLI) ────────────── + + def send_notification(self, event_type: str, severity: str, + title: str, message: str, + data: Optional[Dict] = None, + source: str = 'api') -> Dict[str, Any]: + """Send a notification directly (bypasses queue and cooldown). + + Used by CLI and API for explicit sends. 
+ """ + if not self._channels: + self._load_config() + + if not self._channels: + return { + 'success': False, + 'error': 'No channels configured or enabled', + 'channels_sent': [], + } + + # Render template if available + if event_type in TEMPLATES and not message: + rendered = render_template(event_type, data or {}) + title = title or rendered['title'] + message = rendered['body'] + severity = severity or rendered['severity'] + + # AI enhancement + ai_config = { + 'enabled': self._config.get('ai_enabled', 'false'), + 'provider': self._config.get('ai_provider', ''), + 'api_key': self._config.get('ai_api_key', ''), + 'model': self._config.get('ai_model', ''), + } + message = format_with_ai(title, message, severity, ai_config) + + results = {} + channels_sent = [] + errors = [] + + with self._lock: + channels = dict(self._channels) + + for ch_name, channel in channels.items(): + try: + result = channel.send(title, message, severity, data) + results[ch_name] = result + + self._record_history( + event_type, ch_name, title, message, severity, + result.get('success', False), + result.get('error', ''), + source + ) + + if result.get('success'): + channels_sent.append(ch_name) + else: + errors.append(f"{ch_name}: {result.get('error')}") + except Exception as e: + errors.append(f"{ch_name}: {str(e)}") + + return { + 'success': len(channels_sent) > 0, + 'channels_sent': channels_sent, + 'errors': errors, + 'total_channels': len(channels), + } + + def send_raw(self, title: str, message: str, + severity: str = 'INFO', + source: str = 'api') -> Dict[str, Any]: + """Send a raw message without template (for custom scripts).""" + return self.send_notification( + 'custom', severity, title, message, source=source + ) + + def test_channel(self, channel_name: str = 'all') -> Dict[str, Any]: + """Test one or all configured channels.""" + if not self._channels: + self._load_config() + + if not self._channels: + return {'success': False, 'error': 'No channels configured'} + + results = 
{} + + if channel_name == 'all': + targets = dict(self._channels) + elif channel_name in self._channels: + targets = {channel_name: self._channels[channel_name]} + else: + # Try to create channel from config even if not enabled + ch_config = {} + for config_key in CHANNEL_TYPES.get(channel_name, {}).get('config_keys', []): + ch_config[config_key] = self._config.get(f'{channel_name}.{config_key}', '') + + channel = create_channel(channel_name, ch_config) + if channel: + targets = {channel_name: channel} + else: + return {'success': False, 'error': f'Channel {channel_name} not configured'} + + for ch_name, channel in targets.items(): + success, error = channel.test() + results[ch_name] = {'success': success, 'error': error} + + self._record_history( + 'test', ch_name, 'ProxMenux Test', + 'Test notification', 'INFO', + success, error, 'api' + ) + + overall_success = any(r['success'] for r in results.values()) + return { + 'success': overall_success, + 'results': results, + } + + # ─── Proxmox Webhook ────────────────────────────────────────── + + def process_webhook(self, payload: dict) -> dict: + """Process incoming Proxmox webhook. 
Delegates to ProxmoxHookWatcher.""" + if not self._hook_watcher: + self._hook_watcher = ProxmoxHookWatcher(self._event_queue) + return self._hook_watcher.process_webhook(payload) + + def get_webhook_secret(self) -> str: + """Get configured webhook secret, or empty string if none.""" + if not self._config: + self._load_config() + return self._config.get('webhook_secret', '') + + def get_webhook_allowed_ips(self) -> list: + """Get list of allowed IPs for webhook, or empty list (allow all).""" + if not self._config: + self._load_config() + raw = self._config.get('webhook_allowed_ips', '') + if not raw: + return [] + return [ip.strip() for ip in str(raw).split(',') if ip.strip()] + + # ─── Status & Settings ────────────────────────────────────── + + def get_status(self) -> Dict[str, Any]: + """Get current service status.""" + if not self._config: + self._load_config() + + return { + 'enabled': self._enabled, + 'running': self._running, + 'channels': { + name: { + 'type': name, + 'connected': True, + } + for name in self._channels + }, + 'stats': self._stats, + 'watchers': { + 'journal': self._journal_watcher is not None and self._running, + 'task': self._task_watcher is not None and self._running, + 'polling': self._polling_collector is not None and self._running, + }, + } + + def set_enabled(self, enabled: bool) -> Dict[str, Any]: + """Enable or disable the notification service.""" + self._save_setting('enabled', 'true' if enabled else 'false') + self._enabled = enabled + + if enabled and not self._running: + self.start() + elif not enabled and self._running: + self.stop() + + return {'success': True, 'enabled': enabled} + + def list_channels(self) -> Dict[str, Any]: + """List all channel types with their configuration status.""" + if not self._config: + self._load_config() + + channels_info = {} + for ch_type, info in CHANNEL_TYPES.items(): + enabled = self._config.get(f'{ch_type}.enabled', 'false') == 'true' + configured = all( + 
bool(self._config.get(f'{ch_type}.{k}', '')) + for k in info['config_keys'] + ) + channels_info[ch_type] = { + 'name': info['name'], + 'enabled': enabled, + 'configured': configured, + 'active': ch_type in self._channels, + } + + return {'channels': channels_info} + + def get_history(self, limit: int = 50, offset: int = 0, + severity: str = '', channel: str = '') -> Dict[str, Any]: + """Get notification history with optional filters.""" + try: + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + query = 'SELECT * FROM notification_history WHERE 1=1' + params: list = [] + + if severity: + query += ' AND severity = ?' + params.append(severity) + if channel: + query += ' AND channel = ?' + params.append(channel) + + query += ' ORDER BY sent_at DESC LIMIT ? OFFSET ?' + params.extend([limit, offset]) + + cursor.execute(query, params) + rows = [dict(row) for row in cursor.fetchall()] + + # Get total count + count_query = 'SELECT COUNT(*) FROM notification_history WHERE 1=1' + count_params: list = [] + if severity: + count_query += ' AND severity = ?' + count_params.append(severity) + if channel: + count_query += ' AND channel = ?' 
+ count_params.append(channel) + + cursor.execute(count_query, count_params) + total = cursor.fetchone()[0] + + conn.close() + + return { + 'history': rows, + 'total': total, + 'limit': limit, + 'offset': offset, + } + except Exception as e: + return {'history': [], 'total': 0, 'error': str(e)} + + def clear_history(self) -> Dict[str, Any]: + """Clear all notification history.""" + try: + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + conn.execute('DELETE FROM notification_history') + conn.commit() + conn.close() + return {'success': True} + except Exception as e: + return {'success': False, 'error': str(e)} + + def get_settings(self) -> Dict[str, Any]: + """Get all notification settings for the UI. + + Returns a structure matching the frontend's NotificationConfig shape + so the round-trip (GET -> edit -> POST) is seamless. + """ + if not self._config: + self._load_config() + + # Build nested channels object matching frontend ChannelConfig + channels = {} + for ch_type, info in CHANNEL_TYPES.items(): + ch_cfg: Dict[str, Any] = { + 'enabled': self._config.get(f'{ch_type}.enabled', 'false') == 'true', + } + for config_key in info['config_keys']: + ch_cfg[config_key] = self._config.get(f'{ch_type}.{config_key}', '') + channels[ch_type] = ch_cfg + + # Build event_categories dict (group-level toggle) + # EVENT_GROUPS is a dict: { 'system': {...}, 'vm_ct': {...}, ... } + event_categories = {} + for group_key in EVENT_GROUPS: + event_categories[group_key] = self._config.get(f'events.{group_key}', 'true') == 'true' + + # Build per-event toggles: { 'vm_start': true, 'vm_stop': false, ... 
} + event_toggles = {} + for event_type, tmpl in TEMPLATES.items(): + default = tmpl.get('default_enabled', True) + saved = self._config.get(f'event.{event_type}', None) + if saved is not None: + event_toggles[event_type] = saved == 'true' + else: + event_toggles[event_type] = default + + # Build event_types_by_group for UI rendering + event_types_by_group = get_event_types_by_group() + + config = { + 'enabled': self._enabled, + 'channels': channels, + 'severity_filter': self._config.get('severity_filter', 'all'), + 'event_categories': event_categories, + 'event_toggles': event_toggles, + 'event_types_by_group': event_types_by_group, + 'ai_enabled': self._config.get('ai_enabled', 'false') == 'true', + 'ai_provider': self._config.get('ai_provider', 'openai'), + 'ai_api_key': self._config.get('ai_api_key', ''), + 'ai_model': self._config.get('ai_model', ''), + 'hostname': self._config.get('hostname', ''), + 'webhook_secret': self._config.get('webhook_secret', ''), + 'webhook_allowed_ips': self._config.get('webhook_allowed_ips', ''), + 'pbs_host': self._config.get('pbs_host', ''), + 'pve_host': self._config.get('pve_host', ''), + 'pbs_trusted_sources': self._config.get('pbs_trusted_sources', ''), + } + + return { + 'success': True, + 'config': config, + } + + def save_settings(self, settings: Dict[str, str]) -> Dict[str, Any]: + """Save multiple notification settings at once.""" + try: + conn = sqlite3.connect(str(DB_PATH), timeout=10) + conn.execute('PRAGMA journal_mode=WAL') + conn.execute('PRAGMA busy_timeout=5000') + cursor = conn.cursor() + now = datetime.now().isoformat() + + for key, value in settings.items(): + # Accept both prefixed and unprefixed keys + full_key = key if key.startswith(SETTINGS_PREFIX) else f'{SETTINGS_PREFIX}{key}' + short_key = full_key[len(SETTINGS_PREFIX):] + + cursor.execute(''' + INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) + VALUES (?, ?, ?) 
+ ''', (full_key, str(value), now)) + + self._config[short_key] = str(value) + + # If user is explicitly enabling an event that defaults to disabled, + # mark it so _load_config reconciliation won't override it later. + if short_key.startswith('event.') and str(value) == 'true': + event_type = short_key[6:] # strip 'event.' + tmpl = TEMPLATES.get(event_type, {}) + if not tmpl.get('default_enabled', True): + marker_key = f'{SETTINGS_PREFIX}event_explicit.{event_type}' + cursor.execute(''' + INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) + VALUES (?, ?, ?) + ''', (marker_key, 'true', now)) + self._config[f'event_explicit.{event_type}'] = 'true' + + conn.commit() + conn.close() + + # Rebuild channels with new config + was_enabled = self._enabled + self._enabled = self._config.get('enabled', 'false') == 'true' + self._rebuild_channels() + + # Start/stop service and auto-configure PVE webhook + pve_webhook_result = None + if self._enabled and not was_enabled: + # Notifications just got ENABLED -> start service + setup PVE webhook + if not self._running: + self.start() + try: + from flask_notification_routes import setup_pve_webhook_core + pve_webhook_result = setup_pve_webhook_core() + except ImportError: + pass # flask_notification_routes not available (CLI mode) + except Exception as e: + pve_webhook_result = {'configured': False, 'error': str(e)} + elif not self._enabled and was_enabled: + # Notifications just got DISABLED -> stop service + cleanup PVE webhook + if self._running: + self.stop() + try: + from flask_notification_routes import cleanup_pve_webhook_core + cleanup_pve_webhook_core() + except ImportError: + pass + except Exception: + pass + + result = {'success': True, 'channels_active': list(self._channels.keys())} + if pve_webhook_result: + result['pve_webhook'] = pve_webhook_result + return result + except Exception as e: + return {'success': False, 'error': str(e)} + + +# ─── Singleton (for server mode) 
───────────────────────────────── + +notification_manager = NotificationManager() + + +# ─── CLI Interface ──────────────────────────────────────────────── + +def _print_result(result: Dict, as_json: bool): + """Print CLI result in human-readable or JSON format.""" + if as_json: + print(json.dumps(result, indent=2, default=str)) + return + + if result.get('success'): + print(f"OK: ", end='') + elif 'success' in result and not result['success']: + print(f"ERROR: ", end='') + + # Format based on content + if 'channels_sent' in result: + sent = result.get('channels_sent', []) + print(f"Sent via: {', '.join(sent) if sent else 'none'}") + if result.get('errors'): + for err in result['errors']: + print(f" Error: {err}") + elif 'results' in result: + for ch, r in result['results'].items(): + status = 'OK' if r['success'] else f"FAILED: {r['error']}" + print(f" {ch}: {status}") + elif 'channels' in result: + for ch, info in result['channels'].items(): + status = 'active' if info.get('active') else ('configured' if info.get('configured') else 'not configured') + enabled = 'enabled' if info.get('enabled') else 'disabled' + print(f" {info['name']}: {enabled}, {status}") + elif 'enabled' in result and 'running' in result: + print(f"Enabled: {result['enabled']}, Running: {result['running']}") + if result.get('stats'): + stats = result['stats'] + print(f" Total sent: {stats.get('total_sent', 0)}") + print(f" Total errors: {stats.get('total_errors', 0)}") + if stats.get('last_sent_at'): + print(f" Last sent: {stats['last_sent_at']}") + elif 'enabled' in result: + print(f"Service {'enabled' if result['enabled'] else 'disabled'}") + else: + print(json.dumps(result, indent=2, default=str)) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='ProxMenux Notification Manager CLI', + epilog='Example: python3 notification_manager.py --action send --type vm_fail --severity CRITICAL --title "VM 100 failed" --message "QEMU process crashed"' 
+ ) + parser.add_argument('--action', required=True, + choices=['send', 'send-raw', 'test', 'status', + 'enable', 'disable', 'list-channels'], + help='Action to perform') + parser.add_argument('--type', help='Event type for send action (e.g. vm_fail, backup_complete)') + parser.add_argument('--severity', default='INFO', + choices=['INFO', 'WARNING', 'CRITICAL'], + help='Notification severity (default: INFO)') + parser.add_argument('--title', help='Notification title') + parser.add_argument('--message', help='Notification message body') + parser.add_argument('--channel', default='all', + help='Specific channel for test (default: all)') + parser.add_argument('--json', action='store_true', + help='Output result as JSON') + + args = parser.parse_args() + + mgr = NotificationManager() + mgr._load_config() + + if args.action == 'send': + if not args.type: + parser.error('--type is required for send action') + result = mgr.send_notification( + args.type, args.severity, + args.title or '', args.message or '', + data={ + 'hostname': socket.gethostname().split('.')[0], + 'reason': args.message or '', + }, + source='cli' + ) + + elif args.action == 'send-raw': + if not args.title or not args.message: + parser.error('--title and --message are required for send-raw') + result = mgr.send_raw(args.title, args.message, args.severity, source='cli') + + elif args.action == 'test': + result = mgr.test_channel(args.channel) + + elif args.action == 'status': + result = mgr.get_status() + + elif args.action == 'enable': + result = mgr.set_enabled(True) + + elif args.action == 'disable': + result = mgr.set_enabled(False) + + elif args.action == 'list-channels': + result = mgr.list_channels() + + else: + result = {'error': f'Unknown action: {args.action}'} + + _print_result(result, args.json) + + # Exit with appropriate code + sys.exit(0 if result.get('success', True) else 1) diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py new file mode 
100644 index 00000000..55371f45 --- /dev/null +++ b/AppImage/scripts/notification_templates.py @@ -0,0 +1,958 @@ +""" +ProxMenux Notification Templates +Message templates for all event types with per-channel formatting. + +Templates use Python str.format() variables: + {hostname}, {severity}, {category}, {reason}, {summary}, + {previous}, {current}, {vmid}, {vmname}, {timestamp}, etc. + +Optional AI enhancement enriches messages with context/suggestions. + +Author: MacRimi +""" + +import json +import re +import socket +import time +import urllib.request +import urllib.error +from typing import Dict, Any, Optional, List + + +# ─── vzdump message parser ─────────────────────────────────────── + +def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: + """Parse a PVE vzdump notification message into structured data. + + Supports two formats: + 1. Local storage: table with columns VMID Name Status Time Size Filename + 2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)' + and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X' + + Returns dict with 'vms' list, 'total_time', 'total_size', or None. + """ + if not message: + return None + + vms: List[Dict[str, str]] = [] + total_time = '' + total_size = '' + + lines = message.split('\n') + + # ── Strategy 1: classic table (local/NFS/CIFS storage) ── + header_idx = -1 + for i, line in enumerate(lines): + if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE): + header_idx = i + break + + if header_idx >= 0: + # Use column positions from the header to slice each row. 
+ # Header: "VMID Name Status Time Size Filename" + header = lines[header_idx] + col_starts = [] + for col_name in ['VMID', 'Name', 'Status', 'Time', 'Size', 'Filename']: + idx = header.find(col_name) + if idx >= 0: + col_starts.append(idx) + + if len(col_starts) == 6: + for line in lines[header_idx + 1:]: + stripped = line.strip() + if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='): + break + # Pad line to avoid index errors + padded = line.ljust(col_starts[-1] + 50) + vmid = padded[col_starts[0]:col_starts[1]].strip() + name = padded[col_starts[1]:col_starts[2]].strip() + status = padded[col_starts[2]:col_starts[3]].strip() + time_val = padded[col_starts[3]:col_starts[4]].strip() + size = padded[col_starts[4]:col_starts[5]].strip() + filename = padded[col_starts[5]:].strip() + + if vmid and vmid.isdigit(): + vms.append({ + 'vmid': vmid, + 'name': name, + 'status': status, + 'time': time_val, + 'size': size, + 'filename': filename, + }) + + # ── Strategy 2: log-style (PBS / Proxmox Backup Server) ── + # Parse from the full vzdump log lines. + # Look for patterns: + # "Starting Backup of VM NNN (lxc/qemu)" -> detect guest + # "CT Name: xxx" or "VM Name: xxx" -> guest name + # "Finished Backup of VM NNN (HH:MM:SS)" -> duration + status=ok + # "root.pxar: had to backup X of Y" -> size (CT) + # "transferred X in N seconds" -> size (QEMU) + # "creating ... 
archive 'ct/100/2026-..'" -> archive name for PBS + # "TASK ERROR:" or "ERROR:" -> status=error + if not vms: + current_vm: Optional[Dict[str, str]] = None + + for line in lines: + # Remove "INFO: " prefix that PVE adds + clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip()) + + # Start of a new VM backup + m_start = re.match( + r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean) + if m_start: + if current_vm: + vms.append(current_vm) + current_vm = { + 'vmid': m_start.group(1), + 'name': '', + 'status': 'ok', + 'time': '', + 'size': '', + 'filename': '', + 'type': m_start.group(2), + } + continue + + if current_vm: + # Guest name + m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean) + if m_name: + current_vm['name'] = m_name.group(1).strip() + continue + + # PBS archive path -> extract as filename + m_archive = re.search( + r"creating .+ archive '([^']+)'", clean) + if m_archive: + current_vm['filename'] = m_archive.group(1) + continue + + # Size for containers (pxar) + m_pxar = re.search( + r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean) + if m_pxar: + current_vm['size'] = m_pxar.group(1) + continue + + # Size for QEMU (transferred) + m_transfer = re.search( + r'transferred\s+([\d.]+\s+\S+)', clean) + if m_transfer: + current_vm['size'] = m_transfer.group(1) + continue + + # Finished -> duration + m_finish = re.match( + r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean) + if m_finish: + current_vm['time'] = m_finish.group(2) + current_vm['status'] = 'ok' + vms.append(current_vm) + current_vm = None + continue + + # Error + if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'): + if current_vm: + current_vm['status'] = 'error' + + # Don't forget the last VM if it wasn't finished + if current_vm: + vms.append(current_vm) + + # ── Extract totals ── + for line in lines: + m_time = re.search(r'Total running time:\s*(.+)', line) + if m_time: + total_time = m_time.group(1).strip() + m_size = re.search(r'Total size:\s*(.+)', line) + if m_size: + 
total_size = m_size.group(1).strip() + + # For PBS: calculate total size if not explicitly stated + if not total_size and vms: + # Sum individual sizes if they share units + sizes_gib = 0.0 + for vm in vms: + s = vm.get('size', '') + m = re.match(r'([\d.]+)\s+(.*)', s) + if m: + val = float(m.group(1)) + unit = m.group(2).strip().upper() + if 'GIB' in unit or 'GB' in unit: + sizes_gib += val + elif 'MIB' in unit or 'MB' in unit: + sizes_gib += val / 1024 + elif 'TIB' in unit or 'TB' in unit: + sizes_gib += val * 1024 + if sizes_gib > 0: + if sizes_gib >= 1024: + total_size = f"{sizes_gib / 1024:.3f} TiB" + elif sizes_gib >= 1: + total_size = f"{sizes_gib:.3f} GiB" + else: + total_size = f"{sizes_gib * 1024:.3f} MiB" + + # For PBS: calculate total time if not stated + if not total_time and vms: + total_secs = 0 + for vm in vms: + t = vm.get('time', '') + # Parse HH:MM:SS format + m = re.match(r'(\d+):(\d+):(\d+)', t) + if m: + total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3)) + if total_secs > 0: + hours = total_secs // 3600 + mins = (total_secs % 3600) // 60 + secs = total_secs % 60 + if hours: + total_time = f"{hours}h {mins}m {secs}s" + elif mins: + total_time = f"{mins}m {secs}s" + else: + total_time = f"{secs}s" + + if not vms and not total_size: + return None + + return { + 'vms': vms, + 'total_time': total_time, + 'total_size': total_size, + 'vm_count': len(vms), + } + + +def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str: + """Format parsed vzdump data into a clean Telegram-friendly message.""" + parts = [] + + for vm in parsed.get('vms', []): + status = vm.get('status', '').lower() + icon = '\u2705' if status == 'ok' else '\u274C' + + parts.append(f"{icon} ID {vm['vmid']} ({vm['name']})") + + details = [] + if vm.get('size'): + details.append(f"Size: {vm['size']}") + if vm.get('time'): + details.append(f"Duration: {vm['time']}") + if vm.get('filename'): + fname = vm['filename'] + # PBS archives look like 
"ct/100/2026-..." or "vm/105/2026-..." + if re.match(r'^(?:ct|vm)/\d+/', fname): + details.append(f"PBS: {fname}") + else: + details.append(f"File: {fname}") + if details: + parts.append(' | '.join(details)) + parts.append('') # blank line between VMs + + # Summary + vm_count = parsed.get('vm_count', 0) + if vm_count > 0 or parsed.get('total_size'): + ok_count = sum(1 for v in parsed.get('vms', []) + if v.get('status', '').lower() == 'ok') + fail_count = vm_count - ok_count + + summary_parts = [] + if vm_count: + summary_parts.append(f"{vm_count} backup(s)") + if fail_count: + summary_parts.append(f"{fail_count} failed") + if parsed.get('total_size'): + summary_parts.append(f"Total: {parsed['total_size']}") + if parsed.get('total_time'): + summary_parts.append(f"Time: {parsed['total_time']}") + + if summary_parts: + parts.append('--- ' + ' | '.join(summary_parts)) + + return '\n'.join(parts) + + +# ─── Severity Icons ────────────────────────────────────────────── + +SEVERITY_ICONS = { + 'CRITICAL': '\U0001F534', + 'WARNING': '\U0001F7E1', + 'INFO': '\U0001F535', + 'OK': '\U0001F7E2', + 'UNKNOWN': '\u26AA', +} + +SEVERITY_ICONS_DISCORD = { + 'CRITICAL': ':red_circle:', + 'WARNING': ':yellow_circle:', + 'INFO': ':blue_circle:', + 'OK': ':green_circle:', + 'UNKNOWN': ':white_circle:', +} + + +# ─── Event Templates ───────────────────────────────────────────── +# Each template has a 'title' and 'body' with {variable} placeholders. +# 'group' is used for UI event filter grouping. +# 'default_enabled' controls initial state in settings. + +TEMPLATES = { + # ── Health Monitor state changes ── + # NOTE: state_change is disabled by default -- it fires on every + # status oscillation (OK->WARNING->OK) which creates noise. + # The health_persistent and new_error templates cover this better. 
+ 'state_change': { + 'title': '{hostname}: {category} changed to {current}', + 'body': '{category} status changed from {previous} to {current}.\n{reason}', + 'group': 'system', + 'default_enabled': False, + }, + 'new_error': { + 'title': '{hostname}: New {severity} - {category}', + 'body': '{reason}', + 'group': 'system', + 'default_enabled': True, + }, + 'error_resolved': { + 'title': '{hostname}: Resolved - {category}', + 'body': '{reason}\nDuration: {duration}', + 'group': 'system', + 'default_enabled': True, + }, + 'error_escalated': { + 'title': '{hostname}: Escalated to {severity} - {category}', + 'body': '{reason}', + 'group': 'system', + 'default_enabled': True, + }, + + # ── VM / CT events ── + 'vm_start': { + 'title': '{hostname}: VM {vmid} started', + 'body': '{vmname} ({vmid}) has been started.', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'vm_stop': { + 'title': '{hostname}: VM {vmid} stopped', + 'body': '{vmname} ({vmid}) has been stopped.', + 'group': 'vm_ct', + 'default_enabled': False, + }, + 'vm_shutdown': { + 'title': '{hostname}: VM {vmid} shutdown', + 'body': '{vmname} ({vmid}) has been shut down.', + 'group': 'vm_ct', + 'default_enabled': False, + }, + 'vm_fail': { + 'title': '{hostname}: VM {vmid} FAILED', + 'body': '{vmname} ({vmid}) has failed.\n{reason}', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'vm_restart': { + 'title': '{hostname}: VM {vmid} restarted', + 'body': '{vmname} ({vmid}) has been restarted.', + 'group': 'vm_ct', + 'default_enabled': False, + }, + 'ct_start': { + 'title': '{hostname}: CT {vmid} started', + 'body': '{vmname} ({vmid}) has been started.', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'ct_stop': { + 'title': '{hostname}: CT {vmid} stopped', + 'body': '{vmname} ({vmid}) has been stopped.', + 'group': 'vm_ct', + 'default_enabled': False, + }, + 'ct_shutdown': { + 'title': '{hostname}: CT {vmid} shutdown', + 'body': '{vmname} ({vmid}) has been shut down.', + 'group': 'vm_ct', + 
'default_enabled': False, + }, + 'ct_restart': { + 'title': '{hostname}: CT {vmid} restarted', + 'body': '{vmname} ({vmid}) has been restarted.', + 'group': 'vm_ct', + 'default_enabled': False, + }, + 'ct_fail': { + 'title': '{hostname}: CT {vmid} FAILED', + 'body': '{vmname} ({vmid}) has failed.\n{reason}', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'migration_start': { + 'title': '{hostname}: Migration started - {vmid}', + 'body': '{vmname} ({vmid}) migration to {target_node} started.', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'migration_complete': { + 'title': '{hostname}: Migration complete - {vmid}', + 'body': '{vmname} ({vmid}) migrated successfully to {target_node}.', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'migration_fail': { + 'title': '{hostname}: Migration FAILED - {vmid}', + 'body': '{vmname} ({vmid}) migration to {target_node} failed.\n{reason}', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'replication_fail': { + 'title': '{hostname}: Replication FAILED - {vmid}', + 'body': 'Replication of {vmname} ({vmid}) has failed.\n{reason}', + 'group': 'vm_ct', + 'default_enabled': True, + }, + 'replication_complete': { + 'title': '{hostname}: Replication complete - {vmid}', + 'body': 'Replication of {vmname} ({vmid}) completed successfully.', + 'group': 'vm_ct', + 'default_enabled': False, + }, + + # ── Backup / Snapshot events ── + 'backup_start': { + 'title': '{hostname}: Backup started - {vmid}', + 'body': 'Backup of {vmname} ({vmid}) has started.', + 'group': 'backup', + 'default_enabled': False, + }, + 'backup_complete': { + 'title': '{hostname}: Backup complete - {vmid}', + 'body': 'Backup of {vmname} ({vmid}) completed successfully.\nSize: {size}', + 'group': 'backup', + 'default_enabled': True, + }, + 'backup_fail': { + 'title': '{hostname}: Backup FAILED - {vmid}', + 'body': 'Backup of {vmname} ({vmid}) has failed.\n{reason}', + 'group': 'backup', + 'default_enabled': True, + }, + 'snapshot_complete': { + 
'title': '{hostname}: Snapshot created - {vmid}', + 'body': 'Snapshot of {vmname} ({vmid}) created: {snapshot_name}', + 'group': 'backup', + 'default_enabled': False, + }, + 'snapshot_fail': { + 'title': '{hostname}: Snapshot FAILED - {vmid}', + 'body': 'Snapshot of {vmname} ({vmid}) failed.\n{reason}', + 'group': 'backup', + 'default_enabled': True, + }, + + # ── Resource events (from Health Monitor) ── + 'cpu_high': { + 'title': '{hostname}: High CPU usage ({value}%)', + 'body': 'CPU usage is at {value}% on {cores} cores.\n{details}', + 'group': 'resources', + 'default_enabled': True, + }, + 'ram_high': { + 'title': '{hostname}: High memory usage ({value}%)', + 'body': 'Memory usage: {used} / {total} ({value}%).\n{details}', + 'group': 'resources', + 'default_enabled': True, + }, + 'temp_high': { + 'title': '{hostname}: High temperature ({value}C)', + 'body': 'CPU temperature: {value}C (threshold: {threshold}C).\n{details}', + 'group': 'resources', + 'default_enabled': True, + }, + 'disk_space_low': { + 'title': '{hostname}: Low disk space on {mount}', + 'body': '{mount}: {used}% used ({available} available).', + 'group': 'storage', + 'default_enabled': True, + }, + 'disk_io_error': { + 'title': '{hostname}: Disk I/O error', + 'body': '{reason}', + 'group': 'storage', + 'default_enabled': True, + }, + 'storage_unavailable': { + 'title': '{hostname}: Storage unavailable - {storage_name}', + 'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}', + 'group': 'storage', + 'default_enabled': True, + }, + 'load_high': { + 'title': '{hostname}: High system load ({value})', + 'body': 'System load average: {value} on {cores} cores.\n{details}', + 'group': 'resources', + 'default_enabled': True, + }, + + # ── Network events ── + 'network_down': { + 'title': '{hostname}: Network connectivity lost', + 'body': 'Network connectivity check failed.\n{reason}', + 'group': 'network', + 'default_enabled': True, + }, + 'network_latency': { + 'title': 
'{hostname}: High network latency ({value}ms)', + 'body': 'Latency to gateway: {value}ms (threshold: {threshold}ms).', + 'group': 'network', + 'default_enabled': False, + }, + + # ── Security events ── + 'auth_fail': { + 'title': '{hostname}: Authentication failure', + 'body': 'Failed login attempt from {source_ip}.\nUser: {username}\nService: {service}', + 'group': 'security', + 'default_enabled': True, + }, + 'ip_block': { + 'title': '{hostname}: IP blocked by Fail2Ban', + 'body': 'IP {source_ip} has been banned.\nJail: {jail}\nFailures: {failures}', + 'group': 'security', + 'default_enabled': True, + }, + 'firewall_issue': { + 'title': '{hostname}: Firewall issue detected', + 'body': '{reason}', + 'group': 'security', + 'default_enabled': True, + }, + 'user_permission_change': { + 'title': '{hostname}: User permission changed', + 'body': 'User: {username}\nChange: {change_details}', + 'group': 'security', + 'default_enabled': True, + }, + + # ── Cluster events ── + 'split_brain': { + 'title': '{hostname}: SPLIT-BRAIN detected', + 'body': 'Cluster split-brain condition detected.\nQuorum status: {quorum}', + 'group': 'cluster', + 'default_enabled': True, + }, + 'node_disconnect': { + 'title': '{hostname}: Node disconnected', + 'body': 'Node {node_name} has disconnected from the cluster.', + 'group': 'cluster', + 'default_enabled': True, + }, + 'node_reconnect': { + 'title': '{hostname}: Node reconnected', + 'body': 'Node {node_name} has reconnected to the cluster.', + 'group': 'cluster', + 'default_enabled': True, + }, + + # ── System events ── + 'system_shutdown': { + 'title': '{hostname}: System shutting down', + 'body': 'The system is shutting down.\n{reason}', + 'group': 'system', + 'default_enabled': True, + }, + 'system_reboot': { + 'title': '{hostname}: System rebooting', + 'body': 'The system is rebooting.\n{reason}', + 'group': 'system', + 'default_enabled': True, + }, + 'system_problem': { + 'title': '{hostname}: System problem detected', + 'body': 
'{reason}', + 'group': 'system', + 'default_enabled': True, + }, + 'service_fail': { + 'title': '{hostname}: Service failed - {service_name}', + 'body': '{reason}', + 'group': 'system', + 'default_enabled': True, + }, + 'update_available': { + 'title': '{hostname}: Updates available ({count})', + 'body': '{count} package updates are available.\n{details}', + 'group': 'system', + 'default_enabled': False, + }, + 'update_complete': { + 'title': '{hostname}: Update completed', + 'body': '{details}', + 'group': 'system', + 'default_enabled': False, + }, + + # ── Unknown persistent (from health monitor) ── + 'unknown_persistent': { + 'title': '{hostname}: Check unavailable - {category}', + 'body': 'Health check for {category} has been unavailable for 3+ cycles.\n{reason}', + 'group': 'system', + 'default_enabled': False, + }, + + # ── Persistent Health Issues (daily digest) ── + 'health_persistent': { + 'title': '{hostname}: {count} active health issue(s)', + 'body': 'The following health issues remain active:\n{issue_list}\n\nThis digest is sent once every 24 hours while issues persist.', + 'group': 'system', + 'default_enabled': True, + }, + 'health_issue_new': { + 'title': '{hostname}: New health issue - {category}', + 'body': 'New {severity} issue detected:\n{reason}', + 'group': 'system', + 'default_enabled': True, + }, + 'health_issue_resolved': { + 'title': '{hostname}: Resolved - {category}', + 'body': '{category} issue has been resolved.\n{reason}\nDuration: {duration}', + 'group': 'system', + 'default_enabled': True, + }, + + # ── Update notifications (enriched) ── + 'update_summary': { + 'title': '{hostname}: {total_count} updates available', + 'body': '{security_count} security update(s), {total_count} total.\n{package_list}', + 'group': 'system', + 'default_enabled': True, + }, + 'pve_update': { + 'title': '{hostname}: PVE update available ({version})', + 'body': 'Proxmox VE update available: {version}\n{details}', + 'group': 'system', + 'default_enabled': 
True, + }, + + # ── PVE webhook test ── + 'webhook_test': { + 'title': '{hostname}: Webhook test received', + 'body': 'PVE webhook connectivity test successful.\n{reason}', + 'group': 'system', + 'default_enabled': True, + }, + + # ── Burst aggregation summaries ── + 'burst_auth_fail': { + 'title': '{hostname}: {count} auth failures in {window}', + 'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}', + 'group': 'security', + 'default_enabled': True, + }, + 'burst_ip_block': { + 'title': '{hostname}: Fail2Ban banned {count} IPs in {window}', + 'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}', + 'group': 'security', + 'default_enabled': True, + }, + 'burst_disk_io': { + 'title': '{hostname}: {count} disk I/O errors on {entity_list}', + 'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}', + 'group': 'storage', + 'default_enabled': True, + }, + 'burst_cluster': { + 'title': '{hostname}: Cluster flapping detected ({count} changes)', + 'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}', + 'group': 'cluster', + 'default_enabled': True, + }, + 'burst_generic': { + 'title': '{hostname}: {count} {event_type} events in {window}', + 'body': '{count} events of type {event_type} in {window}.\n{entity_list}', + 'group': 'system', + 'default_enabled': True, + }, +} + +# ─── Event Groups (for UI filtering) ───────────────────────────── + +EVENT_GROUPS = { + 'system': {'label': 'System', 'description': 'System health, services, updates'}, + 'vm_ct': {'label': 'VM / CT', 'description': 'Virtual machines and containers'}, + 'backup': {'label': 'Backup', 'description': 'Backups and snapshots'}, + 'resources': {'label': 'Resources', 'description': 'CPU, memory, temperature, load'}, + 'storage': {'label': 'Storage', 'description': 'Disk space and I/O'}, + 'network': {'label': 'Network', 'description': 'Connectivity and latency'}, + 'security': {'label': 'Security', 
'description': 'Authentication, firewall, bans'}, + 'cluster': {'label': 'Cluster', 'description': 'Cluster health and quorum'}, +} + + +# ─── Template Renderer ─────────────────────────────────────────── + +def _get_hostname() -> str: + """Get short hostname for message titles.""" + try: + return socket.gethostname().split('.')[0] + except Exception: + return 'proxmox' + + +def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]: + """Render a template into a structured notification object. + + Returns structured output usable by all channels: + title, body (text), body_text, body_html (escaped), fields, tags, severity, group + """ + import html as html_mod + + template = TEMPLATES.get(event_type) + if not template: + fallback_body = data.get('message', data.get('reason', str(data))) + severity = data.get('severity', 'INFO') + return { + 'title': f"{_get_hostname()}: {event_type}", + 'body': fallback_body, 'body_text': fallback_body, + 'body_html': f'

{html_mod.escape(str(fallback_body))}

', + 'fields': [], 'tags': [severity, 'system', event_type], + 'severity': severity, 'group': 'system', + } + + # Ensure hostname is always available + variables = { + 'hostname': _get_hostname(), + 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), + 'severity': data.get('severity', 'INFO'), + # Burst event variables + 'window': '', 'entity_list': '', + # Common defaults + 'vmid': '', 'vmname': '', 'reason': '', 'summary': '', + 'details': '', 'category': '', 'previous': '', 'current': '', + 'duration': '', 'value': '', 'threshold': '', + 'source_ip': '', 'username': '', 'service': '', 'service_name': '', + 'node_name': '', 'target_node': '', 'mount': '', 'device': '', + 'used': '', 'total': '', 'available': '', 'cores': '', + 'count': '', 'size': '', 'snapshot_name': '', 'jail': '', + 'failures': '', 'quorum': '', 'change_details': '', 'message': '', + 'security_count': '0', 'total_count': '0', 'package_list': '', + 'packages': '', 'pve_packages': '', 'version': '', + 'issue_list': '', 'error_key': '', + 'storage_name': '', 'storage_type': '', + } + variables.update(data) + + try: + title = template['title'].format(**variables) + except (KeyError, ValueError): + title = template['title'] + + # ── PVE vzdump special formatting ── + # When the event came from PVE webhook with a full vzdump message, + # parse the table/logs and format a rich body instead of the sparse template. 
+ pve_message = data.get('pve_message', '') + pve_title = data.get('pve_title', '') + + if event_type in ('backup_complete', 'backup_fail') and pve_message: + parsed = _parse_vzdump_message(pve_message) + if parsed: + is_success = (event_type == 'backup_complete') + body_text = _format_vzdump_body(parsed, is_success) + # Use PVE's own title if available (contains hostname and status) + if pve_title: + title = pve_title + else: + # Couldn't parse -- use PVE raw message as body + body_text = pve_message.strip() + elif event_type == 'system_mail' and pve_message: + # System mail -- use PVE message directly (mail bounce, cron, smartd) + body_text = pve_message.strip()[:1000] + else: + try: + body_text = template['body'].format(**variables) + except (KeyError, ValueError): + body_text = template['body'] + + # Clean up: collapse runs of 3+ blank lines into 1, remove trailing whitespace + import re as _re + body_text = _re.sub(r'\n{3,}', '\n\n', body_text.strip()) + + severity = variables.get('severity', 'INFO') + group = template.get('group', 'system') + + # Build structured fields for Discord embeds / rich notifications + fields = [] + field_map = [ + ('vmid', 'VM/CT'), ('vmname', 'Name'), ('device', 'Device'), + ('source_ip', 'Source IP'), ('node_name', 'Node'), ('category', 'Category'), + ('service_name', 'Service'), ('jail', 'Jail'), ('username', 'User'), + ('count', 'Count'), ('window', 'Window'), ('entity_list', 'Affected'), + ] + for key, label in field_map: + val = variables.get(key, '') + if val: + fields.append((label, str(val))) + + # Build HTML body with escaped content + body_html_parts = [] + for line in body_text.split('\n'): + if line.strip(): + body_html_parts.append(f'

{html_mod.escape(line)}

') + body_html = '\n'.join(body_html_parts) if body_html_parts else f'

{html_mod.escape(body_text)}

' + + return { + 'title': title, + 'body': body_text, # backward compat + 'body_text': body_text, + 'body_html': body_html, + 'fields': fields, + 'tags': [severity, group, event_type], + 'severity': severity, + 'group': group, + } + + +def get_event_types_by_group() -> Dict[str, list]: + """Get all event types organized by group, for UI rendering. + + Returns: + {group_key: [{'type': event_type, 'title': template_title, + 'default_enabled': bool}, ...]} + """ + result = {} + for event_type, template in TEMPLATES.items(): + group = template.get('group', 'system') + if group not in result: + result[group] = [] + import re + # Clean title: remove {hostname}: prefix and any remaining {placeholders} + title = template['title'].replace('{hostname}', '').strip(': ') + title = re.sub(r'\s*\{[^}]+\}', '', title).strip(' -:') + if not title: + title = event_type.replace('_', ' ').title() + result[group].append({ + 'type': event_type, + 'title': title, + 'default_enabled': template.get('default_enabled', True), + }) + return result + + +def get_default_enabled_events() -> Dict[str, bool]: + """Get the default enabled state for all event types.""" + return { + event_type: template.get('default_enabled', True) + for event_type, template in TEMPLATES.items() + } + + +# ─── AI Enhancement (Optional) ─────────────────────────────────── + +class AIEnhancer: + """Optional AI message enhancement using external LLM API. + + Enriches template-generated messages with context and suggestions. + Falls back to original message if AI is unavailable or fails. + """ + + SYSTEM_PROMPT = """You are a Proxmox system administrator assistant. +You receive a notification message about a server event and must enhance it with: +1. A brief explanation of what this means in practical terms +2. A suggested action if applicable (1-2 sentences max) + +Keep the response concise (max 3 sentences total). Do not repeat the original message. 
+Respond in the same language as the input message.""" + + def __init__(self, provider: str, api_key: str, model: str = ''): + self.provider = provider.lower() + self.api_key = api_key + self.model = model + self._enabled = bool(api_key) + + @property + def enabled(self) -> bool: + return self._enabled + + def enhance(self, title: str, body: str, severity: str) -> Optional[str]: + """Enhance a notification message with AI context. + + Returns enhanced body text, or None if enhancement fails/disabled. + """ + if not self._enabled: + return None + + try: + if self.provider in ('openai', 'groq'): + return self._call_openai_compatible(title, body, severity) + except Exception as e: + print(f"[AIEnhancer] Enhancement failed: {e}") + + return None + + def _call_openai_compatible(self, title: str, body: str, severity: str) -> Optional[str]: + """Call OpenAI-compatible API (works with OpenAI, Groq, local).""" + if self.provider == 'groq': + url = 'https://api.groq.com/openai/v1/chat/completions' + model = self.model or 'llama-3.3-70b-versatile' + else: # openai + url = 'https://api.openai.com/v1/chat/completions' + model = self.model or 'gpt-4o-mini' + + user_msg = f"Severity: {severity}\nTitle: {title}\nMessage: {body}" + + payload = json.dumps({ + 'model': model, + 'messages': [ + {'role': 'system', 'content': self.SYSTEM_PROMPT}, + {'role': 'user', 'content': user_msg}, + ], + 'max_tokens': 150, + 'temperature': 0.3, + }).encode('utf-8') + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.api_key}', + } + + req = urllib.request.Request(url, data=payload, headers=headers) + with urllib.request.urlopen(req, timeout=10) as resp: + result = json.loads(resp.read().decode('utf-8')) + content = result['choices'][0]['message']['content'].strip() + return content if content else None + + +def format_with_ai(title: str, body: str, severity: str, + ai_config: Dict[str, str]) -> str: + """Format a message with optional AI enhancement. 
+ + If AI is configured and succeeds, appends AI insight to the body. + Otherwise returns the original body unchanged. + + Args: + title: Notification title + body: Notification body + severity: Severity level + ai_config: {'enabled': 'true', 'provider': 'groq', 'api_key': '...', 'model': ''} + + Returns: + Enhanced body string + """ + if ai_config.get('enabled') != 'true' or not ai_config.get('api_key'): + return body + + enhancer = AIEnhancer( + provider=ai_config.get('provider', 'groq'), + api_key=ai_config['api_key'], + model=ai_config.get('model', ''), + ) + + insight = enhancer.enhance(title, body, severity) + if insight: + return f"{body}\n\n---\n{insight}" + + return body