Update notification service

This commit is contained in:
MacRimi
2026-02-19 17:02:02 +01:00
parent 34d04e57dd
commit 7c5cdb9161
7 changed files with 1587 additions and 95 deletions

View File

@@ -12,7 +12,7 @@ import { fetchApi } from "../lib/api-config"
import { import {
Bell, BellOff, Send, CheckCircle2, XCircle, Loader2, Bell, BellOff, Send, CheckCircle2, XCircle, Loader2,
AlertTriangle, Info, Settings2, Zap, Eye, EyeOff, AlertTriangle, Info, Settings2, Zap, Eye, EyeOff,
Trash2, ChevronDown, ChevronUp, TestTube2 Trash2, ChevronDown, ChevronUp, TestTube2, Mail, Webhook
} from "lucide-react" } from "lucide-react"
interface ChannelConfig { interface ChannelConfig {
@@ -22,6 +22,15 @@ interface ChannelConfig {
url?: string url?: string
token?: string token?: string
webhook_url?: string webhook_url?: string
// Email channel fields
host?: string
port?: string
username?: string
password?: string
tls_mode?: string
from_address?: string
to_addresses?: string
subject_prefix?: string
} }
interface NotificationConfig { interface NotificationConfig {
@@ -34,6 +43,8 @@ interface NotificationConfig {
ai_api_key: string ai_api_key: string
ai_model: string ai_model: string
hostname: string hostname: string
webhook_secret: string
webhook_allowed_ips: string
} }
interface ServiceStatus { interface ServiceStatus {
@@ -84,6 +95,7 @@ const DEFAULT_CONFIG: NotificationConfig = {
telegram: { enabled: false }, telegram: { enabled: false },
gotify: { enabled: false }, gotify: { enabled: false },
discord: { enabled: false }, discord: { enabled: false },
email: { enabled: false },
}, },
severity_filter: "warning", severity_filter: "warning",
event_categories: { event_categories: {
@@ -95,6 +107,8 @@ const DEFAULT_CONFIG: NotificationConfig = {
ai_api_key: "", ai_api_key: "",
ai_model: "", ai_model: "",
hostname: "", hostname: "",
webhook_secret: "",
webhook_allowed_ips: "",
} }
export function NotificationSettings() { export function NotificationSettings() {
@@ -112,6 +126,11 @@ export function NotificationSettings() {
const [editMode, setEditMode] = useState(false) const [editMode, setEditMode] = useState(false)
const [hasChanges, setHasChanges] = useState(false) const [hasChanges, setHasChanges] = useState(false)
const [originalConfig, setOriginalConfig] = useState<NotificationConfig>(DEFAULT_CONFIG) const [originalConfig, setOriginalConfig] = useState<NotificationConfig>(DEFAULT_CONFIG)
const [webhookSetup, setWebhookSetup] = useState<{
status: "idle" | "running" | "success" | "failed"
fallback_commands: string[]
error: string
}>({ status: "idle", fallback_commands: [], error: "" })
const loadConfig = useCallback(async () => { const loadConfig = useCallback(async () => {
try { try {
@@ -252,6 +271,184 @@ export function NotificationSettings() {
const activeChannels = Object.entries(config.channels).filter(([, ch]) => ch.enabled).length const activeChannels = Object.entries(config.channels).filter(([, ch]) => ch.enabled).length
// Enable the notification service, then auto-configure the PVE webhook.
// Order matters: enabled=true is persisted FIRST so notifications remain
// active even when the webhook auto-setup step fails afterwards.
const handleEnable = async () => {
  setSaving(true)
  setWebhookSetup({ status: "running", fallback_commands: [], error: "" })
  try {
    // 1) Save enabled=true
    const newConfig = { ...config, enabled: true }
    await fetchApi("/api/notifications/settings", {
      method: "POST",
      body: JSON.stringify(newConfig),
    })
    // Sync both copies so the "unsaved changes" diff stays clean.
    setConfig(newConfig)
    setOriginalConfig(newConfig)
    // 2) Auto-configure PVE webhook
    try {
      const setup = await fetchApi<{
        configured: boolean
        secret?: string
        fallback_commands?: string[]
        error?: string
      }>("/api/notifications/proxmox/setup-webhook", { method: "POST" })
      if (setup.configured) {
        setWebhookSetup({ status: "success", fallback_commands: [], error: "" })
        // Update secret in local config if one was generated server-side.
        if (setup.secret) {
          const updated = { ...newConfig, webhook_secret: setup.secret }
          setConfig(updated)
          setOriginalConfig(updated)
        }
      } else {
        // Server could not configure PVE: surface the fallback shell
        // commands so the user can complete setup manually.
        setWebhookSetup({
          status: "failed",
          fallback_commands: setup.fallback_commands || [],
          error: setup.error || "Unknown error",
        })
      }
    } catch {
      // Endpoint unreachable — notifications stay enabled; webhook
      // must be configured manually on the PVE host.
      setWebhookSetup({
        status: "failed",
        fallback_commands: [],
        error: "Could not reach setup endpoint",
      })
    }
    setEditMode(true)
    loadStatus()
  } catch (err) {
    // Persisting enabled=true itself failed — reset the setup banner.
    console.error("Failed to enable notifications:", err)
    setWebhookSetup({ status: "idle", fallback_commands: [], error: "" })
  } finally {
    setSaving(false)
  }
}
// ── Disabled state: show activation card ──
if (!config.enabled && !editMode) {
return (
<Card>
<CardHeader>
<div className="flex items-center gap-2">
<BellOff className="h-5 w-5 text-muted-foreground" />
<CardTitle>Notifications</CardTitle>
<Badge variant="outline" className="text-[10px] border-muted-foreground/30 text-muted-foreground">
Disabled
</Badge>
</div>
<CardDescription>
Get real-time alerts about your Proxmox environment via Telegram, Discord, Gotify, or Email.
</CardDescription>
</CardHeader>
<CardContent>
<div className="space-y-4">
<div className="flex flex-col gap-3 p-4 bg-muted/50 rounded-lg border border-border">
<div className="flex items-start gap-3">
<Bell className="h-5 w-5 text-blue-500 mt-0.5 shrink-0" />
<div className="space-y-1">
<p className="text-sm font-medium">Enable notification service</p>
<p className="text-xs text-muted-foreground leading-relaxed">
Monitor system health, VM/CT events, backups, security alerts, and cluster status.
PVE webhook integration is configured automatically.
</p>
</div>
</div>
<div className="flex flex-col sm:flex-row items-start gap-2">
<button
className="h-8 px-4 text-sm rounded-md bg-blue-600 hover:bg-blue-700 text-white transition-colors w-full sm:w-auto disabled:opacity-50 flex items-center justify-center gap-2"
onClick={handleEnable}
disabled={saving}
>
{saving ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Bell className="h-3.5 w-3.5" />}
{saving ? "Configuring..." : "Enable Notifications"}
</button>
</div>
{/* Webhook setup result */}
{webhookSetup.status === "success" && (
<div className="flex items-start gap-2 p-2 rounded-md bg-green-500/10 border border-green-500/20">
<CheckCircle2 className="h-3.5 w-3.5 text-green-500 shrink-0 mt-0.5" />
<p className="text-[11px] text-green-400 leading-relaxed">
PVE webhook configured automatically. Proxmox will send notifications to ProxMenux.
</p>
</div>
)}
{webhookSetup.status === "failed" && (
<div className="space-y-2">
<div className="flex items-start gap-2 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
<AlertTriangle className="h-3.5 w-3.5 text-amber-400 shrink-0 mt-0.5" />
<div className="space-y-1">
<p className="text-[11px] text-amber-400 leading-relaxed">
Automatic PVE configuration failed: {webhookSetup.error}
</p>
<p className="text-[10px] text-muted-foreground">
Notifications are enabled. Run the commands below on the PVE host to complete webhook setup.
</p>
</div>
</div>
{webhookSetup.fallback_commands.length > 0 && (
<pre className="text-[11px] bg-background p-2 rounded border border-border overflow-x-auto font-mono">
{webhookSetup.fallback_commands.join('\n')}
</pre>
)}
</div>
)}
</div>
{/* PBS manual section (collapsible) */}
<details className="group">
<summary className="text-xs font-medium text-muted-foreground cursor-pointer hover:text-foreground transition-colors flex items-center gap-1.5">
<ChevronDown className="h-3 w-3 group-open:rotate-180 transition-transform" />
<Webhook className="h-3 w-3" />
Configure PBS notifications (manual)
</summary>
<div className="mt-2 p-3 bg-muted/30 rounded-md border border-border space-y-3">
<div className="space-y-1">
<p className="text-xs text-muted-foreground leading-relaxed">
PVE backups launched from the PVE interface are covered automatically by the PVE webhook above.
</p>
<p className="text-xs text-muted-foreground leading-relaxed">
However, PBS has its own internal jobs (Verify, Prune, GC, Sync) that generate
separate notifications. These must be configured directly on the PBS server.
</p>
</div>
<div className="space-y-1.5">
<p className="text-[11px] font-medium text-muted-foreground">
Run on the PBS host:
</p>
<pre className="text-[11px] bg-background p-2 rounded border border-border overflow-x-auto font-mono">
{`# Create webhook endpoint on PBS
proxmox-backup-manager notification endpoint webhook create proxmenux-webhook \\
--url http://<PVE_HOST_IP>:8008/api/notifications/webhook \\
--header "X-Webhook-Secret=<YOUR_SECRET>"
# Create matcher to route PBS events
proxmox-backup-manager notification matcher create proxmenux-pbs \\
--target proxmenux-webhook \\
--match-severity warning,error`}
</pre>
</div>
<div className="flex items-start gap-2 p-2 rounded-md bg-blue-500/10 border border-blue-500/20">
<Info className="h-3.5 w-3.5 text-blue-400 shrink-0 mt-0.5" />
<div className="text-[10px] text-blue-400/90 leading-relaxed space-y-1">
<p>
{"Replace <PVE_HOST_IP> with the IP address of this PVE node (not 127.0.0.1, unless PBS runs on the same host)."}
</p>
<p>
{"Replace <YOUR_SECRET> with the webhook secret shown in your notification settings."}
</p>
</div>
</div>
</div>
</details>
</div>
</CardContent>
</Card>
)
}
return ( return (
<Card> <Card>
<CardHeader> <CardHeader>
@@ -302,7 +499,7 @@ export function NotificationSettings() {
</div> </div>
</div> </div>
<CardDescription> <CardDescription>
Configure notification channels and event filters. Receive alerts via Telegram, Gotify, or Discord. Configure notification channels and event filters. Receive alerts via Telegram, Gotify, Discord, or Email.
</CardDescription> </CardDescription>
</CardHeader> </CardHeader>
@@ -369,7 +566,7 @@ export function NotificationSettings() {
</div> </div>
<Tabs defaultValue="telegram" className="w-full"> <Tabs defaultValue="telegram" className="w-full">
<TabsList className="w-full grid grid-cols-3 h-8"> <TabsList className="w-full grid grid-cols-4 h-8">
<TabsTrigger value="telegram" className="text-xs data-[state=active]:text-blue-500"> <TabsTrigger value="telegram" className="text-xs data-[state=active]:text-blue-500">
Telegram Telegram
</TabsTrigger> </TabsTrigger>
@@ -379,6 +576,9 @@ export function NotificationSettings() {
<TabsTrigger value="discord" className="text-xs data-[state=active]:text-indigo-500"> <TabsTrigger value="discord" className="text-xs data-[state=active]:text-indigo-500">
Discord Discord
</TabsTrigger> </TabsTrigger>
<TabsTrigger value="email" className="text-xs data-[state=active]:text-amber-500">
Email
</TabsTrigger>
</TabsList> </TabsList>
{/* Telegram */} {/* Telegram */}
@@ -571,6 +771,151 @@ export function NotificationSettings() {
</> </>
)} )}
</TabsContent> </TabsContent>
{/* Email */}
<TabsContent value="email" className="space-y-3 pt-2">
<div className="flex items-center justify-between">
<Label className="text-xs font-medium">Enable Email</Label>
<button
className={`relative w-9 h-[18px] rounded-full transition-colors ${
config.channels.email?.enabled ? "bg-amber-600" : "bg-muted-foreground/30"
} ${!editMode ? "opacity-60 cursor-not-allowed" : "cursor-pointer"}`}
onClick={() => editMode && updateChannel("email", "enabled", !config.channels.email?.enabled)}
disabled={!editMode}
role="switch"
aria-checked={config.channels.email?.enabled || false}
>
<span className={`absolute top-[1px] left-[1px] h-4 w-4 rounded-full bg-white shadow transition-transform ${
config.channels.email?.enabled ? "translate-x-[18px]" : "translate-x-0"
}`} />
</button>
</div>
{config.channels.email?.enabled && (
<>
<div className="grid grid-cols-1 sm:grid-cols-2 gap-2">
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">SMTP Host</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="smtp.gmail.com"
value={config.channels.email?.host || ""}
onChange={e => updateChannel("email", "host", e.target.value)}
disabled={!editMode}
/>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Port</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="587"
value={config.channels.email?.port || ""}
onChange={e => updateChannel("email", "port", e.target.value)}
disabled={!editMode}
/>
</div>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">TLS Mode</Label>
<Select
value={config.channels.email?.tls_mode || "starttls"}
onValueChange={v => updateChannel("email", "tls_mode", v)}
disabled={!editMode}
>
<SelectTrigger className={`h-7 text-xs ${!editMode ? "opacity-60" : ""}`}>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="starttls">STARTTLS (port 587)</SelectItem>
<SelectItem value="ssl">SSL/TLS (port 465)</SelectItem>
<SelectItem value="none">None (port 25)</SelectItem>
</SelectContent>
</Select>
</div>
<div className="grid grid-cols-1 sm:grid-cols-2 gap-2">
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Username</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="user@example.com"
value={config.channels.email?.username || ""}
onChange={e => updateChannel("email", "username", e.target.value)}
disabled={!editMode}
/>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Password</Label>
<div className="flex items-center gap-1.5">
<Input
type={showSecrets["em_pass"] ? "text" : "password"}
className="h-7 text-xs font-mono"
placeholder="App password"
value={config.channels.email?.password || ""}
onChange={e => updateChannel("email", "password", e.target.value)}
disabled={!editMode}
/>
<button
className="h-7 w-7 flex items-center justify-center rounded-md border border-border hover:bg-muted transition-colors shrink-0"
onClick={() => toggleSecret("em_pass")}
>
{showSecrets["em_pass"] ? <EyeOff className="h-3 w-3" /> : <Eye className="h-3 w-3" />}
</button>
</div>
</div>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">From Address</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="proxmenux@yourdomain.com"
value={config.channels.email?.from_address || ""}
onChange={e => updateChannel("email", "from_address", e.target.value)}
disabled={!editMode}
/>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">To Addresses (comma-separated)</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="admin@example.com, ops@example.com"
value={config.channels.email?.to_addresses || ""}
onChange={e => updateChannel("email", "to_addresses", e.target.value)}
disabled={!editMode}
/>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Subject Prefix</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="[ProxMenux]"
value={config.channels.email?.subject_prefix || "[ProxMenux]"}
onChange={e => updateChannel("email", "subject_prefix", e.target.value)}
disabled={!editMode}
/>
</div>
<div className="flex items-start gap-2 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
<Info className="h-3.5 w-3.5 text-amber-400 shrink-0 mt-0.5" />
<p className="text-[10px] text-amber-400/90 leading-relaxed">
Leave SMTP Host empty to use local sendmail (must be installed on the server).
For Gmail, use an App Password instead of your account password.
</p>
</div>
{!editMode && config.channels.email?.to_addresses && (
<button
className="h-7 px-3 text-xs rounded-md border border-border bg-background hover:bg-muted transition-colors flex items-center gap-1.5 w-full justify-center"
onClick={() => handleTest("email")}
disabled={testing === "email"}
>
{testing === "email" ? (
<Loader2 className="h-3 w-3 animate-spin" />
) : (
<TestTube2 className="h-3 w-3" />
)}
Test Email
</button>
)}
</>
)}
</TabsContent>
</Tabs> </Tabs>
{/* Test Result */} {/* Test Result */}
@@ -647,6 +992,131 @@ export function NotificationSettings() {
</div> </div>
</div> </div>
{/* ── Proxmox Webhook ── */}
<div className="space-y-3">
<div className="flex items-center justify-between">
<div className="flex items-center gap-2">
<Webhook className="h-3.5 w-3.5 text-muted-foreground" />
<span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">Proxmox Webhook</span>
</div>
{!editMode && (
<button
className="h-6 px-2.5 text-[10px] rounded-md border border-border bg-background hover:bg-muted transition-colors flex items-center gap-1.5"
onClick={async () => {
try {
setWebhookSetup({ status: "running", fallback_commands: [], error: "" })
const setup = await fetchApi<{
configured: boolean; secret?: string; fallback_commands?: string[]; error?: string
}>("/api/notifications/proxmox/setup-webhook", { method: "POST" })
if (setup.configured) {
setWebhookSetup({ status: "success", fallback_commands: [], error: "" })
if (setup.secret) {
const updated = { ...config, webhook_secret: setup.secret }
setConfig(updated)
setOriginalConfig(updated)
}
} else {
setWebhookSetup({ status: "failed", fallback_commands: setup.fallback_commands || [], error: setup.error || "" })
}
} catch {
setWebhookSetup({ status: "failed", fallback_commands: [], error: "Request failed" })
}
}}
disabled={webhookSetup.status === "running"}
>
{webhookSetup.status === "running" ? <Loader2 className="h-2.5 w-2.5 animate-spin" /> : <Webhook className="h-2.5 w-2.5" />}
Re-configure PVE
</button>
)}
</div>
{/* Setup status inline */}
{webhookSetup.status === "success" && (
<div className="flex items-center gap-2 p-1.5 rounded bg-green-500/10 border border-green-500/20">
<CheckCircle2 className="h-3 w-3 text-green-500 shrink-0" />
<p className="text-[10px] text-green-400">PVE webhook configured successfully.</p>
</div>
)}
{webhookSetup.status === "failed" && (
<div className="space-y-1.5">
<div className="flex items-start gap-2 p-1.5 rounded bg-amber-500/10 border border-amber-500/20">
<AlertTriangle className="h-3 w-3 text-amber-400 shrink-0 mt-0.5" />
<p className="text-[10px] text-amber-400">PVE auto-config failed: {webhookSetup.error}</p>
</div>
{webhookSetup.fallback_commands.length > 0 && (
<pre className="text-[10px] bg-background p-1.5 rounded border border-border overflow-x-auto font-mono">
{webhookSetup.fallback_commands.join('\n')}
</pre>
)}
</div>
)}
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Shared Secret</Label>
<div className="flex items-center gap-1.5">
<Input
type={showSecrets["wh_secret"] ? "text" : "password"}
className="h-7 text-xs font-mono"
placeholder="Required for webhook authentication"
value={config.webhook_secret || ""}
onChange={e => updateConfig(p => ({ ...p, webhook_secret: e.target.value }))}
disabled={!editMode}
/>
<button
className="h-7 w-7 flex items-center justify-center rounded-md border border-border hover:bg-muted transition-colors shrink-0"
onClick={() => toggleSecret("wh_secret")}
>
{showSecrets["wh_secret"] ? <EyeOff className="h-3 w-3" /> : <Eye className="h-3 w-3" />}
</button>
</div>
<p className="text-[10px] text-muted-foreground">
{"Proxmox must send this value in the X-Webhook-Secret header. Auto-generated on first enable."}
</p>
</div>
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Allowed IPs (optional, remote only)</Label>
<Input
className="h-7 text-xs font-mono"
placeholder="10.0.0.5, 192.168.1.10 (empty = allow all)"
value={config.webhook_allowed_ips || ""}
onChange={e => updateConfig(p => ({ ...p, webhook_allowed_ips: e.target.value }))}
disabled={!editMode}
/>
<p className="text-[10px] text-muted-foreground">
{"Localhost (127.0.0.1) is always allowed. This restricts remote callers only."}
</p>
</div>
{/* PBS manual guide (collapsible) */}
<details className="group">
<summary className="text-[11px] font-medium text-muted-foreground cursor-pointer hover:text-foreground transition-colors flex items-center gap-1.5 py-1">
<ChevronDown className="h-3 w-3 group-open:rotate-180 transition-transform" />
Configure PBS notifications (manual)
</summary>
<div className="mt-1.5 p-2.5 bg-muted/30 rounded-md border border-border space-y-2">
<p className="text-[11px] text-muted-foreground leading-relaxed">
Backups launched from PVE are covered by the PVE webhook. PBS internal jobs
(Verify, Prune, GC, Sync) require separate configuration on the PBS server.
</p>
<pre className="text-[10px] bg-background p-2 rounded border border-border overflow-x-auto font-mono">
{`# On the PBS host:
proxmox-backup-manager notification endpoint webhook \\
create proxmenux-webhook \\
--url http://<PVE_IP>:8008/api/notifications/webhook \\
--header "X-Webhook-Secret=<SECRET>"
proxmox-backup-manager notification matcher \\
create proxmenux-pbs \\
--target proxmenux-webhook \\
--match-severity warning,error`}
</pre>
<p className="text-[10px] text-muted-foreground">
{"Replace <PVE_IP> with this node's IP and <SECRET> with the webhook secret above."}
</p>
</div>
</details>
</div>
{/* ── Advanced: AI Enhancement ── */} {/* ── Advanced: AI Enhancement ── */}
<div> <div>
<button <button
@@ -818,7 +1288,7 @@ export function NotificationSettings() {
<p className="text-[11px] text-muted-foreground leading-relaxed"> <p className="text-[11px] text-muted-foreground leading-relaxed">
{config.enabled {config.enabled
? "Notifications are active. Events matching your severity filter and category selection will be sent to configured channels." ? "Notifications are active. Events matching your severity filter and category selection will be sent to configured channels."
: "Enable notifications to receive alerts about system events, health status changes, and security incidents via Telegram, Gotify, or Discord."} : "Enable notifications to receive alerts about system events, health status changes, and security incidents via Telegram, Gotify, Discord, or Email."}
</p> </p>
</div> </div>
</CardContent> </CardContent>

View File

@@ -3,9 +3,64 @@ Flask routes for notification service configuration and management.
Blueprint pattern matching flask_health_routes.py / flask_security_routes.py. Blueprint pattern matching flask_health_routes.py / flask_security_routes.py.
""" """
import hmac
import time
import hashlib
from collections import deque
from flask import Blueprint, jsonify, request from flask import Blueprint, jsonify, request
from notification_manager import notification_manager from notification_manager import notification_manager
# ─── Webhook Hardening Helpers ───────────────────────────────────
class WebhookRateLimiter:
    """Sliding-window rate limiter guarding the webhook endpoint.

    Maintains a deque of request timestamps; a request is allowed while
    the number of timestamps still inside the window is below the limit.
    """

    def __init__(self, max_requests: int = 60, window_seconds: int = 60):
        self._limit = max_requests
        self._window_s = window_seconds
        self._hits: deque = deque()

    def allow(self) -> bool:
        """Record the current request; return False when over the limit."""
        current = time.time()
        cutoff = current - self._window_s
        # Drop timestamps that have aged out of the sliding window.
        while self._hits and self._hits[0] < cutoff:
            self._hits.popleft()
        if len(self._hits) >= self._limit:
            return False
        self._hits.append(current)
        return True
class ReplayCache:
    """Bounded in-memory cache of recently seen request signatures.

    Anti-replay helper: the first time a signature is seen it is recorded
    and ``False`` is returned; seeing the same signature again within
    ``ttl`` seconds reports a replay (``True``).

    Bug fix: ``_MAX_SIZE`` was documented as a hard cap but never enforced —
    only TTL-expired entries were pruned, so many unique signatures arriving
    within one TTL window could grow the dict without bound. The cap is now
    enforced by evicting the oldest entries before inserting a new one.
    """
    _MAX_SIZE = 2000  # Hard cap to prevent memory growth

    def __init__(self, ttl: int = 60):
        self._ttl = ttl
        self._seen: dict = {}  # signature -> last-recorded timestamp

    def check_and_record(self, signature: str) -> bool:
        """Return True if this signature was already seen within the TTL
        (replay). Otherwise record it and return False."""
        now = time.time()
        # Opportunistic cleanup of expired entries once half full.
        if len(self._seen) > self._MAX_SIZE // 2:
            cutoff = now - self._ttl
            self._seen = {k: v for k, v in self._seen.items() if v > cutoff}
        if signature in self._seen and now - self._seen[signature] < self._ttl:
            return True  # Replay detected
        # Enforce the hard cap: evict the oldest entries if TTL pruning
        # alone did not bring the cache under _MAX_SIZE.
        if len(self._seen) >= self._MAX_SIZE:
            overflow = len(self._seen) - self._MAX_SIZE + 1
            for key in sorted(self._seen, key=self._seen.get)[:overflow]:
                del self._seen[key]
        self._seen[signature] = now
        return False
# Module-level singletons (one per process)
_webhook_limiter = WebhookRateLimiter(max_requests=60, window_seconds=60)
_replay_cache = ReplayCache(ttl=60)
# Timestamp validation window (seconds)
_TIMESTAMP_MAX_DRIFT = 60
notification_bp = Blueprint('notifications', __name__) notification_bp = Blueprint('notifications', __name__)
@@ -100,3 +155,218 @@ def send_notification():
return jsonify(result) return jsonify(result)
except Exception as e: except Exception as e:
return jsonify({'error': str(e)}), 500 return jsonify({'error': str(e)}), 500
@notification_bp.route('/api/notifications/proxmox/setup-webhook', methods=['POST'])
def setup_proxmox_webhook():
    """Automatically configure PVE notifications to call our webhook.

    Idempotent: safe to call multiple times. Only creates/updates
    ProxMenux-owned objects (proxmenux-webhook endpoint, proxmenux-default
    matcher). Never deletes or overrides user notification targets.

    Always returns HTTP 200: the JSON body carries ``configured`` plus,
    on failure, ``error`` and ``fallback_commands`` the user can run
    manually on the PVE host. On success it also returns ``secret`` so
    the UI can display the webhook shared secret.
    """
    import subprocess
    import secrets as secrets_mod

    # Fixed identifiers for the ProxMenux-owned PVE notification objects.
    ENDPOINT_ID = 'proxmenux-webhook'
    MATCHER_ID = 'proxmenux-default'
    # Loopback target: PVE calls back into this same process on port 8008.
    WEBHOOK_URL = 'http://127.0.0.1:8008/api/notifications/webhook'

    result = {
        'configured': False,
        'endpoint_id': ENDPOINT_ID,
        'matcher_id': MATCHER_ID,
        'url': WEBHOOK_URL,
        'fallback_commands': [],
        'error': None,
    }

    def _run_pvesh(args: list, check: bool = True) -> tuple:
        """Run pvesh command. Returns (success, stdout, stderr).

        Never raises: missing binary, timeout, and any other failure are
        reported as (False, '', reason).
        NOTE(review): the ``check`` parameter is currently unused — confirm
        whether it was meant to gate raising on non-zero exit.
        """
        try:
            proc = subprocess.run(
                ['pvesh'] + args,
                capture_output=True, text=True, timeout=15
            )
            return proc.returncode == 0, proc.stdout.strip(), proc.stderr.strip()
        except FileNotFoundError:
            return False, '', 'pvesh not found'
        except subprocess.TimeoutExpired:
            return False, '', 'pvesh timed out'
        except Exception as e:
            return False, '', str(e)

    try:
        # Step 1: Ensure webhook secret exists (generate + persist on first run)
        secret = notification_manager.get_webhook_secret()
        if not secret:
            secret = secrets_mod.token_urlsafe(32)
            notification_manager._save_setting('webhook_secret', secret)
        secret_header = f'X-Webhook-Secret={secret}'

        # Step 2: Check if endpoint already exists (GET succeeds => exists)
        exists_ok, _, _ = _run_pvesh([
            'get', f'/cluster/notifications/endpoints/webhook/{ENDPOINT_ID}',
            '--output-format', 'json'
        ])
        if exists_ok:
            # Update existing endpoint (keeps the call idempotent)
            ok, _, err = _run_pvesh([
                'set', f'/cluster/notifications/endpoints/webhook/{ENDPOINT_ID}',
                '--url', WEBHOOK_URL,
                '--method', 'post',
                '--header', secret_header,
            ])
        else:
            # Create new endpoint
            ok, _, err = _run_pvesh([
                'create', '/cluster/notifications/endpoints/webhook',
                '--name', ENDPOINT_ID,
                '--url', WEBHOOK_URL,
                '--method', 'post',
                '--header', secret_header,
            ])
        if not ok:
            # Build fallback commands for manual execution on the PVE host
            result['fallback_commands'] = [
                f'pvesh create /cluster/notifications/endpoints/webhook '
                f'--name {ENDPOINT_ID} --url {WEBHOOK_URL} --method post '
                f'--header "{secret_header}"',
                f'pvesh create /cluster/notifications/matchers '
                f'--name {MATCHER_ID} --target {ENDPOINT_ID} '
                f'--match-severity warning,error',
            ]
            result['error'] = f'Failed to configure endpoint: {err}'
            return jsonify(result), 200

        # Step 3: Create or update matcher (routes warning/error events to us)
        matcher_exists, _, _ = _run_pvesh([
            'get', f'/cluster/notifications/matchers/{MATCHER_ID}',
            '--output-format', 'json'
        ])
        if matcher_exists:
            ok_m, _, err_m = _run_pvesh([
                'set', f'/cluster/notifications/matchers/{MATCHER_ID}',
                '--target', ENDPOINT_ID,
                '--match-severity', 'warning,error',
            ])
        else:
            ok_m, _, err_m = _run_pvesh([
                'create', '/cluster/notifications/matchers',
                '--name', MATCHER_ID,
                '--target', ENDPOINT_ID,
                '--match-severity', 'warning,error',
            ])
        if not ok_m:
            # Endpoint exists but routing failed: only the matcher command
            # is needed as a fallback.
            result['fallback_commands'] = [
                f'pvesh create /cluster/notifications/matchers '
                f'--name {MATCHER_ID} --target {ENDPOINT_ID} '
                f'--match-severity warning,error',
            ]
            result['error'] = f'Endpoint OK, but matcher failed: {err_m}'
            result['configured'] = False
            return jsonify(result), 200

        result['configured'] = True
        result['secret'] = secret  # Return so UI can display it
        return jsonify(result), 200
    except Exception as e:
        # Unexpected failure: still answer 200 with generic fallback
        # commands (secret is not leaked here — placeholder used instead).
        result['error'] = str(e)
        result['fallback_commands'] = [
            f'pvesh create /cluster/notifications/endpoints/webhook '
            f'--name {ENDPOINT_ID} --url {WEBHOOK_URL} --method post '
            f'--header "X-Webhook-Secret=YOUR_SECRET"',
            f'pvesh create /cluster/notifications/matchers '
            f'--name {MATCHER_ID} --target {ENDPOINT_ID} '
            f'--match-severity warning,error',
        ]
        return jsonify(result), 200
@notification_bp.route('/api/notifications/webhook', methods=['POST'])
def proxmox_webhook():
    """Receive native Proxmox VE notification webhooks (hardened).

    Security layers:
      1. Rate limiting (60 req/min)          -- always
      2. Shared secret (X-Webhook-Secret)    -- always required
      3. Anti-replay timestamp (60s window)  -- remote only
      4. Replay cache (signature dedup)      -- remote only
      5. IP allowlist (optional)             -- remote only

    Localhost callers (127.0.0.1 / ::1) bypass layers 3-5 because Proxmox
    cannot inject dynamic timestamp headers. The shared secret is still
    required for localhost to prevent any local process from injecting events.

    Fix: the original ``_reject`` was a lambda assigned to a name (against
    PEP 8) whose first parameter ``code`` was never used — every call site
    passed the status twice. Replaced with a nested def taking (error, status).
    """
    def _reject(error: str, status: int):
        """Uniform JSON rejection response body + HTTP status."""
        return jsonify({'accepted': False, 'error': error}), status

    client_ip = request.remote_addr or ''
    is_localhost = client_ip in ('127.0.0.1', '::1')

    # ── Layer 1: Rate limiting (always) ──
    if not _webhook_limiter.allow():
        resp = jsonify({'accepted': False, 'error': 'rate_limited'})
        resp.headers['Retry-After'] = '60'
        return resp, 429

    # ── Layer 2: Shared secret (always required) ──
    try:
        configured_secret = notification_manager.get_webhook_secret()
    except Exception:
        configured_secret = ''
    if not configured_secret:
        # No secret configured server-side: refuse everything.
        return _reject('webhook_not_configured', 500)
    request_secret = request.headers.get('X-Webhook-Secret', '')
    if not request_secret:
        return _reject('missing_secret', 401)
    # Constant-time comparison to avoid timing side channels.
    if not hmac.compare_digest(configured_secret, request_secret):
        return _reject('invalid_secret', 401)

    # ── Layers 3-5: Remote-only checks ──
    if not is_localhost:
        # Layer 3: Anti-replay timestamp
        ts_header = request.headers.get('X-ProxMenux-Timestamp', '')
        if not ts_header:
            return _reject('missing_timestamp', 401)
        try:
            ts_value = int(ts_header)
        except (ValueError, TypeError):
            return _reject('invalid_timestamp', 401)
        if abs(time.time() - ts_value) > _TIMESTAMP_MAX_DRIFT:
            return _reject('timestamp_expired', 401)
        # Layer 4: Replay cache — dedupe on hash of timestamp + raw body.
        raw_body = request.get_data(as_text=True) or ''
        signature = hashlib.sha256(f"{ts_value}:{raw_body}".encode(errors='replace')).hexdigest()
        if _replay_cache.check_and_record(signature):
            return _reject('replay_detected', 409)
        # Layer 5: IP allowlist (best-effort; a config read error must not
        # block otherwise-authenticated deliveries)
        try:
            allowed_ips = notification_manager.get_webhook_allowed_ips()
            if allowed_ips and client_ip not in allowed_ips:
                return _reject('forbidden_ip', 403)
        except Exception:
            pass

    # ── Parse and process payload ──
    try:
        # Accept JSON first, fall back to form-encoded bodies.
        payload = request.get_json(silent=True) or {}
        if not payload:
            payload = dict(request.form)
        if not payload:
            return _reject('invalid_payload', 400)
        result = notification_manager.process_webhook(payload)
        status_code = 200 if result.get('accepted') else 400
        return jsonify(result), status_code
    except Exception:
        # Never leak internal error details to the caller.
        return jsonify({'accepted': False, 'error': 'internal_error'}), 500

View File

@@ -130,6 +130,15 @@ class HealthPersistence:
) )
''') ''')
# Notification cooldown persistence (survives restarts)
cursor.execute('''
CREATE TABLE IF NOT EXISTS notification_last_sent (
fingerprint TEXT PRIMARY KEY,
last_sent_ts INTEGER NOT NULL,
count INTEGER DEFAULT 1
)
''')
# Migration: add suppression_hours column to errors if not present # Migration: add suppression_hours column to errors if not present
cursor.execute("PRAGMA table_info(errors)") cursor.execute("PRAGMA table_info(errors)")
columns = [col[1] for col in cursor.fetchall()] columns = [col[1] for col in cursor.fetchall()]
@@ -143,6 +152,7 @@ class HealthPersistence:
cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_sent_at ON notification_history(sent_at)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_sent_at ON notification_history(sent_at)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)')
conn.commit() conn.commit()
conn.close() conn.close()

View File

@@ -311,7 +311,14 @@ class DiscordChannel(NotificationChannel):
'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()), 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
} }
if data: # Use structured fields from render_template if available
rendered_fields = (data or {}).get('_rendered_fields', [])
if rendered_fields:
embed['fields'] = [
{'name': name, 'value': val[:1024], 'inline': True}
for name, val in rendered_fields[:25] # Discord limit: 25 fields
]
elif data:
fields = [] fields = []
if data.get('category'): if data.get('category'):
fields.append({'name': 'Category', 'value': data['category'], 'inline': True}) fields.append({'name': 'Category', 'value': data['category'], 'inline': True})
@@ -351,6 +358,164 @@ class DiscordChannel(NotificationChannel):
) )
# ─── Email Channel ──────────────────────────────────────────────
class EmailChannel(NotificationChannel):
    """Email notification channel using SMTP (smtplib) or sendmail fallback.

    Config keys:
        host, port, username, password, tls_mode (none|starttls|ssl),
        from_address, to_addresses (comma-separated), subject_prefix, timeout

    If no SMTP host is configured, delivery falls back to the local MTA
    via /usr/sbin/sendmail.
    """

    def __init__(self, config: Dict[str, str]):
        super().__init__()
        self.host = config.get('host', '')
        self.port = self._to_int(config.get('port'), 587)
        self.username = config.get('username', '')
        self.password = config.get('password', '')
        self.tls_mode = config.get('tls_mode', 'starttls')  # none | starttls | ssl
        self.from_address = config.get('from_address', '')
        self.to_addresses = self._parse_recipients(config.get('to_addresses', ''))
        self.subject_prefix = config.get('subject_prefix', '[ProxMenux]')
        self.timeout = self._to_int(config.get('timeout'), 10)

    @staticmethod
    def _to_int(raw, default: int) -> int:
        """Coerce a config value to int; fall back to `default` on empty or invalid input.

        The previous inline `int(value or default)` raised ValueError on junk
        strings, which aborted channel construction entirely.
        """
        try:
            return int(raw) if raw not in (None, '') else default
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _parse_recipients(raw) -> list:
        """Accept a list or a comma-separated string; return stripped non-empty addresses."""
        if isinstance(raw, list):
            return [a.strip() for a in raw if a.strip()]
        return [addr.strip() for addr in str(raw).split(',') if addr.strip()]

    def validate_config(self) -> Tuple[bool, str]:
        """Return (ok, error_message) describing whether this channel can send."""
        if not self.to_addresses:
            return False, 'No recipients configured'
        if not self.from_address:
            return False, 'No from address configured'
        # Must have SMTP host OR local sendmail available
        if not self.host:
            import os
            if not os.path.exists('/usr/sbin/sendmail'):
                return False, 'No SMTP host configured and /usr/sbin/sendmail not found'
        return True, ''

    def send(self, title: str, message: str, severity: str = 'INFO',
             data: Optional[Dict] = None) -> Dict[str, Any]:
        """Send an email. Chooses SMTP when a host is configured, else sendmail.

        Delegates to the base class retry wrapper; returns its result dict.
        """
        subject = f"{self.subject_prefix} [{severity}] {title}"

        def _do_send():
            if self.host:
                return self._send_smtp(subject, message, severity)
            return self._send_sendmail(subject, message, severity)

        return self._send_with_retry(_do_send)

    def _send_smtp(self, subject: str, body: str, severity: str) -> Tuple[int, str]:
        """Deliver via SMTP. Returns (200, 'OK') on success or (0, error_text)."""
        import smtplib
        from email.message import EmailMessage

        msg = EmailMessage()
        msg['Subject'] = subject
        msg['From'] = self.from_address
        msg['To'] = ', '.join(self.to_addresses)
        msg.set_content(body)
        # Add HTML alternative
        html_body = self._format_html(subject, body, severity)
        if html_body:
            msg.add_alternative(html_body, subtype='html')

        smtp_cls = smtplib.SMTP_SSL if self.tls_mode == 'ssl' else smtplib.SMTP
        try:
            # Context manager guarantees the socket is closed even when
            # starttls/login/send_message raise (previously the connection
            # leaked on any error after connect).
            with smtp_cls(self.host, self.port, timeout=self.timeout) as server:
                if self.tls_mode == 'starttls':
                    server.starttls()
                if self.username and self.password:
                    server.login(self.username, self.password)
                server.send_message(msg)
            return 200, 'OK'
        except smtplib.SMTPAuthenticationError as e:
            return 0, f'SMTP authentication failed: {e}'
        except smtplib.SMTPConnectError as e:
            return 0, f'SMTP connection failed: {e}'
        except smtplib.SMTPException as e:
            return 0, f'SMTP error: {e}'
        except (OSError, TimeoutError) as e:
            return 0, f'Connection error: {e}'

    def _send_sendmail(self, subject: str, body: str, severity: str) -> Tuple[int, str]:
        """Deliver via local /usr/sbin/sendmail (plain text only, 30s timeout)."""
        import os
        import subprocess
        from email.message import EmailMessage

        sendmail = '/usr/sbin/sendmail'
        if not os.path.exists(sendmail):
            return 0, 'sendmail not found at /usr/sbin/sendmail'

        msg = EmailMessage()
        msg['Subject'] = subject
        msg['From'] = self.from_address or 'proxmenux@localhost'
        msg['To'] = ', '.join(self.to_addresses)
        msg.set_content(body)
        try:
            # -t: read recipients from headers; -oi: don't treat lone '.' as EOF
            proc = subprocess.run(
                [sendmail, '-t', '-oi'],
                input=msg.as_string(), capture_output=True, text=True, timeout=30
            )
            if proc.returncode == 0:
                return 200, 'OK'
            return 0, f'sendmail failed (rc={proc.returncode}): {proc.stderr[:200]}'
        except subprocess.TimeoutExpired:
            return 0, 'sendmail timed out after 30s'
        except Exception as e:
            return 0, f'sendmail error: {e}'

    @staticmethod
    def _format_html(subject: str, body: str, severity: str) -> str:
        """Create professional HTML email (severity-colored header, escaped body)."""
        import html as html_mod
        severity_colors = {'CRITICAL': '#dc2626', 'WARNING': '#f59e0b', 'INFO': '#3b82f6'}
        color = severity_colors.get(severity, '#6b7280')
        body_html = ''.join(
            f'<p style="margin:4px 0;color:#374151;">{html_mod.escape(line)}</p>'
            for line in body.split('\n') if line.strip()
        )
        return f'''<!DOCTYPE html>
<html><body style="font-family:-apple-system,Arial,sans-serif;background:#f3f4f6;padding:20px;">
<div style="max-width:600px;margin:0 auto;background:#fff;border-radius:8px;overflow:hidden;">
<div style="background:{color};padding:16px 24px;">
<h2 style="color:#fff;margin:0;font-size:16px;">ProxMenux Monitor</h2>
<p style="color:rgba(255,255,255,0.85);margin:4px 0 0;font-size:13px;">{html_mod.escape(severity)} Alert</p>
</div>
<div style="padding:24px;">
<h3 style="margin:0 0 12px;color:#111827;">{html_mod.escape(subject)}</h3>
{body_html}
</div>
<div style="background:#f9fafb;padding:12px 24px;border-top:1px solid #e5e7eb;">
<p style="margin:0;font-size:11px;color:#9ca3af;">Sent by ProxMenux Notification Service</p>
</div>
</div>
</body></html>'''

    def test(self) -> Tuple[bool, str]:
        """Send a fixed test message; return (success, error_text)."""
        result = self.send(
            'ProxMenux Test Notification',
            'This is a test notification from ProxMenux Monitor.\n'
            'If you received this, your email channel is working correctly.',
            'INFO'
        )
        return result.get('success', False), result.get('error', '')
# ─── Channel Factory ───────────────────────────────────────────── # ─── Channel Factory ─────────────────────────────────────────────
CHANNEL_TYPES = { CHANNEL_TYPES = {
@@ -369,6 +534,12 @@ CHANNEL_TYPES = {
'config_keys': ['webhook_url'], 'config_keys': ['webhook_url'],
'class': DiscordChannel, 'class': DiscordChannel,
}, },
'email': {
'name': 'Email (SMTP)',
'config_keys': ['host', 'port', 'username', 'password', 'tls_mode',
'from_address', 'to_addresses', 'subject_prefix'],
'class': EmailChannel,
},
} }
@@ -397,6 +568,8 @@ def create_channel(channel_type: str, config: Dict[str, str]) -> Optional[Notifi
return DiscordChannel( return DiscordChannel(
webhook_url=config.get('webhook_url', '') webhook_url=config.get('webhook_url', '')
) )
elif channel_type == 'email':
return EmailChannel(config)
except Exception as e: except Exception as e:
print(f"[NotificationChannels] Failed to create {channel_type}: {e}") print(f"[NotificationChannels] Failed to create {channel_type}: {e}")
return None return None

View File

@@ -16,32 +16,70 @@ import os
import re import re
import json import json
import time import time
import hashlib
import socket import socket
import subprocess import subprocess
import threading import threading
from queue import Queue from queue import Queue
from typing import Optional, Dict, Any from typing import Optional, Dict, Any, Tuple
from pathlib import Path from pathlib import Path
# ─── Event Object ───────────────────────────────────────────────── # ─── Event Object ─────────────────────────────────────────────────
class NotificationEvent: class NotificationEvent:
"""Represents a detected event ready for notification dispatch.""" """Represents a detected event ready for notification dispatch.
__slots__ = ('event_type', 'severity', 'data', 'timestamp', 'source') Fields:
event_type: Taxonomy key (e.g. 'vm_fail', 'auth_fail', 'split_brain')
severity: INFO | WARNING | CRITICAL
data: Payload dict with context (hostname, vmid, reason, etc.)
source: Origin: journal | tasks | health | proxmox_hook | cli | api | polling
entity: What is affected: node | vm | ct | storage | disk | network | cluster | user
entity_id: Specific identifier (vmid, IP, device, pool, interface, etc.)
raw: Original payload (webhook JSON or log line), optional
fingerprint: Stable dedup key: hostname:entity:entity_id:event_type
event_id: Short hash of fingerprint for correlation
ts_epoch: time.time() at creation
ts_monotonic: time.monotonic() at creation (drift-safe for cooldown)
"""
__slots__ = (
'event_type', 'severity', 'data', 'timestamp', 'source',
'entity', 'entity_id', 'raw',
'fingerprint', 'event_id', 'ts_epoch', 'ts_monotonic',
)
def __init__(self, event_type: str, severity: str = 'INFO', def __init__(self, event_type: str, severity: str = 'INFO',
data: Optional[Dict[str, Any]] = None, data: Optional[Dict[str, Any]] = None,
source: str = 'watcher'): source: str = 'watcher',
entity: str = 'node', entity_id: str = '',
raw: Any = None):
self.event_type = event_type self.event_type = event_type
self.severity = severity self.severity = severity
self.data = data or {} self.data = data or {}
self.timestamp = time.time()
self.source = source self.source = source
self.entity = entity
self.entity_id = entity_id
self.raw = raw
self.ts_epoch = time.time()
self.ts_monotonic = time.monotonic()
self.timestamp = self.ts_epoch # backward compat
# Build fingerprint for dedup/cooldown
hostname = self.data.get('hostname', _hostname())
if entity_id:
fp_base = f"{hostname}:{entity}:{entity_id}:{event_type}"
else:
# When entity_id is empty, include a hash of title/body for uniqueness
reason = self.data.get('reason', self.data.get('title', ''))
stable_extra = hashlib.md5(reason.encode(errors='replace')).hexdigest()[:8] if reason else ''
fp_base = f"{hostname}:{entity}:{event_type}:{stable_extra}"
self.fingerprint = fp_base
self.event_id = hashlib.md5(fp_base.encode()).hexdigest()[:12]
def __repr__(self): def __repr__(self):
return f"NotificationEvent({self.event_type}, {self.severity})" return f"NotificationEvent({self.event_type}, {self.severity}, fp={self.fingerprint[:40]})"
def _hostname() -> str: def _hostname() -> str:
@@ -186,7 +224,7 @@ class JournalWatcher:
'username': username, 'username': username,
'service': service, 'service': service,
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='user', entity_id=source_ip)
return return
def _check_fail2ban(self, msg: str, syslog_id: str): def _check_fail2ban(self, msg: str, syslog_id: str):
@@ -206,7 +244,7 @@ class JournalWatcher:
'jail': jail, 'jail': jail,
'failures': '', 'failures': '',
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='user', entity_id=ip)
def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int): def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
"""Detect kernel panics, OOM, segfaults, hardware errors.""" """Detect kernel panics, OOM, segfaults, hardware errors."""
@@ -227,13 +265,17 @@ class JournalWatcher:
for pattern, (event_type, severity, reason) in critical_patterns.items(): for pattern, (event_type, severity, reason) in critical_patterns.items():
if re.search(pattern, msg, re.IGNORECASE): if re.search(pattern, msg, re.IGNORECASE):
data = {'reason': reason, 'hostname': self._hostname} data = {'reason': reason, 'hostname': self._hostname}
entity = 'node'
entity_id = ''
# Try to extract device for disk errors # Try to extract device for disk errors
dev_match = re.search(r'dev\s+(\S+)', msg) dev_match = re.search(r'dev\s+(\S+)', msg)
if dev_match and event_type == 'disk_io_error': if dev_match and event_type == 'disk_io_error':
data['device'] = dev_match.group(1) data['device'] = dev_match.group(1)
entity = 'disk'
entity_id = dev_match.group(1)
self._emit(event_type, severity, data) self._emit(event_type, severity, data, entity=entity, entity_id=entity_id)
return return
def _check_service_failure(self, msg: str, unit: str): def _check_service_failure(self, msg: str, unit: str):
@@ -252,7 +294,7 @@ class JournalWatcher:
'service_name': service_name, 'service_name': service_name,
'reason': msg[:200], 'reason': msg[:200],
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='node', entity_id=service_name)
return return
def _check_disk_io(self, msg: str, syslog_id: str, priority: int): def _check_disk_io(self, msg: str, syslog_id: str, priority: int):
@@ -275,7 +317,7 @@ class JournalWatcher:
'device': device, 'device': device,
'reason': msg[:200], 'reason': msg[:200],
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='disk', entity_id=device)
return return
def _check_cluster_events(self, msg: str, syslog_id: str): def _check_cluster_events(self, msg: str, syslog_id: str):
@@ -293,7 +335,7 @@ class JournalWatcher:
'quorum': quorum, 'quorum': quorum,
'reason': msg[:200], 'reason': msg[:200],
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='cluster', entity_id=self._hostname)
return return
# Node disconnect # Node disconnect
@@ -306,7 +348,7 @@ class JournalWatcher:
self._emit('node_disconnect', 'CRITICAL', { self._emit('node_disconnect', 'CRITICAL', {
'node_name': node_name, 'node_name': node_name,
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='cluster', entity_id=node_name)
def _check_system_shutdown(self, msg: str, syslog_id: str): def _check_system_shutdown(self, msg: str, syslog_id: str):
"""Detect system shutdown/reboot.""" """Detect system shutdown/reboot."""
@@ -315,13 +357,13 @@ class JournalWatcher:
self._emit('system_shutdown', 'WARNING', { self._emit('system_shutdown', 'WARNING', {
'reason': 'System journal stopped', 'reason': 'System journal stopped',
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='node', entity_id='')
elif 'Shutting down' in msg or 'System is rebooting' in msg: elif 'Shutting down' in msg or 'System is rebooting' in msg:
event = 'system_reboot' if 'reboot' in msg.lower() else 'system_shutdown' event = 'system_reboot' if 'reboot' in msg.lower() else 'system_shutdown'
self._emit(event, 'WARNING', { self._emit(event, 'WARNING', {
'reason': msg[:200], 'reason': msg[:200],
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='node', entity_id='')
def _check_permission_change(self, msg: str, syslog_id: str): def _check_permission_change(self, msg: str, syslog_id: str):
"""Detect user permission changes in PVE.""" """Detect user permission changes in PVE."""
@@ -341,7 +383,7 @@ class JournalWatcher:
'username': username, 'username': username,
'change_details': action, 'change_details': action,
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='user', entity_id=username)
return return
def _check_firewall(self, msg: str, syslog_id: str): def _check_firewall(self, msg: str, syslog_id: str):
@@ -350,20 +392,24 @@ class JournalWatcher:
self._emit('firewall_issue', 'WARNING', { self._emit('firewall_issue', 'WARNING', {
'reason': msg[:200], 'reason': msg[:200],
'hostname': self._hostname, 'hostname': self._hostname,
}) }, entity='network', entity_id='')
# ── Emit helper ── # ── Emit helper ──
def _emit(self, event_type: str, severity: str, data: Dict): def _emit(self, event_type: str, severity: str, data: Dict,
"""Emit event to queue with deduplication.""" entity: str = 'node', entity_id: str = ''):
dedup_key = f"{event_type}:{data.get('source_ip', '')}:{data.get('device', '')}:{data.get('service_name', '')}" """Emit event to queue with short-term deduplication (30s window)."""
event = NotificationEvent(
event_type, severity, data, source='journal',
entity=entity, entity_id=entity_id,
)
now = time.time() now = time.time()
last = self._recent_events.get(dedup_key, 0) last = self._recent_events.get(event.fingerprint, 0)
if now - last < self._dedup_window: if now - last < self._dedup_window:
return # Skip duplicate return # Skip duplicate within 30s window
self._recent_events[dedup_key] = now self._recent_events[event.fingerprint] = now
# Cleanup old dedup entries periodically # Cleanup old dedup entries periodically
if len(self._recent_events) > 200: if len(self._recent_events) > 200:
@@ -372,7 +418,7 @@ class JournalWatcher:
k: v for k, v in self._recent_events.items() if v > cutoff k: v for k, v in self._recent_events.items() if v > cutoff
} }
self._queue.put(NotificationEvent(event_type, severity, data, source='journal')) self._queue.put(event)
# ─── Task Watcher (Real-time) ──────────────────────────────────── # ─── Task Watcher (Real-time) ────────────────────────────────────
@@ -522,7 +568,12 @@ class TaskWatcher:
'snapshot_name': '', 'snapshot_name': '',
} }
self._queue.put(NotificationEvent(event_type, severity, data, source='task')) # Determine entity type from task type
entity = 'ct' if task_type.startswith('vz') else 'vm'
self._queue.put(NotificationEvent(
event_type, severity, data, source='tasks',
entity=entity, entity_id=vmid,
))
def _get_vm_name(self, vmid: str) -> str: def _get_vm_name(self, vmid: str) -> str:
"""Try to resolve VMID to name via config files.""" """Try to resolve VMID to name via config files."""
@@ -628,8 +679,18 @@ class PollingCollector:
data['hostname'] = self._hostname data['hostname'] = self._hostname
data['error_key'] = evt.get('error_key', '') data['error_key'] = evt.get('error_key', '')
# Deduce entity from health category
category = data.get('category', '')
entity_map = {
'cpu': ('node', ''), 'memory': ('node', ''),
'disk': ('storage', ''), 'network': ('network', ''),
'pve_services': ('node', ''), 'security': ('user', ''),
'updates': ('node', ''), 'storage': ('storage', ''),
}
entity, eid = entity_map.get(category, ('node', ''))
self._queue.put(NotificationEvent( self._queue.put(NotificationEvent(
event_type, severity, data, source='health_monitor' event_type, severity, data, source='health',
entity=entity, entity_id=eid or data.get('error_key', ''),
)) ))
# Mark events as notified # Mark events as notified
@@ -641,14 +702,18 @@ class PollingCollector:
# Also check unnotified errors # Also check unnotified errors
unnotified = health_persistence.get_unnotified_errors() unnotified = health_persistence.get_unnotified_errors()
for error in unnotified: for error in unnotified:
err_cat = error.get('category', '')
e_entity, e_eid = entity_map.get(err_cat, ('node', ''))
self._queue.put(NotificationEvent( self._queue.put(NotificationEvent(
'new_error', error.get('severity', 'WARNING'), { 'new_error', error.get('severity', 'WARNING'), {
'category': error.get('category', ''), 'category': err_cat,
'reason': error.get('reason', ''), 'reason': error.get('reason', ''),
'hostname': self._hostname, 'hostname': self._hostname,
'error_key': error.get('error_key', ''), 'error_key': error.get('error_key', ''),
}, },
source='health_monitor' source='health',
entity=e_entity,
entity_id=e_eid or error.get('error_key', ''),
)) ))
# Mark as notified # Mark as notified
if 'id' in error: if 'id' in error:
@@ -692,7 +757,139 @@ class PollingCollector:
'details': details, 'details': details,
'hostname': self._hostname, 'hostname': self._hostname,
}, },
source='polling' source='polling',
entity='node', entity_id='',
)) ))
except Exception: except Exception:
pass # Non-critical, silently skip pass # Non-critical, silently skip
# ─── Proxmox Webhook Receiver ───────────────────────────────────
class ProxmoxHookWatcher:
    """Receives native Proxmox VE notifications via local webhook endpoint.

    Proxmox can be configured to send notifications to a webhook target:

        pvesh create /cluster/notifications/endpoints/webhook/proxmenux \\
            --url http://127.0.0.1:8008/api/notifications/webhook \\
            --method POST

    Payload shape varies by source (storage, replication, cluster, PBS, apt);
    this class normalizes each into a NotificationEvent and queues it.
    """

    def __init__(self, event_queue: Queue):
        self._queue = event_queue
        self._hostname = _hostname()

    def process_webhook(self, payload: dict) -> dict:
        """Normalize and enqueue one incoming Proxmox webhook payload.

        Returns {'accepted': True, 'event_type': ..., 'event_id': ...} on
        success, or {'accepted': False, 'error': ...} for an empty payload.
        """
        if not payload:
            return {'accepted': False, 'error': 'Empty payload'}

        # PVE payloads use slightly different key names per source; try both.
        ntype = payload.get('type', payload.get('notification-type', ''))
        raw_severity = payload.get('severity', payload.get('priority', 'info'))
        subject = payload.get('title', payload.get('subject', ''))
        body_text = payload.get('body', payload.get('message', ''))
        component = payload.get('component', payload.get('source', ''))

        event_type, entity, entity_id = self._classify(
            ntype, component, subject, body_text, payload
        )

        context = {
            'hostname': self._hostname,
            'reason': body_text[:500] if body_text else subject,
            'title': subject,
            'source_component': component,
            'notification_type': ntype,
        }
        # Carry through well-known identifiers when present
        for extra in ('vmid', 'node', 'storage', 'device', 'pool'):
            if extra in payload:
                context[extra] = str(payload[extra])

        event = NotificationEvent(
            event_type=event_type,
            severity=self._map_severity(raw_severity),
            data=context,
            source='proxmox_hook',
            entity=entity,
            entity_id=entity_id,
            raw=payload,
        )
        self._queue.put(event)
        return {'accepted': True, 'event_type': event_type, 'event_id': event.event_id}

    def _classify(self, ntype: str, component: str, title: str,
                  body: str, payload: dict) -> tuple:
        """Map a webhook payload onto (event_type, entity, entity_id).

        Checks are ordered most-specific first; matching is substring-based
        on lowercased component/title/body.
        """
        t = (title or '').lower()
        b = (body or '').lower()
        c = (component or '').lower()

        def mentions(haystack, *needles):
            # True when any needle appears as a substring of haystack
            return any(needle in haystack for needle in needles)

        # Storage / SMART / ZFS / Ceph
        if mentions(c, 'smart', 'disk', 'zfs', 'ceph'):
            target = str(payload.get('device', payload.get('pool', '')))
            if 'smart' in t or 'smart' in b:
                return 'disk_io_error', 'disk', target
            if 'zfs' in t:
                return 'disk_io_error', 'storage', target
            return 'disk_space_low', 'storage', target

        # Replication
        if 'replication' in c or 'replication' in t:
            vm = str(payload.get('vmid', ''))
            if 'fail' in t or 'error' in b:
                return 'vm_fail', 'vm', vm
            return 'migration_complete', 'vm', vm

        # PBS (Proxmox Backup Server)
        if 'pbs' in c or 'backup' in c:
            vm = str(payload.get('vmid', ''))
            if 'fail' in t or 'error' in b:
                return 'backup_fail', 'vm', vm
            if 'complete' in t or 'success' in b:
                return 'backup_complete', 'vm', vm
            return 'backup_start', 'vm', vm

        # Cluster / HA / Fencing / Corosync — quorum loss is split-brain,
        # everything else (incl. fencing) maps to node_disconnect
        if mentions(c, 'cluster', 'ha', 'fencing', 'corosync'):
            node = str(payload.get('node', ''))
            if 'quorum' in t or 'split' in b:
                return 'split_brain', 'cluster', node
            return 'node_disconnect', 'cluster', node

        # APT / Updates
        if 'apt' in c or 'update' in t:
            return 'update_available', 'node', ''

        # Network
        if 'network' in c:
            return 'network_down', 'network', ''

        # Security
        if mentions(c, 'auth', 'firewall', 'security'):
            return 'auth_fail', 'user', ''

        # Fallback: generic system problem
        return 'system_problem', 'node', ''

    @staticmethod
    def _map_severity(raw: str) -> str:
        """Translate PVE severity/priority labels into INFO/WARNING/CRITICAL."""
        label = str(raw).lower()
        if label in {'critical', 'emergency', 'alert', 'crit', 'err', 'error'}:
            return 'CRITICAL'
        if label in {'warning', 'warn'}:
            return 'WARNING'
        return 'INFO'

View File

@@ -39,7 +39,8 @@ from notification_templates import (
EVENT_GROUPS, get_event_types_by_group, get_default_enabled_events EVENT_GROUPS, get_event_types_by_group, get_default_enabled_events
) )
from notification_events import ( from notification_events import (
JournalWatcher, TaskWatcher, PollingCollector, NotificationEvent JournalWatcher, TaskWatcher, PollingCollector, NotificationEvent,
ProxmoxHookWatcher,
) )
@@ -50,7 +51,7 @@ SETTINGS_PREFIX = 'notification.'
# Cooldown defaults (seconds) # Cooldown defaults (seconds)
DEFAULT_COOLDOWNS = { DEFAULT_COOLDOWNS = {
'CRITICAL': 0, # No cooldown for critical 'CRITICAL': 60, # 60s minimum (prevents storm, delivers fast)
'WARNING': 300, # 5 min 'WARNING': 300, # 5 min
'INFO': 900, # 15 min 'INFO': 900, # 15 min
'resources': 900, # 15 min for resource alerts 'resources': 900, # 15 min for resource alerts
@@ -58,6 +59,191 @@ DEFAULT_COOLDOWNS = {
} }
# ─── Storm Protection ────────────────────────────────────────────
# Per-group ceilings on outbound notifications (events/minute, events/hour).
# Unknown groups fall back to the 'system' limits.
GROUP_RATE_LIMITS = {
    'security': {'max_per_minute': 5, 'max_per_hour': 30},
    'storage': {'max_per_minute': 3, 'max_per_hour': 20},
    'cluster': {'max_per_minute': 5, 'max_per_hour': 20},
    'network': {'max_per_minute': 3, 'max_per_hour': 15},
    'resources': {'max_per_minute': 3, 'max_per_hour': 20},
    'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60},
    'backup': {'max_per_minute': 5, 'max_per_hour': 30},
    'system': {'max_per_minute': 5, 'max_per_hour': 30},
}


class GroupRateLimiter:
    """Sliding-window rate limiter per event group. Prevents notification storms.

    Each group tracks two timestamp windows (60s and 3600s); an event is
    allowed only while both windows are under their GROUP_RATE_LIMITS caps.
    NOTE(review): not internally synchronized — assumes callers serialize
    access (the dispatch loop is single-threaded); confirm before sharing.
    """

    def __init__(self):
        # group -> deque of send timestamps (epoch seconds), pruned lazily in allow()
        self._minute_counts: Dict[str, Any] = {}
        self._hour_counts: Dict[str, Any] = {}

    def allow(self, group: str) -> bool:
        """Return True and record the event if `group` is under both limits.

        A rejected event is NOT recorded, so it does not extend the window.
        """
        # Local import mirrors the original block's scoping; the class object
        # was previously stashed as an instance attribute, which was confusing.
        from collections import deque
        limits = GROUP_RATE_LIMITS.get(group, GROUP_RATE_LIMITS['system'])
        now = time.time()

        minute_q = self._minute_counts.setdefault(group, deque())
        hour_q = self._hour_counts.setdefault(group, deque())

        # Prune entries that fell out of each window
        while minute_q and now - minute_q[0] > 60:
            minute_q.popleft()
        while hour_q and now - hour_q[0] > 3600:
            hour_q.popleft()

        # Check limits
        if len(minute_q) >= limits['max_per_minute']:
            return False
        if len(hour_q) >= limits['max_per_hour']:
            return False

        # Record
        minute_q.append(now)
        hour_q.append(now)
        return True

    def get_stats(self) -> Dict[str, Dict[str, int]]:
        """Return {'group': {'last_minute': n, 'last_hour': n}} without mutating queues."""
        now = time.time()
        stats = {}
        for group in self._minute_counts:
            minute_q = self._minute_counts.get(group, [])
            hour_q = self._hour_counts.get(group, [])
            stats[group] = {
                'last_minute': sum(1 for t in minute_q if now - t <= 60),
                'last_hour': sum(1 for t in hour_q if now - t <= 3600),
            }
        return stats
# Burst-aggregation rules consumed by BurstAggregator.ingest/flush_expired:
#   window     — seconds to buffer similar events before flushing
#   min_count  — minimum events in the window to emit a summary
#   burst_type — event_type of the synthesized summary notification
# Event types absent from this table are never aggregated (pass-through).
AGGREGATION_RULES = {
    'auth_fail': {'window': 120, 'min_count': 3, 'burst_type': 'burst_auth_fail'},
    'ip_block': {'window': 120, 'min_count': 3, 'burst_type': 'burst_ip_block'},
    'disk_io_error': {'window': 60, 'min_count': 3, 'burst_type': 'burst_disk_io'},
    'split_brain': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'},
    'node_disconnect': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'},
}
class BurstAggregator:
    """Accumulates similar events in a time window, then sends a single summary.

    Examples:
        - "Fail2Ban banned 17 IPs in 2 minutes"
        - "Disk I/O errors: 34 events on /dev/sdb in 60s"

    The first event of a burst passes through immediately (fast alert);
    later events in the window are buffered and collapsed into one summary
    by flush_expired(). Eligibility and windows come from AGGREGATION_RULES.
    """

    def __init__(self):
        self._buckets: Dict[str, List] = {}     # bucket_key -> [events]
        self._deadlines: Dict[str, float] = {}  # bucket_key -> flush deadline (epoch)
        self._lock = threading.Lock()

    def ingest(self, event: NotificationEvent) -> Optional[NotificationEvent]:
        """Add event to aggregation.

        Returns:
            None when the event is buffered (it will surface in a summary),
            or the event itself when it should be dispatched now (type not
            aggregable, or first event of a new burst).
        """
        rule = AGGREGATION_RULES.get(event.event_type)
        if not rule:
            return event  # Not aggregable, pass through

        bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"

        with self._lock:
            if bucket_key not in self._buckets:
                self._buckets[bucket_key] = []
                self._deadlines[bucket_key] = time.time() + rule['window']

            self._buckets[bucket_key].append(event)

            # First event in bucket: pass through immediately so user gets fast alert
            if len(self._buckets[bucket_key]) == 1:
                return event

            # Subsequent events: buffer (will be flushed as summary)
            return None

    def flush_expired(self) -> List[NotificationEvent]:
        """Flush all buckets past their deadline. Returns summary events."""
        now = time.time()
        summaries = []

        with self._lock:
            expired_keys = [k for k, d in self._deadlines.items() if now >= d]
            for key in expired_keys:
                events = self._buckets.pop(key, [])
                del self._deadlines[key]

                if len(events) < 2:
                    continue  # Single event already sent on ingest, no summary needed

                rule_type = key.split(':')[0]
                rule = AGGREGATION_RULES.get(rule_type, {})
                min_count = rule.get('min_count', 2)
                # NOTE(review): bursts with 2..min_count-1 events produce no
                # summary, so events after the first are dropped silently —
                # presumably intentional noise suppression; confirm.
                if len(events) < min_count:
                    continue  # Not enough events for a summary

                summary = self._create_summary(events, rule)
                if summary:
                    summaries.append(summary)

        return summaries

    def _create_summary(self, events: List[NotificationEvent],
                        rule: dict) -> Optional[NotificationEvent]:
        """Create a single summary event from multiple events."""
        if not events:
            return None

        first = events[0]

        # Determine highest severity present in the burst
        sev_order = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2}
        max_severity = max(events, key=lambda e: sev_order.get(e.severity, 0)).severity

        # Collect unique entity_ids, sorted so the summary text is
        # deterministic (set iteration order is arbitrary between runs)
        entity_ids = sorted({e.entity_id for e in events if e.entity_id})
        entity_list = ', '.join(entity_ids[:10]) if entity_ids else 'multiple sources'
        if len(entity_ids) > 10:
            entity_list += f' (+{len(entity_ids) - 10} more)'

        # Calculate window from first to last event timestamp
        window_secs = events[-1].ts_epoch - events[0].ts_epoch
        if window_secs < 120:
            window_str = f'{int(window_secs)}s'
        else:
            window_str = f'{int(window_secs / 60)}m'

        burst_type = rule.get('burst_type', 'burst_generic')
        data = {
            'hostname': first.data.get('hostname', socket.gethostname()),
            'count': str(len(events)),
            'window': window_str,
            'entity_list': entity_list,
            'event_type': first.event_type,
        }
        return NotificationEvent(
            event_type=burst_type,
            severity=max_severity,
            data=data,
            source='aggregator',
            entity=first.entity,
            entity_id='burst',
        )
# ─── Notification Manager ───────────────────────────────────────── # ─── Notification Manager ─────────────────────────────────────────
class NotificationManager: class NotificationManager:
@@ -81,9 +267,17 @@ class NotificationManager:
self._polling_collector: Optional[PollingCollector] = None self._polling_collector: Optional[PollingCollector] = None
self._dispatch_thread: Optional[threading.Thread] = None self._dispatch_thread: Optional[threading.Thread] = None
# Cooldown tracking: {event_type_or_key: last_sent_timestamp} # Webhook receiver (no thread, passive)
self._hook_watcher: Optional[ProxmoxHookWatcher] = None
# Cooldown tracking: {fingerprint: last_sent_timestamp}
self._cooldowns: Dict[str, float] = {} self._cooldowns: Dict[str, float] = {}
# Storm protection
self._group_limiter = GroupRateLimiter()
self._aggregator = BurstAggregator()
self._aggregation_thread: Optional[threading.Thread] = None
# Stats # Stats
self._stats = { self._stats = {
'started_at': None, 'started_at': None,
@@ -180,6 +374,7 @@ class NotificationManager:
return return
self._load_config() self._load_config()
self._load_cooldowns_from_db()
if not self._enabled: if not self._enabled:
print("[NotificationManager] Service is disabled. Skipping start.") print("[NotificationManager] Service is disabled. Skipping start.")
@@ -220,19 +415,48 @@ class NotificationManager:
def _dispatch_loop(self): def _dispatch_loop(self):
"""Main dispatch loop: reads queue -> filters -> formats -> sends -> records.""" """Main dispatch loop: reads queue -> filters -> formats -> sends -> records."""
last_cleanup = time.monotonic()
last_flush = time.monotonic()
cleanup_interval = 3600 # Cleanup cooldowns every hour
flush_interval = 5 # Flush aggregation buckets every 5s
while self._running: while self._running:
try: try:
event = self._event_queue.get(timeout=2) event = self._event_queue.get(timeout=2)
except Empty: except Empty:
# Periodic maintenance during idle
now_mono = time.monotonic()
if now_mono - last_cleanup > cleanup_interval:
self._cleanup_old_cooldowns()
last_cleanup = now_mono
# Flush expired aggregation buckets
if now_mono - last_flush > flush_interval:
self._flush_aggregation()
last_flush = now_mono
continue continue
try: try:
self._process_event(event) self._process_event(event)
except Exception as e: except Exception as e:
print(f"[NotificationManager] Dispatch error: {e}") print(f"[NotificationManager] Dispatch error: {e}")
# Also flush aggregation after each event
if time.monotonic() - last_flush > flush_interval:
self._flush_aggregation()
last_flush = time.monotonic()
def _flush_aggregation(self):
"""Flush expired aggregation buckets and dispatch summaries."""
try:
summaries = self._aggregator.flush_expired()
for summary_event in summaries:
# Burst summaries bypass aggregator but still pass cooldown + rate limit
self._process_event_direct(summary_event)
except Exception as e:
print(f"[NotificationManager] Aggregation flush error: {e}")
def _process_event(self, event: NotificationEvent): def _process_event(self, event: NotificationEvent):
"""Process a single event from the queue.""" """Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch."""
if not self._enabled: if not self._enabled:
return return
@@ -246,14 +470,43 @@ class NotificationManager:
if not self._meets_severity(event.severity, min_severity): if not self._meets_severity(event.severity, min_severity):
return return
# Try aggregation (may buffer the event)
result = self._aggregator.ingest(event)
if result is None:
return # Buffered, will be flushed as summary later
event = result # Use original event (first in burst passes through)
# From here, proceed with dispatch (shared with _process_event_direct)
self._dispatch_event(event)
def _process_event_direct(self, event: NotificationEvent):
"""Process a burst summary event. Bypasses aggregator but applies all other filters."""
if not self._enabled:
return
# Check severity filter
min_severity = self._config.get('filter.min_severity', 'INFO')
if not self._meets_severity(event.severity, min_severity):
return
self._dispatch_event(event)
def _dispatch_event(self, event: NotificationEvent):
"""Shared dispatch pipeline: cooldown -> rate limit -> render -> send."""
# Check cooldown # Check cooldown
if not self._check_cooldown(event): if not self._check_cooldown(event):
return return
# Render message from template # Check group rate limit
template = TEMPLATES.get(event.event_type, {})
group = template.get('group', 'system')
if not self._group_limiter.allow(group):
return
# Render message from template (structured output)
rendered = render_template(event.event_type, event.data) rendered = render_template(event.event_type, event.data)
# Optional AI enhancement # Optional AI enhancement (on text body only)
ai_config = { ai_config = {
'enabled': self._config.get('ai_enabled', 'false'), 'enabled': self._config.get('ai_enabled', 'false'),
'provider': self._config.get('ai_provider', ''), 'provider': self._config.get('ai_provider', ''),
@@ -264,10 +517,15 @@ class NotificationManager:
rendered['title'], rendered['body'], rendered['severity'], ai_config rendered['title'], rendered['body'], rendered['severity'], ai_config
) )
# Enrich data with structured fields for channels that support them
enriched_data = dict(event.data)
enriched_data['_rendered_fields'] = rendered.get('fields', [])
enriched_data['_body_html'] = rendered.get('body_html', '')
# Send through all active channels # Send through all active channels
self._dispatch_to_channels( self._dispatch_to_channels(
rendered['title'], body, rendered['severity'], rendered['title'], body, rendered['severity'],
event.event_type, event.data, event.source event.event_type, enriched_data, event.source
) )
def _dispatch_to_channels(self, title: str, body: str, severity: str, def _dispatch_to_channels(self, title: str, body: str, severity: str,
@@ -323,20 +581,67 @@ class NotificationManager:
else: else:
cooldown = DEFAULT_COOLDOWNS.get(event.severity, 300) cooldown = DEFAULT_COOLDOWNS.get(event.severity, 300)
# CRITICAL events have zero cooldown by default # CRITICAL events: 60s minimum cooldown (prevents storm, but delivers fast)
if event.severity == 'CRITICAL' and cooldown_str is None: if event.severity == 'CRITICAL' and cooldown_str is None:
cooldown = 0 cooldown = 60
# Check against last sent time # Check against last sent time using stable fingerprint
dedup_key = f"{event.event_type}:{event.data.get('category', '')}:{event.data.get('vmid', '')}" last_sent = self._cooldowns.get(event.fingerprint, 0)
last_sent = self._cooldowns.get(dedup_key, 0)
if now - last_sent < cooldown: if now - last_sent < cooldown:
return False return False
self._cooldowns[dedup_key] = now self._cooldowns[event.fingerprint] = now
self._persist_cooldown(event.fingerprint, now)
return True return True
def _load_cooldowns_from_db(self):
"""Load persistent cooldown state from SQLite (up to 48h)."""
try:
if not DB_PATH.exists():
return
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
cursor = conn.cursor()
cursor.execute('SELECT fingerprint, last_sent_ts FROM notification_last_sent')
now = time.time()
for fp, ts in cursor.fetchall():
if now - ts < 172800: # 48h window
self._cooldowns[fp] = ts
conn.close()
except Exception as e:
print(f"[NotificationManager] Failed to load cooldowns: {e}")
def _persist_cooldown(self, fingerprint: str, ts: float):
"""Save cooldown timestamp to SQLite for restart persistence."""
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=5000')
conn.execute('''
INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count)
VALUES (?, ?, COALESCE(
(SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1
))
''', (fingerprint, int(ts), fingerprint))
conn.commit()
conn.close()
except Exception:
pass # Non-critical, in-memory cooldown still works
def _cleanup_old_cooldowns(self):
"""Remove cooldown entries older than 48h from both memory and DB."""
cutoff = time.time() - 172800 # 48h
self._cooldowns = {k: v for k, v in self._cooldowns.items() if v > cutoff}
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('DELETE FROM notification_last_sent WHERE last_sent_ts < ?', (int(cutoff),))
conn.commit()
conn.close()
except Exception:
pass
@staticmethod @staticmethod
def _meets_severity(event_severity: str, min_severity: str) -> bool: def _meets_severity(event_severity: str, min_severity: str) -> bool:
"""Check if event severity meets the minimum threshold.""" """Check if event severity meets the minimum threshold."""
@@ -487,6 +792,31 @@ class NotificationManager:
'results': results, 'results': results,
} }
# ─── Proxmox Webhook ──────────────────────────────────────────
def process_webhook(self, payload: dict) -> dict:
"""Process incoming Proxmox webhook. Delegates to ProxmoxHookWatcher."""
if not self._hook_watcher:
self._hook_watcher = ProxmoxHookWatcher(self._event_queue)
return self._hook_watcher.process_webhook(payload)
def get_webhook_secret(self) -> str:
"""Get configured webhook secret, or empty string if none."""
if not self._config:
self._load_config()
return self._config.get('webhook_secret', '')
def get_webhook_allowed_ips(self) -> list:
"""Get list of allowed IPs for webhook, or empty list (allow all)."""
if not self._config:
self._load_config()
raw = self._config.get('webhook_allowed_ips', '')
if not raw:
return []
return [ip.strip() for ip in str(raw).split(',') if ip.strip()]
# ─── Status & Settings ──────────────────────────────────────
def get_status(self) -> Dict[str, Any]: def get_status(self) -> Dict[str, Any]:
"""Get current service status.""" """Get current service status."""
if not self._config: if not self._config:
@@ -618,6 +948,8 @@ class NotificationManager:
'event_groups': EVENT_GROUPS, 'event_groups': EVENT_GROUPS,
'event_types': get_event_types_by_group(), 'event_types': get_event_types_by_group(),
'default_events': get_default_enabled_events(), 'default_events': get_default_enabled_events(),
'webhook_secret': self._config.get('webhook_secret', ''),
'webhook_allowed_ips': self._config.get('webhook_allowed_ips', ''),
} }
def save_settings(self, settings: Dict[str, str]) -> Dict[str, Any]: def save_settings(self, settings: Dict[str, str]) -> Dict[str, Any]:

View File

@@ -313,6 +313,38 @@ TEMPLATES = {
'group': 'system', 'group': 'system',
'default_enabled': False, 'default_enabled': False,
}, },
# ── Burst aggregation summaries ──
'burst_auth_fail': {
'title': '{hostname}: {count} auth failures in {window}',
'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}',
'group': 'security',
'default_enabled': True,
},
'burst_ip_block': {
'title': '{hostname}: Fail2Ban banned {count} IPs in {window}',
'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}',
'group': 'security',
'default_enabled': True,
},
'burst_disk_io': {
'title': '{hostname}: {count} disk I/O errors on {entity_list}',
'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}',
'group': 'storage',
'default_enabled': True,
},
'burst_cluster': {
'title': '{hostname}: Cluster flapping detected ({count} changes)',
'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}',
'group': 'cluster',
'default_enabled': True,
},
'burst_generic': {
'title': '{hostname}: {count} {event_type} events in {window}',
'body': '{count} events of type {event_type} in {window}.\n{entity_list}',
'group': 'system',
'default_enabled': True,
},
} }
# ─── Event Groups (for UI filtering) ───────────────────────────── # ─── Event Groups (for UI filtering) ─────────────────────────────
@@ -339,23 +371,24 @@ def _get_hostname() -> str:
return 'proxmox' return 'proxmox'
def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, str]: def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
"""Render a template with the given data. """Render a template into a structured notification object.
Args: Returns structured output usable by all channels:
event_type: Key from TEMPLATES dict title, body (text), body_text, body_html (escaped), fields, tags, severity, group
data: Variables to fill into the template
Returns:
{'title': rendered_title, 'body': rendered_body, 'severity': severity}
""" """
import html as html_mod
template = TEMPLATES.get(event_type) template = TEMPLATES.get(event_type)
if not template: if not template:
# Fallback for unknown event types fallback_body = data.get('message', data.get('reason', str(data)))
severity = data.get('severity', 'INFO')
return { return {
'title': f"{_get_hostname()}: {event_type}", 'title': f"{_get_hostname()}: {event_type}",
'body': data.get('message', data.get('reason', str(data))), 'body': fallback_body, 'body_text': fallback_body,
'severity': data.get('severity', 'INFO'), 'body_html': f'<p>{html_mod.escape(str(fallback_body))}</p>',
'fields': [], 'tags': [severity, 'system', event_type],
'severity': severity, 'group': 'system',
} }
# Ensure hostname is always available # Ensure hostname is always available
@@ -363,58 +396,65 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, str]:
'hostname': _get_hostname(), 'hostname': _get_hostname(),
'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
'severity': data.get('severity', 'INFO'), 'severity': data.get('severity', 'INFO'),
# Burst event variables
'window': '', 'entity_list': '',
# Common defaults # Common defaults
'vmid': '', 'vmid': '', 'vmname': '', 'reason': '', 'summary': '',
'vmname': '', 'details': '', 'category': '', 'previous': '', 'current': '',
'reason': '', 'duration': '', 'value': '', 'threshold': '',
'summary': '', 'source_ip': '', 'username': '', 'service': '', 'service_name': '',
'details': '', 'node_name': '', 'target_node': '', 'mount': '', 'device': '',
'category': '', 'used': '', 'total': '', 'available': '', 'cores': '',
'previous': '', 'count': '', 'size': '', 'snapshot_name': '', 'jail': '',
'current': '', 'failures': '', 'quorum': '', 'change_details': '', 'message': '',
'duration': '',
'value': '',
'threshold': '',
'source_ip': '',
'username': '',
'service': '',
'service_name': '',
'node_name': '',
'target_node': '',
'mount': '',
'device': '',
'used': '',
'total': '',
'available': '',
'cores': '',
'count': '',
'size': '',
'snapshot_name': '',
'jail': '',
'failures': '',
'quorum': '',
'change_details': '',
'message': '',
} }
variables.update(data) variables.update(data)
try: try:
title = template['title'].format(**variables) title = template['title'].format(**variables)
except (KeyError, ValueError): except (KeyError, ValueError):
title = template['title'] # Use raw template if formatting fails title = template['title']
try: try:
body = template['body'].format(**variables) body_text = template['body'].format(**variables)
except (KeyError, ValueError): except (KeyError, ValueError):
body = template['body'] body_text = template['body']
# Clean up empty lines from missing optional variables # Clean up empty lines from missing optional variables
body = '\n'.join(line for line in body.split('\n') if line.strip()) body_text = '\n'.join(line for line in body_text.split('\n') if line.strip())
severity = variables.get('severity', 'INFO')
group = template.get('group', 'system')
# Build structured fields for Discord embeds / rich notifications
fields = []
field_map = [
('vmid', 'VM/CT'), ('vmname', 'Name'), ('device', 'Device'),
('source_ip', 'Source IP'), ('node_name', 'Node'), ('category', 'Category'),
('service_name', 'Service'), ('jail', 'Jail'), ('username', 'User'),
('count', 'Count'), ('window', 'Window'), ('entity_list', 'Affected'),
]
for key, label in field_map:
val = variables.get(key, '')
if val:
fields.append((label, str(val)))
# Build HTML body with escaped content
body_html_parts = []
for line in body_text.split('\n'):
if line.strip():
body_html_parts.append(f'<p>{html_mod.escape(line)}</p>')
body_html = '\n'.join(body_html_parts) if body_html_parts else f'<p>{html_mod.escape(body_text)}</p>'
return { return {
'title': title, 'title': title,
'body': body, 'body': body_text, # backward compat
'severity': variables.get('severity', 'INFO'), 'body_text': body_text,
'body_html': body_html,
'fields': fields,
'tags': [severity, group, event_type],
'severity': severity,
'group': group,
} }