ProxMenux/scripts/telegram-notifier.sh
2025-03-29 11:16:38 +01:00

2316 lines
100 KiB
Bash
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Configuration ============================================
REPO_URL="https://raw.githubusercontent.com/MacRimi/ProxMenux/main"
BASE_DIR="/usr/local/share/proxmenux"
UTILS_FILE="$BASE_DIR/utils.sh"
VENV_PATH="/opt/googletrans-env"
if [[ -f "$UTILS_FILE" ]]; then
source "$UTILS_FILE"
fi
load_language
initialize_cache
CONFIG_FILE="/etc/proxmox-telegram.conf"
PID_DIR="/var/run/proxmox-telegram"
WRAPPER_PATH="/usr/local/bin/telegram-notifier-wrapper.sh"
t() { translate "$1"; }
declare -A IFACE_DOWN
declare -A IFACE_DOWN_TIME
disk_full_detected=false
disk_nearly_full_detected=false
inode_full_detected=false
cpu_usage_history=""
last_cpu_sustained_notification=0
last_swap_notification=0
# ==================================================
# TELEGRAM
# ==================================================
# Create configuration file if it doesn't exist
if [[ ! -f "$CONFIG_FILE" ]]; then
cat <<EOF > "$CONFIG_FILE"
BOT_TOKEN=""
CHAT_ID=""
vm_start=0
vm_shutdown=0
vm_restart=0
vm_fail=0
update_available=0
update_complete=0
system_shutdown=0
system_problem=0
system_load_high=0
kernel_panic=0
disk_fail=0
disk_full=0
disk_io_error=0
node_disconnect=0
split_brain=0
network_down=0
network_saturation=0
firewall_issue=0
backup_complete=0
backup_fail=0
snapshot_complete=0
snapshot_fail=0
auth_fail=0
ip_block=0
user_permission_change=0
cpu_high=0
ram_high=0
temp_high=0
low_disk_space=0
EOF
chmod 600 "$CONFIG_FILE"
fi
source "$CONFIG_FILE"
########################################################################
send_notification() {
local message="$1"
if [[ -z "$BOT_TOKEN" || -z "$CHAT_ID" ]]; then
return 1
fi
curl -s -X POST "https://api.telegram.org/bot$BOT_TOKEN/sendMessage" \
-d "chat_id=$CHAT_ID" \
-d "text=$message" > /dev/null 2>&1
}
#########################################################################
# Function to configure Telegram
configure_telegram() {
[[ -f "$CONFIG_FILE" ]] && source "$CONFIG_FILE"
BOT_TOKEN=$(whiptail --title "$(translate "Telegram Configuration")" \
--inputbox "$(translate "Enter your Telegram Bot Token:")" 10 70 "$BOT_TOKEN" 3>&1 1>&2 2>&3)
if [[ $? -ne 0 ]]; then
return
fi
CHAT_ID=$(whiptail --title "$(translate "Telegram Configuration")" \
--inputbox "$(translate "Enter your Telegram Chat ID:")" 10 70 "$CHAT_ID" 3>&1 1>&2 2>&3)
if [[ $? -ne 0 ]]; then
return
fi
# Save configuration to file
if [[ -n "$BOT_TOKEN" && -n "$CHAT_ID" ]]; then
cp "$CONFIG_FILE" "${CONFIG_FILE}.bak" 2>/dev/null
if grep -q "^BOT_TOKEN=" "$CONFIG_FILE"; then
sed -i "s/^BOT_TOKEN=.*/BOT_TOKEN=\"$BOT_TOKEN\"/" "$CONFIG_FILE"
else
echo "BOT_TOKEN=\"$BOT_TOKEN\"" >> "$CONFIG_FILE"
fi
if grep -q "^CHAT_ID=" "$CONFIG_FILE"; then
sed -i "s/^CHAT_ID=.*/CHAT_ID=\"$CHAT_ID\"/" "$CONFIG_FILE"
else
echo "CHAT_ID=\"$CHAT_ID\"" >> "$CONFIG_FILE"
fi
source "$CONFIG_FILE"
# Test the configuration immediately
response=$(curl -s -X POST "https://api.telegram.org/bot$BOT_TOKEN/sendMessage" \
-d "chat_id=$CHAT_ID" \
-d "text=$(translate "Telegram is working correctly!")")
if [[ "$response" =~ "ok\":true" ]]; then
whiptail --title "$(translate "Success")" \
--msgbox "$(translate "Valid Telegram configuration. Notifications will be sent.")" 10 70
else
whiptail --title "$(translate "Error")" \
--msgbox "$(translate "Invalid Telegram configuration. Please verify the token and chat ID.")" 10 70
fi
else
whiptail --title "$(translate "Error")" \
--msgbox "$(translate "Incomplete Telegram configuration. Please provide both token and chat ID.")" 10 70
fi
}
# ==================================================
# ==================================================
# NOTIFICATION CONFIGURATION
# ==================================================
# Options for the menu
options=(
"VM and Container|$(t 'VM/Container Start')|vm_start"
"VM and Container|$(t 'VM/Container Shutdown')|vm_shutdown"
"VM and Container|$(t 'VM/Container Restart')|vm_restart"
"VM and Container|$(t 'VM/Container Start Failure')|vm_fail"
"System|$(t 'New update available')|update_available"
"System|$(t 'Update completed')|update_complete"
"System|$(t 'System shutdown')|system_shutdown"
"System|$(t 'System problem')|system_problem"
"System|$(t 'High system load')|system_load_high"
"Storage|$(t 'Disk failure')|disk_fail"
"Storage|$(t 'Storage full')|disk_full"
"Storage|$(t 'Read/Write issues')|disk_io_error"
"Cluster|$(t 'Node disconnected')|node_disconnect"
"Cluster|$(t 'Split-brain (quorum conflict)')|split_brain"
"Network|$(t 'Network interface down')|network_down"
"Network|$(t 'Network saturation')|network_saturation"
"Network|$(t 'Firewall issue')|firewall_issue"
"Backup and Snapshot|$(t 'Backup completed')|backup_complete"
"Backup and Snapshot|$(t 'Backup failed')|backup_fail"
"Backup and Snapshot|$(t 'Snapshot completed')|snapshot_complete"
"Backup and Snapshot|$(t 'Snapshot failed')|snapshot_fail"
"Security|$(t 'Failed authentication attempt')|auth_fail"
"Security|$(t 'Automatic IP blocks')|ip_block"
"Security|$(t 'User permission change')|user_permission_change"
"Resources|$(t 'High CPU usage')|cpu_high"
"Resources|$(t 'High RAM usage')|ram_high"
"Resources|$(t 'High system temperature')|temp_high"
"Resources|$(t 'Low disk space')|low_disk_space"
)
# Function to configure notifications
configure_notifications() {
IFS=$'\n' sorted_options=($(for option in "${options[@]}"; do
IFS='|' read -r category description var_name <<< "$option"
printf "%s|%s|%s\n" "$category" "$description" "$var_name"
done | sort -t'|' -k1,1 -k2,2))
unset IFS
declare -A index_to_var
index=1
menu_items=()
for option in "${sorted_options[@]}"; do
IFS='|' read -r category description var_name <<< "$option"
index_to_var["$index"]="$var_name"
formatted_item="$description"
current_length=${#formatted_item}
spaces_needed=$((50 - current_length))
for ((j = 0; j < spaces_needed; j++)); do
formatted_item+=" "
done
formatted_item+="$category"
state="OFF"
[[ "$(eval echo \$$var_name)" -eq 1 ]] && state="ON"
menu_items+=("$index" "$formatted_item" "$state")
((index++))
done
# whiptail menu
selected_indices=$(whiptail --backtitle "ProxMenuX" --title "$(translate "Telegram Notification Configuration")" \
--checklist --separate-output \
"\n$(translate "Select the events you want to receive:")\n" \
30 100 20 \
"${menu_items[@]}" \
3>&1 1>&2 2>&3)
local result=$?
if [[ $result -eq 0 ]]; then
cp "$CONFIG_FILE" "${CONFIG_FILE}.bak" 2>/dev/null
for var_name in "${index_to_var[@]}"; do
sed -i "s/^$var_name=.*/$var_name=0/" "$CONFIG_FILE"
done
for selected_index in $selected_indices; do
var_name="${index_to_var[$selected_index]}"
sed -i "s/^$var_name=.*/$var_name=1/" "$CONFIG_FILE"
done
source "$CONFIG_FILE"
whiptail --backtitle "ProxMenuX" --title "$(translate "Success")" \
--msgbox "$(translate "Configuration updated successfully.")" 10 70
fi
}
# ==================================================
# Function to get VM/CT name from its ID
get_vm_name() {
local vmid="$1"
local name=""
if [[ -f "/etc/pve/qemu-server/$vmid.conf" ]]; then
name=$(grep -i "^name:" "/etc/pve/qemu-server/$vmid.conf" | cut -d ' ' -f2-)
elif [[ -f "/etc/pve/lxc/$vmid.conf" ]]; then
name=$(grep -i "^hostname:" "/etc/pve/lxc/$vmid.conf" | cut -d ' ' -f2-)
fi
if [[ -n "$name" ]]; then
echo "$name ($vmid)"
else
echo "$vmid"
fi
}
# ==================================================
# ==================================================
# NOTIFICATION EVENTS
# ==================================================
# Function: capture events from journalctl
capture_journal_events() {
local processed_events_file="$PID_DIR/processed_events"
mkdir -p "$PID_DIR" 2>/dev/null
if [[ ! -f "$processed_events_file" ]]; then
touch "$processed_events_file"
fi
while true; do
# Use tail for Proxmox tasks file
tail -F /var/log/pve/tasks/index 2>/dev/null | while read -r line; do
event_id=$(echo "$line" | md5sum | cut -d' ' -f1)
if grep -q "$event_id" "$processed_events_file" 2>/dev/null; then
continue
fi
echo "$event_id" >> "$processed_events_file"
tail -n 1000 "$processed_events_file" > "${processed_events_file}.tmp" 2>/dev/null && mv "${processed_events_file}.tmp" "$processed_events_file" 2>/dev/null
local event_processed=false
# ===== IMMEDIATE NOTIFICATION EVENTS =====
# VM or CT start failure (CRITICAL)
if [[ "$vm_fail" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# Detect VM errors
if [[ "$line" =~ "Failed to start VM" || "$line" =~ "qmstart" && "$line" =~ "err" || "$line" =~ "qmstart" && "$line" =~ "fail" ]]; then
VM_ID=$(echo "$line" | grep -oP '(VM |qmstart:)\K[0-9]+')
NAME=$(get_vm_name "$VM_ID")
send_notification "🚨 $(translate "CRITICAL: Failed to start VM:") $NAME"
event_processed=true
# Detect CT (LXC) errors
elif [[ "$line" =~ "Failed to start CT" || "$line" =~ "lxc-start" && "$line" =~ "err" || "$line" =~ "lxc-start" && "$line" =~ "fail" ]]; then
CT_ID=$(echo "$line" | grep -oP '(CT |lxc-start:)\K[0-9]+')
NAME=$(get_vm_name "$CT_ID")
send_notification "🚨 $(translate "CRITICAL: Failed to start Container:") $NAME"
event_processed=true
fi
fi
# Disk I/O errors (CRITICAL)
if [[ "$disk_io_error" -eq 1 ]] && [[ "$event_processed" = false ]]; then
if [[ "$line" =~ "I/O error" || "$line" =~ "read error" || "$line" =~ "write error" ||
"$line" =~ "blk_update_request" || "$line" =~ "buffer I/O error" ||
"$line" =~ "medium error" || "$line" =~ "sense key: Medium Error" ||
"$line" =~ "ata.*failed command" || "$line" =~ "SCSI error" ]]; then
# Extract device name with improved pattern matching
DISK=$(echo "$line" | grep -oE "/dev/[a-zA-Z0-9]+" ||
echo "$line" | grep -oE "sd[a-z][0-9]*" ||
echo "$line" | grep -oE "nvme[0-9]+n[0-9]+" ||
echo "unknown")
# Try to extract error type
ERROR_TYPE="unknown"
if [[ "$line" =~ "read error" ]]; then
ERROR_TYPE="read"
elif [[ "$line" =~ "write error" ]]; then
ERROR_TYPE="write"
elif [[ "$line" =~ "medium error" || "$line" =~ "sense key: Medium Error" ]]; then
ERROR_TYPE="medium"
elif [[ "$line" =~ "timeout" ]]; then
ERROR_TYPE="timeout"
fi
# Try to extract sector information if available
SECTOR=$(echo "$line" | grep -oP "sector [0-9]+" || echo "")
if [[ -n "$SECTOR" ]]; then
SECTOR=" ($SECTOR)"
fi
# Send notification with enhanced information
send_notification "🚨 $(translate "CRITICAL: Disk ${ERROR_TYPE} error on:") $DISK$SECTOR"
event_processed=true
fi
fi
# Disk failure (CRITICAL)
if [[ "$disk_fail" -eq 1 ]] && [[ "$event_processed" = false ]]; then
if [[ "$line" =~ "disk failure" || "$line" =~ "hard drive failure" ||
"$line" =~ "SMART error" || "$line" =~ "SMART failure" ||
"$line" =~ "SMART Status BAD" || "$line" =~ "failed SMART" ||
"$line" =~ "drive failure" || "$line" =~ "bad sectors" ||
"$line" =~ "sector reallocation" || "$line" =~ "uncorrectable error" ||
"$line" =~ "media error" || "$line" =~ "not responding" && "$line" =~ "disk" ||
"$line" =~ "SSD life critical" || "$line" =~ "SSD wear" && "$line" =~ "critical" ]]; then
# Extract device name with improved pattern matching
DISK=$(echo "$line" | grep -oE "/dev/[a-zA-Z0-9]+" ||
echo "$line" | grep -oE "sd[a-z][0-9]*" ||
echo "$line" | grep -oE "nvme[0-9]+n[0-9]+" ||
echo "$line" | grep -oE "ata[0-9]+" ||
echo "unknown")
# Try to determine failure type
FAILURE_TYPE="hardware"
if [[ "$line" =~ "SMART" ]]; then
FAILURE_TYPE="SMART"
# Try to extract SMART attribute if available
SMART_ATTR=$(echo "$line" | grep -oP "Attribute \K[^:]*" ||
echo "$line" | grep -oP "SMART attribute \K[^:]*" ||
echo "")
if [[ -n "$SMART_ATTR" ]]; then
SMART_ATTR=" (Attribute: $SMART_ATTR)"
fi
elif [[ "$line" =~ "bad sectors" || "$line" =~ "sector reallocation" ]]; then
FAILURE_TYPE="bad sectors"
elif [[ "$line" =~ "SSD" ]]; then
FAILURE_TYPE="SSD wear"
elif [[ "$line" =~ "not responding" ]]; then
FAILURE_TYPE="unresponsive"
fi
# Send notification with enhanced information
send_notification "🚨 $(translate "CRITICAL: Disk failure detected") ($FAILURE_TYPE): $DISK$SMART_ATTR"
event_processed=true
fi
fi
# Snapshot failed (CRITICAL)
if [[ "$line" =~ "snapshot" ]] && [[ "$snapshot_fail" -eq 1 ]] && [[ "$line" =~ "error" || "$line" =~ "fail" || "$line" =~ "unable to" || "$line" =~ "cannot" ]] && [[ "$event_processed" = false ]]; then
# Extract VM/CT ID
VM_ID=$(echo "$line" | grep -oP 'TASK \K[0-9]+')
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'VM \K[0-9]+')
fi
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'CT \K[0-9]+')
fi
# Extract snapshot ID
SNAPSHOT_ID=$(echo "$line" | grep -oP 'snapshot \K[a-zA-Z0-9_-]+')
if [[ -z "$SNAPSHOT_ID" ]]; then
SNAPSHOT_ID=$(echo "$line" | grep -oP 'snap\K[a-zA-Z0-9_-]+')
fi
# Try to determine error reason
ERROR_REASON=""
if [[ "$line" =~ "no space" || "$line" =~ "space exhausted" || "$line" =~ "out of space" ]]; then
ERROR_REASON=" (No space left)"
elif [[ "$line" =~ "timeout" ]]; then
ERROR_REASON=" (Operation timed out)"
elif [[ "$line" =~ "already exists" ]]; then
ERROR_REASON=" (Snapshot already exists)"
elif [[ "$line" =~ "locked" || "$line" =~ "lock" ]]; then
ERROR_REASON=" (Resource locked)"
elif [[ "$line" =~ "permission" ]]; then
ERROR_REASON=" (Permission denied)"
elif [[ "$line" =~ "quorum" ]]; then
ERROR_REASON=" (Quorum error)"
fi
# Format the notification message
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
if [[ -n "$SNAPSHOT_ID" ]]; then
send_notification "🚨 $(translate "CRITICAL: Snapshot failed for:") $NAME (ID: $SNAPSHOT_ID)$ERROR_REASON"
else
send_notification "🚨 $(translate "CRITICAL: Snapshot failed for:") $NAME$ERROR_REASON"
fi
else
if [[ -n "$SNAPSHOT_ID" ]]; then
send_notification "🚨 $(translate "CRITICAL: Snapshot failed") (ID: $SNAPSHOT_ID)$ERROR_REASON"
else
send_notification "🚨 $(translate "CRITICAL: Snapshot failed")$ERROR_REASON"
fi
fi
event_processed=true
fi
# Backup failed (CRITICAL)
if [[ "$backup_fail" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# Expanded pattern matching for backup failures
if [[ "$line" =~ "backup" && ("$line" =~ "error" || "$line" =~ "fail" || "$line" =~ "unable to" || "$line" =~ "cannot" || "$line" =~ "abort") ]]; then
# Extract VM/CT ID
VM_ID=$(echo "$line" | grep -oP 'TASK \K[0-9]+')
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'VM \K[0-9]+')
fi
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'CT \K[0-9]+')
fi
# Extract backup target
BACKUP_TARGET=$(echo "$line" | grep -oP 'to ["\047]?\K[a-zA-Z0-9_-]+')
if [[ -z "$BACKUP_TARGET" ]]; then
BACKUP_TARGET=$(echo "$line" | grep -oP 'storage ["\047]?\K[a-zA-Z0-9_-]+')
fi
# Try to determine error reason
ERROR_REASON=""
if [[ "$line" =~ "no space" || "$line" =~ "space exhausted" || "$line" =~ "out of space" ]]; then
ERROR_REASON=" (No space left)"
elif [[ "$line" =~ "timeout" ]]; then
ERROR_REASON=" (Operation timed out)"
elif [[ "$line" =~ "connection" && "$line" =~ "refused" ]]; then
ERROR_REASON=" (Connection refused)"
elif [[ "$line" =~ "network" ]]; then
ERROR_REASON=" (Network error)"
elif [[ "$line" =~ "permission" ]]; then
ERROR_REASON=" (Permission denied)"
elif [[ "$line" =~ "locked" || "$line" =~ "lock" ]]; then
ERROR_REASON=" (Resource locked)"
elif [[ "$line" =~ "quorum" ]]; then
ERROR_REASON=" (Quorum error)"
elif [[ "$line" =~ "already running" ]]; then
ERROR_REASON=" (Another backup is already running)"
fi
# Format the notification message
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
if [[ -n "$BACKUP_TARGET" ]]; then
send_notification "🚨 $(translate "CRITICAL: Backup failed for:") $NAME (Target: $BACKUP_TARGET)$ERROR_REASON"
else
send_notification "🚨 $(translate "CRITICAL: Backup failed for:") $NAME$ERROR_REASON"
fi
else
if [[ -n "$BACKUP_TARGET" ]]; then
send_notification "🚨 $(translate "CRITICAL: Backup failed") (Target: $BACKUP_TARGET)$ERROR_REASON"
else
send_notification "🚨 $(translate "CRITICAL: Backup failed")$ERROR_REASON"
fi
fi
event_processed=true
fi
fi
# Failed authentication attempt (CRITICAL)
if [[ "$auth_fail" -eq 1 ]] && [[ "$event_processed" = false ]]; then
if [[ "$line" =~ "authentication failure" || "$line" =~ "auth fail" || "$line" =~ "login failed" ||
"$line" =~ "Failed password" || "$line" =~ "Invalid user" || "$line" =~ "failed login" ||
"$line" =~ "authentication error" || ( "$line" =~ "unauthorized" && "$line" =~ "access" ) ]]; then
# Extract username
USER=$(echo "$line" | grep -oP 'user=\K[^ ]+')
if [[ -z "$USER" ]]; then
USER=$(echo "$line" | grep -oP 'user \K[^ ]+')
fi
if [[ -z "$USER" ]]; then
USER=$(echo "$line" | grep -oP 'for user \K[^ ]+')
fi
if [[ -z "$USER" ]]; then
USER=$(echo "$line" | grep -oP 'for invalid user \K[^ ]+')
fi
if [[ -z "$USER" ]]; then
USER=$(echo "$line" | grep -oP 'for \K[^ ]+' | grep -v "invalid")
fi
if [[ -z "$USER" ]]; then
USER="unknown"
fi
# Extract IP address
IP=$(echo "$line" | grep -oP 'rhost=\K[^ ]+')
if [[ -z "$IP" ]]; then
IP=$(echo "$line" | grep -oP 'from \K[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+')
fi
if [[ -z "$IP" ]]; then
IP=$(echo "$line" | grep -oP 'from \K[0-9a-f:]+')
fi
if [[ -z "$IP" ]]; then
IP=$(echo "$line" | grep -oP 'IP: \K[^ ]+')
fi
if [[ -z "$IP" ]]; then
IP="unknown"
fi
# Try to determine authentication service
SERVICE="system"
if [[ "$line" =~ "sshd" ]]; then
SERVICE="SSH"
elif [[ "$line" =~ "pvedaemon" || "$line" =~ "pveproxy" ]]; then
SERVICE="Proxmox Web UI"
elif [[ "$line" =~ "nginx" || "$line" =~ "apache" ]]; then
SERVICE="Web Server"
elif [[ "$line" =~ "smtp" || "$line" =~ "mail" ]]; then
SERVICE="Mail"
elif [[ "$line" =~ "ftp" ]]; then
SERVICE="FTP"
fi
# Try to extract authentication method if available
AUTH_METHOD=""
if [[ "$line" =~ "password" ]]; then
AUTH_METHOD=" (Password auth)"
elif [[ "$line" =~ "publickey" ]]; then
AUTH_METHOD=" (Public key auth)"
elif [[ "$line" =~ "keyboard-interactive" ]]; then
AUTH_METHOD=" (Interactive auth)"
elif [[ "$line" =~ "PAM" ]]; then
AUTH_METHOD=" (PAM auth)"
fi
# Count failed attempts from this IP if possible
ATTEMPT_COUNT=""
if [[ -n "$IP" && "$IP" != "unknown" ]]; then
# Use journalctl to count recent failed attempts from this IP
if command -v journalctl &>/dev/null; then
COUNT=$(journalctl -q --since "1 hour ago" | grep -c "$IP")
if [[ $COUNT -gt 1 ]]; then
ATTEMPT_COUNT=" ($COUNT attempts in the last hour)"
fi
fi
fi
# Send notification with enhanced information
send_notification "🚨 $(translate "CRITICAL: Failed authentication attempt:") $USER from $IP - $SERVICE$AUTH_METHOD$ATTEMPT_COUNT"
event_processed=true
fi
fi
# Firewall issue (CRITICAL)
if [[ "$line" =~ "firewall" && "$firewall_issue" -eq 1 &&
( "$line" =~ "error" || "$line" =~ "block" || "$line" =~ "reject" ||
"$line" =~ "drop" || "$line" =~ "denied" || "$line" =~ "fail" || "$line" =~ "invalid" ) &&
"$event_processed" = false ]]; then
# Try to determine the type of firewall issue
ISSUE_TYPE="issue"
if [[ "$line" =~ "error" ]]; then
ISSUE_TYPE="configuration error"
elif [[ "$line" =~ "block" || "$line" =~ "denied" ]]; then
ISSUE_TYPE="blocked connection"
elif [[ "$line" =~ "reject" ]]; then
ISSUE_TYPE="rejected connection"
elif [[ "$line" =~ "drop" ]]; then
ISSUE_TYPE="dropped packet"
elif [[ "$line" =~ "invalid" ]]; then
ISSUE_TYPE="invalid rule"
fi
# Extract source IP
SRC_IP=$(echo "$line" | grep -oP 'SRC=\K[0-9.]+')
if [[ -z "$SRC_IP" ]]; then
SRC_IP=$(echo "$line" | grep -oP 'from \K[0-9.]+')
fi
if [[ -z "$SRC_IP" ]]; then
SRC_IP=$(echo "$line" | grep -oP 'source \K[0-9.]+')
fi
# Extract destination IP
DST_IP=$(echo "$line" | grep -oP 'DST=\K[0-9.]+')
if [[ -z "$DST_IP" ]]; then
DST_IP=$(echo "$line" | grep -oP 'to \K[0-9.]+')
fi
if [[ -z "$DST_IP" ]]; then
DST_IP=$(echo "$line" | grep -oP 'destination \K[0-9.]+')
fi
# Try to extract port information if available
PORT_INFO=""
SRC_PORT=$(echo "$line" | grep -oP 'SPT=\K[0-9]+' || echo "")
DST_PORT=$(echo "$line" | grep -oP 'DPT=\K[0-9]+' || echo "")
if [[ -n "$SRC_PORT" && -n "$DST_PORT" ]]; then
PORT_INFO=" (Port $SRC_PORT$DST_PORT)"
elif [[ -n "$DST_PORT" ]]; then
PORT_INFO=" (Port $DST_PORT)"
fi
# Try to extract protocol if available
PROTO=$(echo "$line" | grep -oP 'PROTO=\K[A-Za-z]+' ||
echo "$line" | grep -oP 'protocol \K[A-Za-z]+' || echo "")
if [[ -n "$PROTO" ]]; then
PROTO=" $PROTO"
fi
# Try to extract interface if available
IFACE=$(echo "$line" | grep -oP 'IN=\K[^ ]+' ||
echo "$line" | grep -oP 'OUT=\K[^ ]+' ||
echo "$line" | grep -oP 'on \K[^ ]+' || echo "")
if [[ -n "$IFACE" ]]; then
IFACE=" on $IFACE"
fi
# Format the notification message
if [[ -n "$SRC_IP" && -n "$DST_IP" ]]; then
send_notification "🚨 $(translate "CRITICAL: Firewall ${ISSUE_TYPE}:") $SRC_IP$DST_IP$PORT_INFO$PROTO$IFACE"
elif [[ -n "$SRC_IP" ]]; then
send_notification "🚨 $(translate "CRITICAL: Firewall ${ISSUE_TYPE}:") from $SRC_IP$PORT_INFO$PROTO$IFACE"
elif [[ -n "$DST_IP" ]]; then
send_notification "🚨 $(translate "CRITICAL: Firewall ${ISSUE_TYPE}:") to $DST_IP$PORT_INFO$PROTO$IFACE"
else
# Extract a more concise message from the line
CONCISE_MSG=$(echo "$line" | sed -E 's/.*firewall[^:]*: ?//i' | cut -c 1-100)
send_notification "🚨 $(translate "CRITICAL: Firewall ${ISSUE_TYPE}:") $CONCISE_MSG"
fi
event_processed=true
fi
# Network interface recovery handler
if [[ "$network_down" -eq 1 ]] && [[ "$event_processed" = false ]]; then
if [[ "$line" =~ (eth[0-9]+|eno[0-9]+|enp[0-9]+s[0-9]+|wlan[0-9]+) ]]; then
IFACE="${BASH_REMATCH[1]}"
# Detect interface going down
if [[ "$line" =~ "link down" || "$line" =~ "disconnected" || "$line" =~ "no carrier" || "$line" =~ "failure" ]]; then
# Mark interface as down and store the timestamp
IFACE_DOWN["$IFACE"]=true
IFACE_DOWN_TIME["$IFACE"]="$(date +%s)"
fi
# Detect interface recovery
if [[ "$line" =~ "link up" || "$line" =~ "activated" ]]; then
if [[ "${IFACE_DOWN[$IFACE]}" == true ]]; then
RESTORE_TIME=$(date +%s)
START_TIME=${IFACE_DOWN_TIME[$IFACE]}
DURATION=$((RESTORE_TIME - START_TIME))
# Check if this is the default route interface
PRIMARY=""
if ip route | grep -q "default.*$IFACE"; then
PRIMARY=" (PRIMARY INTERFACE)"
fi
# Send notification after connection is restored
send_notification "$(translate '✅ Network connection was lost and has been restored on') $IFACE$PRIMARY. $(translate 'Downtime duration'): ${DURATION}s"
# Clean up the interface state
unset IFACE_DOWN["$IFACE"]
unset IFACE_DOWN_TIME["$IFACE"]
event_processed=true
fi
fi
fi
fi
# Split-brain detected (CRITICAL)
if [[ "$split_brain" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# Expanded pattern matching for split-brain detection
if [[ "$line" =~ "Split-Brain" || "$line" =~ "split brain" || "$line" =~ "split-brain" ||
( "$line" =~ "fencing" && "$line" =~ "required" ) ||
( "$line" =~ "cluster" && "$line" =~ "partition" ) ]]; then
NODES=$(echo "$line" | grep -oP 'nodes: \K[^.]+')
if [[ -z "$NODES" ]]; then
NODES=$(echo "$line" | grep -oP 'between \K[^.]+')
fi
if [[ -n "$NODES" ]]; then
NODES=" (Affected nodes: $NODES)"
fi
# Try to extract fence status if available
FENCE_INFO=""
if [[ "$line" =~ "fencing" ]]; then
if [[ "$line" =~ "successful" ]]; then
FENCE_INFO=" (Fencing successful)"
elif [[ "$line" =~ "failed" ]]; then
FENCE_INFO=" (Fencing failed)"
else
FENCE_INFO=" (Fencing required)"
fi
fi
# Send notification with enhanced information
send_notification "🚨 $(translate "CRITICAL: Split-brain detected in cluster")$NODES$FENCE_INFO - $(translate "Manual intervention required!")"
event_processed=true
fi
fi
# Node disconnected from cluster (CRITICAL)
if [[ "$node_disconnect" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# Expanded pattern matching for node disconnection
if [[ ( "$line" =~ "quorum" && "$line" =~ "lost" ) ||
( "$line" =~ "node" && "$line" =~ "left" ) ||
( "$line" =~ "node" && "$line" =~ "offline" ) ||
( "$line" =~ "connection" && "$line" =~ "lost" && "$line" =~ "node" ) ]]; then
# Extract node name with improved pattern matching
NODE=$(echo "$line" | grep -oP 'node \K[^ ,.]+')
if [[ -z "$NODE" ]]; then
NODE=$(echo "$line" | grep -oP 'Node \K[^ ,.]+')
fi
if [[ -z "$NODE" ]]; then
NODE=$(echo "$line" | grep -oP 'from \K[^ ,.]+')
fi
if [[ -z "$NODE" ]]; then
NODE="unknown"
fi
# Try to determine if quorum is still valid
QUORUM_STATUS=""
if [[ "$line" =~ "quorum" ]]; then
if [[ "$line" =~ "lost" ]]; then
QUORUM_STATUS=" (Quorum lost)"
elif [[ "$line" =~ "still" && "$line" =~ "valid" ]]; then
QUORUM_STATUS=" (Quorum still valid)"
fi
fi
# Try to extract remaining nodes count if available
REMAINING_COUNT=$(echo "$line" | grep -oP 'remaining nodes: \K[0-9]+')
if [[ -z "$REMAINING_COUNT" ]]; then
REMAINING_COUNT=$(echo "$line" | grep -oP 'nodes left: \K[0-9]+')
fi
if [[ -n "$REMAINING_COUNT" ]]; then
REMAINING=" ($REMAINING_COUNT nodes remaining)"
fi
# Try to determine if this is expected or unexpected
EXPECTED=""
if [[ "$line" =~ "shutdown" || "$line" =~ "maintenance" ]]; then
EXPECTED=" (Planned)"
else
EXPECTED=" (Unexpected)"
fi
# Send notification with enhanced information
send_notification "🚨 $(translate "CRITICAL: Node disconnected from cluster:") $NODE$QUORUM_STATUS$REMAINING$EXPECTED"
event_processed=true
fi
fi
# System shutdown (NON-CRITICAL - with interval)
if [[ "$system_shutdown" -eq 1 ]] && (( current_time - last_shutdown_notification > resource_interval )); then
if [[ "$line" =~ "systemd-journald" && "$line" =~ "Journal stopped" ]]; then
# No hay razón específica, pero podemos indicar que se detuvo el journal (lo último antes del apagado)
send_notification "⚠️ $(translate "System is shutting down")"
last_shutdown_notification=$current_time
# Log the event
logger -t proxmox-notify "System is shutting down (journal stopped)"
event_processed=true
fi
fi
# System problem (CRITICAL)
if [[ "$system_problem" -eq 1 ]] && [[ "$event_processed" = false ]]; then
if [[ "$line" =~ "kernel panic" || "$line" =~ "segfault" || "$line" =~ "Out of memory" ||
"$line" =~ "BUG:" || "$line" =~ "Call Trace:" ||
"$line" =~ "Failed to start" || "$line" =~ "Unit .* failed" ||
"$line" =~ "Service .* exited with" ]]; then
# Extract possible service name or component if available
COMPONENT=$(echo "$line" | grep -oP 'Failed to start \K[^:]+' ||
echo "$line" | grep -oP 'Unit \K[^ ]+' ||
echo "$line" | grep -oP 'Service \K[^ ]+' || echo "unknown")
# Format and send the notification
send_notification "🚨 $(translate "CRITICAL: System problem detected") ($COMPONENT)"
logger -t proxmox-notify "CRITICAL: System problem detected ($COMPONENT)"
event_processed=true
fi
fi
# User permission change (NON-CRITICAL - with interval)
if [[ "$user_permission_change" -eq 1 ]] && (( current_time - last_user_permission_notification > resource_interval )); then
if [[ "$line" =~ "set permissions" || "$line" =~ "user added to group" || "$line" =~ "user removed from group" ||
"$line" =~ "ACL updated" || "$line" =~ "Role assigned" || "$line" =~ "Changed user permissions" ]]; then
# Try to extract username
USER=$(echo "$line" | grep -oP 'user \K[^ ]+' ||
echo "$line" | grep -oP 'for user \K[^ ]+' ||
echo "$line" | grep -oP 'User \K[^ ]+' || echo "unknown")
# Try to detect change type
ACTION="Permission change"
if [[ "$line" =~ "added to group" ]]; then
ACTION="Added to group"
elif [[ "$line" =~ "removed from group" ]]; then
ACTION="Removed from group"
elif [[ "$line" =~ "Role assigned" ]]; then
ACTION="Role assigned"
elif [[ "$line" =~ "ACL updated" ]]; then
ACTION="ACL updated"
fi
send_notification "🔐 $(translate "User permission changed:") $USER ($ACTION)"
logger -t proxmox-notify "User permission changed: $USER ($ACTION)"
last_user_permission_notification=$current_time
event_processed=true
fi
fi
# Network saturation (NON-CRITICAL - with interval)
if [[ "$network_saturation" -eq 1 ]] && (( current_time - last_network_saturation_notification > resource_interval )); then
if command -v ip &>/dev/null; then
saturated_ifaces=""
# Loop over interfaces and look for rx/tx errors/drops
while read -r line; do
iface=$(echo "$line" | awk -F: '{print $2}' | xargs)
stats=$(ip -s link show "$iface" | awk '/RX:|TX:/ {getline; print}')
rx_errors=$(echo "$stats" | awk 'NR==1 {print $3}')
tx_errors=$(echo "$stats" | awk 'NR==2 {print $3}')
rx_dropped=$(echo "$stats" | awk 'NR==1 {print $4}')
tx_dropped=$(echo "$stats" | awk 'NR==2 {print $4}')
if (( rx_errors > 100 || tx_errors > 100 || rx_dropped > 100 || tx_dropped > 100 )); then
saturated_ifaces+="$iface (RX errors: $rx_errors, TX errors: $tx_errors, RX dropped: $rx_dropped, TX dropped: $tx_dropped), "
fi
done < <(ip -o link show | awk -F': ' '{print $2}')
# Clean and notify
if [[ -n "$saturated_ifaces" ]]; then
saturated_ifaces="${saturated_ifaces%, }"
send_notification "⚠️ $(translate "WARNING: Network saturation or errors detected on:") $saturated_ifaces"
logger -t proxmox-notify "WARNING: Network saturation on: $saturated_ifaces"
last_network_saturation_notification=$current_time
fi
fi
fi
# Automatic IP blocks (NON-CRITICAL - with interval)
if [[ "$ip_block" -eq 1 ]] && (( current_time - last_ip_block_notification > resource_interval )); then
if [[ "$line" =~ "fail2ban" || "$line" =~ "Banned IP" || "$line" =~ "Blocking IP" ||
"$line" =~ "DROP" && "$line" =~ "SRC=" ||
"$line" =~ "REJECT" && "$line" =~ "SRC=" ]]; then
# Try to extract IP address
IP=$(echo "$line" | grep -oP 'SRC=\K[0-9.]+' ||
echo "$line" | grep -oP 'from \K[0-9.]+' ||
echo "$line" | grep -oP 'Banned IP \K[0-9.]+' ||
echo "$line" | grep -oP 'Blocking IP \K[0-9.]+' || echo "unknown")
# Detect source (Fail2ban, Firewall, etc.)
SOURCE="Firewall"
if [[ "$line" =~ "fail2ban" ]]; then
SOURCE="Fail2ban"
elif [[ "$line" =~ "pve-firewall" ]]; then
SOURCE="PVE Firewall"
fi
send_notification "🔒 $(translate "Automatic IP block detected") ($IP - $SOURCE)"
logger -t proxmox-notify "IP block detected: $IP ($SOURCE)"
last_ip_block_notification=$current_time
event_processed=true
fi
fi
# ===== NON-CRITICAL EVENTS (IMMEDIATE) =====
# VM/CT start (NON-CRITICAL but immediate)
if [[ "$vm_start" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# VM start detection
if [[ "$line" =~ "qmstart" && ! "$line" =~ "err" && ! "$line" =~ "fail" ]]; then
VM_ID=$(echo "$line" | grep -oP 'qmstart:\K[0-9]+' ||
echo "$line" | grep -oP 'VM \K[0-9]+' || echo "")
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
# Try to extract additional information
EXTRA_INFO=""
# Check if this is a template
if [[ "$line" =~ "template" ]]; then
EXTRA_INFO=" (Template)"
fi
# Check if this is a restore or clone operation
if [[ "$line" =~ "restore" ]]; then
EXTRA_INFO=" (Restored)"
elif [[ "$line" =~ "clone" ]]; then
EXTRA_INFO=" (Cloned)"
fi
send_notification "$(translate "VM started successfully:") $NAME$EXTRA_INFO"
event_processed=true
fi
# LXC container start detection
elif [[ "$line" =~ "lxc-start" && ! "$line" =~ "err" && ! "$line" =~ "fail" ]] ||
[[ "$line" =~ "Starting CT" && ! "$line" =~ "err" && ! "$line" =~ "fail" ]]; then
CT_ID=$(echo "$line" | grep -oP 'lxc-start:\K[0-9]+' ||
echo "$line" | grep -oP 'CT \K[0-9]+' || echo "")
if [[ -n "$CT_ID" ]]; then
NAME=$(get_vm_name "$CT_ID")
# Try to extract additional information
EXTRA_INFO=""
# Check if this is a template
if [[ "$line" =~ "template" ]]; then
EXTRA_INFO=" (Template)"
fi
# Check if this is a restore or clone operation
if [[ "$line" =~ "restore" ]]; then
EXTRA_INFO=" (Restored)"
elif [[ "$line" =~ "clone" ]]; then
EXTRA_INFO=" (Cloned)"
fi
send_notification "$(translate "Container started successfully:") $NAME$EXTRA_INFO"
event_processed=true
fi
fi
fi
# VM/CT shutdown (NON-CRITICAL but immediate)
if [[ "$vm_shutdown" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# VM shutdown detection
if [[ "$line" =~ "qmstop" && ! "$line" =~ "err" && ! "$line" =~ "fail" ]]; then
VM_ID=$(echo "$line" | grep -oP 'qmstop:\K[0-9]+' ||
echo "$line" | grep -oP 'VM \K[0-9]+' || echo "")
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
# Try to determine shutdown type
SHUTDOWN_TYPE=""
if [[ "$line" =~ "force" || "$line" =~ "kill" ]]; then
SHUTDOWN_TYPE=" (Forced)"
elif [[ "$line" =~ "suspend" ]]; then
SHUTDOWN_TYPE=" (Suspended)"
elif [[ "$line" =~ "hibernate" ]]; then
SHUTDOWN_TYPE=" (Hibernated)"
elif [[ "$line" =~ "timeout" ]]; then
SHUTDOWN_TYPE=" (Timeout)"
elif [[ "$line" =~ "acpi" ]]; then
SHUTDOWN_TYPE=" (ACPI shutdown)"
fi
send_notification "$(translate "VM stopped successfully:") $NAME$SHUTDOWN_TYPE"
event_processed=true
fi
# LXC container shutdown detection
elif [[ "$line" =~ "lxc-stop" && ! "$line" =~ "err" && ! "$line" =~ "fail" ]] ||
[[ "$line" =~ "Stopping CT" && ! "$line" =~ "err" && ! "$line" =~ "fail" ]]; then
CT_ID=$(echo "$line" | grep -oP 'lxc-stop:\K[0-9]+' ||
echo "$line" | grep -oP 'CT \K[0-9]+' || echo "")
if [[ -n "$CT_ID" ]]; then
NAME=$(get_vm_name "$CT_ID")
# Try to determine shutdown type
SHUTDOWN_TYPE=""
if [[ "$line" =~ "force" || "$line" =~ "kill" ]]; then
SHUTDOWN_TYPE=" (Forced)"
elif [[ "$line" =~ "timeout" ]]; then
SHUTDOWN_TYPE=" (Timeout)"
fi
send_notification "$(translate "Container stopped successfully:") $NAME$SHUTDOWN_TYPE"
event_processed=true
fi
fi
fi
# VM/CT restart (NON-CRITICAL but immediate)
if [[ "$vm_restart" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# VM restart detection
if [[ ("$line" =~ "qmreset" || "$line" =~ "qmreboot") && ! "$line" =~ "err" && ! "$line" =~ "fail" ]]; then
VM_ID=$(echo "$line" | grep -oP '(qmreset|qmreboot):\K[0-9]+' ||
echo "$line" | grep -oP 'VM \K[0-9]+' || echo "")
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
# Try to determine restart type
RESTART_TYPE=""
if [[ "$line" =~ "qmreset" ]]; then
RESTART_TYPE=" (Hard reset)"
elif [[ "$line" =~ "force" || "$line" =~ "kill" ]]; then
RESTART_TYPE=" (Forced)"
elif [[ "$line" =~ "timeout" ]]; then
RESTART_TYPE=" (After timeout)"
elif [[ "$line" =~ "acpi" ]]; then
RESTART_TYPE=" (ACPI restart)"
fi
send_notification "$(translate "VM restarted successfully:") $NAME$RESTART_TYPE"
event_processed=true
fi
# LXC container restart detection
elif [[ "$line" =~ "lxc-restart" || "$line" =~ "Restarting CT" ||
("$line" =~ "lxc-stop" && "$line" =~ "lxc-start" && "$line" =~ "restart") ]] &&
[[ ! "$line" =~ "err" && ! "$line" =~ "fail" ]]; then
CT_ID=$(echo "$line" | grep -oP 'lxc-restart:\K[0-9]+' ||
echo "$line" | grep -oP 'CT \K[0-9]+' ||
echo "$line" | grep -oP 'lxc-(stop|start):\K[0-9]+' || echo "")
if [[ -n "$CT_ID" ]]; then
NAME=$(get_vm_name "$CT_ID")
# Try to determine restart type
RESTART_TYPE=""
if [[ "$line" =~ "force" || "$line" =~ "kill" ]]; then
RESTART_TYPE=" (Forced)"
elif [[ "$line" =~ "timeout" ]]; then
RESTART_TYPE=" (After timeout)"
fi
send_notification "$(translate "Container restarted successfully:") $NAME$RESTART_TYPE"
event_processed=true
fi
fi
fi
# Snapshot completed (NON-CRITICAL but immediate)
if [[ "$line" =~ "snapshot" ]] && [[ "$snapshot_complete" -eq 1 ]] && [[ ! "$line" =~ "error" ]] && [[ "$event_processed" = false ]]; then
# Additional pattern matching for completed snapshots
if [[ "$line" =~ "complete" || "$line" =~ "finished" || "$line" =~ "success" || ! "$line" =~ "fail" && ! "$line" =~ "unable" ]]; then
# Extract VM/CT ID with improved pattern matching
VM_ID=$(echo "$line" | grep -oP 'TASK \K[0-9]+')
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'VM \K[0-9]+')
fi
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'CT \K[0-9]+')
fi
# Try to extract snapshot name/ID if available
SNAPSHOT_NAME=$(echo "$line" | grep -oP 'snapshot \K[a-zA-Z0-9_-]+' ||
echo "$line" | grep -oP 'snap\K[a-zA-Z0-9_-]+' ||
echo "$line" | grep -oP 'name: \K[a-zA-Z0-9_-]+' || echo "")
# Try to extract snapshot size if available
SNAPSHOT_SIZE=$(echo "$line" | grep -oP 'size: \K[0-9.]+[KMGT]B' ||
echo "$line" | grep -oP '[0-9.]+[KMGT]B' || echo "")
# Try to extract duration if available
DURATION=$(echo "$line" | grep -oP 'duration: \K[0-9.]+s' ||
echo "$line" | grep -oP 'in \K[0-9.]+s' ||
echo "$line" | grep -oP 'took \K[0-9.]+s' || echo "")
# Format additional information
ADDITIONAL_INFO=""
if [[ -n "$SNAPSHOT_NAME" ]]; then
ADDITIONAL_INFO+=" (Name: $SNAPSHOT_NAME"
if [[ -n "$SNAPSHOT_SIZE" ]]; then
ADDITIONAL_INFO+=", Size: $SNAPSHOT_SIZE"
fi
if [[ -n "$DURATION" ]]; then
ADDITIONAL_INFO+=", Duration: $DURATION"
fi
ADDITIONAL_INFO+=")"
elif [[ -n "$SNAPSHOT_SIZE" || -n "$DURATION" ]]; then
ADDITIONAL_INFO+=" ("
if [[ -n "$SNAPSHOT_SIZE" ]]; then
ADDITIONAL_INFO+="Size: $SNAPSHOT_SIZE"
if [[ -n "$DURATION" ]]; then
ADDITIONAL_INFO+=", "
fi
fi
if [[ -n "$DURATION" ]]; then
ADDITIONAL_INFO+="Duration: $DURATION"
fi
ADDITIONAL_INFO+=")"
fi
# Try to determine snapshot type
SNAPSHOT_TYPE=""
if [[ "$line" =~ "memory" || "$line" =~ "ram" ]]; then
SNAPSHOT_TYPE=" (With RAM)"
elif [[ "$line" =~ "disk-only" ]]; then
SNAPSHOT_TYPE=" (Disk only)"
fi
# Format the notification message
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
send_notification "$(translate "Snapshot completed for:") $NAME$ADDITIONAL_INFO$SNAPSHOT_TYPE"
else
send_notification "$(translate "Snapshot completed")$ADDITIONAL_INFO$SNAPSHOT_TYPE"
fi
event_processed=true
fi
fi
# Backup completed (NON-CRITICAL but immediate)
if [[ "$line" =~ "backup" && "$backup_complete" -eq 1 &&
( "$line" =~ "successful" || "$line" =~ "complete" || "$line" =~ "finished" || "$line" =~ "success" ) &&
! "$line" =~ "error" && ! "$line" =~ "fail" &&
"$event_processed" = false ]]; then
# Extract VM/CT ID
VM_ID=$(echo "$line" | grep -oP 'TASK \K[0-9]+')
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'VM \K[0-9]+')
fi
if [[ -z "$VM_ID" ]]; then
VM_ID=$(echo "$line" | grep -oP 'CT \K[0-9]+')
fi
# Extract backup target
BACKUP_TARGET=$(echo "$line" | grep -oP 'to ["\047]?\K[a-zA-Z0-9_-]+')
if [[ -z "$BACKUP_TARGET" ]]; then
BACKUP_TARGET=$(echo "$line" | grep -oP 'storage ["\047]?\K[a-zA-Z0-9_-]+')
fi
if [[ -z "$BACKUP_TARGET" ]]; then
BACKUP_TARGET=$(echo "$line" | grep -oP 'target ["\047]?\K[a-zA-Z0-9_-]+')
fi
# Try to extract backup size if available
BACKUP_SIZE=$(echo "$line" | grep -oP 'size: \K[0-9.]+[KMGT]B' ||
echo "$line" | grep -oP '[0-9.]+[KMGT]B' || echo "")
# Try to extract duration if available
DURATION=$(echo "$line" | grep -oP 'duration: \K[0-9.]+s' ||
echo "$line" | grep -oP 'in \K[0-9.]+s' ||
echo "$line" | grep -oP 'took \K[0-9.]+s' || echo "")
# Try to extract compression rate if available
COMPRESSION=$(echo "$line" | grep -oP 'compression: \K[0-9.]+%' ||
echo "$line" | grep -oP 'compressed: \K[0-9.]+%' || echo "")
# Format additional information
ADDITIONAL_INFO=""
if [[ -n "$BACKUP_TARGET" || -n "$BACKUP_SIZE" || -n "$DURATION" || -n "$COMPRESSION" ]]; then
ADDITIONAL_INFO+=" ("
if [[ -n "$BACKUP_TARGET" ]]; then
ADDITIONAL_INFO+="Target: $BACKUP_TARGET"
if [[ -n "$BACKUP_SIZE" || -n "$DURATION" || -n "$COMPRESSION" ]]; then
ADDITIONAL_INFO+=", "
fi
fi
if [[ -n "$BACKUP_SIZE" ]]; then
ADDITIONAL_INFO+="Size: $BACKUP_SIZE"
if [[ -n "$DURATION" || -n "$COMPRESSION" ]]; then
ADDITIONAL_INFO+=", "
fi
fi
if [[ -n "$DURATION" ]]; then
ADDITIONAL_INFO+="Duration: $DURATION"
if [[ -n "$COMPRESSION" ]]; then
ADDITIONAL_INFO+=", "
fi
fi
if [[ -n "$COMPRESSION" ]]; then
ADDITIONAL_INFO+="Compression: $COMPRESSION"
fi
ADDITIONAL_INFO+=")"
fi
# Try to determine backup type
BACKUP_TYPE=""
if [[ "$line" =~ "incremental" ]]; then
BACKUP_TYPE=" (Incremental)"
elif [[ "$line" =~ "differential" ]]; then
BACKUP_TYPE=" (Differential)"
elif [[ "$line" =~ "full" ]]; then
BACKUP_TYPE=" (Full)"
fi
# Format the notification message
if [[ -n "$VM_ID" ]]; then
NAME=$(get_vm_name "$VM_ID")
send_notification "$(translate "Backup completed for:") $NAME$ADDITIONAL_INFO$BACKUP_TYPE"
else
send_notification "$(translate "Backup completed")$ADDITIONAL_INFO$BACKUP_TYPE"
fi
event_processed=true
fi
# System update completed (NON-CRITICAL but immediate)
if [[ "$update_complete" -eq 1 ]] && [[ "$event_processed" = false ]]; then
# Match various patterns that indicate a completed update
if [[ "$line" =~ "update" && ("$line" =~ "complete" || "$line" =~ "finished" || "$line" =~ "done" || "$line" =~ "success") &&
! "$line" =~ "error" && ! "$line" =~ "fail" && ! "$line" =~ "unable" ]]; then
# Try to determine what was updated
update_type="system"
if [[ "$line" =~ "proxmox" || "$line" =~ "pve" ]]; then
update_type="Proxmox VE"
elif [[ "$line" =~ "kernel" ]]; then
update_type="kernel"
elif [[ "$line" =~ "package" ]]; then
update_type="package"
fi
# Try to extract version information if available
version_info=""
if [[ "$line" =~ "version" ]]; then
version=$(echo "$line" | grep -oP 'version \K[0-9.]+' ||
echo "$line" | grep -oP 'to \K[0-9.]+' || echo "")
if [[ -n "$version" ]]; then
version_info=" ($(translate "version") $version)"
fi
fi
# Try to extract package count if available
package_count=""
if [[ "$line" =~ "package" ]]; then
count=$(echo "$line" | grep -oP '([0-9]+) package' || echo "")
if [[ -n "$count" ]]; then
package_count=" ($count $(translate "packages"))"
fi
fi
# Try to get a list of updated packages if available
package_list=""
if [[ -f /var/log/apt/history.log ]]; then
# Get the most recent upgrade entry
recent_upgrade=$(tac /var/log/apt/history.log | grep -m 1 -A 20 "Upgrade:" | grep -v "End-Date:" | grep "Upgrade:")
if [[ -n "$recent_upgrade" ]]; then
# Extract package names and versions
packages=$(echo "$recent_upgrade" | grep -oP '[a-zA-Z0-9.-]+:[a-zA-Z0-9]+ $$[^)]+$$' | head -n 5)
if [[ -n "$packages" ]]; then
package_list="
$(translate "Updated packages:") $(echo "$packages" | tr '\n' ', ' | sed 's/,$//')"
# If there are more packages, indicate this
total_packages=$(echo "$recent_upgrade" | grep -oP '[a-zA-Z0-9.-]+:[a-zA-Z0-9]+ $$[^)]+$$' | wc -l)
if [[ $total_packages -gt 5 ]]; then
package_list="$package_list, ... ($(translate "and") $((total_packages-5)) $(translate "more"))"
fi
fi
fi
fi
# Check if a reboot is required
reboot_required=""
if [[ -f /var/run/reboot-required ]]; then
reboot_required="
⚠️ $(translate "System restart required to complete the update")"
fi
# Format the notification message
send_notification "$(translate "${update_type} update completed")${version_info}${package_count}${package_list}${reboot_required}"
event_processed=true
# Log the event
logger -t proxmox-notify "${update_type} update completed"
fi
fi
done
# Si llegamos aquí, es porque tail -F terminó inesperadamente
sleep 5
done
}
# Function: capture direct system events
capture_direct_events() {
# Variables to control notification frequency
local last_load_notification=0
local last_temp_notification=0
local last_disk_space_notification=0
local last_cpu_notification=0
local last_ram_notification=0
local last_update_notification=0
local resource_interval=900 # 15 minutes for resources
local update_interval=86400 # 24 hours for updates
local disk_full_detected=false
while true; do
current_time=$(date +%s)
# ===== CRITICAL IMMEDIATE NOTIFICATION EVENTS =====
# Disk full (CRITICAL - immediate)
if [[ "$disk_full" -eq 1 ]]; then
# Check for disks that are completely full (100%)
full_disks=$(df -h | awk '$5 == "100%" {print $1 " (100% full)"}')
# Check for disks that are nearly full (>=95%)
nearly_full_disks=$(df -h | awk '$5 >= "95%" && $5 < "100%" {print $1 " (" $5 " full)"}')
# Handle completely full disks
if [[ -n "$full_disks" && "$disk_full_detected" = false ]]; then
# Format the output for better readability
formatted_full_disks=$(echo "$full_disks" | tr '\n' ', ' | sed 's/,$//' | sed 's/,/, /g')
send_notification "🚨 $(translate "CRITICAL: Storage completely full:") $formatted_full_disks"
disk_full_detected=true
# Log the event
logger -t proxmox-notify "CRITICAL: Storage completely full: $formatted_full_disks"
elif [[ -z "$full_disks" ]]; then
disk_full_detected=false
fi
# Handle nearly full disks (separate notification)
if [[ -n "$nearly_full_disks" && "$disk_nearly_full_detected" = false ]]; then
# Format the output for better readability
formatted_nearly_full_disks=$(echo "$nearly_full_disks" | tr '\n' ', ' | sed 's/,$//' | sed 's/,/, /g')
send_notification "⚠️ $(translate "WARNING: Storage nearly full:") $formatted_nearly_full_disks"
disk_nearly_full_detected=true
# Log the event
logger -t proxmox-notify "WARNING: Storage nearly full: $formatted_nearly_full_disks"
elif [[ -z "$nearly_full_disks" ]]; then
disk_nearly_full_detected=false
fi
# Check for inode usage (sometimes disks can be full of inodes but not space)
full_inodes=""
while read -r filesystem inodes_used inodes_total iuse_percent mounted_on; do
# Skip if the line doesn't have a valid percentage
if ! [[ "$iuse_percent" =~ ^[0-9]+%$ ]]; then
continue
fi
# Extract percentage number without the % sign
percent_num=${iuse_percent/\%/}
# Skip if percentage is less than 95
if [[ $percent_num -lt 95 ]]; then
continue
fi
# Skip certain Proxmox-specific filesystems that normally show high inode usage
# but don't represent a real problem
if [[ "$filesystem" =~ ^/dev/mapper/pve- ||
"$filesystem" =~ ^/dev/pve/ ||
"$mounted_on" =~ ^/var/lib/vz/root/ ||
"$mounted_on" =~ ^/etc/pve/ ||
"$mounted_on" == "/var/lib/vz" && "$percent_num" -lt 98 ]]; then
continue
fi
# Skip tmpfs and devtmpfs filesystems
if [[ "$filesystem" == "tmpfs" || "$filesystem" == "devtmpfs" ]]; then
continue
fi
# Skip if the filesystem has very few total inodes (less than 1000)
# This helps avoid alerts on small or special filesystems
if [[ $inodes_total -lt 1000 ]]; then
continue
fi
# Get a more user-friendly name for the filesystem
fs_name="$filesystem"
if [[ "$mounted_on" != "/" ]]; then
fs_name="$mounted_on ($filesystem)"
fi
# Add to our list of filesystems with high inode usage
full_inodes+="$fs_name ($iuse_percent inodos usados, $inodes_used/$inodes_total), "
done < <(df -i | grep -v "Filesystem" | awk '{print $1, $3, $2, $5, $6}')
# Remove trailing comma and space if any
full_inodes=${full_inodes%, }
if [[ -n "$full_inodes" && "$inode_full_detected" = false ]]; then
send_notification "⚠️ $(translate "WARNING: Inode usage critical:") $full_inodes"
inode_full_detected=true
# Log the event
logger -t proxmox-notify "WARNING: Inode usage critical: $full_inodes"
elif [[ -z "$full_inodes" ]]; then
inode_full_detected=false
fi
fi
# ===== NON-CRITICAL EVENTS WITH INTERVAL =====
# High system load (NON-CRITICAL - with interval)
if [[ "$system_load_high" -eq 1 ]]; then
# Get current load averages (1, 5, 15 minutes)
load_1=$(awk '{print $1}' /proc/loadavg)
load_5=$(awk '{print $2}' /proc/loadavg)
load_15=$(awk '{print $3}' /proc/loadavg)
# Get number of CPU cores
if [[ -f /proc/cpuinfo ]]; then
cpu_cores=$(grep -c "^processor" /proc/cpuinfo)
else
# Default to 1 if we can't determine
cpu_cores=1
fi
# Calculate thresholds based on number of cores
warning_threshold=$(echo "$cpu_cores * 0.8" | bc -l)
critical_threshold=$(echo "$cpu_cores * 1.5" | bc -l)
# Format load averages for display
load_info="1m: $load_1, 5m: $load_5, 15m: $load_15"
# Check if load exceeds critical threshold
if (( $(echo "$load_1 > $critical_threshold" | bc -l) )) &&
(( current_time - last_load_notification > resource_interval )); then
# Get top processes consuming CPU
if command -v top &>/dev/null; then
top_processes=$(top -b -n 1 | head -n 12 | tail -n 5 | awk '{print $NF " (" $9 "% CPU)"}' | tr '\n' ', ' | sed 's/,$//')
process_info=" $(translate "Top processes:") $top_processes"
else
process_info=""
fi
# Get memory usage
if [[ -f /proc/meminfo ]]; then
mem_total=$(grep "MemTotal" /proc/meminfo | awk '{print $2}')
mem_available=$(grep "MemAvailable" /proc/meminfo | awk '{print $2}')
mem_used_percent=$(echo "scale=1; 100 - ($mem_available * 100 / $mem_total)" | bc -l)
memory_info=" $(translate "Memory usage:") ${mem_used_percent}%"
else
memory_info=""
fi
send_notification "🚨 $(translate "CRITICAL: Extremely high system load:") $load_info ($(translate "on") $cpu_cores $(translate "cores"))$memory_info$process_info"
last_load_notification=$current_time
# Log the event
logger -t proxmox-notify "CRITICAL: Extremely high system load: $load_info"
# Check if load exceeds warning threshold
elif (( $(echo "$load_1 > $warning_threshold" | bc -l) )) &&
(( current_time - last_load_notification > resource_interval )); then
# Get memory usage
if [[ -f /proc/meminfo ]]; then
mem_total=$(grep "MemTotal" /proc/meminfo | awk '{print $2}')
mem_available=$(grep "MemAvailable" /proc/meminfo | awk '{print $2}')
mem_used_percent=$(echo "scale=1; 100 - ($mem_available * 100 / $mem_total)" | bc -l)
memory_info=" $(translate "Memory usage:") ${mem_used_percent}%"
else
memory_info=""
fi
send_notification "⚠️ $(translate "WARNING: High system load:") $load_info ($(translate "on") $cpu_cores $(translate "cores"))$memory_info"
last_load_notification=$current_time
# Log the event
logger -t proxmox-notify "WARNING: High system load: $load_info"
fi
fi
# Available updates (NON-CRITICAL - with daily interval)
if [[ "$update_available" -eq 1 ]] && (( current_time - last_update_notification > update_interval )); then
# Update package lists quietly
apt-get update -qq &>/dev/null
# Count total upgradable packages
updates=$(apt list --upgradable 2>/dev/null | grep -v "Listing..." | wc -l)
# Check for security updates specifically
security_updates=$(apt list --upgradable 2>/dev/null | grep -i security | wc -l)
# Check for Proxmox VE updates specifically
proxmox_updates=$(apt list --upgradable 2>/dev/null | grep -E "^(proxmox-ve|pve-manager|pve-kernel|pve-container|pve-firewall|pve-ha-manager|pve-docs|pve-qemu-kvm|pve-storage|pve-cluster|pve-gui|pve-headers|pve-firmware|pve-zsync|pve-guest-common)" | wc -l)
# Get Proxmox version information
current_pve_version=$(pveversion -v 2>/dev/null | grep -oP "pve-manager/\K[0-9]+\.[0-9]+" || echo "unknown")
# Check if there's a new major Proxmox version available
new_pve_version=""
if [[ $proxmox_updates -gt 0 ]]; then
new_version_check=$(apt list --upgradable 2>/dev/null | grep "^pve-manager/" | grep -oP "pve-manager/\K[0-9]+\.[0-9]+" || echo "")
if [[ -n "$new_version_check" && "$new_version_check" != "$current_pve_version" ]]; then
new_pve_version="$new_version_check"
fi
fi
# Get list of specific packages that have updates
if [[ $updates -gt 0 ]]; then
# Get a list of all upgradable packages (limited to 10 to avoid too long messages)
package_list=$(apt list --upgradable 2>/dev/null | grep -v "Listing..." | head -n 10 | awk -F/ '{print $1}' | tr '\n' ', ' | sed 's/,$//')
# If there are more than 10 packages, indicate this
if [[ $updates -gt 10 ]]; then
package_list="$package_list, ... ($(translate "and") $((updates-10)) $(translate "more"))"
fi
# Format the notification message
update_msg=" $(translate "Updates available:") $updates"
if [[ $security_updates -gt 0 ]]; then
update_msg="$update_msg ($(translate "including") $security_updates $(translate "security updates"))"
fi
# If there's a new Proxmox version, highlight it
if [[ -n "$new_pve_version" ]]; then
update_msg="🔄 $(translate "NEW PROXMOX VERSION AVAILABLE:") $new_pve_version ($(translate "current:") $current_pve_version)
$update_msg"
elif [[ $proxmox_updates -gt 0 ]]; then
update_msg="🔄 $(translate "Proxmox updates available") ($proxmox_updates $(translate "packages"))
$update_msg"
fi
send_notification "$update_msg"
last_update_notification=$current_time
# Log the event
logger -t proxmox-notify "Updates available: $updates packages"
fi
fi
# Low disk space (NON-CRITICAL - with interval)
if [[ "$low_disk_space" -eq 1 ]] && (( current_time - last_disk_space_notification > resource_interval )); then
# Check partitions with critical space (95-99% usage)
critical_space=$(df -h | awk '$5 ~ /9[5-9]%/ && $5 != "100%" {print $1 " (" $5 " full, " $4 " free)"}')
# Check partitions with warning space (90-94% usage)
warning_space=$(df -h | awk '$5 ~ /9[0-4]%/ {print $1 " (" $5 " full, " $4 " free)"}')
# Check partitions with attention space (85-89% usage)
attention_space=$(df -h | awk '$5 ~ /8[5-9]%/ {print $1 " (" $5 " full, " $4 " free)"}')
# Format messages for better readability
if [[ -n "$critical_space" ]]; then
critical_space=$(echo "$critical_space" | tr '\n' ', ' | sed 's/,$//' | sed 's/,/, /g')
fi
if [[ -n "$warning_space" ]]; then
warning_space=$(echo "$warning_space" | tr '\n' ', ' | sed 's/,$//' | sed 's/,/, /g')
fi
if [[ -n "$attention_space" ]]; then
attention_space=$(echo "$attention_space" | tr '\n' ', ' | sed 's/,$//' | sed 's/,/, /g')
fi
# Build notification message
disk_space_msg=""
if [[ -n "$critical_space" ]]; then
disk_space_msg+="🚨 $(translate "CRITICAL: Very low disk space:") $critical_space"
fi
if [[ -n "$warning_space" ]]; then
if [[ -n "$disk_space_msg" ]]; then
disk_space_msg+="
"
fi
disk_space_msg+="⚠️ $(translate "WARNING: Low disk space:") $warning_space"
fi
if [[ -n "$attention_space" && -z "$critical_space" && -z "$warning_space" ]]; then
# Only show attention level if no higher alerts are present
disk_space_msg+=" $(translate "ATTENTION: Disk space getting low:") $attention_space"
fi
# Send notification if any space issues were detected
if [[ -n "$disk_space_msg" ]]; then
send_notification "$disk_space_msg"
last_disk_space_notification=$current_time
# Log the event
logger -t proxmox-notify "Low disk space detected"
# Suggest cleanup options for Proxmox
if [[ -d /var/lib/vz/dump || -d /var/lib/vz/template ]]; then
cleanup_msg="$(translate "TIP: Consider cleaning up old backups with:") 'rm -f /var/lib/vz/dump/vzdump-*.tar' $(translate "or old templates with:") 'rm -f /var/lib/vz/template/cache/*.tar.gz'"
send_notification "$cleanup_msg"
fi
fi
fi
# High CPU usage (NON-CRITICAL - with interval)
if [[ "$cpu_high" -eq 1 ]] && (( current_time - last_cpu_notification > resource_interval )); then
# Get number of CPU cores
if [[ -f /proc/cpuinfo ]]; then
cpu_cores=$(grep -c "^processor" /proc/cpuinfo)
else
# Default to 1 if we can't determine
cpu_cores=1
fi
# Use mpstat if available, otherwise use top
if command -v mpstat &>/dev/null; then
cpu_usage=$(mpstat 1 1 | awk '/Average:/ {print 100 - $NF}')
else
cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2 + $4}')
fi
# Round to one decimal place
cpu_usage=$(printf "%.1f" $cpu_usage)
# Get CPU temperature if available
cpu_temp=""
if command -v sensors &>/dev/null; then
# Try to get CPU temperature from sensors
cpu_temp=$(sensors | grep -i "core\|temp" | grep -oP '\+\K[0-9.]+°C' | sort -nr | head -n1)
elif [[ -f /sys/class/thermal/thermal_zone0/temp ]]; then
# Alternative method using sysfs
cpu_temp=$(echo "scale=1; $(cat /sys/class/thermal/thermal_zone0/temp) / 1000" | bc -l)
cpu_temp="${cpu_temp}°C"
fi
# Add temperature info if available
temp_info=""
if [[ -n "$cpu_temp" ]]; then
temp_info=" ($(translate "Temperature:") $cpu_temp)"
fi
# Get top CPU consuming processes
process_info=""
if command -v top &>/dev/null; then
top_processes=$(top -bn1 -o %CPU | head -n 12 | tail -n 5 | awk '{print $NF " (" $9 "%)"}' | tr '\n' ', ' | sed 's/,$//')
process_info="
$(translate "Top processes:") $top_processes"
fi
# Check for critical CPU usage (>95%)
if (( $(echo "$cpu_usage > 95" | bc -l) )); then
send_notification "🚨 $(translate "CRITICAL: Very high CPU usage:") ${cpu_usage}% ($(translate "on") $cpu_cores $(translate "cores"))${temp_info}${process_info}"
last_cpu_notification=$current_time
# Log the event
logger -t proxmox-notify "CRITICAL: Very high CPU usage: ${cpu_usage}%"
# Check for high CPU usage (>85%)
elif (( $(echo "$cpu_usage > 85" | bc -l) )); then
send_notification "⚠️ $(translate "WARNING: High CPU usage:") ${cpu_usage}% ($(translate "on") $cpu_cores $(translate "cores"))${temp_info}${process_info}"
last_cpu_notification=$current_time
# Log the event
logger -t proxmox-notify "WARNING: High CPU usage: ${cpu_usage}%"
fi
# Check for sustained moderate CPU usage (>70% for extended period)
# This requires tracking previous readings
if [[ -z "$cpu_usage_history" ]]; then
cpu_usage_history="$cpu_usage"
else
cpu_usage_history="$cpu_usage_history,$cpu_usage"
# Keep only the last 5 readings
cpu_usage_history=$(echo "$cpu_usage_history" | awk -F, '{for(i=NF-4>1?NF-4:1; i<=NF; i++) printf("%s%s", $i, i==NF?"":",") }')
# Calculate average of last readings
cpu_usage_avg=$(echo "$cpu_usage_history" | awk -F, '{sum=0; for(i=1; i<=NF; i++) sum+=$i; print sum/NF}')
# If average is >70% and we haven't sent a notification recently
if (( $(echo "$cpu_usage_avg > 70" | bc -l) )) &&
(( current_time - last_cpu_sustained_notification > resource_interval * 3 )); then
send_notification " $(translate "ATTENTION: Sustained CPU usage:") ${cpu_usage_avg}% $(translate "average over time") ($(translate "on") $cpu_cores $(translate "cores"))${temp_info}"
last_cpu_sustained_notification=$current_time
# Log the event
logger -t proxmox-notify "ATTENTION: Sustained CPU usage: ${cpu_usage_avg}%"
fi
fi
fi
# High RAM usage (NON-CRITICAL - with interval)
if [[ "$ram_high" -eq 1 ]] && (( current_time - last_ram_notification > resource_interval )); then
# Get detailed memory information
total_ram=$(free -m | awk '/Mem:/ {print $2}')
used_ram=$(free -m | awk '/Mem:/ {print $3}')
free_ram=$(free -m | awk '/Mem:/ {print $4}')
shared_ram=$(free -m | awk '/Mem:/ {print $5}')
cache_ram=$(free -m | awk '/Mem:/ {print $6}')
available_ram=$(free -m | awk '/Mem:/ {print $7}')
# Calculate percentages
ram_usage=$(echo "scale=1; ($total_ram - $available_ram) * 100 / $total_ram" | bc -l)
ram_usage_no_cache=$(echo "scale=1; ($used_ram - $cache_ram) * 100 / $total_ram" | bc -l)
# Get swap information
total_swap=$(free -m | awk '/Swap:/ {print $2}')
used_swap=$(free -m | awk '/Swap:/ {print $3}')
# Calculate swap percentage if swap exists
swap_info=""
if [[ $total_swap -gt 0 ]]; then
swap_percent=$(echo "scale=1; $used_swap * 100 / $total_swap" | bc -l)
swap_info=", $(translate "Swap:") ${swap_percent}% (${used_swap}MB/${total_swap}MB)"
fi
# Format memory values for display
ram_info="${ram_usage}% (${used_ram}MB/${total_ram}MB)"
ram_info_detailed="Used: ${used_ram}MB, Free: ${free_ram}MB, Cache: ${cache_ram}MB, Available: ${available_ram}MB"
# Get top memory consuming processes
process_info=""
if command -v ps &>/dev/null; then
top_processes=$(ps aux --sort=-%mem | head -n 6 | tail -n 5 | awk '{print $11 " (" int($4) "%)"}' | tr '\n' ', ' | sed 's/,$//')
process_info="
$(translate "Top processes:") $top_processes"
fi
# Check for critical RAM usage (>95%)
if (( $(echo "$ram_usage > 95" | bc -l) )); then
send_notification "🚨 $(translate "CRITICAL: Very high RAM usage:") ${ram_info}${swap_info}
${ram_info_detailed}${process_info}"
last_ram_notification=$current_time
# Log the event
logger -t proxmox-notify "CRITICAL: Very high RAM usage: ${ram_usage}%"
# Check for high RAM usage (>85%)
elif (( $(echo "$ram_usage > 85" | bc -l) )); then
send_notification "⚠️ $(translate "WARNING: High RAM usage:") ${ram_info}${swap_info}
${ram_info_detailed}${process_info}"
last_ram_notification=$current_time
# Log the event
logger -t proxmox-notify "WARNING: High RAM usage: ${ram_usage}%"
# Check for high RAM usage excluding cache (>80%)
# This is important because Linux uses free RAM for cache, but can free it when needed
elif (( $(echo "$ram_usage_no_cache > 80" | bc -l) )); then
send_notification " $(translate "ATTENTION: High RAM usage (excluding cache):") ${ram_usage_no_cache}%${swap_info}
${ram_info_detailed}${process_info}"
last_ram_notification=$current_time
# Log the event
logger -t proxmox-notify "ATTENTION: High RAM usage (excluding cache): ${ram_usage_no_cache}%"
fi
# Check for high swap usage if swap exists and is being used
if [[ $total_swap -gt 0 && $used_swap -gt 0 ]]; then
# Only alert on high swap if we haven't already alerted on RAM
if (( $(echo "$swap_percent > 50" | bc -l) )) &&
(( $(echo "$ram_usage <= 85" | bc -l) )) &&
(( current_time - last_swap_notification > resource_interval )); then
send_notification "⚠️ $(translate "WARNING: High swap usage:") ${swap_percent}% (${used_swap}MB/${total_swap}MB)
${ram_info_detailed}${process_info}"
last_swap_notification=$current_time
# Log the event
logger -t proxmox-notify "WARNING: High swap usage: ${swap_percent}%"
fi
fi
fi
# High temperature (NON-CRITICAL - with interval)
if [[ "$temp_high" -eq 1 ]] && (( current_time - last_temp_notification > resource_interval )); then
# Initialize variables
temp_detected=false
max_temp=0
temp_sources=""
# Method 1: Use 'sensors' command if available
if command -v sensors &>/dev/null; then
# Update sensors database if needed
if [[ ! -f /var/run/proxmox-notify-sensors-updated ]]; then
sensors-detect --auto &>/dev/null || true
touch /var/run/proxmox-notify-sensors-updated
fi
# Try to get CPU temperature from various patterns
cpu_temp=$(sensors | grep -E 'Package id 0:|Core [0-9]+:|CPU:|Tdie:|Tctl:' | grep -oP '\+\K[0-9.]+°C|[0-9.]+°C' | sed 's/°C//' | sort -nr | head -n1)
if [[ -n "$cpu_temp" && "$cpu_temp" != "0" ]]; then
temp_detected=true
if (( $(echo "$cpu_temp > $max_temp" | bc -l) )); then
max_temp=$cpu_temp
temp_sources="CPU"
fi
fi
# Try to get motherboard/system temperature
mb_temp=$(sensors | grep -E 'MB Temperature|System Temp|Board Temp|Motherboard' | grep -oP '\+\K[0-9.]+°C|[0-9.]+°C' | sed 's/°C//' | sort -nr | head -n1)
if [[ -n "$mb_temp" && "$mb_temp" != "0" ]]; then
temp_detected=true
if (( $(echo "$mb_temp > $max_temp" | bc -l) )); then
max_temp=$mb_temp
temp_sources="Motherboard"
elif (( $(echo "$mb_temp == $max_temp" | bc -l) )); then
temp_sources="$temp_sources, Motherboard"
fi
fi
# Try to get GPU temperature if available
gpu_temp=$(sensors | grep -E 'GPU|VGA' | grep -oP '\+\K[0-9.]+°C|[0-9.]+°C' | sed 's/°C//' | sort -nr | head -n1)
if [[ -n "$gpu_temp" && "$gpu_temp" != "0" ]]; then
temp_detected=true
if (( $(echo "$gpu_temp > $max_temp" | bc -l) )); then
max_temp=$gpu_temp
temp_sources="GPU"
elif (( $(echo "$gpu_temp == $max_temp" | bc -l) )); then
temp_sources="$temp_sources, GPU"
fi
fi
# Try to get disk temperature if available
disk_temp=$(sensors | grep -E 'Drive Temp|Disk Temp|Storage Temp' | grep -oP '\+\K[0-9.]+°C|[0-9.]+°C' | sed 's/°C//' | sort -nr | head -n1)
if [[ -n "$disk_temp" && "$disk_temp" != "0" ]]; then
temp_detected=true
if (( $(echo "$disk_temp > $max_temp" | bc -l) )); then
max_temp=$disk_temp
temp_sources="Disk"
elif (( $(echo "$disk_temp == $max_temp" | bc -l) )); then
temp_sources="$temp_sources, Disk"
fi
fi
fi
# Method 2: Use sysfs thermal zones if sensors not available or no temp detected
if ! $temp_detected && [[ -d /sys/class/thermal ]]; then
for zone in /sys/class/thermal/thermal_zone*/temp; do
if [[ -f "$zone" ]]; then
zone_temp=$(echo "scale=1; $(cat "$zone") / 1000" | bc -l)
if [[ -n "$zone_temp" && "$zone_temp" != "0" ]]; then
temp_detected=true
if (( $(echo "$zone_temp > $max_temp" | bc -l) )); then
max_temp=$zone_temp
# Try to get zone type
zone_dir=$(dirname "$zone")
if [[ -f "$zone_dir/type" ]]; then
zone_type=$(cat "$zone_dir/type")
temp_sources="$zone_type"
else
temp_sources="Thermal Zone"
fi
fi
fi
fi
done
fi
# Method 3: Use ipmitool if available and no temp detected yet
if ! $temp_detected && command -v ipmitool &>/dev/null; then
ipmi_temp=$(ipmitool sdr type temperature 2>/dev/null | grep -i -E 'CPU|System|Ambient|Inlet|Exhaust' | head -1 | awk '{print $4}')
if [[ -n "$ipmi_temp" && "$ipmi_temp" != "0" ]]; then
temp_detected=true
max_temp=$ipmi_temp
temp_sources="IPMI"
fi
fi
# Method 4: Use hddtemp for disk temperatures if available
if command -v hddtemp &>/dev/null; then
for disk in /dev/sd[a-z]; do
if [[ -b "$disk" ]]; then
disk_temp=$(hddtemp "$disk" 2>/dev/null | grep -oP '[0-9.]+°C' | sed 's/°C//')
if [[ -n "$disk_temp" && "$disk_temp" != "0" ]]; then
temp_detected=true
if (( $(echo "$disk_temp > $max_temp" | bc -l) )); then
max_temp=$disk_temp
disk_name=$(basename "$disk")
temp_sources="Disk $disk_name"
elif (( $(echo "$disk_temp == $max_temp" | bc -l) )); then
disk_name=$(basename "$disk")
temp_sources="$temp_sources, Disk $disk_name"
fi
fi
fi
done
fi
# If we detected a temperature, check against thresholds
if $temp_detected && [[ -n "$max_temp" && "$max_temp" != "0" ]]; then
# Critical temperature (>90°C)
if (( $(echo "$max_temp > 90" | bc -l) )); then
send_notification "🚨 $(translate "CRITICAL: Dangerously high temperature:") ${max_temp}°C (${temp_sources})"
last_temp_notification=$current_time
# Log the event
logger -t proxmox-notify "CRITICAL: Dangerously high temperature: ${max_temp}°C (${temp_sources})"
# High temperature (>80°C)
elif (( $(echo "$max_temp > 80" | bc -l) )); then
send_notification "⚠️ $(translate "WARNING: High temperature:") ${max_temp}°C (${temp_sources})"
last_temp_notification=$current_time
# Log the event
logger -t proxmox-notify "WARNING: High temperature: ${max_temp}°C (${temp_sources})"
# Elevated temperature (>70°C)
elif (( $(echo "$max_temp > 70" | bc -l) )); then
send_notification " $(translate "ATTENTION: Elevated temperature:") ${max_temp}°C (${temp_sources})"
last_temp_notification=$current_time
# Log the event
logger -t proxmox-notify "ATTENTION: Elevated temperature: ${max_temp}°C (${temp_sources})"
fi
fi
fi
# Pause between checks
sleep 30
done
}
# Function to start the notification service
start_notification_service() {
if [[ ! -f /etc/systemd/system/proxmox-telegram.service ]]; then
install_systemd_service
fi
if systemctl is-active --quiet proxmox-telegram.service; then
whiptail --title "$(translate "Information")" \
--msgbox "$(translate "The notification service is already running.")" 10 70
else
systemctl start proxmox-telegram.service
if systemctl is-active --quiet proxmox-telegram.service; then
whiptail --title "$(translate "Started")" \
--msgbox "$(translate "The service has been started successfully.")" 10 70
else
whiptail --title "$(translate "Error")" \
--msgbox "$(translate "Could not start the notification service.")" 10 70
fi
fi
}
# Function to stop the service
stop_notification_service() {
if [[ -f /etc/systemd/system/proxmox-telegram.service ]]; then
if systemctl is-active --quiet proxmox-telegram.service; then
systemctl stop proxmox-telegram.service
sleep 2
fi
if ! systemctl is-active --quiet proxmox-telegram.service; then
whiptail --title "$(translate "Stopped")" \
--msgbox "$(translate "The service has been stopped successfully.")" 10 70
else
whiptail --title "$(translate "Error")" \
--msgbox "$(translate "Could not stop the notification service.")" 10 70
fi
else
whiptail --title "$(translate "Information")" \
--msgbox "$(translate "The notification service is not installed yet.")" 10 70
fi
}
# Function to check service status
check_service_status() {
clear
if [[ -f /etc/systemd/system/proxmox-telegram.service ]]; then
systemctl status proxmox-telegram.service
else
echo "$(translate "The service is not installed.")"
fi
echo
msg_success "$(translate "Press Enter to return to the menu...")"
read -r
}
# Function to remove the systemd service
remove_systemd_service() {
if [[ -f /etc/systemd/system/proxmox-telegram.service ]]; then
if systemctl is-active --quiet proxmox-telegram.service; then
systemctl stop proxmox-telegram.service
fi
systemctl disable proxmox-telegram.service
rm -f /etc/systemd/system/proxmox-telegram.service
systemctl daemon-reexec
whiptail --title "$(translate "Removed")" \
--msgbox "$(translate "The service has been removed successfully. You can reinstall it from the menu if desired.")" 10 70
else
whiptail --title "$(translate "Information")" \
--msgbox "$(translate "The service does not exist, nothing to remove.")" 10 70
fi
}
# Functions required by systemd
start_silent() {
mkdir -p "$PID_DIR"
capture_journal_events > /dev/null 2>&1 &
echo $! > "$PID_DIR/journal.pid"
journal_pid=$!
capture_direct_events > /dev/null 2>&1 &
echo $! > "$PID_DIR/direct.pid"
direct_pid=$!
echo $$ > "$PID_DIR/service.pid"
# Wait for both processes to finish (keeps systemd service alive)
wait $journal_pid
wait $direct_pid
}
stop_silent() {
send_notification "⚠️ $(translate "System is shutting down on") $(hostname) at $(date)"
kill $(cat "$PID_DIR/journal.pid" 2>/dev/null) 2>/dev/null
kill $(cat "$PID_DIR/direct.pid" 2>/dev/null) 2>/dev/null
kill $(cat "$PID_DIR/service.pid" 2>/dev/null) 2>/dev/null
rm -f "$PID_DIR"/*.pid
}
# Function to install the service as a systemd service
install_systemd_service() {
mkdir -p "$PID_DIR"
cat > "$WRAPPER_PATH" <<EOW
#!/bin/bash
exec bash <(curl -fsSL https://raw.githubusercontent.com/MacRimi/ProxMenux/main/scripts/telegram-notifier.sh) "\$@"
EOW
chmod +x "$WRAPPER_PATH"
cat > /etc/systemd/system/proxmox-telegram.service <<EOF
[Unit]
Description=Proxmox Telegram Notification Service
After=network.target pve-cluster.service
[Service]
Type=simple
ExecStart=$WRAPPER_PATH start_silent
ExecStop=$WRAPPER_PATH stop_silent
Restart=on-failure
PIDFile=$PID_DIR/service.pid
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reexec
systemctl enable proxmox-telegram.service
systemctl start proxmox-telegram.service
}
# Main menu
main_menu() {
while true; do
local menu_options=(
"1" "$(translate "Configure Telegram")"
"2" "$(translate "Configure Notifications")"
"3" "$(translate "Start Notification Service")"
"4" "$(translate "Stop Notification Service")"
"5" "$(translate "Check Service Status")"
)
if [[ -f /etc/systemd/system/proxmox-telegram.service ]]; then
menu_options+=(
"6" "$(translate "Remove Notification Service")"
)
fi
menu_options+=(
"7" "$(translate "Exit")"
)
OPTION=$(whiptail --backtitle "ProxMenuX" --title "$(translate "Proxmox Notification Configuration")" \
--menu "$(translate "Choose an option:")" 20 70 10 \
"${menu_options[@]}" \
3>&1 1>&2 2>&3)
if [[ $? -ne 0 ]]; then
exit 0
fi
case "$OPTION" in
1) configure_telegram ;;
2) configure_notifications ;;
3) start_notification_service ;;
4) stop_notification_service ;;
5) check_service_status ;;
6) remove_systemd_service ;;
7) exit 0 ;;
esac
done
}
case "$1" in
start_silent) start_silent ;;
stop_silent) stop_silent ;;
*) main_menu ;;
esac