Update notification service

This commit is contained in:
MacRimi
2026-03-26 20:04:53 +01:00
parent 839a20df97
commit 7c5e7208b9
4 changed files with 530 additions and 276 deletions

View File

@@ -1093,7 +1093,7 @@ class HealthPersistence:
conn.commit() conn.commit()
conn.close() conn.close()
# ─── System Capabilities Cache ────────────────────────────── # ─── System Capabilities Cache ──────────────────────────────
def get_capability(self, cap_key: str) -> Optional[str]: def get_capability(self, cap_key: str) -> Optional[str]:
""" """

View File

@@ -2144,60 +2144,98 @@ class PollingCollector:
self._first_poll_done = True self._first_poll_done = True
def _check_startup_aggregation(self):
    """Check if startup period ended and emit comprehensive startup report.

    Returns without doing anything while
    ``startup_grace.is_startup_health_grace()`` is true, or once
    ``startup_grace.was_startup_aggregated()`` is true. Otherwise the
    report from ``startup_grace.collect_startup_report()`` is turned into
    a single ``system_startup`` NotificationEvent placed on ``self._queue``.

    The event severity is 'WARNING' when any guest failed to start, when
    the report flags services or storage as not OK, or when the reported
    health status is CRITICAL/WARNING; otherwise it is 'INFO'.
    """
    # Wait until health grace period is over (5 min) for complete picture
    if startup_grace.is_startup_health_grace():
        return

    # Only emit once
    if startup_grace.was_startup_aggregated():
        return

    # Collect comprehensive startup report
    report = startup_grace.collect_startup_report()

    # Generate human-readable summary
    summary = startup_grace.format_startup_summary(report)

    vms_started = report.get('vms_started', [])
    cts_started = report.get('cts_started', [])
    vms_failed = report.get('vms_failed', [])
    cts_failed = report.get('cts_failed', [])

    # Count totals
    vms_ok = len(vms_started)
    cts_ok = len(cts_started)
    vms_fail = len(vms_failed)
    cts_fail = len(cts_failed)
    total_ok = vms_ok + cts_ok
    total_fail = vms_fail + cts_fail

    # Build entity list for backwards compatibility (at most 5 VMs + 5 CTs)
    entity_names = [
        f"{guest['name']} ({guest['vmid']})"
        for guest in vms_started[:5] + cts_started[:5]
    ]
    # The remainder is derived from what was actually listed: the two
    # per-type caps can hide guests even when the total is 10 or fewer.
    hidden = total_ok - len(entity_names)
    if hidden > 0:
        entity_names.append(f"...and {hidden} more")

    # Determine severity based on issues
    has_issues = (
        total_fail > 0 or
        not report.get('services_ok', True) or
        not report.get('storage_ok', True) or
        report.get('health_status') in ['CRITICAL', 'WARNING']
    )
    severity = 'WARNING' if has_issues else 'INFO'

    # Build notification data
    data = {
        'hostname': self._hostname,
        'summary': summary,

        # VM/CT counts (backwards compatible)
        'vm_count': vms_ok,
        'ct_count': cts_ok,
        'total_count': total_ok,
        'entity_list': ', '.join(entity_names),

        # New: failure counts
        'vms_failed_count': vms_fail,
        'cts_failed_count': cts_fail,
        'total_failed': total_fail,

        # New: detailed lists
        'vms_started': vms_started,
        'cts_started': cts_started,
        'vms_failed': vms_failed,
        'cts_failed': cts_failed,

        # New: system status
        'services_ok': report.get('services_ok', True),
        'services_failed': report.get('services_failed', []),
        'storage_ok': report.get('storage_ok', True),
        'storage_unavailable': report.get('storage_unavailable', []),
        'health_status': report.get('health_status', 'UNKNOWN'),
        'health_issues': report.get('health_issues', []),

        # For AI enrichment
        '_journal_context': report.get('_journal_context', ''),

        # Metadata
        'startup_duration_seconds': report.get('startup_duration_seconds', 0),
        'has_issues': has_issues,
        'reason': summary.split('\n')[0],  # First line as reason
    }

    self._queue.put(NotificationEvent(
        'system_startup', severity, data, source='polling',
        entity='node', entity_id='',
    ))
@@ -2500,7 +2538,7 @@ class PollingCollector:
except Exception as e: except Exception as e:
print(f"[PollingCollector] AI model check failed: {e}") print(f"[PollingCollector] AI model check failed: {e}")
# ── Persistence helpers ─────────────────────────────────── # ── Persistence helpers ───────────────────────────────────
def _load_last_notified(self): def _load_last_notified(self):
"""Load per-error notification timestamps from DB on startup.""" """Load per-error notification timestamps from DB on startup."""

View File

@@ -17,7 +17,7 @@ import socket
import time import time
import urllib.request import urllib.request
import urllib.error import urllib.error
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List, Tuple
# ─── vzdump message parser ─────────────────────────────────────── # ─── vzdump message parser ───────────────────────────────────────
@@ -314,6 +314,90 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
return '\n'.join(parts) return '\n'.join(parts)
def _format_system_startup(data: Dict[str, Any]) -> Tuple[str, str]:
"""
Format comprehensive system startup report.
Returns (title, body) tuple for the notification.
Handles both simple startups (all OK) and those with issues.
"""
hostname = data.get('hostname', 'unknown')
has_issues = data.get('has_issues', False)
# Build title
if has_issues:
total_issues = (
data.get('total_failed', 0) +
len(data.get('services_failed', [])) +
len(data.get('storage_unavailable', []))
)
title = f"{hostname}: System startup - {total_issues} issue(s) detected"
else:
title = f"{hostname}: System startup completed"
# Build body
parts = []
# Overall status
if not has_issues:
parts.append("All systems operational.")
# VMs/CTs started
vms_ok = len(data.get('vms_started', []))
cts_ok = len(data.get('cts_started', []))
if vms_ok or cts_ok:
count_parts = []
if vms_ok:
count_parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
if cts_ok:
count_parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")
# List names (up to 5)
names = []
for vm in data.get('vms_started', [])[:3]:
names.append(f"{vm['name']} ({vm['vmid']})")
for ct in data.get('cts_started', [])[:3]:
names.append(f"{ct['name']} ({ct['vmid']})")
line = f"\u2705 {' and '.join(count_parts)} started"
if names:
if len(names) <= 5:
line += f": {', '.join(names)}"
else:
line += f": {', '.join(names[:5])}..."
parts.append(line)
# Failed VMs/CTs
for vm in data.get('vms_failed', []):
reason = vm.get('reason', 'unknown error')
parts.append(f"\u274C VM failed: {vm['name']} - {reason}")
for ct in data.get('cts_failed', []):
reason = ct.get('reason', 'unknown error')
parts.append(f"\u274C CT failed: {ct['name']} - {reason}")
# Storage issues
storage_unavailable = data.get('storage_unavailable', [])
if storage_unavailable:
names = [s['name'] for s in storage_unavailable[:3]]
parts.append(f"\u26A0\uFE0F Storage: {len(storage_unavailable)} unavailable ({', '.join(names)})")
# Service issues
services_failed = data.get('services_failed', [])
if services_failed:
names = [s['name'] for s in services_failed[:3]]
parts.append(f"\u26A0\uFE0F Services: {len(services_failed)} failed ({', '.join(names)})")
# Startup duration
duration = data.get('startup_duration_seconds', 0)
if duration:
minutes = int(duration // 60)
parts.append(f"\u23F1\uFE0F Startup completed in {minutes} min")
body = '\n'.join(parts)
return title, body
# ─── Severity Icons ────────────────────────────────────────────── # ─── Severity Icons ──────────────────────────────────────────────
SEVERITY_ICONS = { SEVERITY_ICONS = {
@@ -645,11 +729,12 @@ TEMPLATES = {
# ── Services events ── # ── Services events ──
'system_startup': { 'system_startup': {
'title': '{hostname}: System startup — {summary}', 'title': '{hostname}: {reason}',
'body': 'System startup completed.\n{summary}\n\nGuests: {entity_list}', 'body': '{summary}',
'label': 'System startup', 'label': 'System startup report',
'group': 'services', 'group': 'services',
'default_enabled': True, 'default_enabled': True,
'formatter': '_format_system_startup',
}, },
'system_shutdown': { 'system_shutdown': {
'title': '{hostname}: System shutting down', 'title': '{hostname}: System shutting down',
@@ -959,7 +1044,19 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
pve_message = data.get('pve_message', '') pve_message = data.get('pve_message', '')
pve_title = data.get('pve_title', '') pve_title = data.get('pve_title', '')
if event_type in ('backup_complete', 'backup_fail') and pve_message: # Check for custom formatter function
formatter_name = template.get('formatter')
if formatter_name and formatter_name in globals():
formatter_func = globals()[formatter_name]
try:
title, body_text = formatter_func(data)
except Exception:
# Fallback to standard formatting if formatter fails
try:
body_text = template['body'].format(**variables)
except (KeyError, ValueError):
body_text = template['body']
elif event_type in ('backup_complete', 'backup_fail') and pve_message:
parsed = _parse_vzdump_message(pve_message) parsed = _parse_vzdump_message(pve_message)
if parsed: if parsed:
is_success = (event_type == 'backup_complete') is_success = (event_type == 'backup_complete')
@@ -1288,134 +1385,165 @@ AI_DETAIL_TOKENS = {
# System prompt template - informative, no recommendations # System prompt template - informative, no recommendations
AI_SYSTEM_PROMPT = """You are a system notification formatter for ProxMenux Monitor, a Proxmox VE monitoring tool. AI_SYSTEM_PROMPT = """You are a system notification formatter for ProxMenux Monitor, a Proxmox VE monitoring tool.
Your task is to translate and reformat incoming server alert messages into {language}. Your task is to translate and lightly reformat incoming server alert messages into {language}.
═══ CORE ROLE ═══
You are a formatter, not an analyst.
Translate, clean, and present the message clearly.
Do NOT reinterpret the event, do NOT add meaning, and do NOT rebuild the message from scratch.
═══ ABSOLUTE RULES ═══ ═══ ABSOLUTE RULES ═══
1. Translate BOTH title and body to {language}. Every word, label, and unit must be in {language}. 1. Translate BOTH title and body into {language}.
2. NO markdown: no **bold**, no *italic*, no `code`, no headers (#), no bullet lists (- or *)
3. Plain text only — the output is sent to chat apps and email which handle their own formatting
4. Tone: factual, concise, technical. No greetings, no closings, no apologies
5. DO NOT add recommendations, action items, or suggestions ("you should…", "consider…")
6. Present ONLY the facts already in the input — do not invent or assume information
7. OUTPUT ONLY THE FINAL RESULT — never include both original and processed versions.
Do NOT append "Original message:", "Original:", "Source:", or any before/after comparison.
Return ONLY the single, final formatted message in {language}.
8. PLAIN NARRATIVE LINES — if a line in the input is a complete sentence (not a "Label: value"
pair), translate it as-is. Never prepend "Message:", "Note:", or any other label to a sentence.
9. Detail level to apply: {detail_level}
- brief → 2-3 lines, essential data only (status + key metric)
- standard → short paragraph covering who/what/where and the key value
- detailed → full technical breakdown of all available fields
10. Keep the "hostname: " prefix in the title. Translate only the descriptive part.
Example: "pve01: Updates available""pve01: Actualizaciones disponibles"
11. EMPTY LIST VALUES — if a list field is empty, "none", or "0":
Always write the translated word for "none" on the line after the label, never leave it blank.
Example: 🗂️ Important packages:\\n• none
Example (Spanish): 🗂️ Paquetes importantes:\\n• ninguno
Example (Français): 🗂️ Paquets importants:\\n• aucun
12. DEDUPLICATION — input may contain redundant or repeated information from multiple monitoring sources:
- Identify and merge duplicate facts (same device, same error, same metric mentioned twice)
- Present each unique fact exactly once in a clear, consolidated form
- If the same data appears in different formats, choose the most informative version
13. PROXMOX CONTEXT — silently translate Proxmox technical references into plain language.
Never explain what the term means — just use the human-readable equivalent directly.
Service / process name mapping (replace the raw name with the friendly form): 2. Translate human-readable text only.
- "pve-container@XXXX.service""Container CT XXXX" Do NOT translate:
- "qemu-server@XXXX.service""Virtual Machine VM XXXX" - hostnames
- "pvesr-XXXX""storage replication job for XXXX" - device paths (/dev/sdX, /dev/nvmeXnX)
- "vzdump""backup process" - filesystem paths
- "pveproxy""Proxmox web proxy" - IDs, VMIDs, CTIDs, UUIDs
- "pvedaemon""Proxmox daemon" - timestamps, dates, archive names, PBS paths
- "pvestatd""Proxmox statistics service" - version numbers
- "pvescheduler" "Proxmox task scheduler" - technical units (B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, %, ms, s)
- "pve-cluster""Proxmox cluster service"
- "corosync""cluster communication service"
- "ceph-osd@N""Ceph storage disk N"
- "ceph-mon""Ceph monitor service"
systemd message patterns (rewrite the whole phrase, not just the service name): 3. Plain text only.
- "systemd[1]: pve-container@9000.service: Failed" No markdown: no **bold**, no *italic*, no `code`, no headers (#), no markdown lists (- or *).
"Container CT 9000 service failed" The bullet character "" is allowed only where explicitly required.
- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
"Virtual Machine VM 100 failed to start"
- "systemd[1]: Started pve-container@9000.service"
"Container CT 9000 started"
ATA / SMART / kernel error patterns (replace raw kernel log with plain description): 4. Tone: factual, concise, technical.
- "ata8.00: exception Emask 0x1 SAct 0x4ce0 SErr 0x40000 action 0x0" No greetings, no closings, no apologies, no conversational filler.
"ATA controller error on port 8"
- "blk_update_request: I/O error, dev sdX, sector NNNN" 5. Do NOT add recommendations, action items, remediation, or suggestions.
"I/O error on disk /dev/sdX at sector NNNN"
- "SCSI error: return code = 0x08000002" 6. Present ONLY the facts already present in the input.
"SCSI communication error" Do NOT invent, assume, explain, soften, or escalate anything.
7. Do NOT change severity or status meaning.
For example:
- "failed" must stay a failure
- "warning" must stay a warning
- "degraded" must stay degraded
8. Preserve structure whenever possible.
Keep the same fields, lines, and data already present in the input.
Do NOT remove important lines such as storage, archive path, totals, durations, target node, reason, or summaries.
9. Reordering must be minimal.
Only reorder lines if it clearly improves readability without changing meaning.
10. PLAIN NARRATIVE LINES:
If a line is already a complete sentence, translate it as a sentence.
Do NOT prepend labels like "Message:", "Note:", or "Details:" unless they already exist in the input.
11. Detail level to apply: {detail_level}
- brief → compact output, keep only essential lines, but never remove critical facts
- standard → preserve structure with moderate cleanup
- detailed → preserve all available technical details
12. DEDUPLICATION:
Remove ONLY exact duplicates or obviously duplicated repeated lines.
Do NOT merge distinct facts just because they look similar.
Do NOT summarize multiple separate events into one.
13. Keep the "hostname: " prefix in the title.
Translate only the descriptive part.
Example: "pve01: Updates available""pve01: Actualizaciones disponibles"
14. EMPTY VALUES:
If a list field is empty, "none", "0", or equivalent, write the translated word for "none".
Never leave a declared field blank.
15. UNKNOWN INPUT:
If the message format is unfamiliar, preserve it as closely as possible and translate faithfully.
Do NOT force it into another template.
═══ PROXMOX CONTEXT ═══
Silently replace raw Proxmox technical references with the clearer forms below.
Do NOT explain them. Just use the friendly equivalent directly.
Service / process mappings:
- "pve-container@XXXX.service""Container CT XXXX"
- "qemu-server@XXXX.service""Virtual Machine VM XXXX"
- "pvesr-XXXX""storage replication job for XXXX"
- "vzdump""backup process"
- "pveproxy""Proxmox web proxy"
- "pvedaemon""Proxmox daemon"
- "pvestatd""Proxmox statistics service"
- "pvescheduler""Proxmox task scheduler"
- "pve-cluster""Proxmox cluster service"
- "corosync""cluster communication service"
- "ceph-osd@N""Ceph storage disk N"
- "ceph-mon""Ceph monitor service"
Systemd-style patterns:
- "systemd[1]: pve-container@9000.service: Failed"
"Container CT 9000 service failed"
- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
"Virtual Machine VM 100 failed to start"
- "systemd[1]: Started pve-container@9000.service"
"Container CT 9000 started"
Kernel / storage patterns:
- "ata8.00: exception Emask ..."
"ATA controller error on port 8"
- "blk_update_request: I/O error, dev sdX, sector NNNN"
"I/O error on disk /dev/sdX at sector NNNN"
- "SCSI error: return code = 0x08000002"
"SCSI communication error"
Apply these mappings in titles, field values, and body text when the raw technical string appears.
Apply these mappings everywhere: in the body narrative, in field values, and when
the raw technical string appears inside a longer sentence.
{emoji_instructions} {emoji_instructions}
═══ MESSAGE TYPES — FORMAT RULES ═══ ═══ MESSAGE-TYPE GUIDANCE ═══
BACKUP (backup_complete / backup_fail / backup_start): BACKUP (backup_complete / backup_fail / backup_start):
Input contains: VM/CT names, IDs, size, duration, storage location, status per VM - Preserve per-VM / per-CT detail if present.
Output body: first line is plain text (no emoji) describing the event briefly. - Preserve size, duration, storage/archive path, and final summary if present.
Then list each VM/CT with its fields. End with a summary line. - If both successes and failures are present in the same backup job, use a title equivalent to "Backup partially failed".
PARTIAL FAILURE RULE: if some VMs succeeded and at least one failed, use a combined title - Do NOT collapse multi-guest backup results into a single generic sentence.
like "Backup partially failed" / "Copia de seguridad parcialmente fallida" — never say
"backup failed" when there are also successful VMs in the same job.
NEVER omit the storage/archive line or the summary line — always include them even for long jobs.
UPDATES (update_summary): UPDATES (update_summary):
- Each count on its own line with its label. - Keep each count on its own line.
- Package list uses "" (bullet + space) per package, NOT the 🗂️ emoji on each line. - Keep the important packages block if present.
- The 🗂️ emoji goes only on the "Important packages:" header line. - Use "" for package items.
- NEVER add a redundant summary line repeating the total count. - Do NOT add a redundant summary line repeating totals already shown.
PVE UPDATE (pve_update):
- First line: plain sentence announcing the new version (no emoji on this line).
- Blank line after intro.
- Current version: 🔹 prefix | New version: 🟢 prefix
- Blank line before packages block.
- Packages header: 🗂️ | Package lines: 📌 prefix with version arrow v{{old}} ➜ v{{new}}
DISK / SMART ERRORS (disk_io_error / storage_unavailable): PVE UPDATE (pve_update):
Input contains: device name, error type, SMART values or I/O error codes - Preserve current version, new version, and package list if present.
Output body: device, then the specific error or failing attribute - Keep the announcement concise.
DEDUPLICATION: Input may contain repeated or similar information from multiple sources.
If you see the same device, error count, or technical details mentioned multiple times, DISK / SMART / STORAGE (disk_io_error / storage_unavailable):
consolidate them into a single, clear statement. Never repeat the same information twice. - Preserve device, specific error, failing attribute, and counts if present.
- Do NOT repeat the same disk fact twice.
RESOURCES (cpu_high / ram_high / temp_high / load_high): RESOURCES (cpu_high / ram_high / temp_high / load_high):
Input contains: current value, threshold, core count - Preserve current value, threshold, and context if present.
Output: current value vs threshold, context if available
SECURITY (auth_fail / ip_block): SECURITY (auth_fail / ip_block):
Input contains: source IP, user, service, jail, failure count - Keep source IP, user, service, jail, and failure count on separate clear lines if present.
Output: list each field on its own line
VM/CT LIFECYCLE (vm_start, vm_stop, vm_fail, ct_*, migration_*, replication_*): VM / CT LIFECYCLE (vm_*, ct_*, migration_*, replication_*):
Input contains: VM name, ID, target node (migrations), reason (failures) - Keep name, ID, state, reason, and target node if present.
Output: one or two lines confirming the event with key facts - Keep lifecycle messages compact unless detail_level is detailed.
CLUSTER (split_brain / node_disconnect / node_reconnect): CLUSTER / HEALTH:
Input: node name, quorum status - Preserve node name, quorum, category, severity, duration, and reason if present.
Output: state change + quorum value
HEALTH (new_error / error_resolved / health_persistent / health_degraded): ═══ OUTPUT FORMAT ═══
Input: category, severity, duration, reason
Output: what changed, in which category, for how long (if resolved)
CRITICAL:
- [TITLE] on its own line, title text on the very next line — no blank line between them
- [BODY] on its own line, body text starting on the very next line — no blank line between them
- Do NOT write "Title:", "Body:", or any label substituting the markers
- Do NOT include the literal words TITLE or BODY anywhere in the translated content
═══ OUTPUT FORMAT (follow exactly — parsers rely on these markers) ═══
[TITLE] [TITLE]
translated title here translated title here
[BODY] [BODY]
translated body here""" translated body here
CRITICAL OUTPUT RULES:
- Write [TITLE] on its own line
- Write the title on the next line
- Write [BODY] on its own line
- Write the body starting on the next line
- Do NOT replace these markers with "Title:" or "Body:"
- Do NOT include any extra text before or after the formatted result
- Do NOT add blank lines between [TITLE] and the title
- Do NOT add blank lines between [BODY] and the first body line"""
# Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover) # Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
AI_EMOJI_INSTRUCTIONS = """ AI_EMOJI_INSTRUCTIONS = """
@@ -1485,135 +1613,10 @@ A blank line must be completely empty — no emoji, no spaces.
🟢 new version (pve_update) 🟢 new version (pve_update)
BLANK LINES FOR READABILITY — insert ONE blank line between logical sections within the body. BLANK LINES:
Blank lines go BETWEEN groups, not before the first line or after the last line. Insert one blank line only between logical sections inside the body.
A blank line must be completely empty — no emoji, no spaces. Do not add a blank line before the first body line or after the last one.
"""
When to add a blank line:
- Updates: after the last count line, before the packages block
- Backup multi-VM: one blank line between each VM entry; one blank line before the summary line
- Disk/SMART errors: after the device line, before the error description lines
- VM events with a reason: after the main status line, before Reason / Node / Target lines
- Health events: after the category/status line, before duration or detail lines
EXAMPLE — CT shutdown:
[TITLE]
🔽 amd: CT alpine (101) shut down
[BODY]
🏷️ Container alpine (ID: 101)
✔️ Cleanly shut down
EXAMPLE — VM started:
[TITLE]
🚀 pve01: VM arch-linux (100) started
[BODY]
🏷️ Virtual machine arch-linux (ID: 100)
✔️ Now running
EXAMPLE — migration complete:
[TITLE]
🚚 amd: Migration complete — web01 (100)
[BODY]
🏷️ Virtual machine web01 (ID: 100)
✔️ Successfully migrated
🎯 Target: node02
EXAMPLE — updates message (no important packages):
[TITLE]
📦 amd: Updates available
[BODY]
📦 Total updates: 24
🔒 Security updates: 6
🔄 Proxmox updates: 0
⚙️ Kernel updates: 0
🗂️ Important packages:
• none
EXAMPLE — updates message (with important packages):
[TITLE]
📦 amd: Updates available
[BODY]
📦 Total updates: 90
🔒 Security updates: 6
🔄 Proxmox updates: 14
⚙️ Kernel updates: 1
🗂️ Important packages:
• pve-manager (9.1.4 -> 9.1.6)
• qemu-server (9.1.3 -> 9.1.4)
• pve-container (6.0.18 -> 6.1.2)
EXAMPLE — pve_update (new Proxmox VE version):
[TITLE]
🆕 pve01: Proxmox VE 9.1.6 available
[BODY]
🚀 A new Proxmox VE release is available.
🔹 Current: 9.1.4
🟢 New: 9.1.6
🗂️ Important packages:
📌 pve-manager (v9.1.4 ➜ v9.1.6)
EXAMPLE — backup complete with multiple VMs:
[TITLE]
💾✅ pve01: Backup complete
[BODY]
Backup job finished on storage local-bak.
🏷️ VM web01 (ID: 100)
✔️ Status: ok
💽 Size: 12.3 GiB
⏱️ Duration: 00:04:21
🗄️ Storage: vm/100/2026-03-17T22:00:08Z
🏷️ CT db (ID: 101)
✔️ Status: ok
💽 Size: 4.1 GiB
⏱️ Duration: 00:01:10
🗄️ Storage: ct/101/2026-03-17T22:04:29Z
📊 Total: 2 backups | 💾 16.4 GiB | ⏱️ 00:05:31
EXAMPLE — backup partially failed (some ok, some failed):
[TITLE]
💾❌ pve01: Backup partially failed
[BODY]
Backup job finished with errors on storage PBS2.
🏷️ VM web01 (ID: 100)
✔️ Status: ok
💽 Size: 12.3 GiB
⏱️ Duration: 00:04:21
🗄️ Storage: vm/100/2026-03-17T22:00:08Z
🏷️ VM broken (ID: 102)
❌ Status: error
💽 Size: 0 B
⏱️ Duration: 00:00:37
📊 Total: 2 backups | ❌ 1 failed | 💾 12.3 GiB | ⏱️ 00:04:58
EXAMPLE — disk I/O health warning:
[TITLE]
💥 amd: Health warning — Disk I/O errors
[BODY]
💿 Device: /dev/sda
⚠️ 1 sector currently unreadable (pending)
📝 Disk reports sectors in pending reallocation state
EXAMPLE — health degraded (multiple issues):
[TITLE]
⚠️ amd: 2 health checks degraded
[BODY]
💥 Disk I/O error on /dev/sda: 1 sector currently unreadable (pending)
🏷️ Container CT 9005: ❌ failed to start
🏷️ Container CT 9004: ❌ failed to start
🏷️ Container CT 9002: ❌ failed to start"""
# No emoji instructions for email/plain text channels # No emoji instructions for email/plain text channels

View File

@@ -120,7 +120,7 @@ class _StartupGraceState:
with self._lock: with self._lock:
return time.time() - self._startup_time return time.time() - self._startup_time
# ─── Shutdown Tracking ────────────────────────────────────────────────── # ─── Shutdown Tracking ──────────────────────────────────────────────────
def mark_shutdown(self): def mark_shutdown(self):
""" """
@@ -231,6 +231,219 @@ def was_startup_aggregated() -> bool:
return _state.was_startup_aggregated() return _state.was_startup_aggregated()
# ─── Startup Report Collection ───────────────────────────────────────────────
def collect_startup_report() -> dict:
"""
Collect comprehensive startup report data.
Called at the end of the grace period to generate a complete
startup report including:
- VMs/CTs that started successfully
- VMs/CTs that failed to start
- Service status
- Storage status
- Journal errors during boot (for AI enrichment)
Errors raised while reading health data or the journal are not propagated;
their text is appended to the report's '_startup_errors' list instead.
Returns:
Dictionary with startup report data
"""
# Imported locally; only the journalctl call near the end uses it.
import subprocess
# Defaults describe a clean startup; the sections below overwrite them.
report = {
# VMs/CTs
'vms_started': [],
'cts_started': [],
'vms_failed': [],
'cts_failed': [],
# System status
'services_ok': True,
'services_failed': [],
'storage_ok': True,
'storage_unavailable': [],
# Health summary
'health_status': 'OK',
'health_issues': [],
# For AI enrichment
'_journal_context': '',
'_startup_errors': [],
# Metadata
'startup_duration_seconds': get_startup_elapsed(),
'timestamp': int(time.time()),
}
# Get VMs/CTs that started during boot
startup_vms = get_and_clear_startup_vms()
# Each item unpacks as (vmid, name, type); any type other than 'vm' is
# recorded as a container.
for vmid, vmname, vm_type in startup_vms:
if vm_type == 'vm':
report['vms_started'].append({'vmid': vmid, 'name': vmname})
else:
report['cts_started'].append({'vmid': vmid, 'name': vmname})
# Try to get health status from health_monitor
try:
import health_monitor
health_data = health_monitor.get_detailed_status()
if health_data:
report['health_status'] = health_data.get('overall_status', 'UNKNOWN')
# Check storage
storage_cat = health_data.get('categories', {}).get('storage', {})
if storage_cat.get('status') in ['CRITICAL', 'WARNING']:
report['storage_ok'] = False
for check in storage_cat.get('checks', []):
if check.get('status') in ['CRITICAL', 'WARNING', 'error']:
# 'reason' is preferred; 'message' is the fallback text.
report['storage_unavailable'].append({
'name': check.get('name', 'unknown'),
'reason': check.get('reason', check.get('message', ''))
})
# Check services
services_cat = health_data.get('categories', {}).get('services', {})
if services_cat.get('status') in ['CRITICAL', 'WARNING']:
report['services_ok'] = False
for check in services_cat.get('checks', []):
if check.get('status') in ['CRITICAL', 'WARNING', 'error']:
report['services_failed'].append({
'name': check.get('name', 'unknown'),
'reason': check.get('reason', check.get('message', ''))
})
# Check VMs category for failed VMs
vms_cat = health_data.get('categories', {}).get('vms', {})
for check in vms_cat.get('checks', []):
if check.get('status') in ['CRITICAL', 'WARNING', 'error']:
# Determine if VM or CT based on name/type
check_name = check.get('name', '')
check_reason = check.get('reason', check.get('message', ''))
# Only start failures are reported; 'error al iniciar' is the Spanish
# wording for 'failed to start'. Both are matched case-insensitively.
if 'error al iniciar' in check_reason.lower() or 'failed to start' in check_reason.lower():
# A check name containing 'CT' or 'Container' is filed as a container;
# everything else is filed as a VM. Failed entries carry no 'vmid' key.
if 'CT' in check_name or 'Container' in check_name:
report['cts_failed'].append({
'name': check_name,
'reason': check_reason
})
else:
report['vms_failed'].append({
'name': check_name,
'reason': check_reason
})
# Collect all health issues for summary
for cat_name, cat_data in health_data.get('categories', {}).items():
if cat_data.get('status') in ['CRITICAL', 'WARNING']:
report['health_issues'].append({
'category': cat_name,
'status': cat_data.get('status'),
'reason': cat_data.get('reason', '')
})
except Exception as e:
# Best effort: a missing or failing health_monitor must not abort the report.
report['_startup_errors'].append(f"Error getting health data: {e}")
# Get journal errors during startup (for AI enrichment)
try:
# NOTE(review): reads _state._startup_time directly rather than through an
# accessor; confirm whether the state lock should be held for this read.
boot_time = int(_state._startup_time)
# journalctl: '-p err' keeps entries of priority err and more severe,
# '--since @<epoch>' takes the start as a UNIX timestamp, '-n 50' caps the
# output at 50 lines.
result = subprocess.run(
['journalctl', '-p', 'err', '--since', f'@{boot_time}', '--no-pager', '-n', '50'],
capture_output=True,
text=True,
timeout=10
)
# Output is kept only on a zero exit status with non-blank stdout.
if result.returncode == 0 and result.stdout.strip():
report['_journal_context'] = result.stdout.strip()
except Exception as e:
report['_startup_errors'].append(f"Error getting journal: {e}")
return report
def format_startup_summary(report: dict) -> str:
    """
    Format a human-readable startup summary from report data.

    The first line is the headline ("System startup completed" or
    "System startup - ... detected"); the remaining lines list started
    guests, failed guests, unavailable storage and failed services.

    Args:
        report: Dictionary from collect_startup_report(). Every key is
            optional; missing keys are treated as "nothing to report".

    Returns:
        Formatted summary string (lines joined with newlines)
    """
    def _guest_label(guest: dict) -> str:
        # Tolerate entries without a vmid/name instead of raising KeyError.
        return f"{guest.get('name', 'unknown')} ({guest.get('vmid', '?')})"

    vms_started = report.get('vms_started', [])
    cts_started = report.get('cts_started', [])
    vms_failed = report.get('vms_failed', [])
    cts_failed = report.get('cts_failed', [])
    services_failed = report.get('services_failed', [])
    storage_unavailable = report.get('storage_unavailable', [])

    # Count totals
    vms_ok = len(vms_started)
    cts_ok = len(cts_started)
    total_ok = vms_ok + cts_ok
    total_fail = len(vms_failed) + len(cts_failed)

    # Determine overall status
    has_issues = (
        total_fail > 0 or
        not report.get('services_ok', True) or
        not report.get('storage_ok', True) or
        report.get('health_status') in ['CRITICAL', 'WARNING']
    )

    lines = []

    # Header
    if has_issues:
        issue_count = total_fail + len(services_failed) + len(storage_unavailable)
        # has_issues can be triggered by the overall health status alone;
        # fall back to the health issue list so the header never reads
        # "0 issue(s) detected".
        if not issue_count:
            issue_count = len(report.get('health_issues', []))
        if issue_count:
            lines.append(f"System startup - {issue_count} issue(s) detected")
        else:
            lines.append("System startup - issues detected")
    else:
        lines.append("System startup completed")
        lines.append("All systems operational.")

    # VMs/CTs started
    if total_ok > 0:
        parts = []
        if vms_ok > 0:
            parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
        if cts_ok > 0:
            parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")

        # List names: all of them up to 5, otherwise the first 3 plus a count
        names = [_guest_label(g) for g in vms_started]
        names += [_guest_label(g) for g in cts_started]

        line = f"{' and '.join(parts)} started"
        if len(names) <= 5:
            line += f": {', '.join(names)}"
        else:
            line += f": {', '.join(names[:3])}... (+{len(names) - 3} more)"
        lines.append(line)

    # Failed VMs/CTs
    for vm in vms_failed:
        lines.append(f"VM failed: {vm.get('name', 'unknown')} - {vm.get('reason', 'unknown error')}")
    for ct in cts_failed:
        lines.append(f"CT failed: {ct.get('name', 'unknown')} - {ct.get('reason', 'unknown error')}")

    # Storage issues
    if not report.get('storage_ok', True) and storage_unavailable:
        names = [s.get('name', 'unknown') for s in storage_unavailable]
        lines.append(f"Storage: {len(storage_unavailable)} unavailable ({', '.join(names[:3])})")

    # Service issues
    if not report.get('services_ok', True) and services_failed:
        names = [s.get('name', 'unknown') for s in services_failed]
        lines.append(f"Services: {len(services_failed)} failed ({', '.join(names[:3])})")

    return '\n'.join(lines)
# ─── For backwards compatibility ───────────────────────────────────────────── # ─── For backwards compatibility ─────────────────────────────────────────────
# Expose constants for external use # Expose constants for external use