From 54eab9af4905352912087df4f6ad227a0894a27e Mon Sep 17 00:00:00 2001 From: MacRimi Date: Mon, 30 Mar 2026 18:53:03 +0200 Subject: [PATCH] Update notification service --- AppImage/scripts/ai_context_enrichment.py | 375 +++++++++++++++++++++ AppImage/scripts/build_appimage.sh | 2 + AppImage/scripts/health_persistence.py | 301 +++++++++++++++++ AppImage/scripts/notification_events.py | 4 +- AppImage/scripts/notification_manager.py | 48 ++- AppImage/scripts/notification_templates.py | 63 +++- AppImage/scripts/proxmox_known_errors.py | 348 +++++++++++++++++++ 7 files changed, 1106 insertions(+), 35 deletions(-) create mode 100644 AppImage/scripts/ai_context_enrichment.py create mode 100644 AppImage/scripts/proxmox_known_errors.py diff --git a/AppImage/scripts/ai_context_enrichment.py b/AppImage/scripts/ai_context_enrichment.py new file mode 100644 index 00000000..b6367e1c --- /dev/null +++ b/AppImage/scripts/ai_context_enrichment.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +""" +AI Context Enrichment Module + +Enriches notification context with additional information to help AI provide +more accurate and helpful responses: + +1. Event frequency - how often this error has occurred +2. System uptime - helps distinguish startup issues from runtime failures +3. SMART disk data - for disk-related errors +4. Known error matching - from proxmox_known_errors database + +Author: MacRimi +""" + +import os +import re +import subprocess +from datetime import datetime, timedelta +from typing import Optional, Dict, Any +import sqlite3 +from pathlib import Path + +# Import known errors database +try: + from proxmox_known_errors import get_error_context, find_matching_error +except ImportError: + def get_error_context(*args, **kwargs): + return None + def find_matching_error(*args, **kwargs): + return None + +DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db') + + +def get_system_uptime() -> str: + """Get system uptime in human-readable format. 
+ + Returns: + String like "2 minutes (recently booted)" or "89 days, 4 hours (stable system)" + """ + try: + with open('/proc/uptime', 'r') as f: + uptime_seconds = float(f.readline().split()[0]) + + days = int(uptime_seconds // 86400) + hours = int((uptime_seconds % 86400) // 3600) + minutes = int((uptime_seconds % 3600) // 60) + + # Build human-readable string + parts = [] + if days > 0: + parts.append(f"{days} day{'s' if days != 1 else ''}") + if hours > 0: + parts.append(f"{hours} hour{'s' if hours != 1 else ''}") + if not parts: # Less than an hour + parts.append(f"{minutes} minute{'s' if minutes != 1 else ''}") + + uptime_str = ", ".join(parts) + + # Add context hint + if uptime_seconds < 600: # Less than 10 minutes + return f"{uptime_str} (just booted - likely startup issue)" + elif uptime_seconds < 3600: # Less than 1 hour + return f"{uptime_str} (recently booted)" + elif days >= 30: + return f"{uptime_str} (stable system)" + else: + return uptime_str + + except Exception: + return "unknown" + + +def get_event_frequency(error_id: str = None, error_key: str = None, + category: str = None, hours: int = 24) -> Optional[Dict[str, Any]]: + """Get frequency information for an error from the database. + + Args: + error_id: Specific error ID to look up + error_key: Alternative error key + category: Error category + hours: Time window to check (default 24h) + + Returns: + Dict with frequency info or None + """ + if not DB_PATH.exists(): + return None + + try: + conn = sqlite3.connect(str(DB_PATH), timeout=5) + cursor = conn.cursor() + + # Try to find the error + if error_id: + cursor.execute(''' + SELECT first_seen, last_seen, occurrences, category + FROM errors WHERE error_key = ? OR error_id = ? + ORDER BY last_seen DESC LIMIT 1 + ''', (error_id, error_id)) + elif error_key: + cursor.execute(''' + SELECT first_seen, last_seen, occurrences, category + FROM errors WHERE error_key = ? 
+ ORDER BY last_seen DESC LIMIT 1 + ''', (error_key,)) + elif category: + cursor.execute(''' + SELECT first_seen, last_seen, occurrences, category + FROM errors WHERE category = ? AND resolved_at IS NULL + ORDER BY last_seen DESC LIMIT 1 + ''', (category,)) + else: + conn.close() + return None + + row = cursor.fetchone() + conn.close() + + if not row: + return None + + first_seen, last_seen, occurrences, cat = row + + # Calculate age + try: + first_dt = datetime.fromisoformat(first_seen) if first_seen else None + last_dt = datetime.fromisoformat(last_seen) if last_seen else None + now = datetime.now() + + result = { + 'occurrences': occurrences or 1, + 'category': cat + } + + if first_dt: + age = now - first_dt + if age.total_seconds() < 3600: + result['first_seen_ago'] = f"{int(age.total_seconds() / 60)} minutes ago" + elif age.total_seconds() < 86400: + result['first_seen_ago'] = f"{int(age.total_seconds() / 3600)} hours ago" + else: + result['first_seen_ago'] = f"{age.days} days ago" + + if last_dt and first_dt and occurrences and occurrences > 1: + # Calculate average interval + span = (last_dt - first_dt).total_seconds() + if span > 0 and occurrences > 1: + avg_interval = span / (occurrences - 1) + if avg_interval < 60: + result['pattern'] = f"recurring every ~{int(avg_interval)} seconds" + elif avg_interval < 3600: + result['pattern'] = f"recurring every ~{int(avg_interval / 60)} minutes" + else: + result['pattern'] = f"recurring every ~{int(avg_interval / 3600)} hours" + + return result + + except (ValueError, TypeError): + return {'occurrences': occurrences or 1, 'category': cat} + + except Exception as e: + print(f"[AIContext] Error getting frequency: {e}") + return None + + +def get_smart_data(disk_device: str) -> Optional[str]: + """Get SMART health data for a disk. 
+ + Args: + disk_device: Device path like /dev/sda or just sda + + Returns: + Formatted SMART summary or None + """ + if not disk_device: + return None + + # Normalize device path + if not disk_device.startswith('/dev/'): + disk_device = f'/dev/{disk_device}' + + # Check device exists + if not os.path.exists(disk_device): + return None + + try: + # Get health status + result = subprocess.run( + ['smartctl', '-H', disk_device], + capture_output=True, text=True, timeout=10 + ) + + health_status = "UNKNOWN" + if "PASSED" in result.stdout: + health_status = "PASSED" + elif "FAILED" in result.stdout: + health_status = "FAILED" + + # Get key attributes + result = subprocess.run( + ['smartctl', '-A', disk_device], + capture_output=True, text=True, timeout=10 + ) + + attributes = {} + critical_attrs = [ + 'Reallocated_Sector_Ct', 'Current_Pending_Sector', + 'Offline_Uncorrectable', 'UDMA_CRC_Error_Count', + 'Reallocated_Event_Count', 'Reported_Uncorrect' + ] + + for line in result.stdout.split('\n'): + for attr in critical_attrs: + if attr in line: + parts = line.split() + # Typical format: ID ATTRIBUTE_NAME FLAGS VALUE WORST THRESH TYPE UPDATED RAW_VALUE + if len(parts) >= 10: + raw_value = parts[-1] + attributes[attr] = raw_value + + # Build summary + lines = [f"SMART Health: {health_status}"] + + # Add critical attributes if non-zero + for attr, value in attributes.items(): + try: + if int(value) > 0: + lines.append(f" {attr}: {value}") + except ValueError: + pass + + return "\n".join(lines) if len(lines) > 1 or health_status == "FAILED" else f"SMART Health: {health_status}" + + except subprocess.TimeoutExpired: + return None + except FileNotFoundError: + # smartctl not installed + return None + except Exception: + return None + + +def extract_disk_device(text: str) -> Optional[str]: + """Extract disk device name from error text. 
+ + Args: + text: Error message or log content + + Returns: + Device name like 'sda' or None + """ + if not text: + return None + + # Common patterns for disk devices in errors + patterns = [ + r'/dev/(sd[a-z]\d*)', + r'/dev/(nvme\d+n\d+(?:p\d+)?)', + r'/dev/(hd[a-z]\d*)', + r'/dev/(vd[a-z]\d*)', + r'\b(sd[a-z])\b', + r'disk[_\s]+(sd[a-z])', + r'ata\d+\.\d+: (sd[a-z])', + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) + + return None + + +def enrich_context_for_ai( + title: str, + body: str, + event_type: str, + data: Dict[str, Any], + journal_context: str = '', + detail_level: str = 'standard' +) -> str: + """Build enriched context string for AI processing. + + Combines: + - Original journal context + - Event frequency information + - System uptime + - SMART data (for disk errors) + - Known error matching + + Args: + title: Notification title + body: Notification body + event_type: Type of event + data: Event data dict + journal_context: Original journal log context + detail_level: Level of detail (minimal, standard, detailed) + + Returns: + Enriched context string + """ + context_parts = [] + combined_text = f"{title} {body} {journal_context}" + + # 1. System uptime (always useful) + uptime = get_system_uptime() + if uptime and uptime != "unknown": + context_parts.append(f"System uptime: {uptime}") + + # 2. Event frequency + error_key = data.get('error_key') or data.get('error_id') + category = data.get('category') + + freq = get_event_frequency(error_id=error_key, category=category) + if freq: + freq_line = f"Event frequency: {freq.get('occurrences', 1)} occurrence(s)" + if freq.get('first_seen_ago'): + freq_line += f", first seen {freq['first_seen_ago']}" + if freq.get('pattern'): + freq_line += f", {freq['pattern']}" + context_parts.append(freq_line) + + # 3. 
SMART data for disk-related events + disk_related = any(x in event_type.lower() for x in ['disk', 'smart', 'storage', 'io_error']) + if not disk_related: + disk_related = any(x in combined_text.lower() for x in ['disk', 'smart', '/dev/sd', 'ata', 'i/o error']) + + if disk_related: + disk_device = extract_disk_device(combined_text) + if disk_device: + smart_data = get_smart_data(disk_device) + if smart_data: + context_parts.append(smart_data) + + # 4. Known error matching + known_error_ctx = get_error_context(combined_text, category=category, detail_level=detail_level) + if known_error_ctx: + context_parts.append(known_error_ctx) + + # 5. Add original journal context + if journal_context: + context_parts.append(f"Journal logs:\n{journal_context}") + + # Combine all parts + if context_parts: + return "\n\n".join(context_parts) + + return journal_context or "" + + +def get_enriched_context( + event: 'NotificationEvent', + detail_level: str = 'standard' +) -> str: + """Convenience function to enrich context from a NotificationEvent. 
+ + Args: + event: NotificationEvent object + detail_level: Level of detail + + Returns: + Enriched context string + """ + journal_context = event.data.get('_journal_context', '') + + return enrich_context_for_ai( + title=event.data.get('title', ''), + body=event.data.get('body', event.data.get('message', '')), + event_type=event.event_type, + data=event.data, + journal_context=journal_context, + detail_level=detail_level + ) diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh index 93ab7362..e09e2479 100644 --- a/AppImage/scripts/build_appimage.sh +++ b/AppImage/scripts/build_appimage.sh @@ -95,6 +95,8 @@ cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found" cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found" cp "$SCRIPT_DIR/notification_events.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_events.py not found" +cp "$SCRIPT_DIR/proxmox_known_errors.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_known_errors.py not found" +cp "$SCRIPT_DIR/ai_context_enrichment.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ ai_context_enrichment.py not found" cp "$SCRIPT_DIR/startup_grace.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ startup_grace.py not found" cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_notification_routes.py not found" cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found" diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 1c507301..612c5b3f 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -862,6 +862,307 @@ class HealthPersistence: conn.commit() conn.close() + + # Clean up errors for resources 
that no longer exist (VMs/CTs deleted, disks removed) + self._cleanup_stale_resources() + + def _cleanup_stale_resources(self): + """Resolve errors for resources that no longer exist. + + Comprehensive cleanup for ALL error categories: + - VMs/CTs: deleted resources (not just stopped) + - Disks: physically removed devices, ZFS pools, storage + - Network: removed interfaces, bonds, bridges + - Services/pve_services: services on deleted CTs, stopped services + - Logs: persistent/spike/cascade errors older than 48h + - Cluster: errors when node is no longer in cluster + - Temperature: sensors that no longer exist + - Memory/Storage: mount points that no longer exist + - Updates/Security: acknowledged errors older than 7 days + - General fallback: any error older than 7 days with no recent activity + """ + import subprocess + import re + + conn = self._get_conn() + cursor = conn.cursor() + now = datetime.now() + now_iso = now.isoformat() + + # Get all active (unresolved) errors with first_seen and last_seen for age checks + cursor.execute(''' + SELECT id, error_key, category, message, first_seen, last_seen, severity FROM errors + WHERE resolved_at IS NULL + ''') + active_errors = cursor.fetchall() + + resolved_count = 0 + + # Cache for expensive checks (avoid repeated subprocess calls) + _vm_ct_exists_cache = {} + _cluster_status_cache = None + _network_interfaces_cache = None + _zfs_pools_cache = None + _mount_points_cache = None + _pve_services_cache = None + + def check_vm_ct_cached(vmid): + if vmid not in _vm_ct_exists_cache: + _vm_ct_exists_cache[vmid] = self._check_vm_ct_exists(vmid) + return _vm_ct_exists_cache[vmid] + + def get_cluster_status(): + nonlocal _cluster_status_cache + if _cluster_status_cache is None: + try: + result = subprocess.run( + ['pvecm', 'status'], + capture_output=True, text=True, timeout=5 + ) + _cluster_status_cache = { + 'is_cluster': result.returncode == 0 and 'Cluster information' in result.stdout, + 'nodes': result.stdout if 
result.returncode == 0 else '' + } + except Exception: + _cluster_status_cache = {'is_cluster': True, 'nodes': ''} # Assume cluster on error + return _cluster_status_cache + + def get_network_interfaces(): + nonlocal _network_interfaces_cache + if _network_interfaces_cache is None: + try: + import psutil + _network_interfaces_cache = set(psutil.net_if_stats().keys()) + except Exception: + _network_interfaces_cache = set() + return _network_interfaces_cache + + def get_zfs_pools(): + nonlocal _zfs_pools_cache + if _zfs_pools_cache is None: + try: + result = subprocess.run( + ['zpool', 'list', '-H', '-o', 'name'], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + _zfs_pools_cache = set(result.stdout.strip().split('\n')) + else: + _zfs_pools_cache = set() + except Exception: + _zfs_pools_cache = set() + return _zfs_pools_cache + + def get_mount_points(): + nonlocal _mount_points_cache + if _mount_points_cache is None: + try: + import psutil + _mount_points_cache = set(p.mountpoint for p in psutil.disk_partitions(all=True)) + except Exception: + _mount_points_cache = set() + return _mount_points_cache + + def get_pve_services_status(): + nonlocal _pve_services_cache + if _pve_services_cache is None: + _pve_services_cache = {} + try: + result = subprocess.run( + ['systemctl', 'list-units', '--type=service', '--all', '--no-legend'], + capture_output=True, text=True, timeout=10 + ) + if result.returncode == 0: + for line in result.stdout.strip().split('\n'): + parts = line.split() + if parts: + service_name = parts[0].replace('.service', '') + _pve_services_cache[service_name] = 'active' in line + except Exception: + pass + return _pve_services_cache + + def extract_vmid_from_text(text): + """Extract VM/CT ID from error message or key.""" + if not text: + return None + # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc. 
+ match = re.search(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', text, re.IGNORECASE) + return match.group(1) if match else None + + def get_age_hours(timestamp_str): + """Get age in hours from ISO timestamp string.""" + if not timestamp_str: + return 0 + try: + dt = datetime.fromisoformat(timestamp_str) + return (now - dt).total_seconds() / 3600 + except (ValueError, TypeError): + return 0 + + for error_row in active_errors: + err_id, error_key, category, message, first_seen, last_seen, severity = error_row + should_resolve = False + resolution_reason = None + age_hours = get_age_hours(first_seen) + last_seen_hours = get_age_hours(last_seen) + + # === VM/CT ERRORS === + # Check if VM/CT still exists (covers: vms category, vm_*, ct_* error keys) + if category == 'vms' or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_'))): + vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(message) + if vmid and not check_vm_ct_cached(vmid): + should_resolve = True + resolution_reason = 'VM/CT deleted' + + # === DISK ERRORS === + # Check if disk device or ZFS pool still exists + elif category == 'disks' or category == 'storage': + if error_key: + # Check for ZFS pool errors (e.g., "zfs_pool_rpool_degraded") + zfs_match = re.search(r'zfs_(?:pool_)?([a-zA-Z0-9_-]+)', error_key) + if zfs_match: + pool_name = zfs_match.group(1) + pools = get_zfs_pools() + if pools and pool_name not in pools: + should_resolve = True + resolution_reason = 'ZFS pool removed' + + # Check for disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing") + if not should_resolve: + disk_match = re.search(r'(?:disk_|smart_|io_error_)([a-z]{2,4}\d*)', error_key) + if disk_match: + disk_name = disk_match.group(1) + disk_path = f'/dev/{disk_name}' + if not os.path.exists(disk_path): + should_resolve = True + resolution_reason = 'Disk device removed' + + # Check for mount point errors (e.g., "disk_fs_/mnt/data") + if not should_resolve and 'disk_fs_' in error_key: 
+ mount = error_key.replace('disk_fs_', '').split('_')[0] + if mount.startswith('/'): + mounts = get_mount_points() + if mounts and mount not in mounts: + should_resolve = True + resolution_reason = 'Mount point removed' + + # === NETWORK ERRORS === + # Check if network interface still exists + elif category == 'network': + if error_key: + # Extract interface name (e.g., "net_vmbr1_down" -> "vmbr1", "bond0_slave_error" -> "bond0") + iface_match = re.search(r'(?:net_|bond_|vmbr|eth|eno|ens|enp)([a-zA-Z0-9_]+)?', error_key) + if iface_match: + # Reconstruct full interface name + full_match = re.search(r'((?:vmbr|bond|eth|eno|ens|enp)[a-zA-Z0-9]+)', error_key) + if full_match: + iface = full_match.group(1) + interfaces = get_network_interfaces() + if interfaces and iface not in interfaces: + should_resolve = True + resolution_reason = 'Network interface removed' + + # === SERVICE ERRORS === + # Check if service exists or if it references a deleted CT + elif category in ('services', 'pve_services'): + # First check if it references a CT that no longer exists + vmid = extract_vmid_from_text(message) or extract_vmid_from_text(error_key) + if vmid and not check_vm_ct_cached(vmid): + should_resolve = True + resolution_reason = 'Container deleted' + + # For pve_services, check if the service unit exists + if not should_resolve and category == 'pve_services' and error_key: + service_match = re.search(r'service_([a-zA-Z0-9_-]+)', error_key) + if service_match: + service_name = service_match.group(1) + services = get_pve_services_status() + if services and service_name not in services: + should_resolve = True + resolution_reason = 'Service no longer exists' + + # === LOG ERRORS === + # Auto-resolve log errors after 48h (they represent point-in-time issues) + elif category == 'logs' or (error_key and error_key.startswith(('log_persistent_', 'log_spike_', 'log_cascade_', 'log_critical_'))): + if age_hours > 48: + should_resolve = True + resolution_reason = 'Log error aged out 
(>48h)' + + # === CLUSTER ERRORS === + # Resolve cluster/corosync/qdevice errors if node is no longer in a cluster + elif error_key and any(x in error_key.lower() for x in ('cluster', 'corosync', 'qdevice', 'quorum')): + cluster_info = get_cluster_status() + if not cluster_info['is_cluster']: + should_resolve = True + resolution_reason = 'No longer in cluster' + + # === TEMPERATURE ERRORS === + # Temperature errors - check if sensor still exists (unlikely to change, resolve after 24h of no activity) + elif category == 'temperature': + if last_seen_hours > 24: + should_resolve = True + resolution_reason = 'Temperature error stale (>24h no activity)' + + # === UPDATES/SECURITY ERRORS === + # These are informational - auto-resolve after 7 days if acknowledged or stale + elif category in ('updates', 'security'): + if age_hours > 168: # 7 days + should_resolve = True + resolution_reason = 'Update/security notice aged out (>7d)' + + # === FALLBACK: ANY STALE ERROR === + # Any error that hasn't been seen in 7 days and is older than 7 days + if not should_resolve and age_hours > 168 and last_seen_hours > 168: + should_resolve = True + resolution_reason = 'Stale error (no activity >7d)' + + if should_resolve: + cursor.execute(''' + UPDATE errors SET resolved_at = ?, resolution_type = 'auto' + WHERE id = ? + ''', (now_iso, err_id)) + resolved_count += 1 + + if resolved_count > 0: + conn.commit() + print(f"[HealthPersistence] Auto-resolved {resolved_count} errors for stale/deleted resources") + + conn.close() + + def _check_vm_ct_exists(self, vmid: str) -> bool: + """Check if a VM or CT exists (not just running, but exists at all). + + Uses 'qm config' and 'pct config' which return success even for stopped VMs/CTs, + but fail if the VM/CT doesn't exist. 
+ """ + import subprocess + + try: + # Try VM first + result = subprocess.run( + ['qm', 'config', vmid], + capture_output=True, + text=True, + timeout=3 + ) + if result.returncode == 0: + return True + + # Try CT + result = subprocess.run( + ['pct', 'config', vmid], + capture_output=True, + text=True, + timeout=3 + ) + if result.returncode == 0: + return True + + return False + except Exception: + # On error, assume it exists to avoid false positives + return True def check_vm_running(self, vm_id: str) -> bool: """ diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index fb84f5a6..3f59628c 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -28,7 +28,7 @@ from pathlib import Path # ─── Shared State for Cross-Watcher Coordination ────────────────── -# ─── Startup Grace Period ─────────────────���────────────────────────────────── +# ─── Startup Grace Period ──────────────────────────────────────────────────── # Import centralized startup grace management # This provides a single source of truth for all grace period logic import startup_grace @@ -2610,7 +2610,7 @@ class PollingCollector: pass -# ─── Proxmox Webhook Receiver ─────────────────────────────────── +# ─── Proxmox Webhook Receiver ─────────────���───────────────────── class ProxmoxHookWatcher: """Receives native Proxmox VE notifications via local webhook endpoint. 
diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py index 050cdf70..ba25377a 100644 --- a/AppImage/scripts/notification_manager.py +++ b/AppImage/scripts/notification_manager.py @@ -44,6 +44,13 @@ from notification_events import ( ProxmoxHookWatcher, ) +# AI context enrichment (uptime, frequency, SMART data, known errors) +try: + from ai_context_enrichment import enrich_context_for_ai +except ImportError: + def enrich_context_for_ai(title, body, event_type, data, journal_context='', detail_level='standard'): + return journal_context + # ─── Constants ──────────────────────────────────────────────────── @@ -743,10 +750,10 @@ class NotificationManager: 'ai_custom_prompt': self._config.get('ai_custom_prompt', ''), } - # Get journal context if available - journal_context = data.get('_journal_context', '') - - for ch_name, channel in channels.items(): + # Get journal context if available (will be enriched per-channel based on detail_level) + raw_journal_context = data.get('_journal_context', '') + + for ch_name, channel in channels.items(): # ── Per-channel category check ── # Default: category enabled (true) unless explicitly disabled. 
ch_group_key = f'{ch_name}.events.{event_group}' @@ -771,17 +778,28 @@ class NotificationManager: rich_key = f'{ch_name}.rich_format' use_rich_format = self._config.get(rich_key, 'false') == 'true' - # ── Per-channel AI enhancement ── - # Apply AI with channel-specific detail level and emoji setting - # If AI is enabled AND rich_format is on, AI will include emojis directly - # Pass channel_type so AI knows whether to append original (email only) - channel_ai_config = {**ai_config, 'channel_type': ch_name} - ai_result = format_with_ai_full( - ch_title, ch_body, severity, channel_ai_config, - detail_level=detail_level, - journal_context=journal_context, - use_emojis=use_rich_format - ) + # ── Per-channel AI enhancement ── + # Apply AI with channel-specific detail level and emoji setting + # If AI is enabled AND rich_format is on, AI will include emojis directly + # Pass channel_type so AI knows whether to append original (email only) + channel_ai_config = {**ai_config, 'channel_type': ch_name} + + # Enrich context with uptime, frequency, SMART data, and known errors + enriched_context = enrich_context_for_ai( + title=ch_title, + body=ch_body, + event_type=event_type, + data=data, + journal_context=raw_journal_context, + detail_level=detail_level + ) + + ai_result = format_with_ai_full( + ch_title, ch_body, severity, channel_ai_config, + detail_level=detail_level, + journal_context=enriched_context, + use_emojis=use_rich_format + ) ch_title = ai_result.get('title', ch_title) ch_body = ai_result.get('body', ch_body) diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 6418a21c..f8c9393e 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -1384,7 +1384,13 @@ AI_DETAIL_TOKENS = { # System prompt template - optimized hybrid version AI_SYSTEM_PROMPT = """You are a notification FORMATTER for ProxMenux Monitor (Proxmox VE). 
-Your job: translate and reformat alerts into {language}. You are NOT an analyst — do not interpret or diagnose. +Your job: translate alerts into {language} and enrich them with context when provided. + +═══ ABSOLUTE CONSTRAINTS (NO EXCEPTIONS) ═══ +- NO HALLUCINATIONS: Do not invent causes, solutions, or facts not present in the provided data +- NO SPECULATION: If something is unclear, state what IS known, not what MIGHT be +- NO CONVERSATIONAL TEXT: Never write "Here is...", "I've translated...", "Let me explain..." +- ONLY use information from: the message, journal context, and known error database (if provided) ═══ WHAT TO TRANSLATE ═══ Translate: labels, descriptions, status words, units (GB→Go in French, etc.) @@ -1394,15 +1400,37 @@ DO NOT translate: hostnames, IPs, paths, VM/CT IDs, device names (/dev/sdX), tec 1. Plain text only — NO markdown, no **bold**, no `code`, no bullet lists (use "• " for packages only) 2. Preserve severity: "failed" stays "failed", "warning" stays "warning" — never soften errors 3. Preserve structure: keep same fields and line order, only translate content -4. Detail level "{detail_level}": brief (2-3 lines) | standard (short paragraph) | detailed (full report) +4. Detail level "{detail_level}": + - brief: 1-2 lines, essential facts only + - standard: short paragraph, key details and context + - detailed: full report with all available information, step-by-step if applicable 5. DEDUPLICATION: merge duplicate facts from multiple sources into one clear statement 6. EMPTY LISTS: write translated "none" after label, never leave blank 7. Keep "hostname:" prefix in title — translate only the descriptive part -8. DO NOT add recommendations or suggestions ("you should...", "try...", "consider...") -{suggestions_addon}9. Present facts from message AND journal context — describe what happened, do NOT speculate -10. OUTPUT ONLY the final result — no "Original:", no before/after comparisons -11. 
Unknown input: preserve as closely as possible, translate what you can +8. DO NOT add recommendations or suggestions UNLESS AI Suggestions mode is enabled below +9. ENRICHED CONTEXT: You may receive additional context data including: + - "System uptime: X days (stable system)" → helps distinguish startup issues from runtime failures + - "Event frequency: N occurrences, first seen X ago" → indicates recurring vs one-time issues + - "SMART Health: PASSED/FAILED" with disk attributes → critical for disk errors + - "KNOWN PROXMOX ERROR DETECTED" with cause/solution → YOU MUST USE this exact information + + How to use enriched context: + - If uptime is <10min and error is service-related → mention "occurred shortly after boot" + - If frequency shows recurring pattern → mention "recurring issue (N times in X hours)" + - If SMART shows FAILED → treat as CRITICAL: "Disk failing - immediate attention required" + - If KNOWN ERROR is provided → YOU MUST incorporate its Cause and Solution (translate, don't copy verbatim) +10. JOURNAL CONTEXT EXTRACTION: When journal logs are provided: + - Extract specific IDs (VM/CT numbers, disk devices, service names) + - Include relevant timestamps if they help explain the timeline + - Identify root cause when logs clearly show it (e.g., "exit-code 255" -> "process crashed") + - Translate technical terms: "Emask 0x10" -> "ATA bus error", "DRDY ERR" -> "drive not ready" + - If logs show the same error repeating, state frequency: "occurred 15 times in 10 minutes" + - IGNORE journal entries unrelated to the main event +11. OUTPUT ONLY the final result — no "Original:", no before/after comparisons +12. Unknown input: preserve as closely as possible, translate what you can +13. REDUNDANCY: Never repeat the same information twice. 
If title says "CT 103 failed", body should not start with "Container 103 failed" +{suggestions_addon} ═══ PROXMOX MAPPINGS (use directly, never explain) ═══ pve-container@XXXX → "CT XXXX" | qemu-server@XXXX → "VM XXXX" | vzdump → "backup" pveproxy/pvedaemon/pvestatd → "Proxmox service" | corosync → "cluster service" @@ -1457,18 +1485,17 @@ CORRECT (markers are separators only): # Addon for experimental suggestions mode AI_SUGGESTIONS_ADDON = """ - EXCEPTION TO RULE 8 (Suggestions enabled): When journal context shows a clear, actionable problem, - you MAY add ONE brief suggestion at the END of the body (after all facts), using this format: - - 💡 Tip: [your suggestion here] - - Guidelines for suggestions: - - Only suggest when the problem AND solution are clear from the logs - - Keep it to ONE line, max 100 characters - - Be specific: "Check disk /dev/sdb SMART status" not "Check your disks" - - Use commands when helpful: "Run 'systemctl restart pvedaemon'" - - Never speculate - only suggest based on evidence in the logs - - Skip the tip entirely if the problem is unclear or already resolved +═══ AI SUGGESTIONS MODE (ENABLED) ═══ +You MAY add ONE brief, actionable tip at the END of the body using this exact format: + +💡 Tip: [your concise suggestion here] + +Rules for the tip: +- ONLY include if the log context or Known Error database clearly points to a specific fix +- Keep under 100 characters +- Be specific: "Run 'pvecm status' to check quorum" NOT "Check cluster status" +- If Known Error provides a solution, YOU MUST USE IT (don't invent your own) +- Never guess — skip the tip if the cause/solution is unclear """ # Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover) diff --git a/AppImage/scripts/proxmox_known_errors.py b/AppImage/scripts/proxmox_known_errors.py new file mode 100644 index 00000000..0190c298 --- /dev/null +++ b/AppImage/scripts/proxmox_known_errors.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 +""" 
"""
Database of known Proxmox/Linux errors with causes, solutions, and severity levels.

This provides the AI with accurate, pre-verified information about common errors,
reducing hallucinations and ensuring consistent, helpful responses.

Each entry includes:
- pattern: regex pattern to match against error messages/logs
- cause: brief explanation of what causes this error
- cause_detailed: more comprehensive explanation for detailed mode
- severity: info, warning, critical
- solution: brief actionable solution
- solution_detailed: step-by-step solution for detailed mode
- url: optional documentation link
- category: error category used for filtered lookups

NOTE: Placeholders in commands are written as <angle-bracket> tokens
(e.g. <pool>, <service>, <vmid>) and are meant to be substituted by the
reader/AI, never executed verbatim.
"""

import re
from typing import Optional, Dict, Any, List

# Known error patterns with causes and solutions.
# Order matters: find_matching_error() returns the FIRST entry whose
# pattern matches, so more specific patterns should come before broad ones.
PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
    # ==================== SUBSCRIPTION/LICENSE ====================
    {
        "pattern": r"no valid subscription|subscription.*invalid|not subscribed",
        "cause": "Proxmox enterprise repository requires paid subscription",
        "cause_detailed": "Proxmox VE uses a subscription model for enterprise features. Without a valid subscription key, access to the enterprise repository is denied. This is normal for home/lab users.",
        "severity": "info",
        "solution": "Use no-subscription repository or purchase subscription",
        "solution_detailed": "For home/lab use: Switch to the no-subscription repository by editing /etc/apt/sources.list.d/pve-enterprise.list. For production: Purchase a subscription at proxmox.com/pricing",
        "url": "https://pve.proxmox.com/wiki/Package_Repositories",
        "category": "updates"
    },

    # ==================== CLUSTER/COROSYNC ====================
    {
        "pattern": r"quorum.*lost|lost.*quorum|not.*quorate",
        "cause": "Cluster lost majority of voting nodes",
        "cause_detailed": "Corosync cluster requires more than 50% of configured votes to maintain quorum. When quorum is lost, the cluster becomes read-only to prevent split-brain scenarios.",
        "severity": "critical",
        "solution": "Check network connectivity between nodes; ensure majority of nodes are online",
        "solution_detailed": "1. Verify network connectivity: ping all cluster nodes\n2. Check corosync status: systemctl status corosync\n3. View cluster status: pvecm status\n4. If nodes are unreachable, check firewall rules (ports 5405-5412 UDP)\n5. For emergency single-node operation: pvecm expected 1",
        "url": "https://pve.proxmox.com/wiki/Cluster_Manager",
        "category": "cluster"
    },
    {
        "pattern": r"corosync.*qdevice.*error|qdevice.*connection.*failed|qdevice.*not.*connected",
        "cause": "QDevice helper node is unreachable",
        "cause_detailed": "The Corosync QDevice provides an additional vote for 2-node clusters. When it cannot connect, the cluster may lose quorum if one node fails.",
        "severity": "warning",
        "solution": "Check QDevice server connectivity and corosync-qnetd service",
        # <qdevice-ip> restored: the original command was missing its host argument.
        "solution_detailed": "1. Verify QDevice server is running: systemctl status corosync-qnetd (on QDevice host)\n2. Check connectivity: nc -zv <qdevice-ip> 5403\n3. Restart qdevice: systemctl restart corosync-qdevice\n4. Check certificates: corosync-qdevice-net-certutil -s",
        "url": "https://pve.proxmox.com/wiki/Cluster_Manager#_corosync_external_vote_support",
        "category": "cluster"
    },
    {
        "pattern": r"corosync.*retransmit|corosync.*token.*timeout|ring.*mark.*faulty",
        "cause": "Network latency or packet loss between cluster nodes",
        "cause_detailed": "Corosync uses multicast/unicast for cluster communication. High latency, packet loss, or network congestion causes token timeouts and retransmissions, potentially leading to node eviction.",
        "severity": "warning",
        "solution": "Check network quality between nodes; consider increasing token timeout",
        "solution_detailed": "1. Test network latency: ping -c 100 <node-ip>\n2. Check for packet loss between nodes\n3. Verify MTU settings match on all interfaces\n4. Increase token timeout in /etc/pve/corosync.conf if needed (default 1000ms)\n5. Check switch/router for congestion",
        "category": "cluster"
    },

    # ==================== DISK/STORAGE ====================
    {
        "pattern": r"SMART.*FAILED|smart.*failed.*health|Pre-fail|Old_age.*FAILING",
        "cause": "Disk SMART health check failed - disk is failing",
        "cause_detailed": "SMART (Self-Monitoring, Analysis and Reporting Technology) detected critical disk health issues. The disk is likely failing and data loss is imminent.",
        "severity": "critical",
        "solution": "IMMEDIATELY backup data and replace disk",
        "solution_detailed": "1. URGENT: Backup all data from this disk immediately\n2. Check SMART details: smartctl -a /dev/sdX\n3. Note the failing attributes (Reallocated_Sector_Ct, Current_Pending_Sector, etc.)\n4. Plan disk replacement\n5. If in RAID/ZFS: initiate disk replacement procedure",
        "category": "disks"
    },
    {
        "pattern": r"Reallocated_Sector_Ct.*threshold|reallocated.*sectors?.*exceeded",
        "cause": "Disk has excessive bad sectors being remapped",
        "cause_detailed": "The disk firmware has remapped multiple bad sectors to spare areas. While the disk is still functioning, this indicates physical degradation and eventual failure.",
        "severity": "warning",
        "solution": "Monitor closely and plan disk replacement",
        "solution_detailed": "1. Check current value: smartctl -A /dev/sdX | grep Reallocated\n2. If value is increasing, plan immediate replacement\n3. Backup important data\n4. Run extended SMART test: smartctl -t long /dev/sdX",
        "category": "disks"
    },
    {
        "pattern": r"ata.*error|ATA.*bus.*error|Emask.*0x|DRDY.*ERR|UNC.*error",
        "cause": "ATA communication error with disk",
        "cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
        "severity": "warning",
        "solution": "Check SATA cables and connections; verify disk health with smartctl",
        "solution_detailed": "1. Check SMART health: smartctl -H /dev/sdX\n2. Inspect and reseat SATA cables\n3. Try different SATA port\n4. Check dmesg for pattern of errors\n5. If errors persist, disk may be failing",
        "category": "disks"
    },
    {
        "pattern": r"I/O.*error|blk_update_request.*error|Buffer I/O error",
        "cause": "Disk I/O operation failed",
        "cause_detailed": "The kernel failed to read or write data to the disk. This can be caused by disk failure, cable issues, or filesystem corruption.",
        "severity": "critical",
        "solution": "Check disk health and connections immediately",
        "solution_detailed": "1. Check SMART status: smartctl -H /dev/sdX\n2. Check dmesg for related errors: dmesg | grep -i error\n3. Verify disk is still accessible: lsblk\n4. If ZFS: check pool status with zpool status\n5. Consider filesystem check if safe to unmount",
        "category": "disks"
    },
    {
        "pattern": r"zfs.*pool.*DEGRADED|pool.*is.*degraded",
        "cause": "ZFS pool has reduced redundancy",
        "cause_detailed": "One or more devices in the ZFS pool are unavailable or experiencing errors. The pool is still functional but without full redundancy.",
        "severity": "warning",
        "solution": "Identify failed device with 'zpool status' and replace",
        # <pool>/<device> placeholders restored (stripped by extraction).
        "solution_detailed": "1. Check pool status: zpool status <pool>\n2. Identify the DEGRADED or UNAVAIL device\n3. If device is present but erroring: zpool scrub <pool>\n4. To replace: zpool replace <pool> <old-device> <new-device>\n5. Monitor resilver progress: zpool status",
        "category": "storage"
    },
    {
        "pattern": r"zfs.*pool.*FAULTED|pool.*is.*faulted",
        "cause": "ZFS pool is inaccessible",
        "cause_detailed": "The ZFS pool has lost too many devices and cannot maintain data integrity. Data may be inaccessible.",
        "severity": "critical",
        "solution": "Check failed devices; may need data recovery",
        "solution_detailed": "1. Check status: zpool status <pool>\n2. Identify all failed devices\n3. Attempt to online devices: zpool online <pool> <device>\n4. If drives are physically present, try zpool clear <pool>\n5. May require data recovery if multiple drives failed",
        "category": "storage"
    },

    # ==================== CEPH ====================
    {
        "pattern": r"ceph.*OSD.*down|osd\.\d+.*down|ceph.*osd.*failed",
        "cause": "Ceph OSD daemon is not running",
        "cause_detailed": "A Ceph Object Storage Daemon (OSD) has stopped or crashed. This reduces storage redundancy and may trigger data rebalancing.",
        "severity": "warning",
        "solution": "Check disk health and restart OSD service",
        "solution_detailed": "1. Check OSD status: ceph osd tree\n2. View OSD logs: journalctl -u ceph-osd@<id>\n3. Check underlying disk: smartctl -H /dev/sdX\n4. Restart OSD: systemctl start ceph-osd@<id>\n5. If OSD keeps crashing, check for disk failure",
        "category": "storage"
    },
    {
        "pattern": r"ceph.*health.*WARN|HEALTH_WARN",
        "cause": "Ceph cluster has warnings",
        "cause_detailed": "Ceph detected issues that don't prevent operation but should be addressed. Common causes: degraded PGs, clock skew, full OSDs.",
        "severity": "warning",
        "solution": "Run 'ceph health detail' for specific issues",
        "solution_detailed": "1. Get details: ceph health detail\n2. Common fixes:\n   - Degraded PGs: wait for recovery or add capacity\n   - Clock skew: sync NTP on all nodes\n   - Full OSDs: add storage or delete data\n3. Check: ceph status",
        "category": "storage"
    },
    {
        "pattern": r"ceph.*health.*ERR|HEALTH_ERR",
        "cause": "Ceph cluster has critical errors",
        "cause_detailed": "Ceph has detected critical issues that may affect data availability or integrity. Immediate attention required.",
        "severity": "critical",
        "solution": "Run 'ceph health detail' and address errors immediately",
        "solution_detailed": "1. Get details: ceph health detail\n2. Check OSD status: ceph osd tree\n3. Check MON status: ceph mon stat\n4. View PG status: ceph pg stat\n5. Address each error shown in health detail",
        "category": "storage"
    },

    # ==================== VM/CT ERRORS ====================
    {
        "pattern": r"TASK ERROR.*failed to get exclusive lock|lock.*timeout|couldn't acquire lock",
        "cause": "Resource is locked by another operation",
        "cause_detailed": "Another task is currently holding a lock on this VM/CT. This prevents concurrent modifications that could cause corruption.",
        "severity": "info",
        "solution": "Wait for other task to complete or check for stuck tasks",
        "solution_detailed": "1. Check running tasks: cat /var/log/pve/tasks/active\n2. Wait for task completion\n3. If task is stuck (>1h), check process: ps aux | grep <vmid>\n4. As last resort, remove lock file: rm /var/lock/qemu-server/lock-<vmid>.conf",
        "category": "vms"
    },
    {
        "pattern": r"kvm.*not.*available|kvm.*disabled|hardware.*virtualization.*disabled",
        "cause": "KVM/hardware virtualization not available",
        "cause_detailed": "The CPU's hardware virtualization extensions (Intel VT-x or AMD-V) are either not supported, not enabled in BIOS, or blocked by another hypervisor.",
        "severity": "warning",
        "solution": "Enable VT-x/AMD-V in BIOS settings",
        "solution_detailed": "1. Reboot into BIOS/UEFI\n2. Find Virtualization settings (often in CPU or Advanced section)\n3. Enable Intel VT-x or AMD-V/SVM\n4. Save and reboot\n5. Verify: grep -E 'vmx|svm' /proc/cpuinfo",
        "category": "vms"
    },
    {
        "pattern": r"out of memory|OOM.*kill|cannot allocate memory|memory.*exhausted",
        "cause": "System or VM ran out of memory",
        "cause_detailed": "The Linux OOM (Out Of Memory) killer terminated a process to free memory. This indicates memory pressure from overcommitment or memory leaks.",
        "severity": "critical",
        "solution": "Increase memory allocation or reduce VM memory usage",
        "solution_detailed": "1. Check what was killed: dmesg | grep -i oom\n2. Review memory usage: free -h\n3. Check balloon driver status for VMs\n4. Consider adding swap or RAM\n5. Review VM memory allocations for overcommitment",
        "category": "memory"
    },

    # ==================== NETWORK ====================
    {
        "pattern": r"bond.*slave.*link.*down|bond.*no.*active.*slave",
        "cause": "Network bond lost a slave interface",
        "cause_detailed": "One or more physical interfaces in a network bond have lost link. Depending on bond mode, this may reduce bandwidth or affect failover.",
        "severity": "warning",
        "solution": "Check physical cable connections and switch ports",
        "solution_detailed": "1. Check bond status: cat /proc/net/bonding/bond0\n2. Identify down slave interface\n3. Check physical cable connection\n4. Check switch port status and errors\n5. Verify interface: ethtool <interface>",
        "category": "network"
    },
    {
        "pattern": r"link.*not.*ready|carrier.*lost|link.*down|NIC.*Link.*Down",
        "cause": "Network interface lost link",
        "cause_detailed": "The physical or virtual network interface has lost its connection. This could be a cable issue, switch problem, or driver issue.",
        "severity": "warning",
        "solution": "Check cable, switch port, and interface status",
        "solution_detailed": "1. Check interface: ip link show <interface>\n2. Check cable connection\n3. Check switch port LEDs\n4. Try: ip link set <interface> down && ip link set <interface> up\n5. Check driver: ethtool -i <interface>",
        "category": "network"
    },
    {
        "pattern": r"bridge.*STP.*blocked|spanning.*tree.*blocked",
        "cause": "Spanning Tree Protocol blocked a port",
        "cause_detailed": "STP detected a potential network loop and blocked a bridge port to prevent broadcast storms. This is normal behavior but may indicate network topology issues.",
        "severity": "info",
        "solution": "Review network topology; this may be expected behavior",
        "solution_detailed": "1. Check bridge status: brctl show\n2. View STP state: brctl showstp <bridge>\n3. If unexpected, review network topology for loops\n4. Consider disabling STP if network is simple: brctl stp <bridge> off",
        "category": "network"
    },

    # ==================== SERVICES ====================
    {
        "pattern": r"pvedaemon.*failed|pveproxy.*failed|pvestatd.*failed",
        "cause": "Critical Proxmox service failed",
        "cause_detailed": "One of the core Proxmox daemons has crashed or failed to start. This may affect web GUI access or API functionality.",
        "severity": "critical",
        "solution": "Restart the failed service; check logs for cause",
        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -u <service> -n 50\n3. Restart: systemctl restart <service>\n4. If persistent, check: /var/log/pveproxy/access.log",
        "category": "pve_services"
    },
    {
        "pattern": r"failed to start.*service|service.*start.*failed|service.*activation.*failed",
        "cause": "System service failed to start",
        "cause_detailed": "A systemd service unit failed during startup. This could be due to configuration errors, missing dependencies, or resource issues.",
        "severity": "warning",
        "solution": "Check service logs with journalctl -u <service>",
        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -xeu <service>\n3. Check config: systemctl cat <service>\n4. Verify dependencies: systemctl list-dependencies <service>\n5. Try restart: systemctl restart <service>",
        "category": "services"
    },

    # ==================== BACKUP ====================
    {
        "pattern": r"backup.*failed|vzdump.*error|backup.*job.*failed",
        "cause": "Backup job failed",
        "cause_detailed": "A scheduled or manual backup operation failed. Common causes: storage full, VM locked, network issues for remote storage.",
        "severity": "warning",
        "solution": "Check backup storage space and VM status",
        "solution_detailed": "1. Check backup log in Datacenter > Backup\n2. Verify storage space: df -h\n3. Check if VM is locked: qm list or pct list\n4. Verify backup storage is accessible\n5. Try manual backup to identify specific error",
        "category": "backups"
    },

    # ==================== CERTIFICATES ====================
    {
        "pattern": r"certificate.*expired|SSL.*certificate.*expired|cert.*expir",
        "cause": "SSL/TLS certificate has expired",
        "cause_detailed": "An SSL certificate used for secure communication has passed its expiration date. This may cause connection failures or security warnings.",
        "severity": "warning",
        "solution": "Renew the certificate using pvenode cert set or Let's Encrypt",
        "solution_detailed": "1. Check certificate: pvenode cert info\n2. For self-signed renewal: pvecm updatecerts\n3. For Let's Encrypt: pvenode acme cert order\n4. Restart pveproxy after renewal: systemctl restart pveproxy",
        "url": "https://pve.proxmox.com/wiki/Certificate_Management",
        "category": "security"
    },

    # ==================== HARDWARE/TEMPERATURE ====================
    {
        "pattern": r"temperature.*critical|thermal.*critical|CPU.*overheating|temp.*above.*threshold",
        "cause": "Component temperature critical",
        "cause_detailed": "A hardware component (CPU, disk, etc.) has reached a dangerous temperature. Sustained high temperatures can cause hardware damage or system shutdowns.",
        "severity": "critical",
        "solution": "Check cooling system immediately; clean dust, verify fans",
        "solution_detailed": "1. Check current temps: sensors\n2. Verify all fans are running\n3. Clean dust from heatsinks and filters\n4. Ensure adequate airflow\n5. Consider reapplying thermal paste if CPU\n6. Check ambient room temperature",
        "category": "temperature"
    },

    # ==================== AUTHENTICATION ====================
    {
        "pattern": r"authentication.*failed|login.*failed|invalid.*credentials|access.*denied",
        "cause": "Authentication failure",
        "cause_detailed": "A login attempt failed due to invalid credentials or permissions. Multiple failures may indicate a brute-force attack.",
        "severity": "info",
        "solution": "Verify credentials; check for unauthorized access attempts",
        "solution_detailed": "1. Review auth logs: journalctl -u pvedaemon | grep auth\n2. Check for multiple failures from same IP\n3. Verify user exists: pveum user list\n4. If attack suspected, consider fail2ban\n5. Reset password if needed: pveum passwd <username>",
        "category": "security"
    },
]


def find_matching_error(text: str, category: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Find a known error that matches the given text.

    Patterns are tried in list order; the first match wins, so more
    specific entries should appear earlier in PROXMOX_KNOWN_ERRORS.

    Args:
        text: Error message or log content to match against
        category: Optional category to filter by

    Returns:
        Matching error dict or None
    """
    if not text:
        return None

    for error in PROXMOX_KNOWN_ERRORS:
        # Filter by category if specified
        if category and error.get("category") != category:
            continue

        try:
            # re.IGNORECASE makes a separate lower() of the text unnecessary.
            if re.search(error["pattern"], text, re.IGNORECASE):
                return error
        except re.error:
            # A malformed pattern should never break matching for the rest.
            continue

    return None


def get_error_context(text: str, category: Optional[str] = None, detail_level: str = "standard") -> Optional[str]:
    """Get formatted context for a known error.

    Args:
        text: Error message to match
        category: Optional category filter
        detail_level: "minimal", "standard", or "detailed"

    Returns:
        Formatted context string or None if no known error matches
    """
    error = find_matching_error(text, category)
    if not error:
        return None

    if detail_level == "minimal":
        return f"Known issue: {error['cause']}"

    elif detail_level == "standard":
        lines = [
            "KNOWN PROXMOX ERROR DETECTED:",
            f"  Cause: {error['cause']}",
            f"  Severity: {error['severity'].upper()}",
            f"  Solution: {error['solution']}"
        ]
        if error.get("url"):
            lines.append(f"  Docs: {error['url']}")
        return "\n".join(lines)

    else:  # detailed — fall back to the brief fields when no *_detailed exists
        lines = [
            "KNOWN PROXMOX ERROR DETECTED:",
            f"  Cause: {error.get('cause_detailed', error['cause'])}",
            f"  Severity: {error['severity'].upper()}",
            f"  Solution: {error.get('solution_detailed', error['solution'])}"
        ]
        if error.get("url"):
            lines.append(f"  Documentation: {error['url']}")
        return "\n".join(lines)


def get_all_patterns() -> List[str]:
    """Get all error patterns for external use."""
    return [error["pattern"] for error in PROXMOX_KNOWN_ERRORS]