# ProxMenux/AppImage/scripts/proxmox_known_errors.py

#!/usr/bin/env python3
"""
Database of known Proxmox/Linux errors with causes, solutions, and severity levels.

This provides the AI with accurate, pre-verified information about common errors,
reducing hallucinations and ensuring consistent, helpful responses.

Each entry is a dict with the following keys:
- pattern: regex pattern to match against error messages/logs
- cause: brief explanation of what causes this error
- cause_detailed: more comprehensive explanation for detailed mode
- severity: info, warning, critical
- solution: brief actionable solution
- solution_detailed: step-by-step solution for detailed mode
- url: optional documentation link
- category: group the error belongs to (e.g. "disks", "cluster"); used to
  restrict lookups to a single category
"""

import re
from typing import Optional, Dict, Any, List

# Catalogue of known error patterns with their causes and solutions.
#
# find_matching_error() walks this list from top to bottom and returns the
# first entry whose "pattern" matches, so list order decides which entry wins
# when several patterns match the same text. Patterns are searched with
# re.IGNORECASE, and "." does not cross newlines, so each alternative has to
# match within a single line of the input.
#
# Every entry carries: pattern, cause, cause_detailed, severity, solution,
# solution_detailed and category. "url" is optional.
PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
    # ==================== SUBSCRIPTION/LICENSE ====================
    {
        "pattern": r"no valid subscription|subscription.*invalid|not subscribed",
        "cause": "Proxmox enterprise repository requires paid subscription",
        "cause_detailed": "Proxmox VE uses a subscription model for enterprise features. Without a valid subscription key, access to the enterprise repository is denied. This is normal for home/lab users.",
        "severity": "info",
        "solution": "Use no-subscription repository or purchase subscription",
        "solution_detailed": "For home/lab use: Switch to the no-subscription repository by editing /etc/apt/sources.list.d/pve-enterprise.list. For production: Purchase a subscription at proxmox.com/pricing",
        "url": "https://pve.proxmox.com/wiki/Package_Repositories",
        "category": "updates"
    },

    # ==================== CLUSTER/COROSYNC ====================
    {
        "pattern": r"quorum.*lost|lost.*quorum|not.*quorate",
        "cause": "Cluster lost majority of voting nodes",
        "cause_detailed": "Corosync cluster requires more than 50% of configured votes to maintain quorum. When quorum is lost, the cluster becomes read-only to prevent split-brain scenarios.",
        "severity": "critical",
        "solution": "Check network connectivity between nodes; ensure majority of nodes are online",
        "solution_detailed": "1. Verify network connectivity: ping all cluster nodes\n2. Check corosync status: systemctl status corosync\n3. View cluster status: pvecm status\n4. If nodes are unreachable, check firewall rules (ports 5405-5412 UDP)\n5. For emergency single-node operation: pvecm expected 1",
        "url": "https://pve.proxmox.com/wiki/Cluster_Manager",
        "category": "cluster"
    },
    {
        "pattern": r"corosync.*qdevice.*error|qdevice.*connection.*failed|qdevice.*not.*connected",
        "cause": "QDevice helper node is unreachable",
        "cause_detailed": "The Corosync QDevice provides an additional vote for 2-node clusters. When it cannot connect, the cluster may lose quorum if one node fails.",
        "severity": "warning",
        "solution": "Check QDevice server connectivity and corosync-qnetd service",
        "solution_detailed": "1. Verify QDevice server is running: systemctl status corosync-qnetd (on QDevice host)\n2. Check connectivity: nc -zv <qdevice-ip> 5403\n3. Restart qdevice: systemctl restart corosync-qdevice\n4. Check certificates: corosync-qdevice-net-certutil -s",
        "url": "https://pve.proxmox.com/wiki/Cluster_Manager#_corosync_external_vote_support",
        "category": "cluster"
    },
    {
        "pattern": r"corosync.*retransmit|corosync.*token.*timeout|ring.*mark.*faulty",
        "cause": "Network latency or packet loss between cluster nodes",
        "cause_detailed": "Corosync uses multicast/unicast for cluster communication. High latency, packet loss, or network congestion causes token timeouts and retransmissions, potentially leading to node eviction.",
        "severity": "warning",
        "solution": "Check network quality between nodes; consider increasing token timeout",
        "solution_detailed": "1. Test network latency: ping -c 100 <other-node>\n2. Check for packet loss between nodes\n3. Verify MTU settings match on all interfaces\n4. Increase token timeout in /etc/pve/corosync.conf if needed (default 1000ms)\n5. Check switch/router for congestion",
        "category": "cluster"
    },

    # ==================== DISK/STORAGE ====================
    {
        # "Pre-fail" and "Old_age" are the values of the TYPE column in the
        # `smartctl -A` attribute table and appear on every row, including
        # rows of healthy disks. Both alternatives therefore also require
        # "FAILING" (WHEN_FAILED = FAILING_NOW) later on the same line; a bare
        # "Pre-fail" would flag any attribute table as a failing disk.
        "pattern": r"SMART.*FAILED|smart.*failed.*health|Pre-fail.*FAILING|Old_age.*FAILING",
        "cause": "Disk SMART health check failed - disk is failing",
        "cause_detailed": "SMART (Self-Monitoring, Analysis and Reporting Technology) detected critical disk health issues. The disk is likely failing and data loss is imminent.",
        "severity": "critical",
        "solution": "IMMEDIATELY backup data and replace disk",
        "solution_detailed": "1. URGENT: Backup all data from this disk immediately\n2. Check SMART details: smartctl -a /dev/sdX\n3. Note the failing attributes (Reallocated_Sector_Ct, Current_Pending_Sector, etc.)\n4. Plan disk replacement\n5. If in RAID/ZFS: initiate disk replacement procedure",
        "category": "disks"
    },
    {
        "pattern": r"Reallocated_Sector_Ct.*threshold|reallocated.*sectors?.*exceeded",
        "cause": "Disk has excessive bad sectors being remapped",
        "cause_detailed": "The disk firmware has remapped multiple bad sectors to spare areas. While the disk is still functioning, this indicates physical degradation and eventual failure.",
        "severity": "warning",
        "solution": "Monitor closely and plan disk replacement",
        "solution_detailed": "1. Check current value: smartctl -A /dev/sdX | grep Reallocated\n2. If value is increasing, plan immediate replacement\n3. Backup important data\n4. Run extended SMART test: smartctl -t long /dev/sdX",
        "category": "disks"
    },
    {
        "pattern": r"\bata\d.*\berror\b|\bATA\b.*bus.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
        "cause": "ATA communication error with disk",
        "cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
        "severity": "warning",
        "solution": "Check SATA cables and connections; verify disk health with smartctl",
        "solution_detailed": "1. Check SMART health: smartctl -H /dev/sdX\n2. Inspect and reseat SATA cables\n3. Try different SATA port\n4. Check dmesg for pattern of errors\n5. If errors persist, disk may be failing",
        "category": "disks"
    },
    {
        "pattern": r"I/O.*error|blk_update_request.*error|Buffer I/O error",
        "cause": "Disk I/O operation failed",
        "cause_detailed": "The kernel failed to read or write data to the disk. This can be caused by disk failure, cable issues, or filesystem corruption.",
        "severity": "critical",
        "solution": "Check disk health and connections immediately",
        "solution_detailed": "1. Check SMART status: smartctl -H /dev/sdX\n2. Check dmesg for related errors: dmesg | grep -i error\n3. Verify disk is still accessible: lsblk\n4. If ZFS: check pool status with zpool status\n5. Consider filesystem check if safe to unmount",
        "category": "disks"
    },
    {
        "pattern": r"zfs.*pool.*DEGRADED|pool.*is.*degraded",
        "cause": "ZFS pool has reduced redundancy",
        "cause_detailed": "One or more devices in the ZFS pool are unavailable or experiencing errors. The pool is still functional but without full redundancy.",
        "severity": "warning",
        "solution": "Identify failed device with 'zpool status' and replace",
        "solution_detailed": "1. Check pool status: zpool status <pool>\n2. Identify the DEGRADED or UNAVAIL device\n3. If device is present but erroring: zpool scrub <pool>\n4. To replace: zpool replace <pool> <old-device> <new-device>\n5. Monitor resilver progress: zpool status",
        "category": "storage"
    },
    {
        "pattern": r"zfs.*pool.*FAULTED|pool.*is.*faulted",
        "cause": "ZFS pool is inaccessible",
        "cause_detailed": "The ZFS pool has lost too many devices and cannot maintain data integrity. Data may be inaccessible.",
        "severity": "critical",
        "solution": "Check failed devices; may need data recovery",
        "solution_detailed": "1. Check status: zpool status <pool>\n2. Identify all failed devices\n3. Attempt to online devices: zpool online <pool> <device>\n4. If drives are physically present, try zpool clear <pool>\n5. May require data recovery if multiple drives failed",
        "category": "storage"
    },

    # ==================== CEPH ====================
    {
        "pattern": r"ceph.*OSD.*down|osd\.\d+.*down|ceph.*osd.*failed",
        "cause": "Ceph OSD daemon is not running",
        "cause_detailed": "A Ceph Object Storage Daemon (OSD) has stopped or crashed. This reduces storage redundancy and may trigger data rebalancing.",
        "severity": "warning",
        "solution": "Check disk health and restart OSD service",
        "solution_detailed": "1. Check OSD status: ceph osd tree\n2. View OSD logs: journalctl -u ceph-osd@<id>\n3. Check underlying disk: smartctl -H /dev/sdX\n4. Restart OSD: systemctl start ceph-osd@<id>\n5. If OSD keeps crashing, check for disk failure",
        "category": "storage"
    },
    {
        "pattern": r"ceph.*health.*WARN|HEALTH_WARN",
        "cause": "Ceph cluster has warnings",
        "cause_detailed": "Ceph detected issues that don't prevent operation but should be addressed. Common causes: degraded PGs, clock skew, full OSDs.",
        "severity": "warning",
        "solution": "Run 'ceph health detail' for specific issues",
        "solution_detailed": "1. Get details: ceph health detail\n2. Common fixes:\n   - Degraded PGs: wait for recovery or add capacity\n   - Clock skew: sync NTP on all nodes\n   - Full OSDs: add storage or delete data\n3. Check: ceph status",
        "category": "storage"
    },
    {
        "pattern": r"ceph.*health.*ERR|HEALTH_ERR",
        "cause": "Ceph cluster has critical errors",
        "cause_detailed": "Ceph has detected critical issues that may affect data availability or integrity. Immediate attention required.",
        "severity": "critical",
        "solution": "Run 'ceph health detail' and address errors immediately",
        "solution_detailed": "1. Get details: ceph health detail\n2. Check OSD status: ceph osd tree\n3. Check MON status: ceph mon stat\n4. View PG status: ceph pg stat\n5. Address each error shown in health detail",
        "category": "storage"
    },

    # ==================== VM/CT ERRORS ====================
    {
        "pattern": r"TASK ERROR.*failed to get exclusive lock|lock.*timeout|couldn't acquire lock",
        "cause": "Resource is locked by another operation",
        "cause_detailed": "Another task is currently holding a lock on this VM/CT. This prevents concurrent modifications that could cause corruption.",
        "severity": "info",
        "solution": "Wait for other task to complete or check for stuck tasks",
        "solution_detailed": "1. Check running tasks: cat /var/log/pve/tasks/active\n2. Wait for task completion\n3. If task is stuck (>1h), check process: ps aux | grep <vmid>\n4. As last resort, remove lock file: rm /var/lock/qemu-server/lock-<vmid>.conf",
        "category": "vms"
    },
    {
        "pattern": r"kvm.*not.*available|kvm.*disabled|hardware.*virtualization.*disabled",
        "cause": "KVM/hardware virtualization not available",
        "cause_detailed": "The CPU's hardware virtualization extensions (Intel VT-x or AMD-V) are either not supported, not enabled in BIOS, or blocked by another hypervisor.",
        "severity": "warning",
        "solution": "Enable VT-x/AMD-V in BIOS settings",
        "solution_detailed": "1. Reboot into BIOS/UEFI\n2. Find Virtualization settings (often in CPU or Advanced section)\n3. Enable Intel VT-x or AMD-V/SVM\n4. Save and reboot\n5. Verify: grep -E 'vmx|svm' /proc/cpuinfo",
        "category": "vms"
    },
    {
        "pattern": r"out of memory|OOM.*kill|cannot allocate memory|memory.*exhausted",
        "cause": "System or VM ran out of memory",
        "cause_detailed": "The Linux OOM (Out Of Memory) killer terminated a process to free memory. This indicates memory pressure from overcommitment or memory leaks.",
        "severity": "critical",
        "solution": "Increase memory allocation or reduce VM memory usage",
        "solution_detailed": "1. Check what was killed: dmesg | grep -i oom\n2. Review memory usage: free -h\n3. Check balloon driver status for VMs\n4. Consider adding swap or RAM\n5. Review VM memory allocations for overcommitment",
        "category": "memory"
    },

    # ==================== NETWORK ====================
    {
        "pattern": r"bond.*slave.*link.*down|bond.*no.*active.*slave",
        "cause": "Network bond lost a slave interface",
        "cause_detailed": "One or more physical interfaces in a network bond have lost link. Depending on bond mode, this may reduce bandwidth or affect failover.",
        "severity": "warning",
        "solution": "Check physical cable connections and switch ports",
        "solution_detailed": "1. Check bond status: cat /proc/net/bonding/bond0\n2. Identify down slave interface\n3. Check physical cable connection\n4. Check switch port status and errors\n5. Verify interface: ethtool <slave-iface>",
        "category": "network"
    },
    {
        "pattern": r"link.*not.*ready|carrier.*lost|link.*down|NIC.*Link.*Down",
        "cause": "Network interface lost link",
        "cause_detailed": "The physical or virtual network interface has lost its connection. This could be a cable issue, switch problem, or driver issue.",
        "severity": "warning",
        "solution": "Check cable, switch port, and interface status",
        "solution_detailed": "1. Check interface: ip link show <iface>\n2. Check cable connection\n3. Check switch port LEDs\n4. Try: ip link set <iface> down && ip link set <iface> up\n5. Check driver: ethtool -i <iface>",
        "category": "network"
    },
    {
        "pattern": r"bridge.*STP.*blocked|spanning.*tree.*blocked",
        "cause": "Spanning Tree Protocol blocked a port",
        "cause_detailed": "STP detected a potential network loop and blocked a bridge port to prevent broadcast storms. This is normal behavior but may indicate network topology issues.",
        "severity": "info",
        "solution": "Review network topology; this may be expected behavior",
        "solution_detailed": "1. Check bridge status: brctl show\n2. View STP state: brctl showstp <bridge>\n3. If unexpected, review network topology for loops\n4. Consider disabling STP if network is simple: brctl stp <bridge> off",
        "category": "network"
    },

    # ==================== SERVICES ====================
    {
        "pattern": r"pvedaemon.*failed|pveproxy.*failed|pvestatd.*failed",
        "cause": "Critical Proxmox service failed",
        "cause_detailed": "One of the core Proxmox daemons has crashed or failed to start. This may affect web GUI access or API functionality.",
        "severity": "critical",
        "solution": "Restart the failed service; check logs for cause",
        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -u <service> -n 50\n3. Restart: systemctl restart <service>\n4. If persistent, check: /var/log/pveproxy/access.log",
        "category": "pve_services"
    },
    {
        "pattern": r"failed to start.*service|service.*start.*failed|service.*activation.*failed",
        "cause": "System service failed to start",
        "cause_detailed": "A systemd service unit failed during startup. This could be due to configuration errors, missing dependencies, or resource issues.",
        "severity": "warning",
        "solution": "Check service logs with journalctl -u <service>",
        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -xeu <service>\n3. Check config: systemctl cat <service>\n4. Verify dependencies: systemctl list-dependencies <service>\n5. Try restart: systemctl restart <service>",
        "category": "services"
    },

    # ==================== BACKUP ====================
    {
        "pattern": r"backup.*failed|vzdump.*error|backup.*job.*failed",
        "cause": "Backup job failed",
        "cause_detailed": "A scheduled or manual backup operation failed. Common causes: storage full, VM locked, network issues for remote storage.",
        "severity": "warning",
        "solution": "Check backup storage space and VM status",
        "solution_detailed": "1. Check backup log in Datacenter > Backup\n2. Verify storage space: df -h\n3. Check if VM is locked: qm list or pct list\n4. Verify backup storage is accessible\n5. Try manual backup to identify specific error",
        "category": "backups"
    },

    # ==================== CERTIFICATES ====================
    {
        "pattern": r"certificate.*expired|SSL.*certificate.*expired|cert.*expir",
        "cause": "SSL/TLS certificate has expired",
        "cause_detailed": "An SSL certificate used for secure communication has passed its expiration date. This may cause connection failures or security warnings.",
        "severity": "warning",
        "solution": "Renew the certificate using pvenode cert set or Let's Encrypt",
        "solution_detailed": "1. Check certificate: pvenode cert info\n2. For self-signed renewal: pvecm updatecerts\n3. For Let's Encrypt: pvenode acme cert order\n4. Restart pveproxy after renewal: systemctl restart pveproxy",
        "url": "https://pve.proxmox.com/wiki/Certificate_Management",
        "category": "security"
    },

    # ==================== HARDWARE/TEMPERATURE ====================
    {
        "pattern": r"temperature.*critical|thermal.*critical|CPU.*overheating|temp.*above.*threshold",
        "cause": "Component temperature critical",
        "cause_detailed": "A hardware component (CPU, disk, etc.) has reached a dangerous temperature. Sustained high temperatures can cause hardware damage or system shutdowns.",
        "severity": "critical",
        "solution": "Check cooling system immediately; clean dust, verify fans",
        "solution_detailed": "1. Check current temps: sensors\n2. Verify all fans are running\n3. Clean dust from heatsinks and filters\n4. Ensure adequate airflow\n5. Consider reapplying thermal paste if CPU\n6. Check ambient room temperature",
        "category": "temperature"
    },

    # ==================== AUTHENTICATION ====================
    {
        "pattern": r"authentication.*failed|login.*failed|invalid.*credentials|access.*denied",
        "cause": "Authentication failure",
        "cause_detailed": "A login attempt failed due to invalid credentials or permissions. Multiple failures may indicate a brute-force attack.",
        "severity": "info",
        "solution": "Verify credentials; check for unauthorized access attempts",
        "solution_detailed": "1. Review auth logs: journalctl -u pvedaemon | grep auth\n2. Check for multiple failures from same IP\n3. Verify user exists: pveum user list\n4. If attack suspected, consider fail2ban\n5. Reset password if needed: pveum passwd <user>",
        "category": "security"
    },
]


def find_matching_error(text: str, category: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Return the first catalogue entry whose pattern occurs in ``text``.

    Entries of PROXMOX_KNOWN_ERRORS are tried in list order, so when several
    patterns match, the one listed earliest wins.

    Args:
        text: Error message or log content to search.
        category: When given (and non-empty), only entries whose "category"
            equals this value are considered.

    Returns:
        The matching entry dict, or None if ``text`` is empty or nothing
        matches.
    """
    if not text:
        return None

    haystack = text.lower()

    # Lazily narrow the catalogue to the requested category (or all of it).
    candidates = (
        entry
        for entry in PROXMOX_KNOWN_ERRORS
        if not category or entry.get("category") == category
    )

    for entry in candidates:
        try:
            hit = re.search(entry["pattern"], haystack, re.IGNORECASE)
        except re.error:
            # A malformed pattern must not break the lookup; skip that entry.
            continue
        if hit is not None:
            return entry

    return None


def get_error_context(text: str, category: Optional[str] = None, detail_level: str = "standard") -> Optional[str]:
    """Render the catalogue entry matching ``text`` as a text snippet.

    Args:
        text: Error message or log content to look up.
        category: Optional category filter, forwarded to the lookup.
        detail_level: "minimal" yields a one-line summary and "standard" the
            brief cause/solution; any other value (normally "detailed")
            yields the long-form cause/solution, falling back to the brief
            text when an entry has no detailed variant.

    Returns:
        The formatted snippet, or None when no catalogue entry matches.
    """
    entry = find_matching_error(text, category)
    if not entry:
        return None

    # NOTE: no "Severity:" line is emitted, on purpose. The catalogue stores
    # the *typical* severity of a class of error, not the *actual* severity
    # of the event being reported. Printing it once made a SATA cable warning
    # (SMART PASSED) show "Severity: CRITICAL" in the body, contradicting the
    # WARNING badge in the notification header and alarming operators for no
    # reason. The event-level severity (computed by `_check_disk_io`) already
    # travels in the notification's own severity field; repeating a different
    # value here is noise at best and misinformation at worst.
    if detail_level == "minimal":
        return f"Known issue: {entry['cause']}"

    # Standard and detailed output share one layout; only the source fields
    # and the label of the documentation line differ.
    if detail_level == "standard":
        cause = entry["cause"]
        solution = entry["solution"]
        link_label = "Docs"
    else:  # detailed
        cause = entry.get("cause_detailed", entry["cause"])
        solution = entry.get("solution_detailed", entry["solution"])
        link_label = "Documentation"

    report = [
        "KNOWN PROXMOX ERROR DETECTED:",
        f"  Cause: {cause}",
        f"  Solution: {solution}",
    ]
    link = entry.get("url")
    if link:
        report.append(f"  {link_label}: {link}")
    return "\n".join(report)


def get_all_patterns() -> List[str]:
    """Return the regex pattern of every catalogue entry, in catalogue order."""
    patterns: List[str] = []
    for entry in PROXMOX_KNOWN_ERRORS:
        patterns.append(entry["pattern"])
    return patterns