"""
ProxMenux Notification Event Watchers
Detects Proxmox events from journald, PVE task log, and health monitor.

Architecture:
- JournalWatcher: Real-time stream of journald for critical events
- TaskWatcher: Real-time tail of /var/log/pve/tasks/index for VM/CT/backup events
- PollingCollector: Periodic poll of health_persistence pending notifications

All watchers put events into a shared Queue consumed by NotificationManager.

Author: MacRimi
"""

import os
import re
import json
import time
import socket
import subprocess
import threading
from queue import Queue
from typing import Optional, Dict, Any
from pathlib import Path


# ─── Event Object ─────────────────────────────────────────────────

class NotificationEvent:
    """Represents a detected event ready for notification dispatch."""
    
    __slots__ = ('event_type', 'severity', 'data', 'timestamp', 'source')
    
    def __init__(self, event_type: str, severity: str = 'INFO',
                 data: Optional[Dict[str, Any]] = None,
                 source: str = 'watcher'):
        self.event_type = event_type
        self.severity = severity
        self.data = data or {}
        self.timestamp = time.time()
        self.source = source
    
    def __repr__(self):
        return f"NotificationEvent({self.event_type}, {self.severity})"


def _hostname() -> str:
    try:
        return socket.gethostname().split('.')[0]
    except Exception:
        return 'proxmox'


# ─── Journal Watcher (Real-time) ─────────────────────────────────

class JournalWatcher:
    """Watches journald in real-time for critical system events.
    
    Uses 'journalctl -f -o json' subprocess to stream entries.
    Detects: auth failures, kernel panics, OOM, service crashes,
    disk I/O errors, split-brain, node disconnect, system shutdown,
    fail2ban bans, firewall blocks, permission changes.
    """
    
    def __init__(self, event_queue: Queue):
        self._queue = event_queue
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._process: Optional[subprocess.Popen] = None
        self._hostname = _hostname()
        
        # Dedup: track recent events to avoid duplicates
        self._recent_events: Dict[str, float] = {}
        self._dedup_window = 30  # seconds
    
    def start(self):
        """Start the journal watcher thread."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._watch_loop, daemon=True,
                                        name='journal-watcher')
        self._thread.start()
    
    def stop(self):
        """Stop the journal watcher."""
        self._running = False
        if self._process:
            try:
                self._process.terminate()
                self._process.wait(timeout=5)
            except Exception:
                try:
                    self._process.kill()
                except Exception:
                    pass
    
    def _watch_loop(self):
        """Main watch loop with auto-restart on failure."""
        while self._running:
            try:
                self._run_journalctl()
            except Exception as e:
                print(f"[JournalWatcher] Error: {e}")
            if self._running:
                time.sleep(5)  # Wait before restart
    
    def _run_journalctl(self):
        """Run journalctl -f and process output line by line."""
        cmd = ['journalctl', '-f', '-o', 'json', '--no-pager',
               '-n', '0']  # Start from now, don't replay history
        
        self._process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
            text=True, bufsize=1
        )
        
        for line in self._process.stdout:
            if not self._running:
                break
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                self._process_entry(entry)
            except (json.JSONDecodeError, KeyError):
                # Try plain text matching as fallback
                self._process_plain(line)
        
        if self._process:
            self._process.wait()
    
    def _process_entry(self, entry: Dict):
        """Process a parsed journald JSON entry."""
        msg = entry.get('MESSAGE', '')
        if not msg or not isinstance(msg, str):
            return
        
        unit = entry.get('_SYSTEMD_UNIT', '')
        syslog_id = entry.get('SYSLOG_IDENTIFIER', '')
        priority = int(entry.get('PRIORITY', 6))
        
        self._check_auth_failure(msg, syslog_id, entry)
        self._check_fail2ban(msg, syslog_id)
        self._check_kernel_critical(msg, syslog_id, priority)
        self._check_service_failure(msg, unit)
        self._check_disk_io(msg, syslog_id, priority)
        self._check_cluster_events(msg, syslog_id)
        self._check_system_shutdown(msg, syslog_id)
        self._check_permission_change(msg, syslog_id)
        self._check_firewall(msg, syslog_id)
    
    def _process_plain(self, line: str):
        """Fallback: process a plain text log line."""
        self._check_auth_failure(line, '', {})
        self._check_fail2ban(line, '')
        self._check_kernel_critical(line, '', 6)
        self._check_cluster_events(line, '')
        self._check_system_shutdown(line, '')
    
    # ── Detection methods ──
    
    def _check_auth_failure(self, msg: str, syslog_id: str, entry: Dict):
        """Detect authentication failures (SSH, PAM, PVE)."""
        patterns = [
            (r'Failed password for (?:invalid user )?(\S+) from (\S+)', 'ssh'),
            (r'authentication failure.*rhost=(\S+).*user=(\S+)', 'pam'),
            (r'pvedaemon\[.*authentication failure.*rhost=(\S+)', 'pve'),
        ]
        
        for pattern, service in patterns:
            match = re.search(pattern, msg, re.IGNORECASE)
            if match:
                groups = match.groups()
                if service == 'ssh':
                    username, source_ip = groups[0], groups[1]
                elif service == 'pam':
                    source_ip, username = groups[0], groups[1]
                else:
                    source_ip = groups[0]
                    username = 'unknown'
                
                self._emit('auth_fail', 'WARNING', {
                    'source_ip': source_ip,
                    'username': username,
                    'service': service,
                    'hostname': self._hostname,
                })
                return
    
    def _check_fail2ban(self, msg: str, syslog_id: str):
        """Detect Fail2Ban IP bans."""
        if 'fail2ban' not in msg.lower() and syslog_id != 'fail2ban-server':
            return
        
        # Ban detected
        ban_match = re.search(r'Ban\s+(\S+)', msg)
        if ban_match:
            ip = ban_match.group(1)
            jail_match = re.search(r'\[(\w+)\]', msg)
            jail = jail_match.group(1) if jail_match else 'unknown'
            
            self._emit('ip_block', 'INFO', {
                'source_ip': ip,
                'jail': jail,
                'failures': '',
                'hostname': self._hostname,
            })
    
    def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
        """Detect kernel panics, OOM, segfaults, hardware errors."""
        critical_patterns = {
            r'kernel panic':       ('system_problem', 'CRITICAL', 'Kernel panic'),
            r'Out of memory':      ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
            r'segfault':           ('system_problem', 'WARNING',  'Segmentation fault detected'),
            r'BUG:':               ('system_problem', 'CRITICAL', 'Kernel BUG detected'),
            r'Call Trace:':        ('system_problem', 'WARNING',  'Kernel call trace'),
            r'I/O error.*dev\s+(\S+)': ('disk_io_error', 'CRITICAL', 'Disk I/O error'),
            r'EXT4-fs error':      ('disk_io_error', 'CRITICAL', 'Filesystem error'),
            r'BTRFS error':        ('disk_io_error', 'CRITICAL', 'Filesystem error'),
            r'XFS.*error':         ('disk_io_error', 'CRITICAL', 'Filesystem error'),
            r'ZFS.*error':         ('disk_io_error', 'CRITICAL', 'ZFS pool error'),
            r'mce:.*Hardware Error': ('system_problem', 'CRITICAL', 'Hardware error (MCE)'),
        }
        
        for pattern, (event_type, severity, reason) in critical_patterns.items():
            if re.search(pattern, msg, re.IGNORECASE):
                data = {'reason': reason, 'hostname': self._hostname}
                
                # Try to extract device for disk errors
                dev_match = re.search(r'dev\s+(\S+)', msg)
                if dev_match and event_type == 'disk_io_error':
                    data['device'] = dev_match.group(1)
                
                self._emit(event_type, severity, data)
                return
    
    def _check_service_failure(self, msg: str, unit: str):
        """Detect critical service failures."""
        service_patterns = [
            r'Failed to start (.+)',
            r'Unit (\S+) (?:entered failed state|failed)',
            r'(\S+)\.service: (?:Main process exited|Failed with result)',
        ]
        
        for pattern in service_patterns:
            match = re.search(pattern, msg)
            if match:
                service_name = match.group(1)
                self._emit('service_fail', 'WARNING', {
                    'service_name': service_name,
                    'reason': msg[:200],
                    'hostname': self._hostname,
                })
                return
    
    def _check_disk_io(self, msg: str, syslog_id: str, priority: int):
        """Detect disk I/O errors from kernel messages."""
        if syslog_id != 'kernel' and priority > 3:
            return
        
        io_patterns = [
            r'blk_update_request: I/O error.*dev (\S+)',
            r'Buffer I/O error on device (\S+)',
            r'SCSI error.*sd(\w)',
            r'ata\d+.*error',
        ]
        
        for pattern in io_patterns:
            match = re.search(pattern, msg)
            if match:
                device = match.group(1) if match.lastindex else 'unknown'
                self._emit('disk_io_error', 'CRITICAL', {
                    'device': device,
                    'reason': msg[:200],
                    'hostname': self._hostname,
                })
                return
    
    def _check_cluster_events(self, msg: str, syslog_id: str):
        """Detect cluster split-brain and node disconnect."""
        msg_lower = msg.lower()
        
        # Split-brain
        if any(p in msg_lower for p in ['split-brain', 'split brain',
                                          'fencing required', 'cluster partition']):
            quorum = 'unknown'
            if 'quorum' in msg_lower:
                quorum = 'lost' if 'lost' in msg_lower else 'valid'
            
            self._emit('split_brain', 'CRITICAL', {
                'quorum': quorum,
                'reason': msg[:200],
                'hostname': self._hostname,
            })
            return
        
        # Node disconnect
        if (('quorum' in msg_lower and 'lost' in msg_lower) or
            ('node' in msg_lower and any(w in msg_lower for w in ['left', 'offline', 'lost']))):
            
            node_match = re.search(r'[Nn]ode\s+(\S+)', msg)
            node_name = node_match.group(1) if node_match else 'unknown'
            
            self._emit('node_disconnect', 'CRITICAL', {
                'node_name': node_name,
                'hostname': self._hostname,
            })
    
    def _check_system_shutdown(self, msg: str, syslog_id: str):
        """Detect system shutdown/reboot."""
        if 'systemd-journald' in syslog_id or 'systemd' in syslog_id:
            if 'Journal stopped' in msg or 'Stopping Journal Service' in msg:
                self._emit('system_shutdown', 'WARNING', {
                    'reason': 'System journal stopped',
                    'hostname': self._hostname,
                })
            elif 'Shutting down' in msg or 'System is rebooting' in msg:
                event = 'system_reboot' if 'reboot' in msg.lower() else 'system_shutdown'
                self._emit(event, 'WARNING', {
                    'reason': msg[:200],
                    'hostname': self._hostname,
                })
    
    def _check_permission_change(self, msg: str, syslog_id: str):
        """Detect user permission changes in PVE."""
        permission_patterns = [
            (r'set permissions.*user\s+(\S+)', 'Permission changed'),
            (r'user added to group.*?(\S+)', 'Added to group'),
            (r'user removed from group.*?(\S+)', 'Removed from group'),
            (r'ACL updated.*?(\S+)', 'ACL updated'),
            (r'Role assigned.*?(\S+)', 'Role assigned'),
        ]
        
        for pattern, action in permission_patterns:
            match = re.search(pattern, msg, re.IGNORECASE)
            if match:
                username = match.group(1)
                self._emit('user_permission_change', 'INFO', {
                    'username': username,
                    'change_details': action,
                    'hostname': self._hostname,
                })
                return
    
    def _check_firewall(self, msg: str, syslog_id: str):
        """Detect firewall issues (not individual drops, but rule errors)."""
        if re.search(r'pve-firewall.*(?:error|failed|unable)', msg, re.IGNORECASE):
            self._emit('firewall_issue', 'WARNING', {
                'reason': msg[:200],
                'hostname': self._hostname,
            })
    
    # ── Emit helper ──
    
    def _emit(self, event_type: str, severity: str, data: Dict):
        """Emit event to queue with deduplication."""
        dedup_key = f"{event_type}:{data.get('source_ip', '')}:{data.get('device', '')}:{data.get('service_name', '')}"
        
        now = time.time()
        last = self._recent_events.get(dedup_key, 0)
        if now - last < self._dedup_window:
            return  # Skip duplicate
        
        self._recent_events[dedup_key] = now
        
        # Cleanup old dedup entries periodically
        if len(self._recent_events) > 200:
            cutoff = now - self._dedup_window * 2
            self._recent_events = {
                k: v for k, v in self._recent_events.items() if v > cutoff
            }
        
        self._queue.put(NotificationEvent(event_type, severity, data, source='journal'))


# ─── Task Watcher (Real-time) ────────────────────────────────────

class TaskWatcher:
    """Watches /var/log/pve/tasks/index for VM/CT and backup events.
    
    The PVE task index file is appended when tasks start/finish.
    Format: UPID:node:pid:pstart:starttime:type:id:user:
    Final status is recorded when task completes.
    """
    
    TASK_LOG = '/var/log/pve/tasks/index'
    
    # Map PVE task types to our event types
    TASK_MAP = {
        'qmstart':    ('vm_start',    'INFO'),
        'qmstop':     ('vm_stop',     'INFO'),
        'qmshutdown': ('vm_shutdown', 'INFO'),
        'qmreboot':   ('vm_restart',  'INFO'),
        'qmreset':    ('vm_restart',  'INFO'),
        'vzstart':    ('ct_start',    'INFO'),
        'vzstop':     ('ct_stop',     'INFO'),
        'vzshutdown': ('ct_stop',     'INFO'),
        'vzdump':     ('backup_start', 'INFO'),
        'qmsnapshot': ('snapshot_complete', 'INFO'),
        'vzsnapshot': ('snapshot_complete', 'INFO'),
        'qmigrate':   ('migration_start', 'INFO'),
        'vzmigrate':  ('migration_start', 'INFO'),
    }
    
    def __init__(self, event_queue: Queue):
        self._queue = event_queue
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._hostname = _hostname()
        self._last_position = 0
    
    def start(self):
        if self._running:
            return
        self._running = True
        
        # Start at end of file
        if os.path.exists(self.TASK_LOG):
            try:
                self._last_position = os.path.getsize(self.TASK_LOG)
            except OSError:
                self._last_position = 0
        
        self._thread = threading.Thread(target=self._watch_loop, daemon=True,
                                        name='task-watcher')
        self._thread.start()
    
    def stop(self):
        self._running = False
    
    def _watch_loop(self):
        """Poll the task index file for new entries."""
        while self._running:
            try:
                if os.path.exists(self.TASK_LOG):
                    current_size = os.path.getsize(self.TASK_LOG)
                    
                    if current_size < self._last_position:
                        # File was truncated/rotated
                        self._last_position = 0
                    
                    if current_size > self._last_position:
                        with open(self.TASK_LOG, 'r') as f:
                            f.seek(self._last_position)
                            new_lines = f.readlines()
                            self._last_position = f.tell()
                        
                        for line in new_lines:
                            self._process_task_line(line.strip())
            except Exception as e:
                print(f"[TaskWatcher] Error reading task log: {e}")
            
            time.sleep(2)  # Check every 2 seconds
    
    def _process_task_line(self, line: str):
        """Process a single task index line.
        
        PVE task index format (space-separated):
        UPID endtime status
        Where UPID = UPID:node:pid:pstart:starttime:type:id:user:
        """
        if not line:
            return
        
        parts = line.split()
        if not parts:
            return
        
        upid = parts[0]
        status = parts[2] if len(parts) >= 3 else ''
        
        # Parse UPID
        upid_parts = upid.split(':')
        if len(upid_parts) < 8:
            return
        
        task_type = upid_parts[5]
        vmid = upid_parts[6]
        user = upid_parts[7]
        
        # Get VM/CT name
        vmname = self._get_vm_name(vmid) if vmid else ''
        
        # Map to event type
        event_info = self.TASK_MAP.get(task_type)
        if not event_info:
            return
        
        event_type, default_severity = event_info
        
        # Check if task failed
        is_error = status and status != 'OK' and status != ''
        
        if is_error:
            # Override to failure event
            if 'start' in event_type:
                event_type = event_type.replace('_start', '_fail')
            elif 'complete' in event_type:
                event_type = event_type.replace('_complete', '_fail')
            severity = 'CRITICAL'
        elif status == 'OK':
            # Task completed successfully
            if event_type == 'backup_start':
                event_type = 'backup_complete'
            elif event_type == 'migration_start':
                event_type = 'migration_complete'
            severity = 'INFO'
        else:
            # Task just started (no status yet)
            severity = default_severity
        
        data = {
            'vmid': vmid,
            'vmname': vmname or f'ID {vmid}',
            'hostname': self._hostname,
            'user': user,
            'reason': status if is_error else '',
            'target_node': '',
            'size': '',
            'snapshot_name': '',
        }
        
        self._queue.put(NotificationEvent(event_type, severity, data, source='task'))
    
    def _get_vm_name(self, vmid: str) -> str:
        """Try to resolve VMID to name via config files."""
        if not vmid:
            return ''
        
        # Try QEMU
        conf_path = f'/etc/pve/qemu-server/{vmid}.conf'
        name = self._read_name_from_conf(conf_path)
        if name:
            return name
        
        # Try LXC
        conf_path = f'/etc/pve/lxc/{vmid}.conf'
        name = self._read_name_from_conf(conf_path)
        if name:
            return name
        
        return ''
    
    @staticmethod
    def _read_name_from_conf(path: str) -> str:
        """Read 'name:' or 'hostname:' from PVE config file."""
        try:
            if not os.path.exists(path):
                return ''
            with open(path, 'r') as f:
                for line in f:
                    if line.startswith('name:'):
                        return line.split(':', 1)[1].strip()
                    if line.startswith('hostname:'):
                        return line.split(':', 1)[1].strip()
        except (IOError, PermissionError):
            pass
        return ''


# ─── Polling Collector ────────────────────────────────────────────

class PollingCollector:
    """Periodic collector that reads Health Monitor pending notifications.
    
    Polls health_persistence for:
    - Pending notification events (state changes from Bloque A)
    - Unnotified errors
    - Update availability (every 24h)
    """
    
    def __init__(self, event_queue: Queue, poll_interval: int = 30):
        self._queue = event_queue
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._poll_interval = poll_interval
        self._hostname = _hostname()
        self._last_update_check = 0
        self._update_check_interval = 86400  # 24 hours
    
    def start(self):
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._poll_loop, daemon=True,
                                        name='polling-collector')
        self._thread.start()
    
    def stop(self):
        self._running = False
    
    def _poll_loop(self):
        """Main polling loop."""
        # Initial delay to let health monitor warm up
        for _ in range(10):
            if not self._running:
                return
            time.sleep(1)
        
        while self._running:
            try:
                self._collect_health_events()
                self._check_updates()
            except Exception as e:
                print(f"[PollingCollector] Error: {e}")
            
            # Sleep in small increments for responsive shutdown
            for _ in range(self._poll_interval):
                if not self._running:
                    return
                time.sleep(1)
    
    def _collect_health_events(self):
        """Collect pending notification events from health_persistence."""
        try:
            from health_persistence import health_persistence
            
            # Get pending notification events
            events = health_persistence.get_pending_notifications()
            for evt in events:
                data = json.loads(evt.get('data', '{}')) if isinstance(evt.get('data'), str) else evt.get('data', {})
                
                event_type = evt.get('event_type', 'state_change')
                severity = data.get('severity', 'WARNING')
                
                data['hostname'] = self._hostname
                data['error_key'] = evt.get('error_key', '')
                
                self._queue.put(NotificationEvent(
                    event_type, severity, data, source='health_monitor'
                ))
            
            # Mark events as notified
            if events:
                event_ids = [e['id'] for e in events if 'id' in e]
                if event_ids:
                    health_persistence.mark_events_notified(event_ids)
            
            # Also check unnotified errors
            unnotified = health_persistence.get_unnotified_errors()
            for error in unnotified:
                self._queue.put(NotificationEvent(
                    'new_error', error.get('severity', 'WARNING'), {
                        'category': error.get('category', ''),
                        'reason': error.get('reason', ''),
                        'hostname': self._hostname,
                        'error_key': error.get('error_key', ''),
                    },
                    source='health_monitor'
                ))
                # Mark as notified
                if 'id' in error:
                    health_persistence.mark_notified(error['id'])
                    
        except ImportError:
            pass  # health_persistence not available (CLI mode)
        except Exception as e:
            print(f"[PollingCollector] Health event collection error: {e}")
    
    def _check_updates(self):
        """Check for available system updates (every 24h)."""
        now = time.time()
        if now - self._last_update_check < self._update_check_interval:
            return
        
        self._last_update_check = now
        
        try:
            result = subprocess.run(
                ['apt-get', '-s', 'upgrade'],
                capture_output=True, text=True, timeout=60
            )
            
            if result.returncode == 0:
                # Count upgradeable packages
                lines = [l for l in result.stdout.split('\n')
                         if l.startswith('Inst ')]
                count = len(lines)
                
                if count > 0:
                    # Show first 5 package names
                    packages = [l.split()[1] for l in lines[:5]]
                    details = ', '.join(packages)
                    if count > 5:
                        details += f', ... and {count - 5} more'
                    
                    self._queue.put(NotificationEvent(
                        'update_available', 'INFO', {
                            'count': str(count),
                            'details': details,
                            'hostname': self._hostname,
                        },
                        source='polling'
                    ))
        except Exception:
            pass  # Non-critical, silently skip