Add beta 1.2.2.1

2026-07-29 20:08:26 +00:00 · 2026-06-05 17:12:23 +02:00
parent e855fca0b3
commit 3629fe8848
12 changed files with 907 additions and 325 deletions
--- a/AppImage/scripts/flask_server.py
+++ b/AppImage/scripts/flask_server.py
@@ -4858,7 +4858,8 @@ def get_proxmox_vms():
                        'netin': resource.get('netin', 0),
                        'netout': resource.get('netout', 0),
                        'diskread': resource.get('diskread', 0),
-                        'diskwrite': resource.get('diskwrite', 0)
+                        'diskwrite': resource.get('diskwrite', 0),
+                        'maxcpu': resource.get('maxcpu', 0)
                    }
                    # Decorate LXC rows with the apt update status if the
                    # managed_installs registry has it. Absent key means
@@ -7640,14 +7641,26 @@ def api_system():
        try:
            from health_monitor import health_monitor
            _hist = health_monitor.state_history.get('cpu_usage') or []
-            cpu_usage = _hist[-1]['value'] if _hist else psutil.cpu_percent(interval=0.1)
+            if _hist:
+                _last = _hist[-1]
+                cpu_usage = _last['value']
+                cpu_user_pct = _last.get('user', 0)
+                cpu_system_pct = _last.get('system', 0)
+            else:
+                cpu_usage = psutil.cpu_percent(interval=0.1)
+                cpu_user_pct = 0
+                cpu_system_pct = 0
        except Exception:
            cpu_usage = psutil.cpu_percent(interval=0.1)
+            cpu_user_pct = 0
+            cpu_system_pct = 0

        memory = psutil.virtual_memory()
        memory_used_gb = memory.used / (1024 ** 3)
        memory_total_gb = memory.total / (1024 ** 3)
        memory_usage_percent = memory.percent
+        # Preview restyle: cached + buffers in GB
+        memory_cached_gb = round((getattr(memory, 'cached', 0) + getattr(memory, 'buffers', 0)) / (1024 ** 3), 1)
        
        # Get temperature
        temp = get_cpu_temperature()
@@ -7677,9 +7690,12 @@ def api_system():

        return jsonify({
            'cpu_usage': round(cpu_usage, 1),
+            'cpu_user': cpu_user_pct,
+            'cpu_system': cpu_system_pct,
            'memory_usage': round(memory_usage_percent, 1),
            'memory_total': round(memory_total_gb, 1),
            'memory_used': round(memory_used_gb, 1),
+            'memory_cached': memory_cached_gb,
            'temperature': temp,
            'temperature_sparkline': temp_sparkline,
            'uptime': uptime,
@@ -9616,6 +9632,35 @@ def api_node_metrics():
                    if 'zfsarc' not in item or item.get('zfsarc', 0) == 0:
                        item['zfsarc'] = zfs_arc_size

+            # 24h downsampling: RRD returns ~1440 minute-level points which
+            # plots as a dense thicket of vertical spikes. Group into 5-min
+            # buckets and average each numeric field — same shape that
+            # `get_temperature_history` uses for its 24h view so the look
+            # is consistent across the dashboard's 24h charts.
+            if timeframe == 'day' and rrd_data:
+                bucket_seconds = 300  # 5-min
+                buckets = {}
+                for item in rrd_data:
+                    t = item.get('time')
+                    if t is None:
+                        continue
+                    bk = (int(t) // bucket_seconds) * bucket_seconds
+                    if bk not in buckets:
+                        buckets[bk] = {'_count': 0, '_sums': {}}
+                    b = buckets[bk]
+                    b['_count'] += 1
+                    for k, v in item.items():
+                        if k == 'time' or not isinstance(v, (int, float)) or isinstance(v, bool):
+                            continue
+                        b['_sums'][k] = b['_sums'].get(k, 0) + v
+                rrd_data = []
+                for bk in sorted(buckets.keys()):
+                    b = buckets[bk]
+                    point = {'time': bk}
+                    for k, total in b['_sums'].items():
+                        point[k] = total / b['_count']
+                    rrd_data.append(point)
+
            payload = {
                'node': local_node,
                'timeframe': timeframe,
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -453,10 +453,19 @@ class HealthMonitor:
        """Lightweight CPU sample: read usage % and append to history. ~30ms cost."""
        try:
            cpu_percent = psutil.cpu_percent(interval=0)
+            try:
+                _times = psutil.cpu_times_percent(interval=0)
+                cpu_user = round(_times.user + getattr(_times, 'nice', 0), 1)
+                cpu_system = round(_times.system + getattr(_times, 'irq', 0) + getattr(_times, 'softirq', 0), 1)
+            except Exception:
+                cpu_user = 0
+                cpu_system = 0
            current_time = time.time()
            state_key = 'cpu_usage'
            self.state_history[state_key].append({
                'value': cpu_percent,
+                'user': cpu_user,
+                'system': cpu_system,
                'time': current_time
            })
            # Prune entries older than 6 minutes
@@ -608,6 +617,71 @@ class HealthMonitor:
        
        return self.cached_results[cache_key]
    
+    def _apply_dismiss_aware_status(self, check_block: Dict[str, Any]) -> None:
+        """Demote a WARNING/CRITICAL check block to OK, in place, when every
+        error behind it has been acknowledged by the user.
+
+        Args:
+            check_block: one category result dict. Non-dict values and
+                blocks whose ``status`` is not WARNING/CRITICAL (OK, INFO,
+                UNKNOWN, ...) are left untouched; UNKNOWN is deliberately
+                never dismissed because it means the check failed to run.
+
+        Two block shapes are handled:
+
+        * Aggregating blocks with a non-empty ``checks`` dict: each
+          WARNING/CRITICAL inner check is looked up by its ``error_key``.
+          Acknowledged ones are stamped ``dismissed`` (and ``permanent``
+          when applicable). The block demotes only when at least one
+          failing inner check exists and all of them are acknowledged.
+        * Blocks with a top-level ``error_key``: that key must be
+          acknowledged. This path is also used when ``checks`` lists no
+          failing inner check, so a block whose severity does not come
+          from ``checks`` is never demoted by vacuous truth.
+
+        A demoted block gets ``status='OK'`` and ``all_dismissed=True``.
+        Exceptions are printed and swallowed so a persistence failure
+        cannot crash the health pipeline.
+        """
+        if not isinstance(check_block, dict):
+            return
+        if check_block.get('status', 'OK') not in ('WARNING', 'CRITICAL'):
+            return
+
+        try:
+            inner_checks = check_block.get('checks')
+            failing = unacked = 0
+            if isinstance(inner_checks, dict):
+                for inner in inner_checks.values():
+                    if not isinstance(inner, dict):
+                        continue
+                    if inner.get('status', 'OK') not in ('WARNING', 'CRITICAL'):
+                        continue
+                    failing += 1
+                    ek = inner.get('error_key')
+                    if ek and health_persistence.is_error_acknowledged(ek):
+                        inner['dismissed'] = True
+                        if health_persistence.is_error_permanently_acknowledged(ek):
+                            inner['permanent'] = True
+                    else:
+                        unacked += 1
+            if failing:
+                if not unacked:
+                    check_block['status'] = 'OK'
+                    check_block['all_dismissed'] = True
+                return
+            # No failing inner check: fall back to the block's own key.
+            ek = check_block.get('error_key')
+            if ek and health_persistence.is_error_acknowledged(ek):
+                check_block['dismissed'] = True
+                if health_persistence.is_error_permanently_acknowledged(ek):
+                    check_block['permanent'] = True
+                check_block['status'] = 'OK'
+                check_block['all_dismissed'] = True
+        except Exception as e:
+            # Dismiss check should never crash the health pipeline.
+            print(f"[HealthMonitor] _apply_dismiss_aware_status failed: {e}")
+
    def get_overall_status(self) -> Dict[str, Any]:
        """Get overall health status summary with minimal overhead"""
        details = self.get_detailed_status()
@@ -993,7 +1067,42 @@ class HealthMonitor:
                        pass
            else:
                self._unknown_counts[cat_key] = 0
-        
+
+        # --- Dismiss-aware re-derivation of issue lists (root fix for #228) ---
+        # Each `_check_*` above already populated `details[<category>]` with
+        # its raw status and pushed an entry into critical_issues /
+        # warning_issues / info_issues. That raw status doesn't know which
+        # error_keys the user has acknowledged, so a category whose only
+        # remaining problems are all dismissed (e.g. nine permanently-
+        # silenced LXC mount alerts) was still pushing the global `overall`
+        # to CRITICAL. The popup's frontend rollup had to compensate for
+        # this server-side gap, which is how the badge ("Critical" in the
+        # header) and the panel ("0 Critical" inside) ended up disagreeing.
+        #
+        # Apply the existing per-block dismiss filter (`_annotate_dismissed`
+        # downstream is the visual-merge cousin of this) to every
+        # category, then rebuild the issue lists from the post-filter
+        # statuses. The pre-existing inline appends are discarded — they
+        # represented the pre-fix view.
+        critical_issues = []
+        warning_issues = []
+        info_issues = []
+        for cat_key in list(details.keys()):
+            block = details.get(cat_key)
+            if not isinstance(block, dict):
+                continue
+            self._apply_dismiss_aware_status(block)
+            status = block.get('status', 'OK')
+            reason = (block.get('reason') or '').strip()
+            label = cat_key.replace('_', ' ').capitalize()
+            entry = f"{label}: {reason}" if reason else label
+            if status == 'CRITICAL':
+                critical_issues.append(entry)
+            elif status == 'WARNING':
+                warning_issues.append(entry)
+            elif status == 'INFO':
+                info_issues.append(entry)
+
        # --- Determine Overall Status ---
        # Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK
        if critical_issues:
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -1367,6 +1367,8 @@ class HealthPersistence:
        _zfs_pools_cache = None
        _mount_points_cache = None
        _pve_services_cache = None
+        _pvesm_storages_cache = None
+        _remote_mount_targets_cache = None
        
        def check_vm_ct_cached(vmid):
            if vmid not in _vm_ct_exists_cache:
@@ -1445,7 +1447,68 @@ class HealthPersistence:
                except Exception:
                    _mount_points_cache = set()
            return _mount_points_cache
-        
+
+        def get_pvesm_storages():
+            """Collect the storage IDs that `pvesm status` reports.
+
+            Memoised in `_pvesm_storages_cache`, so the command runs at
+            most once per enclosing call. The set is used to auto-resolve
+            `storage_unavailable_*` and `pve_storage_full_*` errors whose
+            storage was removed from `pvesm`/Datacenter > Storage.
+
+            Returns an empty set when pvesm fails or lists nothing; the
+            caller then resolves nothing, which is the safe default.
+            """
+            nonlocal _pvesm_storages_cache
+            if _pvesm_storages_cache is not None:
+                return _pvesm_storages_cache
+            _pvesm_storages_cache = set()
+            try:
+                proc = subprocess.run(
+                    ['pvesm', 'status'],
+                    capture_output=True, text=True, timeout=5
+                )
+                if proc.returncode == 0:
+                    # First row is skipped; column 0 of each row is the ID.
+                    rows = proc.stdout.strip().split('\n')[1:]
+                    for fields in map(str.split, rows):
+                        if fields:
+                            _pvesm_storages_cache.add(fields[0])
+            except Exception:
+                # Reset to an empty set, never `None`: no retry per row,
+                # and no false resolves while pvesm is momentarily down.
+                _pvesm_storages_cache = set()
+            return _pvesm_storages_cache
+
+        def get_remote_mount_targets():
+            """Return the NFS/CIFS/SMB mount points listed in /proc/mounts.
+
+            Distinguishes a `mount_stale_<target>` whose mount the user
+            has umount'd (stale alert that should self-clear) from one
+            whose mount is still active and needs attention. Without it
+            the alert stayed pinned after the PVE storage was removed and
+            lazy-umount'd (the case @UBLI-WLAN reported).
+
+            Memoised per enclosing call; read errors leave the set as is.
+            """
+            nonlocal _remote_mount_targets_cache
+            if _remote_mount_targets_cache is not None:
+                return _remote_mount_targets_cache
+            _remote_mount_targets_cache = set()
+            try:
+                with open('/proc/mounts', 'r', encoding='utf-8', errors='replace') as f:
+                    for fields in map(str.split, f):
+                        # fields[1] is the mount point, fields[2] the fstype.
+                        if len(fields) < 3:
+                            continue
+                        kind = fields[2]
+                        # Match the same fstypes mount_monitor watches.
+                        if kind in ('nfs', 'nfs4', 'cifs', 'smb', 'smbfs') or kind.startswith(('nfs', 'cifs', 'smb')):
+                            _remote_mount_targets_cache.add(fields[1])
+            except OSError:
+                pass
+            return _remote_mount_targets_cache
+
        def get_pve_services_status():
            nonlocal _pve_services_cache
            if _pve_services_cache is None:
@@ -1617,9 +1680,72 @@ class HealthPersistence:
                    should_resolve = True
                    resolution_reason = 'No longer in cluster'
            
+            # === PVE STORAGE REMOVED ===
+            # Errors that name a PVE storage (storage_unavailable_<id>,
+            # pve_storage_full_<id>) outlive the storage itself when the
+            # user removes it from pvesm. Until this hook landed, the
+            # check function kept stat'ing /mnt/pve/<id> after every
+            # iteration, found the path missing, and persisted a fresh
+            # CRITICAL — reported by @UBLI-WLAN on June 4 2026.
+            if not should_resolve and error_key:
+                storage_match = None
+                if error_key.startswith('storage_unavailable_'):
+                    storage_match = error_key[len('storage_unavailable_'):]
+                elif error_key.startswith('pve_storage_full_'):
+                    storage_match = error_key[len('pve_storage_full_'):]
+                if storage_match:
+                    pvesm_set = get_pvesm_storages()
+                    # Only treat as removed when `pvesm status` ran AND
+                    # returned a non-empty list. An empty set could mean
+                    # pvesm timed out, in which case it's safer not to
+                    # resolve anything.
+                    if pvesm_set and storage_match not in pvesm_set:
+                        should_resolve = True
+                        resolution_reason = f'Storage {storage_match} removed from pvesm'
+
+            # === LXC MOUNT FOR DELETED CT ===
+            # `_check_lxc_mount_capacity` records
+            # `lxc_mount_<vmid>_<mount>`, which the VM/CT block above
+            # misses because the prefix isn't one of `vm_/ct_/vmct_`.
+            # When the CT is gone the disk-fill alert is meaningless.
+            if not should_resolve and error_key and error_key.startswith('lxc_mount_'):
+                # `lxc_mount_<vmid>_<mount-path-tokens>` — VMID is the
+                # first integer block after the prefix.
+                m = re.match(r'^lxc_mount_(\d+)_', error_key)
+                if m:
+                    lxc_vmid = m.group(1)
+                    if not check_vm_ct_cached(lxc_vmid):
+                        should_resolve = True
+                        resolution_reason = f'CT {lxc_vmid} no longer exists'
+
+            # === ORPHAN REMOTE MOUNT ===
+            # `_check_remote_mounts` records `mount_<status>_<target>`
+            # for every NFS/CIFS/SMB target that's in /proc/mounts but
+            # fails to stat. When the user removes the PVE storage,
+            # PVE often does a lazy umount: the kernel mount entry is
+            # gone (or the /mnt/pve/<id> target was deleted on top), so
+            # subsequent scans never see the mount again — but the
+            # already-persisted error has no auto-resolve path.
+            # Resolve the error when the target is no longer in
+            # /proc/mounts as a remote mount.
+            if not should_resolve and error_key and error_key.startswith('mount_'):
+                # `mount_stale_<target>` or `mount_readonly_<target>`
+                # — possibly LXC-scoped as `mount_<status>_ct<id>:<target>`.
+                stripped = error_key.split('_', 2)
+                if len(stripped) == 3:
+                    key_target = stripped[2]
+                    # LXC-scoped entries (`ct123:/mnt/foo`) are left for
+                    # the VM/CT cleanup path; the host-side reconciler
+                    # only owns host-level targets.
+                    if not key_target.startswith('ct'):
+                        targets = get_remote_mount_targets()
+                        if key_target not in targets:
+                            should_resolve = True
+                            resolution_reason = 'Remote mount no longer present (orphan auto-cleared)'
+
            # === TEMPERATURE ERRORS ===
            # Temperature errors - check if sensor still exists (unlikely to change, resolve after 24h of no activity)
-            elif category == 'temperature':
+            if not should_resolve and category == 'temperature':
                if last_seen_hours > 24:
                    should_resolve = True
                    resolution_reason = 'Temperature error stale (>24h no activity)'
--- a/AppImage/scripts/managed_installs.py
+++ b/AppImage/scripts/managed_installs.py
@@ -170,19 +170,46 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
 def _detect_coral_host() -> list[dict]:
    out: list[dict] = []

-    # PCIe / M.2 — gasket-dkms package version, falling back to the
-    # registered DKMS version if the package was force-removed but the
-    # built modules still exist.
+    # PCIe / M.2 — version detection has three sources, tried in this
+    # order of trust:
+    #
+    #   1. The marker file `/var/lib/proxmenux/coral_gasket_version`
+    #      written by `install_coral.sh` after a successful DKMS
+    #      install — contains the feranick release tag actually
+    #      installed (e.g. `1.0-18.4`). This is the only source that
+    #      knows the fork's patch level.
+    #   2. `dpkg-query gasket-dkms` — the Debian package version, only
+    #      present when the user installed via .deb rather than the
+    #      ProxMenux script.
+    #   3. `dkms status` — the upstream module version registered with
+    #      DKMS, which is always the bare `1.0`. Useful as a "modules
+    #      are present" indicator but doesn't reveal the fork patch
+    #      level, so the update-availability check would always fire a
+    #      false positive against feranick's `1.0-N` tags. Reported on
+    #      .50 after a successful re-install kept showing the update
+    #      notification.
    pcie_version: Optional[str] = None
    try:
-        r = subprocess.run(
-            ["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
-            capture_output=True, text=True, timeout=3,
-        )
-        if r.returncode == 0 and "ok installed" in r.stdout:
-            pcie_version = r.stdout.split("|", 1)[1].strip()
-    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+        with open("/var/lib/proxmenux/coral_gasket_version",
+                  "r", encoding="utf-8", errors="replace") as fh:
+            marker = fh.read().strip()
+            # Sanity check: the file should hold something that looks
+            # like a version tag, not an error message or empty line.
+            if marker and re.match(r"^[A-Za-z0-9._+-]+$", marker):
+                pcie_version = marker
+    except OSError:
        pass
+
+    if not pcie_version:
+        try:
+            r = subprocess.run(
+                ["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
+                capture_output=True, text=True, timeout=3,
+            )
+            if r.returncode == 0 and "ok installed" in r.stdout:
+                pcie_version = r.stdout.split("|", 1)[1].strip()
+        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+            pass
    if not pcie_version:
        try:
            r = subprocess.run(