Add beta 1.2.2.1

This commit is contained in:
MacRimi
2026-06-05 17:12:23 +02:00
parent e855fca0b3
commit 3629fe8848
12 changed files with 907 additions and 325 deletions

View File

@@ -4858,7 +4858,8 @@ def get_proxmox_vms():
'netin': resource.get('netin', 0),
'netout': resource.get('netout', 0),
'diskread': resource.get('diskread', 0),
'diskwrite': resource.get('diskwrite', 0)
'diskwrite': resource.get('diskwrite', 0),
'maxcpu': resource.get('maxcpu', 0)
}
# Decorate LXC rows with the apt update status if the
# managed_installs registry has it. Absent key means
@@ -7640,14 +7641,26 @@ def api_system():
try:
from health_monitor import health_monitor
_hist = health_monitor.state_history.get('cpu_usage') or []
cpu_usage = _hist[-1]['value'] if _hist else psutil.cpu_percent(interval=0.1)
if _hist:
_last = _hist[-1]
cpu_usage = _last['value']
cpu_user_pct = _last.get('user', 0)
cpu_system_pct = _last.get('system', 0)
else:
cpu_usage = psutil.cpu_percent(interval=0.1)
cpu_user_pct = 0
cpu_system_pct = 0
except Exception:
cpu_usage = psutil.cpu_percent(interval=0.1)
cpu_user_pct = 0
cpu_system_pct = 0
memory = psutil.virtual_memory()
memory_used_gb = memory.used / (1024 ** 3)
memory_total_gb = memory.total / (1024 ** 3)
memory_usage_percent = memory.percent
# Preview restyle: cached + buffers in GB
memory_cached_gb = round((getattr(memory, 'cached', 0) + getattr(memory, 'buffers', 0)) / (1024 ** 3), 1)
# Get temperature
temp = get_cpu_temperature()
@@ -7677,9 +7690,12 @@ def api_system():
return jsonify({
'cpu_usage': round(cpu_usage, 1),
'cpu_user': cpu_user_pct,
'cpu_system': cpu_system_pct,
'memory_usage': round(memory_usage_percent, 1),
'memory_total': round(memory_total_gb, 1),
'memory_used': round(memory_used_gb, 1),
'memory_cached': memory_cached_gb,
'temperature': temp,
'temperature_sparkline': temp_sparkline,
'uptime': uptime,
@@ -9616,6 +9632,35 @@ def api_node_metrics():
if 'zfsarc' not in item or item.get('zfsarc', 0) == 0:
item['zfsarc'] = zfs_arc_size
# 24h downsampling: RRD returns ~1440 minute-level points which
# plots as a dense thicket of vertical spikes. Group into 5-min
# buckets and average each numeric field — same shape that
# `get_temperature_history` uses for its 24h view so the look
# is consistent across the dashboard's 24h charts.
if timeframe == 'day' and rrd_data:
bucket_seconds = 300 # 5-min
buckets = {}
for item in rrd_data:
t = item.get('time')
if t is None:
continue
bk = (int(t) // bucket_seconds) * bucket_seconds
if bk not in buckets:
buckets[bk] = {'_count': 0, '_sums': {}}
b = buckets[bk]
b['_count'] += 1
for k, v in item.items():
if k == 'time' or not isinstance(v, (int, float)) or isinstance(v, bool):
continue
b['_sums'][k] = b['_sums'].get(k, 0) + v
rrd_data = []
for bk in sorted(buckets.keys()):
b = buckets[bk]
point = {'time': bk}
for k, total in b['_sums'].items():
point[k] = total / b['_count']
rrd_data.append(point)
payload = {
'node': local_node,
'timeframe': timeframe,

View File

@@ -453,10 +453,19 @@ class HealthMonitor:
"""Lightweight CPU sample: read usage % and append to history. ~30ms cost."""
try:
cpu_percent = psutil.cpu_percent(interval=0)
try:
_times = psutil.cpu_times_percent(interval=0)
cpu_user = round(_times.user + getattr(_times, 'nice', 0), 1)
cpu_system = round(_times.system + getattr(_times, 'irq', 0) + getattr(_times, 'softirq', 0), 1)
except Exception:
cpu_user = 0
cpu_system = 0
current_time = time.time()
state_key = 'cpu_usage'
self.state_history[state_key].append({
'value': cpu_percent,
'user': cpu_user,
'system': cpu_system,
'time': current_time
})
# Prune entries older than 6 minutes
@@ -608,6 +617,71 @@ class HealthMonitor:
return self.cached_results[cache_key]
def _apply_dismiss_aware_status(self, check_block: Dict[str, Any]) -> None:
    """Demote a check block's ``status`` to OK, in place, when every
    underlying error has already been acknowledged by the user.

    Two shapes of block are handled:

    * Aggregating blocks: a non-empty ``checks`` dict whose values each
      carry their own ``status`` and ``error_key``. Every inner check
      that is WARNING/CRITICAL must be acknowledged for the block to
      demote. Acknowledged inner checks are stamped ``dismissed=True``
      (and ``permanent=True`` when the acknowledgement is permanent).
    * Single-key blocks: one ``error_key`` at the top level, which has
      to be acknowledged.

    A block that carries a ``checks`` dict but has *no* failing inner
    check is not treated as "everything dismissed": there is nothing
    the user could have acknowledged, so the top-level ``error_key``
    rule decides instead. Without this, a WARNING/CRITICAL block whose
    inner checks are all OK would be silently demoted.

    On demotion the block gets ``status='OK'`` and
    ``all_dismissed=True`` so consumers can still tell the row apart
    from a genuinely healthy one.

    Args:
        check_block: The category result dict to inspect and mutate.
            Anything that is not a dict is ignored.

    Returns:
        None. The block is modified in place. Blocks whose status is
        not WARNING or CRITICAL (OK, INFO, UNKNOWN, ...) are left
        untouched.
    """
    actionable = ('WARNING', 'CRITICAL')

    if not isinstance(check_block, dict):
        return
    if check_block.get('status', 'OK') not in actionable:
        return

    try:
        inner_checks = check_block.get('checks')
        if isinstance(inner_checks, dict) and inner_checks:
            failing = [
                inner for inner in inner_checks.values()
                if isinstance(inner, dict)
                and inner.get('status', 'OK') in actionable
            ]
            if failing:
                any_unack = False
                for inner in failing:
                    ek = inner.get('error_key')
                    if ek and health_persistence.is_error_acknowledged(ek):
                        inner['dismissed'] = True
                        if health_persistence.is_error_permanently_acknowledged(ek):
                            inner['permanent'] = True
                    else:
                        # No key, or key not acknowledged: keep the
                        # block at its current severity.
                        any_unack = True
                if not any_unack:
                    check_block['status'] = 'OK'
                    check_block['all_dismissed'] = True
                return
            # No failing inner check explains the block's severity, so
            # fall through to the top-level error_key rule below.

        ek = check_block.get('error_key')
        if ek and health_persistence.is_error_acknowledged(ek):
            check_block['dismissed'] = True
            if health_persistence.is_error_permanently_acknowledged(ek):
                check_block['permanent'] = True
            check_block['status'] = 'OK'
            check_block['all_dismissed'] = True
    except Exception as e:
        # Dismiss check should never crash the health pipeline.
        print(f"[HealthMonitor] _apply_dismiss_aware_status failed: {e}")
def get_overall_status(self) -> Dict[str, Any]:
"""Get overall health status summary with minimal overhead"""
details = self.get_detailed_status()
@@ -993,7 +1067,42 @@ class HealthMonitor:
pass
else:
self._unknown_counts[cat_key] = 0
# --- Dismiss-aware re-derivation of issue lists (root fix for #228) ---
# Each `_check_*` above already populated `details[<category>]` with
# its raw status and pushed an entry into critical_issues /
# warning_issues / info_issues. That raw status doesn't know which
# error_keys the user has acknowledged, so a category whose only
# remaining problems are all dismissed (e.g. nine permanently-
# silenced LXC mount alerts) was still pushing the global `overall`
# to CRITICAL. The popup's frontend rollup had to compensate for
# this server-side gap, which is how the badge ("Critical" in the
# header) and the panel ("0 Critical" inside) ended up disagreeing.
#
# Apply the existing per-block dismiss filter (`_annotate_dismissed`
# downstream is the visual-merge cousin of this) to every
# category, then rebuild the issue lists from the post-filter
# statuses. The pre-existing inline appends are discarded — they
# represented the pre-fix view.
critical_issues = []
warning_issues = []
info_issues = []
for cat_key in list(details.keys()):
block = details.get(cat_key)
if not isinstance(block, dict):
continue
self._apply_dismiss_aware_status(block)
status = block.get('status', 'OK')
reason = (block.get('reason') or '').strip()
label = cat_key.replace('_', ' ').capitalize()
entry = f"{label}: {reason}" if reason else label
if status == 'CRITICAL':
critical_issues.append(entry)
elif status == 'WARNING':
warning_issues.append(entry)
elif status == 'INFO':
info_issues.append(entry)
# --- Determine Overall Status ---
# Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK
if critical_issues:

View File

@@ -1367,6 +1367,8 @@ class HealthPersistence:
_zfs_pools_cache = None
_mount_points_cache = None
_pve_services_cache = None
_pvesm_storages_cache = None
_remote_mount_targets_cache = None
def check_vm_ct_cached(vmid):
if vmid not in _vm_ct_exists_cache:
@@ -1445,7 +1447,68 @@ class HealthPersistence:
except Exception:
_mount_points_cache = set()
return _mount_points_cache
def get_pvesm_storages():
"""Return the set of pvesm storage IDs currently configured.
Used to auto-resolve `storage_unavailable_*` and
`pve_storage_full_*` errors after the user removes the
corresponding entry from `pvesm`/Datacenter > Storage. The
check function would otherwise keep firing on a path that
no longer has any business existing.
"""
nonlocal _pvesm_storages_cache
if _pvesm_storages_cache is None:
_pvesm_storages_cache = set()
try:
result = subprocess.run(
['pvesm', 'status'],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0:
for line in result.stdout.strip().split('\n')[1:]:
parts = line.split()
if parts:
_pvesm_storages_cache.add(parts[0])
except Exception:
# On failure leave the cache as an empty set rather
# than `None` — that prevents us from re-trying every
# row in the active_errors loop, and the empty set
# means we won't auto-resolve anything (safer than
# falsely resolving when pvesm is momentarily down).
_pvesm_storages_cache = set()
return _pvesm_storages_cache
return _pvesm_storages_cache
def get_remote_mount_targets():
"""Return the set of mount targets currently in /proc/mounts
for remote filesystems (NFS/CIFS/SMB).
Lets us tell apart a `mount_stale_<target>` whose underlying
mount the user has umount'd (so the alert is now stale data
that should self-clear) from one the user genuinely needs
attention on (the mount is still active but the share is
unreachable). Without this distinction the alert pinned
forever once the user removed the PVE storage and lazy-
umount'd it, which is the case @UBLI-WLAN reported.
"""
nonlocal _remote_mount_targets_cache
if _remote_mount_targets_cache is None:
_remote_mount_targets_cache = set()
try:
with open('/proc/mounts', 'r', encoding='utf-8', errors='replace') as f:
for line in f:
parts = line.strip().split()
if len(parts) < 3:
continue
fstype = parts[2]
# Match the same fstypes mount_monitor watches.
if fstype in ('nfs', 'nfs4', 'cifs', 'smb', 'smbfs') or fstype.startswith(('nfs', 'cifs', 'smb')):
_remote_mount_targets_cache.add(parts[1])
except OSError:
pass
return _remote_mount_targets_cache
def get_pve_services_status():
nonlocal _pve_services_cache
if _pve_services_cache is None:
@@ -1617,9 +1680,72 @@ class HealthPersistence:
should_resolve = True
resolution_reason = 'No longer in cluster'
# === PVE STORAGE REMOVED ===
# Errors that name a PVE storage (storage_unavailable_<id>,
# pve_storage_full_<id>) outlive the storage itself when the
# user removes it from pvesm. Until this hook landed, the
# check function kept stat'ing /mnt/pve/<id> after every
# iteration, found the path missing, and persisted a fresh
# CRITICAL — reported by @UBLI-WLAN on June 4 2026.
if not should_resolve and error_key:
storage_match = None
if error_key.startswith('storage_unavailable_'):
storage_match = error_key[len('storage_unavailable_'):]
elif error_key.startswith('pve_storage_full_'):
storage_match = error_key[len('pve_storage_full_'):]
if storage_match:
pvesm_set = get_pvesm_storages()
# Only treat as removed when `pvesm status` ran AND
# returned a non-empty list. An empty set could mean
# pvesm timed out, in which case it's safer not to
# resolve anything.
if pvesm_set and storage_match not in pvesm_set:
should_resolve = True
resolution_reason = f'Storage {storage_match} removed from pvesm'
# === LXC MOUNT FOR DELETED CT ===
# `_check_lxc_mount_capacity` records
# `lxc_mount_<vmid>_<mount>`, which the VM/CT block above
# misses because the prefix isn't one of `vm_/ct_/vmct_`.
# When the CT is gone the disk-fill alert is meaningless.
if not should_resolve and error_key and error_key.startswith('lxc_mount_'):
# `lxc_mount_<vmid>_<mount-path-tokens>` — VMID is the
# first integer block after the prefix.
m = re.match(r'^lxc_mount_(\d+)_', error_key)
if m:
lxc_vmid = m.group(1)
if not check_vm_ct_cached(lxc_vmid):
should_resolve = True
resolution_reason = f'CT {lxc_vmid} no longer exists'
# === ORPHAN REMOTE MOUNT ===
# `_check_remote_mounts` records `mount_<status>_<target>`
# for every NFS/CIFS/SMB target that's in /proc/mounts but
# fails to stat. When the user removes the PVE storage,
# PVE often does a lazy umount: the kernel mount entry is
# gone (or the /mnt/pve/<id> target was deleted on top), so
# subsequent scans never see the mount again — but the
# already-persisted error has no auto-resolve path.
# Resolve the error when the target is no longer in
# /proc/mounts as a remote mount.
if not should_resolve and error_key and error_key.startswith('mount_'):
# `mount_stale_<target>` or `mount_readonly_<target>`
# — possibly LXC-scoped as `mount_<status>_ct<id>:<target>`.
stripped = error_key.split('_', 2)
if len(stripped) == 3:
key_target = stripped[2]
# LXC-scoped entries (`ct123:/mnt/foo`) are left for
# the VM/CT cleanup path; the host-side reconciler
# only owns host-level targets.
if not key_target.startswith('ct'):
targets = get_remote_mount_targets()
if key_target not in targets:
should_resolve = True
resolution_reason = 'Remote mount no longer present (orphan auto-cleared)'
# === TEMPERATURE ERRORS ===
# Temperature errors - check if sensor still exists (unlikely to change, resolve after 24h of no activity)
elif category == 'temperature':
if not should_resolve and category == 'temperature':
if last_seen_hours > 24:
should_resolve = True
resolution_reason = 'Temperature error stale (>24h no activity)'

View File

@@ -170,19 +170,46 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
def _detect_coral_host() -> list[dict]:
out: list[dict] = []
# PCIe / M.2 — gasket-dkms package version, falling back to the
# registered DKMS version if the package was force-removed but the
# built modules still exist.
# PCIe / M.2 — version detection has three sources, tried in this
# order of trust:
#
# 1. The marker file `/var/lib/proxmenux/coral_gasket_version`
# written by `install_coral.sh` after a successful DKMS
# install — contains the feranick release tag actually
# installed (e.g. `1.0-18.4`). This is the only source that
# knows the fork's patch level.
# 2. `dpkg-query gasket-dkms` — the Debian package version, only
# present when the user installed via .deb rather than the
# ProxMenux script.
# 3. `dkms status` — the upstream module version registered with
# DKMS, which is always the bare `1.0`. Useful as a "modules
# are present" indicator but doesn't reveal the fork patch
# level, so the update-availability check would always fire a
# false positive against feranick's `1.0-N` tags. Reported on
# .50 after a successful re-install kept showing the update
# notification.
pcie_version: Optional[str] = None
try:
r = subprocess.run(
["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
capture_output=True, text=True, timeout=3,
)
if r.returncode == 0 and "ok installed" in r.stdout:
pcie_version = r.stdout.split("|", 1)[1].strip()
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
with open("/var/lib/proxmenux/coral_gasket_version",
"r", encoding="utf-8", errors="replace") as fh:
marker = fh.read().strip()
# Sanity check: the file should hold something that looks
# like a version tag, not an error message or empty line.
if marker and re.match(r"^[A-Za-z0-9._+-]+$", marker):
pcie_version = marker
except OSError:
pass
if not pcie_version:
try:
r = subprocess.run(
["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
capture_output=True, text=True, timeout=3,
)
if r.returncode == 0 and "ok installed" in r.stdout:
pcie_version = r.stdout.split("|", 1)[1].strip()
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
pass
if not pcie_version:
try:
r = subprocess.run(