mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-06-11 11:06:24 +00:00
Add beta 1.2.2.1
This commit is contained in:
@@ -4858,7 +4858,8 @@ def get_proxmox_vms():
|
||||
'netin': resource.get('netin', 0),
|
||||
'netout': resource.get('netout', 0),
|
||||
'diskread': resource.get('diskread', 0),
|
||||
'diskwrite': resource.get('diskwrite', 0)
|
||||
'diskwrite': resource.get('diskwrite', 0),
|
||||
'maxcpu': resource.get('maxcpu', 0)
|
||||
}
|
||||
# Decorate LXC rows with the apt update status if the
|
||||
# managed_installs registry has it. Absent key means
|
||||
@@ -7640,14 +7641,26 @@ def api_system():
|
||||
try:
|
||||
from health_monitor import health_monitor
|
||||
_hist = health_monitor.state_history.get('cpu_usage') or []
|
||||
cpu_usage = _hist[-1]['value'] if _hist else psutil.cpu_percent(interval=0.1)
|
||||
if _hist:
|
||||
_last = _hist[-1]
|
||||
cpu_usage = _last['value']
|
||||
cpu_user_pct = _last.get('user', 0)
|
||||
cpu_system_pct = _last.get('system', 0)
|
||||
else:
|
||||
cpu_usage = psutil.cpu_percent(interval=0.1)
|
||||
cpu_user_pct = 0
|
||||
cpu_system_pct = 0
|
||||
except Exception:
|
||||
cpu_usage = psutil.cpu_percent(interval=0.1)
|
||||
cpu_user_pct = 0
|
||||
cpu_system_pct = 0
|
||||
|
||||
memory = psutil.virtual_memory()
|
||||
memory_used_gb = memory.used / (1024 ** 3)
|
||||
memory_total_gb = memory.total / (1024 ** 3)
|
||||
memory_usage_percent = memory.percent
|
||||
# Preview restyle: cached + buffers in GB
|
||||
memory_cached_gb = round((getattr(memory, 'cached', 0) + getattr(memory, 'buffers', 0)) / (1024 ** 3), 1)
|
||||
|
||||
# Get temperature
|
||||
temp = get_cpu_temperature()
|
||||
@@ -7677,9 +7690,12 @@ def api_system():
|
||||
|
||||
return jsonify({
|
||||
'cpu_usage': round(cpu_usage, 1),
|
||||
'cpu_user': cpu_user_pct,
|
||||
'cpu_system': cpu_system_pct,
|
||||
'memory_usage': round(memory_usage_percent, 1),
|
||||
'memory_total': round(memory_total_gb, 1),
|
||||
'memory_used': round(memory_used_gb, 1),
|
||||
'memory_cached': memory_cached_gb,
|
||||
'temperature': temp,
|
||||
'temperature_sparkline': temp_sparkline,
|
||||
'uptime': uptime,
|
||||
@@ -9616,6 +9632,35 @@ def api_node_metrics():
|
||||
if 'zfsarc' not in item or item.get('zfsarc', 0) == 0:
|
||||
item['zfsarc'] = zfs_arc_size
|
||||
|
||||
# 24h downsampling: RRD returns ~1440 minute-level points which
|
||||
# plots as a dense thicket of vertical spikes. Group into 5-min
|
||||
# buckets and average each numeric field — same shape that
|
||||
# `get_temperature_history` uses for its 24h view so the look
|
||||
# is consistent across the dashboard's 24h charts.
|
||||
if timeframe == 'day' and rrd_data:
|
||||
bucket_seconds = 300 # 5-min
|
||||
buckets = {}
|
||||
for item in rrd_data:
|
||||
t = item.get('time')
|
||||
if t is None:
|
||||
continue
|
||||
bk = (int(t) // bucket_seconds) * bucket_seconds
|
||||
if bk not in buckets:
|
||||
buckets[bk] = {'_count': 0, '_sums': {}}
|
||||
b = buckets[bk]
|
||||
b['_count'] += 1
|
||||
for k, v in item.items():
|
||||
if k == 'time' or not isinstance(v, (int, float)) or isinstance(v, bool):
|
||||
continue
|
||||
b['_sums'][k] = b['_sums'].get(k, 0) + v
|
||||
rrd_data = []
|
||||
for bk in sorted(buckets.keys()):
|
||||
b = buckets[bk]
|
||||
point = {'time': bk}
|
||||
for k, total in b['_sums'].items():
|
||||
point[k] = total / b['_count']
|
||||
rrd_data.append(point)
|
||||
|
||||
payload = {
|
||||
'node': local_node,
|
||||
'timeframe': timeframe,
|
||||
|
||||
@@ -453,10 +453,19 @@ class HealthMonitor:
|
||||
"""Lightweight CPU sample: read usage % and append to history. ~30ms cost."""
|
||||
try:
|
||||
cpu_percent = psutil.cpu_percent(interval=0)
|
||||
try:
|
||||
_times = psutil.cpu_times_percent(interval=0)
|
||||
cpu_user = round(_times.user + getattr(_times, 'nice', 0), 1)
|
||||
cpu_system = round(_times.system + getattr(_times, 'irq', 0) + getattr(_times, 'softirq', 0), 1)
|
||||
except Exception:
|
||||
cpu_user = 0
|
||||
cpu_system = 0
|
||||
current_time = time.time()
|
||||
state_key = 'cpu_usage'
|
||||
self.state_history[state_key].append({
|
||||
'value': cpu_percent,
|
||||
'user': cpu_user,
|
||||
'system': cpu_system,
|
||||
'time': current_time
|
||||
})
|
||||
# Prune entries older than 6 minutes
|
||||
@@ -608,6 +617,71 @@ class HealthMonitor:
|
||||
|
||||
return self.cached_results[cache_key]
|
||||
|
||||
def _apply_dismiss_aware_status(self, check_block: Dict[str, Any]) -> None:
|
||||
"""In-place demote a check block's `status` to OK when every
|
||||
underlying error is already user-acknowledged.
|
||||
|
||||
Two flavours, matching how categories actually structure their
|
||||
output:
|
||||
|
||||
* Categories that aggregate inner checks (a `checks` dict whose
|
||||
values each hold an individual `error_key`) — every non-OK
|
||||
inner check must be acknowledged for the block to demote.
|
||||
This is how `_check_lxc_mount_capacity`, the storage block,
|
||||
the disk SMART block, etc. shape their results.
|
||||
* Categories with a single error_key at the top level (CPU
|
||||
hysteresis, certificates, the simpler updates rows) — that
|
||||
one error_key has to be acknowledged.
|
||||
|
||||
When the block demotes, we set ``status='OK'`` and stamp
|
||||
``all_dismissed=True`` so the front-end (`fetchHealthInfoCount`
|
||||
and the Health modal) can still surface the row as INFO if it
|
||||
wants — the data flow that used to derive "X categories with
|
||||
dismissed items" from `dismissed[]` keeps working unchanged.
|
||||
|
||||
No-op for blocks whose status is already OK / INFO / UNKNOWN —
|
||||
UNKNOWN intentionally never gets dismissed away because the
|
||||
user didn't ack a failing check, the check failed to run.
|
||||
"""
|
||||
if not isinstance(check_block, dict):
|
||||
return
|
||||
status = check_block.get('status', 'OK')
|
||||
if status not in ('WARNING', 'CRITICAL'):
|
||||
return
|
||||
|
||||
try:
|
||||
inner_checks = check_block.get('checks')
|
||||
if isinstance(inner_checks, dict) and inner_checks:
|
||||
any_unack = False
|
||||
for inner in inner_checks.values():
|
||||
if not isinstance(inner, dict):
|
||||
continue
|
||||
inner_status = inner.get('status', 'OK')
|
||||
if inner_status not in ('WARNING', 'CRITICAL'):
|
||||
continue
|
||||
ek = inner.get('error_key')
|
||||
if ek and health_persistence.is_error_acknowledged(ek):
|
||||
inner['dismissed'] = True
|
||||
if health_persistence.is_error_permanently_acknowledged(ek):
|
||||
inner['permanent'] = True
|
||||
else:
|
||||
any_unack = True
|
||||
if not any_unack:
|
||||
check_block['status'] = 'OK'
|
||||
check_block['all_dismissed'] = True
|
||||
return
|
||||
|
||||
ek = check_block.get('error_key')
|
||||
if ek and health_persistence.is_error_acknowledged(ek):
|
||||
check_block['dismissed'] = True
|
||||
if health_persistence.is_error_permanently_acknowledged(ek):
|
||||
check_block['permanent'] = True
|
||||
check_block['status'] = 'OK'
|
||||
check_block['all_dismissed'] = True
|
||||
except Exception as e:
|
||||
# Dismiss check should never crash the health pipeline.
|
||||
print(f"[HealthMonitor] _apply_dismiss_aware_status failed: {e}")
|
||||
|
||||
def get_overall_status(self) -> Dict[str, Any]:
|
||||
"""Get overall health status summary with minimal overhead"""
|
||||
details = self.get_detailed_status()
|
||||
@@ -993,7 +1067,42 @@ class HealthMonitor:
|
||||
pass
|
||||
else:
|
||||
self._unknown_counts[cat_key] = 0
|
||||
|
||||
|
||||
# --- Dismiss-aware re-derivation of issue lists (root fix for #228) ---
|
||||
# Each `_check_*` above already populated `details[<category>]` with
|
||||
# its raw status and pushed an entry into critical_issues /
|
||||
# warning_issues / info_issues. That raw status doesn't know which
|
||||
# error_keys the user has acknowledged, so a category whose only
|
||||
# remaining problems are all dismissed (e.g. nine permanently-
|
||||
# silenced LXC mount alerts) was still pushing the global `overall`
|
||||
# to CRITICAL. The popup's frontend rollup had to compensate for
|
||||
# this server-side gap, which is how the badge ("Critical" in the
|
||||
# header) and the panel ("0 Critical" inside) ended up disagreeing.
|
||||
#
|
||||
# Apply the existing per-block dismiss filter (`_annotate_dismissed`
|
||||
# downstream is the visual-merge cousin of this) to every
|
||||
# category, then rebuild the issue lists from the post-filter
|
||||
# statuses. The pre-existing inline appends are discarded — they
|
||||
# represented the pre-fix view.
|
||||
critical_issues = []
|
||||
warning_issues = []
|
||||
info_issues = []
|
||||
for cat_key in list(details.keys()):
|
||||
block = details.get(cat_key)
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
self._apply_dismiss_aware_status(block)
|
||||
status = block.get('status', 'OK')
|
||||
reason = (block.get('reason') or '').strip()
|
||||
label = cat_key.replace('_', ' ').capitalize()
|
||||
entry = f"{label}: {reason}" if reason else label
|
||||
if status == 'CRITICAL':
|
||||
critical_issues.append(entry)
|
||||
elif status == 'WARNING':
|
||||
warning_issues.append(entry)
|
||||
elif status == 'INFO':
|
||||
info_issues.append(entry)
|
||||
|
||||
# --- Determine Overall Status ---
|
||||
# Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK
|
||||
if critical_issues:
|
||||
|
||||
@@ -1367,6 +1367,8 @@ class HealthPersistence:
|
||||
_zfs_pools_cache = None
|
||||
_mount_points_cache = None
|
||||
_pve_services_cache = None
|
||||
_pvesm_storages_cache = None
|
||||
_remote_mount_targets_cache = None
|
||||
|
||||
def check_vm_ct_cached(vmid):
|
||||
if vmid not in _vm_ct_exists_cache:
|
||||
@@ -1445,7 +1447,68 @@ class HealthPersistence:
|
||||
except Exception:
|
||||
_mount_points_cache = set()
|
||||
return _mount_points_cache
|
||||
|
||||
|
||||
def get_pvesm_storages():
|
||||
"""Return the set of pvesm storage IDs currently configured.
|
||||
|
||||
Used to auto-resolve `storage_unavailable_*` and
|
||||
`pve_storage_full_*` errors after the user removes the
|
||||
corresponding entry from `pvesm`/Datacenter > Storage. The
|
||||
check function would otherwise keep firing on a path that
|
||||
no longer has any business existing.
|
||||
"""
|
||||
nonlocal _pvesm_storages_cache
|
||||
if _pvesm_storages_cache is None:
|
||||
_pvesm_storages_cache = set()
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['pvesm', 'status'],
|
||||
capture_output=True, text=True, timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.strip().split('\n')[1:]:
|
||||
parts = line.split()
|
||||
if parts:
|
||||
_pvesm_storages_cache.add(parts[0])
|
||||
except Exception:
|
||||
# On failure leave the cache as an empty set rather
|
||||
# than `None` — that prevents us from re-trying every
|
||||
# row in the active_errors loop, and the empty set
|
||||
# means we won't auto-resolve anything (safer than
|
||||
# falsely resolving when pvesm is momentarily down).
|
||||
_pvesm_storages_cache = set()
|
||||
return _pvesm_storages_cache
|
||||
return _pvesm_storages_cache
|
||||
|
||||
def get_remote_mount_targets():
|
||||
"""Return the set of mount targets currently in /proc/mounts
|
||||
for remote filesystems (NFS/CIFS/SMB).
|
||||
|
||||
Lets us tell apart a `mount_stale_<target>` whose underlying
|
||||
mount the user has umount'd (so the alert is now stale data
|
||||
that should self-clear) from one the user genuinely needs
|
||||
attention on (the mount is still active but the share is
|
||||
unreachable). Without this distinction the alert pinned
|
||||
forever once the user removed the PVE storage and lazy-
|
||||
umount'd it, which is the case @UBLI-WLAN reported.
|
||||
"""
|
||||
nonlocal _remote_mount_targets_cache
|
||||
if _remote_mount_targets_cache is None:
|
||||
_remote_mount_targets_cache = set()
|
||||
try:
|
||||
with open('/proc/mounts', 'r', encoding='utf-8', errors='replace') as f:
|
||||
for line in f:
|
||||
parts = line.strip().split()
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
fstype = parts[2]
|
||||
# Match the same fstypes mount_monitor watches.
|
||||
if fstype in ('nfs', 'nfs4', 'cifs', 'smb', 'smbfs') or fstype.startswith(('nfs', 'cifs', 'smb')):
|
||||
_remote_mount_targets_cache.add(parts[1])
|
||||
except OSError:
|
||||
pass
|
||||
return _remote_mount_targets_cache
|
||||
|
||||
def get_pve_services_status():
|
||||
nonlocal _pve_services_cache
|
||||
if _pve_services_cache is None:
|
||||
@@ -1617,9 +1680,72 @@ class HealthPersistence:
|
||||
should_resolve = True
|
||||
resolution_reason = 'No longer in cluster'
|
||||
|
||||
# === PVE STORAGE REMOVED ===
|
||||
# Errors that name a PVE storage (storage_unavailable_<id>,
|
||||
# pve_storage_full_<id>) outlive the storage itself when the
|
||||
# user removes it from pvesm. Until this hook landed, the
|
||||
# check function kept stat'ing /mnt/pve/<id> after every
|
||||
# iteration, found the path missing, and persisted a fresh
|
||||
# CRITICAL — reported by @UBLI-WLAN on June 4 2026.
|
||||
if not should_resolve and error_key:
|
||||
storage_match = None
|
||||
if error_key.startswith('storage_unavailable_'):
|
||||
storage_match = error_key[len('storage_unavailable_'):]
|
||||
elif error_key.startswith('pve_storage_full_'):
|
||||
storage_match = error_key[len('pve_storage_full_'):]
|
||||
if storage_match:
|
||||
pvesm_set = get_pvesm_storages()
|
||||
# Only treat as removed when `pvesm status` ran AND
|
||||
# returned a non-empty list. An empty set could mean
|
||||
# pvesm timed out, in which case it's safer not to
|
||||
# resolve anything.
|
||||
if pvesm_set and storage_match not in pvesm_set:
|
||||
should_resolve = True
|
||||
resolution_reason = f'Storage {storage_match} removed from pvesm'
|
||||
|
||||
# === LXC MOUNT FOR DELETED CT ===
|
||||
# `_check_lxc_mount_capacity` records
|
||||
# `lxc_mount_<vmid>_<mount>`, which the VM/CT block above
|
||||
# misses because the prefix isn't one of `vm_/ct_/vmct_`.
|
||||
# When the CT is gone the disk-fill alert is meaningless.
|
||||
if not should_resolve and error_key and error_key.startswith('lxc_mount_'):
|
||||
# `lxc_mount_<vmid>_<mount-path-tokens>` — VMID is the
|
||||
# first integer block after the prefix.
|
||||
m = re.match(r'^lxc_mount_(\d+)_', error_key)
|
||||
if m:
|
||||
lxc_vmid = m.group(1)
|
||||
if not check_vm_ct_cached(lxc_vmid):
|
||||
should_resolve = True
|
||||
resolution_reason = f'CT {lxc_vmid} no longer exists'
|
||||
|
||||
# === ORPHAN REMOTE MOUNT ===
|
||||
# `_check_remote_mounts` records `mount_<status>_<target>`
|
||||
# for every NFS/CIFS/SMB target that's in /proc/mounts but
|
||||
# fails to stat. When the user removes the PVE storage,
|
||||
# PVE often does a lazy umount: the kernel mount entry is
|
||||
# gone (or the /mnt/pve/<id> target was deleted on top), so
|
||||
# subsequent scans never see the mount again — but the
|
||||
# already-persisted error has no auto-resolve path.
|
||||
# Resolve the error when the target is no longer in
|
||||
# /proc/mounts as a remote mount.
|
||||
if not should_resolve and error_key and error_key.startswith('mount_'):
|
||||
# `mount_stale_<target>` or `mount_readonly_<target>`
|
||||
# — possibly LXC-scoped as `mount_<status>_ct<id>:<target>`.
|
||||
stripped = error_key.split('_', 2)
|
||||
if len(stripped) == 3:
|
||||
key_target = stripped[2]
|
||||
# LXC-scoped entries (`ct123:/mnt/foo`) are left for
|
||||
# the VM/CT cleanup path; the host-side reconciler
|
||||
# only owns host-level targets.
|
||||
if not key_target.startswith('ct'):
|
||||
targets = get_remote_mount_targets()
|
||||
if key_target not in targets:
|
||||
should_resolve = True
|
||||
resolution_reason = 'Remote mount no longer present (orphan auto-cleared)'
|
||||
|
||||
# === TEMPERATURE ERRORS ===
|
||||
# Temperature errors - check if sensor still exists (unlikely to change, resolve after 24h of no activity)
|
||||
elif category == 'temperature':
|
||||
if not should_resolve and category == 'temperature':
|
||||
if last_seen_hours > 24:
|
||||
should_resolve = True
|
||||
resolution_reason = 'Temperature error stale (>24h no activity)'
|
||||
|
||||
@@ -170,19 +170,46 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
|
||||
def _detect_coral_host() -> list[dict]:
|
||||
out: list[dict] = []
|
||||
|
||||
# PCIe / M.2 — gasket-dkms package version, falling back to the
|
||||
# registered DKMS version if the package was force-removed but the
|
||||
# built modules still exist.
|
||||
# PCIe / M.2 — version detection has three sources, tried in this
|
||||
# order of trust:
|
||||
#
|
||||
# 1. The marker file `/var/lib/proxmenux/coral_gasket_version`
|
||||
# written by `install_coral.sh` after a successful DKMS
|
||||
# install — contains the feranick release tag actually
|
||||
# installed (e.g. `1.0-18.4`). This is the only source that
|
||||
# knows the fork's patch level.
|
||||
# 2. `dpkg-query gasket-dkms` — the Debian package version, only
|
||||
# present when the user installed via .deb rather than the
|
||||
# ProxMenux script.
|
||||
# 3. `dkms status` — the upstream module version registered with
|
||||
# DKMS, which is always the bare `1.0`. Useful as a "modules
|
||||
# are present" indicator but doesn't reveal the fork patch
|
||||
# level, so the update-availability check would always fire a
|
||||
# false positive against feranick's `1.0-N` tags. Reported on
|
||||
# .50 after a successful re-install kept showing the update
|
||||
# notification.
|
||||
pcie_version: Optional[str] = None
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
|
||||
capture_output=True, text=True, timeout=3,
|
||||
)
|
||||
if r.returncode == 0 and "ok installed" in r.stdout:
|
||||
pcie_version = r.stdout.split("|", 1)[1].strip()
|
||||
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
||||
with open("/var/lib/proxmenux/coral_gasket_version",
|
||||
"r", encoding="utf-8", errors="replace") as fh:
|
||||
marker = fh.read().strip()
|
||||
# Sanity check: the file should hold something that looks
|
||||
# like a version tag, not an error message or empty line.
|
||||
if marker and re.match(r"^[A-Za-z0-9._+-]+$", marker):
|
||||
pcie_version = marker
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
if not pcie_version:
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
|
||||
capture_output=True, text=True, timeout=3,
|
||||
)
|
||||
if r.returncode == 0 and "ok installed" in r.stdout:
|
||||
pcie_version = r.stdout.split("|", 1)[1].strip()
|
||||
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
if not pcie_version:
|
||||
try:
|
||||
r = subprocess.run(
|
||||
|
||||
Reference in New Issue
Block a user