Files
ProxMenux/AppImage/scripts/hardware_monitor.py

456 lines
18 KiB
Python
Raw Normal View History

2025-10-06 17:25:08 +02:00
#!/usr/bin/env python3
2025-11-26 18:00:01 +01:00
"""
2026-01-29 18:07:49 +01:00
Hardware Monitor - Detección exhaustiva de hardware
2026-01-29 18:21:23 +01:00
Fusiona:
1. Consumo de CPU (RAPL)
2. Detección de GPU (Intel/NVIDIA/AMD) y métricas detalladas
3. Controladoras HBA/RAID y sus temperaturas
4. Sensores IPMI (Ventiladores/Energía) y UPS (NUT)
5. Información base (CPU, RAM, Placa base)
2025-11-26 18:00:01 +01:00
"""
2025-10-06 17:25:08 +02:00
2025-11-26 18:00:01 +01:00
import os
import time
2025-11-26 18:44:37 +01:00
import subprocess
import re
2026-01-29 18:07:49 +01:00
import json
import shutil
import select
import psutil
import xml.etree.ElementTree as ET
2025-11-26 18:00:01 +01:00
from typing import Dict, Any, Optional
2025-10-06 17:25:08 +02:00
2026-01-29 18:07:49 +01:00
# --- Variables Globales ---
2025-11-26 18:00:01 +01:00
_last_energy_reading = {'energy_uj': None, 'timestamp': None}
2025-10-06 17:25:08 +02:00
2026-01-29 18:07:49 +01:00
# --- Funciones Auxiliares de GPU ---
def identify_gpu_type(name, vendor=None, bus=None, driver=None):
"""Determina si una GPU es Integrada o Dedicada (PCI)."""
n = (name or "").lower()
v = (vendor or "").lower()
d = (driver or "").lower()
2026-01-29 18:21:23 +01:00
2026-01-29 18:07:49 +01:00
bmc_keywords = ['aspeed', 'ast', 'matrox g200', 'g200e', 'mgag200']
if any(k in n for k in bmc_keywords) or v in ['aspeed', 'matrox']:
return 'Integrated'
if 'intel' in v or 'intel corporation' in n:
if d == 'i915' or any(w in n for w in ['uhd graphics', 'iris', 'integrated']):
return 'Integrated'
return 'Integrated' # Asumir integrada por defecto para Intel en servidores
amd_apu = ['radeon 780m', 'vega', 'renoir', 'cezanne', 'rembrandt']
if 'amd' in v and any(k in n for k in amd_apu):
return 'Integrated'
return 'PCI'
2025-10-06 17:25:08 +02:00
2026-01-29 18:07:49 +01:00
def get_intel_gpu_processes_from_text():
2025-11-26 18:44:37 +01:00
"""
2026-01-29 18:07:49 +01:00
Parsea procesos de intel_gpu_top desde salida de texto
(fallback cuando JSON falla).
2025-11-26 18:44:37 +01:00
"""
try:
2026-01-29 18:07:49 +01:00
process = subprocess.Popen(['intel_gpu_top'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
time.sleep(2)
process.terminate()
try: stdout, _ = process.communicate(timeout=1)
except:
process.kill()
stdout, _ = process.communicate()
2025-11-26 18:44:37 +01:00
2026-01-29 18:07:49 +01:00
processes = []
lines = stdout.split('\n')
header_found = False
for i, line in enumerate(lines):
if 'PID' in line and 'NAME' in line:
header_found = True
for proc_line in lines[i+1:]:
parts = proc_line.split()
if len(parts) >= 8:
try:
# Parseo simplificado
name = parts[-1]
pid = parts[0]
if pid.isdigit():
processes.append({
'name': name, 'pid': pid,
'memory': {'total': 0, 'resident': 0},
'engines': {'Render/3D': 'Active'} # Estimado
})
except: continue
break
return processes
except: return []
# --- Funciones Principales de GPU ---
2026-01-29 18:21:23 +01:00
def get_pci_gpu_map() -> Dict[str, Dict[str, str]]:
"""
Obtiene un mapa detallado de GPUs desde lspci.
Útil para enriquecer datos con nombres completos de dispositivos.
"""
gpu_map = {}
2026-01-29 18:07:49 +01:00
try:
2026-01-29 18:21:23 +01:00
result = subprocess.run(['lspci', '-nn'], capture_output=True, text=True, timeout=5)
2025-11-26 18:44:37 +01:00
if result.returncode == 0:
for line in result.stdout.split('\n'):
2026-01-29 18:07:49 +01:00
if any(k in line for k in ['VGA compatible', '3D controller', 'Display controller']):
2026-01-29 18:21:23 +01:00
match = re.match(r'^([0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f])\s+.*:\s+(.+?)\s+\[([0-9a-f]{4}):([0-9a-f]{4})\]', line)
if match:
pci = match.group(1)
name = match.group(2).strip()
2026-01-29 18:07:49 +01:00
vendor = 'Unknown'
if 'NVIDIA' in name.upper(): vendor = 'NVIDIA'
elif 'AMD' in name.upper() or 'ATI' in name.upper(): vendor = 'AMD'
elif 'INTEL' in name.upper(): vendor = 'Intel'
2025-11-26 18:44:37 +01:00
2026-01-29 18:21:23 +01:00
gpu_map[pci] = {'vendor': vendor, 'name': name, 'full_name': line}
except Exception: pass
return gpu_map
def get_gpu_info():
"""Detecta GPUs instaladas para la API."""
gpus = []
try:
res = subprocess.run(['lspci'], capture_output=True, text=True)
for line in res.stdout.split('\n'):
if any(x in line for x in ['VGA', '3D', 'Display']):
parts = line.split(' ', 1)
if len(parts) >= 2:
slot = parts[0]
rest = parts[1]
name = rest.split(':', 1)[1].strip() if ':' in rest else rest.strip()
vendor = 'Unknown'
if 'NVIDIA' in name.upper(): vendor = 'NVIDIA'
elif 'AMD' in name.upper(): vendor = 'AMD'
elif 'INTEL' in name.upper(): vendor = 'Intel'
gpus.append({
'slot': slot,
'name': name,
'vendor': vendor,
'type': identify_gpu_type(name, vendor)
})
2026-01-29 18:07:49 +01:00
except: pass
return gpus
2025-11-26 18:44:37 +01:00
2026-01-29 18:07:49 +01:00
def get_detailed_gpu_info(gpu):
2025-11-26 18:00:01 +01:00
"""
2026-01-29 18:07:49 +01:00
Obtiene métricas en tiempo real (Temp, Uso, VRAM, Power)
usando herramientas específicas del vendor (nvidia-smi, intel_gpu_top).
2025-11-26 18:00:01 +01:00
"""
2026-01-29 18:07:49 +01:00
vendor = gpu.get('vendor', '').lower()
info = {
'has_monitoring_tool': False, 'temperature': None, 'fan_speed': None,
'utilization_gpu': None, 'memory_used': None, 'memory_total': None,
'power_draw': None, 'processes': []
}
2025-11-26 12:27:25 +01:00
2026-01-29 18:07:49 +01:00
# --- NVIDIA ---
if 'nvidia' in vendor and shutil.which('nvidia-smi'):
try:
cmd = ['nvidia-smi', '-q', '-x']
res = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
if res.returncode == 0:
root = ET.fromstring(res.stdout)
gpu_elem = root.find('gpu')
if gpu_elem:
info['has_monitoring_tool'] = True
temp = gpu_elem.find('.//temperature/gpu_temp')
if temp is not None: info['temperature'] = int(temp.text.replace(' C', ''))
fan = gpu_elem.find('.//fan_speed')
if fan is not None and fan.text != 'N/A': info['fan_speed'] = int(fan.text.replace(' %', ''))
power = gpu_elem.find('.//gpu_power_readings/instant_power_draw')
if power is not None and power.text != 'N/A': info['power_draw'] = power.text
util = gpu_elem.find('.//utilization/gpu_util')
if util is not None: info['utilization_gpu'] = util.text
mem_used = gpu_elem.find('.//fb_memory_usage/used')
if mem_used is not None: info['memory_used'] = mem_used.text
mem_total = gpu_elem.find('.//fb_memory_usage/total')
if mem_total is not None: info['memory_total'] = mem_total.text
procs = gpu_elem.find('.//processes')
if procs is not None:
for p in procs.findall('process_info'):
info['processes'].append({
'pid': p.find('pid').text,
'name': p.find('process_name').text,
'memory': p.find('used_memory').text
})
except: pass
# --- INTEL ---
elif 'intel' in vendor:
tool = shutil.which('intel_gpu_top')
if tool:
try:
# Intenta ejecutar JSON output
env = os.environ.copy()
env['TERM'] = 'xterm'
proc = subprocess.Popen([tool, '-J'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env)
# Leer brevemente
time.sleep(1.5)
proc.terminate()
try: stdout, _ = proc.communicate(timeout=0.5)
except:
proc.kill()
stdout, _ = proc.communicate()
# Parsear último JSON válido
json_objs = []
buffer = ""
brace = 0
for char in stdout:
if char == '{': brace += 1
if brace > 0: buffer += char
if char == '}':
brace -= 1
if brace == 0:
try: json_objs.append(json.loads(buffer))
except: pass
buffer = ""
if json_objs:
data = json_objs[-1]
info['has_monitoring_tool'] = True
if 'engines' in data:
max_usage = 0.0
for k, v in data['engines'].items():
val = float(v.get('busy', 0))
if val > max_usage: max_usage = val
info['utilization_gpu'] = f"{max_usage:.1f}%"
if 'power' in data:
info['power_draw'] = f"{data['power'].get('Package', 0):.2f} W"
if 'frequency' in data:
info['clock_graphics'] = f"{data['frequency'].get('actual', 0)} MHz"
except:
# Fallback procesos texto
info['processes'] = get_intel_gpu_processes_from_text()
if info['processes']: info['has_monitoring_tool'] = True
return info
def get_gpu_realtime_data(slot):
"""Encuentra una GPU por slot y devuelve sus datos en tiempo real."""
gpus = get_gpu_info()
target = None
for g in gpus:
if g['slot'] == slot or slot in g.get('slot', ''):
target = g
break
if target:
details = get_detailed_gpu_info(target)
target.update(details)
return target
return None
2026-01-29 18:21:23 +01:00
# --- RAPL Power (CPU) ---
def get_power_info() -> Optional[Dict[str, Any]]:
"""Obtiene consumo de CPU Intel via RAPL."""
global _last_energy_reading
rapl_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj'
if os.path.exists(rapl_path):
try:
with open(rapl_path, 'r') as f: current_uj = int(f.read().strip())
current_time = time.time()
watts = 0.0
if _last_energy_reading['energy_uj'] and _last_energy_reading['timestamp']:
tdiff = current_time - _last_energy_reading['timestamp']
ediff = current_uj - _last_energy_reading['energy_uj']
if tdiff > 0:
if ediff < 0: ediff = current_uj # Overflow handling
watts = round((ediff / tdiff) / 1000000, 2)
_last_energy_reading = {'energy_uj': current_uj, 'timestamp': current_time}
cpu_vendor = 'CPU'
try:
with open('/proc/cpuinfo', 'r') as f:
if 'GenuineIntel' in f.read(): cpu_vendor = 'Intel'
else: cpu_vendor = 'AMD'
except: pass
return {'name': 'CPU Power', 'watts': watts, 'adapter': f'{cpu_vendor} RAPL'}
except: pass
return None
# --- HBA / RAID Logic ---
def get_hba_info() -> list[Dict[str, Any]]:
"""Detecta controladoras HBA/RAID."""
hba_list = []
try:
result = subprocess.run(['lspci', '-nn'], capture_output=True, text=True, timeout=5)
if result.returncode == 0:
controller_id = 0
for line in result.stdout.split('\n'):
if any(k in line for k in ['RAID bus controller', 'SCSI storage controller', 'Serial Attached SCSI']):
match = re.match(r'^([0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f])\s+.*:\s+(.+?)\s+\[([0-9a-f]{4}):([0-9a-f]{4})\]', line)
if match:
pci = match.group(1)
name = match.group(2).strip()
vendor = 'Unknown'
if 'LSI' in name.upper() or 'BROADCOM' in name.upper() or 'AVAGO' in name.upper(): vendor = 'LSI/Broadcom'
elif 'ADAPTEC' in name.upper(): vendor = 'Adaptec'
elif 'HP' in name.upper(): vendor = 'HP'
elif 'DELL' in name.upper(): vendor = 'Dell'
model = name
for v in ['Broadcom / LSI', 'Broadcom', 'LSI Logic', 'LSI']:
if model.startswith(v): model = model[len(v):].strip()
hba_list.append({
'pci_address': pci, 'vendor': vendor, 'model': model,
'controller_id': controller_id, 'full_name': name
})
controller_id += 1
except: pass
return hba_list
def get_hba_temperatures() -> list[Dict[str, Any]]:
"""Obtiene temperaturas de HBA (storcli/megacli)."""
temperatures = []
storcli_paths = ['/usr/sbin/storcli64', '/opt/MegaRAID/storcli/storcli64', 'storcli64']
storcli = next((p for p in storcli_paths if shutil.which(p) or os.path.exists(p)), None)
if storcli:
try:
# Intenta leer el controlador 0 como ejemplo básico
res = subprocess.run([storcli, '/c0', 'show', 'temperature'], capture_output=True, text=True, timeout=5)
for line in res.stdout.split('\n'):
if 'ROC temperature' in line or 'Controller Temp' in line:
match = re.search(r'(\d+)\s*C', line)
if match:
temperatures.append({
'name': 'HBA Controller 0',
'temperature': int(match.group(1)),
'adapter': 'LSI/Broadcom'
})
except: pass
return temperatures
# --- IPMI & UPS ---
2026-01-29 18:07:49 +01:00
def get_ipmi_fans():
"""Obtiene ventiladores via ipmitool."""
fans = []
if shutil.which('ipmitool'):
try:
res = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=5)
2026-01-29 18:21:23 +01:00
for line in res.stdout.split('\n'):
if 'fan' in line.lower() and '|' in line:
p = line.split('|')
try: fans.append({'name': p[0].strip(), 'speed': float(p[1].strip()), 'unit': p[2].strip()})
except: continue
2026-01-29 18:07:49 +01:00
except: pass
return fans
def get_ipmi_power():
2026-01-29 18:21:23 +01:00
"""Obtiene datos de energía IPMI."""
2026-01-29 18:07:49 +01:00
power = {'supplies': [], 'meter': None}
if shutil.which('ipmitool'):
try:
res = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=5)
for line in res.stdout.split('\n'):
lower = line.lower()
if ('power supply' in lower or 'power meter' in lower) and '|' in line:
p = line.split('|')
try:
val = float(p[1].strip())
2026-01-29 18:21:23 +01:00
unit = p[2].strip()
2026-01-29 18:07:49 +01:00
if 'power meter' in lower:
power['meter'] = {'name': p[0].strip(), 'watts': val, 'unit': unit}
else:
power['supplies'].append({'name': p[0].strip(), 'watts': val, 'unit': unit})
except: continue
except: pass
return power
def get_ups_info():
2026-01-29 18:21:23 +01:00
"""Obtiene datos de UPS via NUT."""
2026-01-29 18:07:49 +01:00
ups_list = []
if shutil.which('upsc'):
try:
res = subprocess.run(['upsc', '-l'], capture_output=True, text=True, timeout=5)
2026-01-29 18:21:23 +01:00
for ups in res.stdout.strip().split('\n'):
if ups:
data = {'name': ups, 'connection_type': 'Local'}
d_res = subprocess.run(['upsc', ups], capture_output=True, text=True, timeout=5)
for line in d_res.stdout.split('\n'):
if ':' in line:
k, v = line.split(':', 1)
data[k.strip()] = v.strip()
ups_list.append(data)
2026-01-29 18:07:49 +01:00
except: pass
return ups_list
2026-01-29 18:21:23 +01:00
# --- Main Hardware Aggregator ---
2026-01-29 18:07:49 +01:00
def get_hardware_info():
2026-01-29 18:21:23 +01:00
"""Agrega toda la información de hardware para la API."""
2026-01-29 18:07:49 +01:00
data = {
'cpu': {}, 'motherboard': {}, 'memory_modules': [],
2026-01-29 18:21:23 +01:00
'storage_devices': [], 'pci_devices': [],
2026-01-29 18:07:49 +01:00
'gpus': get_gpu_info(),
'ipmi_fans': get_ipmi_fans(),
'ipmi_power': get_ipmi_power(),
'ups': get_ups_info(),
'power_meter': get_power_info(),
2026-01-29 18:21:23 +01:00
'hba': get_hba_info(),
'sensors': {'fans': [], 'temperatures': get_hba_temperatures()}
2026-01-29 18:07:49 +01:00
}
2025-11-26 20:31:09 +01:00
2026-01-29 18:07:49 +01:00
# CPU Info
2025-11-26 20:31:09 +01:00
try:
2026-01-29 18:07:49 +01:00
res = subprocess.run(['lscpu'], capture_output=True, text=True)
for line in res.stdout.split('\n'):
if 'Model name:' in line: data['cpu']['model'] = line.split(':', 1)[1].strip()
if 'Socket(s):' in line: data['cpu']['sockets'] = line.split(':', 1)[1].strip()
except: pass
2025-11-26 20:31:09 +01:00
2026-01-29 18:21:23 +01:00
# Motherboard
2026-01-29 18:07:49 +01:00
try:
res = subprocess.run(['dmidecode', '-t', 'baseboard'], capture_output=True, text=True)
for line in res.stdout.split('\n'):
if 'Product Name:' in line: data['motherboard']['model'] = line.split(':', 1)[1].strip()
if 'Manufacturer:' in line: data['motherboard']['manufacturer'] = line.split(':', 1)[1].strip()
except: pass
2025-11-26 20:31:09 +01:00
2026-01-29 18:21:23 +01:00
# RAM
2026-01-29 18:07:49 +01:00
try:
res = subprocess.run(['dmidecode', '-t', 'memory'], capture_output=True, text=True)
mod = {}
for line in res.stdout.split('\n'):
line = line.strip()
if 'Memory Device' in line:
if mod.get('size', 0) > 0: data['memory_modules'].append(mod)
mod = {'size': 0}
elif 'Size:' in line:
parts = line.split(':', 1)[1].strip().split()
if len(parts) >= 2 and parts[0].isdigit():
val = int(parts[0])
unit = parts[1].upper()
if unit == 'GB': mod['size'] = val * 1024 * 1024
elif unit == 'MB': mod['size'] = val * 1024
elif 'Type:' in line: mod['type'] = line.split(':', 1)[1].strip()
elif 'Speed:' in line: mod['speed'] = line.split(':', 1)[1].strip()
if mod.get('size', 0) > 0: data['memory_modules'].append(mod)
except: pass
2025-11-26 20:31:09 +01:00
2026-01-29 18:21:23 +01:00
# Enrich GPUs with details
2026-01-29 18:07:49 +01:00
for gpu in data['gpus']:
gpu.update(get_detailed_gpu_info(gpu))
2025-11-26 20:31:09 +01:00
2026-01-29 18:07:49 +01:00
return data