Update flask_server.py

This commit is contained in:
MacRimi
2025-10-15 19:22:16 +02:00
parent beed7e83f2
commit 5cd37b74b4

View File

@@ -622,12 +622,18 @@ def get_storage_info():
total_used = 0 total_used = 0
total_available = 0 total_available = 0
zfs_disks = set()
for partition in disk_partitions: for partition in disk_partitions:
try: try:
# Skip special filesystems # Skip special filesystems
if partition.fstype in ['tmpfs', 'devtmpfs', 'squashfs', 'overlay']: if partition.fstype in ['tmpfs', 'devtmpfs', 'squashfs', 'overlay']:
continue continue
if partition.fstype == 'zfs':
print(f"[v0] Skipping ZFS filesystem {partition.mountpoint}, will count from pool data")
continue
partition_usage = psutil.disk_usage(partition.mountpoint) partition_usage = psutil.disk_usage(partition.mountpoint)
total_used += partition_usage.used total_used += partition_usage.used
total_available += partition_usage.free total_available += partition_usage.free
@@ -658,35 +664,68 @@ def get_storage_info():
print(f"Error accessing partition {partition.device}: {e}") print(f"Error accessing partition {partition.device}: {e}")
continue continue
try:
result = subprocess.run(['zpool', 'list', '-H', '-p', '-o', 'name,size,alloc,free,health'],
capture_output=True, text=True, timeout=5)
if result.returncode == 0:
for line in result.stdout.strip().split('\n'):
if line:
parts = line.split('\t')
if len(parts) >= 5:
pool_name = parts[0]
pool_size_bytes = int(parts[1])
pool_alloc_bytes = int(parts[2])
pool_free_bytes = int(parts[3])
pool_health = parts[4]
total_used += pool_alloc_bytes
total_available += pool_free_bytes
print(f"[v0] ZFS Pool {pool_name}: allocated={pool_alloc_bytes / (1024**3):.2f}GB, free={pool_free_bytes / (1024**3):.2f}GB")
def format_zfs_size(size_bytes):
size_tb = size_bytes / (1024**4)
size_gb = size_bytes / (1024**3)
if size_tb >= 1:
return f"{size_tb:.1f}T"
else:
return f"{size_gb:.1f}G"
pool_info = {
'name': pool_name,
'size': format_zfs_size(pool_size_bytes),
'allocated': format_zfs_size(pool_alloc_bytes),
'free': format_zfs_size(pool_free_bytes),
'health': pool_health
}
storage_data['zfs_pools'].append(pool_info)
try:
pool_status = subprocess.run(['zpool', 'status', pool_name],
capture_output=True, text=True, timeout=5)
if pool_status.returncode == 0:
for status_line in pool_status.stdout.split('\n'):
for disk_name in physical_disks.keys():
if disk_name in status_line:
zfs_disks.add(disk_name)
except Exception as e:
print(f"Error getting ZFS pool status for {pool_name}: {e}")
except FileNotFoundError:
print("[v0] Note: ZFS not installed")
except Exception as e:
print(f"[v0] Note: ZFS not available or no pools: {e}")
storage_data['used'] = round(total_used / (1024**3), 1) storage_data['used'] = round(total_used / (1024**3), 1)
storage_data['available'] = round(total_available / (1024**3), 1) storage_data['available'] = round(total_available / (1024**3), 1)
print(f"[v0] Total storage used: {storage_data['used']}GB (including ZFS pools)")
except Exception as e: except Exception as e:
print(f"Error getting partition info: {e}") print(f"Error getting partition info: {e}")
storage_data['disks'] = list(physical_disks.values()) storage_data['disks'] = list(physical_disks.values())
try:
result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,health'],
capture_output=True, text=True, timeout=5)
if result.returncode == 0:
for line in result.stdout.strip().split('\n'):
if line:
parts = line.split('\t')
if len(parts) >= 5:
pool_info = {
'name': parts[0],
'size': parts[1],
'allocated': parts[2],
'free': parts[3],
'health': parts[4]
}
storage_data['zfs_pools'].append(pool_info)
except FileNotFoundError:
print("Note: ZFS not installed")
except Exception as e:
print(f"Note: ZFS not available or no pools: {e}")
return storage_data return storage_data
except Exception as e: except Exception as e:
@@ -2136,7 +2175,7 @@ def get_detailed_gpu_info(gpu):
clients = best_json['clients'] clients = best_json['clients']
processes = [] processes = []
for client_id, client_data in clients.items(): for client_id, client_data in clients:
process_info = { process_info = {
'name': client_data.get('name', 'Unknown'), 'name': client_data.get('name', 'Unknown'),
'pid': client_data.get('pid', 'Unknown'), 'pid': client_data.get('pid', 'Unknown'),
@@ -3277,19 +3316,50 @@ def get_hardware_info():
# Graphics Cards (from lspci - will be duplicated by new PCI device listing, but kept for now) # Graphics Cards (from lspci - will be duplicated by new PCI device listing, but kept for now)
try: try:
# Try nvidia-smi first # Try nvidia-smi first
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total,temperature.gpu,power.draw', '--format=csv,noheader,nounits'], result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total,memory.used,temperature.gpu,power.draw,utilization.gpu,utilization.memory,clocks.graphics,clocks.memory', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=5) capture_output=True, text=True, timeout=5)
if result.returncode == 0: if result.returncode == 0:
for line in result.stdout.strip().split('\n'): for i, line in enumerate(result.stdout.strip().split('\n')):
if line: if line:
parts = line.split(',') parts = line.split(',')
if len(parts) >= 4: if len(parts) >= 9: # Adjusted to match the query fields
gpu_name = parts[0].strip()
mem_total = parts[1].strip()
mem_used = parts[2].strip()
temp = parts[3].strip() if parts[3].strip() != 'N/A' else None
power = parts[4].strip() if parts[4].strip() != 'N/A' else None
gpu_util = parts[5].strip() if parts[5].strip() != 'N/A' else None
mem_util = parts[6].strip() if parts[6].strip() != 'N/A' else None
graphics_clock = parts[7].strip() if parts[7].strip() != 'N/A' else None
memory_clock = parts[8].strip() if parts[8].strip() != 'N/A' else None
# Try to find the corresponding PCI slot using nvidia-smi -L
try:
list_gpus_cmd = ['nvidia-smi', '-L']
list_gpus_result = subprocess.run(list_gpus_cmd, capture_output=True, text=True, timeout=5)
pci_slot = None
if list_gpus_result.returncode == 0:
for gpu_line in list_gpus_result.stdout.strip().split('\n'):
if gpu_name in gpu_line:
slot_match = re.search(r'PCI Device (\S+):', gpu_line)
if slot_match:
pci_slot = slot_match.group(1)
break
except:
pass # Ignore errors here, pci_slot will remain None
hardware_data['graphics_cards'].append({ hardware_data['graphics_cards'].append({
'name': parts[0].strip(), 'name': gpu_name,
'memory': parts[1].strip(), 'vendor': 'NVIDIA',
'temperature': int(parts[2].strip().split(' ')[0]) if parts[2].strip() != 'N/A' and 'C' in parts[2] else 0, 'slot': pci_slot,
'power_draw': parts[3].strip(), 'memory_total': mem_total,
'vendor': 'NVIDIA' 'memory_used': mem_used,
'temperature': int(temp) if temp else None,
'power_draw': power,
'utilization_gpu': gpu_util,
'utilization_memory': mem_util,
'clock_graphics': graphics_clock,
'clock_memory': memory_clock,
}) })
# Always check lspci for all GPUs (integrated and discrete) # Always check lspci for all GPUs (integrated and discrete)
@@ -3300,6 +3370,7 @@ def get_hardware_info():
if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']): if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
parts = line.split(':', 2) parts = line.split(':', 2)
if len(parts) >= 3: if len(parts) >= 3:
slot = parts[0].strip()
gpu_name = parts[2].strip() gpu_name = parts[2].strip()
# Determine vendor # Determine vendor
@@ -3310,6 +3381,8 @@ def get_hardware_info():
vendor = 'AMD' vendor = 'AMD'
elif 'Intel' in gpu_name: elif 'Intel' in gpu_name:
vendor = 'Intel' vendor = 'Intel'
elif 'Matrox' in gpu_name:
vendor = 'Matrox'
# Check if this GPU is already in the list (from nvidia-smi) # Check if this GPU is already in the list (from nvidia-smi)
already_exists = False already_exists = False
@@ -3319,14 +3392,18 @@ def get_hardware_info():
# Update vendor if it was previously unknown # Update vendor if it was previously unknown
if existing_gpu['vendor'] == 'Unknown': if existing_gpu['vendor'] == 'Unknown':
existing_gpu['vendor'] = vendor existing_gpu['vendor'] = vendor
# Update slot if not already set
if not existing_gpu.get('slot') and slot:
existing_gpu['slot'] = slot
break break
if not already_exists: if not already_exists:
hardware_data['graphics_cards'].append({ hardware_data['graphics_cards'].append({
'name': gpu_name, 'name': gpu_name,
'vendor': vendor 'vendor': vendor,
'slot': slot
}) })
print(f"[v0] Found GPU: {gpu_name} ({vendor})") print(f"[v0] Found GPU: {gpu_name} ({vendor}) at slot {slot}")
print(f"[v0] Graphics cards: {len(hardware_data['graphics_cards'])} found") print(f"[v0] Graphics cards: {len(hardware_data['graphics_cards'])} found")
except Exception as e: except Exception as e:
@@ -3551,6 +3628,14 @@ def get_hardware_info():
hardware_data['gpus'] = get_gpu_info() hardware_data['gpus'] = get_gpu_info()
# Enrich PCI devices with GPU info where applicable
for pci_device in hardware_data['pci_devices']:
if pci_device.get('type') == 'Graphics Card':
for gpu in hardware_data['gpus']:
if pci_device.get('slot') == gpu.get('slot'):
pci_device['gpu_info'] = gpu # Add the detected GPU info directly
break
return hardware_data return hardware_data
except Exception as e: except Exception as e:
@@ -4334,74 +4419,73 @@ def api_prometheus():
metrics.append(f'proxmox_fan_speed_rpm{{node="{node}",fan="{fan_name}"}} {fan["speed"]} {timestamp}') metrics.append(f'proxmox_fan_speed_rpm{{node="{node}",fan="{fan_name}"}} {fan["speed"]} {timestamp}')
# GPU metrics # GPU metrics
pci_devices = hardware_info.get('pci_devices', []) for gpu in hardware_info.get('gpus', []): # Changed from pci_devices to gpus
for device in pci_devices: gpu_name = gpu.get('name', 'unknown').replace(' ', '_')
if device.get('type') == 'Graphics Card': # Changed from 'GPU' to 'Graphics Card' to match pci_devices type gpu_vendor = gpu.get('vendor', 'unknown')
gpu_name = device.get('device', 'unknown').replace(' ', '_') gpu_slot = gpu.get('slot', 'unknown') # Use slot for matching
gpu_vendor = device.get('vendor', 'unknown')
# GPU Temperature # GPU Temperature
if device.get('gpu_temperature') is not None: if gpu.get('temperature') is not None:
metrics.append(f'# HELP proxmox_gpu_temperature_celsius GPU temperature in Celsius') metrics.append(f'# HELP proxmox_gpu_temperature_celsius GPU temperature in Celsius')
metrics.append(f'# TYPE proxmox_gpu_temperature_celsius gauge') metrics.append(f'# TYPE proxmox_gpu_temperature_celsius gauge')
metrics.append(f'proxmox_gpu_temperature_celsius{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {device["gpu_temperature"]} {timestamp}') metrics.append(f'proxmox_gpu_temperature_celsius{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {gpu["temperature"]} {timestamp}')
# GPU Utilization # GPU Utilization
if device.get('gpu_utilization') is not None: if gpu.get('utilization_gpu') is not None:
metrics.append(f'# HELP proxmox_gpu_utilization_percent GPU utilization percentage') metrics.append(f'# HELP proxmox_gpu_utilization_percent GPU utilization percentage')
metrics.append(f'# TYPE proxmox_gpu_utilization_percent gauge') metrics.append(f'# TYPE proxmox_gpu_utilization_percent gauge')
metrics.append(f'proxmox_gpu_utilization_percent{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {device["gpu_utilization"]} {timestamp}') metrics.append(f'proxmox_gpu_utilization_percent{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {gpu["utilization_gpu"]} {timestamp}')
# GPU Memory # GPU Memory
if device.get('gpu_memory_used') and device.get('gpu_memory_total'): if gpu.get('memory_used') and gpu.get('memory_total'):
try: try:
# Extract numeric values from strings like "1024 MiB" # Extract numeric values from strings like "1024 MiB"
mem_used = float(device['gpu_memory_used'].split()[0]) mem_used = float(gpu['memory_used'].split()[0])
mem_total = float(device['gpu_memory_total'].split()[0]) mem_total = float(gpu['memory_total'].split()[0])
mem_used_bytes = mem_used * 1024 * 1024 # Convert MiB to bytes mem_used_bytes = mem_used * 1024 * 1024 # Convert MiB to bytes
mem_total_bytes = mem_total * 1024 * 1024 mem_total_bytes = mem_total * 1024 * 1024
metrics.append(f'# HELP proxmox_gpu_memory_used_bytes GPU memory used in bytes') metrics.append(f'# HELP proxmox_gpu_memory_used_bytes GPU memory used in bytes')
metrics.append(f'# TYPE proxmox_gpu_memory_used_bytes gauge') metrics.append(f'# TYPE proxmox_gpu_memory_used_bytes gauge')
metrics.append(f'proxmox_gpu_memory_used_bytes{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {mem_used_bytes} {timestamp}') metrics.append(f'proxmox_gpu_memory_used_bytes{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {mem_used_bytes} {timestamp}')
metrics.append(f'# HELP proxmox_gpu_memory_total_bytes GPU memory total in bytes') metrics.append(f'# HELP proxmox_gpu_memory_total_bytes GPU memory total in bytes')
metrics.append(f'# TYPE proxmox_gpu_memory_total_bytes gauge') metrics.append(f'# TYPE proxmox_gpu_memory_total_bytes gauge')
metrics.append(f'proxmox_gpu_memory_total_bytes{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {mem_total_bytes} {timestamp}') metrics.append(f'proxmox_gpu_memory_total_bytes{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {mem_total_bytes} {timestamp}')
except (ValueError, IndexError): except (ValueError, IndexError):
pass pass
# GPU Power Draw (NVIDIA only) # GPU Power Draw (NVIDIA only)
if device.get('gpu_power_draw'): if gpu.get('power_draw'):
try: try:
# Extract numeric value from string like "75.5 W" # Extract numeric value from string like "75.5 W"
power_draw = float(device['gpu_power_draw'].split()[0]) power_draw = float(gpu['power_draw'].split()[0])
metrics.append(f'# HELP proxmox_gpu_power_draw_watts GPU power draw in watts') metrics.append(f'# HELP proxmox_gpu_power_draw_watts GPU power draw in watts')
metrics.append(f'# TYPE proxmox_gpu_power_draw_watts gauge') metrics.append(f'# TYPE proxmox_gpu_power_draw_watts gauge')
metrics.append(f'proxmox_gpu_power_draw_watts{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {power_draw} {timestamp}') metrics.append(f'proxmox_gpu_power_draw_watts{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {power_draw} {timestamp}')
except (ValueError, IndexError): except (ValueError, IndexError):
pass pass
# GPU Clock Speeds (NVIDIA only) # GPU Clock Speeds (NVIDIA only)
if device.get('gpu_clock_speed'): if gpu.get('clock_graphics'):
try: try:
# Extract numeric value from string like "1500 MHz" # Extract numeric value from string like "1500 MHz"
clock_speed = float(device['gpu_clock_speed'].split()[0]) clock_speed = float(gpu['clock_graphics'].split()[0])
metrics.append(f'# HELP proxmox_gpu_clock_speed_mhz GPU clock speed in MHz') metrics.append(f'# HELP proxmox_gpu_clock_speed_mhz GPU clock speed in MHz')
metrics.append(f'# TYPE proxmox_gpu_clock_speed_mhz gauge') metrics.append(f'# TYPE proxmox_gpu_clock_speed_mhz gauge')
metrics.append(f'proxmox_gpu_clock_speed_mhz{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {clock_speed} {timestamp}') metrics.append(f'proxmox_gpu_clock_speed_mhz{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {clock_speed} {timestamp}')
except (ValueError, IndexError): except (ValueError, IndexError):
pass pass
if device.get('gpu_memory_clock'): if gpu.get('clock_memory'):
try: try:
# Extract numeric value from string like "5001 MHz" # Extract numeric value from string like "5001 MHz"
mem_clock = float(device['gpu_memory_clock'].split()[0]) mem_clock = float(gpu['clock_memory'].split()[0])
metrics.append(f'# HELP proxmox_gpu_memory_clock_mhz GPU memory clock speed in MHz') metrics.append(f'# HELP proxmox_gpu_memory_clock_mhz GPU memory clock speed in MHz')
metrics.append(f'# TYPE proxmox_gpu_memory_clock_mhz gauge') metrics.append(f'# TYPE proxmox_gpu_memory_clock_mhz gauge')
metrics.append(f'proxmox_gpu_memory_clock_mhz{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}"}} {mem_clock} {timestamp}') metrics.append(f'proxmox_gpu_memory_clock_mhz{{node="{node}",gpu="{gpu_name}",vendor="{gpu_vendor}",slot="{gpu_slot}"}} {mem_clock} {timestamp}')
except (ValueError, IndexError): except (ValueError, IndexError):
pass pass
# UPS metrics # UPS metrics
ups = hardware_info.get('ups') ups = hardware_info.get('ups')
@@ -4411,22 +4495,17 @@ def api_prometheus():
if ups.get('battery_charge') is not None: if ups.get('battery_charge') is not None:
metrics.append(f'# HELP proxmox_ups_battery_charge_percent UPS battery charge percentage') metrics.append(f'# HELP proxmox_ups_battery_charge_percent UPS battery charge percentage')
metrics.append(f'# TYPE proxmox_ups_battery_charge_percent gauge') metrics.append(f'# TYPE proxmox_ups_battery_charge_percent gauge')
metrics.append(f'proxmox_ups_battery_charge_percent{{node="{node}",ups="{ups_name}"}} {ups["battery_charge"]} {timestamp}') metrics.append(f'proxmox_ups_battery_charge_percent{{node="{node}",ups="{ups_name}"}} {ups["battery_charge_raw"]} {timestamp}')
if ups.get('load') is not None: if ups.get('load') is not None:
metrics.append(f'# HELP proxmox_ups_load_percent UPS load percentage') metrics.append(f'# HELP proxmox_ups_load_percent UPS load percentage')
metrics.append(f'# TYPE proxmox_ups_load_percent gauge') metrics.append(f'# TYPE proxmox_ups_load_percent gauge')
metrics.append(f'proxmox_ups_load_percent{{node="{node}",ups="{ups_name}"}} {ups["load"]} {timestamp}') metrics.append(f'proxmox_ups_load_percent{{node="{node}",ups="{ups_name}"}} {ups["load_percent_raw"]} {timestamp}')
if ups.get('runtime'): if ups.get('time_left_seconds') is not None: # Use seconds for counter
# Convert runtime to seconds
runtime_str = ups['runtime']
runtime_seconds = 0
if 'minutes' in runtime_str:
runtime_seconds = int(runtime_str.split()[0]) * 60
metrics.append(f'# HELP proxmox_ups_runtime_seconds UPS runtime in seconds') metrics.append(f'# HELP proxmox_ups_runtime_seconds UPS runtime in seconds')
metrics.append(f'# TYPE proxmox_ups_runtime_seconds gauge') metrics.append(f'# TYPE proxmox_ups_runtime_seconds gauge') # Use gauge if it's current remaining time
metrics.append(f'proxmox_ups_runtime_seconds{{node="{node}",ups="{ups_name}"}} {runtime_seconds} {timestamp}') metrics.append(f'proxmox_ups_runtime_seconds{{node="{node}",ups="{ups_name}"}} {ups["time_left_seconds"]} {timestamp}')
if ups.get('input_voltage') is not None: if ups.get('input_voltage') is not None:
metrics.append(f'# HELP proxmox_ups_input_voltage_volts UPS input voltage in volts') metrics.append(f'# HELP proxmox_ups_input_voltage_volts UPS input voltage in volts')