Update flask_server.py

This commit is contained in:
MacRimi
2025-10-10 21:06:00 +02:00
parent de6f149e3b
commit a654e21b27

View File

@@ -18,6 +18,7 @@ from datetime import datetime, timedelta
import re # Added for regex matching import re # Added for regex matching
import select # Added for non-blocking read import select # Added for non-blocking read
import shutil # Added for shutil.which import shutil # Added for shutil.which
import xml.etree.ElementTree as ET # Added for XML parsing
app = Flask(__name__) app = Flask(__name__)
CORS(app) # Enable CORS for Next.js frontend CORS(app) # Enable CORS for Next.js frontend
@@ -1698,7 +1699,10 @@ def get_detailed_gpu_info(gpu):
'engine_render': None, 'engine_render': None,
'engine_blitter': None, 'engine_blitter': None,
'engine_video': None, 'engine_video': None,
'engine_video_enhance': None 'engine_video_enhance': None,
# Added for NVIDIA/AMD specific engine info if available
'engine_encoder': None,
'engine_decoder': None
} }
# Intel GPU monitoring with intel_gpu_top # Intel GPU monitoring with intel_gpu_top
@@ -1998,77 +2002,227 @@ def get_detailed_gpu_info(gpu):
elif 'nvidia' in vendor: elif 'nvidia' in vendor:
print(f"[v0] NVIDIA GPU detected, checking for nvidia-smi...", flush=True) print(f"[v0] NVIDIA GPU detected, checking for nvidia-smi...", flush=True)
if shutil.which('nvidia-smi'): if shutil.which('nvidia-smi'):
print(f"[v0] nvidia-smi found, executing...", flush=True) print(f"[v0] nvidia-smi found, executing with XML output...", flush=True)
try: try:
# Basic GPU stats cmd = ['nvidia-smi', '-q', '-x']
query_gpu = 'index,name,memory.total,memory.used,memory.free,temperature.gpu,power.draw,power.limit,utilization.gpu,utilization.memory,clocks.gr,clocks.mem,pcie.link.gen.current,pcie.link.width.current' print(f"[v0] Executing command: {' '.join(cmd)}", flush=True)
cmd_gpu = ['nvidia-smi', f'--query-gpu={query_gpu}', '--format=csv,noheader,nounits'] result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
print(f"[v0] Executing command: {' '.join(cmd_gpu)}", flush=True)
result_gpu = subprocess.run(cmd_gpu, capture_output=True, text=True, timeout=5)
if result_gpu.returncode == 0 and result_gpu.stdout.strip(): if result.returncode == 0 and result.stdout.strip():
output_lines = result_gpu.stdout.strip().split('\n') print(f"[v0] nvidia-smi XML output received, parsing...", flush=True)
# Assuming only one GPU, or taking the first one if multiple are returned
gpu_data_line = output_lines[0]
parts = [p.strip() for p in gpu_data_line.split(',')]
if len(parts) >= 14: # Check if we have enough parts
try: try:
detailed_info['temperature'] = int(parts[5]) if parts[5].isdigit() else None # Parse XML
detailed_info['power_draw'] = float(parts[6]) if parts[6].replace('.', '', 1).isdigit() else None root = ET.fromstring(result.stdout)
detailed_info['power_limit'] = float(parts[7]) if parts[7].replace('.', '', 1).isdigit() else None
detailed_info['utilization_gpu'] = int(parts[8]) if parts[8].isdigit() else None
detailed_info['utilization_memory'] = int(parts[9]) if parts[9].isdigit() else None
detailed_info['clock_graphics'] = int(parts[10]) if parts[10].isdigit() else None
detailed_info['clock_memory'] = int(parts[11]) if parts[11].isdigit() else None
detailed_info['memory_total'] = int(parts[2]) if parts[2].isdigit() else None
detailed_info['memory_used'] = int(parts[3]) if parts[3].isdigit() else None
detailed_info['memory_free'] = int(parts[4]) if parts[4].isdigit() else None
print(f"[v0] NVIDIA GPU Basic Stats: Temp={detailed_info['temperature']}C, Power={detailed_info['power_draw']}W, Util={detailed_info['utilization_gpu']}%", flush=True) # Get first GPU (assuming single GPU or taking first one)
detailed_info['has_monitoring_tool'] = True gpu_elem = root.find('gpu')
except (ValueError, IndexError) as e:
print(f"[v0] Error parsing NVIDIA GPU stats: {e}", flush=True)
detailed_info['has_monitoring_tool'] = False
# Compute processes using GPU if gpu_elem is not None:
query_apps = 'pid,process_name,used_memory' print(f"[v0] Processing NVIDIA GPU XML data...", flush=True)
cmd_apps = ['nvidia-smi', f'--query-compute-apps={query_apps}', '--format=csv,noheader'] data_retrieved = False
print(f"[v0] Executing command: {' '.join(cmd_apps)}", flush=True)
result_apps = subprocess.run(cmd_apps, capture_output=True, text=True, timeout=5)
if result_apps.returncode == 0 and result_apps.stdout.strip(): # Parse temperature
temp_elem = gpu_elem.find('.//temperature/gpu_temp')
if temp_elem is not None and temp_elem.text:
try:
# Remove ' C' suffix and convert to int
temp_str = temp_elem.text.replace(' C', '').strip()
detailed_info['temperature'] = int(temp_str)
print(f"[v0] Temperature: {detailed_info['temperature']}°C", flush=True)
data_retrieved = True
except ValueError:
pass
# Parse fan speed
fan_elem = gpu_elem.find('.//fan_speed')
if fan_elem is not None and fan_elem.text and fan_elem.text != 'N/A':
try:
# Remove ' %' suffix and convert to int
fan_str = fan_elem.text.replace(' %', '').strip()
detailed_info['fan_speed'] = int(fan_str)
detailed_info['fan_unit'] = '%'
print(f"[v0] Fan Speed: {detailed_info['fan_speed']}%", flush=True)
data_retrieved = True
except ValueError:
pass
# Parse power draw
power_elem = gpu_elem.find('.//gpu_power_readings/power_state')
instant_power_elem = gpu_elem.find('.//gpu_power_readings/instant_power_draw')
if instant_power_elem is not None and instant_power_elem.text and instant_power_elem.text != 'N/A':
try:
# Remove ' W' suffix and convert to float
power_str = instant_power_elem.text.replace(' W', '').strip()
detailed_info['power_draw'] = float(power_str)
print(f"[v0] Power Draw: {detailed_info['power_draw']} W", flush=True)
data_retrieved = True
except ValueError:
pass
# Parse power limit
power_limit_elem = gpu_elem.find('.//gpu_power_readings/current_power_limit')
if power_limit_elem is not None and power_limit_elem.text and power_limit_elem.text != 'N/A':
try:
power_limit_str = power_limit_elem.text.replace(' W', '').strip()
detailed_info['power_limit'] = float(power_limit_str)
print(f"[v0] Power Limit: {detailed_info['power_limit']} W", flush=True)
except ValueError:
pass
# Parse GPU utilization
gpu_util_elem = gpu_elem.find('.//utilization/gpu_util')
if gpu_util_elem is not None and gpu_util_elem.text:
try:
util_str = gpu_util_elem.text.replace(' %', '').strip()
detailed_info['utilization_gpu'] = int(util_str)
print(f"[v0] GPU Utilization: {detailed_info['utilization_gpu']}%", flush=True)
data_retrieved = True
except ValueError:
pass
# Parse memory utilization
mem_util_elem = gpu_elem.find('.//utilization/memory_util')
if mem_util_elem is not None and mem_util_elem.text:
try:
mem_util_str = mem_util_elem.text.replace(' %', '').strip()
detailed_info['utilization_memory'] = int(mem_util_str)
print(f"[v0] Memory Utilization: {detailed_info['utilization_memory']}%", flush=True)
data_retrieved = True
except ValueError:
pass
# Parse encoder utilization
encoder_util_elem = gpu_elem.find('.//utilization/encoder_util')
if encoder_util_elem is not None and encoder_util_elem.text and encoder_util_elem.text != 'N/A':
try:
encoder_str = encoder_util_elem.text.replace(' %', '').strip()
detailed_info['engine_encoder'] = int(encoder_str)
print(f"[v0] Encoder Utilization: {detailed_info['engine_encoder']}%", flush=True)
except ValueError:
pass
# Parse decoder utilization
decoder_util_elem = gpu_elem.find('.//utilization/decoder_util')
if decoder_util_elem is not None and decoder_util_elem.text and decoder_util_elem.text != 'N/A':
try:
decoder_str = decoder_util_elem.text.replace(' %', '').strip()
detailed_info['engine_decoder'] = int(decoder_str)
print(f"[v0] Decoder Utilization: {detailed_info['engine_decoder']}%", flush=True)
except ValueError:
pass
# Parse clocks
graphics_clock_elem = gpu_elem.find('.//clocks/graphics_clock')
if graphics_clock_elem is not None and graphics_clock_elem.text:
try:
clock_str = graphics_clock_elem.text.replace(' MHz', '').strip()
detailed_info['clock_graphics'] = int(clock_str)
print(f"[v0] Graphics Clock: {detailed_info['clock_graphics']} MHz", flush=True)
data_retrieved = True
except ValueError:
pass
mem_clock_elem = gpu_elem.find('.//clocks/mem_clock')
if mem_clock_elem is not None and mem_clock_elem.text:
try:
mem_clock_str = mem_clock_elem.text.replace(' MHz', '').strip()
detailed_info['clock_memory'] = int(mem_clock_str)
print(f"[v0] Memory Clock: {detailed_info['clock_memory']} MHz", flush=True)
data_retrieved = True
except ValueError:
pass
# Parse memory usage
mem_total_elem = gpu_elem.find('.//fb_memory_usage/total')
if mem_total_elem is not None and mem_total_elem.text:
try:
mem_total_str = mem_total_elem.text.replace(' MiB', '').strip()
detailed_info['memory_total'] = int(mem_total_str)
print(f"[v0] Memory Total: {detailed_info['memory_total']} MB", flush=True)
data_retrieved = True
except ValueError:
pass
mem_used_elem = gpu_elem.find('.//fb_memory_usage/used')
if mem_used_elem is not None and mem_used_elem.text:
try:
mem_used_str = mem_used_elem.text.replace(' MiB', '').strip()
detailed_info['memory_used'] = int(mem_used_str)
print(f"[v0] Memory Used: {detailed_info['memory_used']} MB", flush=True)
data_retrieved = True
except ValueError:
pass
mem_free_elem = gpu_elem.find('.//fb_memory_usage/free')
if mem_free_elem is not None and mem_free_elem.text:
try:
mem_free_str = mem_free_elem.text.replace(' MiB', '').strip()
detailed_info['memory_free'] = int(mem_free_str)
print(f"[v0] Memory Free: {detailed_info['memory_free']} MB", flush=True)
except ValueError:
pass
# Parse processes
processes_elem = gpu_elem.find('.//processes')
if processes_elem is not None:
processes = [] processes = []
for line in result_apps.stdout.strip().split('\n'): for process_elem in processes_elem.findall('process_info'):
if line:
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 3:
# Convert memory to MB
mem_str = parts[2]
mem_mb = 0
if 'MiB' in mem_str:
try: try:
mem_mb = int(float(mem_str.replace('MiB', '').strip())) pid_elem = process_elem.find('pid')
except ValueError: name_elem = process_elem.find('process_name')
pass mem_elem = process_elem.find('used_memory')
elif 'GiB' in mem_str: type_elem = process_elem.find('type')
try:
mem_mb = int(float(mem_str.replace('GiB', '').strip()) * 1024) if pid_elem is not None and name_elem is not None and mem_elem is not None:
except ValueError: pid = pid_elem.text.strip()
pass name = name_elem.text.strip()
# Parse memory (format: "362 MiB")
mem_str = mem_elem.text.replace(' MiB', '').strip()
memory_mb = int(mem_str)
# Get process type (C=Compute, G=Graphics)
proc_type = type_elem.text.strip() if type_elem is not None else 'C'
process_info = {
'pid': pid,
'name': name,
'memory_used_mb': memory_mb,
'type': proc_type,
'engines': {}
}
# For NVIDIA, we don't have per-process engine utilization
# But we can indicate the type of workload
if proc_type == 'G':
process_info['engines']['Graphics'] = 'Active'
elif proc_type == 'C':
process_info['engines']['Compute'] = 'Active'
processes.append(process_info)
print(f"[v0] Process: {name} (PID: {pid}) - {memory_mb} MB - Type: {proc_type}", flush=True)
except (ValueError, AttributeError) as e:
print(f"[v0] Error parsing process: {e}", flush=True)
continue
processes.append({
'pid': parts[0],
'name': parts[1],
'memory_used_mb': mem_mb
})
detailed_info['processes'] = processes detailed_info['processes'] = processes
print(f"[v0] Found {len(processes)} NVIDIA GPU processes", flush=True) print(f"[v0] Found {len(processes)} NVIDIA GPU processes", flush=True)
if detailed_info['has_monitoring_tool']: if data_retrieved:
detailed_info['has_monitoring_tool'] = True
print(f"[v0] NVIDIA GPU monitoring successful", flush=True) print(f"[v0] NVIDIA GPU monitoring successful", flush=True)
else: else:
print(f"[v0] NVIDIA GPU monitoring failed - essential data not retrieved", flush=True) print(f"[v0] NVIDIA GPU monitoring failed - no data retrieved", flush=True)
else:
print(f"[v0] No GPU element found in XML", flush=True)
except ET.ParseError as e:
print(f"[v0] Error parsing nvidia-smi XML: {e}", flush=True)
import traceback
traceback.print_exc()
else:
print(f"[v0] nvidia-smi returned error or empty output", flush=True)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
print(f"[v0] nvidia-smi timed out - marking tool as unavailable", flush=True) print(f"[v0] nvidia-smi timed out - marking tool as unavailable", flush=True)
@@ -2151,7 +2305,7 @@ def get_detailed_gpu_info(gpu):
mem_clock = clocks['GFX_MCLK'] mem_clock = clocks['GFX_MCLK']
if 'value' in mem_clock: if 'value' in mem_clock:
detailed_info['clock_memory'] = f"{mem_clock['value']} MHz" detailed_info['clock_memory'] = f"{mem_clock['value']} MHz"
print(f"[v0] Memory Clock: {detailed_info['clock_memory']}", flush=True) print(f"[v0] Memory Clock: {detailed_info['clock_memory']} MHz", flush=True)
data_retrieved = True data_retrieved = True
# Parse GPU activity (gpu_activity.GFX) # Parse GPU activity (gpu_activity.GFX)
@@ -3335,7 +3489,10 @@ def api_gpu_realtime(slot):
'engine_render': gpu.get('engine_render'), 'engine_render': gpu.get('engine_render'),
'engine_blitter': gpu.get('engine_blitter'), 'engine_blitter': gpu.get('engine_blitter'),
'engine_video': gpu.get('engine_video'), 'engine_video': gpu.get('engine_video'),
'engine_video_enhance': gpu.get('engine_video_enhance') 'engine_video_enhance': gpu.get('engine_video_enhance'),
# Added for NVIDIA/AMD specific engine info if available
'engine_encoder': gpu.get('engine_encoder'),
'engine_decoder': gpu.get('engine_decoder')
} }
print(f"[v0] /api/gpu/{slot}/realtime returning data") print(f"[v0] /api/gpu/{slot}/realtime returning data")